Initial setup for CLion
This commit is contained in:
206
ANSOCR/ANSRTOCR/RTOCRRecognizer.cpp
Normal file
206
ANSOCR/ANSRTOCR/RTOCRRecognizer.cpp
Normal file
@@ -0,0 +1,206 @@
|
||||
#include "RTOCRRecognizer.h"
|
||||
|
||||
#include <opencv2/imgproc.hpp>
|
||||
#include <opencv2/cudaimgproc.hpp>
|
||||
#include <opencv2/cudawarping.hpp>
|
||||
#include <opencv2/cudaarithm.hpp>
|
||||
#include <iostream>
|
||||
#include <algorithm>
|
||||
#include <numeric>
|
||||
#include <cmath>
|
||||
#include <cfloat>
|
||||
|
||||
namespace ANSCENTER {
|
||||
namespace rtocr {
|
||||
|
||||
bool RTOCRRecognizer::Initialize(const std::string& onnxPath, const std::string& dictPath,
|
||||
int gpuId, const std::string& engineCacheDir) {
|
||||
try {
|
||||
// Load dictionary first
|
||||
keys_ = LoadDict(dictPath);
|
||||
if (keys_.size() < 2) {
|
||||
std::cerr << "[RTOCRRecognizer] Failed to load dictionary: " << dictPath << std::endl;
|
||||
return false;
|
||||
}
|
||||
std::cout << "[RTOCRRecognizer] Loaded dictionary with " << keys_.size()
|
||||
<< " characters from: " << dictPath << std::endl;
|
||||
|
||||
ANSCENTER::Options options;
|
||||
options.deviceIndex = gpuId;
|
||||
options.precision = ANSCENTER::Precision::FP16;
|
||||
options.maxBatchSize = 1;
|
||||
options.optBatchSize = 1;
|
||||
|
||||
// Fixed height, dynamic width for recognition
|
||||
options.minInputHeight = imgH_;
|
||||
options.optInputHeight = imgH_;
|
||||
options.maxInputHeight = imgH_;
|
||||
options.minInputWidth = 32;
|
||||
options.optInputWidth = imgMaxW_;
|
||||
options.maxInputWidth = 960;
|
||||
|
||||
if (!engineCacheDir.empty()) {
|
||||
options.engineFileDir = engineCacheDir;
|
||||
}
|
||||
else {
|
||||
auto pos = onnxPath.find_last_of("/\\");
|
||||
options.engineFileDir = (pos != std::string::npos) ? onnxPath.substr(0, pos) : ".";
|
||||
}
|
||||
|
||||
m_poolKey = { onnxPath,
|
||||
static_cast<int>(options.precision),
|
||||
options.maxBatchSize };
|
||||
m_engine = EnginePoolManager<float>::instance().acquire(
|
||||
m_poolKey, options, onnxPath,
|
||||
kRecSubVals, kRecDivVals, true, -1);
|
||||
m_usingSharedPool = (m_engine != nullptr);
|
||||
|
||||
if (!m_engine) {
|
||||
std::cerr << "[RTOCRRecognizer] Failed to build/load TRT engine: " << onnxPath << std::endl;
|
||||
return false;
|
||||
}
|
||||
|
||||
std::cout << "[RTOCRRecognizer] Initialized TRT engine from: " << onnxPath << std::endl;
|
||||
return true;
|
||||
}
|
||||
catch (const std::exception& e) {
|
||||
std::cerr << "[RTOCRRecognizer] Initialize failed: " << e.what() << std::endl;
|
||||
m_engine.reset();
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// Recognizes the text in a single cropped text-line image.
//
// Holds _mutex for the whole call. Returns a default-constructed TextLine when
// there is no engine, the image is empty, the dictionary is empty, inference
// fails, the engine produces no output, or a std::exception is caught.
TextLine RTOCRRecognizer::Recognize(const cv::Mat& croppedImage) {
    std::lock_guard<std::mutex> lock(_mutex);

    if (!m_engine) {
        return {};
    }
    if (croppedImage.empty()) {
        return {};
    }
    if (keys_.empty()) {
        return {};
    }

    try {
        // Preprocess: bring the crop to the fixed height with proportional width.
        cv::Mat netInput = ResizeRecImage(croppedImage, imgH_, imgMaxW_);
        const int contentW = netInput.cols;

        // The network input is at least kRecImgW wide (matching official
        // PaddleOCR behavior). Official PaddleOCR pads with 0.0 in normalized
        // space, which corresponds to roughly pixel value 128 (gray).
        const int paddedW = std::max(contentW, kRecImgW);
        if (contentW < paddedW) {
            cv::Mat canvas(imgH_, paddedW, netInput.type(), cv::Scalar(128, 128, 128));
            netInput.copyTo(canvas(cv::Rect(0, 0, contentW, imgH_)));
            netInput = canvas;
        }

        // Upload to GPU, keeping BGR order (PaddleOCR official does NOT convert
        // BGR to RGB).
        cv::cuda::GpuMat deviceImg;
        deviceImg.upload(netInput);

        // Run inference on a single-image batch.
        std::vector<std::vector<cv::cuda::GpuMat>> batch = { { deviceImg } };
        std::vector<std::vector<std::vector<float>>> rawOutputs;

        if (!m_engine->runInference(batch, rawOutputs)) {
            std::cerr << "[RTOCRRecognizer] Inference failed" << std::endl;
            return {};
        }

        const bool nothingProduced =
            rawOutputs.empty() || rawOutputs[0].empty() || rawOutputs[0][0].empty();
        if (nothingProduced) {
            return {};
        }

        // Output shape: [1, seqLen, numClasses] flattened to [seqLen * numClasses].
        // IMPORTANT: the TRT engine output buffer is pre-allocated to MAX
        // dimensions (e.g. 120 timesteps for max width 960), but the actual
        // inference produces fewer timesteps for narrower images. The ACTUAL
        // sequence length is therefore derived from the input width rather than
        // from getOutputDims(), which returns max dims.
        const std::vector<float>& flatOutput = rawOutputs[0][0];
        const int bufferLen = static_cast<int>(flatOutput.size());

        // Class count comes from the dictionary (keys_ includes blank at index 0).
        const int classCount = static_cast<int>(keys_.size());

        // Recognition model stride is 8 (960px input -> 120 timesteps).
        int steps = paddedW / 8;

        // Sanity check: never read past the buffer; fall back to the number of
        // whole timesteps the buffer can hold.
        if (steps * classCount > bufferLen) {
            steps = bufferLen / classCount;
        }

        return CTCDecode(flatOutput.data(), steps, classCount);
    }
    catch (const std::exception& e) {
        std::cerr << "[RTOCRRecognizer] Recognize failed: " << e.what() << std::endl;
        return {};
    }
}
|
||||
|
||||
// Recognizes every image in croppedImages and returns one TextLine per input,
// in input order.
//
// Images are run through Recognize() one at a time because each image has a
// different width.
std::vector<TextLine> RTOCRRecognizer::RecognizeBatch(const std::vector<cv::Mat>& croppedImages) {
    std::vector<TextLine> lines;
    lines.reserve(croppedImages.size());

    for (const cv::Mat& crop : croppedImages) {
        lines.push_back(Recognize(crop));
    }

    return lines;
}
|
||||
|
||||
// Greedy CTC decoding of a flattened [seqLen * numClasses] output buffer.
//
// For every timestep the class with the largest value is selected (the first
// one wins on ties). Blanks (index 0) and repeats of the previous timestep's
// class are skipped; every other class index that is inside keys_ appends
// keys_[index] to the text.
//
// Parameters:
//   outputData - pointer to seqLen * numClasses floats, timestep-major.
//   seqLen     - number of timesteps to decode.
//   numClasses - number of values per timestep.
//
// Returns a TextLine whose text is the decoded string and whose score is the
// mean of the winning values of the emitted characters. The score is left at
// TextLine's default when nothing was emitted.
TextLine RTOCRRecognizer::CTCDecode(const float* outputData, int seqLen, int numClasses) {
    TextLine decoded;

    const int keyCount = static_cast<int>(keys_.size());
    float scoreSum = 0.0f;
    std::size_t emitted = 0;

    int previousBest = 0; // CTC blank is index 0

    for (int step = 0; step < seqLen; ++step) {
        const float* row = outputData + step * numClasses;

        // Argmax over this timestep (strict '>' keeps the first maximum).
        int best = 0;
        float bestValue = -FLT_MAX;
        for (int cls = 0; cls < numClasses; ++cls) {
            const float candidate = row[cls];
            if (candidate > bestValue) {
                bestValue = candidate;
                best = cls;
            }
        }

        // Remember this step's winner before any early exit below.
        const int before = previousBest;
        previousBest = best;

        // CTC rule: drop blanks and characters repeated from the prior step.
        if (best == 0 || best == before) {
            continue;
        }
        // Ignore class indices the dictionary cannot map.
        if (best >= keyCount) {
            continue;
        }

        decoded.text += keys_[best]; // keys_[0]="#"(blank), keys_[1]=first_char, etc.
        // Raw model output value is used as confidence (PaddleOCR v5 models
        // include softmax).
        scoreSum = scoreSum + bestValue;
        ++emitted;
    }

    if (emitted != 0) {
        decoded.score = scoreSum / static_cast<float>(emitted);
    }
    return decoded;
}
|
||||
|
||||
// Gives the engine back: when it came from the shared pool, the pool reference
// is released under m_poolKey before the local handle is dropped.
//
// Any exception is deliberately swallowed so that nothing escapes the
// destructor.
RTOCRRecognizer::~RTOCRRecognizer() {
    try {
        const bool pooled = m_usingSharedPool;
        if (pooled) {
            EnginePoolManager<float>::instance().release(m_poolKey);
        }

        // Resetting an empty handle is a no-op, so this covers both the pooled
        // and the non-pooled case.
        m_engine.reset();

        if (pooled) {
            m_usingSharedPool = false;
        }
    }
    catch (...) {}
}
|
||||
|
||||
} // namespace rtocr
|
||||
} // namespace ANSCENTER
|
||||
Reference in New Issue
Block a user