#include "RTOCRRecognizer.h"

// NOTE(review): the original include list was garbled (header names inside
// <...> were stripped during extraction). The standard headers below cover
// every stdlib name used in this translation unit (FLT_MAX, iostream, mutex,
// accumulate, string, vector, max); confirm against the original that no
// additional project/OpenCV header was lost.
#include <cfloat>
#include <algorithm>
#include <iostream>
#include <mutex>
#include <numeric>
#include <string>
#include <vector>

namespace ANSCENTER {
namespace rtocr {

/// Loads the character dictionary and acquires (or builds) a shared TensorRT
/// engine for the recognition ONNX model.
///
/// @param onnxPath        path to the recognition ONNX model
/// @param dictPath        path to the character dictionary file
/// @param gpuId           CUDA device index to run on
/// @param engineCacheDir  directory for the serialized TRT engine cache;
///                        if empty, the ONNX file's own directory is used
/// @return true on success; false if the dictionary or engine could not be loaded
bool RTOCRRecognizer::Initialize(const std::string& onnxPath,
                                 const std::string& dictPath,
                                 int gpuId,
                                 const std::string& engineCacheDir) {
    try {
        // Load dictionary first
        keys_ = LoadDict(dictPath);
        if (keys_.size() < 2) {
            std::cerr << "[RTOCRRecognizer] Failed to load dictionary: " << dictPath << std::endl;
            return false;
        }
        std::cout << "[RTOCRRecognizer] Loaded dictionary with " << keys_.size()
                  << " characters from: " << dictPath << std::endl;

        ANSCENTER::Options options;
        options.deviceIndex = gpuId;
        options.precision = ANSCENTER::Precision::FP16;
        // maxBatch=4 matches FaceRecognizer / ALPR configuration — allows the
        // recognizer to process up to 4 detected text lines in one call,
        // amortizing per-invocation overhead while keeping TRT workspace small.
        options.maxBatchSize = 4;
        options.optBatchSize = 4;
        // Fixed height, dynamic width for recognition
        options.minInputHeight = imgH_;
        options.optInputHeight = imgH_;
        options.maxInputHeight = imgH_;
        options.minInputWidth = 32;
        options.optInputWidth = imgMaxW_;
        options.maxInputWidth = 960;

        if (!engineCacheDir.empty()) {
            options.engineFileDir = engineCacheDir;
        } else {
            // Default the engine cache next to the ONNX file.
            auto pos = onnxPath.find_last_of("/\\");
            options.engineFileDir = (pos != std::string::npos) ? onnxPath.substr(0, pos) : ".";
        }

        // NOTE(review): the static_cast target type was stripped in the garbled
        // source; int is the natural key type for a precision enum — confirm
        // against the pool-key declaration.
        m_poolKey = { onnxPath, static_cast<int>(options.precision), options.maxBatchSize };
        m_engine = EnginePoolManager::instance().acquire(
            m_poolKey, options, onnxPath, kRecSubVals, kRecDivVals, true,
            getPoolMaxSlotsPerGpu());
        m_usingSharedPool = (m_engine != nullptr);
        if (!m_engine) {
            std::cerr << "[RTOCRRecognizer] Failed to build/load TRT engine: " << onnxPath << std::endl;
            return false;
        }

        std::cout << "[RTOCRRecognizer] Initialized TRT engine from: " << onnxPath << std::endl;
        return true;
    } catch (const std::exception& e) {
        std::cerr << "[RTOCRRecognizer] Initialize failed: " << e.what() << std::endl;
        m_engine.reset();
        return false;
    }
}

/// Recognizes the text in a single cropped text-line image.
///
/// Thread-safe: serialized on _mutex. Returns a default-constructed TextLine
/// on any failure (uninitialized engine, empty input, inference error).
TextLine RTOCRRecognizer::Recognize(const cv::Mat& croppedImage) {
    std::lock_guard<std::mutex> lock(_mutex);
    if (!m_engine || croppedImage.empty() || keys_.empty()) {
        return {};
    }
    try {
        // Preprocess: resize to fixed height, proportional width
        cv::Mat resized = ResizeRecImage(croppedImage, imgH_, imgMaxW_);
        int resizedW = resized.cols;

        // Pad to at least kRecImgW width (matching official PaddleOCR behavior)
        // Official PaddleOCR pads with 0.0 in normalized space ≈ pixel value 128 (gray)
        int imgW = std::max(resizedW, kRecImgW);
        if (imgW > resizedW) {
            cv::Mat padded(imgH_, imgW, resized.type(), cv::Scalar(128, 128, 128));
            resized.copyTo(padded(cv::Rect(0, 0, resizedW, imgH_)));
            resized = padded;
        }

        // Upload to GPU (keep BGR order - PaddleOCR official does NOT convert BGR→RGB)
        cv::cuda::GpuMat gpuImg;
        gpuImg.upload(resized);

        // Run inference
        std::vector<std::vector<cv::cuda::GpuMat>> inputs = { { gpuImg } };
        std::vector<std::vector<std::vector<float>>> featureVectors;
        if (!m_engine->runInference(inputs, featureVectors)) {
            std::cerr << "[RTOCRRecognizer] Inference failed" << std::endl;
            return {};
        }
        if (featureVectors.empty() || featureVectors[0].empty() || featureVectors[0][0].empty()) {
            return {};
        }

        // Output shape: [1, seqLen, numClasses] flattened to [seqLen * numClasses]
        // IMPORTANT: The TRT engine output buffer is pre-allocated to MAX dimensions
        // (e.g. 120 timesteps for max width 960), but the actual inference produces
        // fewer timesteps for narrower images. We must use the ACTUAL seqLen
        // derived from the input width, not getOutputDims() which returns max dims.
        const std::vector<float>& output = featureVectors[0][0];

        // numClasses from dictionary size (keys_ includes blank at index 0)
        int numClasses = static_cast<int>(keys_.size());

        // Actual seqLen from input width: recognition model stride = 8
        // (confirmed: 960px input → 120 timesteps, 960/120 = 8)
        int seqLen = imgW / 8;

        // Sanity check: seqLen * numClasses must not exceed buffer size
        if (seqLen * numClasses > static_cast<int>(output.size())) {
            // Fallback: infer from buffer size
            seqLen = static_cast<int>(output.size()) / numClasses;
        }

        return CTCDecode(output.data(), seqLen, numClasses);
    } catch (const std::exception& e) {
        std::cerr << "[RTOCRRecognizer] Recognize failed: " << e.what() << std::endl;
        return {};
    }
}

/// Recognizes a batch of cropped text-line images sequentially.
///
/// Each crop has a different width, so images are fed to Recognize() one at a
/// time rather than batched into a single engine call.
std::vector<TextLine> RTOCRRecognizer::RecognizeBatch(const std::vector<cv::Mat>& croppedImages) {
    std::vector<TextLine> results;
    results.reserve(croppedImages.size());
    // Process one at a time (each image has different width)
    for (size_t i = 0; i < croppedImages.size(); i++) {
        results.push_back(Recognize(croppedImages[i]));
    }
    return results;
}

/// Greedy CTC decoding of the recognition head's output.
///
/// For each timestep, takes the argmax class; emits a character when the class
/// is neither blank (index 0) nor a repeat of the previous timestep's argmax.
/// The result score is the mean of the emitted characters' argmax values.
///
/// @param outputData  flattened [seqLen x numClasses] logits/probabilities
/// @param seqLen      number of valid timesteps in outputData
/// @param numClasses  classes per timestep (== keys_.size(), blank at index 0)
TextLine RTOCRRecognizer::CTCDecode(const float* outputData, int seqLen, int numClasses) {
    TextLine result;
    std::string text;
    std::vector<float> scores;
    int lastIndex = 0; // CTC blank is index 0
    for (int t = 0; t < seqLen; t++) {
        // Find argmax for this timestep
        int maxIndex = 0;
        float maxValue = -FLT_MAX;
        const float* timeStep = outputData + t * numClasses;
        for (int c = 0; c < numClasses; c++) {
            if (timeStep[c] > maxValue) {
                maxValue = timeStep[c];
                maxIndex = c;
            }
        }
        // CTC decode: skip blanks (index 0) and repeated characters
        if (maxIndex != 0 && maxIndex != lastIndex) {
            if (maxIndex > 0 && maxIndex < static_cast<int>(keys_.size())) {
                text += keys_[maxIndex]; // keys_[0]="#"(blank), keys_[1]=first_char, etc.
                // Use raw model output value as confidence (PaddleOCR v5 models include softmax)
                scores.push_back(maxValue);
            }
        }
        lastIndex = maxIndex;
    }
    result.text = text;
    if (!scores.empty()) {
        result.score = std::accumulate(scores.begin(), scores.end(), 0.0f) /
                       static_cast<float>(scores.size());
    }
    return result;
}

/// Releases the shared engine back to the pool (or just drops a private
/// engine). Destructors must not throw, so any exception is swallowed.
RTOCRRecognizer::~RTOCRRecognizer() {
    try {
        if (m_usingSharedPool) {
            EnginePoolManager::instance().release(m_poolKey);
            m_engine.reset();
            m_usingSharedPool = false;
        } else if (m_engine) {
            m_engine.reset();
        }
    } catch (...) {
    }
}

} // namespace rtocr
} // namespace ANSCENTER