// ANSCORE/modules/ANSOCR/ANSRTOCR/RTOCRRecognizer.cpp
#include "RTOCRRecognizer.h"
#include <opencv2/imgproc.hpp>
#include <opencv2/cudaimgproc.hpp>
#include <opencv2/cudawarping.hpp>
#include <opencv2/cudaarithm.hpp>
#include <iostream>
#include <algorithm>
#include <numeric>
#include <cmath>
#include <cfloat>
namespace ANSCENTER {
namespace rtocr {
bool RTOCRRecognizer::Initialize(const std::string& onnxPath, const std::string& dictPath,
                                 int gpuId, const std::string& engineCacheDir) {
    // Loads the character dictionary, then acquires (builds or reuses) a
    // TensorRT engine from the shared pool, configured for fixed-height,
    // variable-width recognition input. Returns false on any failure.
    try {
        // The dictionary must be usable before touching the engine: the
        // decoder needs at least the blank symbol plus one real character.
        keys_ = LoadDict(dictPath);
        if (keys_.size() < 2) {
            std::cerr << "[RTOCRRecognizer] Failed to load dictionary: " << dictPath << std::endl;
            return false;
        }
        std::cout << "[RTOCRRecognizer] Loaded dictionary with " << keys_.size()
                  << " characters from: " << dictPath << std::endl;

        // Engine options: single-image batches, FP16, fixed input height,
        // dynamic width range for text lines of varying length.
        ANSCENTER::Options options;
        options.deviceIndex = gpuId;
        options.precision = ANSCENTER::Precision::FP16;
        options.maxBatchSize = 1;
        options.optBatchSize = 1;
        options.minInputHeight = imgH_;
        options.optInputHeight = imgH_;
        options.maxInputHeight = imgH_;
        options.minInputWidth = 32;
        options.optInputWidth = imgMaxW_;
        options.maxInputWidth = 960;

        // Cache the serialized engine beside the ONNX model unless the
        // caller supplied an explicit cache directory.
        if (engineCacheDir.empty()) {
            const auto sep = onnxPath.find_last_of("/\\");
            options.engineFileDir = (sep == std::string::npos) ? "." : onnxPath.substr(0, sep);
        }
        else {
            options.engineFileDir = engineCacheDir;
        }

        m_poolKey = { onnxPath,
                      static_cast<int>(options.precision),
                      options.maxBatchSize };
        m_engine = EnginePoolManager<float>::instance().acquire(
            m_poolKey, options, onnxPath,
            kRecSubVals, kRecDivVals, true, -1);
        // Remember pool ownership so the destructor releases our reference.
        m_usingSharedPool = (m_engine != nullptr);
        if (!m_engine) {
            std::cerr << "[RTOCRRecognizer] Failed to build/load TRT engine: " << onnxPath << std::endl;
            return false;
        }
        std::cout << "[RTOCRRecognizer] Initialized TRT engine from: " << onnxPath << std::endl;
        return true;
    }
    catch (const std::exception& e) {
        std::cerr << "[RTOCRRecognizer] Initialize failed: " << e.what() << std::endl;
        m_engine.reset();
        return false;
    }
}
TextLine RTOCRRecognizer::Recognize(const cv::Mat& croppedImage) {
    // Runs one cropped text line through the recognition engine and decodes
    // the CTC output. Returns an empty TextLine on any failure.
    std::lock_guard<std::mutex> lock(_mutex);
    if (!m_engine || croppedImage.empty() || keys_.empty()) {
        return {};
    }
    try {
        // Scale the crop to the model's fixed height, preserving aspect ratio.
        cv::Mat line = ResizeRecImage(croppedImage, imgH_, imgMaxW_);
        const int contentW = line.cols;

        // Right-pad narrow lines up to kRecImgW with mid-gray (128), which
        // normalizes to 0.0 — the same padding official PaddleOCR applies.
        const int inputW = std::max(contentW, kRecImgW);
        if (contentW < inputW) {
            cv::Mat canvas(imgH_, inputW, line.type(), cv::Scalar(128, 128, 128));
            line.copyTo(canvas(cv::Rect(0, 0, contentW, imgH_)));
            line = canvas;
        }

        // Transfer to the GPU as-is; channel order stays BGR because the
        // official PaddleOCR pipeline performs no BGR->RGB conversion.
        cv::cuda::GpuMat deviceImg;
        deviceImg.upload(line);

        std::vector<std::vector<cv::cuda::GpuMat>> batch = { { deviceImg } };
        std::vector<std::vector<std::vector<float>>> outputs;
        if (!m_engine->runInference(batch, outputs)) {
            std::cerr << "[RTOCRRecognizer] Inference failed" << std::endl;
            return {};
        }
        if (outputs.empty() || outputs[0].empty() || outputs[0][0].empty()) {
            return {};
        }

        // Output is [1, seqLen, numClasses] flattened. The TRT output buffer
        // is allocated for the MAX profile (e.g. 120 timesteps at 960px),
        // while the real sequence length follows the actual input width at
        // the model's stride of 8 px per timestep (960px -> 120 steps). So
        // derive seqLen from inputW, never from getOutputDims().
        const std::vector<float>& logits = outputs[0][0];
        // Class count comes from the dictionary (blank occupies index 0).
        const int numClasses = static_cast<int>(keys_.size());
        int seqLen = inputW / 8;
        if (seqLen * numClasses > static_cast<int>(logits.size())) {
            // Defensive fallback: derive timesteps from the buffer itself.
            seqLen = static_cast<int>(logits.size()) / numClasses;
        }
        return CTCDecode(logits.data(), seqLen, numClasses);
    }
    catch (const std::exception& e) {
        std::cerr << "[RTOCRRecognizer] Recognize failed: " << e.what() << std::endl;
        return {};
    }
}
std::vector<TextLine> RTOCRRecognizer::RecognizeBatch(const std::vector<cv::Mat>& croppedImages) {
    // Widths differ per crop, so each line is recognized individually
    // rather than packed into a single batched tensor.
    std::vector<TextLine> lines;
    lines.reserve(croppedImages.size());
    for (const cv::Mat& crop : croppedImages) {
        lines.push_back(Recognize(crop));
    }
    return lines;
}
TextLine RTOCRRecognizer::CTCDecode(const float* outputData, int seqLen, int numClasses) {
    // Greedy CTC decoding: take the argmax class per timestep, drop blanks
    // (index 0), and collapse consecutive repeats of the same class. The
    // final score is the mean of the kept per-character maxima.
    TextLine result;
    std::vector<float> charScores;
    int prevClass = 0; // index 0 is the CTC blank
    for (int t = 0; t < seqLen; t++) {
        const float* probs = outputData + t * numClasses;
        // Manual argmax keeping the FIRST maximum, matching a strict '>' scan.
        int cls = 0;
        float best = -FLT_MAX;
        for (int c = 0; c < numClasses; c++) {
            if (probs[c] > best) {
                best = probs[c];
                cls = c;
            }
        }
        // Emit only non-blank, non-repeated classes that map into keys_
        // (keys_[0] = "#" blank; real characters start at index 1).
        if (cls != 0 && cls != prevClass && cls < static_cast<int>(keys_.size())) {
            result.text += keys_[cls];
            // Raw model output doubles as confidence: PaddleOCR v5
            // recognition models already apply softmax in-graph.
            charScores.push_back(best);
        }
        prevClass = cls;
    }
    if (!charScores.empty()) {
        result.score = std::accumulate(charScores.begin(), charScores.end(), 0.0f) /
                       static_cast<float>(charScores.size());
    }
    return result;
}
RTOCRRecognizer::~RTOCRRecognizer() {
    try {
        if (m_usingSharedPool) {
            // Hand the shared engine back to the pool before dropping our ref.
            EnginePoolManager<float>::instance().release(m_poolKey);
            m_usingSharedPool = false;
        }
        // Safe on a null handle; releases our engine reference either way.
        m_engine.reset();
    }
    catch (...) {
        // Intentionally swallowed: destructors must not propagate exceptions.
    }
}
} // namespace rtocr
} // namespace ANSCENTER