Initial setup for CLion

2026-03-28 16:54:11 +11:00
parent 239cc02591
commit 7b4134133c
1136 changed files with 811916 additions and 0 deletions
--- a/ANSOCR/ANSRTOCR/RTOCRRecognizer.cpp
+++ b/ANSOCR/ANSRTOCR/RTOCRRecognizer.cpp
@@ -0,0 +1,206 @@
+#include "RTOCRRecognizer.h"
+
+#include <opencv2/imgproc.hpp>
+#include <opencv2/cudaimgproc.hpp>
+#include <opencv2/cudawarping.hpp>
+#include <opencv2/cudaarithm.hpp>
+#include <iostream>
+#include <algorithm>
+#include <numeric>
+#include <cmath>
+#include <cfloat>
+
+namespace ANSCENTER {
+namespace rtocr {
+
+/// Prepares the recognizer: reads the character dictionary, then obtains a
+/// TensorRT engine for the ONNX model through EnginePoolManager.
+///
+/// @param onnxPath        Path of the recognition model (ONNX).
+/// @param dictPath        Path of the dictionary handed to LoadDict().
+/// @param gpuId           Device index stored in the engine options.
+/// @param engineCacheDir  Value for options.engineFileDir. When empty, the parent
+///                        directory of @p onnxPath is used, or "." if the path
+///                        contains no separator.
+/// @return true if the dictionary holds at least two entries and an engine was
+///         acquired; false on any failure, including a caught std::exception.
+bool RTOCRRecognizer::Initialize(const std::string& onnxPath, const std::string& dictPath,
+                                  int gpuId, const std::string& engineCacheDir) {
+    try {
+        // The dictionary comes first: without it there is nothing to decode into.
+        keys_ = LoadDict(dictPath);
+        const bool dictionaryUsable = keys_.size() >= 2;
+        if (!dictionaryUsable) {
+            std::cerr << "[RTOCRRecognizer] Failed to load dictionary: " << dictPath << std::endl;
+            return false;
+        }
+        std::cout << "[RTOCRRecognizer] Loaded dictionary with " << keys_.size()
+                  << " characters from: " << dictPath << std::endl;
+
+        ANSCENTER::Options options;
+        options.deviceIndex = gpuId;
+        options.precision = ANSCENTER::Precision::FP16;
+        options.maxBatchSize = 1;
+        options.optBatchSize = 1;
+
+        // Input profile: the height is pinned to imgH_, only the width varies.
+        options.minInputHeight = imgH_;
+        options.optInputHeight = imgH_;
+        options.maxInputHeight = imgH_;
+        options.minInputWidth  = 32;
+        options.optInputWidth  = imgMaxW_;
+        options.maxInputWidth  = 960;
+
+        // Engine directory: the caller's choice wins, otherwise sit next to the model.
+        if (engineCacheDir.empty()) {
+            const std::string::size_type lastSeparator = onnxPath.find_last_of("/\\");
+            if (lastSeparator == std::string::npos) {
+                options.engineFileDir = std::string(".");
+            }
+            else {
+                options.engineFileDir = onnxPath.substr(0, lastSeparator);
+            }
+        }
+        else {
+            options.engineFileDir = engineCacheDir;
+        }
+
+        // The pool key is built from model path, precision and max batch size.
+        m_poolKey = { onnxPath,
+            static_cast<int>(options.precision),
+            options.maxBatchSize };
+        m_engine = EnginePoolManager<float>::instance().acquire(
+            m_poolKey, options, onnxPath,
+            kRecSubVals, kRecDivVals, true, -1);
+        m_usingSharedPool = (m_engine != nullptr);
+
+        if (m_usingSharedPool) {
+            std::cout << "[RTOCRRecognizer] Initialized TRT engine from: " << onnxPath << std::endl;
+            return true;
+        }
+
+        std::cerr << "[RTOCRRecognizer] Failed to build/load TRT engine: " << onnxPath << std::endl;
+        return false;
+    }
+    catch (const std::exception& e) {
+        std::cerr << "[RTOCRRecognizer] Initialize failed: " << e.what() << std::endl;
+        m_engine.reset();
+        return false;
+    }
+}
+
+/// Recognizes the text in a single cropped text-line image.
+///
+/// Holds _mutex for the whole call. The crop is resized via ResizeRecImage(),
+/// padded on the right with gray up to kRecImgW columns when narrower, uploaded
+/// to the GPU, run through the engine, and the result is passed to CTCDecode().
+///
+/// @param croppedImage  Image of one text line.
+/// @return The decoded TextLine, or a default-constructed TextLine when the
+///         recognizer is not initialized, the image is empty, inference fails,
+///         the engine yields no output, or a std::exception is caught.
+TextLine RTOCRRecognizer::Recognize(const cv::Mat& croppedImage) {
+    std::lock_guard<std::mutex> lock(_mutex);
+
+    const bool canRun = m_engine && !croppedImage.empty() && !keys_.empty();
+    if (!canRun) {
+        return {};
+    }
+
+    try {
+        // Scale to the fixed model height; the width follows from ResizeRecImage().
+        cv::Mat netInput = ResizeRecImage(croppedImage, imgH_, imgMaxW_);
+        const int contentW = netInput.cols;
+
+        // Narrow crops are widened to kRecImgW. Per the original author's note,
+        // official PaddleOCR pads with 0.0 in normalized space, which corresponds
+        // to roughly pixel value 128 (gray).
+        const int imgW = std::max(contentW, kRecImgW);
+        if (contentW < imgW) {
+            cv::Mat canvas(imgH_, imgW, netInput.type(), cv::Scalar(128, 128, 128));
+            netInput.copyTo(canvas(cv::Rect(0, 0, contentW, imgH_)));
+            netInput = canvas;
+        }
+
+        // Channel order is left as BGR on purpose (the original author notes that
+        // official PaddleOCR does not convert BGR→RGB).
+        cv::cuda::GpuMat deviceImage;
+        deviceImage.upload(netInput);
+
+        std::vector<std::vector<cv::cuda::GpuMat>> engineInputs = { { deviceImage } };
+        std::vector<std::vector<std::vector<float>>> engineOutputs;
+
+        const bool inferenceOk = m_engine->runInference(engineInputs, engineOutputs);
+        if (!inferenceOk) {
+            std::cerr << "[RTOCRRecognizer] Inference failed" << std::endl;
+            return {};
+        }
+
+        // Bail out unless there is a non-empty first output of the first batch item.
+        if (engineOutputs.empty()) {
+            return {};
+        }
+        if (engineOutputs[0].empty() || engineOutputs[0][0].empty()) {
+            return {};
+        }
+
+        // Layout: [1, seqLen, numClasses] flattened to seqLen * numClasses floats.
+        // NOTE: according to the original author, the engine's output buffer is
+        // sized for the maximum profile (e.g. 120 timesteps at width 960), while a
+        // narrower input fills fewer timesteps. The sequence length is therefore
+        // derived from the actual input width, not from the buffer size.
+        const std::vector<float>& logits = engineOutputs[0][0];
+
+        // One class per dictionary entry (keys_ carries the blank at index 0).
+        const int numClasses = static_cast<int>(keys_.size());
+
+        // Stride of the recognition model is 8 (960 px input → 120 timesteps).
+        int seqLen = imgW / 8;
+
+        // Never read past the buffer: shrink to what the buffer can hold.
+        const int bufferLen = static_cast<int>(logits.size());
+        if (seqLen * numClasses > bufferLen) {
+            seqLen = bufferLen / numClasses;
+        }
+
+        return CTCDecode(logits.data(), seqLen, numClasses);
+    }
+    catch (const std::exception& e) {
+        std::cerr << "[RTOCRRecognizer] Recognize failed: " << e.what() << std::endl;
+        return {};
+    }
+}
+
+/// Recognizes every image in @p croppedImages by calling Recognize() on each,
+/// in order. Images are handled one by one because their widths differ.
+///
+/// @param croppedImages  Cropped text-line images.
+/// @return One TextLine per input image, in the same order as the input.
+std::vector<TextLine> RTOCRRecognizer::RecognizeBatch(const std::vector<cv::Mat>& croppedImages) {
+    std::vector<TextLine> lines;
+    lines.reserve(croppedImages.size());
+
+    for (const cv::Mat& crop : croppedImages) {
+        lines.push_back(Recognize(crop));
+    }
+
+    return lines;
+}
+
+/// Greedy CTC decoding of the recognition output.
+///
+/// For each timestep the class with the largest value is selected. A selection
+/// emits a character only if it is not the blank (index 0), differs from the
+/// previous timestep's selection, and is a valid index into keys_.
+///
+/// @param outputData  Pointer to seqLen * numClasses floats, one row per timestep.
+/// @param seqLen      Number of timesteps to decode.
+/// @param numClasses  Number of values per timestep.
+/// @return TextLine whose text is the decoded string and whose score is the mean
+///         of the winning values of the emitted characters. The score is left at
+///         its default when no character is emitted.
+TextLine RTOCRRecognizer::CTCDecode(const float* outputData, int seqLen, int numClasses) {
+    const int dictSize = static_cast<int>(keys_.size());
+
+    std::string decoded;
+    float scoreSum = 0.0f;       // running sum of the winning values of emitted characters
+    std::size_t emitted = 0;     // how many characters were emitted
+    int previousClass = 0;       // starts at the CTC blank (index 0)
+
+    for (int step = 0; step < seqLen; ++step) {
+        const float* row = outputData + step * numClasses;
+
+        // Argmax over this timestep; ties keep the lowest index.
+        int bestClass = 0;
+        float bestValue = -FLT_MAX;
+        for (int cls = 0; cls < numClasses; ++cls) {
+            const float candidate = row[cls];
+            if (candidate > bestValue) {
+                bestValue = candidate;
+                bestClass = cls;
+            }
+        }
+
+        // Collapse rule: drop blanks and immediate repeats; ignore out-of-range classes.
+        const bool isBlank = (bestClass == 0);
+        const bool isRepeat = (bestClass == previousClass);
+        if (!isBlank && !isRepeat && bestClass < dictSize) {
+            decoded += keys_[bestClass]; // keys_[0]="#"(blank), keys_[1]=first_char, etc.
+            // The raw output value serves as the confidence (the original author
+            // notes that PaddleOCR v5 models already include softmax).
+            scoreSum += bestValue;
+            ++emitted;
+        }
+        previousClass = bestClass;
+    }
+
+    TextLine result;
+    result.text = decoded;
+    if (emitted != 0) {
+        result.score = scoreSum / static_cast<float>(emitted);
+    }
+    return result;
+}
+
+/// Releases the engine. If it was obtained from EnginePoolManager, the pool entry
+/// identified by m_poolKey is released first; then the local handle is dropped.
+/// Any exception is swallowed so that the destructor never throws.
+RTOCRRecognizer::~RTOCRRecognizer() {
+    try {
+        if (m_usingSharedPool) {
+            EnginePoolManager<float>::instance().release(m_poolKey);
+        }
+        // Resetting an already-empty handle is a no-op, so no null check is needed.
+        m_engine.reset();
+        m_usingSharedPool = false;
+    }
+    catch (...) {}
+}
+
+} // namespace rtocr
+} // namespace ANSCENTER