Refactor project structure

This commit is contained in:
2026-03-28 19:56:39 +11:00
parent 1d267378b2
commit 8a2e721058
511 changed files with 59 additions and 48 deletions

View File

@@ -0,0 +1,151 @@
#include "PaddleOCRV5RTEngine.h"
#include <opencv2/imgproc.hpp>
#include <iostream>
namespace ANSCENTER {
namespace rtocr {
bool PaddleOCRV5RTEngine::Initialize(const std::string& detModelPath,
                                     const std::string& clsModelPath,
                                     const std::string& recModelPath,
                                     const std::string& dictPath,
                                     int gpuId,
                                     const std::string& engineCacheDir) {
    std::lock_guard<std::recursive_mutex> lock(_mutex);
    gpuId_ = gpuId;
    if (!engineCacheDir.empty()) engineCacheDir_ = engineCacheDir;
    try {
        // Detection model is mandatory: without boxes nothing else can run.
        detector_ = std::make_unique<RTOCRDetector>();
        const bool detOk =
            detector_->Initialize(detModelPath, gpuId_, engineCacheDir_, detMaxSideLen_);
        if (!detOk) {
            std::cerr << "[PaddleOCRV5RTEngine] Failed to initialize detector" << std::endl;
            return false;
        }
        // Orientation classifier is optional: a failure here only disables it
        // (classifier_ is reset to null and the pipeline proceeds without it).
        if (!clsModelPath.empty()) {
            classifier_ = std::make_unique<RTOCRClassifier>();
            if (!classifier_->Initialize(clsModelPath, gpuId_, engineCacheDir_)) {
                std::cerr << "[PaddleOCRV5RTEngine] Warning: Failed to initialize classifier, skipping"
                          << std::endl;
                classifier_.reset();
            }
        }
        // Recognition model is mandatory; apply the configured geometry first
        // so the engine profile is built for the right input sizes.
        recognizer_ = std::make_unique<RTOCRRecognizer>();
        recognizer_->SetRecImageHeight(recImgH_);
        recognizer_->SetRecImageMaxWidth(recImgMaxW_);
        if (!recognizer_->Initialize(recModelPath, dictPath, gpuId_, engineCacheDir_)) {
            std::cerr << "[PaddleOCRV5RTEngine] Failed to initialize recognizer" << std::endl;
            return false;
        }
        std::cout << "[PaddleOCRV5RTEngine] Initialized successfully"
                  << " (detector: yes, classifier: " << (classifier_ ? "yes" : "no")
                  << ", recognizer: yes)" << std::endl;
        return true;
    }
    catch (const std::exception& e) {
        std::cerr << "[PaddleOCRV5RTEngine] Initialize failed: " << e.what() << std::endl;
        return false;
    }
}
// Run the full OCR pipeline: detect -> crop -> [classify ->] recognize.
// Returns one OCRPredictResult per text box whose crop succeeded; the
// result vector is empty on failure or empty input. Serialized on _mutex.
std::vector<OCRPredictResult> PaddleOCRV5RTEngine::ocr(const cv::Mat& image) {
    std::lock_guard<std::recursive_mutex> lock(_mutex);
    std::vector<OCRPredictResult> results;
    if (!detector_ || !recognizer_ || image.empty()) return results;
    try {
        // 1. Detection: find text boxes.
        std::vector<TextBox> textBoxes = detector_->Detect(
            image, detMaxSideLen_, detDbThresh_, detBoxThresh_,
            detUnclipRatio_, useDilation_);
        if (textBoxes.empty()) return results;
        // 2. Crop text regions in a single pass, keeping boxes and crops
        //    aligned. (FIX: the previous code re-cropped EVERY region a
        //    second time whenever any single crop failed.)
        std::vector<TextBox> validBoxes;
        std::vector<cv::Mat> croppedImages;
        validBoxes.reserve(textBoxes.size());
        croppedImages.reserve(textBoxes.size());
        for (const TextBox& tb : textBoxes) {
            cv::Mat cropped = GetRotateCropImage(image, tb);
            if (cropped.empty()) continue; // drop boxes whose crop failed
            validBoxes.push_back(tb);
            croppedImages.push_back(cropped);
        }
        textBoxes.swap(validBoxes);
        if (croppedImages.empty()) return results;
        // 3. Classification (optional): check orientation, rotate 180° if needed.
        std::vector<int> clsLabels(croppedImages.size(), 0);
        std::vector<float> clsScores(croppedImages.size(), 0.0f);
        if (classifier_) {
            auto clsResults = classifier_->Classify(croppedImages, clsThresh_);
            for (size_t i = 0; i < clsResults.size() && i < croppedImages.size(); i++) {
                clsLabels[i] = clsResults[i].first;
                clsScores[i] = clsResults[i].second;
                // Rotate 180 degrees if label is odd and confidence is high enough.
                if (clsLabels[i] % 2 == 1 && clsScores[i] > clsThresh_) {
                    cv::rotate(croppedImages[i], croppedImages[i], cv::ROTATE_180);
                }
            }
        }
        // 4. Recognition: extract text from the cropped images.
        std::vector<TextLine> textLines = recognizer_->RecognizeBatch(croppedImages);
        // 5. Combine detection, classification and recognition per box.
        results.reserve(textBoxes.size());
        for (size_t i = 0; i < textBoxes.size(); i++) {
            OCRPredictResult res;
            // Convert the quad to [[x,y], ...] integer format.
            for (int j = 0; j < 4; j++) {
                res.box.push_back({
                    static_cast<int>(textBoxes[i].points[j].x),
                    static_cast<int>(textBoxes[i].points[j].y)
                });
            }
            if (i < textLines.size()) {
                res.text = textLines[i].text;
                res.score = textLines[i].score;
            }
            res.cls_label = clsLabels[i];
            res.cls_score = clsScores[i];
            results.push_back(res);
        }
        return results;
    }
    catch (const std::exception& e) {
        std::cerr << "[PaddleOCRV5RTEngine] OCR failed: " << e.what() << std::endl;
        return results;
    }
}
} // namespace rtocr
} // namespace ANSCENTER

View File

@@ -0,0 +1,67 @@
#pragma once
#include "RTOCRTypes.h"
#include "RTOCRDetector.h"
#include "RTOCRClassifier.h"
#include "RTOCRRecognizer.h"
#include <memory>
#include <mutex>
#include <string>
#include <vector>
#include "ANSLicense.h"
namespace ANSCENTER {
namespace rtocr {
// Full PaddleOCR v5 pipeline on TensorRT:
//   detection (DB) -> optional orientation classification -> recognition (CTC).
// Initialize() and ocr() serialize on an internal recursive mutex; the
// configuration setters below are NOT guarded — call them before concurrent use.
class PaddleOCRV5RTEngine {
public:
    PaddleOCRV5RTEngine() = default;
    ~PaddleOCRV5RTEngine() = default;
    // Non-copyable: owns engine wrappers and a mutex.
    PaddleOCRV5RTEngine(const PaddleOCRV5RTEngine&) = delete;
    PaddleOCRV5RTEngine& operator=(const PaddleOCRV5RTEngine&) = delete;
    // Initialize all components
    // clsModelPath can be empty to skip classifier
    // Returns false if the detector or recognizer fails; a classifier failure
    // only disables classification.
    bool Initialize(const std::string& detModelPath,
                    const std::string& clsModelPath,
                    const std::string& recModelPath,
                    const std::string& dictPath,
                    int gpuId = 0,
                    const std::string& engineCacheDir = "");
    // Run full OCR pipeline: detect → crop → [classify →] recognize
    // Returns one result per detected text box; empty on failure/empty input.
    std::vector<OCRPredictResult> ocr(const cv::Mat& image);
    // Configuration setters (apply before Initialize()/ocr() as appropriate)
    void SetDetMaxSideLen(int v) { detMaxSideLen_ = v; }      // detection resize cap
    void SetDetDbThresh(float v) { detDbThresh_ = v; }        // DB binarization threshold
    void SetDetBoxThresh(float v) { detBoxThresh_ = v; }      // box confidence threshold
    void SetDetUnclipRatio(float v) { detUnclipRatio_ = v; }  // box expansion ratio
    void SetClsThresh(float v) { clsThresh_ = v; }            // 180°-rotation confidence threshold
    void SetUseDilation(bool v) { useDilation_ = v; }         // dilate DB binary map
    void SetRecImageHeight(int v) { recImgH_ = v; }           // recognizer input height
    void SetRecImageMaxWidth(int v) { recImgMaxW_ = v; }      // recognizer max input width
    void SetGpuId(int v) { gpuId_ = v; }
    void SetEngineCacheDir(const std::string& v) { engineCacheDir_ = v; }
private:
    std::unique_ptr<RTOCRDetector> detector_;
    std::unique_ptr<RTOCRClassifier> classifier_; // optional
    std::unique_ptr<RTOCRRecognizer> recognizer_;
    // Configuration (defaults come from the kXxx constants in RTOCRTypes.h)
    int detMaxSideLen_ = kDetMaxSideLen;
    float detDbThresh_ = kDetDbThresh;
    float detBoxThresh_ = kDetBoxThresh;
    float detUnclipRatio_ = kDetUnclipRatio;
    float clsThresh_ = kClsThresh;
    bool useDilation_ = false;
    int recImgH_ = kRecImgH;
    int recImgMaxW_ = kRecImgMaxW;
    int gpuId_ = 0;
    std::string engineCacheDir_;
    std::recursive_mutex _mutex;
};
} // namespace rtocr
} // namespace ANSCENTER

View File

@@ -0,0 +1,143 @@
#include "RTOCRClassifier.h"
#include <opencv2/imgproc.hpp>
#include <opencv2/cudaimgproc.hpp>
#include <opencv2/cudawarping.hpp>
#include <opencv2/cudaarithm.hpp>
#include <iostream>
#include <cmath>
namespace ANSCENTER {
namespace rtocr {
bool RTOCRClassifier::Initialize(const std::string& onnxPath, int gpuId,
                                 const std::string& engineCacheDir) {
    try {
        // The classifier runs at a fixed input size, so min/opt/max
        // spatial dimensions are all identical in the TRT profile.
        ANSCENTER::Options options;
        options.deviceIndex = gpuId;
        options.precision = ANSCENTER::Precision::FP16;
        options.maxBatchSize = 1;
        options.optBatchSize = 1;
        options.minInputHeight = kClsImageH;
        options.optInputHeight = kClsImageH;
        options.maxInputHeight = kClsImageH;
        options.minInputWidth = kClsImageW;
        options.optInputWidth = kClsImageW;
        options.maxInputWidth = kClsImageW;
        // Engine cache: explicit override wins, otherwise cache next to the model.
        if (engineCacheDir.empty()) {
            const auto sep = onnxPath.find_last_of("/\\");
            options.engineFileDir = (sep == std::string::npos) ? "." : onnxPath.substr(0, sep);
        } else {
            options.engineFileDir = engineCacheDir;
        }
        // Acquire a (possibly shared) engine from the pool.
        m_poolKey = { onnxPath,
                      static_cast<int>(options.precision),
                      options.maxBatchSize };
        m_engine = EnginePoolManager<float>::instance().acquire(
            m_poolKey, options, onnxPath,
            kClsSubVals, kClsDivVals, true, -1);
        m_usingSharedPool = (m_engine != nullptr);
        if (!m_engine) {
            std::cerr << "[RTOCRClassifier] Failed to build/load TRT engine: " << onnxPath << std::endl;
            return false;
        }
        std::cout << "[RTOCRClassifier] Initialized TRT engine from: " << onnxPath << std::endl;
        return true;
    }
    catch (const std::exception& e) {
        std::cerr << "[RTOCRClassifier] Initialize failed: " << e.what() << std::endl;
        m_engine.reset();
        return false;
    }
}
// Classify text-line orientation for a batch of cropped images.
// Returns one (label, score) pair per input image, in input order; an empty
// or failed image yields {0, 0.0f} so the output always aligns with the input.
// NOTE(review): `clsThresh` is not used in this function — thresholding is
// applied by the caller (PaddleOCRV5RTEngine::ocr). Kept for interface
// stability; confirm before removing.
std::vector<std::pair<int, float>> RTOCRClassifier::Classify(
    const std::vector<cv::Mat>& images, float clsThresh) {
    std::lock_guard<std::mutex> lock(_mutex);
    std::vector<std::pair<int, float>> results;
    if (!m_engine || images.empty()) return results;
    results.reserve(images.size());
    for (size_t i = 0; i < images.size(); i++) {
        try {
            if (images[i].empty()) {
                results.push_back({ 0, 0.0f });
                continue;
            }
            // Preprocess: direct resize to 80x160 (PP-LCNet_x1_0_textline_ori)
            // No aspect ratio preservation — matches PaddleOCR official ResizeImage
            cv::Mat resized;
            cv::resize(images[i], resized, cv::Size(kClsImageW, kClsImageH));
            // Upload to GPU (keep BGR order - PaddleOCR official does NOT convert BGR→RGB)
            cv::cuda::GpuMat gpuImg;
            gpuImg.upload(resized);
            // Run inference (batch of 1 — the engine was built with maxBatchSize = 1)
            std::vector<std::vector<cv::cuda::GpuMat>> inputs = { { gpuImg } };
            std::vector<std::vector<std::vector<float>>> featureVectors;
            if (!m_engine->runInference(inputs, featureVectors)) {
                results.push_back({ 0, 0.0f });
                continue;
            }
            if (featureVectors.empty() || featureVectors[0].empty() ||
                featureVectors[0][0].empty()) {
                results.push_back({ 0, 0.0f });
                continue;
            }
            // Find argmax and use raw output value as score
            // PaddleOCR v5 models include softmax, so output values are probabilities
            // Matches PaddleOCR official: score = preds[i, argmax_idx]
            const std::vector<float>& output = featureVectors[0][0];
            int numClasses = static_cast<int>(output.size());
            int bestIdx = 0;
            float bestScore = output[0];
            for (int c = 1; c < numClasses; c++) {
                if (output[c] > bestScore) {
                    bestScore = output[c];
                    bestIdx = c;
                }
            }
            results.push_back({ bestIdx, bestScore });
        }
        catch (const std::exception& e) {
            // A per-image failure must not abort the batch; emit a neutral result
            // so indices stay aligned with the caller's crop list.
            std::cerr << "[RTOCRClassifier] Classify failed for image " << i
                      << ": " << e.what() << std::endl;
            results.push_back({ 0, 0.0f });
        }
    }
    return results;
}
RTOCRClassifier::~RTOCRClassifier() {
    try {
        // Hand our reference back to the shared pool (if we hold one),
        // then drop the local engine pointer. reset() on a null engine
        // is a harmless no-op, so no extra branch is needed.
        if (m_usingSharedPool) {
            EnginePoolManager<float>::instance().release(m_poolKey);
            m_usingSharedPool = false;
        }
        m_engine.reset();
    }
    catch (...) {
        // Never let an exception escape a destructor.
    }
}
} // namespace rtocr
} // namespace ANSCENTER

View File

@@ -0,0 +1,36 @@
#pragma once
#include "RTOCRTypes.h"
#include "engine.h"
#include "engine/EnginePoolManager.h"
#include <memory>
#include <mutex>
namespace ANSCENTER {
namespace rtocr {
// Text-line orientation classifier on TensorRT.
// The engine is acquired from the shared EnginePoolManager, so multiple
// instances using the same model/precision share one TRT engine.
class RTOCRClassifier {
public:
    RTOCRClassifier() = default;
    ~RTOCRClassifier();  // releases the pooled engine reference
    RTOCRClassifier(const RTOCRClassifier&) = delete;
    RTOCRClassifier& operator=(const RTOCRClassifier&) = delete;
    // Build or load the TRT engine from the ONNX model.
    // engineCacheDir = "" caches next to the model file. Returns false on failure.
    bool Initialize(const std::string& onnxPath, int gpuId = 0,
                    const std::string& engineCacheDir = "");
    // Classify a batch of text images
    // Returns vector of (cls_label, cls_score) per image
    // cls_label: 0 = normal, 1 = rotated 180 degrees
    std::vector<std::pair<int, float>> Classify(
        const std::vector<cv::Mat>& images, float clsThresh = kClsThresh);
private:
    std::shared_ptr<Engine<float>> m_engine = nullptr;
    EnginePoolManager<float>::PoolKey m_poolKey;   // identifies our pooled engine
    bool m_usingSharedPool = false;                // true once acquire() succeeded
    std::mutex _mutex;                             // serializes Classify()
};
} // namespace rtocr
} // namespace ANSCENTER

View File

@@ -0,0 +1,403 @@
#include "RTOCRDetector.h"
#include "include/clipper.h"
#include "NV12PreprocessHelper.h"
#include "ANSGpuFrameRegistry.h"
#include <cuda_runtime.h>
#include <opencv2/imgproc.hpp>
// NV12→BGR fused resize via NV12PreprocessHelper (linked from ANSODEngine.dll)
#include <opencv2/cudaimgproc.hpp>
#include <opencv2/cudawarping.hpp>
#include <opencv2/cudaarithm.hpp>
#include <iostream>
#include <algorithm>
#include <cmath>
namespace ANSCENTER {
namespace rtocr {
bool RTOCRDetector::Initialize(const std::string& onnxPath, int gpuId,
                               const std::string& engineCacheDir,
                               int maxSideLen) {
    // Resolve engine cache directory: explicit override wins, otherwise
    // cache next to the ONNX model.
    std::string cacheDir = engineCacheDir;
    if (cacheDir.empty()) {
        const auto sep = onnxPath.find_last_of("/\\");
        cacheDir = (sep == std::string::npos) ? "." : onnxPath.substr(0, sep);
    }
    try {
        ANSCENTER::Options options;
        options.deviceIndex = gpuId;
        // FP32 is required for detection: this DBNet produces NaN under FP16.
        // The model's 142 Convolution + 87 Scale (fused BatchNorm) layers
        // produce intermediates that overflow the FP16 range (65504), and the
        // overflow originates deep in the conv->scale->relu backbone, so mixed
        // precision (forcing only Sigmoid/Softmax to FP32) is insufficient.
        // Classifier and recognizer stay FP16 — only the detector needs FP32.
        options.precision = ANSCENTER::Precision::FP32;
        options.maxBatchSize = 1;
        options.optBatchSize = 1;
        // Dynamic spatial dimensions for detection (multiples of 32).
        options.minInputHeight = 32;
        options.minInputWidth = 32;
        options.optInputHeight = std::min(640, maxSideLen);
        options.optInputWidth = std::min(640, maxSideLen);
        options.maxInputHeight = maxSideLen;
        options.maxInputWidth = maxSideLen;
        options.engineFileDir = cacheDir;
        m_poolKey = { onnxPath,
                      static_cast<int>(options.precision),
                      options.maxBatchSize };
        m_engine = EnginePoolManager<float>::instance().acquire(
            m_poolKey, options, onnxPath,
            kDetSubVals, kDetDivVals, true, -1);
        m_usingSharedPool = (m_engine != nullptr);
        if (!m_engine) {
            std::cerr << "[RTOCRDetector] Failed to build/load TRT engine for: "
                      << onnxPath << std::endl;
            return false;
        }
        // The loaded engine may carry a smaller profile than requested
        // (e.g. a cached engine built on a smaller GPU) — query the real one.
        const int profMaxH = m_engine->getProfileMaxHeight();
        const int profMaxW = m_engine->getProfileMaxWidth();
        m_engineMaxSideLen = (profMaxH > 0 && profMaxW > 0)
                                 ? std::min(profMaxH, profMaxW)
                                 : maxSideLen;
        if (m_engineMaxSideLen < maxSideLen) {
            std::cout << "[RTOCRDetector] Engine built with max " << m_engineMaxSideLen
                      << "x" << m_engineMaxSideLen << " (requested " << maxSideLen
                      << " exceeded GPU capacity)" << std::endl;
        }
        std::cout << "[RTOCRDetector] Initialized TRT engine from: " << onnxPath << std::endl;
        return true;
    }
    catch (const std::exception& e) {
        std::cerr << "[RTOCRDetector] Initialize failed: " << e.what() << std::endl;
        return false;
    }
}
// Detect text boxes in `image` using the DB detector.
// Returns boxes in original-image coordinates, sorted by SortTextBoxes;
// empty on failure. Serialized on _mutex.
std::vector<TextBox> RTOCRDetector::Detect(const cv::Mat& image,
                                           int maxSideLen, float dbThresh,
                                           float boxThresh, float unclipRatio,
                                           bool useDilation) {
    std::lock_guard<std::mutex> lock(_mutex);
    if (!m_engine || image.empty()) return {};
    try {
        // Single-pass detection: resize the full image to fit within
        // the engine's max spatial dimension (same approach as ONNX version).
        int effectiveMaxSide = std::min(maxSideLen, m_engineMaxSideLen);
        // 1. Compute resize dimensions (multiples of 32).
        cv::Size resizeShape = ComputeDetResizeShape(image.rows, image.cols, effectiveMaxSide);
        int newH = resizeShape.height;
        int newW = resizeShape.width;
        float ratioH = static_cast<float>(image.rows) / newH;
        float ratioW = static_cast<float>(image.cols) / newW;
        // 2. Upload to GPU and resize — try the NV12 fast path first.
        cv::cuda::GpuMat gpuResized;
        bool usedNV12 = false;
        GpuFrameData* gpuFrame = tl_currentGpuFrame();
        // pixelFormat == 23: presumably NV12 in the producer's enum — confirm.
        if (gpuFrame && gpuFrame->pixelFormat == 23 &&
            gpuFrame->cpuYPlane && gpuFrame->cpuUvPlane &&
            gpuFrame->width > 0 && gpuFrame->height > 0) {
            // NV12 fast path: fused NV12→BGR+resize kernel (1 kernel launch)
            // instead of CPU BGR upload + separate resize.
            int fW = gpuFrame->width;
            int fH = gpuFrame->height;
            int gpuIdx = m_engine ? m_engine->getOptions().deviceIndex : 0;
            // Get NV12 Y/UV pointers on GPU (from cache or fresh upload).
            const uint8_t* devY = nullptr;
            const uint8_t* devUV = nullptr;
            int yPitch = 0, uvPitch = 0;
            {
                auto regLock = ANSGpuFrameRegistry::instance().acquire_lock();
                if (gpuFrame->gpuCacheValid && gpuFrame->gpuCacheDeviceIdx == gpuIdx) {
                    // Cache hit: reuse the previously uploaded planes.
                    devY = static_cast<const uint8_t*>(gpuFrame->gpuCacheY);
                    devUV = static_cast<const uint8_t*>(gpuFrame->gpuCacheUV);
                    yPitch = static_cast<int>(gpuFrame->gpuCacheYPitch);
                    uvPitch = static_cast<int>(gpuFrame->gpuCacheUVPitch);
                } else if (!gpuFrame->gpuCacheValid) {
                    // Cache miss — upload CPU NV12 planes to the GPU.
                    size_t yBytes = static_cast<size_t>(fH) * gpuFrame->cpuYLinesize;
                    size_t uvBytes = static_cast<size_t>(fH / 2) * gpuFrame->cpuUvLinesize;
                    auto& reg = ANSGpuFrameRegistry::instance();
                    if (reg.canAllocateGpuCache(yBytes + uvBytes)) {
                        // BUGFIX: check CUDA return codes. The previous code
                        // ignored cudaMalloc/cudaMemcpy failures and could mark
                        // the cache valid while holding undefined device
                        // pointers. On any failure we free partial allocations,
                        // leave the cache invalid, and fall back to BGR upload.
                        void* dY = nullptr;
                        void* dUV = nullptr;
                        const bool uploaded =
                            cudaMalloc(&dY, yBytes) == cudaSuccess &&
                            cudaMalloc(&dUV, uvBytes) == cudaSuccess &&
                            cudaMemcpy(dY, gpuFrame->cpuYPlane, yBytes, cudaMemcpyHostToDevice) == cudaSuccess &&
                            cudaMemcpy(dUV, gpuFrame->cpuUvPlane, uvBytes, cudaMemcpyHostToDevice) == cudaSuccess;
                        if (uploaded) {
                            gpuFrame->gpuCacheY = dY;
                            gpuFrame->gpuCacheUV = dUV;
                            gpuFrame->gpuCacheValid = true;
                            gpuFrame->gpuCacheDeviceIdx = gpuIdx;
                            gpuFrame->gpuCacheYPitch = static_cast<size_t>(gpuFrame->cpuYLinesize);
                            gpuFrame->gpuCacheUVPitch = static_cast<size_t>(gpuFrame->cpuUvLinesize);
                            gpuFrame->gpuCacheBytes = yBytes + uvBytes;
                            reg.onGpuCacheCreated(yBytes + uvBytes);
                            devY = static_cast<const uint8_t*>(dY);
                            devUV = static_cast<const uint8_t*>(dUV);
                            yPitch = gpuFrame->cpuYLinesize;
                            uvPitch = gpuFrame->cpuUvLinesize;
                        } else {
                            if (dY) cudaFree(dY);
                            if (dUV) cudaFree(dUV);
                        }
                    }
                }
            } // release registry lock before launching the GPU kernel
            if (devY && devUV) {
                // Single fused kernel: NV12→BGR + bilinear resize (1 launch, 1 output alloc).
                gpuResized.create(newH, newW, CV_8UC3);
                NV12PreprocessHelper::nv12ToBGRResize(
                    devY, yPitch, devUV, uvPitch,
                    gpuResized.ptr<uint8_t>(), static_cast<int>(gpuResized.step),
                    newW, newH, fW, fH);
                usedNV12 = true;
                // Update ratios to map from full-res NV12 to detection output.
                // NOTE(review): the final clamp below uses image.cols/rows;
                // this assumes the NV12 frame has the same dimensions as
                // `image` — confirm against the frame producer.
                ratioH = static_cast<float>(fH) / newH;
                ratioW = static_cast<float>(fW) / newW;
            }
        }
        if (!usedNV12) {
            // Fallback: standard BGR upload + GPU resize.
            cv::cuda::GpuMat gpuImg;
            gpuImg.upload(image);
            cv::cuda::resize(gpuImg, gpuResized, resizeShape);
        }
        // Keep BGR order (PaddleOCR official does NOT convert BGR->RGB).
        // 3. Run inference.
        std::vector<std::vector<cv::cuda::GpuMat>> inputs = { { gpuResized } };
        std::vector<std::vector<std::vector<float>>> featureVectors;
        if (!m_engine->runInference(inputs, featureVectors)) {
            std::cerr << "[RTOCRDetector] Inference failed" << std::endl;
            return {};
        }
        if (featureVectors.empty() || featureVectors[0].empty()) return {};
        // 4. Reshape output to probability map [H, W].
        std::vector<float>& output = featureVectors[0][0];
        int outputSize = static_cast<int>(output.size());
        if (outputSize < newH * newW) {
            std::cerr << "[RTOCRDetector] Output too small: expected at least "
                      << newH * newW << " got " << outputSize << std::endl;
            return {};
        }
        cv::Mat bitmap(newH, newW, CV_32FC1, output.data());
        // 5. Threshold to binary (matches ONNX/PaddleOCR official order).
        cv::Mat binaryMap;
        cv::threshold(bitmap, binaryMap, dbThresh, 255, cv::THRESH_BINARY);
        binaryMap.convertTo(binaryMap, CV_8UC1);
        // 6. Apply dilation if requested (on binaryMap, matching ONNX version).
        if (useDilation) {
            cv::Mat kernel = cv::getStructuringElement(cv::MORPH_RECT, cv::Size(2, 2));
            cv::dilate(binaryMap, binaryMap, kernel);
        }
        // 7. Find contours and build text boxes
        //    (matches ONNX/PaddleOCR official DBPostProcess.boxes_from_bitmap flow).
        std::vector<std::vector<cv::Point>> contours;
        cv::findContours(binaryMap, contours, cv::RETR_LIST, cv::CHAIN_APPROX_SIMPLE);
        int numCandidates = std::min(static_cast<int>(contours.size()), kDetMaxCandidates);
        std::vector<TextBox> boxes;
        for (int i = 0; i < numCandidates; i++) {
            if (contours[i].size() < 4) continue;
            // Step 1: GetMiniBoxes - ordered 4 corners of the min-area rect.
            cv::RotatedRect minRect = cv::minAreaRect(contours[i]);
            float sside = std::min(minRect.size.width, minRect.size.height);
            if (sside < 3.0f) continue;
            auto ordered = GetMiniBoxes(minRect);
            // Step 2: BoxScoreFast - mean prob inside the 4-point box polygon.
            float score = BoxScoreFast(bitmap, ordered);
            if (score < boxThresh) continue;
            // Step 3: UnclipPolygon - expand the 4-point box.
            auto expanded = UnclipPolygon(ordered, unclipRatio);
            if (expanded.size() < 4) continue;
            // Step 4: Re-compute GetMiniBoxes on the expanded polygon.
            std::vector<cv::Point> expandedInt;
            expandedInt.reserve(expanded.size());
            for (auto& p : expanded) {
                expandedInt.push_back(cv::Point(static_cast<int>(p.x), static_cast<int>(p.y)));
            }
            cv::RotatedRect expandedRect = cv::minAreaRect(expandedInt);
            // Filter by min_size + 2 = 5 (matches PaddleOCR official).
            float expandedSside = std::min(expandedRect.size.width, expandedRect.size.height);
            if (expandedSside < 5.0f) continue;
            auto expandedOrdered = GetMiniBoxes(expandedRect);
            // Step 5: Scale to original image coordinates.
            TextBox box;
            for (int j = 0; j < 4; j++) {
                box.points[j].x = std::clamp(expandedOrdered[j].x * ratioW, 0.0f, static_cast<float>(image.cols - 1));
                box.points[j].y = std::clamp(expandedOrdered[j].y * ratioH, 0.0f, static_cast<float>(image.rows - 1));
            }
            box.score = score;
            boxes.push_back(box);
        }
        SortTextBoxes(boxes);
        return boxes;
    }
    catch (const std::exception& e) {
        std::cerr << "[RTOCRDetector] Detect failed: " << e.what() << std::endl;
        return {};
    }
}
// Matches PaddleOCR official GetMiniBoxes: take the 4 corners of a rotated
// rect and return them ordered [top-left, top-right, bottom-right, bottom-left].
std::array<cv::Point2f, 4> RTOCRDetector::GetMiniBoxes(const cv::RotatedRect& rect) {
    cv::Point2f pts[4];
    rect.points(pts);
    // Sort corners by x so pts[0..1] form the left pair and pts[2..3] the right pair.
    std::sort(std::begin(pts), std::end(pts),
              [](const cv::Point2f& lhs, const cv::Point2f& rhs) { return lhs.x < rhs.x; });
    // Within each pair, the smaller y is the top corner.
    const bool leftSwap = pts[0].y > pts[1].y;
    const bool rightSwap = pts[2].y > pts[3].y;
    const cv::Point2f topLeft = leftSwap ? pts[1] : pts[0];
    const cv::Point2f bottomLeft = leftSwap ? pts[0] : pts[1];
    const cv::Point2f topRight = rightSwap ? pts[3] : pts[2];
    const cv::Point2f bottomRight = rightSwap ? pts[2] : pts[3];
    // Clockwise from top-left: [TL, TR, BR, BL].
    return { topLeft, topRight, bottomRight, bottomLeft };
}
// Matches PaddleOCR official box_score_fast: mean probability inside the
// 4-point polygon, computed over the polygon's clamped bounding rectangle.
float RTOCRDetector::BoxScoreFast(const cv::Mat& probMap,
                                  const std::array<cv::Point2f, 4>& box) {
    const int h = probMap.rows;
    const int w = probMap.cols;
    // Axis-aligned bounds of the quad.
    float minX = box[0].x, maxX = box[0].x;
    float minY = box[0].y, maxY = box[0].y;
    for (int i = 1; i < 4; i++) {
        minX = std::min(minX, box[i].x);
        maxX = std::max(maxX, box[i].x);
        minY = std::min(minY, box[i].y);
        maxY = std::max(maxY, box[i].y);
    }
    // Clamp the bounding rect to the probability map (matches PaddleOCR official).
    const int xmin = std::clamp(static_cast<int>(std::floor(minX)), 0, w - 1);
    const int xmax = std::clamp(static_cast<int>(std::ceil(maxX)), 0, w - 1);
    const int ymin = std::clamp(static_cast<int>(std::floor(minY)), 0, h - 1);
    const int ymax = std::clamp(static_cast<int>(std::ceil(maxY)), 0, h - 1);
    if (xmin >= xmax || ymin >= ymax) return 0.0f;
    // Rasterize the quad into a mask local to the bounding rect.
    cv::Mat mask = cv::Mat::zeros(ymax - ymin + 1, xmax - xmin + 1, CV_8UC1);
    std::vector<cv::Point> localQuad(4);
    for (int j = 0; j < 4; j++) {
        localQuad[j] = cv::Point(static_cast<int>(box[j].x) - xmin,
                                 static_cast<int>(box[j].y) - ymin);
    }
    std::vector<std::vector<cv::Point>> polys = { localQuad };
    cv::fillPoly(mask, polys, cv::Scalar(1));
    // Mean of probMap restricted to the masked region.
    cv::Mat roiMap = probMap(cv::Rect(xmin, ymin, xmax - xmin + 1, ymax - ymin + 1));
    return static_cast<float>(cv::mean(roiMap, mask)[0]);
}
// Matches PaddleOCR official unclip: expand the 4-point box with Clipper
// (jtRound) by distance = area * unclipRatio / perimeter. Clipper works on
// integer coordinates, matching the PaddleOCR/ONNX version exactly.
std::vector<cv::Point2f> RTOCRDetector::UnclipPolygon(const std::array<cv::Point2f, 4>& box,
                                                      float unclipRatio) {
    // Shoelace area (twice) and perimeter of the quad in one pass.
    float twiceArea = 0.0f;
    float perimeter = 0.0f;
    for (int i = 0; i < 4; i++) {
        const cv::Point2f& a = box[i];
        const cv::Point2f& b = box[(i + 1) % 4];
        twiceArea += a.x * b.y - b.x * a.y;
        const float dx = b.x - a.x;
        const float dy = b.y - a.y;
        perimeter += std::sqrt(dx * dx + dy * dy);
    }
    const float area = std::abs(twiceArea) * 0.5f;
    if (perimeter < 1.0f) return {};  // degenerate quad — nothing to expand
    const float distance = area * unclipRatio / perimeter;
    // Offset the quad outward via Clipper.
    ClipperLib::Path path;
    for (const auto& p : box) {
        path.push_back({ static_cast<ClipperLib::cInt>(p.x),
                         static_cast<ClipperLib::cInt>(p.y) });
    }
    ClipperLib::ClipperOffset offset;
    offset.AddPath(path, ClipperLib::jtRound, ClipperLib::etClosedPolygon);
    ClipperLib::Paths solution;
    offset.Execute(solution, distance);
    if (solution.empty() || solution[0].empty()) return {};
    std::vector<cv::Point2f> expanded;
    expanded.reserve(solution[0].size());
    for (const auto& p : solution[0]) {
        expanded.emplace_back(static_cast<float>(p.X), static_cast<float>(p.Y));
    }
    return expanded;
}
RTOCRDetector::~RTOCRDetector() {
    try {
        // Return our reference to the shared pool first (if held), then drop
        // the engine pointer. reset() on a null shared_ptr is a no-op, so
        // the original's separate else-branch collapses away.
        if (m_usingSharedPool) {
            EnginePoolManager<float>::instance().release(m_poolKey);
            m_usingSharedPool = false;
        }
        m_engine.reset();
    }
    catch (...) {
        // Destructors must not throw.
    }
}
} // namespace rtocr
} // namespace ANSCENTER

View File

@@ -0,0 +1,44 @@
#pragma once
#include "RTOCRTypes.h"
#include "engine.h"
#include "engine/EnginePoolManager.h"
#include <memory>
#include <mutex>
namespace ANSCENTER {
namespace rtocr {
// DB-style text detector on TensorRT. The engine is acquired from the shared
// EnginePoolManager; Detect() is serialized on an internal mutex.
class RTOCRDetector {
public:
    RTOCRDetector() = default;
    ~RTOCRDetector();  // releases the pooled engine reference
    RTOCRDetector(const RTOCRDetector&) = delete;
    RTOCRDetector& operator=(const RTOCRDetector&) = delete;
    // Build or load the TRT engine (dynamic spatial profile up to maxSideLen).
    // engineCacheDir = "" caches next to the model file. Returns false on failure.
    bool Initialize(const std::string& onnxPath, int gpuId = 0,
                    const std::string& engineCacheDir = "",
                    int maxSideLen = kDetMaxSideLen);
    // Detect text boxes; coordinates are returned in original-image space.
    // dbThresh binarizes the probability map, boxThresh filters boxes by mean
    // probability, unclipRatio controls box expansion.
    std::vector<TextBox> Detect(const cv::Mat& image,
                                int maxSideLen = kDetMaxSideLen,
                                float dbThresh = kDetDbThresh,
                                float boxThresh = kDetBoxThresh,
                                float unclipRatio = kDetUnclipRatio,
                                bool useDilation = false);
private:
    // Postprocessing helpers (matches ONNX/PaddleOCR official flow exactly)
    std::array<cv::Point2f, 4> GetMiniBoxes(const cv::RotatedRect& rect);
    float BoxScoreFast(const cv::Mat& probMap, const std::array<cv::Point2f, 4>& box);
    std::vector<cv::Point2f> UnclipPolygon(const std::array<cv::Point2f, 4>& box, float unclipRatio);
    std::shared_ptr<Engine<float>> m_engine = nullptr;
    EnginePoolManager<float>::PoolKey m_poolKey;   // identifies our pooled engine
    bool m_usingSharedPool = false;                // true once acquire() succeeded
    int m_engineMaxSideLen = kDetMaxSideLen; // Actual TRT engine max spatial dim
    std::mutex _mutex;                             // serializes Detect()
};
} // namespace rtocr
} // namespace ANSCENTER

View File

@@ -0,0 +1,206 @@
#include "RTOCRRecognizer.h"
#include <opencv2/imgproc.hpp>
#include <opencv2/cudaimgproc.hpp>
#include <opencv2/cudawarping.hpp>
#include <opencv2/cudaarithm.hpp>
#include <iostream>
#include <algorithm>
#include <numeric>
#include <cmath>
#include <cfloat>
namespace ANSCENTER {
namespace rtocr {
// Load the character dictionary and build/load the recognition TRT engine.
// Returns false if the dictionary is unusable or the engine cannot be
// acquired. Call SetRecImageHeight/SetRecImageMaxWidth BEFORE Initialize so
// the engine profile matches the configured geometry.
bool RTOCRRecognizer::Initialize(const std::string& onnxPath, const std::string& dictPath,
                                 int gpuId, const std::string& engineCacheDir) {
    try {
        // Load dictionary first — recognition output is meaningless without it.
        keys_ = LoadDict(dictPath);
        if (keys_.size() < 2) {
            std::cerr << "[RTOCRRecognizer] Failed to load dictionary: " << dictPath << std::endl;
            return false;
        }
        std::cout << "[RTOCRRecognizer] Loaded dictionary with " << keys_.size()
                  << " characters from: " << dictPath << std::endl;
        ANSCENTER::Options options;
        options.deviceIndex = gpuId;
        options.precision = ANSCENTER::Precision::FP16;
        options.maxBatchSize = 1;
        options.optBatchSize = 1;
        // Fixed height, dynamic width for recognition.
        options.minInputHeight = imgH_;
        options.optInputHeight = imgH_;
        options.maxInputHeight = imgH_;
        options.minInputWidth = 32;
        options.optInputWidth = imgMaxW_;
        // BUGFIX: the profile max width was hard-coded to 960, so a caller
        // raising imgMaxW_ via SetRecImageMaxWidth() above 960 would build an
        // engine whose profile cannot accept its own resized inputs. Keep 960
        // as the floor (backward compatible) but honor a larger imgMaxW_.
        options.maxInputWidth = std::max(960, imgMaxW_);
        // Engine cache: explicit override, otherwise next to the model file.
        if (!engineCacheDir.empty()) {
            options.engineFileDir = engineCacheDir;
        }
        else {
            auto pos = onnxPath.find_last_of("/\\");
            options.engineFileDir = (pos != std::string::npos) ? onnxPath.substr(0, pos) : ".";
        }
        m_poolKey = { onnxPath,
                      static_cast<int>(options.precision),
                      options.maxBatchSize };
        m_engine = EnginePoolManager<float>::instance().acquire(
            m_poolKey, options, onnxPath,
            kRecSubVals, kRecDivVals, true, -1);
        m_usingSharedPool = (m_engine != nullptr);
        if (!m_engine) {
            std::cerr << "[RTOCRRecognizer] Failed to build/load TRT engine: " << onnxPath << std::endl;
            return false;
        }
        std::cout << "[RTOCRRecognizer] Initialized TRT engine from: " << onnxPath << std::endl;
        return true;
    }
    catch (const std::exception& e) {
        std::cerr << "[RTOCRRecognizer] Initialize failed: " << e.what() << std::endl;
        m_engine.reset();
        return false;
    }
}
// Recognize the text in one pre-cropped (and orientation-corrected) line image.
// Returns a default-constructed TextLine (empty text, zero score) on any failure.
TextLine RTOCRRecognizer::Recognize(const cv::Mat& croppedImage) {
    std::lock_guard<std::mutex> lock(_mutex);
    if (!m_engine || croppedImage.empty() || keys_.empty()) {
        return {};
    }
    try {
        // Preprocess: resize to fixed height, proportional width
        cv::Mat resized = ResizeRecImage(croppedImage, imgH_, imgMaxW_);
        int resizedW = resized.cols;
        // Pad to at least kRecImgW width (matching official PaddleOCR behavior)
        // Official PaddleOCR pads with 0.0 in normalized space ≈ pixel value 128 (gray)
        int imgW = std::max(resizedW, kRecImgW);
        if (imgW > resizedW) {
            cv::Mat padded(imgH_, imgW, resized.type(), cv::Scalar(128, 128, 128));
            resized.copyTo(padded(cv::Rect(0, 0, resizedW, imgH_)));
            resized = padded;
        }
        // Upload to GPU (keep BGR order - PaddleOCR official does NOT convert BGR→RGB)
        cv::cuda::GpuMat gpuImg;
        gpuImg.upload(resized);
        // Run inference (batch of 1 — the engine was built with maxBatchSize = 1)
        std::vector<std::vector<cv::cuda::GpuMat>> inputs = { { gpuImg } };
        std::vector<std::vector<std::vector<float>>> featureVectors;
        if (!m_engine->runInference(inputs, featureVectors)) {
            std::cerr << "[RTOCRRecognizer] Inference failed" << std::endl;
            return {};
        }
        if (featureVectors.empty() || featureVectors[0].empty() ||
            featureVectors[0][0].empty()) {
            return {};
        }
        // Output shape: [1, seqLen, numClasses] flattened to [seqLen * numClasses]
        // IMPORTANT: The TRT engine output buffer is pre-allocated to MAX dimensions
        // (e.g. 120 timesteps for max width 960), but the actual inference produces
        // fewer timesteps for narrower images. We must use the ACTUAL seqLen
        // derived from the input width, not getOutputDims() which returns max dims.
        const std::vector<float>& output = featureVectors[0][0];
        // numClasses from dictionary size (keys_ includes blank at index 0)
        int numClasses = static_cast<int>(keys_.size());
        // Actual seqLen from input width: recognition model stride = 8
        // (confirmed: 960px input → 120 timesteps, 960/120 = 8)
        int seqLen = imgW / 8;
        // Sanity check: seqLen * numClasses must not exceed buffer size
        if (seqLen * numClasses > static_cast<int>(output.size())) {
            // Fallback: infer from buffer size
            // (assumes the buffer holds an integral number of timesteps)
            seqLen = static_cast<int>(output.size()) / numClasses;
        }
        return CTCDecode(output.data(), seqLen, numClasses);
    }
    catch (const std::exception& e) {
        std::cerr << "[RTOCRRecognizer] Recognize failed: " << e.what() << std::endl;
        return {};
    }
}
// Recognize a batch of cropped line images, one result per input, in order.
// Each crop has a different width, so every image runs through the
// single-image path rather than a true batched inference.
std::vector<TextLine> RTOCRRecognizer::RecognizeBatch(const std::vector<cv::Mat>& croppedImages) {
    std::vector<TextLine> lines;
    lines.reserve(croppedImages.size());
    for (const cv::Mat& crop : croppedImages) {
        lines.push_back(Recognize(crop));
    }
    return lines;
}
// Greedy CTC decode: per timestep take the argmax class, then drop blanks
// (index 0) and collapse consecutive repeats. The score is the mean of the
// raw argmax values over emitted characters (PaddleOCR v5 models include
// softmax, so these are probabilities).
TextLine RTOCRRecognizer::CTCDecode(const float* outputData, int seqLen, int numClasses) {
    TextLine line;
    float scoreSum = 0.0f;
    int emitted = 0;
    int prevIdx = 0;  // index 0 is the CTC blank
    for (int t = 0; t < seqLen; ++t) {
        const float* probs = outputData + t * numClasses;
        // Argmax for this timestep (first max wins on ties, as before).
        const int best = static_cast<int>(std::max_element(probs, probs + numClasses) - probs);
        // Emit only non-blank, non-repeated, in-dictionary indices.
        // keys_[0] is the blank placeholder; keys_[1] is the first real char.
        if (best != 0 && best != prevIdx && best < static_cast<int>(keys_.size())) {
            line.text += keys_[best];
            scoreSum += probs[best];
            ++emitted;
        }
        prevIdx = best;
    }
    if (emitted > 0) {
        line.score = scoreSum / static_cast<float>(emitted);
    }
    return line;
}
RTOCRRecognizer::~RTOCRRecognizer() {
    try {
        // Release our pooled reference first (if held), then drop the engine.
        // reset() on a null shared_ptr is a no-op, so one path suffices.
        if (m_usingSharedPool) {
            EnginePoolManager<float>::instance().release(m_poolKey);
            m_usingSharedPool = false;
        }
        m_engine.reset();
    }
    catch (...) {
        // Destructors must not throw.
    }
}
} // namespace rtocr
} // namespace ANSCENTER

View File

@@ -0,0 +1,41 @@
#pragma once
#include "RTOCRTypes.h"
#include "engine.h"
#include "engine/EnginePoolManager.h"
#include <memory>
#include <mutex>
namespace ANSCENTER {
namespace rtocr {
// TensorRT-backed text-line recognizer for PaddleOCR v5.
// Wraps a (possibly pool-shared) inference engine plus the CTC character
// dictionary, and greedily decodes the model output into text lines.
class RTOCRRecognizer {
public:
RTOCRRecognizer() = default;
~RTOCRRecognizer();
// Non-copyable: holds an engine handle that may be owned by a shared pool.
RTOCRRecognizer(const RTOCRRecognizer&) = delete;
RTOCRRecognizer& operator=(const RTOCRRecognizer&) = delete;
// Loads/builds the inference engine from the ONNX model and loads the CTC
// character dictionary from dictPath. Returns false on failure.
bool Initialize(const std::string& onnxPath, const std::string& dictPath,
int gpuId = 0, const std::string& engineCacheDir = "");
// Recognizes one cropped text-line image.
TextLine Recognize(const cv::Mat& croppedImage);
// Recognizes each crop in order; returns one TextLine per input image.
std::vector<TextLine> RecognizeBatch(const std::vector<cv::Mat>& croppedImages);
// Preprocessing knobs: target input height and maximum input width.
void SetRecImageHeight(int h) { imgH_ = h; }
void SetRecImageMaxWidth(int w) { imgMaxW_ = w; }
private:
// Greedy CTC decode over a [seqLen x numClasses] model output buffer.
TextLine CTCDecode(const float* outputData, int seqLen, int numClasses);
std::shared_ptr<Engine<float>> m_engine = nullptr; // inference engine (may be pool-owned)
EnginePoolManager<float>::PoolKey m_poolKey; // key used to release the pooled engine
bool m_usingSharedPool = false; // true when m_engine came from the shared pool
std::vector<std::string> keys_; // dictionary: [0]="#" (CTC blank), last=" "
int imgH_ = kRecImgH; // recognition input height
int imgMaxW_ = kRecImgMaxW; // maximum recognition input width
std::mutex _mutex; // NOTE(review): presumably guards engine access — confirm usage in the .cpp
};
} // namespace rtocr
} // namespace ANSCENTER

View File

@@ -0,0 +1,196 @@
#pragma once
#include <string>
#include <vector>
#include <array>
#include <fstream>
#include <algorithm>
#include <numeric>
#include <cmath>
#include <opencv2/core.hpp>
#include <opencv2/imgproc.hpp>
namespace ANSCENTER {
namespace rtocr {
// ============================================================================
// Engine normalization constants (BGR channel order, matching PaddleOCR official)
// ============================================================================
// PaddleOCR processes images in BGR order (no BGR→RGB conversion).
// Engine applies: (pixel/255.0 - subVals[c]) / divVals[c] per channel.
// When feeding BGR input (no cvtColor), subVals/divVals indices map to:
// [0]=B channel, [1]=G channel, [2]=R channel
//
// PaddleOCR config: mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
// These are applied to BGR channels: B=0.485/0.229, G=0.456/0.224, R=0.406/0.225
// Detection normalization (BGR order)
constexpr std::array<float, 3> kDetSubVals = { 0.485f, 0.456f, 0.406f };
constexpr std::array<float, 3> kDetDivVals = { 0.229f, 0.224f, 0.225f };
// Classifier normalization: PP-LCNet_x1_0_textline_ori uses ImageNet normalization (BGR order)
// Config: mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225], scale=1/255
constexpr std::array<float, 3> kClsSubVals = { 0.485f, 0.456f, 0.406f };
constexpr std::array<float, 3> kClsDivVals = { 0.229f, 0.224f, 0.225f };
// Recognition normalization: (pixel/255 - 0.5) / 0.5, mapping pixels to [-1, 1].
// (Unlike the detector/classifier above, this is NOT ImageNet normalization.)
constexpr std::array<float, 3> kRecSubVals = { 0.5f, 0.5f, 0.5f };
constexpr std::array<float, 3> kRecDivVals = { 0.5f, 0.5f, 0.5f };
// ============================================================================
// Detection defaults (PP-OCRv5 server)
// kDetMaxSideLen is the default max side length for detection preprocessing.
// TRT engine auto-fallbacks to smaller max if GPU memory is insufficient during build.
// ============================================================================
constexpr int kDetMaxSideLen = 2560;
constexpr int kDetMaxSideLimit = 4000; // Safety cap on max dimension
constexpr float kDetDbThresh = 0.3f;
constexpr float kDetBoxThresh = 0.6f;
constexpr float kDetUnclipRatio = 1.5f;
constexpr int kDetMaxCandidates = 1000;
// ============================================================================
// Classifier defaults (PP-LCNet_x1_0_textline_ori model)
// Input: [B, 3, 80, 160], ImageNet normalization, 2-class (0°/180°)
// Direct resize to 80x160 (no aspect ratio preservation)
// ============================================================================
constexpr int kClsImageH = 80;
constexpr int kClsImageW = 160;
constexpr float kClsThresh = 0.9f;
// ============================================================================
// Recognition defaults
// ============================================================================
constexpr int kRecImgH = 48;
constexpr int kRecImgW = 320; // Default rec width (PP-OCRv5 rec_image_shape[2]=320, min padded width)
constexpr int kRecImgMaxW = 960; // Allow wide recognition input for long text lines
constexpr int kRecBatchSize = 6;
// ============================================================================
// Data structures
// ============================================================================
// A detected text box: 4 corner points (top-left, top-right, bottom-right, bottom-left)
struct TextBox {
std::array<cv::Point2f, 4> points; // quad corners in TL, TR, BR, BL order
float score = 0.0f; // detection score for this box
};
// A single recognized text line
struct TextLine {
std::string text; // decoded characters (empty when nothing recognized)
float score = 0.0f; // mean per-character confidence; 0 if no characters decoded
};
// OCR result matching PaddleOCR::OCRPredictResult format
struct OCRPredictResult {
std::vector<std::vector<int>> box; // 4 corner points [[x,y], ...]
std::string text;
float score = -1.0f; // recognition confidence; -1 means "not recognized"
float cls_score = 0.0f; // orientation-classifier confidence (0°/180° model)
int cls_label = -1; // orientation class; -1 presumably means "classifier not run" — confirm in engine code
};
// ============================================================================
// Utility functions
// ============================================================================
// Load the CTC character dictionary from a text file (one character per line).
// Returns an empty vector when the file cannot be opened. Otherwise the result
// is: index 0 = "#" (CTC blank), then one entry per dictionary line (a single
// trailing '\r' is stripped to tolerate CRLF files), and a final " " entry.
inline std::vector<std::string> LoadDict(const std::string& dictPath) {
    std::vector<std::string> keys;
    std::ifstream file(dictPath);
    if (!file.is_open()) {
        return keys;
    }
    // Reserve index 0 for the CTC blank token.
    keys.emplace_back("#");
    for (std::string line; std::getline(file, line);) {
        if (!line.empty() && line.back() == '\r') {
            line.pop_back(); // tolerate CRLF dictionaries
        }
        keys.push_back(line);
    }
    // PaddleOCR appends the space character as the last class.
    keys.emplace_back(" ");
    return keys;
}
// Compute resize dimensions for the detection model (multiples of 32).
// limit_type='max': scale down only if the longest side exceeds maxSideLen
// (PP-OCRv5 server default). maxSideLimit is a hard safety cap on the final
// max dimension (default 4000).
inline cv::Size ComputeDetResizeShape(int srcH, int srcW, int maxSideLen,
    int maxSideLimit = kDetMaxSideLimit) {
    // Shrink (never grow) so the longest side fits within maxSideLen.
    const int longest = std::max(srcH, srcW);
    float scale = 1.0f;
    if (longest > maxSideLen) {
        scale = static_cast<float>(maxSideLen) / static_cast<float>(longest);
    }
    int outH = static_cast<int>(srcH * scale);
    int outW = static_cast<int>(srcW * scale);
    // Hard safety cap: rescale again if either dimension still exceeds the limit.
    const int current = std::max(outH, outW);
    if (current > maxSideLimit) {
        const float cap = static_cast<float>(maxSideLimit) / static_cast<float>(current);
        outH = static_cast<int>(outH * cap);
        outW = static_cast<int>(outW * cap);
    }
    // Snap both sides to the nearest multiple of 32, never below 32.
    const auto snap32 = [](int v) {
        return std::max(32, static_cast<int>(std::round(v / 32.0) * 32));
    };
    return cv::Size(snap32(outW), snap32(outH));
}
// Sort text boxes into reading order: top to bottom, then left to right.
inline void SortTextBoxes(std::vector<TextBox>& boxes) {
    const auto byReadingOrder = [](const TextBox& lhs, const TextBox& rhs) {
        const cv::Point2f& a = lhs.points[0];
        const cv::Point2f& b = rhs.points[0];
        // Top-left corners within 10px vertically count as the same line:
        // order those left-to-right; otherwise order by vertical position.
        if (std::abs(a.y - b.y) < 10.0f) {
            return a.x < b.x;
        }
        return a.y < b.y;
    };
    std::sort(boxes.begin(), boxes.end(), byReadingOrder);
}
// Get rotated and cropped image from text box polygon.
// Warps the quad (TL, TR, BR, BL) onto an axis-aligned rectangle sized by the
// longer of each pair of opposite edges, then rotates the crop 90° when it is
// much taller than wide (vertical text).
inline cv::Mat GetRotateCropImage(const cv::Mat& srcImage, const TextBox& box) {
    const auto& pts = box.points;
    float width = static_cast<float>(std::max(
        cv::norm(pts[0] - pts[1]),
        cv::norm(pts[2] - pts[3])));
    float height = static_cast<float>(std::max(
        cv::norm(pts[0] - pts[3]),
        cv::norm(pts[1] - pts[2])));
    std::vector<cv::Point2f> srcPts = { pts[0], pts[1], pts[2], pts[3] };
    std::vector<cv::Point2f> dstPts = {
        {0, 0}, {width, 0}, {width, height}, {0, height}
    };
    cv::Mat M = cv::getPerspectiveTransform(srcPts, dstPts);
    cv::Mat cropped;
    // BUGFIX: warpPerspective's 5th parameter is the interpolation FLAGS, not
    // the border mode. Passing cv::BORDER_REPLICATE there was interpreted as an
    // interpolation flag (BORDER_REPLICATE == 1 == INTER_LINEAR), so the border
    // mode silently stayed BORDER_CONSTANT. Pass both arguments explicitly so
    // edge pixels are actually replicated; effective interpolation is unchanged.
    cv::warpPerspective(srcImage, cropped, M,
        cv::Size(static_cast<int>(width), static_cast<int>(height)),
        cv::INTER_LINEAR, cv::BORDER_REPLICATE);
    // Tall, narrow crops are treated as vertical text: rotate 90 degrees.
    if (cropped.rows > cropped.cols * 1.5f) {
        cv::Mat rotated;
        cv::transpose(cropped, rotated);
        cv::flip(rotated, rotated, 0);
        return rotated;
    }
    return cropped;
}
// Resize a recognition crop to a fixed height, scaling the width
// proportionally and clamping it to [1, maxW].
inline cv::Mat ResizeRecImage(const cv::Mat& img, int targetH, int maxW) {
    const float scale = static_cast<float>(targetH) / img.rows;
    int outW = static_cast<int>(img.cols * scale);
    if (outW > maxW) {
        outW = maxW; // cap very wide lines at the engine's max input width
    }
    if (outW < 1) {
        outW = 1; // guard against degenerate (zero-width) crops
    }
    cv::Mat resized;
    cv::resize(img, resized, cv::Size(outW, targetH));
    return resized;
}
} // namespace rtocr
} // namespace ANSCENTER