Refactor project structure

This commit is contained in:
2026-03-28 19:56:39 +11:00
parent 1d267378b2
commit 8a2e721058
511 changed files with 59 additions and 48 deletions

View File

@@ -0,0 +1,107 @@
#include "ONNXOCRClassifier.h"
#include <opencv2/imgproc.hpp>
#include <iostream>
#include <algorithm>
#include <cmath>
namespace ANSCENTER {
namespace onnxocr {
// Construct the orientation classifier on top of the shared ORT session
// handler; all session setup happens in the BasicOrtHandler base.
ONNXOCRClassifier::ONNXOCRClassifier(const std::string& model_path, unsigned int thread_count)
    : BasicOrtHandler(model_path, thread_count) {}
// Preprocess a single crop into the classifier's fixed input tensor.
// Direct (non-aspect-preserving) resize to 160x80, float conversion, then
// ImageNet-style normalization into CHW order — matches PaddleOCR's official
// ResizeImage for PP-LCNet_x1_0_textline_ori.
Ort::Value ONNXOCRClassifier::transform(const cv::Mat& mat) {
    cv::Mat canvas;
    cv::resize(mat, canvas, cv::Size(kClsImageW, kClsImageH));
    canvas.convertTo(canvas, CV_32FC3);
    // PP-LCNet shares the detector's ImageNet normalization.
    input_values_handler = NormalizeAndPermute(canvas);
    // The tensor wraps input_values_handler's buffer (a class member), so the
    // data stays alive while the caller uses the returned Ort::Value.
    return Ort::Value::CreateTensor<float>(
        *memory_info_handler, input_values_handler.data(), input_values_handler.size(),
        input_node_dims.data(), input_node_dims.size());
}
// Batch path is unused in practice: Classify() runs images one by one
// because the model is executed with dynamic shapes. Delegate the first
// image if any, otherwise return a null tensor.
Ort::Value ONNXOCRClassifier::transformBatch(const std::vector<cv::Mat>& images) {
    return images.empty() ? Ort::Value(nullptr) : transform(images.front());
}
// Predict the 0/180-degree orientation of each cropped text-line image.
//
// Outputs are index-aligned with img_list:
//   cls_labels[i] - argmax class (0 = upright, 1 = rotated 180 degrees)
//   cls_scores[i] - raw model output at the argmax; per the note below this
//                   is treated as a probability (model ends in softmax)
// Empty or failed images keep the defaults {0, 0.0f}.
// NOTE(review): cls_thresh is part of the signature but unused here —
// thresholding is applied by the caller (see PaddleOCRV5Engine::ocr).
void ONNXOCRClassifier::Classify(std::vector<cv::Mat>& img_list,
                                 std::vector<int>& cls_labels,
                                 std::vector<float>& cls_scores,
                                 float cls_thresh) {
    // Serialize access: the ORT session is shared across callers.
    std::lock_guard<std::mutex> lock(_mutex);
    cls_labels.clear();
    cls_scores.clear();
    if (!ort_session || img_list.empty()) return;
    cls_labels.resize(img_list.size(), 0);
    cls_scores.resize(img_list.size(), 0.0f);
    // Process one image at a time (dynamic shapes)
    for (size_t i = 0; i < img_list.size(); i++) {
        if (img_list[i].empty()) continue;
        try {
            // Preprocess: direct resize to 80x160 (PP-LCNet_x1_0_textline_ori)
            // No aspect ratio preservation — matches PaddleOCR official ResizeImage
            cv::Mat resized;
            cv::resize(img_list[i], resized, cv::Size(kClsImageW, kClsImageH));
            resized.convertTo(resized, CV_32FC3);
            // PP-LCNet uses ImageNet normalization (same as detection)
            auto inputData = NormalizeAndPermute(resized);
            std::array<int64_t, 4> inputShape = { 1, 3, kClsImageH, kClsImageW };
            // CreateTensor wraps inputData's buffer WITHOUT copying, so
            // inputData must remain alive until Run() below returns.
            Ort::Value inputTensor = Ort::Value::CreateTensor<float>(
                *memory_info_handler, inputData.data(), inputData.size(),
                inputShape.data(), inputShape.size());
            auto outputTensors = ort_session->Run(
                Ort::RunOptions{ nullptr },
                input_node_names.data(), &inputTensor, 1,
                output_node_names.data(), num_outputs);
            float* outData = outputTensors[0].GetTensorMutableData<float>();
            auto outShape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape();
            // Fall back to 2 classes if the model reports fewer than 2 dims.
            int numClasses = (outShape.size() > 1) ? static_cast<int>(outShape[1]) : 2;
            // Find argmax and use raw output value as score
            // PaddleOCR v5 models include softmax, so output values are probabilities
            // Matches PaddleOCR official: score = preds[i, argmax_idx]
            int maxIdx = 0;
            float maxVal = outData[0];
            for (int c = 1; c < numClasses; c++) {
                if (outData[c] > maxVal) {
                    maxVal = outData[c];
                    maxIdx = c;
                }
            }
            cls_labels[i] = maxIdx;
            cls_scores[i] = maxVal;
        }
        catch (const Ort::Exception& e) {
            // Best-effort: a failed crop keeps the neutral default result.
            std::cerr << "[ONNXOCRClassifier] Inference failed for image " << i
                      << ": " << e.what() << std::endl;
            cls_labels[i] = 0;
            cls_scores[i] = 0.0f;
        }
    }
}
} // namespace onnxocr
} // namespace ANSCENTER

View File

@@ -0,0 +1,32 @@
#pragma once
#include "ONNXOCRTypes.h"
#include "ONNXEngine.h"
#include <vector>
#include <mutex>
namespace ANSCENTER {
namespace onnxocr {
// Text-line orientation classifier (PP-LCNet_x1_0_textline_ori, ONNX).
// Wraps a BasicOrtHandler ORT session; calls are serialized via _mutex.
class ONNXOCRClassifier : public BasicOrtHandler {
public:
    explicit ONNXOCRClassifier(const std::string& onnx_path, unsigned int num_threads = 1);
    ~ONNXOCRClassifier() override = default;
    // Classify text orientation for a list of cropped images.
    // Results are written index-aligned into cls_labels / cls_scores:
    //   cls_label: 0 = normal, 1 = rotated 180 degrees
    //   cls_score: model output at the predicted label
    // NOTE: cls_thresh is accepted for interface parity; thresholding is
    // applied by the caller.
    void Classify(std::vector<cv::Mat>& img_list,
                  std::vector<int>& cls_labels,
                  std::vector<float>& cls_scores,
                  float cls_thresh = kClsThresh);
private:
    // BasicOrtHandler hooks; Classify() builds its own tensors instead.
    Ort::Value transform(const cv::Mat& mat) override;
    Ort::Value transformBatch(const std::vector<cv::Mat>& images) override;
    // Guards the shared ORT session across calling threads.
    std::mutex _mutex;
};
} // namespace onnxocr
} // namespace ANSCENTER

View File

@@ -0,0 +1,312 @@
#include "ONNXOCRDetector.h"
#include "include/clipper.h"
#include "ANSGpuFrameRegistry.h"
#include "NV12PreprocessHelper.h"
#include <opencv2/imgproc.hpp>
#include <iostream>
#include <algorithm>
#include <cmath>
namespace ANSCENTER {
namespace onnxocr {
// Construct the DB text detector; session setup is handled by the
// BasicOrtHandler base class.
ONNXOCRDetector::ONNXOCRDetector(const std::string& model_path, unsigned int thread_count)
    : BasicOrtHandler(model_path, thread_count) {}
// Exists only to satisfy BasicOrtHandler's pure virtual; Detect() performs
// its own preprocessing and builds tensors with dynamic shapes.
Ort::Value ONNXOCRDetector::transform(const cv::Mat& mat) {
    cv::Mat asFloat;
    mat.convertTo(asFloat, CV_32FC3);
    input_values_handler = NormalizeAndPermute(asFloat);
    // Tensor wraps the member buffer, keeping the data alive for the caller.
    return Ort::Value::CreateTensor<float>(
        *memory_info_handler, input_values_handler.data(), input_values_handler.size(),
        input_node_dims.data(), input_node_dims.size());
}
// Unused batch hook: detection runs single images with dynamic shapes.
// Forward the first image when present, otherwise return a null tensor.
Ort::Value ONNXOCRDetector::transformBatch(const std::vector<cv::Mat>& images) {
    return images.empty() ? Ort::Value(nullptr) : transform(images.front());
}
// Run DB text detection on srcImage and return boxes in srcImage coordinates.
//
// Parameters mirror PaddleOCR's DBPostProcess knobs:
//   maxSideLen  - resize limit for the longest side (limit_type='max')
//   dbThresh    - binarization threshold on the probability map
//   boxThresh   - minimum mean probability inside a candidate box
//   unclipRatio - box expansion factor
//   useDilation - optional 2x2 dilation of the binary map
// If a full-resolution NV12 frame is registered for the current thread,
// inference runs on that frame and the resulting boxes are scaled back to
// srcImage (display) coordinates.
std::vector<TextBox> ONNXOCRDetector::Detect(const cv::Mat& srcImage,
                                             int maxSideLen,
                                             float dbThresh,
                                             float boxThresh,
                                             float unclipRatio,
                                             bool useDilation) {
    // Serialize access to the shared ORT session.
    std::lock_guard<std::mutex> lock(_mutex);
    if (!ort_session || srcImage.empty()) {
        return {};
    }
    // Try to get full-resolution image from NV12 frame (same pattern as ANSONNXYOLO)
    cv::Mat inferenceImage = srcImage;
    float bgrScaleX = 1.0f, bgrScaleY = 1.0f;
    GpuFrameData* gpuFrame = tl_currentGpuFrame();
    // pixelFormat 23 — presumably NV12 (FFmpeg AV_PIX_FMT_NV12); TODO confirm
    // against the registry's producer.
    if (gpuFrame && gpuFrame->pixelFormat == 23 &&
        gpuFrame->cpuYPlane && gpuFrame->cpuUvPlane &&
        gpuFrame->width > 0 && gpuFrame->height > 0) {
        // Full-res NV12 available — convert to BGR on CPU for ONNX.
        // These Mat headers wrap the registry's buffers without copying.
        cv::Mat yPlane(gpuFrame->height, gpuFrame->width, CV_8UC1,
                       gpuFrame->cpuYPlane, gpuFrame->cpuYLinesize);
        cv::Mat uvPlane(gpuFrame->height / 2, gpuFrame->width, CV_8UC1,
                        gpuFrame->cpuUvPlane, gpuFrame->cpuUvLinesize);
        cv::Mat fullResBGR;
        cv::cvtColorTwoPlane(yPlane, uvPlane, fullResBGR, cv::COLOR_YUV2BGR_NV12);
        if (!fullResBGR.empty()) {
            // Scale factors map full-res coords back to display-res coords.
            bgrScaleX = static_cast<float>(srcImage.cols) / fullResBGR.cols;
            bgrScaleY = static_cast<float>(srcImage.rows) / fullResBGR.rows;
            inferenceImage = fullResBGR;
        }
    }
    int resizeH, resizeW;
    float ratioH, ratioW;
    // Preprocess (using full-res image if NV12 was available)
    auto inputData = Preprocess(inferenceImage, maxSideLen, resizeH, resizeW, ratioH, ratioW);
    // Create input tensor with dynamic shape.
    // CreateTensor wraps inputData's buffer without copying — inputData must
    // stay alive until Run() completes (it does; both are in this scope).
    std::array<int64_t, 4> inputShape = { 1, 3, resizeH, resizeW };
    Ort::Value inputTensor = Ort::Value::CreateTensor<float>(
        *memory_info_handler, inputData.data(), inputData.size(),
        inputShape.data(), inputShape.size());
    // Run inference
    auto outputTensors = ort_session->Run(
        Ort::RunOptions{ nullptr },
        input_node_names.data(), &inputTensor, 1,
        output_node_names.data(), num_outputs);
    // Get output data (owned by outputTensors — keep it alive through Postprocess)
    float* outputData = outputTensors[0].GetTensorMutableData<float>();
    auto outputShape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape();
    int outH = static_cast<int>(outputShape[2]);
    int outW = static_cast<int>(outputShape[3]);
    // Postprocess — detection coords are relative to inferenceImage (full-res),
    // then scaled back to srcImage (display-res) coordinates
    auto boxes = Postprocess(outputData, outH, outW, ratioH, ratioW,
                             inferenceImage.rows, inferenceImage.cols,
                             dbThresh, boxThresh, unclipRatio, useDilation);
    // Rescale box coordinates from full-res to display-res
    if (bgrScaleX != 1.0f || bgrScaleY != 1.0f) {
        for (auto& box : boxes) {
            for (auto& pt : box.points) {
                pt.x *= bgrScaleX;
                pt.y *= bgrScaleY;
            }
        }
    }
    return boxes;
}
// Resize srcImage to a 32-aligned shape bounded by maxSideLen, record the
// output size and the inverse scale factors, and return the normalized
// CHW float buffer ready for tensor creation.
std::vector<float> ONNXOCRDetector::Preprocess(const cv::Mat& srcImage, int maxSideLen,
                                               int& resizeH, int& resizeW,
                                               float& ratioH, float& ratioW) {
    const cv::Size target = ComputeDetResizeShape(srcImage.rows, srcImage.cols, maxSideLen);
    resizeW = target.width;
    resizeH = target.height;
    // Ratios map resized coords back to the original image.
    ratioH = static_cast<float>(srcImage.rows) / static_cast<float>(resizeH);
    ratioW = static_cast<float>(srcImage.cols) / static_cast<float>(resizeW);
    cv::Mat scaled;
    cv::resize(srcImage, scaled, target);
    scaled.convertTo(scaled, CV_32FC3);
    return NormalizeAndPermute(scaled);
}
// Matches PaddleOCR official DBPostProcess.boxes_from_bitmap flow.
// Converts the raw DB probability map into scored quadrilateral text boxes:
//   threshold -> (optional dilate) -> contours -> minAreaRect -> score
//   -> unclip -> re-fit -> rescale to (srcW, srcH).
// outputData is the model's outH x outW probability map; ratioH/ratioW map
// map-space coords back to the inference image.
std::vector<TextBox> ONNXOCRDetector::Postprocess(const float* outputData, int outH, int outW,
                                                  float ratioH, float ratioW,
                                                  int srcH, int srcW,
                                                  float dbThresh, float boxThresh,
                                                  float unclipRatio, bool useDilation) {
    // Create probability map from output.
    // NOTE: this Mat header wraps outputData without copying — the caller
    // must keep the ORT output tensor alive for the duration of this call.
    cv::Mat probMap(outH, outW, CV_32FC1, const_cast<float*>(outputData));
    // Binary threshold
    cv::Mat binaryMap;
    cv::threshold(probMap, binaryMap, dbThresh, 255, cv::THRESH_BINARY);
    binaryMap.convertTo(binaryMap, CV_8UC1);
    // Optional dilation (PaddleOCR default: use_dilation=False, kernel=[[1,1],[1,1]])
    if (useDilation) {
        cv::Mat kernel = cv::getStructuringElement(cv::MORPH_RECT, cv::Size(2, 2));
        cv::dilate(binaryMap, binaryMap, kernel);
    }
    // Find contours
    std::vector<std::vector<cv::Point>> contours;
    cv::findContours(binaryMap, contours, cv::RETR_LIST, cv::CHAIN_APPROX_SIMPLE);
    std::vector<TextBox> boxes;
    // Cap candidate count (kDetMaxCandidates) like PaddleOCR's max_candidates.
    int numContours = std::min(static_cast<int>(contours.size()), kDetMaxCandidates);
    for (int i = 0; i < numContours; i++) {
        if (contours[i].size() < 4) continue;
        // Step 1: GetMiniBoxes - get ordered 4 corners of min area rect
        cv::RotatedRect minRect = cv::minAreaRect(contours[i]);
        // Discard boxes whose shorter side is under 3px (min_size filter).
        float sside = std::min(minRect.size.width, minRect.size.height);
        if (sside < 3.0f) continue;
        auto ordered = GetMiniBoxes(minRect);
        // Step 2: BoxScoreFast - compute mean prob inside the 4-point box polygon
        float score = BoxScoreFast(probMap, ordered);
        if (score < boxThresh) continue;
        // Step 3: UnclipPolygon - expand the 4-point box
        auto expanded = UnclipPolygon(ordered, unclipRatio);
        if (expanded.size() < 4) continue;
        // Step 4: Re-compute GetMiniBoxes on the expanded polygon
        std::vector<cv::Point> expandedInt;
        expandedInt.reserve(expanded.size());
        for (auto& p : expanded) {
            expandedInt.push_back(cv::Point(static_cast<int>(p.x), static_cast<int>(p.y)));
        }
        cv::RotatedRect expandedRect = cv::minAreaRect(expandedInt);
        // Filter by min_size + 2 = 5 (matches PaddleOCR official)
        float expandedSside = std::min(expandedRect.size.width, expandedRect.size.height);
        if (expandedSside < 5.0f) continue;
        auto expandedOrdered = GetMiniBoxes(expandedRect);
        // Step 5: Scale to original image coordinates (clamped into bounds)
        TextBox box;
        for (int j = 0; j < 4; j++) {
            box.points[j].x = std::clamp(expandedOrdered[j].x * ratioW, 0.0f, static_cast<float>(srcW - 1));
            box.points[j].y = std::clamp(expandedOrdered[j].y * ratioH, 0.0f, static_cast<float>(srcH - 1));
        }
        box.score = score;
        boxes.push_back(box);
    }
    // Reading order: top-to-bottom, then left-to-right.
    SortTextBoxes(boxes);
    return boxes;
}
// Order the four corners of a rotated rect as [TL, TR, BR, BL] (clockwise
// from top-left). Matches PaddleOCR's GetMiniBoxes: sort corners by x, then
// within each left/right pair the smaller-y point is the "top" corner.
std::array<cv::Point2f, 4> ONNXOCRDetector::GetMiniBoxes(const cv::RotatedRect& rect) {
    std::array<cv::Point2f, 4> corners;
    rect.points(corners.data());
    std::sort(corners.begin(), corners.end(),
              [](const cv::Point2f& lhs, const cv::Point2f& rhs) { return lhs.x < rhs.x; });
    // Left pair: corners[0], corners[1].
    const bool leftTopFirst = corners[0].y <= corners[1].y;
    const cv::Point2f topLeft = leftTopFirst ? corners[0] : corners[1];
    const cv::Point2f bottomLeft = leftTopFirst ? corners[1] : corners[0];
    // Right pair: corners[2], corners[3].
    const bool rightTopFirst = corners[2].y <= corners[3].y;
    const cv::Point2f topRight = rightTopFirst ? corners[2] : corners[3];
    const cv::Point2f bottomRight = rightTopFirst ? corners[3] : corners[2];
    return { topLeft, topRight, bottomRight, bottomLeft };
}
// Mean probability inside the 4-point polygon (PaddleOCR box_score_fast):
// rasterize the quad into a mask over its clamped bounding rect and average
// the probability map under that mask.
float ONNXOCRDetector::BoxScoreFast(const cv::Mat& probMap,
                                    const std::array<cv::Point2f, 4>& box) {
    const int h = probMap.rows;
    const int w = probMap.cols;
    // Axis-aligned bounds of the quad.
    float lox = box[0].x, hix = box[0].x;
    float loy = box[0].y, hiy = box[0].y;
    for (int k = 1; k < 4; k++) {
        lox = std::min(lox, box[k].x);
        hix = std::max(hix, box[k].x);
        loy = std::min(loy, box[k].y);
        hiy = std::max(hiy, box[k].y);
    }
    // Clamp into the map (matches PaddleOCR official).
    const int xmin = std::clamp(static_cast<int>(std::floor(lox)), 0, w - 1);
    const int xmax = std::clamp(static_cast<int>(std::ceil(hix)), 0, w - 1);
    const int ymin = std::clamp(static_cast<int>(std::floor(loy)), 0, h - 1);
    const int ymax = std::clamp(static_cast<int>(std::ceil(hiy)), 0, h - 1);
    if (xmin >= xmax || ymin >= ymax) return 0.0f;
    // Rasterize the quad shifted into ROI space.
    cv::Mat mask = cv::Mat::zeros(ymax - ymin + 1, xmax - xmin + 1, CV_8UC1);
    std::vector<std::vector<cv::Point>> polys(1);
    for (const auto& p : box) {
        polys[0].emplace_back(static_cast<int>(p.x) - xmin,
                              static_cast<int>(p.y) - ymin);
    }
    cv::fillPoly(mask, polys, cv::Scalar(1));
    const cv::Rect roi(xmin, ymin, xmax - xmin + 1, ymax - ymin + 1);
    return static_cast<float>(cv::mean(probMap(roi), mask)[0]);
}
// Expand the 4-point box outward with Clipper (round joins), matching
// PaddleOCR's unclip: offset distance = area * unclipRatio / perimeter.
// Returns an empty vector for degenerate inputs or an empty offset result.
std::vector<cv::Point2f> ONNXOCRDetector::UnclipPolygon(const std::array<cv::Point2f, 4>& box,
                                                        float unclipRatio) {
    // Shoelace (signed, doubled) area plus perimeter in one pass.
    float twiceArea = 0.0f;
    float perim = 0.0f;
    for (int i = 0; i < 4; i++) {
        const int nxt = (i + 1) % 4;
        twiceArea += box[i].x * box[nxt].y - box[nxt].x * box[i].y;
        const float dx = box[nxt].x - box[i].x;
        const float dy = box[nxt].y - box[i].y;
        perim += std::sqrt(dx * dx + dy * dy);
    }
    const float area = std::abs(twiceArea) * 0.5f;
    if (perim < 1.0f) return {};
    const float distance = area * unclipRatio / perim;
    // Feed the (integer-truncated) quad to Clipper.
    ClipperLib::Path quad;
    for (const auto& p : box) {
        quad.push_back({ static_cast<ClipperLib::cInt>(p.x),
                         static_cast<ClipperLib::cInt>(p.y) });
    }
    ClipperLib::ClipperOffset offsetter;
    offsetter.AddPath(quad, ClipperLib::jtRound, ClipperLib::etClosedPolygon);
    ClipperLib::Paths solution;
    offsetter.Execute(solution, distance);
    if (solution.empty() || solution[0].empty()) return {};
    std::vector<cv::Point2f> expanded;
    expanded.reserve(solution[0].size());
    for (const auto& p : solution[0]) {
        expanded.emplace_back(static_cast<float>(p.X), static_cast<float>(p.Y));
    }
    return expanded;
}
} // namespace onnxocr
} // namespace ANSCENTER

View File

@@ -0,0 +1,52 @@
#pragma once
#include "ONNXOCRTypes.h"
#include "ONNXEngine.h"
#include <vector>
#include <mutex>
namespace ANSCENTER {
namespace onnxocr {
// DB-style text detector (PP-OCRv5 det model, ONNX).
// Wraps a BasicOrtHandler ORT session; Detect() calls are serialized.
class ONNXOCRDetector : public BasicOrtHandler {
public:
    explicit ONNXOCRDetector(const std::string& onnx_path, unsigned int num_threads = 1);
    ~ONNXOCRDetector() override = default;
    // Run text detection on an image; returns boxes in srcImage coordinates.
    // Parameters mirror PaddleOCR's DBPostProcess configuration.
    std::vector<TextBox> Detect(const cv::Mat& srcImage,
                                int maxSideLen = kDetMaxSideLen,
                                float dbThresh = kDetDbThresh,
                                float boxThresh = kDetBoxThresh,
                                float unclipRatio = kDetUnclipRatio,
                                bool useDilation = false);
private:
    // BasicOrtHandler hooks; Detect() builds its own dynamic-shape tensors.
    Ort::Value transform(const cv::Mat& mat) override;
    Ort::Value transformBatch(const std::vector<cv::Mat>& images) override;
    // Preprocessing: 32-aligned resize + normalization into CHW floats.
    std::vector<float> Preprocess(const cv::Mat& srcImage, int maxSideLen,
                                  int& resizeH, int& resizeW,
                                  float& ratioH, float& ratioW);
    // Postprocessing: threshold -> contours -> boxes (matches PaddleOCR official flow)
    std::vector<TextBox> Postprocess(const float* outputData, int outH, int outW,
                                     float ratioH, float ratioW, int srcH, int srcW,
                                     float dbThresh, float boxThresh, float unclipRatio,
                                     bool useDilation);
    // Get ordered 4 corners [TL, TR, BR, BL] from rotated rect (matches PaddleOCR GetMiniBoxes)
    std::array<cv::Point2f, 4> GetMiniBoxes(const cv::RotatedRect& rect);
    // Compute mean score inside box polygon on the probability map
    float BoxScoreFast(const cv::Mat& probMap, const std::array<cv::Point2f, 4>& box);
    // Expand 4-point box using Clipper offset
    std::vector<cv::Point2f> UnclipPolygon(const std::array<cv::Point2f, 4>& box, float unclipRatio);
    // Guards the shared ORT session across calling threads.
    std::mutex _mutex;
};
} // namespace onnxocr
} // namespace ANSCENTER

View File

@@ -0,0 +1,165 @@
#include "ONNXOCRRecognizer.h"
#include <opencv2/imgproc.hpp>
#include <iostream>
#include <algorithm>
#include <numeric>
#include <cmath>
#include <cfloat>
#include <cstring>
namespace ANSCENTER {
namespace onnxocr {
// Construct the CTC text recognizer; the dictionary must be loaded
// separately via LoadDictionary() before Recognize() is usable.
ONNXOCRRecognizer::ONNXOCRRecognizer(const std::string& model_path, unsigned int thread_count)
    : BasicOrtHandler(model_path, thread_count) {}
// Load the character dictionary used by CTC decoding. Returns false when
// the file is missing/unreadable (LoadDict then yields fewer than the two
// mandatory tokens: blank "#" and trailing space).
bool ONNXOCRRecognizer::LoadDictionary(const std::string& dictPath) {
    keys_ = LoadDict(dictPath);
    const bool loaded = keys_.size() >= 2;
    if (loaded) {
        std::cout << "[ONNXOCRRecognizer] Loaded dictionary with " << keys_.size()
                  << " characters from: " << dictPath << std::endl;
    } else {
        std::cerr << "[ONNXOCRRecognizer] Failed to load dictionary: " << dictPath << std::endl;
    }
    return loaded;
}
// Satisfies the BasicOrtHandler interface; Recognize() does its own
// dynamic-width preprocessing and tensor creation.
Ort::Value ONNXOCRRecognizer::transform(const cv::Mat& mat) {
    cv::Mat scaled = ResizeRecImage(mat, imgH_, imgMaxW_);
    scaled.convertTo(scaled, CV_32FC3);
    // Recognition uses the classifier-style (x/255 - 0.5) / 0.5 normalization.
    input_values_handler = NormalizeAndPermuteCls(scaled);
    return Ort::Value::CreateTensor<float>(
        *memory_info_handler, input_values_handler.data(), input_values_handler.size(),
        input_node_dims.data(), input_node_dims.size());
}
// Unused batch hook: widths vary per image, so recognition is sequential.
// Forward the first image when present, otherwise return a null tensor.
Ort::Value ONNXOCRRecognizer::transformBatch(const std::vector<cv::Mat>& images) {
    return images.empty() ? Ort::Value(nullptr) : transform(images.front());
}
// Recognize the text content of one cropped text-line image.
// Returns a default-constructed TextLine (empty text, score 0) when the
// session/dictionary is unavailable, the image is empty, or inference fails.
TextLine ONNXOCRRecognizer::Recognize(const cv::Mat& croppedImage) {
    // Serialize access to the shared ORT session.
    std::lock_guard<std::mutex> lock(_mutex);
    if (!ort_session || croppedImage.empty() || keys_.empty()) {
        return {};
    }
    try {
        // Preprocess: resize to fixed height, proportional width
        cv::Mat resized = ResizeRecImage(croppedImage, imgH_, imgMaxW_);
        int resizedW = resized.cols;
        resized.convertTo(resized, CV_32FC3);
        // Recognition uses (pixel/255 - 0.5) / 0.5 normalization (same as classifier)
        auto normalizedData = NormalizeAndPermuteCls(resized);
        // Pad to at least kRecImgW width (matching official PaddleOCR behavior)
        // Official PaddleOCR: padding_im = np.zeros((C, H, W)), then copies normalized
        // image into left portion. Padding value = 0.0 in normalized space.
        int imgW = std::max(resizedW, kRecImgW);
        std::vector<float> inputData;
        if (imgW > resizedW) {
            // Zero-pad on the right (CHW layout): copy each source row (stride
            // resizedW) into the wider destination row (stride imgW).
            inputData.resize(3 * imgH_ * imgW, 0.0f);
            for (int c = 0; c < 3; c++) {
                for (int y = 0; y < imgH_; y++) {
                    std::memcpy(
                        &inputData[c * imgH_ * imgW + y * imgW],
                        &normalizedData[c * imgH_ * resizedW + y * resizedW],
                        resizedW * sizeof(float));
                }
            }
        } else {
            // Already wide enough — take the buffer as-is.
            inputData = std::move(normalizedData);
        }
        // Create input tensor with (possibly padded) width.
        // CreateTensor wraps inputData's buffer without copying — inputData
        // must stay alive until Run() returns (it does; same scope).
        std::array<int64_t, 4> inputShape = { 1, 3, imgH_, imgW };
        Ort::Value inputTensor = Ort::Value::CreateTensor<float>(
            *memory_info_handler, inputData.data(), inputData.size(),
            inputShape.data(), inputShape.size());
        // Run inference
        auto outputTensors = ort_session->Run(
            Ort::RunOptions{ nullptr },
            input_node_names.data(), &inputTensor, 1,
            output_node_names.data(), num_outputs);
        // Get output: [1, seqLen, numClasses] per-timestep class scores
        float* outputData = outputTensors[0].GetTensorMutableData<float>();
        auto outputShape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape();
        int seqLen = static_cast<int>(outputShape[1]);
        int numClasses = static_cast<int>(outputShape[2]);
        return CTCDecode(outputData, seqLen, numClasses);
    }
    catch (const Ort::Exception& e) {
        std::cerr << "[ONNXOCRRecognizer] Inference failed: " << e.what() << std::endl;
        return {};
    }
}
// Recognize a list of crops. Widths differ per image, which rules out real
// batching, so each crop is run through Recognize() sequentially. The
// result vector is index-aligned with croppedImages.
std::vector<TextLine> ONNXOCRRecognizer::RecognizeBatch(const std::vector<cv::Mat>& croppedImages) {
    std::vector<TextLine> lines;
    lines.reserve(croppedImages.size());
    for (const auto& crop : croppedImages) {
        lines.push_back(Recognize(crop));
    }
    return lines;
}
// Greedy CTC decode of a [seqLen x numClasses] score matrix: take the
// argmax per timestep, drop blanks (index 0) and repeats, and average the
// kept argmax values as the line confidence. keys_[0] is the blank "#".
TextLine ONNXOCRRecognizer::CTCDecode(const float* outputData, int seqLen, int numClasses) {
    TextLine decoded;
    float scoreSum = 0.0f;
    int scoreCount = 0;
    int prevIdx = 0;  // previous argmax; starts at blank
    for (int t = 0; t < seqLen; t++) {
        const float* step = outputData + t * numClasses;
        const float* best = std::max_element(step, step + numClasses);
        const int bestIdx = static_cast<int>(best - step);
        // Emit only non-blank, non-repeated, in-dictionary indices.
        if (bestIdx != 0 && bestIdx != prevIdx &&
            bestIdx < static_cast<int>(keys_.size())) {
            decoded.text += keys_[bestIdx];
            // Raw output is already a probability (PP-OCRv5 ends in softmax).
            scoreSum += *best;
            scoreCount++;
        }
        prevIdx = bestIdx;
    }
    if (scoreCount > 0) {
        decoded.score = scoreSum / static_cast<float>(scoreCount);
    }
    return decoded;
}
} // namespace onnxocr
} // namespace ANSCENTER

View File

@@ -0,0 +1,40 @@
#pragma once
#include "ONNXOCRTypes.h"
#include "ONNXEngine.h"
#include <vector>
#include <string>
#include <mutex>
namespace ANSCENTER {
namespace onnxocr {
// CTC text recognizer (PP-OCRv5 rec model, ONNX).
// Requires LoadDictionary() before Recognize(); calls are serialized.
class ONNXOCRRecognizer : public BasicOrtHandler {
public:
    explicit ONNXOCRRecognizer(const std::string& onnx_path, unsigned int num_threads = 1);
    ~ONNXOCRRecognizer() override = default;
    // Load character dictionary (must be called before Recognize)
    bool LoadDictionary(const std::string& dictPath);
    // Recognize text from a single cropped text image; returns an empty
    // TextLine on failure.
    TextLine Recognize(const cv::Mat& croppedImage);
    // Batch recognition for multiple cropped images (sequential internally;
    // output is index-aligned with the input)
    std::vector<TextLine> RecognizeBatch(const std::vector<cv::Mat>& croppedImages);
private:
    // BasicOrtHandler hooks; Recognize() builds its own tensors.
    Ort::Value transform(const cv::Mat& mat) override;
    Ort::Value transformBatch(const std::vector<cv::Mat>& images) override;
    // CTC greedy decode
    TextLine CTCDecode(const float* outputData, int seqLen, int numClasses);
    // Label space: keys_[0] = "#" (CTC blank), last entry = " " (space).
    std::vector<std::string> keys_;
    // Fixed input height and maximum input width for recognition.
    int imgH_ = kRecImgH;
    int imgMaxW_ = kRecImgMaxW;
    // Guards the shared ORT session across calling threads.
    std::mutex _mutex;
};
} // namespace onnxocr
} // namespace ANSCENTER

View File

@@ -0,0 +1,212 @@
#pragma once
#include <string>
#include <vector>
#include <array>
#include <fstream>
#include <algorithm>
#include <numeric>
#include <cmath>
#include <opencv2/core.hpp>
#include <opencv2/imgproc.hpp>
namespace ANSCENTER {
namespace onnxocr {
// Detection normalization constants (BGR channel order, matching PaddleOCR official)
// PaddleOCR config: mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
// Applied directly to BGR channels WITHOUT BGR→RGB conversion:
// Channel 0 (B) → mean=0.485, std=0.229
// Channel 1 (G) → mean=0.456, std=0.224
// Channel 2 (R) → mean=0.406, std=0.225
constexpr float kDetMean0 = 0.485f; // B channel
constexpr float kDetMean1 = 0.456f; // G channel
constexpr float kDetMean2 = 0.406f; // R channel
constexpr float kDetStd0 = 0.229f; // B channel
constexpr float kDetStd1 = 0.224f; // G channel
constexpr float kDetStd2 = 0.225f; // R channel
// Scale applied before mean/std: maps 8-bit pixel values into [0, 1].
constexpr float kScale = 1.0f / 255.0f;
// Detection defaults (PP-OCRv5 server: limit_type=max, limit_side_len=960)
constexpr int kDetMaxSideLen = 960;
constexpr int kDetMaxSideLimit = 4000; // Safety cap on max dimension
constexpr float kDetDbThresh = 0.3f;
constexpr float kDetBoxThresh = 0.6f;
constexpr float kDetUnclipRatio = 1.5f;
// Upper bound on contours considered per image in DB postprocessing.
constexpr int kDetMaxCandidates = 1000;
// Classifier defaults (PP-LCNet_x1_0_textline_ori model)
// Input: [B, 3, 80, 160], ImageNet normalization, 2-class (0°/180°)
// Direct resize to 80x160 (no aspect ratio preservation)
constexpr int kClsImageH = 80;
constexpr int kClsImageW = 160;
constexpr float kClsThresh = 0.9f;
// Recognition defaults
constexpr int kRecImgH = 48;
constexpr int kRecImgW = 320; // Default rec width (PP-OCRv5 rec_image_shape[2]=320, min padded width)
constexpr int kRecImgMaxW = 960; // Allow wide recognition input for long text lines
constexpr int kRecBatchSize = 6;
// A detected text box: 4 corner points (top-left, top-right, bottom-right, bottom-left)
struct TextBox {
    std::array<cv::Point2f, 4> points;
    float score = 0.0f;  // mean DB probability inside the box
};
// A single recognized text line
struct TextLine {
    std::string text;    // UTF-8 decoded characters
    float score = 0.0f;  // mean confidence over emitted characters
};
// OCR result matching PaddleOCR::OCRPredictResult format
struct OCRPredictResult {
    std::vector<std::vector<int>> box; // 4 corner points [[x,y], ...]
    std::string text;                  // recognized line text
    float score = -1.0f;               // recognition confidence (-1 = not set)
    float cls_score = 0.0f;            // orientation classifier confidence
    int cls_label = -1;                // 0 = normal, 1 = rotated 180° (-1 = not run)
};
// Load the recognizer character dictionary: one character per line.
// Returned layout matches the CTC label space:
//   index 0          -> "#"  (CTC blank)
//   index 1..N       -> dictionary lines (trailing CR stripped for CRLF files)
//   index N+1 (last) -> " "  (space)
// Returns an empty vector when the file cannot be opened.
inline std::vector<std::string> LoadDict(const std::string& dictPath) {
    std::ifstream in(dictPath);
    if (!in.is_open()) return {};
    std::vector<std::string> keys{ "#" };  // CTC blank token up front
    for (std::string entry; std::getline(in, entry); ) {
        if (!entry.empty() && entry.back() == '\r') {
            entry.pop_back();
        }
        keys.push_back(entry);
    }
    keys.push_back(" ");  // trailing space token
    return keys;
}
// Compute the detection input size (width, height), both multiples of 32.
// limit_type='max' semantics: shrink only when the longest side exceeds
// maxSideLen (PP-OCRv5 server default); maxSideLimit is a hard safety cap.
inline cv::Size ComputeDetResizeShape(int srcH, int srcW, int maxSideLen,
                                      int maxSideLimit = kDetMaxSideLimit) {
    const int longest = std::max(srcH, srcW);
    const float shrink = (longest > maxSideLen)
        ? static_cast<float>(maxSideLen) / static_cast<float>(longest)
        : 1.0f;
    int h = static_cast<int>(srcH * shrink);
    int w = static_cast<int>(srcW * shrink);
    // Hard cap so pathological inputs cannot blow up memory.
    const int biggest = std::max(h, w);
    if (biggest > maxSideLimit) {
        const float cap = static_cast<float>(maxSideLimit) / static_cast<float>(biggest);
        h = static_cast<int>(h * cap);
        w = static_cast<int>(w * cap);
    }
    // Snap both sides to multiples of 32 (network stride), minimum 32.
    h = std::max(32, static_cast<int>(std::round(h / 32.0) * 32));
    w = std::max(32, static_cast<int>(std::round(w / 32.0) * 32));
    return cv::Size(w, h);
}
// Convert an HxW CV_32FC3 BGR image into a CHW float vector for detection:
// each channel is scaled to [0,1] then normalized with the ImageNet
// mean/std constants above. BGR channel order is preserved (PaddleOCR
// official applies no BGR→RGB conversion).
inline std::vector<float> NormalizeAndPermute(const cv::Mat& img) {
    const int h = img.rows;
    const int w = img.cols;
    const int plane = h * w;
    std::vector<float> chw(3 * plane);
    float* bDst = chw.data();              // channel 0 = B
    float* gDst = chw.data() + plane;      // channel 1 = G
    float* rDst = chw.data() + 2 * plane;  // channel 2 = R
    for (int y = 0; y < h; y++) {
        const float* src = img.ptr<float>(y);
        for (int x = 0; x < w; x++) {
            const int dst = y * w + x;
            bDst[dst] = (src[x * 3 + 0] * kScale - kDetMean0) / kDetStd0;
            gDst[dst] = (src[x * 3 + 1] * kScale - kDetMean1) / kDetStd1;
            rDst[dst] = (src[x * 3 + 2] * kScale - kDetMean2) / kDetStd2;
        }
    }
    return chw;
}
// Convert an HxW CV_32FC3 BGR image into a CHW float vector for the
// classifier/recognizer, using (x/255 - 0.5) / 0.5 normalization. BGR
// channel order is preserved (PaddleOCR official — no BGR→RGB conversion).
inline std::vector<float> NormalizeAndPermuteCls(const cv::Mat& img) {
    const int h = img.rows;
    const int w = img.cols;
    const int plane = h * w;
    std::vector<float> chw(3 * plane);
    for (int y = 0; y < h; y++) {
        const float* src = img.ptr<float>(y);
        for (int x = 0; x < w; x++) {
            const int dst = y * w + x;
            // Channel 0=B, 1=G, 2=R.
            chw[dst]             = (src[x * 3 + 0] * kScale - 0.5f) / 0.5f;
            chw[plane + dst]     = (src[x * 3 + 1] * kScale - 0.5f) / 0.5f;
            chw[2 * plane + dst] = (src[x * 3 + 2] * kScale - 0.5f) / 0.5f;
        }
    }
    return chw;
}
// Sort text boxes from top to bottom, left to right
inline void SortTextBoxes(std::vector<TextBox>& boxes) {
std::sort(boxes.begin(), boxes.end(),
[](const TextBox& a, const TextBox& b) {
if (std::abs(a.points[0].y - b.points[0].y) < 10.0f) {
return a.points[0].x < b.points[0].x;
}
return a.points[0].y < b.points[0].y;
});
}
// Perspective-crop the text box polygon out of srcImage and rotate it to a
// horizontal orientation if the crop is strongly vertical (rows > 1.5*cols),
// mirroring PaddleOCR's GetRotateCropImage.
inline cv::Mat GetRotateCropImage(const cv::Mat& srcImage, const TextBox& box) {
    auto pts = box.points;
    // Target width/height from the longer of each opposite edge pair.
    float width = static_cast<float>(std::max(
        cv::norm(pts[0] - pts[1]),
        cv::norm(pts[2] - pts[3])));
    float height = static_cast<float>(std::max(
        cv::norm(pts[0] - pts[3]),
        cv::norm(pts[1] - pts[2])));
    std::vector<cv::Point2f> srcPts = { pts[0], pts[1], pts[2], pts[3] };
    std::vector<cv::Point2f> dstPts = {
        {0, 0}, {width, 0}, {width, height}, {0, height}
    };
    cv::Mat M = cv::getPerspectiveTransform(srcPts, dstPts);
    cv::Mat cropped;
    // BUGFIX: cv::warpPerspective's 5th parameter is `flags` (interpolation),
    // not borderMode. The previous call passed cv::BORDER_REPLICATE there,
    // which silently selected interpolation mode 1 (INTER_LINEAR) and left
    // the border mode at its default (BORDER_CONSTANT). Pass both explicitly
    // so edge pixels are actually replicated at the crop border.
    cv::warpPerspective(srcImage, cropped, M,
                        cv::Size(static_cast<int>(width), static_cast<int>(height)),
                        cv::INTER_LINEAR, cv::BORDER_REPLICATE);
    // Strongly vertical crops are rotated 90° CCW into horizontal layout.
    if (cropped.rows > cropped.cols * 1.5f) {
        cv::Mat rotated;
        cv::transpose(cropped, rotated);
        cv::flip(rotated, rotated, 0);
        return rotated;
    }
    return cropped;
}
// Resize a recognition crop to the fixed height targetH while keeping the
// aspect ratio; the resulting width is clamped into [1, maxW].
inline cv::Mat ResizeRecImage(const cv::Mat& img, int targetH, int maxW) {
    const float scale = static_cast<float>(targetH) / static_cast<float>(img.rows);
    const int targetW = std::max(1, std::min(static_cast<int>(img.cols * scale), maxW));
    cv::Mat out;
    cv::resize(img, out, cv::Size(targetW, targetH));
    return out;
}
} // namespace onnxocr
} // namespace ANSCENTER

View File

@@ -0,0 +1,130 @@
#include "PaddleOCRV5Engine.h"
#include "EPLoader.h"
#include <opencv2/imgproc.hpp>
#include <iostream>
#include <algorithm>
namespace ANSCENTER {
namespace onnxocr {
// Initialize the detection -> (classification) -> recognition pipeline.
//
// Parameters are model/dictionary file paths; clsModelPath may be empty to
// skip the orientation classifier. Returns false (and resets all stages)
// if any stage fails to construct or the dictionary cannot be loaded.
bool PaddleOCRV5Engine::Initialize(const std::string& detModelPath,
                                   const std::string& clsModelPath,
                                   const std::string& recModelPath,
                                   const std::string& dictPath) {
    std::lock_guard<std::recursive_mutex> lock(_mutex);
    try {
        // Initialize detector (also triggers EPLoader init in BasicOrtHandler)
        detector_ = std::make_unique<ONNXOCRDetector>(detModelPath);
        std::cout << "[PaddleOCRV5Engine] Detector initialized: " << detModelPath << std::endl;
        // Ensure this DLL's copy of Ort::Global<void>::api_ is initialized.
        // BasicOrtHandler sets it in ONNXEngine.dll, but each DLL has its own
        // inline-static copy. Without this, inference calls from ANSOCR.dll crash.
        if (Ort::Global<void>::api_ == nullptr) {
            Ort::InitApi(static_cast<const OrtApi*>(EPLoader::GetOrtApiRaw()));
        }
        // Initialize classifier (optional)
        if (!clsModelPath.empty()) {
            classifier_ = std::make_unique<ONNXOCRClassifier>(clsModelPath);
            std::cout << "[PaddleOCRV5Engine] Classifier initialized: " << clsModelPath << std::endl;
        }
        else {
            // No classifier model — ocr() will skip the orientation step.
            classifier_.reset();
            std::cout << "[PaddleOCRV5Engine] Classifier skipped (no model path)" << std::endl;
        }
        // Initialize recognizer (requires a valid dictionary to decode CTC output)
        recognizer_ = std::make_unique<ONNXOCRRecognizer>(recModelPath);
        if (!recognizer_->LoadDictionary(dictPath)) {
            std::cerr << "[PaddleOCRV5Engine] Failed to load dictionary" << std::endl;
            return false;
        }
        std::cout << "[PaddleOCRV5Engine] Recognizer initialized: " << recModelPath << std::endl;
        _initialized = true;
        std::cout << "[PaddleOCRV5Engine] Pipeline initialized successfully" << std::endl;
        return true;
    }
    catch (const std::exception& e) {
        // Roll back to a clean, uninitialized state on any failure.
        std::cerr << "[PaddleOCRV5Engine] Initialization failed: " << e.what() << std::endl;
        detector_.reset();
        classifier_.reset();
        recognizer_.reset();
        _initialized = false;
        return false;
    }
}
// Run the full OCR pipeline: detect -> crop -> (orient) -> recognize.
// Returns one OCRPredictResult per detected box (integer corner coords,
// recognized text + score, and classifier label/score when enabled).
// Returns an empty vector if uninitialized, the image is empty, or nothing
// is detected.
std::vector<OCRPredictResult> PaddleOCRV5Engine::ocr(const cv::Mat& img) {
    std::lock_guard<std::recursive_mutex> lock(_mutex);
    std::vector<OCRPredictResult> results;
    if (!_initialized || img.empty()) {
        return results;
    }
    // Step 1: Text Detection
    auto boxes = detector_->Detect(img, _maxSideLen, _detDbThresh, _detBoxThresh,
                                   _detUnclipRatio, _useDilation);
    if (boxes.empty()) {
        return results;
    }
    // Step 2: Crop detected text regions.
    // BUGFIX: crops are pushed unconditionally so croppedImages stays
    // index-aligned with boxes. The previous code skipped empty crops,
    // which shifted every subsequent label/score/text one slot away from
    // its box. Downstream stages tolerate empty mats: Classify() skips
    // them and Recognize() returns an empty TextLine.
    std::vector<cv::Mat> croppedImages;
    croppedImages.reserve(boxes.size());
    for (const auto& box : boxes) {
        croppedImages.push_back(GetRotateCropImage(img, box));
    }
    // Step 3: Classification (optional orientation fix-up)
    std::vector<int> cls_labels(croppedImages.size(), 0);
    std::vector<float> cls_scores(croppedImages.size(), 0.0f);
    if (classifier_) {
        classifier_->Classify(croppedImages, cls_labels, cls_scores, _clsThresh);
        // Rotate images classified as upside-down (label=1 and score > threshold)
        for (size_t i = 0; i < croppedImages.size(); i++) {
            if (cls_labels[i] % 2 == 1 && cls_scores[i] > _clsThresh) {
                cv::rotate(croppedImages[i], croppedImages[i], cv::ROTATE_180);
            }
        }
    }
    // Step 4: Text Recognition
    auto textLines = recognizer_->RecognizeBatch(croppedImages);
    // Step 5: Combine results — all vectors are boxes.size() long by construction.
    for (size_t i = 0; i < boxes.size() && i < textLines.size(); i++) {
        OCRPredictResult result;
        // Convert TextBox points to box format [[x0,y0], [x1,y1], [x2,y2], [x3,y3]]
        result.box.resize(4);
        for (int j = 0; j < 4; j++) {
            result.box[j] = {
                static_cast<int>(boxes[i].points[j].x),
                static_cast<int>(boxes[i].points[j].y)
            };
        }
        result.text = textLines[i].text;
        result.score = textLines[i].score;
        result.cls_label = cls_labels[i];
        result.cls_score = cls_scores[i];
        results.push_back(result);
    }
    return results;
}
} // namespace onnxocr
} // namespace ANSCENTER

View File

@@ -0,0 +1,63 @@
#pragma once
#include "ONNXOCRTypes.h"
#include "ONNXOCRDetector.h"
#include "ONNXOCRClassifier.h"
#include "ONNXOCRRecognizer.h"
#include <memory>
#include <mutex>
#include <string>
#include <vector>
namespace ANSCENTER {
namespace onnxocr {
// PaddleOCR V5 pipeline engine: Detection -> (Classification) -> Recognition
// Mirrors the PaddleOCR::PPOCR interface for drop-in replacement
// PaddleOCR V5 pipeline engine: Detection -> (Classification) -> Recognition
// Mirrors the PaddleOCR::PPOCR interface for drop-in replacement.
// All public methods are serialized via an internal recursive mutex.
class PaddleOCRV5Engine {
public:
    PaddleOCRV5Engine() = default;
    ~PaddleOCRV5Engine() = default;
    // Initialize the OCR pipeline from ONNX model files.
    // clsModelPath can be empty to skip classification.
    // Returns false (and resets all stages) on any failure.
    bool Initialize(const std::string& detModelPath,
                    const std::string& clsModelPath,
                    const std::string& recModelPath,
                    const std::string& dictPath);
    // Run full OCR pipeline on an image.
    // Returns results matching PaddleOCR::OCRPredictResult format; empty if
    // not initialized, the image is empty, or nothing was detected.
    std::vector<OCRPredictResult> ocr(const cv::Mat& img);
    // Configuration setters (matching OCRModelConfig parameters).
    // These affect subsequent ocr() calls only.
    void SetDetMaxSideLen(int val) { _maxSideLen = val; }
    void SetDetDbThresh(float val) { _detDbThresh = val; }
    void SetDetBoxThresh(float val) { _detBoxThresh = val; }
    void SetDetUnclipRatio(float val) { _detUnclipRatio = val; }
    void SetClsThresh(float val) { _clsThresh = val; }
    void SetUseDilation(bool val) { _useDilation = val; }
private:
    std::unique_ptr<ONNXOCRDetector> detector_;
    std::unique_ptr<ONNXOCRClassifier> classifier_; // nullptr if not used
    std::unique_ptr<ONNXOCRRecognizer> recognizer_;
    // Recursive: ocr() and setters may be re-entered from the same thread.
    std::recursive_mutex _mutex;
    // Detection parameters
    int _maxSideLen = kDetMaxSideLen;
    float _detDbThresh = kDetDbThresh;
    float _detBoxThresh = kDetBoxThresh;
    float _detUnclipRatio = kDetUnclipRatio;
    bool _useDilation = false;
    // Classifier parameters
    float _clsThresh = kClsThresh;
    // Set by Initialize(); guards ocr() against use before setup.
    bool _initialized = false;
};
} // namespace onnxocr
} // namespace ANSCENTER