// File: ANSCORE/modules/ANSOCR/ANSONNXOCR/ONNXOCRDetector.cpp
// (356 lines, 14 KiB, C++)
#include "ONNXOCRDetector.h"
#include "include/clipper.h"
#include "ANSGpuFrameRegistry.h"
#include "NV12PreprocessHelper.h"
#include <opencv2/imgproc.hpp>
#include <iostream>
#include <algorithm>
#include <cmath>
#include <chrono>
namespace ANSCENTER {
namespace onnxocr {
// Constructs the DB text detector with default ORT handler options.
// Model loading and session creation are delegated to BasicOrtHandler.
ONNXOCRDetector::ONNXOCRDetector(const std::string& onnx_path, unsigned int num_threads)
: BasicOrtHandler(onnx_path, num_threads) {
}
// Constructs the DB text detector with explicit ORT handler options
// (e.g. execution-provider selection). Delegates to BasicOrtHandler.
ONNXOCRDetector::ONNXOCRDetector(const std::string& onnx_path,
const OrtHandlerOptions& options,
unsigned int num_threads)
: BasicOrtHandler(onnx_path, options, num_threads) {
}
// Satisfies the BasicOrtHandler pure-virtual interface. Detection itself goes
// through Preprocess() plus a manually created dynamic-shape tensor, so this
// path is not used by Detect().
Ort::Value ONNXOCRDetector::transform(const cv::Mat& mat) {
    cv::Mat floatImage;
    mat.convertTo(floatImage, CV_32FC3);
    const auto normalized = NormalizeAndPermute(floatImage);
    // Keep the tensor's backing storage alive in the handler-owned buffer.
    input_values_handler.assign(normalized.begin(), normalized.end());
    return Ort::Value::CreateTensor<float>(
        *memory_info_handler, input_values_handler.data(), input_values_handler.size(),
        input_node_dims.data(), input_node_dims.size());
}
// Unused for detection — images are processed one at a time with dynamic
// shapes. Delegates the first image to transform(); empty input yields a
// null tensor.
Ort::Value ONNXOCRDetector::transformBatch(const std::vector<cv::Mat>& images) {
    if (images.empty()) {
        return Ort::Value(nullptr);
    }
    return transform(images[0]);
}
// Runs DB text detection on srcImage and returns text boxes in srcImage
// (display-resolution) coordinates.
//
// If a full-resolution NV12 frame is registered for the current thread, it is
// converted to BGR and used for inference instead of srcImage; detected boxes
// are then scaled back down to srcImage coordinates.
//
// Parameters mirror PaddleOCR's DBPostProcess: dbThresh binarizes the
// probability map, boxThresh filters candidates by mean score, unclipRatio
// controls box expansion, useDilation enables the optional 2x2 dilation.
// Thread-safe: the whole pipeline runs under _mutex.
// Returns an empty vector if the session is missing, the image is empty, or
// the model output has an unexpected layout.
std::vector<TextBox> ONNXOCRDetector::Detect(const cv::Mat& srcImage,
                                             int maxSideLen,
                                             float dbThresh,
                                             float boxThresh,
                                             float unclipRatio,
                                             bool useDilation) {
    std::lock_guard<std::mutex> lock(_mutex);
    if (!ort_session || srcImage.empty()) {
        return {};
    }
    // Try to get full-resolution image from NV12 frame (same pattern as ANSONNXYOLO).
    cv::Mat inferenceImage = srcImage;
    float bgrScaleX = 1.0f, bgrScaleY = 1.0f;
    GpuFrameData* gpuFrame = tl_currentGpuFrame();
    // NOTE(review): pixelFormat 23 presumably identifies NV12 (matches FFmpeg's
    // AV_PIX_FMT_NV12) — confirm against ANSGpuFrameRegistry.
    if (gpuFrame && gpuFrame->pixelFormat == 23 &&
        gpuFrame->cpuYPlane && gpuFrame->cpuUvPlane &&
        gpuFrame->width > 0 && gpuFrame->height > 0) {
        // Full-res NV12 available — convert to BGR on CPU for ONNX.
        cv::Mat yPlane(gpuFrame->height, gpuFrame->width, CV_8UC1,
                       gpuFrame->cpuYPlane, gpuFrame->cpuYLinesize);
        cv::Mat uvPlane(gpuFrame->height / 2, gpuFrame->width, CV_8UC1,
                        gpuFrame->cpuUvPlane, gpuFrame->cpuUvLinesize);
        cv::Mat fullResBGR;
        cv::cvtColorTwoPlane(yPlane, uvPlane, fullResBGR, cv::COLOR_YUV2BGR_NV12);
        if (!fullResBGR.empty()) {
            // Scale factors that map full-res coordinates back to display-res.
            bgrScaleX = static_cast<float>(srcImage.cols) / fullResBGR.cols;
            bgrScaleY = static_cast<float>(srcImage.rows) / fullResBGR.rows;
            inferenceImage = fullResBGR;
        }
    }
    int resizeH = 0, resizeW = 0;
    float ratioH = 1.0f, ratioW = 1.0f;
    // Preprocess (using the full-res image if NV12 was available).
    auto inputData = Preprocess(inferenceImage, maxSideLen, resizeH, resizeW, ratioH, ratioW);
    // Create input tensor with dynamic shape [1, 3, H, W].
    std::array<int64_t, 4> inputShape = { 1, 3, resizeH, resizeW };
    Ort::Value inputTensor = Ort::Value::CreateTensor<float>(
        *memory_info_handler, inputData.data(), inputData.size(),
        inputShape.data(), inputShape.size());
    // Run inference.
    auto outputTensors = ort_session->Run(
        Ort::RunOptions{ nullptr },
        input_node_names.data(), &inputTensor, 1,
        output_node_names.data(), num_outputs);
    // Defensive: validate the output before indexing — an unexpected model or
    // output layout would otherwise read outputShape[2]/[3] out of bounds.
    if (outputTensors.empty()) {
        return {};
    }
    auto outputShape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape();
    if (outputShape.size() < 4) {
        return {};
    }
    float* outputData = outputTensors[0].GetTensorMutableData<float>();
    int outH = static_cast<int>(outputShape[2]);
    int outW = static_cast<int>(outputShape[3]);
    // Postprocess — detection coords are relative to inferenceImage (full-res),
    // then scaled back to srcImage (display-res) coordinates.
    auto boxes = Postprocess(outputData, outH, outW, ratioH, ratioW,
                             inferenceImage.rows, inferenceImage.cols,
                             dbThresh, boxThresh, unclipRatio, useDilation);
    // Rescale box coordinates from full-res to display-res.
    if (bgrScaleX != 1.0f || bgrScaleY != 1.0f) {
        for (auto& box : boxes) {
            for (auto& pt : box.points) {
                pt.x *= bgrScaleX;
                pt.y *= bgrScaleY;
            }
        }
    }
    return boxes;
}
// Resizes srcImage (bounded by maxSideLen via ComputeDetResizeShape), records
// both the resized dimensions and the src/resized ratios needed to map
// detections back, then normalizes the pixels into CHW float data.
std::vector<float> ONNXOCRDetector::Preprocess(const cv::Mat& srcImage, int maxSideLen,
                                               int& resizeH, int& resizeW,
                                               float& ratioH, float& ratioW) {
    const cv::Size target = ComputeDetResizeShape(srcImage.rows, srcImage.cols, maxSideLen);
    resizeW = target.width;
    resizeH = target.height;
    // Ratios are src/resized — multiplying map coordinates by these restores
    // source-image pixels.
    ratioH = static_cast<float>(srcImage.rows) / resizeH;
    ratioW = static_cast<float>(srcImage.cols) / resizeW;
    cv::Mat scaled;
    cv::resize(srcImage, scaled, target);
    scaled.convertTo(scaled, CV_32FC3);
    return NormalizeAndPermute(scaled);
}
// Matches PaddleOCR official DBPostProcess.boxes_from_bitmap flow.
// outputData: DB probability map (outH x outW floats).
// ratioH/ratioW: src/resized ratios from Preprocess, used to map box corners
// back to source-image pixels; srcH/srcW bound the clamped result.
// Returns score- and size-filtered boxes, ordered by SortTextBoxes.
std::vector<TextBox> ONNXOCRDetector::Postprocess(const float* outputData, int outH, int outW,
float ratioH, float ratioW,
int srcH, int srcW,
float dbThresh, float boxThresh,
float unclipRatio, bool useDilation) {
// Create probability map from output. Wraps the tensor buffer without
// copying; the const_cast is safe because probMap is only read below
// (threshold writes into binaryMap, not probMap).
cv::Mat probMap(outH, outW, CV_32FC1, const_cast<float*>(outputData));
// Binary threshold: pixels above dbThresh become 255 (text candidates).
cv::Mat binaryMap;
cv::threshold(probMap, binaryMap, dbThresh, 255, cv::THRESH_BINARY);
binaryMap.convertTo(binaryMap, CV_8UC1);
// Optional dilation (PaddleOCR default: use_dilation=False, kernel=[[1,1],[1,1]])
if (useDilation) {
cv::Mat kernel = cv::getStructuringElement(cv::MORPH_RECT, cv::Size(2, 2));
cv::dilate(binaryMap, binaryMap, kernel);
}
// Find contours — each contour is a candidate text region.
std::vector<std::vector<cv::Point>> contours;
cv::findContours(binaryMap, contours, cv::RETR_LIST, cv::CHAIN_APPROX_SIMPLE);
std::vector<TextBox> boxes;
// Cap the candidate count (kDetMaxCandidates ~ PaddleOCR's max_candidates).
int numContours = std::min(static_cast<int>(contours.size()), kDetMaxCandidates);
for (int i = 0; i < numContours; i++) {
if (contours[i].size() < 4) continue;
// Step 1: GetMiniBoxes - get ordered 4 corners of min area rect
cv::RotatedRect minRect = cv::minAreaRect(contours[i]);
// Drop degenerate boxes whose short side is under 3 px (PaddleOCR min_size).
float sside = std::min(minRect.size.width, minRect.size.height);
if (sside < 3.0f) continue;
auto ordered = GetMiniBoxes(minRect);
// Step 2: BoxScoreFast - compute mean prob inside the 4-point box polygon
float score = BoxScoreFast(probMap, ordered);
if (score < boxThresh) continue;
// Step 3: UnclipPolygon - expand the 4-point box
auto expanded = UnclipPolygon(ordered, unclipRatio);
if (expanded.size() < 4) continue;
// Step 4: Re-compute GetMiniBoxes on the expanded polygon
std::vector<cv::Point> expandedInt;
expandedInt.reserve(expanded.size());
for (auto& p : expanded) {
expandedInt.push_back(cv::Point(static_cast<int>(p.x), static_cast<int>(p.y)));
}
cv::RotatedRect expandedRect = cv::minAreaRect(expandedInt);
// Filter by min_size + 2 = 5 (matches PaddleOCR official)
float expandedSside = std::min(expandedRect.size.width, expandedRect.size.height);
if (expandedSside < 5.0f) continue;
auto expandedOrdered = GetMiniBoxes(expandedRect);
// Step 5: Scale to original image coordinates
// ratioW/ratioH undo the Preprocess resize; clamping keeps corners in-bounds.
TextBox box;
for (int j = 0; j < 4; j++) {
box.points[j].x = std::clamp(expandedOrdered[j].x * ratioW, 0.0f, static_cast<float>(srcW - 1));
box.points[j].y = std::clamp(expandedOrdered[j].y * ratioH, 0.0f, static_cast<float>(srcH - 1));
}
box.score = score;
boxes.push_back(box);
}
// Order boxes for downstream recognition — presumably reading order; the
// exact ordering/tie-breaking lives in SortTextBoxes (defined elsewhere).
SortTextBoxes(boxes);
return boxes;
}
// Matches PaddleOCR's get_mini_boxes: returns the rotated rect's corners
// ordered clockwise as [top-left, top-right, bottom-right, bottom-left].
std::array<cv::Point2f, 4> ONNXOCRDetector::GetMiniBoxes(const cv::RotatedRect& rect) {
    cv::Point2f corners[4];
    rect.points(corners);
    // Arrange by ascending x: the first two corners form the left pair,
    // the last two the right pair.
    std::sort(corners, corners + 4,
              [](const cv::Point2f& lhs, const cv::Point2f& rhs) { return lhs.x < rhs.x; });
    // Within each pair the smaller y is the top corner.
    const bool leftFlipped = corners[0].y > corners[1].y;
    const cv::Point2f topLeft = leftFlipped ? corners[1] : corners[0];
    const cv::Point2f bottomLeft = leftFlipped ? corners[0] : corners[1];
    const bool rightFlipped = corners[2].y > corners[3].y;
    const cv::Point2f topRight = rightFlipped ? corners[3] : corners[2];
    const cv::Point2f bottomRight = rightFlipped ? corners[2] : corners[3];
    return { topLeft, topRight, bottomRight, bottomLeft };
}
// Matches PaddleOCR's box_score_fast: the mean probability-map value inside
// the filled 4-point polygon, computed over the box's bounding ROI only.
float ONNXOCRDetector::BoxScoreFast(const cv::Mat& probMap,
                                    const std::array<cv::Point2f, 4>& box) {
    const int h = probMap.rows;
    const int w = probMap.cols;
    // Axis-aligned bounds of the quad.
    float minX = box[0].x, maxX = box[0].x;
    float minY = box[0].y, maxY = box[0].y;
    for (int i = 1; i < 4; i++) {
        minX = std::min(minX, box[i].x);
        maxX = std::max(maxX, box[i].x);
        minY = std::min(minY, box[i].y);
        maxY = std::max(maxY, box[i].y);
    }
    // Clamp to the probability map (matches PaddleOCR official).
    const int xmin = std::clamp(static_cast<int>(std::floor(minX)), 0, w - 1);
    const int xmax = std::clamp(static_cast<int>(std::ceil(maxX)), 0, w - 1);
    const int ymin = std::clamp(static_cast<int>(std::floor(minY)), 0, h - 1);
    const int ymax = std::clamp(static_cast<int>(std::ceil(maxY)), 0, h - 1);
    if (xmin >= xmax || ymin >= ymax) return 0.0f;
    // Rasterize the quad (shifted into ROI-local coordinates) as a 0/1 mask.
    cv::Mat mask = cv::Mat::zeros(ymax - ymin + 1, xmax - xmin + 1, CV_8UC1);
    std::vector<cv::Point> quad;
    quad.reserve(4);
    for (const auto& p : box) {
        quad.emplace_back(static_cast<int>(p.x) - xmin, static_cast<int>(p.y) - ymin);
    }
    std::vector<std::vector<cv::Point>> polys{ quad };
    cv::fillPoly(mask, polys, cv::Scalar(1));
    const cv::Rect roi(xmin, ymin, xmax - xmin + 1, ymax - ymin + 1);
    return static_cast<float>(cv::mean(probMap(roi), mask)[0]);
}
// Matches PaddleOCR's unclip: expands the 4-point box outward by
// area * unclipRatio / perimeter using Clipper's rounded-join offset.
// Returns an empty vector when the box is degenerate or the offset fails.
std::vector<cv::Point2f> ONNXOCRDetector::UnclipPolygon(const std::array<cv::Point2f, 4>& box,
                                                        float unclipRatio) {
    // Shoelace area (accumulated as twice the signed area) and edge-sum perimeter.
    float twiceSignedArea = 0.0f;
    float perimeter = 0.0f;
    for (int i = 0; i < 4; i++) {
        const cv::Point2f& a = box[i];
        const cv::Point2f& b = box[(i + 1) % 4];
        twiceSignedArea += a.x * b.y - b.x * a.y;
        const float dx = b.x - a.x;
        const float dy = b.y - a.y;
        perimeter += std::sqrt(dx * dx + dy * dy);
    }
    const float area = std::abs(twiceSignedArea) * 0.5f;
    if (perimeter < 1.0f) return {};
    const float distance = area * unclipRatio / perimeter;
    // Hand the quad to Clipper (integer coordinates) and offset it outward.
    ClipperLib::Path path;
    for (const auto& p : box) {
        path.push_back({ static_cast<ClipperLib::cInt>(p.x),
                         static_cast<ClipperLib::cInt>(p.y) });
    }
    ClipperLib::ClipperOffset offsetter;
    offsetter.AddPath(path, ClipperLib::jtRound, ClipperLib::etClosedPolygon);
    ClipperLib::Paths expanded;
    offsetter.Execute(expanded, distance);
    if (expanded.empty() || expanded[0].empty()) return {};
    std::vector<cv::Point2f> result;
    result.reserve(expanded[0].size());
    for (const auto& p : expanded[0]) {
        result.emplace_back(static_cast<float>(p.X), static_cast<float>(p.Y));
    }
    return result;
}
void ONNXOCRDetector::Warmup() {
std::lock_guard<std::mutex> lock(_mutex);
if (_warmedUp || !ort_session) return;
// 320x320 covers the typical license-plate ROI after LPD crop +
// multiple-of-32 rounding. cuDNN caches the algorithm for this
// shape so the first real inference doesn't pay the picker cost.
constexpr int kWarmupSide = 320;
try {
cv::Mat dummy(kWarmupSide, kWarmupSide, CV_8UC3, cv::Scalar(128, 128, 128));
cv::Mat dummyF;
dummy.convertTo(dummyF, CV_32FC3);
auto inputData = NormalizeAndPermute(dummyF);
std::array<int64_t, 4> inputShape = { 1, 3, kWarmupSide, kWarmupSide };
Ort::Value inputTensor = Ort::Value::CreateTensor<float>(
*memory_info_handler, inputData.data(), inputData.size(),
inputShape.data(), inputShape.size());
auto t0 = std::chrono::high_resolution_clock::now();
(void)ort_session->Run(
Ort::RunOptions{ nullptr },
input_node_names.data(), &inputTensor, 1,
output_node_names.data(), num_outputs);
auto t1 = std::chrono::high_resolution_clock::now();
double ms = std::chrono::duration<double, std::milli>(t1 - t0).count();
std::cout << "[ONNXOCRDetector] Warmup [1,3,"
<< kWarmupSide << "," << kWarmupSide << "] "
<< ms << " ms" << std::endl;
}
catch (const Ort::Exception& e) {
std::cerr << "[ONNXOCRDetector] Warmup failed: " << e.what() << std::endl;
}
_warmedUp = true;
}
} // namespace onnxocr
} // namespace ANSCENTER