// RTOCRDetector.cpp — DBNet-style text detection stage of the RT-OCR pipeline,
// running on TensorRT with OpenCV CUDA preprocessing and Clipper box unclipping.
#include "RTOCRDetector.h"
|
|
#include "include/clipper.h"
|
|
#include "NV12PreprocessHelper.h"
|
|
#include "ANSGpuFrameRegistry.h"
|
|
|
|
#include <cuda_runtime.h>
|
|
#include <opencv2/imgproc.hpp>
|
|
|
|
// NV12→BGR fused resize via NV12PreprocessHelper (linked from ANSODEngine.dll)
|
|
#include <opencv2/cudaimgproc.hpp>
|
|
#include <opencv2/cudawarping.hpp>
|
|
#include <opencv2/cudaarithm.hpp>
|
|
#include <iostream>
|
|
#include <algorithm>
|
|
#include <cmath>
|
|
|
|
namespace ANSCENTER {
|
|
namespace rtocr {
|
|
|
|
bool RTOCRDetector::Initialize(const std::string& onnxPath, int gpuId,
|
|
const std::string& engineCacheDir,
|
|
int maxSideLen) {
|
|
// Engine cache directory
|
|
std::string cacheDir;
|
|
if (!engineCacheDir.empty()) {
|
|
cacheDir = engineCacheDir;
|
|
} else {
|
|
auto pos = onnxPath.find_last_of("/\\");
|
|
cacheDir = (pos != std::string::npos) ? onnxPath.substr(0, pos) : ".";
|
|
}
|
|
|
|
try {
|
|
ANSCENTER::Options options;
|
|
options.deviceIndex = gpuId;
|
|
// FP32 required for detection: this CNN (DBNet) produces NaN in FP16.
|
|
// The model has 142 Convolution + 87 Scale (fused BatchNorm) layers whose
|
|
// intermediate values overflow FP16 range (65504). Mixed precision
|
|
// (forcing only Sigmoid/Softmax to FP32) is insufficient because the NaN
|
|
// originates deep in the conv->scale->relu backbone before reaching those layers.
|
|
// Classifier and recognizer remain FP16 with mixed precision -- only the
|
|
// detector needs full FP32.
|
|
options.precision = ANSCENTER::Precision::FP32;
|
|
options.maxBatchSize = 1;
|
|
options.optBatchSize = 1;
|
|
|
|
// Dynamic spatial dimensions for detection (multiples of 32)
|
|
options.minInputHeight = 32;
|
|
options.minInputWidth = 32;
|
|
options.optInputHeight = std::min(640, maxSideLen);
|
|
options.optInputWidth = std::min(640, maxSideLen);
|
|
options.maxInputHeight = maxSideLen;
|
|
options.maxInputWidth = maxSideLen;
|
|
options.engineFileDir = cacheDir;
|
|
|
|
m_poolKey = { onnxPath,
|
|
static_cast<int>(options.precision),
|
|
options.maxBatchSize };
|
|
m_engine = EnginePoolManager<float>::instance().acquire(
|
|
m_poolKey, options, onnxPath,
|
|
kDetSubVals, kDetDivVals, true, -1);
|
|
m_usingSharedPool = (m_engine != nullptr);
|
|
|
|
if (!m_engine) {
|
|
std::cerr << "[RTOCRDetector] Failed to build/load TRT engine for: "
|
|
<< onnxPath << std::endl;
|
|
return false;
|
|
}
|
|
|
|
// Query actual profile max from the loaded engine
|
|
int profMaxH = m_engine->getProfileMaxHeight();
|
|
int profMaxW = m_engine->getProfileMaxWidth();
|
|
if (profMaxH > 0 && profMaxW > 0) {
|
|
m_engineMaxSideLen = std::min(profMaxH, profMaxW);
|
|
} else {
|
|
m_engineMaxSideLen = maxSideLen;
|
|
}
|
|
|
|
if (m_engineMaxSideLen < maxSideLen) {
|
|
std::cout << "[RTOCRDetector] Engine built with max " << m_engineMaxSideLen
|
|
<< "x" << m_engineMaxSideLen << " (requested " << maxSideLen
|
|
<< " exceeded GPU capacity)" << std::endl;
|
|
}
|
|
std::cout << "[RTOCRDetector] Initialized TRT engine from: " << onnxPath << std::endl;
|
|
return true;
|
|
}
|
|
catch (const std::exception& e) {
|
|
std::cerr << "[RTOCRDetector] Initialize failed: " << e.what() << std::endl;
|
|
return false;
|
|
}
|
|
}
|
|
|
|
/// Runs DBNet text detection on one frame and returns text boxes in
/// original-image coordinates, sorted via SortTextBoxes.
///
/// @param image       Input BGR frame (CPU). Also defines the output coordinate
///                    space unless the NV12 fast path substitutes its own dims.
/// @param maxSideLen  Max spatial side for this call (clamped to engine max).
/// @param dbThresh    Probability-map binarization threshold.
/// @param boxThresh   Minimum mean probability inside a box to keep it.
/// @param unclipRatio Expansion ratio for UnclipPolygon.
/// @param useDilation If true, dilate the binary map with a 2x2 kernel.
/// @return Detected boxes; empty on error or if nothing passes the thresholds.
///
/// Thread-safe: the whole call is serialized on _mutex.
std::vector<TextBox> RTOCRDetector::Detect(const cv::Mat& image,
                                           int maxSideLen, float dbThresh,
                                           float boxThresh, float unclipRatio,
                                           bool useDilation) {
    std::lock_guard<std::mutex> lock(_mutex);

    if (!m_engine || image.empty()) return {};

    try {
        // Single-pass detection: resize the full image to fit within
        // the engine's max spatial dimension (same approach as ONNX version).
        int effectiveMaxSide = std::min(maxSideLen, m_engineMaxSideLen);

        // 1. Compute resize dimensions (multiples of 32)
        cv::Size resizeShape = ComputeDetResizeShape(image.rows, image.cols, effectiveMaxSide);
        int newH = resizeShape.height;
        int newW = resizeShape.width;

        // Ratios map detection-output coordinates back to input coordinates.
        // They may be recomputed below if the NV12 fast path is taken.
        float ratioH = static_cast<float>(image.rows) / newH;
        float ratioW = static_cast<float>(image.cols) / newW;

        // 2. Upload to GPU and resize — try NV12 fast path first
        cv::cuda::GpuMat gpuResized;
        bool usedNV12 = false;

        // Thread-local current frame; non-null only when the caller registered
        // a GPU frame for this thread.
        GpuFrameData* gpuFrame = tl_currentGpuFrame();
        // pixelFormat == 23 — presumably FFmpeg's AV_PIX_FMT_NV12; TODO confirm
        // against the producer of GpuFrameData.
        if (gpuFrame && gpuFrame->pixelFormat == 23 &&
            gpuFrame->cpuYPlane && gpuFrame->cpuUvPlane &&
            gpuFrame->width > 0 && gpuFrame->height > 0) {
            // NV12 fast path: fused NV12→BGR+resize kernel (1 kernel launch)
            // instead of CPU BGR upload (24MB) + separate resize
            int fW = gpuFrame->width;
            int fH = gpuFrame->height;
            int gpuIdx = m_engine ? m_engine->getOptions().deviceIndex : 0;

            // Get NV12 Y/UV pointers on GPU (from cache or fresh upload)
            const uint8_t* devY = nullptr;
            const uint8_t* devUV = nullptr;
            int yPitch = 0, uvPitch = 0;
            {
                // Registry lock guards the per-frame GPU cache fields; held only
                // for the cache lookup/upload, released before kernel launch.
                auto regLock = ANSGpuFrameRegistry::instance().acquire_lock();
                if (gpuFrame->gpuCacheValid && gpuFrame->gpuCacheDeviceIdx == gpuIdx) {
                    // Cache hit — reuse previously uploaded planes.
                    devY = static_cast<const uint8_t*>(gpuFrame->gpuCacheY);
                    devUV = static_cast<const uint8_t*>(gpuFrame->gpuCacheUV);
                    yPitch = static_cast<int>(gpuFrame->gpuCacheYPitch);
                    uvPitch = static_cast<int>(gpuFrame->gpuCacheUVPitch);
                } else if (!gpuFrame->gpuCacheValid) {
                    // Cache miss — upload CPU NV12 to GPU.
                    // (A valid cache on a *different* device falls through to the
                    // BGR fallback rather than re-uploading.)
                    size_t yBytes = static_cast<size_t>(fH) * gpuFrame->cpuYLinesize;
                    // UV plane is half-height in NV12 (interleaved U/V rows).
                    size_t uvBytes = static_cast<size_t>(fH / 2) * gpuFrame->cpuUvLinesize;

                    auto& reg = ANSGpuFrameRegistry::instance();
                    // Budget check so frame caches cannot exhaust GPU memory.
                    if (reg.canAllocateGpuCache(yBytes + uvBytes)) {
                        // NOTE(review): cudaMalloc/cudaMemcpy return codes are not
                        // checked; on allocation failure devY/devUV would be cached
                        // as null — confirm this path cannot be hit in practice.
                        cudaMalloc(&gpuFrame->gpuCacheY, yBytes);
                        cudaMalloc(&gpuFrame->gpuCacheUV, uvBytes);
                        cudaMemcpy(gpuFrame->gpuCacheY, gpuFrame->cpuYPlane, yBytes, cudaMemcpyHostToDevice);
                        cudaMemcpy(gpuFrame->gpuCacheUV, gpuFrame->cpuUvPlane, uvBytes, cudaMemcpyHostToDevice);
                        gpuFrame->gpuCacheValid = true;
                        gpuFrame->gpuCacheDeviceIdx = gpuIdx;
                        gpuFrame->gpuCacheYPitch = static_cast<size_t>(gpuFrame->cpuYLinesize);
                        gpuFrame->gpuCacheUVPitch = static_cast<size_t>(gpuFrame->cpuUvLinesize);
                        gpuFrame->gpuCacheBytes = yBytes + uvBytes;
                        // Inform the registry so it can account the cache bytes.
                        reg.onGpuCacheCreated(yBytes + uvBytes);

                        devY = static_cast<const uint8_t*>(gpuFrame->gpuCacheY);
                        devUV = static_cast<const uint8_t*>(gpuFrame->gpuCacheUV);
                        yPitch = gpuFrame->cpuYLinesize;
                        uvPitch = gpuFrame->cpuUvLinesize;
                    }
                }
            } // release registry lock before GPU kernel

            if (devY && devUV) {
                // Single fused kernel: NV12→BGR + bilinear resize (1 launch, 1 output alloc)
                gpuResized.create(newH, newW, CV_8UC3);
                NV12PreprocessHelper::nv12ToBGRResize(
                    devY, yPitch, devUV, uvPitch,
                    gpuResized.ptr<uint8_t>(), static_cast<int>(gpuResized.step),
                    newW, newH, fW, fH);
                usedNV12 = true;

                // Update ratios to map from full-res NV12 to detection output
                // (NV12 frame dims may differ from `image` dims).
                ratioH = static_cast<float>(fH) / newH;
                ratioW = static_cast<float>(fW) / newW;
            }
        }

        if (!usedNV12) {
            // Fallback: standard BGR upload of the CPU image + GPU resize.
            cv::cuda::GpuMat gpuImg;
            gpuImg.upload(image);
            cv::cuda::resize(gpuImg, gpuResized, resizeShape);
        }

        // Keep BGR order (PaddleOCR official does NOT convert BGR->RGB)

        // 3. Run inference (single image, single batch).
        std::vector<std::vector<cv::cuda::GpuMat>> inputs = { { gpuResized } };
        std::vector<std::vector<std::vector<float>>> featureVectors;

        if (!m_engine->runInference(inputs, featureVectors)) {
            std::cerr << "[RTOCRDetector] Inference failed" << std::endl;
            return {};
        }

        if (featureVectors.empty() || featureVectors[0].empty()) return {};

        // 4. Reshape output to probability map [H, W]
        std::vector<float>& output = featureVectors[0][0];
        int outputSize = static_cast<int>(output.size());

        // Guard against a truncated/garbage output buffer before wrapping it.
        if (outputSize < newH * newW) {
            std::cerr << "[RTOCRDetector] Output too small: expected at least "
                      << newH * newW << " got " << outputSize << std::endl;
            return {};
        }

        // Wraps the output buffer without copying; `output` must outlive `bitmap`
        // (it does — both are scoped to this try block).
        cv::Mat bitmap(newH, newW, CV_32FC1, output.data());

        // 5. Threshold to binary (matches ONNX/PaddleOCR official order)
        cv::Mat binaryMap;
        cv::threshold(bitmap, binaryMap, dbThresh, 255, cv::THRESH_BINARY);
        binaryMap.convertTo(binaryMap, CV_8UC1);

        // 6. Apply dilation if requested (on binaryMap, matching ONNX version)
        if (useDilation) {
            cv::Mat kernel = cv::getStructuringElement(cv::MORPH_RECT, cv::Size(2, 2));
            cv::dilate(binaryMap, binaryMap, kernel);
        }

        // 7. Find contours and build text boxes
        // (matches ONNX/PaddleOCR official DBPostProcess.boxes_from_bitmap flow exactly)
        std::vector<std::vector<cv::Point>> contours;
        cv::findContours(binaryMap, contours, cv::RETR_LIST, cv::CHAIN_APPROX_SIMPLE);

        // Cap candidate count to bound post-processing cost.
        int numCandidates = std::min(static_cast<int>(contours.size()), kDetMaxCandidates);
        std::vector<TextBox> boxes;

        for (int i = 0; i < numCandidates; i++) {
            if (contours[i].size() < 4) continue;

            // Step 1: GetMiniBoxes - get ordered 4 corners of min area rect
            cv::RotatedRect minRect = cv::minAreaRect(contours[i]);
            // Drop degenerate regions (shorter side < 3px before unclip).
            float sside = std::min(minRect.size.width, minRect.size.height);
            if (sside < 3.0f) continue;

            auto ordered = GetMiniBoxes(minRect);

            // Step 2: BoxScoreFast - compute mean prob inside the 4-point box polygon
            float score = BoxScoreFast(bitmap, ordered);
            if (score < boxThresh) continue;

            // Step 3: UnclipPolygon - expand the 4-point box
            auto expanded = UnclipPolygon(ordered, unclipRatio);
            if (expanded.size() < 4) continue;

            // Step 4: Re-compute GetMiniBoxes on the expanded polygon
            std::vector<cv::Point> expandedInt;
            expandedInt.reserve(expanded.size());
            for (auto& p : expanded) {
                expandedInt.push_back(cv::Point(static_cast<int>(p.x), static_cast<int>(p.y)));
            }
            cv::RotatedRect expandedRect = cv::minAreaRect(expandedInt);

            // Filter by min_size + 2 = 5 (matches PaddleOCR official)
            float expandedSside = std::min(expandedRect.size.width, expandedRect.size.height);
            if (expandedSside < 5.0f) continue;

            auto expandedOrdered = GetMiniBoxes(expandedRect);

            // Step 5: Scale to original image coordinates (clamped to bounds).
            TextBox box;
            for (int j = 0; j < 4; j++) {
                box.points[j].x = std::clamp(expandedOrdered[j].x * ratioW, 0.0f, static_cast<float>(image.cols - 1));
                box.points[j].y = std::clamp(expandedOrdered[j].y * ratioH, 0.0f, static_cast<float>(image.rows - 1));
            }
            box.score = score;
            boxes.push_back(box);
        }

        SortTextBoxes(boxes);
        return boxes;
    }
    catch (const std::exception& e) {
        std::cerr << "[RTOCRDetector] Detect failed: " << e.what() << std::endl;
        return {};
    }
}
|
|
|
|
// Matches PaddleOCR official GetMiniBoxes: sort by X, assign TL/TR/BL/BR by Y
|
|
std::array<cv::Point2f, 4> RTOCRDetector::GetMiniBoxes(const cv::RotatedRect& rect) {
|
|
cv::Point2f vertices[4];
|
|
rect.points(vertices);
|
|
|
|
// Sort all 4 points by x-coordinate ascending
|
|
std::sort(vertices, vertices + 4,
|
|
[](const cv::Point2f& a, const cv::Point2f& b) { return a.x < b.x; });
|
|
|
|
// Left two (indices 0,1): smaller y = top-left, larger y = bottom-left
|
|
cv::Point2f topLeft, bottomLeft;
|
|
if (vertices[0].y <= vertices[1].y) {
|
|
topLeft = vertices[0];
|
|
bottomLeft = vertices[1];
|
|
} else {
|
|
topLeft = vertices[1];
|
|
bottomLeft = vertices[0];
|
|
}
|
|
|
|
// Right two (indices 2,3): smaller y = top-right, larger y = bottom-right
|
|
cv::Point2f topRight, bottomRight;
|
|
if (vertices[2].y <= vertices[3].y) {
|
|
topRight = vertices[2];
|
|
bottomRight = vertices[3];
|
|
} else {
|
|
topRight = vertices[3];
|
|
bottomRight = vertices[2];
|
|
}
|
|
|
|
// Order: [TL, TR, BR, BL] (clockwise from top-left)
|
|
return { topLeft, topRight, bottomRight, bottomLeft };
|
|
}
|
|
|
|
// Matches PaddleOCR official box_score_fast: mean prob value inside the 4-point polygon
|
|
float RTOCRDetector::BoxScoreFast(const cv::Mat& probMap,
|
|
const std::array<cv::Point2f, 4>& box) {
|
|
int h = probMap.rows;
|
|
int w = probMap.cols;
|
|
|
|
// Get bounding rectangle with proper clamping (matches PaddleOCR official)
|
|
float minX = std::min({box[0].x, box[1].x, box[2].x, box[3].x});
|
|
float maxX = std::max({box[0].x, box[1].x, box[2].x, box[3].x});
|
|
float minY = std::min({box[0].y, box[1].y, box[2].y, box[3].y});
|
|
float maxY = std::max({box[0].y, box[1].y, box[2].y, box[3].y});
|
|
|
|
int xmin = std::clamp(static_cast<int>(std::floor(minX)), 0, w - 1);
|
|
int xmax = std::clamp(static_cast<int>(std::ceil(maxX)), 0, w - 1);
|
|
int ymin = std::clamp(static_cast<int>(std::floor(minY)), 0, h - 1);
|
|
int ymax = std::clamp(static_cast<int>(std::ceil(maxY)), 0, h - 1);
|
|
|
|
if (xmin >= xmax || ymin >= ymax) return 0.0f;
|
|
|
|
cv::Mat mask = cv::Mat::zeros(ymax - ymin + 1, xmax - xmin + 1, CV_8UC1);
|
|
|
|
std::vector<cv::Point> pts(4);
|
|
for (int j = 0; j < 4; j++) {
|
|
pts[j] = cv::Point(static_cast<int>(box[j].x) - xmin,
|
|
static_cast<int>(box[j].y) - ymin);
|
|
}
|
|
std::vector<std::vector<cv::Point>> polys = { pts };
|
|
cv::fillPoly(mask, polys, cv::Scalar(1));
|
|
|
|
cv::Mat roiMap = probMap(cv::Rect(xmin, ymin, xmax - xmin + 1, ymax - ymin + 1));
|
|
return static_cast<float>(cv::mean(roiMap, mask)[0]);
|
|
}
|
|
|
|
// Matches PaddleOCR official unclip: expand 4-point box using Clipper with JT_ROUND
|
|
// Uses integer coordinates for Clipper (matching PaddleOCR/ONNX version exactly)
|
|
std::vector<cv::Point2f> RTOCRDetector::UnclipPolygon(const std::array<cv::Point2f, 4>& box,
|
|
float unclipRatio) {
|
|
// Compute area using Shoelace formula and perimeter
|
|
float area = 0.0f;
|
|
float perimeter = 0.0f;
|
|
for (int i = 0; i < 4; i++) {
|
|
int j = (i + 1) % 4;
|
|
area += box[i].x * box[j].y - box[j].x * box[i].y;
|
|
float dx = box[j].x - box[i].x;
|
|
float dy = box[j].y - box[i].y;
|
|
perimeter += std::sqrt(dx * dx + dy * dy);
|
|
}
|
|
area = std::abs(area) * 0.5f;
|
|
if (perimeter < 1.0f) return {};
|
|
|
|
float distance = area * unclipRatio / perimeter;
|
|
|
|
ClipperLib::Path clipperPath;
|
|
for (int i = 0; i < 4; i++) {
|
|
clipperPath.push_back({ static_cast<ClipperLib::cInt>(box[i].x),
|
|
static_cast<ClipperLib::cInt>(box[i].y) });
|
|
}
|
|
|
|
ClipperLib::ClipperOffset offset;
|
|
offset.AddPath(clipperPath, ClipperLib::jtRound, ClipperLib::etClosedPolygon);
|
|
|
|
ClipperLib::Paths solution;
|
|
offset.Execute(solution, distance);
|
|
|
|
if (solution.empty() || solution[0].empty()) return {};
|
|
|
|
std::vector<cv::Point2f> result;
|
|
for (auto& p : solution[0]) {
|
|
result.push_back(cv::Point2f(static_cast<float>(p.X), static_cast<float>(p.Y)));
|
|
}
|
|
return result;
|
|
}
|
|
|
|
/// Releases the engine. A pool-acquired engine is returned to the shared pool
/// first so its refcount is decremented before our handle is dropped.
RTOCRDetector::~RTOCRDetector() {
    try {
        if (m_usingSharedPool) {
            EnginePoolManager<float>::instance().release(m_poolKey);
            m_usingSharedPool = false;
        }
        // reset() is a no-op on an empty handle, so this covers both paths.
        m_engine.reset();
    }
    catch (...) {
        // Destructors must never throw.
    }
}
|
|
|
|
} // namespace rtocr
|
|
} // namespace ANSCENTER
|