// ANSCORE/modules/ANSOCR/ANSRTOCR/RTOCRDetector.cpp
#include "RTOCRDetector.h"
#include "include/clipper.h"
#include "NV12PreprocessHelper.h"
#include "ANSGpuFrameRegistry.h"
#include <cuda_runtime.h>
#include <opencv2/imgproc.hpp>
// NV12→BGR fused resize via NV12PreprocessHelper (linked from ANSODEngine.dll)
#include <opencv2/cudaimgproc.hpp>
#include <opencv2/cudawarping.hpp>
#include <opencv2/cudaarithm.hpp>
#include <iostream>
#include <algorithm>
#include <cmath>
namespace ANSCENTER {
namespace rtocr {
bool RTOCRDetector::Initialize(const std::string& onnxPath, int gpuId,
const std::string& engineCacheDir,
int maxSideLen) {
// Engine cache directory
std::string cacheDir;
if (!engineCacheDir.empty()) {
cacheDir = engineCacheDir;
} else {
auto pos = onnxPath.find_last_of("/\\");
cacheDir = (pos != std::string::npos) ? onnxPath.substr(0, pos) : ".";
}
try {
ANSCENTER::Options options;
options.deviceIndex = gpuId;
// FP32 required for detection: this CNN (DBNet) produces NaN in FP16.
// The model has 142 Convolution + 87 Scale (fused BatchNorm) layers whose
// intermediate values overflow FP16 range (65504). Mixed precision
// (forcing only Sigmoid/Softmax to FP32) is insufficient because the NaN
// originates deep in the conv->scale->relu backbone before reaching those layers.
// Classifier and recognizer remain FP16 with mixed precision -- only the
// detector needs full FP32.
options.precision = ANSCENTER::Precision::FP32;
options.maxBatchSize = 1;
options.optBatchSize = 1;
// Dynamic spatial dimensions for detection (multiples of 32)
options.minInputHeight = 32;
options.minInputWidth = 32;
options.optInputHeight = std::min(640, maxSideLen);
options.optInputWidth = std::min(640, maxSideLen);
options.maxInputHeight = maxSideLen;
options.maxInputWidth = maxSideLen;
options.engineFileDir = cacheDir;
m_poolKey = { onnxPath,
static_cast<int>(options.precision),
options.maxBatchSize };
m_engine = EnginePoolManager<float>::instance().acquire(
m_poolKey, options, onnxPath,
kDetSubVals, kDetDivVals, true, getPoolMaxSlotsPerGpu());
2026-03-28 16:54:11 +11:00
m_usingSharedPool = (m_engine != nullptr);
if (!m_engine) {
std::cerr << "[RTOCRDetector] Failed to build/load TRT engine for: "
<< onnxPath << std::endl;
return false;
}
// Query actual profile max from the loaded engine
int profMaxH = m_engine->getProfileMaxHeight();
int profMaxW = m_engine->getProfileMaxWidth();
if (profMaxH > 0 && profMaxW > 0) {
m_engineMaxSideLen = std::min(profMaxH, profMaxW);
} else {
m_engineMaxSideLen = maxSideLen;
}
if (m_engineMaxSideLen < maxSideLen) {
std::cout << "[RTOCRDetector] Engine built with max " << m_engineMaxSideLen
<< "x" << m_engineMaxSideLen << " (requested " << maxSideLen
<< " exceeded GPU capacity)" << std::endl;
}
std::cout << "[RTOCRDetector] Initialized TRT engine from: " << onnxPath << std::endl;
return true;
}
catch (const std::exception& e) {
std::cerr << "[RTOCRDetector] Initialize failed: " << e.what() << std::endl;
return false;
}
}
/// Runs DBNet text detection on one frame and returns quadrilateral text boxes
/// in original-image coordinates.
///
/// @param image       BGR input image (CPU). May be bypassed by the NV12 fast path.
/// @param maxSideLen  Max detection side length; clamped to the engine profile max.
/// @param dbThresh    Binarization threshold on the probability map.
/// @param boxThresh   Minimum mean probability inside a candidate box.
/// @param unclipRatio Expansion ratio for the unclip step.
/// @param useDilation Apply a 2x2 dilation to the binary map before contouring.
/// @return Detected boxes sorted via SortTextBoxes(); empty on any failure.
std::vector<TextBox> RTOCRDetector::Detect(const cv::Mat& image,
                                           int maxSideLen, float dbThresh,
                                           float boxThresh, float unclipRatio,
                                           bool useDilation) {
    std::lock_guard<std::mutex> lock(_mutex);
    if (!m_engine || image.empty()) return {};
    try {
        // Single-pass detection: resize the full image to fit within
        // the engine's max spatial dimension (same approach as ONNX version).
        int effectiveMaxSide = std::min(maxSideLen, m_engineMaxSideLen);
        // 1. Compute resize dimensions (multiples of 32)
        cv::Size resizeShape = ComputeDetResizeShape(image.rows, image.cols, effectiveMaxSide);
        int newH = resizeShape.height;
        int newW = resizeShape.width;
        float ratioH = static_cast<float>(image.rows) / newH;
        float ratioW = static_cast<float>(image.cols) / newW;
        // 2. Upload to GPU and resize — try NV12 fast path first
        cv::cuda::GpuMat gpuResized;
        bool usedNV12 = false;
        GpuFrameData* gpuFrame = tl_currentGpuFrame();
        if (gpuFrame && gpuFrame->pixelFormat == 23 &&
            gpuFrame->cpuYPlane && gpuFrame->cpuUvPlane &&
            gpuFrame->width > 0 && gpuFrame->height > 0) {
            // NV12 fast path: fused NV12→BGR+resize kernel (1 kernel launch)
            // instead of CPU BGR upload (24MB) + separate resize
            int fW = gpuFrame->width;
            int fH = gpuFrame->height;
            int gpuIdx = m_engine ? m_engine->getOptions().deviceIndex : 0;
            // Get NV12 Y/UV pointers on GPU (from cache or fresh upload)
            const uint8_t* devY = nullptr;
            const uint8_t* devUV = nullptr;
            int yPitch = 0, uvPitch = 0;
            {
                auto regLock = ANSGpuFrameRegistry::instance().acquire_lock();
                if (gpuFrame->gpuCacheValid && gpuFrame->gpuCacheDeviceIdx == gpuIdx) {
                    // Cache hit
                    devY = static_cast<const uint8_t*>(gpuFrame->gpuCacheY);
                    devUV = static_cast<const uint8_t*>(gpuFrame->gpuCacheUV);
                    yPitch = static_cast<int>(gpuFrame->gpuCacheYPitch);
                    uvPitch = static_cast<int>(gpuFrame->gpuCacheUVPitch);
                } else if (!gpuFrame->gpuCacheValid) {
                    // Cache miss — upload CPU NV12 to GPU
                    size_t yBytes = static_cast<size_t>(fH) * gpuFrame->cpuYLinesize;
                    size_t uvBytes = static_cast<size_t>(fH / 2) * gpuFrame->cpuUvLinesize;
                    auto& reg = ANSGpuFrameRegistry::instance();
                    if (reg.canAllocateGpuCache(yBytes + uvBytes)) {
                        // BUGFIX: previously the cudaMalloc/cudaMemcpy return
                        // codes were ignored — a failed allocation would still
                        // mark the cache valid and copy through an invalid
                        // pointer. Check each step and roll back on failure so
                        // we fall through to the CPU resize path instead.
                        gpuFrame->gpuCacheY = nullptr;
                        gpuFrame->gpuCacheUV = nullptr;
                        cudaError_t rc = cudaMalloc(&gpuFrame->gpuCacheY, yBytes);
                        if (rc == cudaSuccess) {
                            rc = cudaMalloc(&gpuFrame->gpuCacheUV, uvBytes);
                        }
                        if (rc == cudaSuccess) {
                            rc = cudaMemcpy(gpuFrame->gpuCacheY, gpuFrame->cpuYPlane, yBytes, cudaMemcpyHostToDevice);
                        }
                        if (rc == cudaSuccess) {
                            rc = cudaMemcpy(gpuFrame->gpuCacheUV, gpuFrame->cpuUvPlane, uvBytes, cudaMemcpyHostToDevice);
                        }
                        if (rc == cudaSuccess) {
                            gpuFrame->gpuCacheValid = true;
                            gpuFrame->gpuCacheDeviceIdx = gpuIdx;
                            gpuFrame->gpuCacheYPitch = static_cast<size_t>(gpuFrame->cpuYLinesize);
                            gpuFrame->gpuCacheUVPitch = static_cast<size_t>(gpuFrame->cpuUvLinesize);
                            gpuFrame->gpuCacheBytes = yBytes + uvBytes;
                            reg.onGpuCacheCreated(yBytes + uvBytes);
                            devY = static_cast<const uint8_t*>(gpuFrame->gpuCacheY);
                            devUV = static_cast<const uint8_t*>(gpuFrame->gpuCacheUV);
                            yPitch = gpuFrame->cpuYLinesize;
                            uvPitch = gpuFrame->cpuUvLinesize;
                        } else {
                            // Roll back any partial allocation; leave the cache invalid.
                            if (gpuFrame->gpuCacheY) cudaFree(gpuFrame->gpuCacheY);
                            if (gpuFrame->gpuCacheUV) cudaFree(gpuFrame->gpuCacheUV);
                            gpuFrame->gpuCacheY = nullptr;
                            gpuFrame->gpuCacheUV = nullptr;
                            std::cerr << "[RTOCRDetector] NV12 GPU cache upload failed: "
                                      << cudaGetErrorString(rc) << std::endl;
                        }
                    }
                }
            } // release registry lock before GPU kernel
            if (devY && devUV) {
                // Single fused kernel: NV12→BGR + bilinear resize (1 launch, 1 output alloc)
                gpuResized.create(newH, newW, CV_8UC3);
                NV12PreprocessHelper::nv12ToBGRResize(
                    devY, yPitch, devUV, uvPitch,
                    gpuResized.ptr<uint8_t>(), static_cast<int>(gpuResized.step),
                    newW, newH, fW, fH);
                usedNV12 = true;
                // Update ratios to map from full-res NV12 to detection output
                ratioH = static_cast<float>(fH) / newH;
                ratioW = static_cast<float>(fW) / newW;
            }
        }
        if (!usedNV12) {
            // Fallback: CPU resize then upload small image to GPU
            cv::Mat cpuResized;
            cv::resize(image, cpuResized, resizeShape, 0, 0, cv::INTER_LINEAR);
            gpuResized.upload(cpuResized);
        }
        // Keep BGR order (PaddleOCR official does NOT convert BGR->RGB)
        // 3. Run inference
        std::vector<std::vector<cv::cuda::GpuMat>> inputs = { { gpuResized } };
        std::vector<std::vector<std::vector<float>>> featureVectors;
        if (!m_engine->runInference(inputs, featureVectors)) {
            std::cerr << "[RTOCRDetector] Inference failed" << std::endl;
            return {};
        }
        if (featureVectors.empty() || featureVectors[0].empty()) return {};
        // 4. Reshape output to probability map [H, W]
        std::vector<float>& output = featureVectors[0][0];
        int outputSize = static_cast<int>(output.size());
        if (outputSize < newH * newW) {
            std::cerr << "[RTOCRDetector] Output too small: expected at least "
                      << newH * newW << " got " << outputSize << std::endl;
            return {};
        }
        cv::Mat bitmap(newH, newW, CV_32FC1, output.data());
        // 5. Threshold to binary (matches ONNX/PaddleOCR official order)
        cv::Mat binaryMap;
        cv::threshold(bitmap, binaryMap, dbThresh, 255, cv::THRESH_BINARY);
        binaryMap.convertTo(binaryMap, CV_8UC1);
        // 6. Apply dilation if requested (on binaryMap, matching ONNX version)
        if (useDilation) {
            cv::Mat kernel = cv::getStructuringElement(cv::MORPH_RECT, cv::Size(2, 2));
            cv::dilate(binaryMap, binaryMap, kernel);
        }
        // 7. Find contours and build text boxes
        // (matches ONNX/PaddleOCR official DBPostProcess.boxes_from_bitmap flow exactly)
        std::vector<std::vector<cv::Point>> contours;
        cv::findContours(binaryMap, contours, cv::RETR_LIST, cv::CHAIN_APPROX_SIMPLE);
        int numCandidates = std::min(static_cast<int>(contours.size()), kDetMaxCandidates);
        std::vector<TextBox> boxes;
        for (int i = 0; i < numCandidates; i++) {
            if (contours[i].size() < 4) continue;
            // Step 1: GetMiniBoxes - get ordered 4 corners of min area rect
            cv::RotatedRect minRect = cv::minAreaRect(contours[i]);
            float sside = std::min(minRect.size.width, minRect.size.height);
            if (sside < 3.0f) continue;
            auto ordered = GetMiniBoxes(minRect);
            // Step 2: BoxScoreFast - compute mean prob inside the 4-point box polygon
            float score = BoxScoreFast(bitmap, ordered);
            if (score < boxThresh) continue;
            // Step 3: UnclipPolygon - expand the 4-point box
            auto expanded = UnclipPolygon(ordered, unclipRatio);
            if (expanded.size() < 4) continue;
            // Step 4: Re-compute GetMiniBoxes on the expanded polygon
            std::vector<cv::Point> expandedInt;
            expandedInt.reserve(expanded.size());
            for (auto& p : expanded) {
                expandedInt.push_back(cv::Point(static_cast<int>(p.x), static_cast<int>(p.y)));
            }
            cv::RotatedRect expandedRect = cv::minAreaRect(expandedInt);
            // Filter by min_size + 2 = 5 (matches PaddleOCR official)
            float expandedSside = std::min(expandedRect.size.width, expandedRect.size.height);
            if (expandedSside < 5.0f) continue;
            auto expandedOrdered = GetMiniBoxes(expandedRect);
            // Step 5: Scale to original image coordinates
            TextBox box;
            for (int j = 0; j < 4; j++) {
                box.points[j].x = std::clamp(expandedOrdered[j].x * ratioW, 0.0f, static_cast<float>(image.cols - 1));
                box.points[j].y = std::clamp(expandedOrdered[j].y * ratioH, 0.0f, static_cast<float>(image.rows - 1));
            }
            box.score = score;
            boxes.push_back(box);
        }
        SortTextBoxes(boxes);
        return boxes;
    }
    catch (const std::exception& e) {
        std::cerr << "[RTOCRDetector] Detect failed: " << e.what() << std::endl;
        return {};
    }
}
// Matches PaddleOCR official GetMiniBoxes: sort by X, assign TL/TR/BL/BR by Y
/// Orders the four corners of a rotated rect as [TL, TR, BR, BL] (clockwise
/// from top-left), mirroring PaddleOCR's official get_mini_boxes: sort by x,
/// then disambiguate each side's pair by y.
std::array<cv::Point2f, 4> RTOCRDetector::GetMiniBoxes(const cv::RotatedRect& rect) {
    std::array<cv::Point2f, 4> pts;
    rect.points(pts.data());
    // Ascending x puts the left pair at indices 0/1 and the right pair at 2/3.
    std::sort(pts.begin(), pts.end(),
              [](const cv::Point2f& lhs, const cv::Point2f& rhs) { return lhs.x < rhs.x; });
    // Within each pair the smaller y is the top corner.
    const int tlIdx = (pts[0].y <= pts[1].y) ? 0 : 1;
    const int trIdx = (pts[2].y <= pts[3].y) ? 2 : 3;
    // The remaining index of each pair is the bottom corner (1-tlIdx, 5-trIdx).
    return { pts[tlIdx], pts[trIdx], pts[5 - trIdx], pts[1 - tlIdx] };
}
// Matches PaddleOCR official box_score_fast: mean prob value inside the 4-point polygon
/// Mean probability inside the 4-point polygon (PaddleOCR box_score_fast):
/// rasterize the quad into a byte mask over its bounding rect, then average
/// the probability map under that mask.
float RTOCRDetector::BoxScoreFast(const cv::Mat& probMap,
                                  const std::array<cv::Point2f, 4>& box) {
    const int h = probMap.rows;
    const int w = probMap.cols;
    // Axis-aligned bounds of the quad.
    float lox = box[0].x, hix = box[0].x;
    float loy = box[0].y, hiy = box[0].y;
    for (int i = 1; i < 4; ++i) {
        lox = std::min(lox, box[i].x);
        hix = std::max(hix, box[i].x);
        loy = std::min(loy, box[i].y);
        hiy = std::max(hiy, box[i].y);
    }
    // Clamp to the map (matches PaddleOCR official).
    const int xmin = std::clamp(static_cast<int>(std::floor(lox)), 0, w - 1);
    const int xmax = std::clamp(static_cast<int>(std::ceil(hix)), 0, w - 1);
    const int ymin = std::clamp(static_cast<int>(std::floor(loy)), 0, h - 1);
    const int ymax = std::clamp(static_cast<int>(std::ceil(hiy)), 0, h - 1);
    if (xmin >= xmax || ymin >= ymax) return 0.0f;
    // Fill the quad (shifted to bounding-rect coordinates) into a mask.
    cv::Mat mask = cv::Mat::zeros(ymax - ymin + 1, xmax - xmin + 1, CV_8UC1);
    std::vector<std::vector<cv::Point>> polys(1);
    polys[0].reserve(4);
    for (const auto& p : box) {
        polys[0].emplace_back(static_cast<int>(p.x) - xmin,
                              static_cast<int>(p.y) - ymin);
    }
    cv::fillPoly(mask, polys, cv::Scalar(1));
    const cv::Rect roi(xmin, ymin, xmax - xmin + 1, ymax - ymin + 1);
    return static_cast<float>(cv::mean(probMap(roi), mask)[0]);
}
// Matches PaddleOCR official unclip: expand 4-point box using Clipper with JT_ROUND
// Uses integer coordinates for Clipper (matching PaddleOCR/ONNX version exactly)
/// Expands a 4-point box by `distance = area * unclipRatio / perimeter` using
/// Clipper's rounded polygon offset — matches PaddleOCR's official unclip.
/// Clipper works on integer coordinates, same as the ONNX version.
std::vector<cv::Point2f> RTOCRDetector::UnclipPolygon(const std::array<cv::Point2f, 4>& box,
                                                      float unclipRatio) {
    // Shoelace area (accumulated doubled, signed) and edge-sum perimeter.
    float twiceSignedArea = 0.0f;
    float perimeter = 0.0f;
    for (int i = 0; i < 4; ++i) {
        const cv::Point2f& a = box[i];
        const cv::Point2f& b = box[(i + 1) % 4];
        twiceSignedArea += a.x * b.y - b.x * a.y;
        const float dx = b.x - a.x;
        const float dy = b.y - a.y;
        perimeter += std::sqrt(dx * dx + dy * dy);
    }
    const float area = std::abs(twiceSignedArea) * 0.5f;
    // Degenerate box: nothing sensible to expand.
    if (perimeter < 1.0f) return {};
    const float distance = area * unclipRatio / perimeter;
    ClipperLib::Path inputPath;
    for (const auto& p : box) {
        inputPath.push_back({ static_cast<ClipperLib::cInt>(p.x),
                              static_cast<ClipperLib::cInt>(p.y) });
    }
    ClipperLib::ClipperOffset offsetter;
    offsetter.AddPath(inputPath, ClipperLib::jtRound, ClipperLib::etClosedPolygon);
    ClipperLib::Paths solution;
    offsetter.Execute(solution, distance);
    if (solution.empty() || solution[0].empty()) return {};
    std::vector<cv::Point2f> expanded;
    expanded.reserve(solution[0].size());
    for (const auto& p : solution[0]) {
        expanded.emplace_back(static_cast<float>(p.X), static_cast<float>(p.Y));
    }
    return expanded;
}
/// Releases the engine. Pooled engines are returned to the shared pool rather
/// than destroyed outright; the local shared_ptr is dropped in either case.
/// Destructors must never throw, so everything is wrapped in a catch-all.
RTOCRDetector::~RTOCRDetector() {
    try {
        if (m_usingSharedPool) {
            EnginePoolManager<float>::instance().release(m_poolKey);
            m_usingSharedPool = false;
        }
        // No-op when the engine pointer is already empty.
        m_engine.reset();
    }
    catch (...) {
        // Swallow: cleanup failures cannot be propagated from a destructor.
    }
}
} // namespace rtocr
} // namespace ANSCENTER