// ANSCORE/modules/ANSOCR/ANSONNXOCR/PaddleOCRV5Engine.cpp

#include "PaddleOCRV5Engine.h"
#include "EPLoader.h"

#include <opencv2/imgproc.hpp>
#include <iostream>
#include <algorithm>

namespace ANSCENTER {
namespace onnxocr {

// ============================================================================
//  Per-backend OCR option builders
//
//  Each backend (NVIDIA / AMD / Intel / CPU) has its own helper that returns
//  a fully-populated set of OrtHandlerOptions for the detector, classifier,
//  and recognizer sub-models. PaddleOCRV5Engine::Initialize dispatches to the
//  correct helper based on the engine type that EPLoader resolved at startup.
//
//  Adding a new backend optimization is a strictly contained change: touch
//  only that backend's builder. The others — especially NVIDIA, which is
//  hand-tuned and should not regress — stay untouched.
// ============================================================================

namespace {

// Bundle of the three option sets returned by the per-backend builders in
// this file. PaddleOCRV5Engine::Initialize passes each member to the
// constructor of the matching OCR sub-model.
struct PerModelOcrOptions {
    OrtHandlerOptions detectorOpts;    // passed to ONNXOCRDetector
    OrtHandlerOptions classifierOpts;  // passed to ONNXOCRClassifier (only when a classifier model path is given)
    OrtHandlerOptions recognizerOpts;  // passed to ONNXOCRRecognizer
};

// ----------------------------------------------------------------------------
//  NVIDIA — LOCKED. Do NOT modify this helper unless fixing a specific
//  NVIDIA-observable regression.
//
//  The OCR sub-models split into two groups:
//    1. Detector — variable input shape per plate-ROI aspect. TRT EP is a
//       poor fit (one engine build per unique shape, minutes each). Runs on
//       CUDA EP with *conservative* cuDNN workspace: empirical measurements
//       showed that max-workspace mode forces cuDNN to pick Winograd/
//       implicit-precomp-GEMM variants that silently fall back to slow
//       NO-WORKSPACE algorithms when the big workspace can't be allocated
//       under VRAM pressure (LPD TRT engine + rec TRT engine + ORT arena).
//    2. Classifier + Recognizer — TRT EP. Classifier has fixed shape so no
//       profile is needed. Recognizer gets a dynamic profile
//       [batch=1..16, W=320..960] so a single pre-built engine handles every
//       runtime shape without mid-stream rebuilds (fixes 60–90 s hangs).
// ----------------------------------------------------------------------------
// Builds the NVIDIA option set for the three OCR sub-models.
//
//   recModelPath   - recognizer model path; only consulted (to look up the
//                    model's input name) when preferTensorRT is true.
//   preferTensorRT - forwarded to the classifier and recognizer options and
//                    gates creation of the recognizer's TRT shape profile.
//                    The detector never gets preferTensorRT.
static PerModelOcrOptions BuildNvidiaOcrOptions(
        const std::string& recModelPath,
        bool preferTensorRT) {
    PerModelOcrOptions result;

    // Detector: conservative cuDNN workspace, TensorRT explicitly off.
    OrtHandlerOptions& det = result.detectorOpts;
    det.useMaxCudnnWorkspace = false;
    det.preferTensorRT       = false;

    // Classifier: max workspace, FP16, TensorRT as requested. No shape
    // profile (fixed [1,3,80,160] input).
    OrtHandlerOptions& cls = result.classifierOpts;
    cls.useMaxCudnnWorkspace = true;
    cls.preferTensorRT       = preferTensorRT;
    cls.trtFP16              = true;

    // Recognizer: same base settings as the classifier; the dynamic shape
    // profile is attached further down when TensorRT is requested.
    OrtHandlerOptions& rec = result.recognizerOpts;
    rec.useMaxCudnnWorkspace = true;
    rec.preferTensorRT       = preferTensorRT;
    rec.trtFP16              = true;

    // Without TensorRT there is no profile to describe.
    if (!preferTensorRT) {
        return result;
    }

    // Profile strings are keyed by the model's input tensor name; fall back
    // to "x" when the name cannot be queried.
    std::string inputName = BasicOrtHandler::QueryModelInputName(recModelPath);
    if (inputName.empty()) {
        std::cerr << "[PaddleOCRV5Engine] Could not query recognizer "
                     "input name — defaulting to 'x'" << std::endl;
        inputName = "x";
    }

    // The profile's max batch uses kRecMaxBatch — the same constant that
    // ONNXOCRRecognizer::RecognizeBatch uses to chunk oversized bucket
    // groups — so submitted batches stay inside the profile.
    const std::string batchCeiling = std::to_string(kRecMaxBatch);
    std::cout << "[PaddleOCRV5Engine] Recognizer input name: '"
              << inputName << "' — building TRT dynamic profile "
              << "[batch=1.." << batchCeiling << ", W=320..960]" << std::endl;

    rec.trtProfileMinShapes = inputName + ":1x3x48x320";
    rec.trtProfileOptShapes = inputName + ":4x3x48x480";
    rec.trtProfileMaxShapes = inputName + ":" + batchCeiling + "x3x48x960";
    return result;
}

// ----------------------------------------------------------------------------
//  Intel (OpenVINO EP) — placeholder.
//
//  Returns default-constructed options: no backend-specific tuning applied
//  yet. When adding Intel optimizations (OpenVINO cache_dir, explicit device
//  selection, INT8 paths, etc.), add the corresponding fields to the Intel
//  section of OrtHandlerOptions and populate them here.
// ----------------------------------------------------------------------------
// Intel placeholder: hands back a value-initialized option bundle, i.e. no
// backend-specific tuning for any of the three sub-models.
static PerModelOcrOptions BuildIntelOcrOptions() {
    PerModelOcrOptions untuned{};
    return untuned;
}

// ----------------------------------------------------------------------------
//  AMD (DirectML EP / MIGraphX EP) — placeholder.
//
//  Returns default-constructed options: no backend-specific tuning applied
//  yet. When adding AMD optimizations (graph opt gate for RDNA3+ desktop
//  cards, MIGraphX cache on Linux, etc.), add the corresponding fields to
//  the AMD section of OrtHandlerOptions and populate them here.
// ----------------------------------------------------------------------------
// AMD placeholder: hands back a value-initialized option bundle, i.e. no
// backend-specific tuning for any of the three sub-models.
static PerModelOcrOptions BuildAmdOcrOptions() {
    PerModelOcrOptions untuned{};
    return untuned;
}

// ----------------------------------------------------------------------------
//  CPU / unknown hardware — no tuning.
// ----------------------------------------------------------------------------
// CPU / unrecognized backend: value-initialized option bundle, no tuning.
static PerModelOcrOptions BuildDefaultOcrOptions() {
    PerModelOcrOptions untuned{};
    return untuned;
}

// Dispatch entry point used by Initialize().
// Selects the option builder matching the engine type reported by
// EPLoader::Current(). Only the NVIDIA builder consumes the arguments; any
// engine type without a dedicated builder (including CPU) gets the defaults.
static PerModelOcrOptions BuildOcrOptionsForBackend(
        const std::string& recModelPath,
        bool preferTensorRT) {
    const EngineType resolved = EPLoader::Current().type;

    if (resolved == EngineType::NVIDIA_GPU) {
        return BuildNvidiaOcrOptions(recModelPath, preferTensorRT);
    }
    if (resolved == EngineType::AMD_GPU) {
        return BuildAmdOcrOptions();
    }
    if (resolved == EngineType::OPENVINO_GPU) {
        return BuildIntelOcrOptions();
    }
    // EngineType::CPU and anything else.
    return BuildDefaultOcrOptions();
}

} // namespace (anonymous)

bool PaddleOCRV5Engine::Initialize(const std::string& detModelPath,
                                    const std::string& clsModelPath,
                                    const std::string& recModelPath,
                                    const std::string& dictPath,
                                    bool preferTensorRT) {
    std::lock_guard<std::recursive_mutex> lock(_mutex);
    ModelLoadingGuard mlg(_modelLoading);

    // Dispatch to the correct per-backend option builder. The NVIDIA path
    // is fully locked-in; AMD/Intel/CPU paths currently return defaults
    // and are the place to add future backend-specific tuning.
    const PerModelOcrOptions opts =
        BuildOcrOptionsForBackend(recModelPath, preferTensorRT);
    const OrtHandlerOptions& detectorOpts   = opts.detectorOpts;
    const OrtHandlerOptions& classifierOpts = opts.classifierOpts;
    const OrtHandlerOptions& recognizerOpts = opts.recognizerOpts;

    try {
        // Initialize detector (also triggers EPLoader init in BasicOrtHandler)
        detector_ = std::make_unique<ONNXOCRDetector>(detModelPath, detectorOpts);
        std::cout << "[PaddleOCRV5Engine] Detector initialized: " << detModelPath << std::endl;

        // Ensure this DLL's copy of Ort::Global<void>::api_ is initialized.
        // BasicOrtHandler sets it in ONNXEngine.dll, but each DLL has its own
        // inline-static copy. Without this, inference calls from ANSOCR.dll crash.
        if (Ort::Global<void>::api_ == nullptr) {
            Ort::InitApi(static_cast<const OrtApi*>(EPLoader::GetOrtApiRaw()));
        }

        // Initialize classifier (optional)
        if (!clsModelPath.empty()) {
            classifier_ = std::make_unique<ONNXOCRClassifier>(clsModelPath, classifierOpts);
            std::cout << "[PaddleOCRV5Engine] Classifier initialized: " << clsModelPath << std::endl;
        }
        else {
            classifier_.reset();
            std::cout << "[PaddleOCRV5Engine] Classifier skipped (no model path)" << std::endl;
        }

        // Initialize recognizer
        recognizer_ = std::make_unique<ONNXOCRRecognizer>(recModelPath, recognizerOpts);
        if (!recognizer_->LoadDictionary(dictPath)) {
            std::cerr << "[PaddleOCRV5Engine] Failed to load dictionary" << std::endl;
            return false;
        }
        std::cout << "[PaddleOCRV5Engine] Recognizer initialized: " << recModelPath << std::endl;

        // Pre-warm classifier (fixed [1,3,80,160]) and recognizer (4
        // bucket widths) so the first frame doesn't pay the cuDNN/TRT
        // algorithm-selection tax. The detector is intentionally NOT
        // warmed up: its input shape varies continuously with each
        // plate-ROI aspect ratio, so a warmup at any single canonical
        // shape would cost minutes (TRT) or be useless (CUDA cache miss
        // on the real frame anyway). Real frames will pay the per-shape
        // cuDNN HEURISTIC cost on first use.
        std::cout << "[PaddleOCRV5Engine] Warming up OCR pipeline..." << std::endl;
        if (classifier_) classifier_->Warmup();
        if (recognizer_) recognizer_->Warmup();
        std::cout << "[PaddleOCRV5Engine] Warmup complete" << std::endl;

        _initialized = true;
        std::cout << "[PaddleOCRV5Engine] Pipeline initialized successfully" << std::endl;
        return true;
    }
    catch (const std::exception& e) {
        std::cerr << "[PaddleOCRV5Engine] Initialization failed: " << e.what() << std::endl;
        detector_.reset();
        classifier_.reset();
        recognizer_.reset();
        _initialized = false;
        return false;
    }
}

// Runs the full OCR pipeline on one image: detect text boxes, crop each box,
// optionally classify/rotate the crops, recognize text, and assemble one
// OCRPredictResult per successfully cropped box.
//
// Returns an empty vector when a model load is in progress, the lock cannot
// be acquired, the engine is not initialized, the image is empty, no boxes
// are detected, or no box yields a non-empty crop.
//
// _mutex is only held for the readiness check; the pipeline itself runs
// without the lock.
std::vector<OCRPredictResult> PaddleOCRV5Engine::ocr(const cv::Mat& img) {
    if (_modelLoading.load()) return {};

    std::vector<OCRPredictResult> results;

    {
        auto lk = TryLockWithTimeout("PaddleOCRV5Engine::ocr");
        if (!lk.owns_lock()) return results;
        // detector_/recognizer_ are dereferenced unconditionally below, so
        // check them here the same way recognizeOnly/recognizeMany do.
        if (!_initialized || !detector_ || !recognizer_ || img.empty()) return results;
    }
    // _mutex released — heavy pipeline runs lock-free

    // Step 1: Text Detection
    auto boxes = detector_->Detect(img, _maxSideLen, _detDbThresh, _detBoxThresh, _detUnclipRatio, _useDilation);

    if (boxes.empty()) {
        return results;
    }

    // Step 2: Crop detected text regions. Empty crops are dropped, so the
    // crop list can be shorter than the box list; cropBoxIndex[k] remembers
    // which box produced croppedImages[k] so results are paired with the
    // right geometry later on.
    std::vector<cv::Mat> croppedImages;
    std::vector<size_t>  cropBoxIndex;
    croppedImages.reserve(boxes.size());
    cropBoxIndex.reserve(boxes.size());
    for (size_t b = 0; b < boxes.size(); ++b) {
        cv::Mat cropped = GetRotateCropImage(img, boxes[b]);
        if (!cropped.empty()) {
            croppedImages.push_back(cropped);
            cropBoxIndex.push_back(b);
        }
    }

    // Nothing usable to classify or recognize.
    if (croppedImages.empty()) {
        return results;
    }

    // Step 3: Classification (optional)
    std::vector<int> cls_labels(croppedImages.size(), 0);
    std::vector<float> cls_scores(croppedImages.size(), 0.0f);

    if (classifier_) {
        classifier_->Classify(croppedImages, cls_labels, cls_scores, _clsThresh);

        // Rotate crops with an odd label whose score exceeds the threshold.
        // Bounds also respect the label/score vectors in case Classify
        // resized them.
        for (size_t i = 0;
             i < croppedImages.size() && i < cls_labels.size() && i < cls_scores.size();
             i++) {
            if (cls_labels[i] % 2 == 1 && cls_scores[i] > _clsThresh) {
                cv::rotate(croppedImages[i], croppedImages[i], cv::ROTATE_180);
            }
        }
    }

    // Step 4: Text Recognition
    auto textLines = recognizer_->RecognizeBatch(croppedImages);

    // Step 5: Combine results. Iterate over crops (not boxes): textLines and
    // the cls_* vectors are indexed by crop, and the originating box is
    // looked up through cropBoxIndex.
    const size_t count = std::min(textLines.size(), croppedImages.size());
    results.reserve(count);
    for (size_t i = 0; i < count; i++) {
        const auto& srcBox = boxes[cropBoxIndex[i]];
        OCRPredictResult result;

        // Convert TextBox points to box format [[x0,y0], [x1,y1], [x2,y2], [x3,y3]]
        result.box.resize(4);
        for (int j = 0; j < 4; j++) {
            result.box[j] = {
                static_cast<int>(srcBox.points[j].x),
                static_cast<int>(srcBox.points[j].y)
            };
        }

        result.text      = textLines[i].text;
        result.score     = textLines[i].score;
        result.cls_label = (i < cls_labels.size()) ? cls_labels[i] : 0;
        result.cls_score = (i < cls_scores.size()) ? cls_scores[i] : 0.0f;

        results.push_back(result);
    }

    return results;
}

// Recognizes text in a single, already-cropped image.
//
// Yields an empty TextLine ("" / 0.0f) when a model load is in progress, the
// lock cannot be acquired, the engine is not initialized, there is no
// recognizer, or the image is empty. The lock only covers the readiness
// check; recognition itself runs after it has been released.
TextLine PaddleOCRV5Engine::recognizeOnly(const cv::Mat& croppedImage) {
    bool ready = false;
    if (!_modelLoading.load()) {
        auto guard = TryLockWithTimeout("PaddleOCRV5Engine::recognizeOnly");
        ready = guard.owns_lock()
             && _initialized
             && recognizer_
             && !croppedImage.empty();
    }

    if (!ready) {
        return { "", 0.0f };
    }
    return recognizer_->Recognize(croppedImage);
}

// Recognizes text in a set of already-cropped images via the recognizer's
// batch entry point.
//
// When the engine cannot serve the request (model load in progress, lock not
// acquired, not initialized, no recognizer, or empty input) the result is a
// vector of default-constructed TextLine objects, one per input image. The
// lock only covers the readiness check; recognition runs after its release.
std::vector<TextLine> PaddleOCRV5Engine::recognizeMany(const std::vector<cv::Mat>& croppedImages) {
    bool ready = false;
    if (!_modelLoading.load()) {
        auto guard = TryLockWithTimeout("PaddleOCRV5Engine::recognizeMany");
        ready = guard.owns_lock()
             && _initialized
             && recognizer_
             && !croppedImages.empty();
    }

    if (!ready) {
        return std::vector<TextLine>(croppedImages.size());
    }
    // Delegates to the bucketed, batched path in ONNXOCRRecognizer.
    return recognizer_->RecognizeBatch(croppedImages);
}

} // namespace onnxocr
} // namespace ANSCENTER