#include "PaddleOCRV5Engine.h"
#include "EPLoader.h"

// NOTE(review): the three system includes that followed had lost their header
// names. The list below covers the standard-library facilities this file uses
// directly (std::exception, std::cout/cerr, std::make_unique, std::lock_guard,
// std::string/std::to_string, std::vector) — confirm against the original.
#include <exception>
#include <iostream>
#include <memory>
#include <mutex>
#include <string>
#include <vector>

namespace ANSCENTER {
namespace onnxocr {

// ============================================================================
// Per-backend OCR option builders
//
// Each backend (NVIDIA / AMD / Intel / CPU) has its own helper that returns
// a fully-populated set of OrtHandlerOptions for the detector, classifier,
// and recognizer sub-models. PaddleOCRV5Engine::Initialize dispatches to the
// correct helper based on the engine type that EPLoader resolved at startup.
//
// Adding a new backend optimization is a strictly contained change: touch
// only that backend's builder. The others — especially NVIDIA, which is
// hand-tuned and should not regress — stay untouched.
// ============================================================================

namespace {

// One OrtHandlerOptions instance per OCR sub-model. Each per-backend builder
// returns this bundle and Initialize() hands the matching member to the
// corresponding sub-model constructor.
struct PerModelOcrOptions {
    OrtHandlerOptions detectorOpts;    // options for the text detector
    OrtHandlerOptions classifierOpts;  // options for the orientation classifier
    OrtHandlerOptions recognizerOpts;  // options for the text recognizer
};

// ----------------------------------------------------------------------------
// NVIDIA — LOCKED. Do NOT modify this helper unless fixing a specific
// NVIDIA-observable regression.
//
// The OCR sub-models split into two groups:
//   1. Detector — variable input shape per plate-ROI aspect. TRT EP is a
//      poor fit (one engine build per unique shape, minutes each). Runs on
//      CUDA EP with *conservative* cuDNN workspace: empirical measurements
//      showed that max-workspace mode forces cuDNN to pick Winograd/
//      implicit-precomp-GEMM variants that silently fall back to slow
//      NO-WORKSPACE algorithms when the big workspace can't be allocated
//      under VRAM pressure (LPD TRT engine + rec TRT engine + ORT arena).
//   2. Classifier + Recognizer — TRT EP. Classifier has fixed shape so no
//      profile is needed. Recognizer gets a dynamic profile
//      [batch=1..16, W=320..960] so a single pre-built engine handles every
//      runtime shape without mid-stream rebuilds (fixes 60–90 s hangs).
// ---------------------------------------------------------------------------- static PerModelOcrOptions BuildNvidiaOcrOptions( const std::string& recModelPath, bool preferTensorRT) { PerModelOcrOptions opts; // Detector: CUDA EP, conservative workspace, never TRT. opts.detectorOpts.useMaxCudnnWorkspace = false; opts.detectorOpts.preferTensorRT = false; // Classifier: TRT EP, no profile (fixed [1,3,80,160]). opts.classifierOpts.useMaxCudnnWorkspace = true; opts.classifierOpts.preferTensorRT = preferTensorRT; opts.classifierOpts.trtFP16 = true; // Recognizer: TRT EP with dynamic shape profile. The max-batch // dimension is kRecMaxBatch (defined in ONNXOCRTypes.h) — the same // constant that ONNXOCRRecognizer::RecognizeBatch uses to chunk // oversized bucket groups. Keeping them in lockstep ensures the // recognizer never submits a shape that falls outside the TRT profile. opts.recognizerOpts.useMaxCudnnWorkspace = true; opts.recognizerOpts.preferTensorRT = preferTensorRT; opts.recognizerOpts.trtFP16 = true; if (preferTensorRT) { std::string recInputName = BasicOrtHandler::QueryModelInputName(recModelPath); if (recInputName.empty()) { std::cerr << "[PaddleOCRV5Engine] Could not query recognizer " "input name — defaulting to 'x'" << std::endl; recInputName = "x"; } const std::string maxB = std::to_string(kRecMaxBatch); std::cout << "[PaddleOCRV5Engine] Recognizer input name: '" << recInputName << "' — building TRT dynamic profile " << "[batch=1.." << maxB << ", W=320..960]" << std::endl; opts.recognizerOpts.trtProfileMinShapes = recInputName + ":1x3x48x320"; opts.recognizerOpts.trtProfileOptShapes = recInputName + ":4x3x48x480"; opts.recognizerOpts.trtProfileMaxShapes = recInputName + ":" + maxB + "x3x48x960"; } return opts; } // ---------------------------------------------------------------------------- // Intel (OpenVINO EP) — placeholder. // // Returns default-constructed options: no backend-specific tuning applied // yet. 
When adding Intel optimizations (OpenVINO cache_dir, explicit device // selection, INT8 paths, etc.), add the corresponding fields to the Intel // section of OrtHandlerOptions and populate them here. // ---------------------------------------------------------------------------- static PerModelOcrOptions BuildIntelOcrOptions() { return PerModelOcrOptions{}; // defaults everywhere } // ---------------------------------------------------------------------------- // AMD (DirectML EP / MIGraphX EP) — placeholder. // // Returns default-constructed options: no backend-specific tuning applied // yet. When adding AMD optimizations (graph opt gate for RDNA3+ desktop // cards, MIGraphX cache on Linux, etc.), add the corresponding fields to // the AMD section of OrtHandlerOptions and populate them here. // ---------------------------------------------------------------------------- static PerModelOcrOptions BuildAmdOcrOptions() { return PerModelOcrOptions{}; // defaults everywhere } // ---------------------------------------------------------------------------- // CPU / unknown hardware — no tuning. // ---------------------------------------------------------------------------- static PerModelOcrOptions BuildDefaultOcrOptions() { return PerModelOcrOptions{}; // defaults everywhere } // Dispatch entry point used by Initialize(). 
static PerModelOcrOptions BuildOcrOptionsForBackend( const std::string& recModelPath, bool preferTensorRT) { const EngineType backend = EPLoader::Current().type; switch (backend) { case EngineType::NVIDIA_GPU: return BuildNvidiaOcrOptions(recModelPath, preferTensorRT); case EngineType::AMD_GPU: return BuildAmdOcrOptions(); case EngineType::OPENVINO_GPU: return BuildIntelOcrOptions(); case EngineType::CPU: default: return BuildDefaultOcrOptions(); } } } // namespace (anonymous) bool PaddleOCRV5Engine::Initialize(const std::string& detModelPath, const std::string& clsModelPath, const std::string& recModelPath, const std::string& dictPath, bool preferTensorRT) { std::lock_guard lock(_mutex); ModelLoadingGuard mlg(_modelLoading); // Dispatch to the correct per-backend option builder. The NVIDIA path // is fully locked-in; AMD/Intel/CPU paths currently return defaults // and are the place to add future backend-specific tuning. const PerModelOcrOptions opts = BuildOcrOptionsForBackend(recModelPath, preferTensorRT); const OrtHandlerOptions& detectorOpts = opts.detectorOpts; const OrtHandlerOptions& classifierOpts = opts.classifierOpts; const OrtHandlerOptions& recognizerOpts = opts.recognizerOpts; try { // Initialize detector (also triggers EPLoader init in BasicOrtHandler) detector_ = std::make_unique(detModelPath, detectorOpts); std::cout << "[PaddleOCRV5Engine] Detector initialized: " << detModelPath << std::endl; // Ensure this DLL's copy of Ort::Global::api_ is initialized. // BasicOrtHandler sets it in ONNXEngine.dll, but each DLL has its own // inline-static copy. Without this, inference calls from ANSOCR.dll crash. 
if (Ort::Global::api_ == nullptr) { Ort::InitApi(static_cast(EPLoader::GetOrtApiRaw())); } // Initialize classifier (optional) if (!clsModelPath.empty()) { classifier_ = std::make_unique(clsModelPath, classifierOpts); std::cout << "[PaddleOCRV5Engine] Classifier initialized: " << clsModelPath << std::endl; } else { classifier_.reset(); std::cout << "[PaddleOCRV5Engine] Classifier skipped (no model path)" << std::endl; } // Initialize recognizer recognizer_ = std::make_unique(recModelPath, recognizerOpts); if (!recognizer_->LoadDictionary(dictPath)) { std::cerr << "[PaddleOCRV5Engine] Failed to load dictionary" << std::endl; return false; } std::cout << "[PaddleOCRV5Engine] Recognizer initialized: " << recModelPath << std::endl; // Pre-warm classifier (fixed [1,3,80,160]) and recognizer (4 // bucket widths) so the first frame doesn't pay the cuDNN/TRT // algorithm-selection tax. The detector is intentionally NOT // warmed up: its input shape varies continuously with each // plate-ROI aspect ratio, so a warmup at any single canonical // shape would cost minutes (TRT) or be useless (CUDA cache miss // on the real frame anyway). Real frames will pay the per-shape // cuDNN HEURISTIC cost on first use. std::cout << "[PaddleOCRV5Engine] Warming up OCR pipeline..." 
<< std::endl; if (classifier_) classifier_->Warmup(); if (recognizer_) recognizer_->Warmup(); std::cout << "[PaddleOCRV5Engine] Warmup complete" << std::endl; _initialized = true; std::cout << "[PaddleOCRV5Engine] Pipeline initialized successfully" << std::endl; return true; } catch (const std::exception& e) { std::cerr << "[PaddleOCRV5Engine] Initialization failed: " << e.what() << std::endl; detector_.reset(); classifier_.reset(); recognizer_.reset(); _initialized = false; return false; } } std::vector PaddleOCRV5Engine::ocr(const cv::Mat& img) { if (_modelLoading.load()) return {}; std::vector results; { auto lk = TryLockWithTimeout("PaddleOCRV5Engine::ocr"); if (!lk.owns_lock()) return results; if (!_initialized || img.empty()) return results; } // _mutex released — heavy pipeline runs lock-free // Step 1: Text Detection auto boxes = detector_->Detect(img, _maxSideLen, _detDbThresh, _detBoxThresh, _detUnclipRatio, _useDilation); if (boxes.empty()) { return results; } // Step 2: Crop detected text regions std::vector croppedImages; croppedImages.reserve(boxes.size()); for (auto& box : boxes) { cv::Mat cropped = GetRotateCropImage(img, box); if (!cropped.empty()) { croppedImages.push_back(cropped); } } // Step 3: Classification (optional) std::vector cls_labels(croppedImages.size(), 0); std::vector cls_scores(croppedImages.size(), 0.0f); if (classifier_) { classifier_->Classify(croppedImages, cls_labels, cls_scores, _clsThresh); // Rotate images classified as upside-down (label=1 and score > threshold) for (size_t i = 0; i < croppedImages.size(); i++) { if (cls_labels[i] % 2 == 1 && cls_scores[i] > _clsThresh) { cv::rotate(croppedImages[i], croppedImages[i], cv::ROTATE_180); } } } // Step 4: Text Recognition auto textLines = recognizer_->RecognizeBatch(croppedImages); // Step 5: Combine results for (size_t i = 0; i < boxes.size() && i < textLines.size(); i++) { OCRPredictResult result; // Convert TextBox points to box format [[x0,y0], [x1,y1], [x2,y2], [x3,y3]] 
result.box.resize(4); for (int j = 0; j < 4; j++) { result.box[j] = { static_cast(boxes[i].points[j].x), static_cast(boxes[i].points[j].y) }; } result.text = textLines[i].text; result.score = textLines[i].score; result.cls_label = cls_labels[i]; result.cls_score = cls_scores[i]; results.push_back(result); } return results; } TextLine PaddleOCRV5Engine::recognizeOnly(const cv::Mat& croppedImage) { if (_modelLoading.load()) return { "", 0.0f }; { auto lk = TryLockWithTimeout("PaddleOCRV5Engine::recognizeOnly"); if (!lk.owns_lock()) return { "", 0.0f }; if (!_initialized || !recognizer_ || croppedImage.empty()) return { "", 0.0f }; } return recognizer_->Recognize(croppedImage); } std::vector PaddleOCRV5Engine::recognizeMany(const std::vector& croppedImages) { if (_modelLoading.load()) return std::vector(croppedImages.size()); { auto lk = TryLockWithTimeout("PaddleOCRV5Engine::recognizeMany"); if (!lk.owns_lock()) return std::vector(croppedImages.size()); if (!_initialized || !recognizer_ || croppedImages.empty()) { return std::vector(croppedImages.size()); } } // Delegates to the bucketed, batched path in ONNXOCRRecognizer. return recognizer_->RecognizeBatch(croppedImages); } } // namespace onnxocr } // namespace ANSCENTER