#include "PaddleOCRV5Engine.h"
#include "EPLoader.h"

#include <cstddef>
#include <iostream>
#include <memory>
#include <mutex>
#include <string>
#include <vector>

namespace ANSCENTER {
namespace onnxocr {

// Builds the three OCR sub-models (detector, optional classifier, recognizer),
// loads the recognizer dictionary and warms up the classifier and recognizer.
//
// Parameters:
//   detModelPath   - path handed to the detector constructor.
//   clsModelPath   - path handed to the classifier constructor; when empty the
//                    classifier stage is skipped entirely.
//   recModelPath   - path handed to the recognizer constructor.
//   dictPath       - path handed to the recognizer's LoadDictionary().
//   preferTensorRT - forwarded to the classifier and recognizer options; the
//                    detector never uses TensorRT.
//
// Returns true when every stage succeeded. On any failure (a std::exception
// from construction/warmup, or a dictionary that cannot be loaded) all
// sub-models are released, _initialized is cleared and false is returned.
//
// Holds _mutex for the whole call. ModelLoadingGuard is presumably an RAII
// helper that keeps _modelLoading set while loading — confirm in the header.
bool PaddleOCRV5Engine::Initialize(const std::string& detModelPath,
                                   const std::string& clsModelPath,
                                   const std::string& recModelPath,
                                   const std::string& dictPath,
                                   bool preferTensorRT) {
    std::lock_guard<decltype(_mutex)> lock(_mutex);
    ModelLoadingGuard mlg(_modelLoading);

    // High-perf options. The OCR sub-models split into two groups:
    //
    //  1. Detector — its input shape varies continuously with every
    //     plate-ROI aspect ratio. TRT EP is a poor fit because it
    //     builds a fresh engine for each unique shape (minutes each).
    //     We keep it on CUDA EP and let cuDNN HEURISTIC handle the
    //     per-shape algo selection.
    //
    //  2. Classifier + Recognizer — fixed-bucket shapes (cls is
    //     [1,3,80,160], rec is [1,3,48,{320,480,640,960}]). These
    //     benefit massively from TRT EP because the engine is built
    //     once per shape and reused forever.

    // Detector uses CUDA EP with a *conservative* cuDNN workspace.
    // Empirical: on VRAM-constrained GPUs (LPD TRT engine + rec TRT
    // engine + ORT arena in play) the max-workspace mode causes cuDNN
    // to pick Winograd/implicit-precomp-GEMM variants that silently
    // fall back to slow NO-WORKSPACE algorithms when the big workspace
    // can't be allocated. With the max-workspace mode off, cuDNN picks
    // algorithms that are known to fit and runs ~10x faster in practice.
    OrtHandlerOptions detectorOpts;
    detectorOpts.useMaxCudnnWorkspace = false;
    detectorOpts.preferTensorRT = false;  // never TRT for the detector

    // Classifier (fixed [1,3,80,160]): TRT with no profile is fine.
    OrtHandlerOptions classifierOpts;
    classifierOpts.useMaxCudnnWorkspace = true;
    classifierOpts.preferTensorRT = preferTensorRT;
    classifierOpts.trtFP16 = true;

    // Recognizer: needs a DYNAMIC profile so one TRT engine covers every
    // (batch, bucket_width) pair we generate at runtime. Without this,
    // each new shape triggers a ~80s engine rebuild mid-stream when a
    // new plate appears or the plate count changes.
    //
    // Profile range:
    //   batch : 1 .. 16   (16 plates worth of crops is generous)
    //   H     : 48        (fixed)
    //   W     : 320 .. 960 (covers all 4 recognizer buckets)
    //
    // Query the actual input name from the .onnx file instead of
    // hardcoding — PaddleOCR usually exports it as "x" but the name can
    // vary across model versions.
    OrtHandlerOptions recognizerOpts;
    recognizerOpts.useMaxCudnnWorkspace = true;
    recognizerOpts.preferTensorRT = preferTensorRT;
    recognizerOpts.trtFP16 = true;
    if (preferTensorRT) {
        std::string recInputName = BasicOrtHandler::QueryModelInputName(recModelPath);
        if (recInputName.empty()) {
            std::cerr << "[PaddleOCRV5Engine] Could not query recognizer "
                         "input name — defaulting to 'x'" << std::endl;
            recInputName = "x";
        }
        std::cout << "[PaddleOCRV5Engine] Recognizer input name: '"
                  << recInputName << "' — building TRT dynamic profile "
                  << "[batch=1..16, W=320..960]" << std::endl;
        recognizerOpts.trtProfileMinShapes = recInputName + ":1x3x48x320";
        recognizerOpts.trtProfileOptShapes = recInputName + ":4x3x48x480";
        recognizerOpts.trtProfileMaxShapes = recInputName + ":16x3x48x960";
    }

    // Shared failure path: release every sub-model and mark the engine as
    // unusable, so a failed (re-)initialization can never leave a
    // half-built pipeline behind a stale `_initialized == true`.
    const auto discardPipeline = [this]() {
        detector_.reset();
        classifier_.reset();
        recognizer_.reset();
        _initialized = false;
    };

    // Concrete sub-model types are taken from the member declarations.
    using DetectorT = decltype(detector_)::element_type;
    using ClassifierT = decltype(classifier_)::element_type;
    using RecognizerT = decltype(recognizer_)::element_type;

    try {
        // Initialize detector (also triggers EPLoader init in BasicOrtHandler)
        detector_ = std::make_unique<DetectorT>(detModelPath, detectorOpts);
        std::cout << "[PaddleOCRV5Engine] Detector initialized: " << detModelPath << std::endl;

        // Ensure this DLL's copy of Ort::Global<void>::api_ is initialized.
        // BasicOrtHandler sets it in ONNXEngine.dll, but each DLL has its own
        // inline-static copy. Without this, inference calls from ANSOCR.dll crash.
        if (Ort::Global<void>::api_ == nullptr) {
            Ort::InitApi(static_cast<const OrtApi*>(EPLoader::GetOrtApiRaw()));
        }

        // Initialize classifier (optional)
        if (!clsModelPath.empty()) {
            classifier_ = std::make_unique<ClassifierT>(clsModelPath, classifierOpts);
            std::cout << "[PaddleOCRV5Engine] Classifier initialized: " << clsModelPath << std::endl;
        } else {
            classifier_.reset();
            std::cout << "[PaddleOCRV5Engine] Classifier skipped (no model path)" << std::endl;
        }

        // Initialize recognizer
        recognizer_ = std::make_unique<RecognizerT>(recModelPath, recognizerOpts);
        if (!recognizer_->LoadDictionary(dictPath)) {
            std::cerr << "[PaddleOCRV5Engine] Failed to load dictionary" << std::endl;
            // Same cleanup as the exception path: without it the detector,
            // classifier and a dictionary-less recognizer would stay alive.
            discardPipeline();
            return false;
        }
        std::cout << "[PaddleOCRV5Engine] Recognizer initialized: " << recModelPath << std::endl;

        // Pre-warm classifier (fixed [1,3,80,160]) and recognizer (4
        // bucket widths) so the first frame doesn't pay the cuDNN/TRT
        // algorithm-selection tax. The detector is intentionally NOT
        // warmed up: its input shape varies continuously with each
        // plate-ROI aspect ratio, so a warmup at any single canonical
        // shape would be useless (CUDA cache miss on the real frame
        // anyway). Real frames will pay the per-shape cuDNN HEURISTIC
        // cost on first use.
        std::cout << "[PaddleOCRV5Engine] Warming up OCR pipeline..." << std::endl;
        if (classifier_) classifier_->Warmup();
        if (recognizer_) recognizer_->Warmup();
        std::cout << "[PaddleOCRV5Engine] Warmup complete" << std::endl;

        _initialized = true;
        std::cout << "[PaddleOCRV5Engine] Pipeline initialized successfully" << std::endl;
        return true;
    }
    catch (const std::exception& e) {
        std::cerr << "[PaddleOCRV5Engine] Initialization failed: " << e.what() << std::endl;
        discardPipeline();
        return false;
    }
}

// Runs the full pipeline on one image: detect text boxes, crop them,
// optionally classify/rotate the crops, recognize the text and merge
// everything into one OCRPredictResult per successfully cropped box.
//
// Returns an empty vector when a model load is in progress, the lock could
// not be acquired, the engine is not initialized, the image is empty, or
// nothing was detected / could be cropped.
std::vector<OCRPredictResult> PaddleOCRV5Engine::ocr(const cv::Mat& img) {
    if (_modelLoading.load()) return {};

    std::vector<OCRPredictResult> results;
    {
        auto lk = TryLockWithTimeout("PaddleOCRV5Engine::ocr");
        if (!lk.owns_lock()) return results;
        if (!_initialized || img.empty()) return results;
        // Mirror the pointer checks done by recognizeOnly()/recognizeMany().
        if (!detector_ || !recognizer_) return results;
    }  // _mutex released — heavy pipeline runs lock-free

    // Step 1: Text Detection
    auto boxes = detector_->Detect(img, _maxSideLen, _detDbThresh, _detBoxThresh,
                                   _detUnclipRatio, _useDilation);
    if (boxes.empty()) {
        return results;
    }

    // Step 2: Crop detected text regions.
    // Empty crops are dropped, so remember which box each kept crop came
    // from; every later per-crop vector is indexed by crop, not by box.
    std::vector<cv::Mat> croppedImages;
    std::vector<std::size_t> cropToBox;
    croppedImages.reserve(boxes.size());
    cropToBox.reserve(boxes.size());
    for (std::size_t b = 0; b < boxes.size(); ++b) {
        cv::Mat cropped = GetRotateCropImage(img, boxes[b]);
        if (cropped.empty()) {
            continue;
        }
        croppedImages.push_back(cropped);
        cropToBox.push_back(b);
    }
    if (croppedImages.empty()) {
        return results;  // nothing usable to classify or recognize
    }

    // Step 3: Classification (optional)
    std::vector<int> cls_labels(croppedImages.size(), 0);
    std::vector<float> cls_scores(croppedImages.size(), 0.0f);
    if (classifier_) {
        classifier_->Classify(croppedImages, cls_labels, cls_scores, _clsThresh);

        // Rotate crops with an odd label whose score exceeds the threshold
        // (i.e. classified as upside-down).
        for (std::size_t i = 0; i < croppedImages.size(); i++) {
            if (cls_labels[i] % 2 == 1 && cls_scores[i] > _clsThresh) {
                cv::rotate(croppedImages[i], croppedImages[i], cv::ROTATE_180);
            }
        }
    }

    // Step 4: Text Recognition
    auto textLines = recognizer_->RecognizeBatch(croppedImages);

    // Step 5: Combine results — crop i belongs to boxes[cropToBox[i]].
    using Coord = decltype(OCRPredictResult::box)::value_type::value_type;
    results.reserve(croppedImages.size());
    for (std::size_t i = 0; i < croppedImages.size() && i < textLines.size(); i++) {
        const auto& srcBox = boxes[cropToBox[i]];

        OCRPredictResult result;
        // Convert TextBox points to box format [[x0,y0], [x1,y1], [x2,y2], [x3,y3]]
        result.box.resize(4);
        for (int j = 0; j < 4; j++) {
            result.box[j] = {
                static_cast<Coord>(srcBox.points[j].x),
                static_cast<Coord>(srcBox.points[j].y)
            };
        }
        result.text = textLines[i].text;
        result.score = textLines[i].score;
        result.cls_label = cls_labels[i];
        result.cls_score = cls_scores[i];
        results.push_back(result);
    }

    return results;
}

// Recognizes the text in a single, already-cropped image.
// Returns { "", 0.0f } when a model load is in progress, the lock could not
// be acquired, the engine/recognizer is not ready, or the image is empty.
TextLine PaddleOCRV5Engine::recognizeOnly(const cv::Mat& croppedImage) {
    if (_modelLoading.load()) return { "", 0.0f };
    {
        auto lk = TryLockWithTimeout("PaddleOCRV5Engine::recognizeOnly");
        if (!lk.owns_lock()) return { "", 0.0f };
        if (!_initialized || !recognizer_ || croppedImage.empty()) return { "", 0.0f };
    }
    return recognizer_->Recognize(croppedImage);
}

// Recognizes the text in a batch of already-cropped images.
// When the engine cannot serve the request (model loading, lock not
// acquired, not initialized, no recognizer, empty input) it returns a vector
// of default-constructed TextLine entries, one per input image.
std::vector<TextLine> PaddleOCRV5Engine::recognizeMany(const std::vector<cv::Mat>& croppedImages) {
    if (_modelLoading.load()) return std::vector<TextLine>(croppedImages.size());
    {
        auto lk = TryLockWithTimeout("PaddleOCRV5Engine::recognizeMany");
        if (!lk.owns_lock()) return std::vector<TextLine>(croppedImages.size());
        if (!_initialized || !recognizer_ || croppedImages.empty()) {
            return std::vector<TextLine>(croppedImages.size());
        }
    }
    // Delegates to the bucketed, batched path in ONNXOCRRecognizer.
    return recognizer_->RecognizeBatch(croppedImages);
}

} // namespace onnxocr
} // namespace ANSCENTER