Improve ALPR_OCR performance

2026-04-14 20:30:21 +10:00
parent 3349b45ade
commit f9a0af8949
18 changed files with 991 additions and 77 deletions


@@ -11,13 +11,75 @@ namespace onnxocr {
bool PaddleOCRV5Engine::Initialize(const std::string& detModelPath,
const std::string& clsModelPath,
const std::string& recModelPath,
-const std::string& dictPath) {
+const std::string& dictPath,
+bool preferTensorRT) {
std::lock_guard<std::recursive_mutex> lock(_mutex);
ModelLoadingGuard mlg(_modelLoading);
// High-perf options. The OCR sub-models split into two groups:
//
// 1. Detector — its input shape varies continuously with every
// plate-ROI aspect ratio. TRT EP is a poor fit because it
// builds a fresh engine for each unique shape (minutes each).
// We keep it on CUDA EP with the largest cuDNN workspace and
// let cuDNN HEURISTIC handle the per-shape algo selection.
//
// 2. Classifier + Recognizer — fixed-bucket shapes (cls is
// [1,3,80,160], rec is [1,3,48,{320,480,640,960}]). These
// benefit massively from TRT EP because the engine is built
// once per shape and reused forever.
OrtHandlerOptions detectorOpts;
// Detector uses CUDA EP with *conservative* cuDNN workspace.
// Empirical: on VRAM-constrained GPUs (LPD TRT engine + rec TRT
// engine + ORT arena in play) the max-workspace mode causes cuDNN
// to pick Winograd/implicit-precomp-GEMM variants that silently
// fall back to slow NO-WORKSPACE algorithms when the big workspace
// can't be allocated. With "0" cuDNN picks algorithms that are
// known to fit and runs ~10x faster in practice.
detectorOpts.useMaxCudnnWorkspace = false;
detectorOpts.preferTensorRT = false; // never TRT for the detector
// Classifier (fixed [1,3,80,160]): TRT with no profile is fine.
OrtHandlerOptions classifierOpts;
classifierOpts.useMaxCudnnWorkspace = true;
classifierOpts.preferTensorRT = preferTensorRT;
classifierOpts.trtFP16 = true;
// Recognizer: needs a DYNAMIC profile so one TRT engine covers every
// (batch, bucket_width) pair we generate at runtime. Without this,
// each new shape triggers a ~80s engine rebuild mid-stream when a
// new plate appears or the plate count changes.
//
// Profile range:
// batch : 1 .. 16 (16 plates worth of crops is generous)
// H : 48 (fixed)
// W : 320 .. 960 (covers all 4 recognizer buckets)
//
// Query the actual input name from the .onnx file instead of
// hardcoding — PaddleOCR usually exports it as "x" but the name can
// vary across model versions.
OrtHandlerOptions recognizerOpts;
recognizerOpts.useMaxCudnnWorkspace = true;
recognizerOpts.preferTensorRT = preferTensorRT;
recognizerOpts.trtFP16 = true;
if (preferTensorRT) {
std::string recInputName = BasicOrtHandler::QueryModelInputName(recModelPath);
if (recInputName.empty()) {
std::cerr << "[PaddleOCRV5Engine] Could not query recognizer "
"input name — defaulting to 'x'" << std::endl;
recInputName = "x";
}
std::cout << "[PaddleOCRV5Engine] Recognizer input name: '"
<< recInputName << "' — building TRT dynamic profile "
<< "[batch=1..16, W=320..960]" << std::endl;
recognizerOpts.trtProfileMinShapes = recInputName + ":1x3x48x320";
recognizerOpts.trtProfileOptShapes = recInputName + ":4x3x48x480";
recognizerOpts.trtProfileMaxShapes = recInputName + ":16x3x48x960";
}
try {
// Initialize detector (also triggers EPLoader init in BasicOrtHandler)
-detector_ = std::make_unique<ONNXOCRDetector>(detModelPath);
+detector_ = std::make_unique<ONNXOCRDetector>(detModelPath, detectorOpts);
std::cout << "[PaddleOCRV5Engine] Detector initialized: " << detModelPath << std::endl;
// Ensure this DLL's copy of Ort::Global<void>::api_ is initialized.
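The comments in this hunk describe the EP split (detector on the CUDA EP with a conservative cuDNN workspace, classifier/recognizer on the TensorRT EP with an explicit dynamic profile), but the diff does not show how OrtHandlerOptions is translated into ONNX Runtime session options. The sketch below shows the usual mapping onto the standard CUDA and TensorRT provider options; the OrtHandlerOptions forwarding, the trt_engine_cache_enable flag, and the function names here are assumptions, not code from this commit.

// Illustrative sketch only; not part of this commit. Shows how options like
// these are typically forwarded to ONNX Runtime's CUDA and TensorRT EPs.
#include <onnxruntime_cxx_api.h>
#include <string>

// Detector: CUDA EP, conservative cuDNN workspace, heuristic conv algo search.
Ort::SessionOptions MakeDetectorSessionOptionsSketch() {
    Ort::SessionOptions so;
    const OrtApi& api = Ort::GetApi();
    OrtCUDAProviderOptionsV2* cuda = nullptr;
    Ort::ThrowOnError(api.CreateCUDAProviderOptions(&cuda));
    const char* keys[] = {"cudnn_conv_use_max_workspace", "cudnn_conv_algo_search"};
    const char* vals[] = {"0", "HEURISTIC"};  // matches useMaxCudnnWorkspace = false
    Ort::ThrowOnError(api.UpdateCUDAProviderOptions(cuda, keys, vals, 2));
    so.AppendExecutionProvider_CUDA_V2(*cuda);
    api.ReleaseCUDAProviderOptions(cuda);
    return so;
}

// Recognizer: TensorRT EP with the dynamic profile described in the comments.
Ort::SessionOptions MakeRecognizerSessionOptionsSketch(const std::string& inputName) {
    Ort::SessionOptions so;
    const OrtApi& api = Ort::GetApi();
    OrtTensorRTProviderOptionsV2* trt = nullptr;
    Ort::ThrowOnError(api.CreateTensorRTProviderOptions(&trt));
    const std::string minS = inputName + ":1x3x48x320";
    const std::string optS = inputName + ":4x3x48x480";
    const std::string maxS = inputName + ":16x3x48x960";
    const char* keys[] = {"trt_fp16_enable", "trt_engine_cache_enable",
                          "trt_profile_min_shapes", "trt_profile_opt_shapes",
                          "trt_profile_max_shapes"};
    const char* vals[] = {"1", "1", minS.c_str(), optS.c_str(), maxS.c_str()};
    Ort::ThrowOnError(api.UpdateTensorRTProviderOptions(trt, keys, vals, 5));
    so.AppendExecutionProvider_TensorRT_V2(*trt);
    api.ReleaseTensorRTProviderOptions(trt);
    return so;
}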
@@ -29,7 +91,7 @@ bool PaddleOCRV5Engine::Initialize(const std::string& detModelPath,
// Initialize classifier (optional)
if (!clsModelPath.empty()) {
-classifier_ = std::make_unique<ONNXOCRClassifier>(clsModelPath);
+classifier_ = std::make_unique<ONNXOCRClassifier>(clsModelPath, classifierOpts);
std::cout << "[PaddleOCRV5Engine] Classifier initialized: " << clsModelPath << std::endl;
}
else {
@@ -38,13 +100,26 @@ bool PaddleOCRV5Engine::Initialize(const std::string& detModelPath,
}
// Initialize recognizer
-recognizer_ = std::make_unique<ONNXOCRRecognizer>(recModelPath);
+recognizer_ = std::make_unique<ONNXOCRRecognizer>(recModelPath, recognizerOpts);
if (!recognizer_->LoadDictionary(dictPath)) {
std::cerr << "[PaddleOCRV5Engine] Failed to load dictionary" << std::endl;
return false;
}
std::cout << "[PaddleOCRV5Engine] Recognizer initialized: " << recModelPath << std::endl;
// Pre-warm classifier (fixed [1,3,80,160]) and recognizer (4
// bucket widths) so the first frame doesn't pay the cuDNN/TRT
// algorithm-selection tax. The detector is intentionally NOT
// warmed up: its input shape varies continuously with each
// plate-ROI aspect ratio, so a warmup at any single canonical
// shape would cost minutes (TRT) or be useless (CUDA cache miss
// on the real frame anyway). Real frames will pay the per-shape
// cuDNN HEURISTIC cost on first use.
std::cout << "[PaddleOCRV5Engine] Warming up OCR pipeline..." << std::endl;
if (classifier_) classifier_->Warmup();
if (recognizer_) recognizer_->Warmup();
std::cout << "[PaddleOCRV5Engine] Warmup complete" << std::endl;
_initialized = true;
std::cout << "[PaddleOCRV5Engine] Pipeline initialized successfully" << std::endl;
return true;
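Warmup() itself is not part of this hunk. Below is a minimal sketch of what the warmup described above typically does, assuming the classifier takes a single fixed-size crop and the recognizer buckets by width; Classify() and the free-function name are hypothetical, while Recognize() appears elsewhere in this diff.

// Illustrative sketch only; not part of this commit. Pre-runs dummy crops at
// the fixed classifier shape and every recognizer bucket width so cuDNN/TRT
// algorithm selection (and the TRT engine build per profile shape) happens at
// init time instead of on the first real frame.
#include <opencv2/opencv.hpp>

void WarmupOcrSketch(ONNXOCRClassifier* classifier, ONNXOCRRecognizer* recognizer) {
    // Classifier input is fixed at [1,3,80,160]; a single dummy crop suffices.
    if (classifier) {
        cv::Mat clsDummy(80, 160, CV_8UC3, cv::Scalar(127, 127, 127));
        (void)classifier->Classify(clsDummy);   // hypothetical method name
    }
    // Recognizer buckets: H fixed at 48, W in {320, 480, 640, 960}.
    if (recognizer) {
        for (int w : {320, 480, 640, 960}) {
            cv::Mat recDummy(48, w, CV_8UC3, cv::Scalar(127, 127, 127));
            (void)recognizer->Recognize(recDummy);  // Recognize() is in this diff
        }
    }
}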
@@ -140,5 +215,18 @@ TextLine PaddleOCRV5Engine::recognizeOnly(const cv::Mat& croppedImage) {
return recognizer_->Recognize(croppedImage);
}
std::vector<TextLine> PaddleOCRV5Engine::recognizeMany(const std::vector<cv::Mat>& croppedImages) {
if (_modelLoading.load()) return std::vector<TextLine>(croppedImages.size());
{
auto lk = TryLockWithTimeout("PaddleOCRV5Engine::recognizeMany");
if (!lk.owns_lock()) return std::vector<TextLine>(croppedImages.size());
if (!_initialized || !recognizer_ || croppedImages.empty()) {
return std::vector<TextLine>(croppedImages.size());
}
}
// Delegates to the bucketed, batched path in ONNXOCRRecognizer.
return recognizer_->RecognizeBatch(croppedImages);
}
} // namespace onnxocr
} // namespace ANSCENTER
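RecognizeBatch's bucketed, batched path is referenced but not shown in this commit view. The sketch below shows how such bucketing commonly works: group crops by the recognizer bucket their 48-px-height width maps to, run one batched inference per bucket so each batch matches one of the TRT profile shapes, then scatter results back into input order. runBatch and both function names are assumptions; TextLine is the project's result type.

// Illustrative sketch only; not part of this commit. One plausible shape of
// the bucketed, batched recognition path that RecognizeBatch is described as
// implementing.
#include <opencv2/opencv.hpp>
#include <functional>
#include <map>
#include <vector>

static int PickBucketWidthSketch(const cv::Mat& crop) {
    // Scale the crop to the recognizer's fixed 48-px height, then snap the
    // resulting width up to the nearest bucket so every batch matches one of
    // the four TRT profile widths.
    const int scaledW = static_cast<int>(crop.cols * 48.0 / crop.rows);
    for (int bucket : {320, 480, 640, 960})
        if (scaledW <= bucket) return bucket;
    return 960;  // wider plates are squeezed into the largest bucket
}

std::vector<TextLine> RecognizeBucketedSketch(
    const std::vector<cv::Mat>& crops,
    const std::function<std::vector<TextLine>(const std::vector<cv::Mat>&, int width)>& runBatch) {
    std::vector<TextLine> out(crops.size());
    std::map<int, std::vector<size_t>> indicesByBucket;  // bucket width -> crop indices
    for (size_t i = 0; i < crops.size(); ++i)
        indicesByBucket[PickBucketWidthSketch(crops[i])].push_back(i);
    for (const auto& [width, indices] : indicesByBucket) {
        std::vector<cv::Mat> batch;
        batch.reserve(indices.size());
        for (size_t i : indices) batch.push_back(crops[i]);
        // One session run per bucket; a real implementation would also chunk
        // batches larger than the 16-crop maximum in the TRT profile.
        const std::vector<TextLine> results = runBatch(batch, width);
        for (size_t k = 0; k < indices.size(); ++k) out[indices[k]] = results[k];
    }
    return out;
}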