Support tracker to improve ALPR_OCR

2026-04-14 21:18:10 +10:00
parent f9a0af8949
commit 5706615ed5
4 changed files with 435 additions and 62 deletions
--- a/modules/ANSOCR/ANSONNXOCR/PaddleOCRV5Engine.cpp
+++ b/modules/ANSOCR/ANSONNXOCR/PaddleOCRV5Engine.cpp
@@ -8,60 +8,62 @@
 namespace ANSCENTER {
 namespace onnxocr {

-bool PaddleOCRV5Engine::Initialize(const std::string& detModelPath,
-                                    const std::string& clsModelPath,
-                                    const std::string& recModelPath,
-                                    const std::string& dictPath,
-                                    bool preferTensorRT) {
-    std::lock_guard<std::recursive_mutex> lock(_mutex);
-    ModelLoadingGuard mlg(_modelLoading);
+// ============================================================================
+//  Per-backend OCR option builders
+//
+//  Each backend (NVIDIA / AMD / Intel / CPU) has its own helper that returns
+//  a fully-populated set of OrtHandlerOptions for the detector, classifier,
+//  and recognizer sub-models. PaddleOCRV5Engine::Initialize dispatches to the
+//  correct helper based on the engine type that EPLoader resolved at startup.
+//
+//  Adding a new backend optimization is a strictly contained change: touch
+//  only that backend's builder. The others — especially NVIDIA, which is
+//  hand-tuned and should not regress — stay untouched.
+// ============================================================================

-    // High-perf options.  The OCR sub-models split into two groups:
-    //
-    //   1. Detector — its input shape varies continuously with every
-    //      plate-ROI aspect ratio.  TRT EP is a poor fit because it
-    //      builds a fresh engine for each unique shape (minutes each).
-    //      We keep it on CUDA EP with the largest cuDNN workspace and
-    //      let cuDNN HEURISTIC handle the per-shape algo selection.
-    //
-    //   2. Classifier + Recognizer — fixed-bucket shapes (cls is
-    //      [1,3,80,160], rec is [1,3,48,{320,480,640,960}]).  These
-    //      benefit massively from TRT EP because the engine is built
-    //      once per shape and reused forever.
+namespace {
+
+struct PerModelOcrOptions {
    OrtHandlerOptions detectorOpts;
-    // Detector uses CUDA EP with *conservative* cuDNN workspace.
-    // Empirical: on VRAM-constrained GPUs (LPD TRT engine + rec TRT
-    // engine + ORT arena in play) the max-workspace mode causes cuDNN
-    // to pick Winograd/implicit-precomp-GEMM variants that silently
-    // fall back to slow NO-WORKSPACE algorithms when the big workspace
-    // can't be allocated. With "0" cuDNN picks algorithms that are
-    // known to fit and runs ~10x faster in practice.
-    detectorOpts.useMaxCudnnWorkspace = false;
-    detectorOpts.preferTensorRT       = false;   // never TRT for the detector
-
-    // Classifier (fixed [1,3,80,160]): TRT with no profile is fine.
    OrtHandlerOptions classifierOpts;
-    classifierOpts.useMaxCudnnWorkspace = true;
-    classifierOpts.preferTensorRT       = preferTensorRT;
-    classifierOpts.trtFP16              = true;
-
-    // Recognizer: needs a DYNAMIC profile so one TRT engine covers every
-    // (batch, bucket_width) pair we generate at runtime. Without this,
-    // each new shape triggers a ~80s engine rebuild mid-stream when a
-    // new plate appears or the plate count changes.
-    //
-    // Profile range:
-    //   batch  : 1 .. 16       (16 plates worth of crops is generous)
-    //   H      : 48 (fixed)
-    //   W      : 320 .. 960    (covers all 4 recognizer buckets)
-    //
-    // Query the actual input name from the .onnx file instead of
-    // hardcoding — PaddleOCR usually exports it as "x" but the name can
-    // vary across model versions.
    OrtHandlerOptions recognizerOpts;
-    recognizerOpts.useMaxCudnnWorkspace = true;
-    recognizerOpts.preferTensorRT       = preferTensorRT;
-    recognizerOpts.trtFP16              = true;
+};
+
+// ----------------------------------------------------------------------------
+//  NVIDIA — LOCKED. Do NOT modify this helper unless fixing a specific
+//  NVIDIA-observable regression.
+//
+//  The OCR sub-models split into two groups:
+//    1. Detector — variable input shape per plate-ROI aspect. TRT EP is a
+//       poor fit (one engine build per unique shape, minutes each). Runs on
+//       CUDA EP with *conservative* cuDNN workspace: empirical measurements
+//       showed that max-workspace mode forces cuDNN to pick Winograd/
+//       implicit-precomp-GEMM variants that silently fall back to slow
+//       NO-WORKSPACE algorithms when the big workspace can't be allocated
+//       under VRAM pressure (LPD TRT engine + rec TRT engine + ORT arena).
+//    2. Classifier + Recognizer — TRT EP. Classifier has fixed shape so no
+//       profile is needed. Recognizer gets a dynamic profile
+//       [batch=1..16, W=320..960] so a single pre-built engine handles every
+//       runtime shape without mid-stream rebuilds (fixes 60–90 s hangs).
+// ----------------------------------------------------------------------------
+static PerModelOcrOptions BuildNvidiaOcrOptions(
+        const std::string& recModelPath,
+        bool preferTensorRT) {
+    PerModelOcrOptions opts;
+
+    // Detector: CUDA EP, conservative workspace, never TRT.
+    opts.detectorOpts.useMaxCudnnWorkspace = false;
+    opts.detectorOpts.preferTensorRT       = false;
+
+    // Classifier: TRT EP, no profile (fixed [1,3,80,160]).
+    opts.classifierOpts.useMaxCudnnWorkspace = true;
+    opts.classifierOpts.preferTensorRT       = preferTensorRT;
+    opts.classifierOpts.trtFP16              = true;
+
+    // Recognizer: TRT EP with dynamic shape profile.
+    opts.recognizerOpts.useMaxCudnnWorkspace = true;
+    opts.recognizerOpts.preferTensorRT       = preferTensorRT;
+    opts.recognizerOpts.trtFP16              = true;
    if (preferTensorRT) {
        std::string recInputName = BasicOrtHandler::QueryModelInputName(recModelPath);
        if (recInputName.empty()) {
@@ -72,10 +74,80 @@ bool PaddleOCRV5Engine::Initialize(const std::string& detModelPath,
        std::cout << "[PaddleOCRV5Engine] Recognizer input name: '"
                  << recInputName << "' — building TRT dynamic profile "
                  << "[batch=1..16, W=320..960]" << std::endl;
-        recognizerOpts.trtProfileMinShapes = recInputName + ":1x3x48x320";
-        recognizerOpts.trtProfileOptShapes = recInputName + ":4x3x48x480";
-        recognizerOpts.trtProfileMaxShapes = recInputName + ":16x3x48x960";
+        opts.recognizerOpts.trtProfileMinShapes = recInputName + ":1x3x48x320";
+        opts.recognizerOpts.trtProfileOptShapes = recInputName + ":4x3x48x480";
+        opts.recognizerOpts.trtProfileMaxShapes = recInputName + ":16x3x48x960";
    }
+    return opts;
+}
+
+// ----------------------------------------------------------------------------
+//  Intel (OpenVINO EP) — placeholder.
+//
+//  Returns default-constructed options: no backend-specific tuning applied
+//  yet. When adding Intel optimizations (OpenVINO cache_dir, explicit device
+//  selection, INT8 paths, etc.), add the corresponding fields to the Intel
+//  section of OrtHandlerOptions and populate them here.
+// ----------------------------------------------------------------------------
+static PerModelOcrOptions BuildIntelOcrOptions() {
+    return {};  // untuned: detector/classifier/recognizer keep their OrtHandlerOptions defaults
+}
+
+// ----------------------------------------------------------------------------
+//  AMD (DirectML EP / MIGraphX EP) — placeholder.
+//
+//  Returns default-constructed options: no backend-specific tuning applied
+//  yet. When adding AMD optimizations (graph opt gate for RDNA3+ desktop
+//  cards, MIGraphX cache on Linux, etc.), add the corresponding fields to
+//  the AMD section of OrtHandlerOptions and populate them here.
+// ----------------------------------------------------------------------------
+static PerModelOcrOptions BuildAmdOcrOptions() {
+    return {};  // untuned: detector/classifier/recognizer keep their OrtHandlerOptions defaults
+}
+
+// ----------------------------------------------------------------------------
+//  CPU / unknown hardware — no tuning.
+// ----------------------------------------------------------------------------
+static PerModelOcrOptions BuildDefaultOcrOptions() {
+    return {};  // untuned: detector/classifier/recognizer keep their OrtHandlerOptions defaults
+}
+
+// Maps the engine type reported by EPLoader to its option builder; called by Initialize().
+static PerModelOcrOptions BuildOcrOptionsForBackend(const std::string& recModelPath,
+                                                    bool preferTensorRT) {
+    // NOTE(review): assumes EPLoader::Current() is already resolved at this point — confirm.
+    const EngineType engine = EPLoader::Current().type;
+    if (engine == EngineType::NVIDIA_GPU) {
+        return BuildNvidiaOcrOptions(recModelPath, preferTensorRT);
+    }
+    if (engine == EngineType::AMD_GPU) {
+        return BuildAmdOcrOptions();
+    }
+    if (engine == EngineType::OPENVINO_GPU) {
+        return BuildIntelOcrOptions();
+    }
+    // EngineType::CPU and any unrecognized engine type share the untuned defaults.
+    return BuildDefaultOcrOptions();
+}
+
+} // namespace (anonymous)
+
+bool PaddleOCRV5Engine::Initialize(const std::string& detModelPath,
+                                    const std::string& clsModelPath,
+                                    const std::string& recModelPath,
+                                    const std::string& dictPath,
+                                    bool preferTensorRT) {
+    std::lock_guard<std::recursive_mutex> lock(_mutex);
+    ModelLoadingGuard mlg(_modelLoading);
+
+    // Dispatch to the correct per-backend option builder. The NVIDIA path
+    // is fully locked-in; AMD/Intel/CPU paths currently return defaults
+    // and are the place to add future backend-specific tuning.
+    const PerModelOcrOptions opts =
+        BuildOcrOptionsForBackend(recModelPath, preferTensorRT);
+    const OrtHandlerOptions& detectorOpts   = opts.detectorOpts;
+    const OrtHandlerOptions& classifierOpts = opts.classifierOpts;
+    const OrtHandlerOptions& recognizerOpts = opts.recognizerOpts;

    try {
        // Initialize detector (also triggers EPLoader init in BasicOrtHandler)