Improve ALPR_OCR performance
This commit is contained in:
@@ -248,6 +248,46 @@ namespace ANSCENTER {
|
||||
return std::string(ort_session->GetOutputNameAllocated(index, allocator).get());
|
||||
}
|
||||
|
||||
// ====================================================================
// High-perf options for OCR sub-models that need TRT EP and full
// cuDNN workspace. Default-constructed = identical to the legacy
// behavior (CUDA EP only, minimal cuDNN workspace).
// ====================================================================
struct OrtHandlerOptions {
    // Try to attach TensorRT EP before CUDA EP (NVIDIA only).
    // Falls back to CUDA EP automatically if TRT EP creation or session
    // creation fails. Engines are cached on disk for fast reload.
    bool preferTensorRT = false;

    // Use the largest cuDNN conv workspace. cuDNN can then pick fast
    // algorithms (Winograd, implicit-precomp-GEMM with big workspaces).
    // Defaults off because some deployments share VRAM with TRT engines
    // and need the minimal-workspace mode to avoid OOM.
    bool useMaxCudnnWorkspace = false;

    // Where to cache built TRT engines. Empty → default
    // %TEMP%/ANSCENTER/TRTEngineCache. Only used when preferTensorRT.
    std::string trtEngineCacheDir;

    // FP16 builds for TRT EP. Recommended for inference; ignored if
    // preferTensorRT is false.
    bool trtFP16 = true;

    // Dynamic-shape profile for TRT EP. When set, TRT builds ONE
    // engine that handles every input shape in the [min..max] range
    // instead of rebuilding per unique shape. Critical for models
    // that see many (batch_size, spatial) combinations at runtime.
    //
    // Format: "input_name:d0xd1xd2xd3[,input2:...]"
    // e.g. "x:1x3x48x320" for batch=1, C=3, H=48, W=320
    //
    // All three fields must be set together. An empty min implies
    // no profile (fall back to static-shape-per-unique-input mode).
    std::string trtProfileMinShapes;
    std::string trtProfileOptShapes;
    std::string trtProfileMaxShapes;
};
|
||||
|
||||
// ====================================================================
|
||||
// BasicOrtHandler
|
||||
// ====================================================================
|
||||
@@ -280,6 +320,9 @@ namespace ANSCENTER {
|
||||
const unsigned int num_threads;
|
||||
EngineType m_engineType;
|
||||
|
||||
// Per-session high-perf options. Default = legacy behavior.
|
||||
OrtHandlerOptions m_handlerOptions;
|
||||
|
||||
protected:
|
||||
// Default: hardware auto-detection via ANSLicenseHelper through EPLoader
|
||||
explicit BasicOrtHandler(const std::string& _onnx_path,
|
||||
@@ -290,6 +333,19 @@ namespace ANSCENTER {
|
||||
EngineType engineType,
|
||||
unsigned int _num_threads = 1);
|
||||
|
||||
// Engine override + per-session high-perf options (TRT EP, max
|
||||
// cuDNN workspace, etc.). Used by OCR sub-models that need
|
||||
// shape-stable, high-throughput inference.
|
||||
explicit BasicOrtHandler(const std::string& _onnx_path,
|
||||
EngineType engineType,
|
||||
const OrtHandlerOptions& options,
|
||||
unsigned int _num_threads = 1);
|
||||
|
||||
// Auto-detect engine via EPLoader, but with high-perf options.
|
||||
explicit BasicOrtHandler(const std::string& _onnx_path,
|
||||
const OrtHandlerOptions& options,
|
||||
unsigned int _num_threads = 1);
|
||||
|
||||
virtual ~BasicOrtHandler();
|
||||
|
||||
BasicOrtHandler(const BasicOrtHandler&) = delete;
|
||||
@@ -298,6 +354,13 @@ namespace ANSCENTER {
|
||||
// Resolved EP type (after EPLoader fallback). Subclasses use this
|
||||
// to branch on actual EP at inference time.
|
||||
EngineType getEngineType() const { return m_engineType; }
|
||||
|
||||
// Spin up a tiny CPU-only ORT session just long enough to read
|
||||
// the name of the model's first input, then tear it down. Used
|
||||
// by callers that need to build TRT profile-shape strings
|
||||
// (which require the input name) BEFORE the real session is
|
||||
// created. Returns an empty string on failure.
|
||||
static std::string QueryModelInputName(const std::string& onnxPath);
|
||||
private:
|
||||
void initialize_handler();
|
||||
protected:
|
||||
@@ -306,6 +369,7 @@ namespace ANSCENTER {
|
||||
|
||||
// EP-specific session option builders
|
||||
bool TryAppendCUDA(Ort::SessionOptions& opts);
|
||||
bool TryAppendTensorRT(Ort::SessionOptions& opts);
|
||||
bool TryAppendDirectML(Ort::SessionOptions& opts);
|
||||
bool TryAppendOpenVINO(Ort::SessionOptions& opts);
|
||||
};
|
||||
|
||||
Reference in New Issue
Block a user