Improve ALPR_OCR performance

This commit is contained in:
2026-04-14 20:30:21 +10:00
parent 3349b45ade
commit f9a0af8949
18 changed files with 991 additions and 77 deletions

View File

@@ -6,6 +6,8 @@
#include <limits>
#include <filesystem>
#include <fstream>
#include <cstdlib>
#include <system_error>
namespace ANSCENTER {
@@ -13,6 +15,40 @@ namespace ANSCENTER {
// BasicOrtHandler — constructors
// ====================================================================
std::string BasicOrtHandler::QueryModelInputName(const std::string& onnxPath)
{
try {
// Make sure the Ort API pointer is initialised in THIS DLL.
if (Ort::Global<void>::api_ == nullptr) {
Ort::InitApi(static_cast<const OrtApi*>(EPLoader::GetOrtApiRaw()));
}
Ort::Env env(ORT_LOGGING_LEVEL_ERROR, "QueryModelInputName");
Ort::SessionOptions opts;
opts.SetIntraOpNumThreads(1);
opts.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_DISABLE_ALL);
// Intentionally NOT attaching CUDA/TRT EP — CPU is fastest
// for a no-inference metadata read.
std::wstring wpath(onnxPath.begin(), onnxPath.end());
Ort::Session session(env, wpath.c_str(), opts);
Ort::AllocatorWithDefaultOptions alloc;
auto inName = session.GetInputNameAllocated(0, alloc);
return std::string(inName.get());
}
catch (const Ort::Exception& e) {
std::cerr << "[QueryModelInputName] ORT exception: "
<< e.what() << " (path=" << onnxPath << ")" << std::endl;
return "";
}
catch (const std::exception& e) {
std::cerr << "[QueryModelInputName] std exception: "
<< e.what() << " (path=" << onnxPath << ")" << std::endl;
return "";
}
}
BasicOrtHandler::BasicOrtHandler(const std::string& _onnx_path,
unsigned int _num_threads)
: log_id(_onnx_path.data()),
@@ -36,6 +72,33 @@ namespace ANSCENTER {
initialize_handler();
}
// Construct a handler for the given ONNX model with an explicitly
// selected execution-provider engine type and per-handler options.
// Delegates all session/EP setup to initialize_handler().
// NOTE(review): log_id is initialised from _onnx_path.data(); if log_id
// is a raw char pointer this dangles once the caller's string dies —
// confirm log_id owns a copy (e.g. std::string).
// NOTE(review): onnx_path_w(_onnx_path.begin(), _onnx_path.end()) is a
// byte-wise narrow->wide copy that is only correct for ASCII paths —
// confirm non-ASCII model paths are not required.
BasicOrtHandler::BasicOrtHandler(const std::string& _onnx_path,
EngineType engineType,
const OrtHandlerOptions& options,
unsigned int _num_threads)
: log_id(_onnx_path.data()),
num_threads(_num_threads),
m_engineType(engineType),
m_handlerOptions(options),
onnx_path_w(_onnx_path.begin(), _onnx_path.end())
{
// onnx_path points into the member onnx_path_w, so it stays valid
// for the lifetime of this handler.
onnx_path = onnx_path_w.c_str();
initialize_handler();
}
// Convenience constructor: same as the EngineType overload but lets
// EPLoader auto-detect the best available engine. The sentinel
// static_cast<EngineType>(-1) deliberately encodes "auto-detect";
// initialize_handler()/EPLoader are expected to recognise it.
// NOTE(review): log_id is initialised from _onnx_path.data(); if log_id
// is a raw char pointer this dangles once the caller's string dies —
// confirm log_id owns a copy (e.g. std::string).
// NOTE(review): the begin()/end() narrow->wide copy only handles ASCII
// paths — confirm non-ASCII model paths are not required.
BasicOrtHandler::BasicOrtHandler(const std::string& _onnx_path,
const OrtHandlerOptions& options,
unsigned int _num_threads)
: log_id(_onnx_path.data()),
num_threads(_num_threads),
m_engineType(static_cast<EngineType>(-1)), // EPLoader auto-detect
m_handlerOptions(options),
onnx_path_w(_onnx_path.begin(), _onnx_path.end())
{
// onnx_path points into the member onnx_path_w, so it stays valid
// for the lifetime of this handler.
onnx_path = onnx_path_w.c_str();
initialize_handler();
}
BasicOrtHandler::~BasicOrtHandler()
{
if (ort_session) {
@@ -66,11 +129,15 @@ namespace ANSCENTER {
// - arena_extend_strategy = 1 (kSameAsRequested) to avoid
// pre-allocating huge GPU memory blocks that may exceed VRAM
// - cudnn_conv_algo_search = HEURISTIC for faster session init
// - cudnn_conv_use_max_workspace = 0 — use minimal cuDNN workspace
// to prevent CUDNN_BACKEND_API_FAILED when TRT engines already
// occupy most VRAM on the same GPU
// - cudnn_conv_use_max_workspace defaults to "0" to prevent
// CUDNN_BACKEND_API_FAILED when TRT engines already occupy
// most VRAM on the same GPU. OCR sub-models that need fast
// convs opt into "1" via OrtHandlerOptions::useMaxCudnnWorkspace
// - gpu_mem_limit — cap ONNX Runtime's GPU memory arena to 2 GB
// so it doesn't compete with TensorRT for the remaining VRAM
const char* maxWorkspace =
m_handlerOptions.useMaxCudnnWorkspace ? "1" : "0";
const char* keys[] = {
"device_id",
"arena_extend_strategy",
@@ -82,7 +149,7 @@ namespace ANSCENTER {
"0",
"1", // kSameAsRequested
"HEURISTIC", // avoid exhaustive algo search on large model
"0", // minimal cuDNN workspace (prevents OOM)
maxWorkspace, // "1" for OCR (perf), "0" elsewhere (safety)
"2147483648" // 2 GB arena limit
};
Ort::GetApi().UpdateCUDAProviderOptions(
@@ -92,7 +159,8 @@ namespace ANSCENTER {
Ort::GetApi().ReleaseCUDAProviderOptions(cuda_options);
std::cout << "[ORT] CUDA EP attached (arena=SameAsRequested, "
"cudnn=HEURISTIC, maxWorkspace=0, memLimit=2GB)." << std::endl;
"cudnn=HEURISTIC, maxWorkspace=" << maxWorkspace
<< ", memLimit=2GB)." << std::endl;
return true;
}
catch (const Ort::Exception& e) {
@@ -100,6 +168,113 @@ namespace ANSCENTER {
return false;
}
}
// Attempt to attach the TensorRT execution provider to the given
// session options. Returns true on success, false on any failure
// (caller falls back to CUDA/CPU). Never throws.
bool BasicOrtHandler::TryAppendTensorRT(Ort::SessionOptions& session_options)
{
    try {
        // The C API reports failure via OrtStatus*; check it so a
        // failed creation surfaces as an Ort::Exception instead of a
        // null-pointer crash further down.
        OrtTensorRTProviderOptionsV2* trt_options = nullptr;
        Ort::ThrowOnError(Ort::GetApi().CreateTensorRTProviderOptions(&trt_options));

        // Cache built engines on disk so subsequent runs skip the
        // multi-minute build. Engines are keyed on (model hash, GPU
        // arch, shape profile) so changing any of those triggers
        // a rebuild automatically.
        std::string cacheDir = m_handlerOptions.trtEngineCacheDir;
        if (cacheDir.empty()) {
            // Default to %TEMP%\ANSCENTER\TRTEngineCache, falling back
            // to %TMP% and finally the current directory.
            const char* tmp = std::getenv("TEMP");
            if (!tmp) tmp = std::getenv("TMP");
            if (!tmp) tmp = ".";
            std::filesystem::path p(tmp);
            p /= "ANSCENTER";
            p /= "TRTEngineCache";
            std::error_code ec;
            // Best effort — a failure here just means TRT rebuilds the
            // engine each run; it must not abort EP attachment.
            std::filesystem::create_directories(p, ec);
            cacheDir = p.string();
        }

        // Builder options tuned for *fast first-run*:
        //   - opt_level 1: builds in seconds, ~5-10 % runtime cost vs 3
        //   - workspace 1 GB: leaves room for CUDA EP arena and the
        //     LPD's own TRT engine on the same GPU
        //   - timing cache: persists kernel timings between runs so
        //     builds at new shapes get progressively faster. The
        //     timing-cache path is the cache *directory*: ORT derives
        //     a per-GPU file name inside it, so we pass cacheDir (a
        //     hand-built "timing.cache" file path would be wrong).
        //   - profile shapes (if set): build ONE dynamic-shape
        //     engine that handles all (batch, width) combos instead
        //     of rebuilding per unique input. Critical for variable
        //     batch workloads — without this, TRT EP rebuilds every
        //     time runtime sees a new shape pair, causing 60-90 s
        //     hangs mid-stream.
        const bool haveProfile = !m_handlerOptions.trtProfileMinShapes.empty()
            && !m_handlerOptions.trtProfileOptShapes.empty()
            && !m_handlerOptions.trtProfileMaxShapes.empty();

        // Build the key/value arrays. The first 8 keys are always set;
        // the profile shapes are appended only when provided. All
        // pointed-to strings (cacheDir, the option members) outlive the
        // Update call below.
        std::vector<const char*> keys = {
            "device_id",
            "trt_fp16_enable",
            "trt_engine_cache_enable",
            "trt_engine_cache_path",
            "trt_max_workspace_size",
            "trt_builder_optimization_level",
            "trt_timing_cache_enable",
            "trt_timing_cache_path"
        };
        std::vector<const char*> values = {
            "0",
            m_handlerOptions.trtFP16 ? "1" : "0",
            "1",
            cacheDir.c_str(),
            "1073741824",  // 1 GB build workspace
            "1",           // fast build (was "3")
            "1",
            cacheDir.c_str()
        };
        if (haveProfile) {
            keys.push_back("trt_profile_min_shapes");
            values.push_back(m_handlerOptions.trtProfileMinShapes.c_str());
            keys.push_back("trt_profile_opt_shapes");
            values.push_back(m_handlerOptions.trtProfileOptShapes.c_str());
            keys.push_back("trt_profile_max_shapes");
            values.push_back(m_handlerOptions.trtProfileMaxShapes.c_str());
        }

        // Check the update status explicitly: attaching a partially
        // configured EP (silently ignored failure) is worse than the
        // CUDA fallback. Release before throwing so the options object
        // is not leaked on the error path.
        OrtStatus* updateStatus = Ort::GetApi().UpdateTensorRTProviderOptions(
            trt_options, keys.data(), values.data(), keys.size());
        if (updateStatus != nullptr) {
            Ort::GetApi().ReleaseTensorRTProviderOptions(trt_options);
            Ort::ThrowOnError(updateStatus);
        }
        // Append may throw Ort::Exception; the options object would then
        // leak once, which is acceptable on this rare error path.
        session_options.AppendExecutionProvider_TensorRT_V2(*trt_options);
        Ort::GetApi().ReleaseTensorRTProviderOptions(trt_options);

        std::cout << "[ORT] TensorRT EP attached (fp16="
            << (m_handlerOptions.trtFP16 ? "1" : "0")
            << ", cache=" << cacheDir
            << ", profile=" << (haveProfile ? "dynamic" : "static")
            << ")." << std::endl;
        if (haveProfile) {
            std::cout << "[ORT] profile min: "
                << m_handlerOptions.trtProfileMinShapes << std::endl
                << "[ORT] profile opt: "
                << m_handlerOptions.trtProfileOptShapes << std::endl
                << "[ORT] profile max: "
                << m_handlerOptions.trtProfileMaxShapes << std::endl;
        }
        return true;
    }
    catch (const Ort::Exception& e) {
        std::cerr << "[ORT] TensorRT EP failed: " << e.what() << std::endl;
        return false;
    }
    catch (const std::exception& e) {
        std::cerr << "[ORT] TensorRT EP failed (std): " << e.what() << std::endl;
        return false;
    }
}
bool BasicOrtHandler::TryAppendDirectML(Ort::SessionOptions& session_options)
{
try {
@@ -267,9 +442,28 @@ namespace ANSCENTER {
{
// --------------------------------------------------------
case EngineType::NVIDIA_GPU:
// Try TensorRT EP first when explicitly requested. Falls
// through to CUDA EP if TRT is missing or option creation
// fails. Both EPs may be attached at once — ORT picks TRT
// for nodes it supports and CUDA for the rest.
if (m_handlerOptions.preferTensorRT
&& hasProvider("TensorrtExecutionProvider")) {
ANS_DBG("OrtHandler", "Trying TensorRT EP...");
if (TryAppendTensorRT(session_options)) {
epAttached = true;
}
else {
std::cerr << "[ORT] TensorRT EP attach failed — "
"falling back to CUDA EP." << std::endl;
}
}
ANS_DBG("OrtHandler", "Trying CUDA EP...");
if (hasProvider("CUDAExecutionProvider"))
epAttached = TryAppendCUDA(session_options);
if (hasProvider("CUDAExecutionProvider")) {
if (TryAppendCUDA(session_options)) {
epAttached = true;
}
}
if (!epAttached) {
std::cerr << "[ORT] CUDA EP unavailable — falling back to CPU."
<< std::endl;