Add CPU/GPU gate and support new ANSALPR using OCR

This commit is contained in:
2026-04-12 17:16:16 +10:00
parent 27083a6530
commit 0a8aaed215
30 changed files with 1870 additions and 2166 deletions

View File

@@ -0,0 +1,57 @@
#pragma once
// ANSODVendorGate.h — Cached NVIDIA hardware check for ANSODEngine.dll.
//
// ANSODEngine.dll links against CUDA::cudart_static + CUDA::cublas +
// CUDA::cublasLt and hosts the TensorRT inference classes (ANSRTYOLO,
// TENSORRTOD, TENSORRTCL, TENSORRTSEG, TENSORRTPOSE, ANSSAM3, ANSYOLOV10RTOD,
// ANSYOLOV12RTOD, ANSTENSORRTPOSE) plus the NV12 preprocess helper and the
// TRT engine pool.
//
// The dllmain factory already hard-gates TRT class instantiation on
// NVIDIA_GPU and falls back to ONNX Runtime / OpenVINO on AMD/Intel/CPU.
// However, several support paths still call into the CUDA runtime
// unconditionally:
// • GetNumGPUs() / GetPoolMaxSlotsPerGpu() / CheckGPUVRAM() helpers
// (called from inside NVIDIA_GPU guards today, but safer to gate at
// source so a future refactor cannot accidentally wake up cudart on
// AMD/Intel).
// • A few case labels in the model-type switch instantiate TRT classes
// without an explicit NVIDIA_GPU check — they are currently unreachable
// due to upstream modelType rewriting, but leaving them unguarded
// creates a maintenance trap.
//
// Solution: a single process-wide cached predicate that evaluates
// CheckHardwareInformation() exactly once. On AMD/Intel/CPU the predicate
// returns false and every gated site short-circuits before touching any
// CUDA API.
//
// Mirrors ANSCVVendorGate / ANSLPR_OD::isNvidiaEngine / ANSOCR factory gate
// / ANSFR CreateANSRFHandle vendor log. Keeps the four shipped DLLs on a
// single, auditable pattern.
#include "ANSLicense.h"
#include <atomic>
namespace ansod_vendor_gate {
// Lazily evaluates ANSLicenseHelper::CheckHardwareInformation() and
// caches the result. Thread-safe via std::atomic<int> (0 = unknown,
// 1 = NVIDIA, 2 = non-NVIDIA); a benign race may run the probe twice
// before the cache settles, but both probes agree, so the cached answer
// is stable. No std::call_once overhead on the hot inference path.
// Fails safe to non-NVIDIA on exception.
// Returns true iff the cached hardware probe resolved to an NVIDIA GPU.
// First caller pays for CheckHardwareInformation(); every later call is a
// single relaxed-cost atomic load. Exceptions resolve to non-NVIDIA.
[[nodiscard]] inline bool IsNvidiaGpuAvailable() noexcept {
    // 0 = not yet probed, 1 = NVIDIA detected, 2 = anything else.
    static std::atomic<int> s_state{0};
    const int known = s_state.load(std::memory_order_acquire);
    if (known != 0) {
        return known == 1;
    }
    int resolved = 2; // fail-safe default: treat as non-NVIDIA
    try {
        if (ANSCENTER::ANSLicenseHelper::CheckHardwareInformation() ==
            ANSCENTER::EngineType::NVIDIA_GPU) {
            resolved = 1;
        }
    } catch (...) {
        resolved = 2;
    }
    s_state.store(resolved, std::memory_order_release);
    return resolved == 1;
}
} // namespace ansod_vendor_gate

View File

@@ -237,6 +237,38 @@ namespace ANSCENTER {
output_node_names.data(),
num_outputs);
// ── Output shape sanity check ───────────────────────────────────
// DirectML on some AMD configurations has been observed to return
// output tensors whose dim[1]/dim[2] values don't match what the
// ONNX graph actually produced, which propagates into
// postprocessLegacy / postprocessEndToEnd as huge numBoxes /
// numChannels values and causes multi-terabyte cv::Mat allocations
// inside the `cv::Mat(numChannels, numBoxes, CV_32F, ...).t()`
// call (observed as "Failed to allocate 3522082959360 bytes" on
// Ryzen APUs). Bail out early here instead of letting the
// postprocess layer try to materialise a 3.5 TB buffer.
//
// Sane upper bounds for Ultralytics YOLO outputs:
// • legacy [1, 84..300, 8400..25200] → max dim ≈ 30k
// • end2end [1, 300, 6..56] → max dim ≈ 300
// • segmentation proto mask [1, 32, 160, 160] → max dim ≈ 160
// • classification [1, 1000] → max dim ≈ 1k
// 1,000,000 is ~30x the largest real-world dim and catches the
// garbage values without clipping any legitimate model.
constexpr int64_t kMaxOutputDim = 1000000;
for (size_t t = 0; t < outputTensors.size(); ++t) {
const auto shape = outputTensors[t].GetTensorTypeAndShapeInfo().GetShape();
for (size_t d = 0; d < shape.size(); ++d) {
if (shape[d] < 0 || shape[d] > kMaxOutputDim) {
std::cerr << "[ONNXYOLO] detect: output[" << t
<< "] dim[" << d << "]=" << shape[d]
<< " is out of range — refusing to postprocess."
<< std::endl;
return {};
}
}
}
const cv::Size resizedShape(
static_cast<int>(input_node_dims[3]),
static_cast<int>(input_node_dims[2]));
@@ -1399,6 +1431,23 @@ namespace ANSCENTER {
output_node_names.data(),
num_outputs);
// Output shape sanity check — see detect() for rationale. Prevents
// DirectML-returned garbage dims from propagating into postprocess
// and triggering multi-terabyte cv::Mat allocations on AMD.
constexpr int64_t kMaxOutputDim = 1000000;
for (size_t t = 0; t < outputTensors.size(); ++t) {
const auto sh = outputTensors[t].GetTensorTypeAndShapeInfo().GetShape();
for (size_t d = 0; d < sh.size(); ++d) {
if (sh[d] < 0 || sh[d] > kMaxOutputDim) {
std::cerr << "[ONNXYOLO] detectBatch: output[" << t
<< "] dim[" << d << "]=" << sh[d]
<< " is out of range — refusing to postprocess."
<< std::endl;
return std::vector<std::vector<Object>>(N);
}
}
}
const cv::Size resizedShape(
static_cast<int>(input_node_dims[3]),
static_cast<int>(input_node_dims[2]));
@@ -1589,59 +1638,92 @@ namespace ANSCENTER {
}
// ========================================================================
// WarmUpEngine — run a dummy inference after session creation.
//
// Scope: **NVIDIA (CUDA EP) only.** On first inference, the CUDA EP
// allocates its memory arena (capped at 2 GB via BasicOrtHandler config),
// resolves cuDNN convolution algorithms, and populates the kernel launch
// cache. Running one dummy inference at load time amortises this cost
// so the first real frame doesn't see a latency spike.
//
// Explicitly disabled on AMD, Intel and CPU:
//   • AMD (DirectML) — calling detect() at load time has been observed
//     to hit a multi-terabyte cv::Mat allocation inside postprocessLegacy
//     on AMD RDNA iGPUs when DirectML returns garbage output tensor
//     dims. ONNXYOLO::detect() now has an output-shape sanity guard
//     that catches this at runtime, so the warm-up would add risk
//     without benefit. Earlier builds enabled warm-up specifically for
//     Radeon 680M TDR mitigation; that workaround is obsolete with
//     current DirectML 1.15.x drivers.
//   • Intel (OpenVINO) — running detect() at load time has been
//     observed to expose latent heap-corruption bugs
//     (ntdll +0x1176e5 / STATUS_HEAP_CORRUPTION 0xc0000374).
//   • CPU EP — no shader compile or kernel cache to warm up; the first
//     real frame has the same latency as any subsequent frame.
//
// Non-fatal on failure: if warm-up itself throws, regular inference
// still works — the engine is fully loaded before WarmUpEngine runs.
// ========================================================================
void ANSONNXYOLO::WarmUpEngine() {
    if (!m_ortEngine) return;
    // Gate strictly on NVIDIA_GPU. Every other EP is a no-op.
    if (m_ortEngine->getEngineType() != EngineType::NVIDIA_GPU) {
        ANS_DBG("ONNXYOLO", "Warm-up skipped (non-NVIDIA EP)");
        return;
    }
    // ── Strict dimension validation ─────────────────────────────────
    // Defensive: refuse to warm up with implausible model dimensions.
    // _modelConfig values come from the caller's ModelConfig and are
    // normally 224..640; anything outside [32, 4096] is almost certainly
    // a bug in the caller and we skip warm-up rather than risk a huge
    // cv::Mat allocation inside detect().
    constexpr int kMinDim = 32;
    constexpr int kMaxDim = 4096;
    const int rawW = _modelConfig.inpWidth;
    const int rawH = _modelConfig.inpHeight;
    if (rawW <= 0 || rawH <= 0 || rawW > kMaxDim || rawH > kMaxDim) {
        _logger.LogWarn("ANSONNXYOLO::WarmUpEngine",
            "Warm-up skipped — suspect input dims ("
            + std::to_string(rawW) + "x" + std::to_string(rawH) + ")",
            __FILE__, __LINE__);
        return;
    }
    const int w = std::clamp(rawW, kMinDim, kMaxDim);
    const int h = std::clamp(rawH, kMinDim, kMaxDim);
    try {
        // Mid-gray BGR image matches the letterbox fill colour used in
        // preprocessing (114,114,114 ~ 128) and avoids degenerate inputs.
        cv::Mat dummy(h, w, CV_8UC3, cv::Scalar(128, 128, 128));
        ANS_DBG("ONNXYOLO", "Warm-up: running 1 dummy CUDA inference (%dx%d)", w, h);
        auto t0 = std::chrono::steady_clock::now();
        (void)m_ortEngine->detect(dummy, _classes,
                                  PROBABILITY_THRESHOLD,
                                  NMS_THRESHOLD,
                                  NUM_KPS);
        auto t1 = std::chrono::steady_clock::now();
        auto ms = std::chrono::duration_cast<std::chrono::milliseconds>(t1 - t0).count();
        ANS_DBG("ONNXYOLO", "Warm-up done: %lld ms", (long long)ms);
    }
    catch (const cv::Exception& e) {
        // Defensive — should not fire on NVIDIA CUDA EP, but if it does
        // the engine itself is still loaded and real inference will work.
        _logger.LogWarn("ANSONNXYOLO::WarmUpEngine",
            std::string("Warm-up skipped (cv::Exception, non-fatal): ") + e.what(),
            __FILE__, __LINE__);
    }
    catch (const std::exception& e) {
        _logger.LogWarn("ANSONNXYOLO::WarmUpEngine",
            std::string("Warm-up skipped (std::exception, non-fatal): ") + e.what(),
            __FILE__, __LINE__);
    }
    catch (...) {
        _logger.LogWarn("ANSONNXYOLO::WarmUpEngine",
            "Warm-up skipped (unknown exception, non-fatal)",
            __FILE__, __LINE__);
    }
}

View File

@@ -7,6 +7,7 @@
#include "engine/EnginePoolManager.h" // clearAll() on DLL_PROCESS_DETACH
#include <climits> // INT_MIN
#include "ANSLicense.h" // ANS_DBG macro for DebugView
#include "ANSODVendorGate.h" // ansod_vendor_gate::IsNvidiaGpuAvailable()
// Process-wide flag: when true, all engines force single-GPU path (no pool, no idle timers).
// Defined here, declared extern in EngineBuildLoadNetwork.inl.
@@ -88,6 +89,17 @@ static std::mutex g_gpuCountMutex;
static int GetNumGPUs() {
std::lock_guard<std::mutex> lk(g_gpuCountMutex);
if (g_numGPUs < 0) {
// Defense-in-depth: all callers (AssignNextGPU, GetPoolMaxSlotsPerGpu,
// CheckGPUVRAM) are invoked inside factory-level NVIDIA_GPU guards,
// but skip the CUDA runtime entirely on AMD/Intel/CPU hardware so a
// future refactor cannot accidentally wake up cudart on non-NVIDIA.
// See ANSODVendorGate.h.
if (!ansod_vendor_gate::IsNvidiaGpuAvailable()) {
g_numGPUs = 1; // report a single "virtual" slot so round-robin is a no-op
std::cout << "Info [GPU]: non-NVIDIA hardware — CUDA probe skipped, pool slots=1"
<< std::endl;
return g_numGPUs;
}
// Use yield mode before any CUDA call to avoid busy-wait spinning
// that falsely reports 100% GPU utilization in nvidia-smi.
cudaSetDeviceFlags(cudaDeviceScheduleYield);
@@ -108,6 +120,13 @@ static int GetPoolMaxSlotsPerGpu() {
static std::mutex s_mutex;
std::lock_guard<std::mutex> lk(s_mutex);
if (s_result != INT_MIN) return s_result;
// Short-circuit on non-NVIDIA: no TRT engines will be built, no pool to
// size, and cudaSetDevice/cudaMemGetInfo below should not be reached.
// Safety net — callers today are already inside NVIDIA_GPU guards.
if (!ansod_vendor_gate::IsNvidiaGpuAvailable()) {
s_result = 1;
return s_result;
}
const int n = GetNumGPUs();
if (n <= 1) {
s_result = 1;
@@ -132,6 +151,9 @@ static int GetPoolMaxSlotsPerGpu() {
// Returns the next GPU index in round-robin order.
// Thread-safe: uses atomic fetch_add.
static int AssignNextGPU() {
// Non-NVIDIA short-circuit: no CUDA devices, return 0 and skip the
// "assigning task" log to avoid polluting AMD/Intel/CPU logs.
if (!ansod_vendor_gate::IsNvidiaGpuAvailable()) return 0;
const int numGPUs = GetNumGPUs();
const int idx = g_gpuRoundRobinCounter.fetch_add(1);
const int gpuIndex = idx % numGPUs;
@@ -144,6 +166,11 @@ static int AssignNextGPU() {
// Returns true if sufficient, false if not.
// minFreeBytes: minimum free VRAM required (default 512 MiB safety margin).
static bool CheckGPUVRAM(int gpuIndex, size_t minFreeBytes = 512ULL * 1024 * 1024) {
// Non-NVIDIA short-circuit: no CUDA devices present — report "OK"
// silently so the TRT pool path is a no-op on AMD/Intel/CPU and the
// log isn't polluted with spurious 0-byte VRAM warnings.
if (!ansod_vendor_gate::IsNvidiaGpuAvailable()) return true;
int prevDevice = 0;
cudaGetDevice(&prevDevice);
cudaSetDevice(gpuIndex);
@@ -253,6 +280,16 @@ BOOL APIENTRY DllMain( HMODULE hModule,
// Pin the DLL so it is never unmapped while idle-timer or CUDA threads
// are still running. During LabVIEW shutdown the CLR/COM teardown can
// unload DLLs before all threads exit → crash at unmapped code.
//
// CRITICAL: do NOT call CheckHardwareInformation() or
// ansod_vendor_gate::IsNvidiaGpuAvailable() from here. DllMain holds
// the OS loader lock (LdrpLoaderLock). CheckHardwareInformation
// touches hwinfo → DXGI / WMI / COM, which internally call
// LoadLibrary; doing that while holding the loader lock causes a
// classic loader-lock deadlock (observed as a full hang of the
// ANSLPR-UnitTest stress test). The vendor gate will lazy-
// initialise on the first real call from worker code, which runs
// with the loader lock released.
{
HMODULE hSelf = nullptr;
GetModuleHandleExW(
@@ -511,8 +548,19 @@ extern "C" ANSODENGINE_API std::string CreateANSODHandle(ANSCENTER::ANSODBase**
modelConfig.modelType = ANSCENTER::ModelType::ODHUBMODEL;
break;
case 14: //TensorRT for Object Detection Yolov10
(*Handle) = new ANSCENTER::ANSYOLOV10RTOD();
modelConfig.modelType = ANSCENTER::ModelType::YOLOV10RTOD;
// Upstream modelType rewrite (see top of each factory) already
// redirects 14 → 31 (RTYOLO) on NVIDIA or 14 → 30 (ONNXYOLO) on
// non-NVIDIA, so this branch is unreachable in practice. Keep
// an explicit vendor gate as defense-in-depth against future
// refactors — ANSYOLOV10RTOD is a TensorRT class and must never
// be constructed on AMD/Intel/CPU hardware.
if (engineType == ANSCENTER::EngineType::NVIDIA_GPU) {
(*Handle) = new ANSCENTER::ANSYOLOV10RTOD();
modelConfig.modelType = ANSCENTER::ModelType::YOLOV10RTOD;
} else {
(*Handle) = new ANSCENTER::ANSONNXYOLO();
modelConfig.modelType = ANSCENTER::ModelType::ONNXYOLO;
}
break;
case 15: //OpenVino for Object Detection Yolov10
(*Handle) = new ANSCENTER::ANSOYOLOV10OVOD();
@@ -832,8 +880,19 @@ extern "C" ANSODENGINE_API int CreateANSODHandleEx(ANSCENTER::ANSODBase** Handl
modelConfig.modelType = ANSCENTER::ModelType::ODHUBMODEL;
break;
case 14: //TensorRT for Object Detection Yolov10
(*Handle) = new ANSCENTER::ANSYOLOV10RTOD();
modelConfig.modelType = ANSCENTER::ModelType::YOLOV10RTOD;
// Upstream modelType rewrite (see top of each factory) already
// redirects 14 → 31 (RTYOLO) on NVIDIA or 14 → 30 (ONNXYOLO) on
// non-NVIDIA, so this branch is unreachable in practice. Keep
// an explicit vendor gate as defense-in-depth against future
// refactors — ANSYOLOV10RTOD is a TensorRT class and must never
// be constructed on AMD/Intel/CPU hardware.
if (engineType == ANSCENTER::EngineType::NVIDIA_GPU) {
(*Handle) = new ANSCENTER::ANSYOLOV10RTOD();
modelConfig.modelType = ANSCENTER::ModelType::YOLOV10RTOD;
} else {
(*Handle) = new ANSCENTER::ANSONNXYOLO();
modelConfig.modelType = ANSCENTER::ModelType::ONNXYOLO;
}
break;
case 15: //OpenVino for Object Detection Yolov10
(*Handle) = new ANSCENTER::ANSOYOLOV10OVOD();
@@ -1193,8 +1252,19 @@ extern "C" __declspec(dllexport) int LoadModelFromFolder(ANSCENTER::ANSODBase**
modelConfig.modelType = ANSCENTER::ModelType::ODHUBMODEL;
break;
case 14: //TensorRT for Object Detection Yolov10
(*Handle) = new ANSCENTER::ANSYOLOV10RTOD();
modelConfig.modelType = ANSCENTER::ModelType::YOLOV10RTOD;
// Upstream modelType rewrite (see top of each factory) already
// redirects 14 → 31 (RTYOLO) on NVIDIA or 14 → 30 (ONNXYOLO) on
// non-NVIDIA, so this branch is unreachable in practice. Keep
// an explicit vendor gate as defense-in-depth against future
// refactors — ANSYOLOV10RTOD is a TensorRT class and must never
// be constructed on AMD/Intel/CPU hardware.
if (engineType == ANSCENTER::EngineType::NVIDIA_GPU) {
(*Handle) = new ANSCENTER::ANSYOLOV10RTOD();
modelConfig.modelType = ANSCENTER::ModelType::YOLOV10RTOD;
} else {
(*Handle) = new ANSCENTER::ANSONNXYOLO();
modelConfig.modelType = ANSCENTER::ModelType::ONNXYOLO;
}
break;
case 15: //OpenVino for Object Detection Yolov10
(*Handle) = new ANSCENTER::ANSOYOLOV10OVOD();