Add CPU/GPU gate and support new ANSALPR using OCR

2026-04-12 17:16:16 +10:00
parent 27083a6530
commit 0a8aaed215
30 changed files with 1870 additions and 2166 deletions
--- a/modules/ANSODEngine/dllmain.cpp
+++ b/modules/ANSODEngine/dllmain.cpp
@@ -7,6 +7,7 @@
 #include "engine/EnginePoolManager.h" // clearAll() on DLL_PROCESS_DETACH
 #include <climits>                    // INT_MIN
 #include "ANSLicense.h"              // ANS_DBG macro for DebugView
+#include "ANSODVendorGate.h"         // ansod_vendor_gate::IsNvidiaGpuAvailable()

 // Process-wide flag: when true, all engines force single-GPU path (no pool, no idle timers).
 // Defined here, declared extern in EngineBuildLoadNetwork.inl.
@@ -88,6 +89,17 @@ static std::mutex g_gpuCountMutex;
 static int GetNumGPUs() {
 	std::lock_guard<std::mutex> lk(g_gpuCountMutex);
 	if (g_numGPUs < 0) {
+		// Defense-in-depth: all callers (AssignNextGPU, GetPoolMaxSlotsPerGpu,
+		// CheckGPUVRAM) are already invoked inside factory-level NVIDIA_GPU
+		// guards, but we still skip the CUDA runtime entirely on AMD/Intel/CPU
+		// hardware so that a future refactor cannot accidentally wake up
+		// cudart on non-NVIDIA machines.  See ANSODVendorGate.h.
+		if (!ansod_vendor_gate::IsNvidiaGpuAvailable()) {
+			g_numGPUs = 1;  // report a single "virtual" slot so round-robin is a no-op
+			std::cout << "Info [GPU]: non-NVIDIA hardware — CUDA probe skipped, pool slots=1"
+			          << std::endl;
+			return g_numGPUs;
+		}
 		// Use yield mode before any CUDA call to avoid busy-wait spinning
 		// that falsely reports 100% GPU utilization in nvidia-smi.
 		cudaSetDeviceFlags(cudaDeviceScheduleYield);
@@ -108,6 +120,13 @@ static int GetPoolMaxSlotsPerGpu() {
 	static std::mutex s_mutex;
 	std::lock_guard<std::mutex> lk(s_mutex);
 	if (s_result != INT_MIN) return s_result;
+	// Short-circuit on non-NVIDIA: no TRT engines will be built, no pool to
+	// size, and cudaSetDevice/cudaMemGetInfo below should not be reached.
+	// Safety net — callers today are already inside NVIDIA_GPU guards.
+	if (!ansod_vendor_gate::IsNvidiaGpuAvailable()) {
+		s_result = 1;
+		return s_result;
+	}
 	const int n = GetNumGPUs();
 	if (n <= 1) {
 		s_result = 1;
@@ -132,6 +151,9 @@ static int GetPoolMaxSlotsPerGpu() {
 // Returns the next GPU index in round-robin order.
 // Thread-safe: uses atomic fetch_add.
 static int AssignNextGPU() {
+	// Non-NVIDIA short-circuit: no CUDA devices, return 0 and skip the
+	// "assigning task" log to avoid polluting AMD/Intel/CPU logs.
+	if (!ansod_vendor_gate::IsNvidiaGpuAvailable()) return 0;
 	const int numGPUs = GetNumGPUs();
 	const int idx = g_gpuRoundRobinCounter.fetch_add(1);
 	const int gpuIndex = idx % numGPUs;
@@ -144,6 +166,11 @@ static int AssignNextGPU() {
 // Returns true if sufficient, false if not.
 // minFreeBytes: minimum free VRAM required (default 512 MiB safety margin).
 static bool CheckGPUVRAM(int gpuIndex, size_t minFreeBytes = 512ULL * 1024 * 1024) {
+	// Non-NVIDIA short-circuit: no CUDA devices present — report "OK"
+	// silently so the TRT pool path is a no-op on AMD/Intel/CPU and the
+	// log isn't polluted with spurious 0-byte VRAM warnings.
+	if (!ansod_vendor_gate::IsNvidiaGpuAvailable()) return true;
+
 	int prevDevice = 0;
 	cudaGetDevice(&prevDevice);
 	cudaSetDevice(gpuIndex);
@@ -253,6 +280,16 @@ BOOL APIENTRY DllMain( HMODULE hModule,
        // Pin the DLL so it is never unmapped while idle-timer or CUDA threads
        // are still running.  During LabVIEW shutdown the CLR/COM teardown can
        // unload DLLs before all threads exit → crash at unmapped code.
+        //
+        // CRITICAL: do NOT call CheckHardwareInformation() or
+        // ansod_vendor_gate::IsNvidiaGpuAvailable() from here.  DllMain holds
+        // the OS loader lock (LdrpLoaderLock).  CheckHardwareInformation
+        // touches hwinfo → DXGI / WMI / COM, which internally call
+        // LoadLibrary; doing that while holding the loader lock causes a
+        // classic loader-lock deadlock (observed as a full hang of the
+        // ANSLPR-UnitTest stress test).  The vendor gate will lazy-
+        // initialise on the first real call from worker code, which runs
+        // with the loader lock released.
        {
            HMODULE hSelf = nullptr;
            GetModuleHandleExW(
@@ -511,8 +548,19 @@ extern "C" ANSODENGINE_API std::string  CreateANSODHandle(ANSCENTER::ANSODBase**
 			modelConfig.modelType = ANSCENTER::ModelType::ODHUBMODEL;
 			break;
 		case 14: //TensorRT for Object Detection Yolov10
-			(*Handle) = new ANSCENTER::ANSYOLOV10RTOD();
-			modelConfig.modelType = ANSCENTER::ModelType::YOLOV10RTOD;
+			// Upstream modelType rewrite (see top of each factory) already
+			// redirects 14 → 31 (RTYOLO) on NVIDIA or 14 → 30 (ONNXYOLO) on
+			// non-NVIDIA, so this branch is unreachable in practice.  Keep
+			// an explicit vendor gate as defense-in-depth against future
+			// refactors — ANSYOLOV10RTOD is a TensorRT class and must never
+			// be constructed on AMD/Intel/CPU hardware.
+			if (engineType == ANSCENTER::EngineType::NVIDIA_GPU) {
+				(*Handle) = new ANSCENTER::ANSYOLOV10RTOD();
+				modelConfig.modelType = ANSCENTER::ModelType::YOLOV10RTOD;
+			} else {
+				(*Handle) = new ANSCENTER::ANSONNXYOLO();
+				modelConfig.modelType = ANSCENTER::ModelType::ONNXYOLO;
+			}
 			break;
 		case 15: //OpenVino for Object Detection Yolov10
 			(*Handle) = new ANSCENTER::ANSOYOLOV10OVOD();
@@ -832,8 +880,19 @@ extern "C" ANSODENGINE_API int  CreateANSODHandleEx(ANSCENTER::ANSODBase** Handl
 			modelConfig.modelType = ANSCENTER::ModelType::ODHUBMODEL;
 			break;
 		case 14: //TensorRT for Object Detection Yolov10
-			(*Handle) = new ANSCENTER::ANSYOLOV10RTOD();
-			modelConfig.modelType = ANSCENTER::ModelType::YOLOV10RTOD;
+			// Upstream modelType rewrite (see top of each factory) already
+			// redirects 14 → 31 (RTYOLO) on NVIDIA or 14 → 30 (ONNXYOLO) on
+			// non-NVIDIA, so this branch is unreachable in practice.  Keep
+			// an explicit vendor gate as defense-in-depth against future
+			// refactors — ANSYOLOV10RTOD is a TensorRT class and must never
+			// be constructed on AMD/Intel/CPU hardware.
+			if (engineType == ANSCENTER::EngineType::NVIDIA_GPU) {
+				(*Handle) = new ANSCENTER::ANSYOLOV10RTOD();
+				modelConfig.modelType = ANSCENTER::ModelType::YOLOV10RTOD;
+			} else {
+				(*Handle) = new ANSCENTER::ANSONNXYOLO();
+				modelConfig.modelType = ANSCENTER::ModelType::ONNXYOLO;
+			}
 			break;
 		case 15: //OpenVino for Object Detection Yolov10
 			(*Handle) = new ANSCENTER::ANSOYOLOV10OVOD();
@@ -1193,8 +1252,19 @@ extern "C" __declspec(dllexport) int LoadModelFromFolder(ANSCENTER::ANSODBase**
 			modelConfig.modelType = ANSCENTER::ModelType::ODHUBMODEL;
 			break;
 		case 14: //TensorRT for Object Detection Yolov10
-			(*Handle) = new ANSCENTER::ANSYOLOV10RTOD();
-			modelConfig.modelType = ANSCENTER::ModelType::YOLOV10RTOD;
+			// Upstream modelType rewrite (see top of each factory) already
+			// redirects 14 → 31 (RTYOLO) on NVIDIA or 14 → 30 (ONNXYOLO) on
+			// non-NVIDIA, so this branch is unreachable in practice.  Keep
+			// an explicit vendor gate as defense-in-depth against future
+			// refactors — ANSYOLOV10RTOD is a TensorRT class and must never
+			// be constructed on AMD/Intel/CPU hardware.
+			if (engineType == ANSCENTER::EngineType::NVIDIA_GPU) {
+				(*Handle) = new ANSCENTER::ANSYOLOV10RTOD();
+				modelConfig.modelType = ANSCENTER::ModelType::YOLOV10RTOD;
+			} else {
+				(*Handle) = new ANSCENTER::ANSONNXYOLO();
+				modelConfig.modelType = ANSCENTER::ModelType::ONNXYOLO;
+			}
 			break;
 		case 15: //OpenVino for Object Detection Yolov10
 			(*Handle) = new ANSCENTER::ANSOYOLOV10OVOD();