Add CPU/GPU vendor gate and add support for the new OCR-based ANSALPR

This commit is contained in:
2026-04-12 17:16:16 +10:00
parent 27083a6530
commit 0a8aaed215
30 changed files with 1870 additions and 2166 deletions

View File

@@ -7,6 +7,7 @@
#include "engine/EnginePoolManager.h" // clearAll() on DLL_PROCESS_DETACH
#include <climits> // INT_MIN
#include "ANSLicense.h" // ANS_DBG macro for DebugView
#include "ANSODVendorGate.h" // ansod_vendor_gate::IsNvidiaGpuAvailable()
// Process-wide flag: when true, all engines force single-GPU path (no pool, no idle timers).
// Defined here, declared extern in EngineBuildLoadNetwork.inl.
@@ -88,6 +89,17 @@ static std::mutex g_gpuCountMutex;
static int GetNumGPUs() {
std::lock_guard<std::mutex> lk(g_gpuCountMutex);
if (g_numGPUs < 0) {
// Defense-in-depth: all callers (AssignNextGPU, GetPoolMaxSlotsPerGpu,
// CheckGPUVRAM) are invoked inside factory-level NVIDIA_GPU guards,
// but skip the CUDA runtime entirely on AMD/Intel/CPU hardware so a
// future refactor cannot accidentally wake up cudart on non-NVIDIA.
// See ANSODVendorGate.h.
if (!ansod_vendor_gate::IsNvidiaGpuAvailable()) {
g_numGPUs = 1; // report a single "virtual" slot so round-robin is a no-op
std::cout << "Info [GPU]: non-NVIDIA hardware — CUDA probe skipped, pool slots=1"
<< std::endl;
return g_numGPUs;
}
// Use yield mode before any CUDA call to avoid busy-wait spinning
// that falsely reports 100% GPU utilization in nvidia-smi.
cudaSetDeviceFlags(cudaDeviceScheduleYield);
@@ -108,6 +120,13 @@ static int GetPoolMaxSlotsPerGpu() {
static std::mutex s_mutex;
std::lock_guard<std::mutex> lk(s_mutex);
if (s_result != INT_MIN) return s_result;
// Short-circuit on non-NVIDIA: no TRT engines will be built, no pool to
// size, and cudaSetDevice/cudaMemGetInfo below should not be reached.
// Safety net — callers today are already inside NVIDIA_GPU guards.
if (!ansod_vendor_gate::IsNvidiaGpuAvailable()) {
s_result = 1;
return s_result;
}
const int n = GetNumGPUs();
if (n <= 1) {
s_result = 1;
@@ -132,6 +151,9 @@ static int GetPoolMaxSlotsPerGpu() {
// Returns the next GPU index in round-robin order.
// Thread-safe: uses atomic fetch_add.
static int AssignNextGPU() {
// Non-NVIDIA short-circuit: no CUDA devices, return 0 and skip the
// "assigning task" log to avoid polluting AMD/Intel/CPU logs.
if (!ansod_vendor_gate::IsNvidiaGpuAvailable()) return 0;
const int numGPUs = GetNumGPUs();
const int idx = g_gpuRoundRobinCounter.fetch_add(1);
const int gpuIndex = idx % numGPUs;
@@ -144,6 +166,11 @@ static int AssignNextGPU() {
// Returns true if sufficient, false if not.
// minFreeBytes: minimum free VRAM required (default 512 MiB safety margin).
static bool CheckGPUVRAM(int gpuIndex, size_t minFreeBytes = 512ULL * 1024 * 1024) {
// Non-NVIDIA short-circuit: no CUDA devices present — report "OK"
// silently so the TRT pool path is a no-op on AMD/Intel/CPU and the
// log isn't polluted with spurious 0-byte VRAM warnings.
if (!ansod_vendor_gate::IsNvidiaGpuAvailable()) return true;
int prevDevice = 0;
cudaGetDevice(&prevDevice);
cudaSetDevice(gpuIndex);
@@ -253,6 +280,16 @@ BOOL APIENTRY DllMain( HMODULE hModule,
// Pin the DLL so it is never unmapped while idle-timer or CUDA threads
// are still running. During LabVIEW shutdown the CLR/COM teardown can
// unload DLLs before all threads exit → crash at unmapped code.
//
// CRITICAL: do NOT call CheckHardwareInformation() or
// ansod_vendor_gate::IsNvidiaGpuAvailable() from here. DllMain holds
// the OS loader lock (LdrpLoaderLock). CheckHardwareInformation
// touches hwinfo → DXGI / WMI / COM, which internally call
// LoadLibrary; doing that while holding the loader lock causes a
// classic loader-lock deadlock (observed as a full hang of the
// ANSLPR-UnitTest stress test). The vendor gate will lazy-
// initialise on the first real call from worker code, which runs
// with the loader lock released.
{
HMODULE hSelf = nullptr;
GetModuleHandleExW(
@@ -511,8 +548,19 @@ extern "C" ANSODENGINE_API std::string CreateANSODHandle(ANSCENTER::ANSODBase**
modelConfig.modelType = ANSCENTER::ModelType::ODHUBMODEL;
break;
case 14: //TensorRT for Object Detection Yolov10
(*Handle) = new ANSCENTER::ANSYOLOV10RTOD();
modelConfig.modelType = ANSCENTER::ModelType::YOLOV10RTOD;
// Upstream modelType rewrite (see top of each factory) already
// redirects 14 → 31 (RTYOLO) on NVIDIA or 14 → 30 (ONNXYOLO) on
// non-NVIDIA, so this branch is unreachable in practice. Keep
// an explicit vendor gate as defense-in-depth against future
// refactors — ANSYOLOV10RTOD is a TensorRT class and must never
// be constructed on AMD/Intel/CPU hardware.
if (engineType == ANSCENTER::EngineType::NVIDIA_GPU) {
(*Handle) = new ANSCENTER::ANSYOLOV10RTOD();
modelConfig.modelType = ANSCENTER::ModelType::YOLOV10RTOD;
} else {
(*Handle) = new ANSCENTER::ANSONNXYOLO();
modelConfig.modelType = ANSCENTER::ModelType::ONNXYOLO;
}
break;
case 15: //OpenVino for Object Detection Yolov10
(*Handle) = new ANSCENTER::ANSOYOLOV10OVOD();
@@ -832,8 +880,19 @@ extern "C" ANSODENGINE_API int CreateANSODHandleEx(ANSCENTER::ANSODBase** Handl
modelConfig.modelType = ANSCENTER::ModelType::ODHUBMODEL;
break;
case 14: //TensorRT for Object Detection Yolov10
(*Handle) = new ANSCENTER::ANSYOLOV10RTOD();
modelConfig.modelType = ANSCENTER::ModelType::YOLOV10RTOD;
// Upstream modelType rewrite (see top of each factory) already
// redirects 14 → 31 (RTYOLO) on NVIDIA or 14 → 30 (ONNXYOLO) on
// non-NVIDIA, so this branch is unreachable in practice. Keep
// an explicit vendor gate as defense-in-depth against future
// refactors — ANSYOLOV10RTOD is a TensorRT class and must never
// be constructed on AMD/Intel/CPU hardware.
if (engineType == ANSCENTER::EngineType::NVIDIA_GPU) {
(*Handle) = new ANSCENTER::ANSYOLOV10RTOD();
modelConfig.modelType = ANSCENTER::ModelType::YOLOV10RTOD;
} else {
(*Handle) = new ANSCENTER::ANSONNXYOLO();
modelConfig.modelType = ANSCENTER::ModelType::ONNXYOLO;
}
break;
case 15: //OpenVino for Object Detection Yolov10
(*Handle) = new ANSCENTER::ANSOYOLOV10OVOD();
@@ -1193,8 +1252,19 @@ extern "C" __declspec(dllexport) int LoadModelFromFolder(ANSCENTER::ANSODBase**
modelConfig.modelType = ANSCENTER::ModelType::ODHUBMODEL;
break;
case 14: //TensorRT for Object Detection Yolov10
(*Handle) = new ANSCENTER::ANSYOLOV10RTOD();
modelConfig.modelType = ANSCENTER::ModelType::YOLOV10RTOD;
// Upstream modelType rewrite (see top of each factory) already
// redirects 14 → 31 (RTYOLO) on NVIDIA or 14 → 30 (ONNXYOLO) on
// non-NVIDIA, so this branch is unreachable in practice. Keep
// an explicit vendor gate as defense-in-depth against future
// refactors — ANSYOLOV10RTOD is a TensorRT class and must never
// be constructed on AMD/Intel/CPU hardware.
if (engineType == ANSCENTER::EngineType::NVIDIA_GPU) {
(*Handle) = new ANSCENTER::ANSYOLOV10RTOD();
modelConfig.modelType = ANSCENTER::ModelType::YOLOV10RTOD;
} else {
(*Handle) = new ANSCENTER::ANSONNXYOLO();
modelConfig.modelType = ANSCENTER::ModelType::ONNXYOLO;
}
break;
case 15: //OpenVino for Object Detection Yolov10
(*Handle) = new ANSCENTER::ANSOYOLOV10OVOD();