Add CPU/GPU gate and support new ANSALPR using OCR
This commit is contained in:
@@ -7,6 +7,7 @@
|
||||
#include "engine/EnginePoolManager.h" // clearAll() on DLL_PROCESS_DETACH
|
||||
#include <climits> // INT_MIN
|
||||
#include "ANSLicense.h" // ANS_DBG macro for DebugView
|
||||
#include "ANSODVendorGate.h" // ansod_vendor_gate::IsNvidiaGpuAvailable()
|
||||
|
||||
// Process-wide flag: when true, all engines force single-GPU path (no pool, no idle timers).
|
||||
// Defined here, declared extern in EngineBuildLoadNetwork.inl.
|
||||
@@ -88,6 +89,17 @@ static std::mutex g_gpuCountMutex;
|
||||
static int GetNumGPUs() {
|
||||
std::lock_guard<std::mutex> lk(g_gpuCountMutex);
|
||||
if (g_numGPUs < 0) {
|
||||
// Defense-in-depth: all callers (AssignNextGPU, GetPoolMaxSlotsPerGpu,
|
||||
// CheckGPUVRAM) are invoked inside factory-level NVIDIA_GPU guards,
|
||||
// but skip the CUDA runtime entirely on AMD/Intel/CPU hardware so a
|
||||
// future refactor cannot accidentally wake up cudart on non-NVIDIA.
|
||||
// See ANSODVendorGate.h.
|
||||
if (!ansod_vendor_gate::IsNvidiaGpuAvailable()) {
|
||||
g_numGPUs = 1; // report a single "virtual" slot so round-robin is a no-op
|
||||
std::cout << "Info [GPU]: non-NVIDIA hardware — CUDA probe skipped, pool slots=1"
|
||||
<< std::endl;
|
||||
return g_numGPUs;
|
||||
}
|
||||
// Use yield mode before any CUDA call to avoid busy-wait spinning
|
||||
// that falsely reports 100% GPU utilization in nvidia-smi.
|
||||
cudaSetDeviceFlags(cudaDeviceScheduleYield);
|
||||
@@ -108,6 +120,13 @@ static int GetPoolMaxSlotsPerGpu() {
|
||||
static std::mutex s_mutex;
|
||||
std::lock_guard<std::mutex> lk(s_mutex);
|
||||
if (s_result != INT_MIN) return s_result;
|
||||
// Short-circuit on non-NVIDIA: no TRT engines will be built, no pool to
|
||||
// size, and cudaSetDevice/cudaMemGetInfo below should not be reached.
|
||||
// Safety net — callers today are already inside NVIDIA_GPU guards.
|
||||
if (!ansod_vendor_gate::IsNvidiaGpuAvailable()) {
|
||||
s_result = 1;
|
||||
return s_result;
|
||||
}
|
||||
const int n = GetNumGPUs();
|
||||
if (n <= 1) {
|
||||
s_result = 1;
|
||||
@@ -132,6 +151,9 @@ static int GetPoolMaxSlotsPerGpu() {
|
||||
// Returns the next GPU index in round-robin order (always 0, without advancing the counter, when no NVIDIA GPU is available).
|
||||
// Thread-safe: uses atomic fetch_add.
|
||||
static int AssignNextGPU() {
|
||||
// Non-NVIDIA short-circuit: no CUDA devices, return 0 and skip the
|
||||
// "assigning task" log to avoid polluting AMD/Intel/CPU logs.
|
||||
if (!ansod_vendor_gate::IsNvidiaGpuAvailable()) return 0;
|
||||
const int numGPUs = GetNumGPUs();
|
||||
const int idx = g_gpuRoundRobinCounter.fetch_add(1);
|
||||
const int gpuIndex = idx % numGPUs;
|
||||
@@ -144,6 +166,11 @@ static int AssignNextGPU() {
|
||||
// Returns true if sufficient (or if no NVIDIA GPU is available, in which case the check is skipped), false if not.
|
||||
// minFreeBytes: minimum free VRAM required (default 512 MiB safety margin).
|
||||
static bool CheckGPUVRAM(int gpuIndex, size_t minFreeBytes = 512ULL * 1024 * 1024) {
|
||||
// Non-NVIDIA short-circuit: no CUDA devices present — report "OK"
|
||||
// silently so the TRT pool path is a no-op on AMD/Intel/CPU and the
|
||||
// log isn't polluted with spurious 0-byte VRAM warnings.
|
||||
if (!ansod_vendor_gate::IsNvidiaGpuAvailable()) return true;
|
||||
|
||||
int prevDevice = 0;
|
||||
cudaGetDevice(&prevDevice);
|
||||
cudaSetDevice(gpuIndex);
|
||||
@@ -253,6 +280,16 @@ BOOL APIENTRY DllMain( HMODULE hModule,
|
||||
// Pin the DLL so it is never unmapped while idle-timer or CUDA threads
|
||||
// are still running. During LabVIEW shutdown the CLR/COM teardown can
|
||||
// unload DLLs before all threads exit → crash at unmapped code.
|
||||
//
|
||||
// CRITICAL: do NOT call CheckHardwareInformation() or
|
||||
// ansod_vendor_gate::IsNvidiaGpuAvailable() from here. DllMain holds
|
||||
// the OS loader lock (LdrpLoaderLock). CheckHardwareInformation
|
||||
// touches hwinfo → DXGI / WMI / COM, which internally call
|
||||
// LoadLibrary; doing that while holding the loader lock causes a
|
||||
// classic loader-lock deadlock (observed as a full hang of the
|
||||
// ANSLPR-UnitTest stress test). The vendor gate will lazy-
|
||||
// initialise on the first real call from worker code, which runs
|
||||
// with the loader lock released.
|
||||
{
|
||||
HMODULE hSelf = nullptr;
|
||||
GetModuleHandleExW(
|
||||
@@ -511,8 +548,19 @@ extern "C" ANSODENGINE_API std::string CreateANSODHandle(ANSCENTER::ANSODBase**
|
||||
modelConfig.modelType = ANSCENTER::ModelType::ODHUBMODEL;
|
||||
break;
|
||||
case 14: //TensorRT for Object Detection Yolov10
|
||||
(*Handle) = new ANSCENTER::ANSYOLOV10RTOD();
|
||||
modelConfig.modelType = ANSCENTER::ModelType::YOLOV10RTOD;
|
||||
// Upstream modelType rewrite (see top of each factory) already
|
||||
// redirects 14 → 31 (RTYOLO) on NVIDIA or 14 → 30 (ONNXYOLO) on
|
||||
// non-NVIDIA, so this branch is unreachable in practice. Keep
|
||||
// an explicit vendor gate as defense-in-depth against future
|
||||
// refactors — ANSYOLOV10RTOD is a TensorRT class and must never
|
||||
// be constructed on AMD/Intel/CPU hardware.
|
||||
if (engineType == ANSCENTER::EngineType::NVIDIA_GPU) {
|
||||
(*Handle) = new ANSCENTER::ANSYOLOV10RTOD();
|
||||
modelConfig.modelType = ANSCENTER::ModelType::YOLOV10RTOD;
|
||||
} else {
|
||||
(*Handle) = new ANSCENTER::ANSONNXYOLO();
|
||||
modelConfig.modelType = ANSCENTER::ModelType::ONNXYOLO;
|
||||
}
|
||||
break;
|
||||
case 15: //OpenVino for Object Detection Yolov10
|
||||
(*Handle) = new ANSCENTER::ANSOYOLOV10OVOD();
|
||||
@@ -832,8 +880,19 @@ extern "C" ANSODENGINE_API int CreateANSODHandleEx(ANSCENTER::ANSODBase** Handl
|
||||
modelConfig.modelType = ANSCENTER::ModelType::ODHUBMODEL;
|
||||
break;
|
||||
case 14: //TensorRT for Object Detection Yolov10
|
||||
(*Handle) = new ANSCENTER::ANSYOLOV10RTOD();
|
||||
modelConfig.modelType = ANSCENTER::ModelType::YOLOV10RTOD;
|
||||
// Upstream modelType rewrite (see top of each factory) already
|
||||
// redirects 14 → 31 (RTYOLO) on NVIDIA or 14 → 30 (ONNXYOLO) on
|
||||
// non-NVIDIA, so this branch is unreachable in practice. Keep
|
||||
// an explicit vendor gate as defense-in-depth against future
|
||||
// refactors — ANSYOLOV10RTOD is a TensorRT class and must never
|
||||
// be constructed on AMD/Intel/CPU hardware.
|
||||
if (engineType == ANSCENTER::EngineType::NVIDIA_GPU) {
|
||||
(*Handle) = new ANSCENTER::ANSYOLOV10RTOD();
|
||||
modelConfig.modelType = ANSCENTER::ModelType::YOLOV10RTOD;
|
||||
} else {
|
||||
(*Handle) = new ANSCENTER::ANSONNXYOLO();
|
||||
modelConfig.modelType = ANSCENTER::ModelType::ONNXYOLO;
|
||||
}
|
||||
break;
|
||||
case 15: //OpenVino for Object Detection Yolov10
|
||||
(*Handle) = new ANSCENTER::ANSOYOLOV10OVOD();
|
||||
@@ -1193,8 +1252,19 @@ extern "C" __declspec(dllexport) int LoadModelFromFolder(ANSCENTER::ANSODBase**
|
||||
modelConfig.modelType = ANSCENTER::ModelType::ODHUBMODEL;
|
||||
break;
|
||||
case 14: //TensorRT for Object Detection Yolov10
|
||||
(*Handle) = new ANSCENTER::ANSYOLOV10RTOD();
|
||||
modelConfig.modelType = ANSCENTER::ModelType::YOLOV10RTOD;
|
||||
// Upstream modelType rewrite (see top of each factory) already
|
||||
// redirects 14 → 31 (RTYOLO) on NVIDIA or 14 → 30 (ONNXYOLO) on
|
||||
// non-NVIDIA, so this branch is unreachable in practice. Keep
|
||||
// an explicit vendor gate as defense-in-depth against future
|
||||
// refactors — ANSYOLOV10RTOD is a TensorRT class and must never
|
||||
// be constructed on AMD/Intel/CPU hardware.
|
||||
if (engineType == ANSCENTER::EngineType::NVIDIA_GPU) {
|
||||
(*Handle) = new ANSCENTER::ANSYOLOV10RTOD();
|
||||
modelConfig.modelType = ANSCENTER::ModelType::YOLOV10RTOD;
|
||||
} else {
|
||||
(*Handle) = new ANSCENTER::ANSONNXYOLO();
|
||||
modelConfig.modelType = ANSCENTER::ModelType::ONNXYOLO;
|
||||
}
|
||||
break;
|
||||
case 15: //OpenVino for Object Detection Yolov10
|
||||
(*Handle) = new ANSCENTER::ANSOYOLOV10OVOD();
|
||||
|
||||
Reference in New Issue
Block a user