Add CPU/GPU gate and support new ANSALPR using OCR

This commit is contained in:
2026-04-12 17:16:16 +10:00
parent 27083a6530
commit 0a8aaed215
30 changed files with 1870 additions and 2166 deletions

View File

@@ -0,0 +1,59 @@
#pragma once
// ANSCVVendorGate.h — Cached NVIDIA hardware check for ANSCV.dll.
//
// ANSCV.dll links against CUDA::cudart_static + CUDA::cublasLt + CUDA::nvjpeg
// because it hosts NVDEC hardware decode, NV12 GPU frame pool, and the RTSP /
// SRT / RTMP / MJPEG / FLV players that feed NV12 frames into the downstream
// inference DLLs (ANSLPR, ANSOCR, ANSFR).
//
// Several code paths in ANSCV call into the CUDA runtime unconditionally:
// • Post-NVDEC memory pool cleanup in Destroy/Reconnect
// • cudaGetDeviceCount() probes inside AutoConfigureHWDecoders
// • nvJPEG encoder helpers
//
// On NVIDIA hardware these are fine. On AMD / Intel / pure-CPU machines:
// • cudart_static is linked, but calling it wakes up CUDA driver state
// that was never needed — wastes address space and (when combined with
// DirectML decode on AMD) has been observed to destabilise amdkmdag.
// • The post-NVDEC cleanup runs even though no NVDEC decoder was ever
// created, which is pure waste on AMD/Intel.
//
// Solution: gate every CUDA runtime call behind this cached predicate, which
// evaluates CheckHardwareInformation() exactly once per process. If the
// detected engine is not NVIDIA_GPU, all CUDA/NVDEC cleanup paths become
// no-ops — decoders fall back to DXVA/D3D11VA/CPU automatically via the
// existing AutoConfigureHWDecoders_Platform() fallback.
//
// Mirrors the ANSALPR_OD / ANSOCR / ANSFR vendor gates that were added to
// ANSALPR_OD::LoadEngine, CreateANSOCRHandleEx, and CreateANSRFHandle.
#include "ANSLicense.h"
#include <atomic>
namespace anscv_vendor_gate {
// Returns true when ANSLicenseHelper::CheckHardwareInformation() reports an
// NVIDIA GPU engine; false for AMD / Intel / CPU-only hardware.
//
// The detection runs at most once per process and its verdict is cached in a
// tri-state atomic int (unknown / NVIDIA / non-NVIDIA), so every subsequent
// call is a single acquire load. Concurrent first calls may each run the
// detection and race to publish the result; CheckHardwareInformation() is
// deterministic, so last-writer-wins is harmless. A plain atomic is used
// instead of std::call_once to keep this hot decoder-path helper free of
// extra synchronisation and exception-propagation machinery.
[[nodiscard]] inline bool IsNvidiaGpuAvailable() noexcept {
    constexpr int kUnknown   = 0;
    constexpr int kNvidia    = 1;
    constexpr int kNotNvidia = 2;
    static std::atomic<int> s_cached{kUnknown};

    const int seen = s_cached.load(std::memory_order_acquire);
    if (seen != kUnknown) {
        return seen == kNvidia;
    }

    bool nvidia = false;
    try {
        nvidia = (ANSCENTER::ANSLicenseHelper::CheckHardwareInformation() ==
                  ANSCENTER::EngineType::NVIDIA_GPU);
    } catch (...) {
        // Detection should never throw; if it somehow does, fail safe to
        // non-NVIDIA so the CUDA runtime is never woken on unknown hardware.
        nvidia = false;
    }
    s_cached.store(nvidia ? kNvidia : kNotNvidia, std::memory_order_release);
    return nvidia;
}
} // namespace anscv_vendor_gate

View File

@@ -1,6 +1,7 @@
#include "ANSFLV.h"
#include "ANSMatRegistry.h"
#include "ANSGpuFrameOps.h"
#include "ANSCVVendorGate.h" // anscv_vendor_gate::IsNvidiaGpuAvailable()
#include <memory>
#include <cstdint>
#include "media_codec.h"
@@ -551,6 +552,12 @@ namespace ANSCENTER {
#endif
int ANSFLVClient::AutoConfigureHWDecoders(int maxPerGpuOverride) {
// Skip the CUDA probe on non-NVIDIA hardware — the Platform fallback
// handles Intel/AMD auto configuration. See ANSCVVendorGate.h.
if (!anscv_vendor_gate::IsNvidiaGpuAvailable()) {
return AutoConfigureHWDecoders_Platform_FLV();
}
int gpuCount = 0;
cudaError_t err = cudaGetDeviceCount(&gpuCount);
if (err != cudaSuccess || gpuCount <= 0) {

View File

@@ -1,6 +1,7 @@
#include "ANSMJPEG.h"
#include "ANSMatRegistry.h"
#include "ANSGpuFrameOps.h"
#include "ANSCVVendorGate.h" // anscv_vendor_gate::IsNvidiaGpuAvailable()
#include <memory>
#include <cstdint>
#include "media_codec.h"
@@ -549,6 +550,12 @@ namespace ANSCENTER {
#endif
int ANSMJPEGClient::AutoConfigureHWDecoders(int maxPerGpuOverride) {
// Skip the CUDA probe on non-NVIDIA hardware — the Platform fallback
// handles Intel/AMD auto configuration. See ANSCVVendorGate.h.
if (!anscv_vendor_gate::IsNvidiaGpuAvailable()) {
return AutoConfigureHWDecoders_Platform_MJPEG();
}
int gpuCount = 0;
cudaError_t err = cudaGetDeviceCount(&gpuCount);
if (err != cudaSuccess || gpuCount <= 0) {

View File

@@ -1,6 +1,7 @@
#include "ANSRTMP.h"
#include "ANSMatRegistry.h"
#include "ANSGpuFrameOps.h"
#include "ANSCVVendorGate.h" // anscv_vendor_gate::IsNvidiaGpuAvailable()
#include <memory>
#include "media_codec.h"
#include <cstdint>
@@ -563,6 +564,12 @@ namespace ANSCENTER {
#endif
int ANSRTMPClient::AutoConfigureHWDecoders(int maxPerGpuOverride) {
// Skip the CUDA probe on non-NVIDIA hardware — the Platform fallback
// handles Intel/AMD auto configuration. See ANSCVVendorGate.h.
if (!anscv_vendor_gate::IsNvidiaGpuAvailable()) {
return AutoConfigureHWDecoders_Platform_RTMP();
}
int gpuCount = 0;
cudaError_t err = cudaGetDeviceCount(&gpuCount);
if (err != cudaSuccess || gpuCount <= 0) {

View File

@@ -3,6 +3,7 @@
#include "ANSGpuFrameOps.h"
#include "GpuNV12SlotPool.h"
#include "ANSLicense.h" // ANS_DBG macro
#include "ANSCVVendorGate.h" // anscv_vendor_gate::IsNvidiaGpuAvailable()
#include <memory>
#include <chrono>
#include <format>
@@ -136,17 +137,26 @@ namespace ANSCENTER {
// memory → VRAM grows by ~200-300MB per destroy/create cycle.
// cudaDeviceSynchronize ensures all pending GPU ops are done, then
// cudaMemPool trim releases the freed blocks back to the OS.
cudaDeviceSynchronize();
cudaMemPool_t memPool = nullptr;
int currentDev = 0;
cudaGetDevice(&currentDev);
if (cudaDeviceGetDefaultMemPool(&memPool, currentDev) == cudaSuccess && memPool) {
cudaMemPoolTrimTo(memPool, 0); // Release all unused memory
//
// AMD/Intel/CPU gate: this entire block is a no-op on non-NVIDIA
// machines because NVDEC never ran, the CUDA memory pool is empty,
// and calling cuda*() here would wake up cudart_static for nothing
// (and on AMD can destabilise amdkmdag when DirectML is active).
if (anscv_vendor_gate::IsNvidiaGpuAvailable()) {
cudaDeviceSynchronize();
cudaMemPool_t memPool = nullptr;
int currentDev = 0;
cudaGetDevice(&currentDev);
if (cudaDeviceGetDefaultMemPool(&memPool, currentDev) == cudaSuccess && memPool) {
cudaMemPoolTrimTo(memPool, 0); // Release all unused memory
}
size_t vramFree = 0, vramTotal = 0;
cudaMemGetInfo(&vramFree, &vramTotal);
ANS_DBG("RTSP_Destroy", "NVDEC closed + memPool trimmed GPU%d VRAM=%zuMB/%zuMB",
currentDev, (vramTotal - vramFree) / (1024*1024), vramFree / (1024*1024));
} else {
ANS_DBG("RTSP_Destroy", "non-NVIDIA hardware — skipped CUDA memory pool trim");
}
size_t vramFree = 0, vramTotal = 0;
cudaMemGetInfo(&vramFree, &vramTotal);
ANS_DBG("RTSP_Destroy", "NVDEC closed + memPool trimmed GPU%d VRAM=%zuMB/%zuMB",
currentDev, (vramTotal - vramFree) / (1024*1024), vramFree / (1024*1024));
}
}
static void VerifyGlobalANSRTSPLicense(const std::string& licenseKey) {
@@ -281,23 +291,32 @@ namespace ANSCENTER {
auto _rc1 = std::chrono::steady_clock::now();
// Force CUDA runtime to release cached memory from the destroyed NVDEC decoder.
cudaDeviceSynchronize();
auto _rc2 = std::chrono::steady_clock::now();
cudaMemPool_t memPool = nullptr;
int currentDev = 0;
cudaGetDevice(&currentDev);
if (cudaDeviceGetDefaultMemPool(&memPool, currentDev) == cudaSuccess && memPool) {
cudaMemPoolTrimTo(memPool, 0);
}
auto _rc3 = std::chrono::steady_clock::now();
{
size_t vf = 0, vt = 0;
cudaMemGetInfo(&vf, &vt);
// Gated on NVIDIA: on AMD/Intel/CPU there was no NVDEC decoder and no
// CUDA memory pool to trim, so calling into cudart is pure overhead
// (and combined with DirectML on AMD has been observed to destabilise
// amdkmdag). See ANSCVVendorGate.h for the rationale.
if (anscv_vendor_gate::IsNvidiaGpuAvailable()) {
cudaDeviceSynchronize();
auto _rc2 = std::chrono::steady_clock::now();
cudaMemPool_t memPool = nullptr;
int currentDev = 0;
cudaGetDevice(&currentDev);
if (cudaDeviceGetDefaultMemPool(&memPool, currentDev) == cudaSuccess && memPool) {
cudaMemPoolTrimTo(memPool, 0);
}
auto _rc3 = std::chrono::steady_clock::now();
{
size_t vf = 0, vt = 0;
cudaMemGetInfo(&vf, &vt);
double closeMs = std::chrono::duration<double, std::milli>(_rc1 - _rc0).count();
double syncMs = std::chrono::duration<double, std::milli>(_rc2 - _rc1).count();
double trimMs = std::chrono::duration<double, std::milli>(_rc3 - _rc2).count();
ANS_DBG("RTSP_Reconnect", "close=%.1fms sync=%.1fms trim=%.1fms VRAM=%zuMB/%zuMB",
closeMs, syncMs, trimMs, (vt - vf) / (1024*1024), vf / (1024*1024));
}
} else {
double closeMs = std::chrono::duration<double, std::milli>(_rc1 - _rc0).count();
double syncMs = std::chrono::duration<double, std::milli>(_rc2 - _rc1).count();
double trimMs = std::chrono::duration<double, std::milli>(_rc3 - _rc2).count();
ANS_DBG("RTSP_Reconnect", "close=%.1fms sync=%.1fms trim=%.1fms VRAM=%zuMB/%zuMB",
closeMs, syncMs, trimMs, (vt - vf) / (1024*1024), vf / (1024*1024));
ANS_DBG("RTSP_Reconnect", "close=%.1fms (non-NVIDIA — CUDA memory pool trim skipped)", closeMs);
}
RTSP_DBG("[Reconnect] AFTER close() this=%p", (void*)this);
@@ -882,6 +901,14 @@ namespace ANSCENTER {
#endif
int ANSRTSPClient::AutoConfigureHWDecoders(int maxPerGpuOverride) {
// Skip the CUDA probe entirely on non-NVIDIA hardware — the Platform
// fallback (DXGI on Windows, sysfs on Linux) handles Intel/AMD auto
// configuration, and calling cudaGetDeviceCount() on AMD wakes up
// cudart_static for no benefit. See ANSCVVendorGate.h.
if (!anscv_vendor_gate::IsNvidiaGpuAvailable()) {
return AutoConfigureHWDecoders_Platform();
}
int gpuCount = 0;
cudaError_t err = cudaGetDeviceCount(&gpuCount);
if (err != cudaSuccess || gpuCount <= 0) {

View File

@@ -1,6 +1,7 @@
#include "ANSSRT.h"
#include "ANSMatRegistry.h"
#include "ANSGpuFrameOps.h"
#include "ANSCVVendorGate.h" // anscv_vendor_gate::IsNvidiaGpuAvailable()
#include <memory>
#include "media_codec.h"
#include <cstdint>
@@ -577,6 +578,13 @@ namespace ANSCENTER {
#endif
int ANSSRTClient::AutoConfigureHWDecoders(int maxPerGpuOverride) {
// Skip the CUDA probe on non-NVIDIA hardware — the Platform fallback
// (DXGI/sysfs) handles Intel/AMD auto configuration. See
// ANSCVVendorGate.h for rationale.
if (!anscv_vendor_gate::IsNvidiaGpuAvailable()) {
return AutoConfigureHWDecoders_Platform_SRT();
}
int gpuCount = 0;
cudaError_t err = cudaGetDeviceCount(&gpuCount);
if (err != cudaSuccess || gpuCount <= 0) {

View File

@@ -1,6 +1,7 @@
#include "ANSWEBCAM.h"
#include "ANSMatRegistry.h"
#include "ANSGpuFrameRegistry.h"
#include "ANSCVVendorGate.h" // anscv_vendor_gate::IsNvidiaGpuAvailable()
#include <cstdint>
#include <memory>
extern "C" {
@@ -914,6 +915,15 @@ namespace ANSCENTER {
return result;
}
void ANSWEBCAMPlayer::uploadPlanarBGRToGPU(const cv::Mat& inputMat, unsigned char** data) {
// Refuse on non-NVIDIA — cudaMalloc/cudaMemcpy are NVIDIA-only.
// The public entry point encodeMatToJpegWithNvJPEG() also guards,
// but defense-in-depth in case a future caller wires this up directly.
if (!anscv_vendor_gate::IsNvidiaGpuAvailable()) {
this->_logger.LogWarn("ANSWEBCAMPlayer::uploadPlanarBGRToGPU",
"skipped — non-NVIDIA hardware, nvJPEG path unavailable", __FILE__, __LINE__);
if (data) *data = nullptr;
return;
}
std::lock_guard<std::recursive_mutex> lock(_mutex);
try {
int width = inputMat.cols;
@@ -933,6 +943,13 @@ namespace ANSCENTER {
}
std::string ANSWEBCAMPlayer::encodeMatToJpegWithNvJPEG(const cv::Mat& inputMat, int quality)
{
// nvJPEG encoder is NVIDIA-only (part of CUDA toolkit). Refuse on
// AMD/Intel/CPU and let the caller fall back to the turbojpeg path.
if (!anscv_vendor_gate::IsNvidiaGpuAvailable()) {
this->_logger.LogWarn("ANSWEBCAMPlayer::encodeMatToJpegWithNvJPEG",
"nvJPEG requires NVIDIA GPU; falling back to last cached JPEG", __FILE__, __LINE__);
return _lastJpegImage;
}
std::lock_guard<std::recursive_mutex> lock(_mutex);
try {
// Image dimensions

View File

@@ -1,5 +1,7 @@
// dllmain.cpp : Defines the entry point for the DLL application.
#include "pch.h"
#include "ANSCVVendorGate.h" // anscv_vendor_gate::IsNvidiaGpuAvailable()
#include "ANSLicense.h" // ANSCENTER::EngineType, CheckHardwareInformation
#include <mutex>
#include <unordered_map>
#include <iostream>
@@ -61,6 +63,15 @@ BOOL APIENTRY DllMain( HMODULE hModule,
// Pinning keeps the code pages mapped; the OS kills all threads when
// the process exits, so this is safe and is Microsoft's recommended
// pattern for DLLs that own threads.
//
// CRITICAL: do NOT call CheckHardwareInformation() or
// anscv_vendor_gate::IsNvidiaGpuAvailable() here. DllMain holds the
// OS loader lock (LdrpLoaderLock). CheckHardwareInformation touches
// hwinfo → DXGI / WMI / COM which internally call LoadLibrary; doing
// that while holding the loader lock causes a classic loader-lock
// deadlock (confirmed by stress-test hang). The vendor gate will
// lazy-initialise on the first real call from worker code, which
// runs with the loader lock released.
{
HMODULE hSelf = nullptr;
GetModuleHandleExW(