Add CPU/GPU gate and support new ANSALPR using OCR
This commit is contained in:
59
modules/ANSCV/ANSCVVendorGate.h
Normal file
59
modules/ANSCV/ANSCVVendorGate.h
Normal file
@@ -0,0 +1,59 @@
|
||||
#pragma once
|
||||
// ANSCVVendorGate.h — Cached NVIDIA hardware check for ANSCV.dll.
|
||||
//
|
||||
// ANSCV.dll links against CUDA::cudart_static + CUDA::cublasLt + CUDA::nvjpeg
|
||||
// because it hosts NVDEC hardware decode, NV12 GPU frame pool, and the RTSP /
|
||||
// SRT / RTMP / MJPEG / FLV players that feed NV12 frames into the downstream
|
||||
// inference DLLs (ANSLPR, ANSOCR, ANSFR).
|
||||
//
|
||||
// Several code paths in ANSCV call into the CUDA runtime unconditionally:
|
||||
// • Post-NVDEC memory pool cleanup in Destroy/Reconnect
|
||||
// • cudaGetDeviceCount() probes inside AutoConfigureHWDecoders
|
||||
// • nvJPEG encoder helpers
|
||||
//
|
||||
// On NVIDIA hardware these are fine. On AMD / Intel / pure-CPU machines:
|
||||
// • cudart_static is linked, but calling it wakes up CUDA driver state
|
||||
// that was never needed — wastes address space and (when combined with
|
||||
// DirectML decode on AMD) has been observed to destabilise amdkmdag.
|
||||
// • The post-NVDEC cleanup runs even though no NVDEC decoder was ever
|
||||
// created, which is pure waste on AMD/Intel.
|
||||
//
|
||||
// Solution: gate every CUDA runtime call behind this cached predicate, which
|
||||
// evaluates CheckHardwareInformation() exactly once per process. If the
|
||||
// detected engine is not NVIDIA_GPU, all CUDA/NVDEC cleanup paths become
|
||||
// no-ops — decoders fall back to DXVA/D3D11VA/CPU automatically via the
|
||||
// existing AutoConfigureHWDecoders_Platform() fallback.
|
||||
//
|
||||
// Mirrors the ANSLPR_OD / ANSOCR / ANSFR vendor gates that were added to
|
||||
// ANSALPR_OD::LoadEngine, CreateANSOCRHandleEx, and CreateANSRFHandle.
|
||||
|
||||
#include "ANSLicense.h"
|
||||
#include <atomic>
|
||||
|
||||
namespace anscv_vendor_gate {
|
||||
|
||||
// Lazily evaluates ANSLicenseHelper::CheckHardwareInformation() once and
|
||||
// caches the result. Thread-safe: the first call on any thread performs
|
||||
// the detection, all subsequent calls return the cached bool. Using an
|
||||
// atomic bool + init-flag avoids pulling in std::call_once and its
|
||||
// exception-safety overhead (the helper is on the hot decoder path).
|
||||
[[nodiscard]] inline bool IsNvidiaGpuAvailable() noexcept {
|
||||
static std::atomic<int> s_state{0}; // 0 = unknown, 1 = NVIDIA, 2 = non-NVIDIA
|
||||
int cached = s_state.load(std::memory_order_acquire);
|
||||
if (cached != 0) return cached == 1;
|
||||
try {
|
||||
const ANSCENTER::EngineType detected =
|
||||
ANSCENTER::ANSLicenseHelper::CheckHardwareInformation();
|
||||
const bool isNvidia = (detected == ANSCENTER::EngineType::NVIDIA_GPU);
|
||||
// Last-writer-wins is fine — CheckHardwareInformation is deterministic.
|
||||
s_state.store(isNvidia ? 1 : 2, std::memory_order_release);
|
||||
return isNvidia;
|
||||
} catch (...) {
|
||||
// If detection throws (should not happen), fail safe to non-NVIDIA so
|
||||
// we never activate CUDA runtime on unknown hardware.
|
||||
s_state.store(2, std::memory_order_release);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace anscv_vendor_gate
|
||||
@@ -1,6 +1,7 @@
|
||||
#include "ANSFLV.h"
|
||||
#include "ANSMatRegistry.h"
|
||||
#include "ANSGpuFrameOps.h"
|
||||
#include "ANSCVVendorGate.h" // anscv_vendor_gate::IsNvidiaGpuAvailable()
|
||||
#include <memory>
|
||||
#include <cstdint>
|
||||
#include "media_codec.h"
|
||||
@@ -551,6 +552,12 @@ namespace ANSCENTER {
|
||||
#endif
|
||||
|
||||
int ANSFLVClient::AutoConfigureHWDecoders(int maxPerGpuOverride) {
|
||||
// Skip the CUDA probe on non-NVIDIA hardware — the Platform fallback
|
||||
// handles Intel/AMD auto configuration. See ANSCVVendorGate.h.
|
||||
if (!anscv_vendor_gate::IsNvidiaGpuAvailable()) {
|
||||
return AutoConfigureHWDecoders_Platform_FLV();
|
||||
}
|
||||
|
||||
int gpuCount = 0;
|
||||
cudaError_t err = cudaGetDeviceCount(&gpuCount);
|
||||
if (err != cudaSuccess || gpuCount <= 0) {
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
#include "ANSMJPEG.h"
|
||||
#include "ANSMatRegistry.h"
|
||||
#include "ANSGpuFrameOps.h"
|
||||
#include "ANSCVVendorGate.h" // anscv_vendor_gate::IsNvidiaGpuAvailable()
|
||||
#include <memory>
|
||||
#include <cstdint>
|
||||
#include "media_codec.h"
|
||||
@@ -549,6 +550,12 @@ namespace ANSCENTER {
|
||||
#endif
|
||||
|
||||
int ANSMJPEGClient::AutoConfigureHWDecoders(int maxPerGpuOverride) {
|
||||
// Skip the CUDA probe on non-NVIDIA hardware — the Platform fallback
|
||||
// handles Intel/AMD auto configuration. See ANSCVVendorGate.h.
|
||||
if (!anscv_vendor_gate::IsNvidiaGpuAvailable()) {
|
||||
return AutoConfigureHWDecoders_Platform_MJPEG();
|
||||
}
|
||||
|
||||
int gpuCount = 0;
|
||||
cudaError_t err = cudaGetDeviceCount(&gpuCount);
|
||||
if (err != cudaSuccess || gpuCount <= 0) {
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
#include "ANSRTMP.h"
|
||||
#include "ANSMatRegistry.h"
|
||||
#include "ANSGpuFrameOps.h"
|
||||
#include "ANSCVVendorGate.h" // anscv_vendor_gate::IsNvidiaGpuAvailable()
|
||||
#include <memory>
|
||||
#include "media_codec.h"
|
||||
#include <cstdint>
|
||||
@@ -563,6 +564,12 @@ namespace ANSCENTER {
|
||||
#endif
|
||||
|
||||
int ANSRTMPClient::AutoConfigureHWDecoders(int maxPerGpuOverride) {
|
||||
// Skip the CUDA probe on non-NVIDIA hardware — the Platform fallback
|
||||
// handles Intel/AMD auto configuration. See ANSCVVendorGate.h.
|
||||
if (!anscv_vendor_gate::IsNvidiaGpuAvailable()) {
|
||||
return AutoConfigureHWDecoders_Platform_RTMP();
|
||||
}
|
||||
|
||||
int gpuCount = 0;
|
||||
cudaError_t err = cudaGetDeviceCount(&gpuCount);
|
||||
if (err != cudaSuccess || gpuCount <= 0) {
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
#include "ANSGpuFrameOps.h"
|
||||
#include "GpuNV12SlotPool.h"
|
||||
#include "ANSLicense.h" // ANS_DBG macro
|
||||
#include "ANSCVVendorGate.h" // anscv_vendor_gate::IsNvidiaGpuAvailable()
|
||||
#include <memory>
|
||||
#include <chrono>
|
||||
#include <format>
|
||||
@@ -136,17 +137,26 @@ namespace ANSCENTER {
|
||||
// memory → VRAM grows by ~200-300MB per destroy/create cycle.
|
||||
// cudaDeviceSynchronize ensures all pending GPU ops are done, then
|
||||
// cudaMemPool trim releases the freed blocks back to the OS.
|
||||
cudaDeviceSynchronize();
|
||||
cudaMemPool_t memPool = nullptr;
|
||||
int currentDev = 0;
|
||||
cudaGetDevice(¤tDev);
|
||||
if (cudaDeviceGetDefaultMemPool(&memPool, currentDev) == cudaSuccess && memPool) {
|
||||
cudaMemPoolTrimTo(memPool, 0); // Release all unused memory
|
||||
//
|
||||
// AMD/Intel/CPU gate: this entire block is a no-op on non-NVIDIA
|
||||
// machines because NVDEC never ran, the CUDA memory pool is empty,
|
||||
// and calling cuda*() here would wake up cudart_static for nothing
|
||||
// (and on AMD can destabilise amdkmdag when DirectML is active).
|
||||
if (anscv_vendor_gate::IsNvidiaGpuAvailable()) {
|
||||
cudaDeviceSynchronize();
|
||||
cudaMemPool_t memPool = nullptr;
|
||||
int currentDev = 0;
|
||||
cudaGetDevice(¤tDev);
|
||||
if (cudaDeviceGetDefaultMemPool(&memPool, currentDev) == cudaSuccess && memPool) {
|
||||
cudaMemPoolTrimTo(memPool, 0); // Release all unused memory
|
||||
}
|
||||
size_t vramFree = 0, vramTotal = 0;
|
||||
cudaMemGetInfo(&vramFree, &vramTotal);
|
||||
ANS_DBG("RTSP_Destroy", "NVDEC closed + memPool trimmed GPU%d VRAM=%zuMB/%zuMB",
|
||||
currentDev, (vramTotal - vramFree) / (1024*1024), vramFree / (1024*1024));
|
||||
} else {
|
||||
ANS_DBG("RTSP_Destroy", "non-NVIDIA hardware — skipped CUDA memory pool trim");
|
||||
}
|
||||
size_t vramFree = 0, vramTotal = 0;
|
||||
cudaMemGetInfo(&vramFree, &vramTotal);
|
||||
ANS_DBG("RTSP_Destroy", "NVDEC closed + memPool trimmed GPU%d VRAM=%zuMB/%zuMB",
|
||||
currentDev, (vramTotal - vramFree) / (1024*1024), vramFree / (1024*1024));
|
||||
}
|
||||
}
|
||||
static void VerifyGlobalANSRTSPLicense(const std::string& licenseKey) {
|
||||
@@ -281,23 +291,32 @@ namespace ANSCENTER {
|
||||
auto _rc1 = std::chrono::steady_clock::now();
|
||||
|
||||
// Force CUDA runtime to release cached memory from the destroyed NVDEC decoder.
|
||||
cudaDeviceSynchronize();
|
||||
auto _rc2 = std::chrono::steady_clock::now();
|
||||
cudaMemPool_t memPool = nullptr;
|
||||
int currentDev = 0;
|
||||
cudaGetDevice(¤tDev);
|
||||
if (cudaDeviceGetDefaultMemPool(&memPool, currentDev) == cudaSuccess && memPool) {
|
||||
cudaMemPoolTrimTo(memPool, 0);
|
||||
}
|
||||
auto _rc3 = std::chrono::steady_clock::now();
|
||||
{
|
||||
size_t vf = 0, vt = 0;
|
||||
cudaMemGetInfo(&vf, &vt);
|
||||
// Gated on NVIDIA: on AMD/Intel/CPU there was no NVDEC decoder and no
|
||||
// CUDA memory pool to trim, so calling into cudart is pure overhead
|
||||
// (and combined with DirectML on AMD has been observed to destabilise
|
||||
// amdkmdag). See ANSCVVendorGate.h for the rationale.
|
||||
if (anscv_vendor_gate::IsNvidiaGpuAvailable()) {
|
||||
cudaDeviceSynchronize();
|
||||
auto _rc2 = std::chrono::steady_clock::now();
|
||||
cudaMemPool_t memPool = nullptr;
|
||||
int currentDev = 0;
|
||||
cudaGetDevice(¤tDev);
|
||||
if (cudaDeviceGetDefaultMemPool(&memPool, currentDev) == cudaSuccess && memPool) {
|
||||
cudaMemPoolTrimTo(memPool, 0);
|
||||
}
|
||||
auto _rc3 = std::chrono::steady_clock::now();
|
||||
{
|
||||
size_t vf = 0, vt = 0;
|
||||
cudaMemGetInfo(&vf, &vt);
|
||||
double closeMs = std::chrono::duration<double, std::milli>(_rc1 - _rc0).count();
|
||||
double syncMs = std::chrono::duration<double, std::milli>(_rc2 - _rc1).count();
|
||||
double trimMs = std::chrono::duration<double, std::milli>(_rc3 - _rc2).count();
|
||||
ANS_DBG("RTSP_Reconnect", "close=%.1fms sync=%.1fms trim=%.1fms VRAM=%zuMB/%zuMB",
|
||||
closeMs, syncMs, trimMs, (vt - vf) / (1024*1024), vf / (1024*1024));
|
||||
}
|
||||
} else {
|
||||
double closeMs = std::chrono::duration<double, std::milli>(_rc1 - _rc0).count();
|
||||
double syncMs = std::chrono::duration<double, std::milli>(_rc2 - _rc1).count();
|
||||
double trimMs = std::chrono::duration<double, std::milli>(_rc3 - _rc2).count();
|
||||
ANS_DBG("RTSP_Reconnect", "close=%.1fms sync=%.1fms trim=%.1fms VRAM=%zuMB/%zuMB",
|
||||
closeMs, syncMs, trimMs, (vt - vf) / (1024*1024), vf / (1024*1024));
|
||||
ANS_DBG("RTSP_Reconnect", "close=%.1fms (non-NVIDIA — CUDA memory pool trim skipped)", closeMs);
|
||||
}
|
||||
RTSP_DBG("[Reconnect] AFTER close() this=%p", (void*)this);
|
||||
|
||||
@@ -882,6 +901,14 @@ namespace ANSCENTER {
|
||||
#endif
|
||||
|
||||
int ANSRTSPClient::AutoConfigureHWDecoders(int maxPerGpuOverride) {
|
||||
// Skip the CUDA probe entirely on non-NVIDIA hardware — the Platform
|
||||
// fallback (DXGI on Windows, sysfs on Linux) handles Intel/AMD auto
|
||||
// configuration, and calling cudaGetDeviceCount() on AMD wakes up
|
||||
// cudart_static for no benefit. See ANSCVVendorGate.h.
|
||||
if (!anscv_vendor_gate::IsNvidiaGpuAvailable()) {
|
||||
return AutoConfigureHWDecoders_Platform();
|
||||
}
|
||||
|
||||
int gpuCount = 0;
|
||||
cudaError_t err = cudaGetDeviceCount(&gpuCount);
|
||||
if (err != cudaSuccess || gpuCount <= 0) {
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
#include "ANSSRT.h"
|
||||
#include "ANSMatRegistry.h"
|
||||
#include "ANSGpuFrameOps.h"
|
||||
#include "ANSCVVendorGate.h" // anscv_vendor_gate::IsNvidiaGpuAvailable()
|
||||
#include <memory>
|
||||
#include "media_codec.h"
|
||||
#include <cstdint>
|
||||
@@ -577,6 +578,13 @@ namespace ANSCENTER {
|
||||
#endif
|
||||
|
||||
int ANSSRTClient::AutoConfigureHWDecoders(int maxPerGpuOverride) {
|
||||
// Skip the CUDA probe on non-NVIDIA hardware — the Platform fallback
|
||||
// (DXGI/sysfs) handles Intel/AMD auto configuration. See
|
||||
// ANSCVVendorGate.h for rationale.
|
||||
if (!anscv_vendor_gate::IsNvidiaGpuAvailable()) {
|
||||
return AutoConfigureHWDecoders_Platform_SRT();
|
||||
}
|
||||
|
||||
int gpuCount = 0;
|
||||
cudaError_t err = cudaGetDeviceCount(&gpuCount);
|
||||
if (err != cudaSuccess || gpuCount <= 0) {
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
#include "ANSWEBCAM.h"
|
||||
#include "ANSMatRegistry.h"
|
||||
#include "ANSGpuFrameRegistry.h"
|
||||
#include "ANSCVVendorGate.h" // anscv_vendor_gate::IsNvidiaGpuAvailable()
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
extern "C" {
|
||||
@@ -914,6 +915,15 @@ namespace ANSCENTER {
|
||||
return result;
|
||||
}
|
||||
void ANSWEBCAMPlayer::uploadPlanarBGRToGPU(const cv::Mat& inputMat, unsigned char** data) {
|
||||
// Refuse on non-NVIDIA — cudaMalloc/cudaMemcpy are NVIDIA-only.
|
||||
// The public entry point encodeMatToJpegWithNvJPEG() also guards,
|
||||
// but defense-in-depth in case a future caller wires this up directly.
|
||||
if (!anscv_vendor_gate::IsNvidiaGpuAvailable()) {
|
||||
this->_logger.LogWarn("ANSWEBCAMPlayer::uploadPlanarBGRToGPU",
|
||||
"skipped — non-NVIDIA hardware, nvJPEG path unavailable", __FILE__, __LINE__);
|
||||
if (data) *data = nullptr;
|
||||
return;
|
||||
}
|
||||
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
||||
try {
|
||||
int width = inputMat.cols;
|
||||
@@ -933,6 +943,13 @@ namespace ANSCENTER {
|
||||
}
|
||||
std::string ANSWEBCAMPlayer::encodeMatToJpegWithNvJPEG(const cv::Mat& inputMat, int quality)
|
||||
{
|
||||
// nvJPEG encoder is NVIDIA-only (part of CUDA toolkit). Refuse on
|
||||
// AMD/Intel/CPU and let the caller fall back to the turbojpeg path.
|
||||
if (!anscv_vendor_gate::IsNvidiaGpuAvailable()) {
|
||||
this->_logger.LogWarn("ANSWEBCAMPlayer::encodeMatToJpegWithNvJPEG",
|
||||
"nvJPEG requires NVIDIA GPU; falling back to last cached JPEG", __FILE__, __LINE__);
|
||||
return _lastJpegImage;
|
||||
}
|
||||
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
||||
try {
|
||||
// Image dimensions
|
||||
|
||||
@@ -1,5 +1,7 @@
|
||||
// dllmain.cpp : Defines the entry point for the DLL application.
|
||||
#include "pch.h"
|
||||
#include "ANSCVVendorGate.h" // anscv_vendor_gate::IsNvidiaGpuAvailable()
|
||||
#include "ANSLicense.h" // ANSCENTER::EngineType, CheckHardwareInformation
|
||||
#include <mutex>
|
||||
#include <unordered_map>
|
||||
#include <iostream>
|
||||
@@ -61,6 +63,15 @@ BOOL APIENTRY DllMain( HMODULE hModule,
|
||||
// Pinning keeps the code pages mapped; the OS kills all threads when
|
||||
// the process exits, so this is safe and is Microsoft's recommended
|
||||
// pattern for DLLs that own threads.
|
||||
//
|
||||
// CRITICAL: do NOT call CheckHardwareInformation() or
|
||||
// anscv_vendor_gate::IsNvidiaGpuAvailable() here. DllMain holds the
|
||||
// OS loader lock (LdrpLoaderLock). CheckHardwareInformation touches
|
||||
// hwinfo → DXGI / WMI / COM which internally call LoadLibrary; doing
|
||||
// that while holding the loader lock causes a classic loader-lock
|
||||
// deadlock (confirmed by stress-test hang). The vendor gate will
|
||||
// lazy-initialise on the first real call from worker code, which
|
||||
// runs with the loader lock released.
|
||||
{
|
||||
HMODULE hSelf = nullptr;
|
||||
GetModuleHandleExW(
|
||||
|
||||
Reference in New Issue
Block a user