Add CPU/GPU gate and support new ANSALPR using OCR

This commit is contained in:
2026-04-12 17:16:16 +10:00
parent 27083a6530
commit 0a8aaed215
30 changed files with 1870 additions and 2166 deletions

View File

@@ -3,6 +3,7 @@
#include "ANSGpuFrameOps.h"
#include "GpuNV12SlotPool.h"
#include "ANSLicense.h" // ANS_DBG macro
#include "ANSCVVendorGate.h" // anscv_vendor_gate::IsNvidiaGpuAvailable()
#include <memory>
#include <chrono>
#include <format>
@@ -136,17 +137,26 @@ namespace ANSCENTER {
// memory → VRAM grows by ~200-300MB per destroy/create cycle.
// cudaDeviceSynchronize ensures all pending GPU ops are done, then
// cudaMemPool trim releases the freed blocks back to the OS.
cudaDeviceSynchronize();
cudaMemPool_t memPool = nullptr;
int currentDev = 0;
cudaGetDevice(&currentDev);
if (cudaDeviceGetDefaultMemPool(&memPool, currentDev) == cudaSuccess && memPool) {
cudaMemPoolTrimTo(memPool, 0); // Release all unused memory
//
// AMD/Intel/CPU gate: this entire block is a no-op on non-NVIDIA
// machines because NVDEC never ran, the CUDA memory pool is empty,
// and calling cuda*() here would wake up cudart_static for nothing
// (and on AMD can destabilise amdkmdag when DirectML is active).
if (anscv_vendor_gate::IsNvidiaGpuAvailable()) {
cudaDeviceSynchronize();
cudaMemPool_t memPool = nullptr;
int currentDev = 0;
cudaGetDevice(&currentDev);
if (cudaDeviceGetDefaultMemPool(&memPool, currentDev) == cudaSuccess && memPool) {
cudaMemPoolTrimTo(memPool, 0); // Release all unused memory
}
size_t vramFree = 0, vramTotal = 0;
cudaMemGetInfo(&vramFree, &vramTotal);
ANS_DBG("RTSP_Destroy", "NVDEC closed + memPool trimmed GPU%d VRAM=%zuMB/%zuMB",
currentDev, (vramTotal - vramFree) / (1024*1024), vramFree / (1024*1024));
} else {
ANS_DBG("RTSP_Destroy", "non-NVIDIA hardware — skipped CUDA memory pool trim");
}
size_t vramFree = 0, vramTotal = 0;
cudaMemGetInfo(&vramFree, &vramTotal);
ANS_DBG("RTSP_Destroy", "NVDEC closed + memPool trimmed GPU%d VRAM=%zuMB/%zuMB",
currentDev, (vramTotal - vramFree) / (1024*1024), vramFree / (1024*1024));
}
}
static void VerifyGlobalANSRTSPLicense(const std::string& licenseKey) {
@@ -281,23 +291,32 @@ namespace ANSCENTER {
auto _rc1 = std::chrono::steady_clock::now();
// Force CUDA runtime to release cached memory from the destroyed NVDEC decoder.
cudaDeviceSynchronize();
auto _rc2 = std::chrono::steady_clock::now();
cudaMemPool_t memPool = nullptr;
int currentDev = 0;
cudaGetDevice(&currentDev);
if (cudaDeviceGetDefaultMemPool(&memPool, currentDev) == cudaSuccess && memPool) {
cudaMemPoolTrimTo(memPool, 0);
}
auto _rc3 = std::chrono::steady_clock::now();
{
size_t vf = 0, vt = 0;
cudaMemGetInfo(&vf, &vt);
// Gated on NVIDIA: on AMD/Intel/CPU there was no NVDEC decoder and no
// CUDA memory pool to trim, so calling into cudart is pure overhead
// (and combined with DirectML on AMD has been observed to destabilise
// amdkmdag). See ANSCVVendorGate.h for the rationale.
if (anscv_vendor_gate::IsNvidiaGpuAvailable()) {
cudaDeviceSynchronize();
auto _rc2 = std::chrono::steady_clock::now();
cudaMemPool_t memPool = nullptr;
int currentDev = 0;
cudaGetDevice(&currentDev);
if (cudaDeviceGetDefaultMemPool(&memPool, currentDev) == cudaSuccess && memPool) {
cudaMemPoolTrimTo(memPool, 0);
}
auto _rc3 = std::chrono::steady_clock::now();
{
size_t vf = 0, vt = 0;
cudaMemGetInfo(&vf, &vt);
double closeMs = std::chrono::duration<double, std::milli>(_rc1 - _rc0).count();
double syncMs = std::chrono::duration<double, std::milli>(_rc2 - _rc1).count();
double trimMs = std::chrono::duration<double, std::milli>(_rc3 - _rc2).count();
ANS_DBG("RTSP_Reconnect", "close=%.1fms sync=%.1fms trim=%.1fms VRAM=%zuMB/%zuMB",
closeMs, syncMs, trimMs, (vt - vf) / (1024*1024), vf / (1024*1024));
}
} else {
double closeMs = std::chrono::duration<double, std::milli>(_rc1 - _rc0).count();
double syncMs = std::chrono::duration<double, std::milli>(_rc2 - _rc1).count();
double trimMs = std::chrono::duration<double, std::milli>(_rc3 - _rc2).count();
ANS_DBG("RTSP_Reconnect", "close=%.1fms sync=%.1fms trim=%.1fms VRAM=%zuMB/%zuMB",
closeMs, syncMs, trimMs, (vt - vf) / (1024*1024), vf / (1024*1024));
ANS_DBG("RTSP_Reconnect", "close=%.1fms (non-NVIDIA — CUDA memory pool trim skipped)", closeMs);
}
RTSP_DBG("[Reconnect] AFTER close() this=%p", (void*)this);
@@ -882,6 +901,14 @@ namespace ANSCENTER {
#endif
int ANSRTSPClient::AutoConfigureHWDecoders(int maxPerGpuOverride) {
// Skip the CUDA probe entirely on non-NVIDIA hardware — the Platform
// fallback (DXGI on Windows, sysfs on Linux) handles Intel/AMD auto
// configuration, and calling cudaGetDeviceCount() on AMD wakes up
// cudart_static for no benefit. See ANSCVVendorGate.h.
if (!anscv_vendor_gate::IsNvidiaGpuAvailable()) {
return AutoConfigureHWDecoders_Platform();
}
int gpuCount = 0;
cudaError_t err = cudaGetDeviceCount(&gpuCount);
if (err != cudaSuccess || gpuCount <= 0) {