Add CPU/GPU gate and support new ANSALPR using OCR
This commit is contained in:
@@ -3,6 +3,7 @@
|
||||
#include "ANSGpuFrameOps.h"
|
||||
#include "GpuNV12SlotPool.h"
|
||||
#include "ANSLicense.h" // ANS_DBG macro
|
||||
#include "ANSCVVendorGate.h" // anscv_vendor_gate::IsNvidiaGpuAvailable()
|
||||
#include <memory>
|
||||
#include <chrono>
|
||||
#include <format>
|
||||
@@ -136,17 +137,26 @@ namespace ANSCENTER {
|
||||
// memory → VRAM grows by ~200-300MB per destroy/create cycle.
|
||||
// cudaDeviceSynchronize ensures all pending GPU ops are done, then
|
||||
// cudaMemPool trim releases the freed blocks back to the OS.
|
||||
cudaDeviceSynchronize();
|
||||
cudaMemPool_t memPool = nullptr;
|
||||
int currentDev = 0;
|
||||
cudaGetDevice(&currentDev);
|
||||
if (cudaDeviceGetDefaultMemPool(&memPool, currentDev) == cudaSuccess && memPool) {
|
||||
cudaMemPoolTrimTo(memPool, 0); // Release all unused memory
|
||||
//
|
||||
// AMD/Intel/CPU gate: this entire block is a no-op on non-NVIDIA
|
||||
// machines because NVDEC never ran, the CUDA memory pool is empty,
|
||||
// and calling cuda*() here would wake up cudart_static for nothing
|
||||
// (and on AMD can destabilise amdkmdag when DirectML is active).
|
||||
if (anscv_vendor_gate::IsNvidiaGpuAvailable()) {
|
||||
cudaDeviceSynchronize();
|
||||
cudaMemPool_t memPool = nullptr;
|
||||
int currentDev = 0;
|
||||
cudaGetDevice(&currentDev);
|
||||
if (cudaDeviceGetDefaultMemPool(&memPool, currentDev) == cudaSuccess && memPool) {
|
||||
cudaMemPoolTrimTo(memPool, 0); // Release all unused memory
|
||||
}
|
||||
size_t vramFree = 0, vramTotal = 0;
|
||||
cudaMemGetInfo(&vramFree, &vramTotal);
|
||||
ANS_DBG("RTSP_Destroy", "NVDEC closed + memPool trimmed GPU%d VRAM=%zuMB/%zuMB",
|
||||
currentDev, (vramTotal - vramFree) / (1024*1024), vramFree / (1024*1024));
|
||||
} else {
|
||||
ANS_DBG("RTSP_Destroy", "non-NVIDIA hardware — skipped CUDA memory pool trim");
|
||||
}
|
||||
size_t vramFree = 0, vramTotal = 0;
|
||||
cudaMemGetInfo(&vramFree, &vramTotal);
|
||||
ANS_DBG("RTSP_Destroy", "NVDEC closed + memPool trimmed GPU%d VRAM=%zuMB/%zuMB",
|
||||
currentDev, (vramTotal - vramFree) / (1024*1024), vramFree / (1024*1024));
|
||||
}
|
||||
}
|
||||
static void VerifyGlobalANSRTSPLicense(const std::string& licenseKey) {
|
||||
@@ -281,23 +291,32 @@ namespace ANSCENTER {
|
||||
auto _rc1 = std::chrono::steady_clock::now();
|
||||
|
||||
// Force CUDA runtime to release cached memory from the destroyed NVDEC decoder.
|
||||
cudaDeviceSynchronize();
|
||||
auto _rc2 = std::chrono::steady_clock::now();
|
||||
cudaMemPool_t memPool = nullptr;
|
||||
int currentDev = 0;
|
||||
cudaGetDevice(&currentDev);
|
||||
if (cudaDeviceGetDefaultMemPool(&memPool, currentDev) == cudaSuccess && memPool) {
|
||||
cudaMemPoolTrimTo(memPool, 0);
|
||||
}
|
||||
auto _rc3 = std::chrono::steady_clock::now();
|
||||
{
|
||||
size_t vf = 0, vt = 0;
|
||||
cudaMemGetInfo(&vf, &vt);
|
||||
// Gated on NVIDIA: on AMD/Intel/CPU there was no NVDEC decoder and no
|
||||
// CUDA memory pool to trim, so calling into cudart is pure overhead
|
||||
// (and combined with DirectML on AMD has been observed to destabilise
|
||||
// amdkmdag). See ANSCVVendorGate.h for the rationale.
|
||||
if (anscv_vendor_gate::IsNvidiaGpuAvailable()) {
|
||||
cudaDeviceSynchronize();
|
||||
auto _rc2 = std::chrono::steady_clock::now();
|
||||
cudaMemPool_t memPool = nullptr;
|
||||
int currentDev = 0;
|
||||
cudaGetDevice(&currentDev);
|
||||
if (cudaDeviceGetDefaultMemPool(&memPool, currentDev) == cudaSuccess && memPool) {
|
||||
cudaMemPoolTrimTo(memPool, 0);
|
||||
}
|
||||
auto _rc3 = std::chrono::steady_clock::now();
|
||||
{
|
||||
size_t vf = 0, vt = 0;
|
||||
cudaMemGetInfo(&vf, &vt);
|
||||
double closeMs = std::chrono::duration<double, std::milli>(_rc1 - _rc0).count();
|
||||
double syncMs = std::chrono::duration<double, std::milli>(_rc2 - _rc1).count();
|
||||
double trimMs = std::chrono::duration<double, std::milli>(_rc3 - _rc2).count();
|
||||
ANS_DBG("RTSP_Reconnect", "close=%.1fms sync=%.1fms trim=%.1fms VRAM=%zuMB/%zuMB",
|
||||
closeMs, syncMs, trimMs, (vt - vf) / (1024*1024), vf / (1024*1024));
|
||||
}
|
||||
} else {
|
||||
double closeMs = std::chrono::duration<double, std::milli>(_rc1 - _rc0).count();
|
||||
double syncMs = std::chrono::duration<double, std::milli>(_rc2 - _rc1).count();
|
||||
double trimMs = std::chrono::duration<double, std::milli>(_rc3 - _rc2).count();
|
||||
ANS_DBG("RTSP_Reconnect", "close=%.1fms sync=%.1fms trim=%.1fms VRAM=%zuMB/%zuMB",
|
||||
closeMs, syncMs, trimMs, (vt - vf) / (1024*1024), vf / (1024*1024));
|
||||
ANS_DBG("RTSP_Reconnect", "close=%.1fms (non-NVIDIA — CUDA memory pool trim skipped)", closeMs);
|
||||
}
|
||||
RTSP_DBG("[Reconnect] AFTER close() this=%p", (void*)this);
|
||||
|
||||
@@ -882,6 +901,14 @@ namespace ANSCENTER {
|
||||
#endif
|
||||
|
||||
int ANSRTSPClient::AutoConfigureHWDecoders(int maxPerGpuOverride) {
|
||||
// Skip the CUDA probe entirely on non-NVIDIA hardware — the Platform
|
||||
// fallback (DXGI on Windows, sysfs on Linux) handles Intel/AMD auto
|
||||
// configuration, and calling cudaGetDeviceCount() on AMD wakes up
|
||||
// cudart_static for no benefit. See ANSCVVendorGate.h.
|
||||
if (!anscv_vendor_gate::IsNvidiaGpuAvailable()) {
|
||||
return AutoConfigureHWDecoders_Platform();
|
||||
}
|
||||
|
||||
int gpuCount = 0;
|
||||
cudaError_t err = cudaGetDeviceCount(&gpuCount);
|
||||
if (err != cudaSuccess || gpuCount <= 0) {
|
||||
|
||||
Reference in New Issue
Block a user