Improve ANSCV

This commit is contained in:
2026-04-21 09:26:02 +10:00
parent 9f0a10a4c8
commit 7e772f76bc
15 changed files with 749 additions and 421 deletions

View File

@@ -6,6 +6,19 @@
#include "TRTCompat.h"
#include "ANSLicense.h" // ANS_DBG macro for DebugView logging
#ifdef _WIN32
# ifndef WIN32_LEAN_AND_MEAN
# define WIN32_LEAN_AND_MEAN
# endif
# ifndef NOMINMAX
# define NOMINMAX
# endif
# include <windows.h>
# include <psapi.h>
# include <tlhelp32.h>
# pragma comment(lib, "psapi.lib")
#endif
// Per-device mutex for CUDA graph capture.
// TRT's enqueueV3 uses shared internal resources (workspace, memory pools)
// at the CUDA context level. When two Engine instances on the same GPU
@@ -398,6 +411,56 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
const int64_t myInfNum = s_globalInfCount.fetch_add(1) + 1;
s_globalActiveInf.fetch_add(1);
// ── Process-wide host-RAM heartbeat (once per ~60s) ──────────────────────
// Diagnostic for long-run leak hunts: if [PROC_MEM] privateMB climbs while
// [TRT_SM100] VRAM stays flat, the leak is on the host side (FFmpeg
// contexts, RTSP threads, GDI objects). Cheap when not firing — single
// atomic load + one compare in the hot path.
#ifdef _WIN32
{
using clk = std::chrono::steady_clock;
static std::atomic<int64_t> s_hbLastNs{0};
const int64_t nowNs = clk::now().time_since_epoch().count();
int64_t prev = s_hbLastNs.load(std::memory_order_relaxed);
constexpr int64_t kIntervalNs = 60LL * 1'000'000'000LL;
if (nowNs - prev >= kIntervalNs &&
s_hbLastNs.compare_exchange_strong(prev, nowNs,
std::memory_order_relaxed)) {
PROCESS_MEMORY_COUNTERS_EX pmc{};
pmc.cb = sizeof(pmc);
GetProcessMemoryInfo(GetCurrentProcess(),
reinterpret_cast<PROCESS_MEMORY_COUNTERS*>(&pmc),
sizeof(pmc));
DWORD gdi = GetGuiResources(GetCurrentProcess(), GR_GDIOBJECTS);
DWORD usr = GetGuiResources(GetCurrentProcess(), GR_USEROBJECTS);
// Thread count via Toolhelp snapshot (filter to current PID).
DWORD threads = 0;
HANDLE snap = CreateToolhelp32Snapshot(TH32CS_SNAPTHREAD, 0);
if (snap != INVALID_HANDLE_VALUE) {
THREADENTRY32 te{ sizeof(te) };
const DWORD pid = GetCurrentProcessId();
if (Thread32First(snap, &te)) {
do {
if (te.th32OwnerProcessID == pid) ++threads;
} while (Thread32Next(snap, &te));
}
CloseHandle(snap);
}
ANS_DBG("PROC_MEM",
"privateMB=%llu workingMB=%llu peakWorkingMB=%llu "
"pagefileMB=%llu gdi=%lu user=%lu threads=%lu",
(unsigned long long)(pmc.PrivateUsage >> 20),
(unsigned long long)(pmc.WorkingSetSize >> 20),
(unsigned long long)(pmc.PeakWorkingSetSize >> 20),
(unsigned long long)(pmc.PagefileUsage >> 20),
(unsigned long)gdi, (unsigned long)usr,
(unsigned long)threads);
}
}
#endif
// Per-thread tracking
{
static thread_local int64_t s_infCount = 0;
@@ -935,15 +998,29 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
}
// ============================================================================
// Per-inference total timing breakdown (mutex wait + preprocess + GPU)
// Slow-inference alarm — ONE-SIDED FILTER, NOT A DISTRIBUTION
// ============================================================================
// This emits a DebugView line ONLY when a single inference's total wall
// time (mutex-wait + GPU execution) exceeds 100 ms. Fast calls are silent.
//
// Consequence: if you aggregate `[TRT_Slow]` lines and compute an average,
// you get the mean of the slow *tail*, NOT the real average inference
// time. Expect this avg to look dramatic (~200–400 ms) because by design
// every sample here is already slow. A typical inference on a healthy
// system fires this line for ~1–3% of calls; >10% indicates a problem.
//
// For the true per-inference distribution, look at `[TRT_SM100] #N ...
// avgMs=... maxMs=...` (running-average, emitted every 50 inferences).
// The tag was previously `[TRT_Timing]` which misled readers into
// interpreting the avg as overall pipeline latency.
{
double totalMs = std::chrono::duration<double, std::milli>(
std::chrono::steady_clock::now() - _mutexWaitStart).count();
double gpuMs = totalMs - _mutexWaitMs; // Everything after mutex acquired
// Log every inference that takes >100ms total (including mutex wait)
if (totalMs > 100.0) {
ANS_DBG("TRT_Timing", "total=%.1fms (mutex=%.1fms gpu=%.1fms) batch=%d active=%d",
ANS_DBG("TRT_Slow",
"SLOW inference total=%.1fms (mutex=%.1fms gpu=%.1fms) batch=%d active=%d "
"(this filter only fires for calls >100ms)",
totalMs, _mutexWaitMs, gpuMs, batchSize, s_globalActiveInf.load());
}
}