Improve ANSCV
This commit is contained in:
@@ -6,6 +6,19 @@
|
||||
#include "TRTCompat.h"
|
||||
#include "ANSLicense.h" // ANS_DBG macro for DebugView logging
|
||||
|
||||
#ifdef _WIN32
|
||||
# ifndef WIN32_LEAN_AND_MEAN
|
||||
# define WIN32_LEAN_AND_MEAN
|
||||
# endif
|
||||
# ifndef NOMINMAX
|
||||
# define NOMINMAX
|
||||
# endif
|
||||
# include <windows.h>
|
||||
# include <psapi.h>
|
||||
# include <tlhelp32.h>
|
||||
# pragma comment(lib, "psapi.lib")
|
||||
#endif
|
||||
|
||||
// Per-device mutex for CUDA graph capture.
|
||||
// TRT's enqueueV3 uses shared internal resources (workspace, memory pools)
|
||||
// at the CUDA context level. When two Engine instances on the same GPU
|
||||
@@ -398,6 +411,56 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
|
||||
const int64_t myInfNum = s_globalInfCount.fetch_add(1) + 1;
|
||||
s_globalActiveInf.fetch_add(1);
|
||||
|
||||
// ── Process-wide host-RAM heartbeat (once per ~60s) ──────────────────────
|
||||
// Diagnostic for long-run leak hunts: if [PROC_MEM] privateMB climbs while
|
||||
// [TRT_SM100] VRAM stays flat, the leak is on the host side (FFmpeg
|
||||
// contexts, RTSP threads, GDI objects). Cheap when not firing — single
|
||||
// atomic load + one compare in the hot path.
|
||||
#ifdef _WIN32
{
    // ── Process-wide host-RAM heartbeat ──────────────────────────────────
    // Emits at most one [PROC_MEM] line per ~60 s across all threads.
    // Cost when not firing: one relaxed atomic load + one compare. The CAS
    // guarantees exactly one winner per interval even under contention.
    using clk = std::chrono::steady_clock;
    static std::atomic<int64_t> s_hbLastNs{0};
    const int64_t nowNs = clk::now().time_since_epoch().count();
    int64_t prev = s_hbLastNs.load(std::memory_order_relaxed);
    constexpr int64_t kIntervalNs = 60LL * 1'000'000'000LL;
    if (nowNs - prev >= kIntervalNs &&
        s_hbLastNs.compare_exchange_strong(prev, nowNs,
                                           std::memory_order_relaxed)) {
        PROCESS_MEMORY_COUNTERS_EX pmc{};
        pmc.cb = sizeof(pmc);
        // FIX: the return value was previously ignored. pmc is
        // zero-initialized, so a failed call silently logged an all-zero
        // sample that looks like a real (and alarming) data point during a
        // leak hunt. Report the failure explicitly instead.
        if (!GetProcessMemoryInfo(GetCurrentProcess(),
                                  reinterpret_cast<PROCESS_MEMORY_COUNTERS*>(&pmc),
                                  sizeof(pmc))) {
            ANS_DBG("PROC_MEM", "GetProcessMemoryInfo failed err=%lu",
                    (unsigned long)GetLastError());
        } else {
            // GDI/USER handle counts — a classic slow-leak source on
            // long-running capture pipelines.
            DWORD gdi = GetGuiResources(GetCurrentProcess(), GR_GDIOBJECTS);
            DWORD usr = GetGuiResources(GetCurrentProcess(), GR_USEROBJECTS);

            // Thread count via Toolhelp snapshot (snapshot is system-wide;
            // filter entries to the current PID).
            DWORD threads = 0;
            HANDLE snap = CreateToolhelp32Snapshot(TH32CS_SNAPTHREAD, 0);
            if (snap != INVALID_HANDLE_VALUE) {
                THREADENTRY32 te{ sizeof(te) };
                const DWORD pid = GetCurrentProcessId();
                if (Thread32First(snap, &te)) {
                    do {
                        if (te.th32OwnerProcessID == pid) ++threads;
                    } while (Thread32Next(snap, &te));
                }
                CloseHandle(snap);
            }

            ANS_DBG("PROC_MEM",
                    "privateMB=%llu workingMB=%llu peakWorkingMB=%llu "
                    "pagefileMB=%llu gdi=%lu user=%lu threads=%lu",
                    (unsigned long long)(pmc.PrivateUsage >> 20),
                    (unsigned long long)(pmc.WorkingSetSize >> 20),
                    (unsigned long long)(pmc.PeakWorkingSetSize >> 20),
                    (unsigned long long)(pmc.PagefileUsage >> 20),
                    (unsigned long)gdi, (unsigned long)usr,
                    (unsigned long)threads);
        }
    }
}
#endif
|
||||
|
||||
// Per-thread tracking
|
||||
{
|
||||
static thread_local int64_t s_infCount = 0;
|
||||
@@ -935,15 +998,29 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Per-inference total timing breakdown (mutex wait + preprocess + GPU)
|
||||
// Slow-inference alarm — ONE-SIDED FILTER, NOT A DISTRIBUTION
|
||||
// ============================================================================
|
||||
// This emits a DebugView line ONLY when a single inference's total wall
|
||||
// time (mutex-wait + GPU execution) exceeds 100 ms. Fast calls are silent.
|
||||
//
|
||||
// Consequence: if you aggregate `[TRT_Slow]` lines and compute an average,
|
||||
// you get the mean of the slow *tail*, NOT the real average inference
|
||||
// time. Expect this avg to look dramatic (~200–400 ms) because by design
|
||||
// every sample here is already slow. A typical inference on a healthy
|
||||
// system fires this line for ~1–3% of calls; >10% indicates a problem.
|
||||
//
|
||||
// For the true per-inference distribution, look at `[TRT_SM100] #N ...
|
||||
// avgMs=... maxMs=...` (running-average, emitted every 50 inferences).
|
||||
// The tag was previously `[TRT_Timing]` which misled readers into
|
||||
// interpreting the avg as overall pipeline latency.
|
||||
{
    // Total wall time for this inference: mutex wait + everything after
    // (preprocess + GPU execution), measured from _mutexWaitStart.
    double totalMs = std::chrono::duration<double, std::milli>(
        std::chrono::steady_clock::now() - _mutexWaitStart).count();
    double gpuMs = totalMs - _mutexWaitMs; // Everything after mutex acquired
    // One-sided filter: log ONLY inferences slower than 100 ms total
    // (including mutex wait). Fast calls are silent by design — do not
    // average these lines and call it pipeline latency.
    if (totalMs > 100.0) {
        // FIX: removed a stale leftover `ANS_DBG("TRT_Timing", ...)` call
        // that preceded this one. The tag was renamed TRT_Timing → TRT_Slow
        // (see comment above this scope), but the old call line was never
        // deleted, leaving the TRT_Slow call malformed as its argument.
        ANS_DBG("TRT_Slow",
                "SLOW inference total=%.1fms (mutex=%.1fms gpu=%.1fms) batch=%d active=%d "
                "(this filter only fires for calls >100ms)",
                totalMs, _mutexWaitMs, gpuMs, batchSize, s_globalActiveInf.load());
    }
}
|
||||
|
||||
Reference in New Issue
Block a user