Improve ANSCV

This commit is contained in:
2026-04-21 09:26:02 +10:00
parent 9f0a10a4c8
commit 7e772f76bc
15 changed files with 749 additions and 421 deletions

View File

@@ -6,6 +6,19 @@
#include "TRTCompat.h"
#include "ANSLicense.h" // ANS_DBG macro for DebugView logging
#ifdef _WIN32
# ifndef WIN32_LEAN_AND_MEAN
# define WIN32_LEAN_AND_MEAN
# endif
# ifndef NOMINMAX
# define NOMINMAX
# endif
# include <windows.h>
# include <psapi.h>
# include <tlhelp32.h>
# pragma comment(lib, "psapi.lib")
#endif
// Per-device mutex for CUDA graph capture.
// TRT's enqueueV3 uses shared internal resources (workspace, memory pools)
// at the CUDA context level. When two Engine instances on the same GPU
@@ -398,6 +411,56 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
const int64_t myInfNum = s_globalInfCount.fetch_add(1) + 1;
s_globalActiveInf.fetch_add(1);
// ── Process-wide host-RAM heartbeat (once per ~60s) ──────────────────────
// Diagnostic for long-run leak hunts: if [PROC_MEM] privateMB climbs while
// [TRT_SM100] VRAM stays flat, the leak is on the host side (FFmpeg
// contexts, RTSP threads, GDI objects). Cheap when not firing — single
// atomic load + one compare in the hot path.
#ifdef _WIN32
{
using clk = std::chrono::steady_clock;
static std::atomic<int64_t> s_hbLastNs{0};
const int64_t nowNs = clk::now().time_since_epoch().count();
int64_t prev = s_hbLastNs.load(std::memory_order_relaxed);
constexpr int64_t kIntervalNs = 60LL * 1'000'000'000LL;
if (nowNs - prev >= kIntervalNs &&
s_hbLastNs.compare_exchange_strong(prev, nowNs,
std::memory_order_relaxed)) {
PROCESS_MEMORY_COUNTERS_EX pmc{};
pmc.cb = sizeof(pmc);
GetProcessMemoryInfo(GetCurrentProcess(),
reinterpret_cast<PROCESS_MEMORY_COUNTERS*>(&pmc),
sizeof(pmc));
DWORD gdi = GetGuiResources(GetCurrentProcess(), GR_GDIOBJECTS);
DWORD usr = GetGuiResources(GetCurrentProcess(), GR_USEROBJECTS);
// Thread count via Toolhelp snapshot (filter to current PID).
DWORD threads = 0;
HANDLE snap = CreateToolhelp32Snapshot(TH32CS_SNAPTHREAD, 0);
if (snap != INVALID_HANDLE_VALUE) {
THREADENTRY32 te{ sizeof(te) };
const DWORD pid = GetCurrentProcessId();
if (Thread32First(snap, &te)) {
do {
if (te.th32OwnerProcessID == pid) ++threads;
} while (Thread32Next(snap, &te));
}
CloseHandle(snap);
}
ANS_DBG("PROC_MEM",
"privateMB=%llu workingMB=%llu peakWorkingMB=%llu "
"pagefileMB=%llu gdi=%lu user=%lu threads=%lu",
(unsigned long long)(pmc.PrivateUsage >> 20),
(unsigned long long)(pmc.WorkingSetSize >> 20),
(unsigned long long)(pmc.PeakWorkingSetSize >> 20),
(unsigned long long)(pmc.PagefileUsage >> 20),
(unsigned long)gdi, (unsigned long)usr,
(unsigned long)threads);
}
}
#endif
// Per-thread tracking
{
static thread_local int64_t s_infCount = 0;
@@ -935,15 +998,29 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
}
// ============================================================================
// Per-inference total timing breakdown (mutex wait + preprocess + GPU)
// Slow-inference alarm — ONE-SIDED FILTER, NOT A DISTRIBUTION
// ============================================================================
// This emits a DebugView line ONLY when a single inference's total wall
// time (mutex-wait + GPU execution) exceeds 100 ms. Fast calls are silent.
//
// Consequence: if you aggregate `[TRT_Slow]` lines and compute an average,
// you get the mean of the slow *tail*, NOT the real average inference
// time. Expect this avg to look dramatic (~200–400 ms) because by design
// every sample here is already slow. A typical inference on a healthy
// system fires this line for ~1–3% of calls; >10% indicates a problem.
//
// For the true per-inference distribution, look at `[TRT_SM100] #N ...
// avgMs=... maxMs=...` (running-average, emitted every 50 inferences).
// The tag was previously `[TRT_Timing]` which misled readers into
// interpreting the avg as overall pipeline latency.
{
double totalMs = std::chrono::duration<double, std::milli>(
std::chrono::steady_clock::now() - _mutexWaitStart).count();
double gpuMs = totalMs - _mutexWaitMs; // Everything after mutex acquired
// Log every inference that takes >100ms total (including mutex wait)
if (totalMs > 100.0) {
ANS_DBG("TRT_Timing", "total=%.1fms (mutex=%.1fms gpu=%.1fms) batch=%d active=%d",
ANS_DBG("TRT_Slow",
"SLOW inference total=%.1fms (mutex=%.1fms gpu=%.1fms) batch=%d active=%d "
"(this filter only fires for calls >100ms)",
totalMs, _mutexWaitMs, gpuMs, batchSize, s_globalActiveInf.load());
}
}