Fix double stop in ANSVideoPlayer
This commit is contained in:
@@ -1167,7 +1167,12 @@ trt_cache_create_context:
     // -- Pinned output buffers (CUDA graph prerequisite) -----------------------
     // Invalidate any graphs captured by a previous loadNetwork() call on this instance.
-    for (auto& [bs, ge] : m_graphExecs) { if (ge) cudaGraphExecDestroy(ge); }
+    for (auto& [bs, ge] : m_graphExecs) {
+        if (ge) {
+            cudaGraphExecDestroy(ge);
+            m_trtGraphDestroys.fetch_add(1, std::memory_order_relaxed);
+        }
+    }
     m_graphExecs.clear();
     // Free any previously allocated pinned buffers.
     for (T* p : m_pinnedOutputBuffers) { if (p) cudaFreeHost(p); }
||||
@@ -731,7 +731,10 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
     if (!m_graphExecs.empty()) {
         size_t destroyed = m_graphExecs.size();
         for (auto& [bs, ge] : m_graphExecs) {
-            if (ge) cudaGraphExecDestroy(ge);
+            if (ge) {
+                cudaGraphExecDestroy(ge);
+                m_trtGraphDestroys.fetch_add(1, std::memory_order_relaxed);
+            }
         }
         m_graphExecs.clear();
         ANS_DBG("TRT_Engine", "INVALIDATED %zu cached CUDA graphs after shape change (batch=%d)",
@@ -901,8 +904,10 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i

     if (captureOk) {
         cudaGraphExec_t exec = nullptr;
-        if (cudaGraphInstantiate(&exec, graph, nullptr, nullptr, 0) == cudaSuccess)
+        if (cudaGraphInstantiate(&exec, graph, nullptr, nullptr, 0) == cudaSuccess) {
             graphExec = exec;
+            m_trtGraphCreates.fetch_add(1, std::memory_order_relaxed);
+        }
         cudaGraphDestroy(graph);
         ANS_DBG("TRT_Engine", "CUDA graph CAPTURED OK for batch=%d exec=%p",
                 batchSize, (void*)graphExec);
@@ -1053,5 +1058,32 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
         s_globalActiveInf.fetch_sub(1);
     }

+    // Leak diagnostic — one [TRT_Leak] line per engine instance per 60 s.
+    // Reports CUDA graph create/destroy balance and current cache size.
+    // If (creates - destroys) climbs monotonically, graph execs are being
+    // leaked on every shape change; each leaked exec is tens of MB.
+    // Lock-free window claim via compare_exchange — concurrent inference
+    // threads race to log but only one wins per 60-s window.
+    {
+        using clk = std::chrono::steady_clock;
+        const long long tick = clk::now().time_since_epoch().count();
+        long long expected = m_trtLeakNextLogTick.load(std::memory_order_relaxed);
+        if (tick >= expected) {
+            const long long deadline = tick +
+                std::chrono::duration_cast<clk::duration>(
+                    std::chrono::seconds(60)).count();
+            if (m_trtLeakNextLogTick.compare_exchange_strong(
+                    expected, deadline, std::memory_order_relaxed)) {
+                const int64_t cr = m_trtGraphCreates.load(std::memory_order_relaxed);
+                const int64_t ds = m_trtGraphDestroys.load(std::memory_order_relaxed);
+                ANS_DBG("TRT_Leak",
+                        "engine=%p creates=%lld destroys=%lld net=%lld cached=%zu",
+                        (void*)this,
+                        (long long)cr, (long long)ds, (long long)(cr - ds),
+                        m_graphExecs.size());
+            }
+        }
+    }
+
     return true;
 }
Reference in New Issue
Block a user