Fix double stop in ANSVideoPlayer

2026-04-22 10:10:16 +10:00
parent 97d814936d
commit 57cc8e0a56
14 changed files with 492 additions and 70 deletions
--- a/modules/ANSCV/ANSFilePlayer.cpp
+++ b/modules/ANSCV/ANSFilePlayer.cpp
@@ -57,8 +57,11 @@ namespace ANSCENTER {
 				_logger.LogError("ANSFILEPLAYER::Destroy.", "Unknown exception", __FILE__, __LINE__);
 			}
 		}
+		// Destructor calls close() exactly once — do not call close() explicitly
+		// beforehand. CFilePlayer::close() is not safe to call twice (it re-enters
+		// decoder Stop/flush on an already-torn-down decoder).
 		if (clientToClose) {
-			clientToClose->close();
+			clientToClose.reset();
 		}
 	}
 	void ANSFILEPLAYER::CheckLicense() {
@@ -102,7 +105,8 @@ namespace ANSCENTER {
 			std::lock_guard<std::recursive_mutex> lock(_mutex);
 			_isPlaying = false;
 		}
-		_playerClient->close();
+		// CFilePlayer::open() calls close() internally at the top — no need
+		// to close explicitly here (doing so would double-close the decoder).
 		std::lock_guard<std::recursive_mutex> lock(_mutex);
 		Setup();
 		return Start();
--- a/modules/ANSCV/ANSGpuFrameOps.h
+++ b/modules/ANSCV/ANSGpuFrameOps.h
@@ -26,11 +26,19 @@ extern "C" {
 #include <thread>
 #include <mutex>
 #include <cstdio>
+#include <atomic>

 #ifdef _WIN32
 #include <windows.h>
 #endif

+// Leak diagnostic — counts AVFrames handed back to the media layer for
+// deferred freeing. Defined in video_player.cpp. Paired with g_nv12Escapes /
+// g_cudaHWEscapes in the [MEDIA_Leak] heartbeat: if escapes > pendingReturns
+// and the delta grows, external callers (via getNV12Frame/getCudaHWFrame)
+// are holding clones instead of returning them.
+extern std::atomic<int64_t> g_avframePendingReturns;
+
 // Debug logging macro for GPU frame operations.
 // Define ANSCORE_GPU_DEBUG=1 to enable verbose per-frame GPU logging.
 #ifndef GPU_FRAME_DBG
@@ -172,6 +180,7 @@ inline void gpu_frame_attach(cv::Mat* mat, AVFrame* nv12, int gpuIdx, int64_t pt
        auto& reg = ANSGpuFrameRegistry::instance();
        auto lk = reg.acquire_lock();
        reg.pushPendingFree_locked(old);
+        g_avframePendingReturns.fetch_add(1, std::memory_order_relaxed);
    }

    // NOTE: No drain_pending() here (hot path). Freed by evict_stale.
@@ -378,6 +387,7 @@ inline void gpu_frame_attach_cuda(cv::Mat* mat, AVFrame* cudaFrame, int gpuIdx,
            auto& reg = ANSGpuFrameRegistry::instance();
            auto lk = reg.acquire_lock();
            reg.pushPendingFree_locked(cudaFrame);
+            g_avframePendingReturns.fetch_add(1, std::memory_order_relaxed);
        }
        data.avframe = nullptr;
    }
@@ -386,6 +396,7 @@ inline void gpu_frame_attach_cuda(cv::Mat* mat, AVFrame* cudaFrame, int gpuIdx,
        auto& reg = ANSGpuFrameRegistry::instance();
        auto lk = reg.acquire_lock();
        reg.pushPendingFree_locked(cpuNV12);
+        g_avframePendingReturns.fetch_add(1, std::memory_order_relaxed);
    }
    data.cpuAvframe = nullptr;

@@ -399,6 +410,7 @@ inline void gpu_frame_attach_cuda(cv::Mat* mat, AVFrame* cudaFrame, int gpuIdx,
        auto& reg = ANSGpuFrameRegistry::instance();
        auto lk = reg.acquire_lock();
        reg.pushPendingFree_locked(old);
+        g_avframePendingReturns.fetch_add(1, std::memory_order_relaxed);
    }

    // NOTE: No drain_pending() here (hot path). AVFrames accumulate in
--- a/modules/ANSCV/ANSVideoPlayer.cpp
+++ b/modules/ANSCV/ANSVideoPlayer.cpp
@@ -47,9 +47,6 @@ namespace ANSCENTER {
 		{
 			std::lock_guard<std::recursive_mutex> lock(_mutex);
 			try {
-				if (_hwPlayer) {
-					try { _hwPlayer->stop(); } catch (...) {}
-				}
 				hwPlayerToClose = std::move(_hwPlayer);
 				_hwDecodeActive = false;
 				_hwGpuIndex = -1;
@@ -80,9 +77,10 @@ namespace ANSCENTER {
 		}
 		} // end lock scope

-		// CUDA cleanup happens here, outside the mutex
+		// CUDA cleanup happens here, outside the mutex.
+		// Destructor calls close() once — do not call stop()/close() explicitly
+		// beforehand (double-close re-enters torn-down decoder state).
 		if (hwPlayerToClose) {
-			try { hwPlayerToClose->close(); } catch (...) {}
 			hwPlayerToClose.reset();
 		}
 	}
@@ -201,13 +199,10 @@ namespace ANSCENTER {
 		{
 			std::lock_guard<std::recursive_mutex> lock(_mutex);
 			_isPlaying = false;  // GetImage() returns cached frame while we reconnect
-			if (_hwPlayer) {
-				try { _hwPlayer->stop(); } catch (...) {}
-				hwPlayerToClose = std::move(_hwPlayer);
-			}
+			hwPlayerToClose = std::move(_hwPlayer);
 		}
+		// Destructor calls close() exactly once — single teardown.
 		if (hwPlayerToClose) {
-			try { hwPlayerToClose->close(); } catch (...) {}
 			hwPlayerToClose.reset();
 		}

@@ -241,11 +236,24 @@ namespace ANSCENTER {
 	bool ANSVIDEOPLAYER::Start() {
 		std::lock_guard<std::recursive_mutex> lock(_mutex);
 		try {
+			// Re-initialize after a prior Stop(): _hwPlayer was released and
+			// cap was closed. Setup() reopens whichever backend applies.
+			// Why: CFilePlayer::stop() == close(), which frees m_pFormatContext.
+			// Calling play() on a closed player dereferences NULL and crashes.
+			if (!_hwPlayer && !cap.isOpened()) {
+				if (!Setup()) {
+					this->_logger.LogError("ANSVIDEOPLAYER::Start. Exception occurred:",
+						"Setup() failed on restart", __FILE__, __LINE__);
+					return false;
+				}
+			}
+
 			// --- HW decode path ---
 			if (_hwDecodeActive && _hwPlayer) {
 				_hwPlayer->play();  // starts read/video/audio threads
 				_hwEOF = false;
 				_hwFrameCount = 0;
+				_hwLastPts = 0;
 				_isPlaying = true;

 				// Wait for first frame outside the mutex to let decode threads run
@@ -284,15 +292,26 @@ namespace ANSCENTER {
 		}
 	}
 	bool ANSVIDEOPLAYER::Stop() {
-		decltype(_hwPlayer.get()) hwPlayer = nullptr;
+		// Move HW player out of lock scope — CFilePlayer::stop() == close(),
+		// which does CUDA cleanup that must not run under _mutex to avoid
+		// deadlocking with the nvcuda64 SRW lock held by inference.
+		decltype(_hwPlayer) hwPlayerToClose;
 		{
 			std::lock_guard<std::recursive_mutex> lock(_mutex);
 			try {
 				// --- HW decode path ---
 				if (_hwDecodeActive && _hwPlayer) {
 					_isPlaying = false;
-					hwPlayer = _hwPlayer.get();
-					// stop() called outside the lock below; skip cap path
+					// Release the player completely — CFilePlayer::stop() == close(),
+					// which frees m_pFormatContext. Keeping the unique_ptr alive after
+					// this point is a landmine: a later play() would deref NULL.
+					hwPlayerToClose = std::move(_hwPlayer);
+					_hwDecodeActive = false;
+					_hwGpuIndex = -1;
+					_hwCudaAccel = false;
+					_hwEOF = false;
+					_hwFrameCount = 0;
+					_hwLastPts = 0;
 				}
 				else {
 					// --- cv::VideoCapture fallback ---
@@ -322,8 +341,12 @@ namespace ANSCENTER {
 				return false;
 			}
 		}
-		if (hwPlayer) {
-			hwPlayer->stop();
+		// CUDA cleanup happens here, outside the mutex.
+		// Rely on the destructor to call close() exactly once. Calling stop()
+		// (== close()) explicitly would double-close the CFilePlayer, which
+		// re-enters decoder Stop/flush on an already-torn-down decoder.
+		if (hwPlayerToClose) {
+			hwPlayerToClose.reset();
 		}
 		return true;
 	}
--- a/modules/ANSCV/GpuNV12SlotPool.cpp
+++ b/modules/ANSCV/GpuNV12SlotPool.cpp
@@ -7,8 +7,11 @@
 #define NOMINMAX
 #include <windows.h>
 #include "GpuNV12SlotPool.h"
+#include "ANSLicense.h"   // ANS_DBG macro for [Pool_Leak] heartbeat

 #include <cuda_runtime.h>
+#include <atomic>
+#include <chrono>

 // ANSCV.dll owns the process-wide singleton.
 GpuNV12SlotPool* GpuNV12SlotPool::resolveProcessWide() {
@@ -40,6 +43,41 @@ void GpuNV12SlotPool::drainCooledSlots_locked() {
 GpuNV12Slot* GpuNV12SlotPool::acquire(int gpuIdx, int w, int h) {
    std::lock_guard<std::mutex> lock(m_mutex);

+    // Leak diagnostic — [Pool_Leak] heartbeat fires at most once per 60 s.
+    // Reports current slot count and rough VRAM footprint. Slot count is
+    // bounded by GPU_NV12_POOL_MAX_SLOTS; the active/cooling/free breakdown
+    // is logged alongside it, so a count that persists near the cap can be
+    // traced to slots that are not being released.
+    {
+        using clk = std::chrono::steady_clock;
+        static std::atomic<long long> s_nextLog{0};
+        const long long tick = clk::now().time_since_epoch().count();
+        long long expected = s_nextLog.load(std::memory_order_relaxed);
+        if (tick >= expected) {
+            const long long deadline = tick +
+                std::chrono::duration_cast<clk::duration>(
+                    std::chrono::seconds(60)).count();
+            if (s_nextLog.compare_exchange_strong(expected, deadline,
+                                                  std::memory_order_relaxed)) {
+                size_t totalBytes = 0;
+                size_t active = 0, cooling = 0, free_ = 0;
+                for (const auto& sp : m_slots) {
+                    totalBytes += sp->pitchY * sp->height
+                               +  sp->pitchUV * (sp->height / 2);
+                    const int st = sp->state.load(std::memory_order_relaxed);
+                    if (st == GpuNV12Slot::STATE_ACTIVE)  ++active;
+                    else if (st == GpuNV12Slot::STATE_COOLING) ++cooling;
+                    else ++free_;
+                }
+                ANS_DBG("Pool_Leak",
+                    "NV12Pool slots=%zu (active=%zu cooling=%zu free=%zu) bytesMB=%.1f (max=%d)",
+                    m_slots.size(), active, cooling, free_,
+                    (double)totalBytes / (1024.0 * 1024.0),
+                    GPU_NV12_POOL_MAX_SLOTS);
+            }
+        }
+    }
+
    // 1. Drain cooled-down slots to make them available
    drainCooledSlots_locked();

--- a/modules/ANSLPR/ANSLPR_OCR.cpp
+++ b/modules/ANSLPR/ANSLPR_OCR.cpp
@@ -6,6 +6,7 @@

 #include <json.hpp>
 #include <algorithm>
+#include <atomic>
 #include <chrono>

 // ---------------------------------------------------------------------------
@@ -1063,6 +1064,34 @@ namespace ANSCENTER
 		std::lock_guard<std::mutex> plateLock(_plateIdentitiesMutex);
 		auto& identities = _plateIdentities[cameraId];

+		// Leak diagnostic — [OCR_Leak] heartbeat, at most once per 60 s
+		// process-wide. Same fields as the ANSALPR_OD variant for direct
+		// comparison: cams, ids_tot, clr, imgtrk. If any of these climb
+		// monotonically, the corresponding state container is the leak.
+		{
+			using clk = std::chrono::steady_clock;
+			static std::atomic<long long> s_nextLog{0};
+			const long long tick = clk::now().time_since_epoch().count();
+			long long expected = s_nextLog.load(std::memory_order_relaxed);
+			if (tick >= expected) {
+				const long long deadline = tick +
+					std::chrono::duration_cast<clk::duration>(
+						std::chrono::seconds(60)).count();
+				if (s_nextLog.compare_exchange_strong(expected, deadline,
+				                                       std::memory_order_relaxed)) {
+					size_t ids_tot = 0;
+					for (const auto& [cam, v] : _plateIdentities) ids_tot += v.size();
+					ANS_DBG("OCR_Leak",
+						"ANSALPR_OCR this=%p cams=%zu ids_tot=%zu clr=%zu imgtrk=%zu",
+						(void*)this,
+						_plateIdentities.size(),
+						ids_tot,
+						_colourCache.size(),
+						_imageSizeTrackers.size());
+				}
+			}
+		}
+
 		// Auto-detect mode by detection count.
 		//   1 detection  → pipeline/single-crop mode → no dedup needed.
 		//   2+ detections → full-frame mode → apply accumulated scoring.
--- a/modules/ANSLPR/ANSLPR_OD.cpp
+++ b/modules/ANSLPR/ANSLPR_OD.cpp
@@ -12,6 +12,7 @@
 #include <thread>
 #include <chrono>
 #include <algorithm>
+#include <atomic>
 #include <unordered_map>
 // ---------------------------------------------------------------------------
 // Check ONNX model opset version by reading the protobuf header directly.
@@ -3121,6 +3122,41 @@ namespace ANSCENTER {
 		std::lock_guard<std::mutex> plateLock(_plateIdentitiesMutex);
 		auto& identities = _plateIdentities[cameraId];

+		// Leak diagnostic — [OCR_Leak] heartbeat fires at most once per 60 s
+		// process-wide. Reports the three state containers that _could_
+		// accumulate: _plateIdentities (keyed by cameraId; reported as key count
+		// plus the sum of inner-vector sizes), _colourCache, _imageSizeTrackers.
+		// All three have stated bounds; heartbeat confirms they actually hold.
+		//   cams     — number of distinct cameraId keys in _plateIdentities
+		//   ids_tot  — sum of per-camera identity-vector sizes (should plateau)
+		//   clr      — _colourCache size (bounded at COLOUR_CACHE_MAX_SIZE=200)
+		//   imgtrk   — _imageSizeTrackers size (one entry per cameraId)
+		// size() reads not covered by plateLock are diagnostic snapshots; brief
+		// races are acceptable (we're looking at trends over minutes).
+		{
+			using clk = std::chrono::steady_clock;
+			static std::atomic<long long> s_nextLog{0};
+			const long long tick = clk::now().time_since_epoch().count();
+			long long expected = s_nextLog.load(std::memory_order_relaxed);
+			if (tick >= expected) {
+				const long long deadline = tick +
+					std::chrono::duration_cast<clk::duration>(
+						std::chrono::seconds(60)).count();
+				if (s_nextLog.compare_exchange_strong(expected, deadline,
+				                                       std::memory_order_relaxed)) {
+					size_t ids_tot = 0;
+					for (const auto& [cam, v] : _plateIdentities) ids_tot += v.size();
+					ANS_DBG("OCR_Leak",
+						"ANSALPR_OD this=%p cams=%zu ids_tot=%zu clr=%zu imgtrk=%zu",
+						(void*)this,
+						_plateIdentities.size(),
+						ids_tot,
+						_colourCache.size(),
+						_imageSizeTrackers.size());
+				}
+			}
+		}
+
 		// Option B: Auto-detect mode by counting detections.
 		// 1 detection  → crop/pipeline mode → return instant result, no accumulated scoring
 		// 2+ detections → full-frame mode   → use accumulated scoring for dedup
--- a/modules/ANSMOT/ByteTrack/src/BYTETracker.cpp
+++ b/modules/ANSMOT/ByteTrack/src/BYTETracker.cpp
@@ -318,13 +318,28 @@ std::vector<ByteTrack::BYTETracker::STrackPtr> ByteTrack::BYTETracker::update(co
    lost_stracks_ = subStracks(jointStracks(subStracks(lost_stracks_, tracked_stracks_), current_lost_stracks), removed_stracks_);
    removed_stracks_ = jointStracks(removed_stracks_, current_removed_stracks);

+    // Cap removed_stracks_ to prevent unbounded growth. Its only job is to
+    // block re-entry into lost_stracks_ for tracks that have already timed
+    // out (see subStracks(..., removed_stracks_) in the lost_stracks_ update
+    // above). A track that's been removed for more than a few hundred frames
+    // cannot plausibly re-appear as "lost" — by then it's been reaped
+    // elsewhere and any new detection would get a fresh track_id. Even at one
+    // removal per frame, 1 000 entries covers ~100 s at 10 fps — well beyond
+    // any re-identification window. Older entries (front of vector) go first.
+    static constexpr size_t kRemovedCap = 1000;
+    if (removed_stracks_.size() > kRemovedCap) {
+        const size_t drop = removed_stracks_.size() - kRemovedCap;
+        removed_stracks_.erase(removed_stracks_.begin(),
+                               removed_stracks_.begin() + drop);
+    }
+
    std::vector<STrackPtr> tracked_stracks_out, lost_stracks_out;
    removeDuplicateStracks(tracked_stracks_, lost_stracks_, tracked_stracks_out, lost_stracks_out);
    tracked_stracks_ = tracked_stracks_out;
    lost_stracks_ = lost_stracks_out;

    // Diagnostic: report tracker state size at most once every 60 s per instance.
-    // removed_stracks_ is append-only in this implementation — watch it grow.
+    // With the cap above, removed_stracks_ should plateau at <= kRemovedCap.
    {
        static thread_local std::chrono::steady_clock::time_point s_nextLog{};
        auto now = std::chrono::steady_clock::now();
--- a/modules/ANSODEngine/engine.h
+++ b/modules/ANSODEngine/engine.h
@@ -434,6 +434,16 @@ private:
    // the first time each batch size is seen; subsequent calls reuse it.
    std::unordered_map<int, cudaGraphExec_t>  m_graphExecs;

+    // Leak diagnostics — per-engine-instance counters for CUDA graph
+    // create/destroy balance. Incremented in EngineRunInference.inl and
+    // EngineBuildLoadNetwork.inl; ~Engine() also bumps m_trtGraphDestroys.
+    // Read by the [TRT_Leak] heartbeat in runInference (≤1×/60s per engine).
+    // m_trtLeakNextLogTick stores a steady_clock epoch count for lock-free
+    // compare_exchange window claim across concurrent inference threads.
+    std::atomic<int64_t>                      m_trtGraphCreates{0};
+    std::atomic<int64_t>                      m_trtGraphDestroys{0};
+    std::atomic<long long>                    m_trtLeakNextLogTick{0};
+
    Logger m_logger;
    bool m_verbose{ true };   // false for non-probe pool slots
    bool m_disableGraphs{ true }; // DISABLED by default — concurrent graph launches + uploads cause GPU deadlock on WDDM
@@ -569,7 +579,12 @@ template <typename T> Engine<T>::~Engine() {

    // Destroy cached CUDA graphs
    try {
-        for (auto& [bs, ge] : m_graphExecs) { if (ge) cudaGraphExecDestroy(ge); }
+        for (auto& [bs, ge] : m_graphExecs) {
+            if (ge) {
+                cudaGraphExecDestroy(ge);
+                m_trtGraphDestroys.fetch_add(1, std::memory_order_relaxed);
+            }
+        }
        m_graphExecs.clear();
    } catch (...) {}