Fix NV12 crash issue when recreating camera object

This commit is contained in:
2026-04-02 22:07:27 +11:00
parent 4bedf3a3a2
commit 958cab6ae3
25 changed files with 1459 additions and 393 deletions

View File

@@ -46,13 +46,22 @@ namespace ANSCENTER {
Destroy();
}
void ANSFLVClient::Destroy() {
std::lock_guard<std::recursive_mutex> lock(_mutex);
if (_playerClient) {
if (_isPlaying) {
_playerClient->stop();
_isPlaying = false;
// Move player out of lock scope — close() does CUDA cleanup
// (cuArrayDestroy/cuMemFree) which must not run under _mutex
// to avoid deadlocking with nvcuda64 SRW lock held by inference.
decltype(_playerClient) clientToClose;
{
std::lock_guard<std::recursive_mutex> lock(_mutex);
if (_playerClient) {
if (_isPlaying) {
_playerClient->stop();
_isPlaying = false;
}
}
_playerClient->close();
clientToClose = std::move(_playerClient);
}
if (clientToClose) {
clientToClose->close();
}
}
static void VerifyGlobalANSFLVLicense(const std::string& licenseKey) {
@@ -129,8 +138,12 @@ namespace ANSCENTER {
}
}
bool ANSFLVClient::Reconnect() {
std::lock_guard<std::recursive_mutex> lock(_mutex);
{
std::lock_guard<std::recursive_mutex> lock(_mutex);
_isPlaying = false;
}
_playerClient->close();
std::lock_guard<std::recursive_mutex> lock(_mutex);
Setup();
_isPlaying = _playerClient->play();
return _isPlaying;
@@ -143,10 +156,16 @@ namespace ANSCENTER {
return _isPlaying;
}
bool ANSFLVClient::Stop() {
std::lock_guard<std::recursive_mutex> lock(_mutex);
if (_isPlaying) {
_playerClient->stop();
_isPlaying = false;
decltype(_playerClient.get()) player = nullptr;
{
std::lock_guard<std::recursive_mutex> lock(_mutex);
if (_isPlaying) {
_isPlaying = false;
player = _playerClient.get();
}
}
if (player) {
player->stop();
}
return true;
}

View File

@@ -39,22 +39,26 @@ namespace ANSCENTER {
catch (...) {}
}
void ANSFILEPLAYER::Destroy() {
std::lock_guard<std::recursive_mutex> lock(_mutex);
try {
_url = "";
_imageRotateDeg = 0;
_isPlaying = false;
_lastJpegImage = "";
_pLastFrame.release();
if (_playerClient) {
_playerClient->close();
decltype(_playerClient) clientToClose;
{
std::lock_guard<std::recursive_mutex> lock(_mutex);
try {
_url = "";
_imageRotateDeg = 0;
_isPlaying = false;
_lastJpegImage = "";
_pLastFrame.release();
clientToClose = std::move(_playerClient);
}
catch (const std::exception& e) {
_logger.LogError("ANSFILEPLAYER::Destroy. Exception:", e.what(), __FILE__, __LINE__);
}
catch (...) {
_logger.LogError("ANSFILEPLAYER::Destroy.", "Unknown exception", __FILE__, __LINE__);
}
}
catch (const std::exception& e) {
_logger.LogError("ANSFILEPLAYER::Destroy. Exception:", e.what(), __FILE__, __LINE__);
}
catch (...) {
_logger.LogError("ANSFILEPLAYER::Destroy.", "Unknown exception", __FILE__, __LINE__);
if (clientToClose) {
clientToClose->close();
}
}
void ANSFILEPLAYER::CheckLicense() {
@@ -94,8 +98,12 @@ namespace ANSCENTER {
return _playerClient->open(_url);
}
bool ANSFILEPLAYER::Reconnect() {
std::lock_guard<std::recursive_mutex> lock(_mutex);
{
std::lock_guard<std::recursive_mutex> lock(_mutex);
_isPlaying = false;
}
_playerClient->close();
std::lock_guard<std::recursive_mutex> lock(_mutex);
Setup();
return Start();
}
@@ -105,14 +113,17 @@ namespace ANSCENTER {
return _isPlaying;
}
bool ANSFILEPLAYER::Stop() {
std::lock_guard<std::recursive_mutex> lock(_mutex);
if (_playerClient->pause()) {
decltype(_playerClient.get()) player = nullptr;
{
std::lock_guard<std::recursive_mutex> lock(_mutex);
player = _playerClient.get();
}
if (player && player->pause()) {
std::lock_guard<std::recursive_mutex> lock(_mutex);
_isPlaying = false;
return true;
}
else {
return false;
}
return false;
}
bool ANSFILEPLAYER::IsPaused() {
std::lock_guard<std::recursive_mutex> lock(_mutex);

View File

@@ -19,8 +19,31 @@ extern "C" {
#include "libavutil/frame.h"
}
#include <cuda_runtime.h>
#include <cstring>
#include <cstdlib>
#include <cstdio>
#ifdef _WIN32
#include <windows.h>
#endif
// Debug logging macro for GPU frame operations.
// Output goes to stderr (console) AND OutputDebugString (DebugView / VS debugger).
// Use Sysinternals DebugView (dbgview64.exe) to capture these after a crash.
#ifndef GPU_FRAME_DBG
#ifdef _WIN32
#define GPU_FRAME_DBG(fmt, ...) do { \
char _gpu_dbg_buf[512]; \
snprintf(_gpu_dbg_buf, sizeof(_gpu_dbg_buf), "[GpuFrameOps] " fmt "\n", ##__VA_ARGS__); \
OutputDebugStringA(_gpu_dbg_buf); \
fprintf(stderr, "%s", _gpu_dbg_buf); \
} while(0)
#else
#define GPU_FRAME_DBG(fmt, ...) \
fprintf(stderr, "[GpuFrameOps] " fmt "\n", ##__VA_ARGS__)
#endif
#endif
namespace anscv_gpu_ops {
namespace detail {
@@ -71,6 +94,42 @@ inline bool snapshotNV12Planes(const AVFrame* nv12,
return true;
}
// Drain pending GPU device pointers and actually cudaFree them.
// Must be called from a thread with CUDA context available.
inline void drainAndFreeGpuPending() {
auto gpuPending = ANSGpuFrameRegistry::instance().drain_gpu_pending();
if (gpuPending.empty()) return;
GPU_FRAME_DBG("drainGpuPending: freeing %zu GPU ptrs", gpuPending.size());
int prevDev = -1;
cudaGetDevice(&prevDev);
// Group by device to minimize cudaSetDevice calls and synchronize once per device.
// cudaDeviceSynchronize() is CRITICAL: NV12 kernels run on cv::cuda::Stream
// (not the default stream). cudaFree on stream 0 doesn't wait for other
// streams, so without this sync, cudaFree can free a buffer while a kernel
// on another stream is still reading from it → cudaErrorIllegalAddress (700)
// which permanently corrupts the CUDA context.
int lastSyncDev = -1;
for (auto& entry : gpuPending) {
if (entry.ptr) {
if (entry.deviceIdx >= 0)
cudaSetDevice(entry.deviceIdx);
if (entry.deviceIdx != lastSyncDev) {
cudaDeviceSynchronize();
lastSyncDev = entry.deviceIdx;
}
GPU_FRAME_DBG("drainGpuPending: cudaFree(%p) dev=%d", entry.ptr, entry.deviceIdx);
cudaError_t err = cudaFree(entry.ptr);
if (err != cudaSuccess) {
GPU_FRAME_DBG("drainGpuPending: cudaFree FAILED err=%d (%s)",
(int)err, cudaGetErrorString(err));
}
}
}
if (prevDev >= 0)
cudaSetDevice(prevDev);
}
} // namespace detail
} // namespace anscv_gpu_ops
@@ -117,36 +176,44 @@ inline void gpu_frame_attach(cv::Mat* mat, AVFrame* nv12, int gpuIdx, int64_t pt
}
}
// Attach CUDA HW frame — keeps CUDA device pointers for zero-copy inference.
// Attach CUDA HW frame — copies NV12 from NVDEC surfaces to owned GPU memory.
// TAKES OWNERSHIP of cudaFrame AND cpuNV12 — caller must NOT av_frame_free after.
//
// Primary path: yPlane/uvPlane point to CUDA device pointers from the cloned
// AVFrame (data[0]/data[1]). The cloned AVFrame keeps the NVDEC surface alive
// until gpu_frame_remove() is called after inference. With 4 cameras each
// holding ~1 surface, this uses 4 of NVDEC's 25-32 surface pool — safe.
// D2D copy path: cudaMemcpy2D from NVDEC surfaces to cudaMalloc'd buffers on the
// same GPU. This decouples the NV12 data lifetime from the NVDEC decoder, so
// player->close() can safely destroy the decoder at any time without invalidating
// pointers that inference engines may be reading. The NVDEC surface is freed
// immediately (av_frame_free), returning it to the decoder's surface pool.
//
// The owned GPU pointers are stored as both yPlane/uvPlane (for zero-copy reads)
// and gpuCacheY/gpuCacheUV (for lifecycle management / cudaFree on cleanup).
//
// VRAM budget: if the global GPU cache budget is exceeded, falls back to CPU-only
// NV12 snapshot (no zero-copy, but safe).
//
// Fallback: cpuYPlane/cpuUvPlane hold CPU-side NV12 snapshot for cross-GPU
// inference (when decode GPU != inference GPU, CUDA device ptrs aren't
// accessible from another GPU context).
// inference (when decode GPU != inference GPU).
inline void gpu_frame_attach_cuda(cv::Mat* mat, AVFrame* cudaFrame, int gpuIdx, int64_t pts,
AVFrame* cpuNV12 = nullptr) {
if (!mat || !cudaFrame) return;
if (!mat || !cudaFrame) {
GPU_FRAME_DBG("attach_cuda: SKIP mat=%p cudaFrame=%p", (void*)mat, (void*)cudaFrame);
return;
}
const int w = cudaFrame->width;
const int h = cudaFrame->height;
GPU_FRAME_DBG("attach_cuda: START mat=%p %dx%d gpu=%d nvdecY=%p nvdecUV=%p cpuNV12=%p",
(void*)mat, w, h, gpuIdx,
(void*)cudaFrame->data[0], (void*)cudaFrame->data[1], (void*)cpuNV12);
GpuFrameData data{};
data.gpuIndex = gpuIdx;
data.pts = pts;
data.width = cudaFrame->width;
data.height = cudaFrame->height;
data.pixelFormat = 23; // AV_PIX_FMT_NV12 — the underlying sw_format
data.width = w;
data.height = h;
data.pixelFormat = 23; // AV_PIX_FMT_NV12
// Primary: CUDA device pointers from NVDEC (zero-copy on same GPU)
data.isCudaDevicePtr = true;
data.yPlane = cudaFrame->data[0]; // CUDA device ptr: Y plane
data.uvPlane = cudaFrame->data[1]; // CUDA device ptr: UV plane
data.yLinesize = cudaFrame->linesize[0];
data.uvLinesize = cudaFrame->linesize[1];
// Fallback: snapshot CPU NV12 for cross-GPU inference
// Snapshot CPU NV12 for cross-GPU fallback (must do before freeing cpuNV12)
if (cpuNV12) {
anscv_gpu_ops::detail::snapshotNV12Planes(
cpuNV12,
@@ -155,9 +222,98 @@ inline void gpu_frame_attach_cuda(cv::Mat* mat, AVFrame* cudaFrame, int gpuIdx,
data.width, data.height);
}
// Store AVFrames for cleanup (cudaFrame keeps NVDEC surface alive)
data.avframe = cudaFrame;
data.cpuAvframe = cpuNV12;
// --- D2D copy: NVDEC surface → owned GPU memory ---
// Estimate VRAM needed for the owned NV12 copy
const size_t yBytes = static_cast<size_t>(w) * h;
const size_t uvBytes = static_cast<size_t>(w) * (h / 2);
const size_t totalBytes = yBytes + uvBytes;
bool d2dOk = false;
if (ANSGpuFrameRegistry::instance().canAllocateGpuCache(totalBytes)) {
int prevDev = -1;
cudaGetDevice(&prevDev);
if (gpuIdx >= 0)
cudaSetDevice(gpuIdx);
void* ownedY = nullptr;
void* ownedUV = nullptr;
size_t yPitch = 0;
size_t uvPitch = 0;
cudaError_t e1 = cudaMallocPitch(&ownedY, &yPitch, w, h);
cudaError_t e2 = cudaMallocPitch(&ownedUV, &uvPitch, w, h / 2);
if (e1 == cudaSuccess && e2 == cudaSuccess) {
cudaError_t e3 = cudaMemcpy2D(ownedY, yPitch,
cudaFrame->data[0], cudaFrame->linesize[0],
w, h, cudaMemcpyDeviceToDevice);
cudaError_t e4 = cudaMemcpy2D(ownedUV, uvPitch,
cudaFrame->data[1], cudaFrame->linesize[1],
w, h / 2, cudaMemcpyDeviceToDevice);
if (e3 == cudaSuccess && e4 == cudaSuccess) {
// Store owned GPU pointers as primary NV12 source
data.isCudaDevicePtr = true;
data.yPlane = static_cast<uint8_t*>(ownedY);
data.uvPlane = static_cast<uint8_t*>(ownedUV);
data.yLinesize = static_cast<int>(yPitch);
data.uvLinesize = static_cast<int>(uvPitch);
// Track in gpuCache for lifecycle management (cudaFree on cleanup)
data.gpuCacheY = ownedY;
data.gpuCacheUV = ownedUV;
data.gpuCacheYPitch = yPitch;
data.gpuCacheUVPitch = uvPitch;
data.gpuCacheDeviceIdx = gpuIdx;
data.gpuCacheValid = true;
data.gpuCacheBytes = yPitch * h + uvPitch * (h / 2);
ANSGpuFrameRegistry::instance().onGpuCacheCreated(data.gpuCacheBytes);
d2dOk = true;
GPU_FRAME_DBG("attach_cuda: D2D OK ownedY=%p ownedUV=%p yPitch=%zu uvPitch=%zu bytes=%zu",
ownedY, ownedUV, yPitch, uvPitch, data.gpuCacheBytes);
} else {
// D2D copy failed — free allocated memory and fall back
GPU_FRAME_DBG("attach_cuda: D2D COPY FAILED e3=%d e4=%d — fallback CPU",
(int)e3, (int)e4);
cudaFree(ownedY);
cudaFree(ownedUV);
}
} else {
// Allocation failed — free any partial allocation and fall back
GPU_FRAME_DBG("attach_cuda: cudaMallocPitch FAILED e1=%d e2=%d — fallback CPU",
(int)e1, (int)e2);
if (e1 == cudaSuccess) cudaFree(ownedY);
if (e2 == cudaSuccess) cudaFree(ownedUV);
}
if (prevDev >= 0)
cudaSetDevice(prevDev);
}
if (!d2dOk) {
// Fall back to CPU NV12 snapshot only (no zero-copy)
GPU_FRAME_DBG("attach_cuda: FALLBACK CPU-only cpuY=%p cpuUV=%p",
(void*)data.cpuYPlane, (void*)data.cpuUvPlane);
data.isCudaDevicePtr = false;
data.yPlane = data.cpuYPlane;
data.uvPlane = data.cpuUvPlane;
data.yLinesize = data.cpuYLinesize;
data.uvLinesize = data.cpuUvLinesize;
}
// Release AVFrames immediately — NVDEC surfaces returned to pool.
// No longer stored in GpuFrameData (owned GPU copy is independent).
GPU_FRAME_DBG("attach_cuda: freeing AVFrames cudaFrame=%p cpuNV12=%p",
(void*)cudaFrame, (void*)cpuNV12);
av_frame_free(&cudaFrame);
if (cpuNV12) av_frame_free(&cpuNV12);
data.avframe = nullptr;
data.cpuAvframe = nullptr;
GPU_FRAME_DBG("attach_cuda: FINAL yPlane=%p uvPlane=%p isCuda=%d gpuCacheY=%p gpuCacheUV=%p",
(void*)data.yPlane, (void*)data.uvPlane, (int)data.isCudaDevicePtr,
data.gpuCacheY, data.gpuCacheUV);
void* old = ANSGpuFrameRegistry::instance().attach(mat, std::move(data));
if (old) {
@@ -165,17 +321,23 @@ inline void gpu_frame_attach_cuda(cv::Mat* mat, AVFrame* cudaFrame, int gpuIdx,
av_frame_free(&oldFrame);
}
// Free stale AVFrames evicted by TTL or previous attach
auto pending = ANSGpuFrameRegistry::instance().drain_pending();
for (void* p : pending) {
AVFrame* stale = static_cast<AVFrame*>(p);
av_frame_free(&stale);
}
// Free stale GPU device pointers
anscv_gpu_ops::detail::drainAndFreeGpuPending();
}
// Release entry by cv::Mat* and free any returned AVFrames. Safe if not in map (no-op).
// Release entry by cv::Mat* and free any returned AVFrames + GPU pointers.
// Safe if not in map (no-op).
inline void gpu_frame_remove(cv::Mat* mat) {
if (!mat) return;
GPU_FRAME_DBG("gpu_frame_remove: mat=%p", (void*)mat);
ANSGpuFrameRegistry::instance().release(mat);
// Free any AVFrames that became pending from this release or prior eviction
@@ -186,13 +348,7 @@ inline void gpu_frame_remove(cv::Mat* mat) {
}
// Free any GPU device pointers that became pending
auto gpuPending = gpu_frame_drain_gpu_pending();
// NOTE: cudaFree requires CUDA context — caller must be on a CUDA-capable thread.
// If not, these will leak. In practice, gpu_frame_remove is called from ANSCV
// camera threads which do have CUDA context.
// For safety, we skip cudaFree here and let NV12PreprocessHelper handle it.
// The GPU pointers are tracked in the budget and will be accounted for.
(void)gpuPending;
anscv_gpu_ops::detail::drainAndFreeGpuPending();
}
// Alias for remove — used in ANSCV mutating functions to drop stale GPU data.
@@ -209,4 +365,7 @@ inline void gpu_frame_evict_stale() {
AVFrame* stale = static_cast<AVFrame*>(p);
av_frame_free(&stale);
}
// Free any GPU device pointers from evicted frames
anscv_gpu_ops::detail::drainAndFreeGpuPending();
}

View File

@@ -46,13 +46,19 @@ namespace ANSCENTER {
Destroy();
}
void ANSMJPEGClient::Destroy() {
std::lock_guard<std::recursive_mutex> lock(_mutex);
if (_playerClient) {
if (_isPlaying) {
_playerClient->stop();
_isPlaying = false;
decltype(_playerClient) clientToClose;
{
std::lock_guard<std::recursive_mutex> lock(_mutex);
if (_playerClient) {
if (_isPlaying) {
_playerClient->stop();
_isPlaying = false;
}
}
_playerClient->close();
clientToClose = std::move(_playerClient);
}
if (clientToClose) {
clientToClose->close();
}
}
static void VerifyGlobalANSMJPEGLicense(const std::string& licenseKey) {
@@ -129,8 +135,12 @@ namespace ANSCENTER {
}
}
bool ANSMJPEGClient::Reconnect() {
std::lock_guard<std::recursive_mutex> lock(_mutex);
{
std::lock_guard<std::recursive_mutex> lock(_mutex);
_isPlaying = false;
}
_playerClient->close();
std::lock_guard<std::recursive_mutex> lock(_mutex);
Setup();
_isPlaying = _playerClient->play();
return _isPlaying;
@@ -143,10 +153,16 @@ namespace ANSCENTER {
return _isPlaying;
}
bool ANSMJPEGClient::Stop() {
std::lock_guard<std::recursive_mutex> lock(_mutex);
if (_isPlaying) {
_playerClient->stop();
_isPlaying = false;
decltype(_playerClient.get()) player = nullptr;
{
std::lock_guard<std::recursive_mutex> lock(_mutex);
if (_isPlaying) {
_isPlaying = false;
player = _playerClient.get();
}
}
if (player) {
player->stop();
}
return true;
}

View File

@@ -48,13 +48,19 @@ namespace ANSCENTER {
Destroy();
}
void ANSRTMPClient::Destroy() {
std::lock_guard<std::recursive_mutex> lock(_mutex);
if (_playerClient) {
if (_isPlaying) {
_playerClient->stop();
_isPlaying = false;
decltype(_playerClient) clientToClose;
{
std::lock_guard<std::recursive_mutex> lock(_mutex);
if (_playerClient) {
if (_isPlaying) {
_playerClient->stop();
_isPlaying = false;
}
}
_playerClient->close();
clientToClose = std::move(_playerClient);
}
if (clientToClose) {
clientToClose->close();
}
}
static void VerifyGlobalANSRTMPLicense(const std::string& licenseKey) {
@@ -126,8 +132,12 @@ namespace ANSCENTER {
}
bool ANSRTMPClient::Reconnect() {
std::lock_guard<std::recursive_mutex> lock(_mutex);
{
std::lock_guard<std::recursive_mutex> lock(_mutex);
_isPlaying = false;
}
_playerClient->close();
std::lock_guard<std::recursive_mutex> lock(_mutex);
Setup();
_isPlaying = _playerClient->play();
return _isPlaying;
@@ -140,10 +150,16 @@ namespace ANSCENTER {
return _isPlaying;
}
bool ANSRTMPClient::Stop() {
std::lock_guard<std::recursive_mutex> lock(_mutex);
if (_isPlaying) {
_playerClient->stop();
_isPlaying = false;
decltype(_playerClient.get()) player = nullptr;
{
std::lock_guard<std::recursive_mutex> lock(_mutex);
if (_isPlaying) {
_isPlaying = false;
player = _playerClient.get();
}
}
if (player) {
player->stop();
}
return true;
}

View File

@@ -2,6 +2,7 @@
#include "ANSMatRegistry.h"
#include "ANSGpuFrameOps.h"
#include <memory>
#include <format>
#include "media_codec.h"
#include <cstdint>
#include <cuda_runtime.h>
@@ -21,6 +22,20 @@ extern "C"
}
// Note: per-instance thread safety is handled by ANSRTSPClient::_mutex
// Mat registry thread safety is handled by anscv_mat_replace's internal registry_mutex
// Debug logging — goes to both stderr AND OutputDebugString (DebugView).
#ifndef RTSP_DBG
#ifdef _WIN32
#define RTSP_DBG(fmt, ...) do { \
char _rtsp_buf[512]; \
snprintf(_rtsp_buf, sizeof(_rtsp_buf), fmt "\n", ##__VA_ARGS__); \
OutputDebugStringA(_rtsp_buf); \
fprintf(stderr, "%s", _rtsp_buf); \
} while(0)
#else
#define RTSP_DBG(fmt, ...) fprintf(stderr, fmt "\n", ##__VA_ARGS__)
#endif
#endif
static bool ansrtspLicenceValid = false;
// Global once_flag to protect license checking
static std::once_flag ansrtspLicenseOnceFlag;
@@ -48,19 +63,88 @@ namespace ANSCENTER {
Destroy();
}
void ANSRTSPClient::Destroy() {
std::lock_guard<std::recursive_mutex> lock(_mutex);
if (_playerClient) {
// Stop the stream first so the video decoder is flushed and
// the RTSP callback thread is no longer feeding frames into
// decode(). Without this, rtsp_close() can block waiting for
// CRtspClient::m_pMutex (held by the callback mid-decode),
// and the hardware decoder flush during destruction can hang
// on the GPU.
if (_isPlaying) {
_playerClient->stop();
_isPlaying = false;
// Move the player client pointer out of the lock scope, then
// close it OUTSIDE the mutex. close() calls cuArrayDestroy /
// cuMemFree which acquire an EXCLUSIVE SRW lock inside nvcuda64.
// If we hold _mutex during close(), and another thread holds
// the nvcuda64 SRW lock (e.g. cuStreamSynchronize during
// inference), we get a deadlock: Stop() → _mutex → nvcuda64
// vs inference → nvcuda64 → (blocked by exclusive waiter).
decltype(_playerClient) clientToClose;
{
std::unique_lock<std::recursive_mutex> lock(_mutex);
if (_playerClient) {
if (_isPlaying) {
_playerClient->stop();
_isPlaying = false;
}
}
_playerClient->close();
// --- Inference guard: wait for in-flight frames to finish ---
// GetRTSPCVImage increments _inFlightFrames when it hands out
// a GPU frame; the registry decrements it when the frame is
// released after inference completes. We wait here so that
// close() doesn't free NVDEC surfaces while TensorRT is
// still reading from them (the LabVIEW crash root cause).
int inFlight = _inFlightFrames.load(std::memory_order_acquire);
if (inFlight > 0) {
_logger.LogInfo("ANSRTSPClient::Destroy",
std::format("waiting for {} in-flight inference frame(s)...", inFlight),
__FILE__, __LINE__);
bool done = _inFlightDone.wait_for(lock, std::chrono::seconds(5), [this] {
return _inFlightFrames.load(std::memory_order_acquire) <= 0;
});
if (!done) {
_logger.LogWarn("ANSRTSPClient::Destroy",
std::format("timed out waiting for in-flight frames "
"(still {} in-flight) — force-releasing GPU frames",
_inFlightFrames.load()),
__FILE__, __LINE__);
}
}
// Force-release ALL GPU frames owned by this client BEFORE close().
// Unreleased clones (e.g. LabVIEW AI tasks still holding cloned
// cv::Mat*) keep gpuCacheY/gpuCacheUV allocated. We must cudaFree
// them NOW while the CUDA context is still alive. After close()
// destroys the context, cudaFree would crash.
int forceReleased = ANSGpuFrameRegistry::instance().forceReleaseByOwner(this);
if (forceReleased > 0) {
_logger.LogWarn("ANSRTSPClient::Destroy",
std::format("force-released {} GPU frame(s) with unreleased clones", forceReleased),
__FILE__, __LINE__);
// Drain and cudaFree the GPU buffers while CUDA context is alive
// Sync all GPU streams before freeing to avoid illegal access
cudaDeviceSynchronize();
auto gpuPending = ANSGpuFrameRegistry::instance().drain_gpu_pending();
if (!gpuPending.empty()) {
RTSP_DBG("[Destroy] cudaFree %zu GPU ptrs before close()", gpuPending.size());
int prevDev = -1;
cudaGetDevice(&prevDev);
for (auto& entry : gpuPending) {
if (entry.ptr) {
if (entry.deviceIdx >= 0) cudaSetDevice(entry.deviceIdx);
cudaFree(entry.ptr);
}
}
if (prevDev >= 0) cudaSetDevice(prevDev);
}
// Also drain any pending AVFrames
auto avPending = ANSGpuFrameRegistry::instance().drain_pending();
for (void* p : avPending) {
AVFrame* f = static_cast<AVFrame*>(p);
av_frame_free(&f);
}
}
ANSGpuFrameRegistry::instance().invalidateOwner(this);
_inFlightFrames.store(0, std::memory_order_release);
clientToClose = std::move(_playerClient);
}
// CUDA cleanup happens here, outside the mutex — now safe.
// All GPU frames owned by this client have been force-freed above.
if (clientToClose) {
clientToClose->close();
}
}
static void VerifyGlobalANSRTSPLicense(const std::string& licenseKey) {
@@ -146,10 +230,81 @@ namespace ANSCENTER {
_playerClient->setCrop(crop);
}
bool ANSRTSPClient::Reconnect() {
std::lock_guard<std::recursive_mutex> lock(_mutex);
// 1. Mark as not-playing under the mutex FIRST. This makes GetImage()
// return the cached _pLastFrame instead of calling into the player,
// preventing use-after-free when close() destroys CUDA resources.
{
std::unique_lock<std::recursive_mutex> lock(_mutex);
_isPlaying = false;
// --- Inference guard: wait for in-flight frames to finish ---
// Same guard as Destroy(): close() will free NVDEC surfaces, so
// we must wait for any inference engines still reading NV12 data
// via zero-copy CUDA device pointers.
int inFlight = _inFlightFrames.load(std::memory_order_acquire);
if (inFlight > 0) {
_logger.LogInfo("ANSRTSPClient::Reconnect",
std::format("waiting for {} in-flight inference frame(s)...", inFlight),
__FILE__, __LINE__);
bool done = _inFlightDone.wait_for(lock, std::chrono::seconds(5), [this] {
return _inFlightFrames.load(std::memory_order_acquire) <= 0;
});
if (!done) {
_logger.LogWarn("ANSRTSPClient::Reconnect",
std::format("timed out waiting for in-flight frames "
"(still {} in-flight) — force-releasing GPU frames",
_inFlightFrames.load()),
__FILE__, __LINE__);
}
}
// Force-release GPU frames before close() — same as Destroy().
int forceReleased = ANSGpuFrameRegistry::instance().forceReleaseByOwner(this);
if (forceReleased > 0) {
_logger.LogWarn("ANSRTSPClient::Reconnect",
std::format("force-released {} GPU frame(s) with unreleased clones", forceReleased),
__FILE__, __LINE__);
// Sync all GPU streams before freeing
cudaDeviceSynchronize();
auto gpuPending = ANSGpuFrameRegistry::instance().drain_gpu_pending();
if (!gpuPending.empty()) {
int prevDev = -1;
cudaGetDevice(&prevDev);
for (auto& entry : gpuPending) {
if (entry.ptr) {
if (entry.deviceIdx >= 0) cudaSetDevice(entry.deviceIdx);
cudaFree(entry.ptr);
}
}
if (prevDev >= 0) cudaSetDevice(prevDev);
}
auto avPending = ANSGpuFrameRegistry::instance().drain_pending();
for (void* p : avPending) {
AVFrame* f = static_cast<AVFrame*>(p);
av_frame_free(&f);
}
}
ANSGpuFrameRegistry::instance().invalidateOwner(this);
_inFlightFrames.store(0, std::memory_order_release);
}
// 2. close() does CUDA cleanup (cuArrayDestroy/cuMemFree) — run outside
// _mutex to avoid deadlocking with nvcuda64 SRW lock held by inference.
// Safe now because GetImage()/GetNV12Frame() won't touch the player
// while _isPlaying == false, and all in-flight frames have been released.
_logger.LogInfo("ANSRTSPClient::Reconnect",
"calling close() — NVDEC decoder will be destroyed", __FILE__, __LINE__);
RTSP_DBG("[Reconnect] BEFORE close() this=%p", (void*)this);
_playerClient->close();
RTSP_DBG("[Reconnect] AFTER close() this=%p", (void*)this);
// 3. Re-setup and play under the mutex.
std::lock_guard<std::recursive_mutex> lock(_mutex);
_logger.LogInfo("ANSRTSPClient::Reconnect",
"calling Setup() + play()", __FILE__, __LINE__);
Setup();
_isPlaying = _playerClient->play();
RTSP_DBG("[Reconnect] DONE isPlaying=%d this=%p", (int)_isPlaying, (void*)this);
return _isPlaying;
}
void ANSRTSPClient::EnableAudio(bool status) {
@@ -169,11 +324,23 @@ namespace ANSCENTER {
}
bool ANSRTSPClient::Stop() {
std::lock_guard<std::recursive_mutex> lock(_mutex);
if (_isPlaying) {
_playerClient->stop();
_isPlaying = false;
}
// Grab the player pointer and clear _isPlaying under the lock,
// then call stop() OUTSIDE the mutex. stop() internally calls
// StopVideoDecoder -> decoder->flush() which does CUDA calls
// that can block on the nvcuda64 SRW lock. Holding _mutex
// during that time blocks all other operations on this client
// and contributes to the convoy when many clients stop at once.
CRtspPlayer* player = nullptr;
{
std::lock_guard<std::recursive_mutex> lock(_mutex);
if (_isPlaying) {
_isPlaying = false;
player = _playerClient.get();
}
}
if (player) {
player->stop();
}
return true;
}
bool ANSRTSPClient::Pause() {
@@ -759,10 +926,12 @@ namespace ANSCENTER {
}
AVFrame* ANSRTSPClient::GetNV12Frame() {
std::lock_guard<std::recursive_mutex> lock(_mutex);
if (!_isPlaying) return nullptr; // Player may be mid-reconnect (CUDA resources freed)
return _playerClient->getNV12Frame(); // Returns clone, caller must av_frame_free
}
AVFrame* ANSRTSPClient::GetCudaHWFrame() {
std::lock_guard<std::recursive_mutex> lock(_mutex);
if (!_isPlaying) return nullptr; // Player may be mid-reconnect (CUDA resources freed)
return _playerClient->getCudaHWFrame();
}
bool ANSRTSPClient::IsCudaHWAccel() {
@@ -810,6 +979,11 @@ extern "C" __declspec(dllexport) int CreateANSRTSPHandle(ANSCENTER::ANSRTSPClien
if (_username.empty() && _password.empty()) result = ptr->Init(licenseKey, url);
else result = ptr->Init(licenseKey, username, password, url);
if (result) {
// Default to CUDA/NVDEC HW decoding (mode 7) for NV12 zero-copy
// fast path. LabVIEW may not call SetRTSPHWDecoding after
// destroy+recreate cycles, so this ensures the new handle always
// uses the GPU decode path instead of falling back to D3D11VA/CPU.
ptr->SetHWDecoding(7); // HW_DECODING_CUDA
*Handle = ptr.release();
extern void anscv_unregister_handle(void*);
extern void anscv_register_handle(void*, void(*)(void*));
@@ -830,9 +1004,37 @@ extern "C" __declspec(dllexport) int ReleaseANSRTSPHandle(ANSCENTER::ANSRTSPClie
try {
extern void anscv_unregister_handle(void*);
anscv_unregister_handle(*Handle);
// unique_ptr destructor calls ~ANSRTSPClient which calls Destroy() — no need to call Destroy() separately
std::unique_ptr<ANSCENTER::ANSRTSPClient> ptr(*Handle);
// Grab the raw pointer and NULL the caller's handle immediately.
// This prevents the caller (LabVIEW) from issuing new calls.
ANSCENTER::ANSRTSPClient* raw = *Handle;
*Handle = nullptr;
// Mark as not-playing under _mutex ONLY. This makes
// GetImage()/GetNV12Frame()/GetCudaHWFrame() return empty/null
// on any subsequent call, and prevents NEW NV12 GPU surface
// pointers from being handed out.
//
// Do NOT call Destroy()/close() here — close() frees the
// NVDEC GPU surfaces (cuArrayDestroy/cuMemFree) which may
// still be in use by a CUDA inference kernel that received
// the NV12 pointer from a GetRTSPCVImage call that already
// completed before this Release was called.
{
// Use the client's _mutex to safely set _isPlaying = false.
// This is the same lock GetImage/GetNV12Frame acquire.
raw->Stop(); // sets _isPlaying = false, stops playback
}
// Defer the full cleanup (Destroy + delete) to a background thread
// so LabVIEW's UI thread is not blocked. Destroy() now waits
// precisely for in-flight inference to finish (via _inFlightFrames
// counter + condition variable) instead of the old 500ms sleep hack.
std::thread([raw]() {
try { raw->Destroy(); } catch (...) {}
try { delete raw; } catch (...) {}
}).detach();
return 0;
} catch (...) {
if (Handle) *Handle = nullptr;
@@ -882,19 +1084,56 @@ extern "C" __declspec(dllexport) int GetRTSPCVImage(
// Attach NV12 frame for GPU fast-path inference (side-table registry)
// attach() takes ownership — do NOT av_frame_free here
//
// CRITICAL: TryIncrementInFlight() MUST be called BEFORE GetCudaHWFrame().
// It atomically checks _isPlaying and increments _inFlightFrames under
// the same mutex, so Reconnect() cannot call close() while we're doing
// the D2D copy from NVDEC surfaces inside gpu_frame_attach_cuda().
int gpuIdx = (*Handle)->GetHWDecodingGpuIndex();
AVFrame* cudaHW = (*Handle)->GetCudaHWFrame();
if (cudaHW) {
// CUDA zero-copy: frame data[0]/data[1] are CUDA device pointers.
// Also attach CPU NV12 as fallback for cross-GPU inference
// (when decode GPU != inference GPU, CUDA ptrs aren't accessible).
AVFrame* cpuNV12 = (*Handle)->GetNV12Frame();
gpu_frame_attach_cuda(*image, cudaHW, gpuIdx, timeStamp, cpuNV12);
} else {
AVFrame* nv12 = (*Handle)->GetNV12Frame();
if (nv12) {
gpu_frame_attach(*image, nv12, gpuIdx, timeStamp);
bool inFlightGuardHeld = (*Handle)->TryIncrementInFlight();
RTSP_DBG("[GetRTSPCVImage] mat=%p gpuIdx=%d inFlightGuard=%d",
(void*)*image, gpuIdx, (int)inFlightGuardHeld);
if (inFlightGuardHeld) {
AVFrame* cudaHW = (*Handle)->GetCudaHWFrame();
if (cudaHW) {
RTSP_DBG("[GetRTSPCVImage] cudaHW: %dx%d data[0]=%p data[1]=%p",
cudaHW->width, cudaHW->height,
(void*)cudaHW->data[0], (void*)cudaHW->data[1]);
AVFrame* cpuNV12 = (*Handle)->GetNV12Frame();
gpu_frame_attach_cuda(*image, cudaHW, gpuIdx, timeStamp, cpuNV12);
} else {
// HW decode not active — try CPU NV12
AVFrame* nv12 = (*Handle)->GetNV12Frame();
if (nv12) {
gpu_frame_attach(*image, nv12, gpuIdx, timeStamp);
}
}
// Wire up the registry callback to release the in-flight guard.
// TryIncrementInFlight already incremented; DecrementInFlight fires
// when the last clone of this frame is released after inference.
auto* gpuData = ANSGpuFrameRegistry::instance().lookup(*image);
RTSP_DBG("[GetRTSPCVImage] after attach: gpuData=%p yPlane=%p isCuda=%d gpuCacheY=%p",
(void*)gpuData,
gpuData ? (void*)gpuData->yPlane : nullptr,
gpuData ? (int)gpuData->isCudaDevicePtr : -1,
gpuData ? gpuData->gpuCacheY : nullptr);
if (gpuData) {
gpuData->ownerClient = *Handle;
gpuData->onReleaseFn = [](void* client) {
static_cast<ANSCENTER::ANSRTSPClient*>(client)->DecrementInFlight();
};
// NOTE: Do NOT call IncrementInFlight() again here —
// TryIncrementInFlight() already did it above.
} else {
// No gpuData registered (attach failed?) — release the guard
(*Handle)->DecrementInFlight();
}
} else {
// Player is stopping/reconnecting — skip CUDA path entirely.
// GetImage() already returned a cached BGR frame, which is safe.
RTSP_DBG("[GetRTSPCVImage] SKIP CUDA — player not playing (reconnecting?)");
}
return 1; // Success

View File

@@ -16,6 +16,8 @@
#include <opencv2/imgproc.hpp>
#include <opencv2/highgui.hpp>
#include <opencv2/opencv.hpp>
#include <atomic>
#include <condition_variable>
namespace ANSCENTER
{
@@ -37,7 +39,36 @@ namespace ANSCENTER
int64_t _pts;
bool _isPlaying;
std::recursive_mutex _mutex;
// --- Per-client inference guard ---
// Tracks how many GPU frames from this client are currently in-flight
// (grabbed by GetRTSPCVImage but not yet released after inference).
// Destroy() waits for this to reach 0 before freeing NVDEC surfaces,
// preventing the use-after-free crash when LabVIEW stops a camera
// while AI inference is still reading CUDA device pointers.
std::atomic<int> _inFlightFrames{0};
std::condition_variable_any _inFlightDone;
public:
// Record that one more GPU frame from this client has been handed out
// (grabbed by GetRTSPCVImage and not yet released after inference).
void IncrementInFlight() {
    _inFlightFrames.fetch_add(1, std::memory_order_acq_rel);
}
// Release one in-flight GPU frame. When the counter drops to zero,
// wake anyone blocked in Destroy() waiting for inference to finish
// with this client's CUDA surfaces.
void DecrementInFlight() {
    // fetch_sub returns the PREVIOUS value: previous <= 1 means the
    // counter has just reached zero (or gone below on a spurious extra
    // decrement), so the last in-flight frame is now released.
    const int previous = _inFlightFrames.fetch_sub(1, std::memory_order_acq_rel);
    if (previous <= 1) {
        _inFlightDone.notify_all();
    }
}
// Atomically check _isPlaying AND increment _inFlightFrames under the
// same mutex. Returns true if the caller may proceed to access CUDA
// resources (GetCudaHWFrame + D2D copy). Returns false if the player
// is stopping/reconnecting — caller must NOT touch CUDA resources.
//
// This closes the race window where Reconnect() sets _isPlaying=false
// and calls close() while GetRTSPCVImage is between GetCudaHWFrame()
// and the D2D copy in gpu_frame_attach_cuda().
// Atomically check _isPlaying and, only if still playing, take one
// in-flight reference — both under _mutex so a concurrent stop/close
// cannot slip in between the check and the increment. Returns true when
// the caller may touch CUDA resources; false when the player is
// stopping or reconnecting (caller must back off entirely).
bool TryIncrementInFlight() {
    std::lock_guard<std::recursive_mutex> guard(_mutex);
    if (_isPlaying) {
        _inFlightFrames.fetch_add(1, std::memory_order_acq_rel);
        return true;
    }
    return false;
}
ANSRTSPClient();
~ANSRTSPClient() noexcept;
[[nodiscard]] bool Init(std::string licenseKey, std::string url);

View File

@@ -48,13 +48,19 @@ namespace ANSCENTER {
Destroy();
}
void ANSSRTClient::Destroy() {
std::lock_guard<std::recursive_mutex> lock(_mutex);
if (_playerClient) {
if (_isPlaying) {
_playerClient->stop();
_isPlaying = false;
decltype(_playerClient) clientToClose;
{
std::lock_guard<std::recursive_mutex> lock(_mutex);
if (_playerClient) {
if (_isPlaying) {
_playerClient->stop();
_isPlaying = false;
}
}
_playerClient->close();
clientToClose = std::move(_playerClient);
}
if (clientToClose) {
clientToClose->close();
}
}
static void VerifyGlobalANSSRTLicense(const std::string& licenseKey) {
@@ -124,8 +130,12 @@ namespace ANSCENTER {
}
}
bool ANSSRTClient::Reconnect() {
std::lock_guard<std::recursive_mutex> lock(_mutex);
{
std::lock_guard<std::recursive_mutex> lock(_mutex);
_isPlaying = false;
}
_playerClient->close();
std::lock_guard<std::recursive_mutex> lock(_mutex);
Setup();
_isPlaying = _playerClient->play();
return _isPlaying;
@@ -155,10 +165,16 @@ namespace ANSCENTER {
}
bool ANSSRTClient::Stop() {
std::lock_guard<std::recursive_mutex> lock(_mutex);
if (_isPlaying) {
_playerClient->stop();
_isPlaying = false;
decltype(_playerClient.get()) player = nullptr;
{
std::lock_guard<std::recursive_mutex> lock(_mutex);
if (_isPlaying) {
_isPlaying = false;
player = _playerClient.get();
}
}
if (player) {
player->stop();
}
return true;
}

View File

@@ -40,33 +40,34 @@ namespace ANSCENTER {
catch (...) {}
}
void ANSVIDEOPLAYER::Destroy() {
std::lock_guard<std::recursive_mutex> lock(_mutex);
try {
// --- HW decode cleanup ---
if (_hwPlayer) {
try {
_hwPlayer->stop();
_hwPlayer->close();
} catch (...) {}
_hwPlayer.reset(); // releases CFilePlayer + HWDecoderPool slot
}
_hwDecodeActive = false;
_hwGpuIndex = -1;
_hwCudaAccel = false;
_hwEOF = false;
_hwFrameCount = 0;
// Move HW player out of lock scope — close() does CUDA cleanup
// (cuArrayDestroy/cuMemFree) which must not run under _mutex
// to avoid deadlocking with nvcuda64 SRW lock held by inference.
decltype(_hwPlayer) hwPlayerToClose;
{
std::lock_guard<std::recursive_mutex> lock(_mutex);
try {
if (_hwPlayer) {
try { _hwPlayer->stop(); } catch (...) {}
}
hwPlayerToClose = std::move(_hwPlayer);
_hwDecodeActive = false;
_hwGpuIndex = -1;
_hwCudaAccel = false;
_hwEOF = false;
_hwFrameCount = 0;
// --- cv::VideoCapture cleanup ---
_previousImage.release();
_inferenceImage.release();
_inferenceCloneCurr.release();
_inferenceClonePrev.release();
_lastJpegImage = "";
_isPlaying = false;
_resWidth = 0;
_resHeight = 0;
_currentFrame = 0;
_previousPTS = 0;
// --- cv::VideoCapture cleanup ---
_previousImage.release();
_inferenceImage.release();
_inferenceCloneCurr.release();
_inferenceClonePrev.release();
_lastJpegImage = "";
_isPlaying = false;
_resWidth = 0;
_resHeight = 0;
_currentFrame = 0;
_previousPTS = 0;
if (cap.isOpened()) {
cap.release();
}
@@ -77,6 +78,13 @@ namespace ANSCENTER {
catch (...) {
_logger.LogError("ANSVIDEOPLAYER::Destroy.", "Unknown exception", __FILE__, __LINE__);
}
} // end lock scope
// CUDA cleanup happens here, outside the mutex
if (hwPlayerToClose) {
try { hwPlayerToClose->close(); } catch (...) {}
hwPlayerToClose.reset();
}
}
static void VerifyGlobalANSVPLicense(const std::string& licenseKey) {
@@ -187,15 +195,25 @@ namespace ANSCENTER {
}
bool ANSVIDEOPLAYER::Reconnect() {
// HW decoder close() does CUDA cleanup — run outside _mutex
// to avoid deadlocking with nvcuda64 SRW lock held by inference.
decltype(_hwPlayer) hwPlayerToClose;
{
std::lock_guard<std::recursive_mutex> lock(_mutex);
_isPlaying = false; // GetImage() returns cached frame while we reconnect
if (_hwPlayer) {
try { _hwPlayer->stop(); } catch (...) {}
hwPlayerToClose = std::move(_hwPlayer);
}
}
if (hwPlayerToClose) {
try { hwPlayerToClose->close(); } catch (...) {}
hwPlayerToClose.reset();
}
std::lock_guard<std::recursive_mutex> lock(_mutex);
try {
_currentFrame = 0;
// --- HW decode: destroy and re-setup ---
if (_hwPlayer) {
try { _hwPlayer->stop(); _hwPlayer->close(); } catch (...) {}
_hwPlayer.reset();
}
_hwDecodeActive = false;
_hwGpuIndex = -1;
_hwCudaAccel = false;
@@ -266,41 +284,48 @@ namespace ANSCENTER {
}
}
bool ANSVIDEOPLAYER::Stop() {
std::lock_guard<std::recursive_mutex> lock(_mutex);
try {
// --- HW decode path ---
if (_hwDecodeActive && _hwPlayer) {
_hwPlayer->stop();
_isPlaying = false;
return true;
}
// --- cv::VideoCapture fallback ---
if (cap.isOpened()) {
try {
double frame_pos = cap.get(cv::CAP_PROP_POS_FRAMES);
if (frame_pos >= 0) {
_currentFrame = static_cast<int64_t>(frame_pos);
}
else {
_currentFrame = 0;
this->_logger.LogError("ANSVIDEOPLAYER::Stop. Exception occurred:", "Unable to retrieve current frame position", __FILE__, __LINE__);
}
decltype(_hwPlayer.get()) hwPlayer = nullptr;
{
std::lock_guard<std::recursive_mutex> lock(_mutex);
try {
// --- HW decode path ---
if (_hwDecodeActive && _hwPlayer) {
_isPlaying = false;
hwPlayer = _hwPlayer.get();
// stop() called outside the lock below; skip cap path
}
catch (const std::exception& e) {
this->_logger.LogError("ANSVIDEOPLAYER::Stop. Exception occurred:", e.what(), __FILE__, __LINE__);
_currentFrame = 0;
else {
// --- cv::VideoCapture fallback ---
if (cap.isOpened()) {
try {
double frame_pos = cap.get(cv::CAP_PROP_POS_FRAMES);
if (frame_pos >= 0) {
_currentFrame = static_cast<int64_t>(frame_pos);
}
else {
_currentFrame = 0;
this->_logger.LogError("ANSVIDEOPLAYER::Stop. Exception occurred:", "Unable to retrieve current frame position", __FILE__, __LINE__);
}
}
catch (const std::exception& e) {
this->_logger.LogError("ANSVIDEOPLAYER::Stop. Exception occurred:", e.what(), __FILE__, __LINE__);
_currentFrame = 0;
}
cap.release();
}
_isPlaying = false;
return true;
}
cap.release();
}
_isPlaying = false;
return true;
catch (const std::exception& e) {
this->_logger.LogError("ANSVIDEOPLAYER::Stop. Exception occurred:", e.what(), __FILE__, __LINE__);
return false;
}
}
catch (const std::exception& e) {
this->_logger.LogError("ANSVIDEOPLAYER::Stop. Exception occurred:", e.what(), __FILE__, __LINE__);
return false;
if (hwPlayer) {
hwPlayer->stop();
}
return true;
}
void ANSVIDEOPLAYER::SetBBox(cv::Rect bbox) {
std::lock_guard<std::recursive_mutex> lock(_mutex);