Fix NV12 crash when recreating the camera object

2026-04-02 22:07:27 +11:00
parent 4bedf3a3a2
commit 958cab6ae3
25 changed files with 1459 additions and 393 deletions
--- a/modules/ANSCV/ANSFLV.cpp
+++ b/modules/ANSCV/ANSFLV.cpp
@@ -46,13 +46,22 @@ namespace ANSCENTER {
 		Destroy();
    }
    void ANSFLVClient::Destroy() {
-        std::lock_guard<std::recursive_mutex> lock(_mutex);
-        if (_playerClient) {
-            if (_isPlaying) {
-                _playerClient->stop();
-                _isPlaying = false;
+        // Tear down the player.  Only the state change and the hand-over of
+        // _playerClient happen under _mutex; stop() and close() run on the
+        // detached player after the lock is released.  close() does CUDA
+        // cleanup (cuArrayDestroy/cuMemFree) which must not run under _mutex
+        // to avoid deadlocking with the nvcuda64 SRW lock held by inference,
+        // and Stop() already calls stop() outside the lock for the same
+        // reason, so Destroy() does the same.
+        // After this call _playerClient is moved-from (empty).
+        decltype(_playerClient) clientToClose;
+        bool wasPlaying = false;
+        {
+            std::lock_guard<std::recursive_mutex> lock(_mutex);
+            if (_playerClient && _isPlaying) {
+                wasPlaying = true;
+                _isPlaying = false;
            }
-            _playerClient->close();
+            clientToClose = std::move(_playerClient);
+        }
+        if (clientToClose) {
+            // We are the sole owner of the detached player here, so calling
+            // into it without _mutex cannot race with other members.
+            if (wasPlaying) {
+                clientToClose->stop();
+            }
+            clientToClose->close();
        }
    }
    static void VerifyGlobalANSFLVLicense(const std::string& licenseKey) {
@@ -129,8 +138,12 @@ namespace ANSCENTER {
        }
    }
    bool ANSFLVClient::Reconnect() {
-        std::lock_guard<std::recursive_mutex> lock(_mutex);
+        {
+            std::lock_guard<std::recursive_mutex> lock(_mutex);
+            _isPlaying = false;
+        }
        _playerClient->close();
+        std::lock_guard<std::recursive_mutex> lock(_mutex);
        Setup();
        _isPlaying = _playerClient->play();
        return _isPlaying;
@@ -143,10 +156,16 @@ namespace ANSCENTER {
        return _isPlaying;
    }
    bool ANSFLVClient::Stop() {
-        std::lock_guard<std::recursive_mutex> lock(_mutex);
-        if (_isPlaying) {
-            _playerClient->stop();
-            _isPlaying = false;
+        // Clear _isPlaying and capture the player pointer under _mutex, then
+        // call stop() after the lock has been released.  `player` stays
+        // nullptr when the client was not playing, so stop() is skipped.
+        // Always returns true.
+        // NOTE(review): `player` is a non-owning pointer.  Destroy() moves
+        // _playerClient into a local and lets it go out of scope, so a
+        // Destroy() running concurrently on another thread could release the
+        // object while stop() is still executing — confirm that callers
+        // serialize Stop() and Destroy().
+        decltype(_playerClient.get()) player = nullptr;
+        {
+            std::lock_guard<std::recursive_mutex> lock(_mutex);
+            if (_isPlaying) {
+                _isPlaying = false;
+                player = _playerClient.get();
+            }
+        }
+        if (player) {
+            player->stop();
        }
        return true;
    }
--- a/modules/ANSCV/ANSFilePlayer.cpp
+++ b/modules/ANSCV/ANSFilePlayer.cpp
@@ -39,22 +39,26 @@ namespace ANSCENTER {
 		catch (...) {}
 	}
 	void ANSFILEPLAYER::Destroy() {
-		std::lock_guard<std::recursive_mutex> lock(_mutex);
-		try {
-			_url = "";
-			_imageRotateDeg = 0;
-			_isPlaying = false;
-			_lastJpegImage = "";
-			_pLastFrame.release();
-			if (_playerClient) {
-				_playerClient->close();
+		// Reset the cached state (URL, rotation, playing flag, last JPEG and
+		// last frame) and detach _playerClient while holding _mutex, then
+		// close the detached player after the lock has been released.
+		// After this call _playerClient is moved-from (empty).
+		decltype(_playerClient) clientToClose;
+		{
+			std::lock_guard<std::recursive_mutex> lock(_mutex);
+			try {
+				_url = "";
+				_imageRotateDeg = 0;
+				_isPlaying = false;
+				_lastJpegImage = "";
+				_pLastFrame.release();
+				clientToClose = std::move(_playerClient);
+			}
+			// Exceptions from the reset above are logged and swallowed so
+			// that Destroy() itself never throws.
+			catch (const std::exception& e) {
+				_logger.LogError("ANSFILEPLAYER::Destroy. Exception:", e.what(), __FILE__, __LINE__);
+			}
+			catch (...) {
+				_logger.LogError("ANSFILEPLAYER::Destroy.", "Unknown exception", __FILE__, __LINE__);
 			}
 		}
-		catch (const std::exception& e) {
-			_logger.LogError("ANSFILEPLAYER::Destroy. Exception:", e.what(), __FILE__, __LINE__);
-		}
-		catch (...) {
-			_logger.LogError("ANSFILEPLAYER::Destroy.", "Unknown exception", __FILE__, __LINE__);
+		// NOTE(review): close() is now outside the try/catch, so an exception
+		// thrown by close() would propagate to the caller — confirm close()
+		// cannot throw.
+		if (clientToClose) {
+			clientToClose->close();
 		}
 	}
 	void ANSFILEPLAYER::CheckLicense() {
@@ -94,8 +98,12 @@ namespace ANSCENTER {
 		return _playerClient->open(_url);
 	}
 	bool ANSFILEPLAYER::Reconnect() {
-		std::lock_guard<std::recursive_mutex> lock(_mutex);
-		_playerClient->close();
+		// Close the current player and start it again.
+		// Returns the result of Start(), or false when the player has already
+		// been released by Destroy() (which leaves _playerClient moved-from).
+		//
+		// 1. Under _mutex: mark as not playing and capture the player pointer.
+		decltype(_playerClient.get()) player = nullptr;
+		{
+			std::lock_guard<std::recursive_mutex> lock(_mutex);
+			_isPlaying = false;
+			player = _playerClient.get();
+		}
+		if (!player) {
+			// Nothing to reconnect: dereferencing _playerClient would crash.
+			return false;
+		}
+		// 2. close() runs outside _mutex, like the close() in Destroy().
+		player->close();
+		// 3. Re-setup and start under _mutex.
+		std::lock_guard<std::recursive_mutex> lock(_mutex);
+		if (!_playerClient) {
+			// Destroy() released the player while close() was running.
+			return false;
+		}
 		Setup();
 		return Start();
 	}
@@ -105,14 +113,17 @@ namespace ANSCENTER {
 		return _isPlaying;
 	}
 	bool ANSFILEPLAYER::Stop() {
-		std::lock_guard<std::recursive_mutex> lock(_mutex);
-		if (_playerClient->pause()) {
+		// Pause the player.  The pointer is read under _mutex, pause() is
+		// called with the lock released, and _isPlaying is cleared under a
+		// second, short lock only when pause() succeeded.
+		// Returns true when pause() succeeded; false when there is no player
+		// or pause() failed.
+		// NOTE(review): `player` is a non-owning pointer used after the lock
+		// is released — confirm Destroy() cannot run concurrently.
+		decltype(_playerClient.get()) player = nullptr;
+		{
+			std::lock_guard<std::recursive_mutex> lock(_mutex);
+			player = _playerClient.get();
+		}
+		if (player && player->pause()) {
+			std::lock_guard<std::recursive_mutex> lock(_mutex);
 			_isPlaying = false;
 			return true;
 		}
-		else {
-			return false;
-		}
+		return false;
 	}
 	bool ANSFILEPLAYER::IsPaused() {
 		std::lock_guard<std::recursive_mutex> lock(_mutex);
--- a/modules/ANSCV/ANSGpuFrameOps.h
+++ b/modules/ANSCV/ANSGpuFrameOps.h
@@ -19,8 +19,31 @@ extern "C" {
 #include "libavutil/frame.h"
 }

+#include <cuda_runtime.h>
 #include <cstring>
 #include <cstdlib>
+#include <cstdio>
+
+#ifdef _WIN32
+#include <windows.h>
+#endif
+
+// Debug logging macro for GPU frame operations.
+// `fmt` must be a string literal (it is concatenated with the "[GpuFrameOps] "
+// prefix and a trailing "\n"); the remaining arguments are printf-style.
+// On Windows the message goes to stderr (console) AND OutputDebugString
+// (DebugView / VS debugger); it is formatted into a 512-byte stack buffer, so
+// longer messages are truncated.  On other platforms it goes to stderr only.
+// Use Sysinternals DebugView (dbgview64.exe) to capture these after a crash.
+// The #ifndef guard lets a translation unit supply its own GPU_FRAME_DBG
+// before including this header.
+#ifndef GPU_FRAME_DBG
+#ifdef _WIN32
+#define GPU_FRAME_DBG(fmt, ...) do { \
+    char _gpu_dbg_buf[512]; \
+    snprintf(_gpu_dbg_buf, sizeof(_gpu_dbg_buf), "[GpuFrameOps] " fmt "\n", ##__VA_ARGS__); \
+    OutputDebugStringA(_gpu_dbg_buf); \
+    fprintf(stderr, "%s", _gpu_dbg_buf); \
+} while(0)
+#else
+#define GPU_FRAME_DBG(fmt, ...) \
+    fprintf(stderr, "[GpuFrameOps] " fmt "\n", ##__VA_ARGS__)
+#endif
+#endif

 namespace anscv_gpu_ops {
 namespace detail {
@@ -71,6 +94,42 @@ inline bool snapshotNV12Planes(const AVFrame* nv12,
    return true;
 }

+// Drain pending GPU device pointers and actually cudaFree them.
+// Must be called from a thread with CUDA context available.
+//
+// This is a best-effort cleanup path: every CUDA return code is checked and
+// logged, but a failure never aborts the loop, so as many buffers as possible
+// are released.  The device that was current on entry is restored on exit.
+inline void drainAndFreeGpuPending() {
+    auto gpuPending = ANSGpuFrameRegistry::instance().drain_gpu_pending();
+    if (gpuPending.empty()) return;
+    GPU_FRAME_DBG("drainGpuPending: freeing %zu GPU ptrs", gpuPending.size());
+
+    // Device that was current when we were called (-1 if it cannot be queried).
+    int prevDev = -1;
+    if (cudaGetDevice(&prevDev) != cudaSuccess)
+        prevDev = -1;
+
+    // Entries are processed in queue order; the device is synchronized once
+    // each time the selected device changes (and always before the first free).
+    // cudaDeviceSynchronize() is CRITICAL: NV12 kernels run on cv::cuda::Stream
+    // (not the default stream).  cudaFree on stream 0 doesn't wait for other
+    // streams, so without this sync, cudaFree can free a buffer while a kernel
+    // on another stream is still reading from it → cudaErrorIllegalAddress (700)
+    // which permanently corrupts the CUDA context.
+    int  activeDev   = prevDev;  // device currently selected on this thread
+    int  lastSyncDev = -1;       // only meaningful once syncedOnce is true
+    bool syncedOnce  = false;    // explicit flag: -1 is a legal deviceIdx value
+    for (auto& entry : gpuPending) {
+        if (!entry.ptr) continue;
+
+        // Entries without a recorded device (deviceIdx < 0) are freed on the
+        // caller's device, not on whatever device an earlier entry selected.
+        const int targetDev = (entry.deviceIdx >= 0) ? entry.deviceIdx : prevDev;
+        if (targetDev >= 0 && targetDev != activeDev) {
+            cudaError_t setErr = cudaSetDevice(targetDev);
+            if (setErr == cudaSuccess) {
+                activeDev = targetDev;
+            } else {
+                GPU_FRAME_DBG("drainGpuPending: cudaSetDevice(%d) FAILED err=%d (%s)",
+                              targetDev, (int)setErr, cudaGetErrorString(setErr));
+            }
+        }
+
+        if (!syncedOnce || activeDev != lastSyncDev) {
+            cudaError_t syncErr = cudaDeviceSynchronize();
+            if (syncErr != cudaSuccess) {
+                GPU_FRAME_DBG("drainGpuPending: cudaDeviceSynchronize FAILED dev=%d err=%d (%s)",
+                              activeDev, (int)syncErr, cudaGetErrorString(syncErr));
+            }
+            syncedOnce  = true;
+            lastSyncDev = activeDev;
+        }
+
+        GPU_FRAME_DBG("drainGpuPending: cudaFree(%p) dev=%d", entry.ptr, entry.deviceIdx);
+        cudaError_t err = cudaFree(entry.ptr);
+        if (err != cudaSuccess) {
+            GPU_FRAME_DBG("drainGpuPending: cudaFree FAILED err=%d (%s)",
+                          (int)err, cudaGetErrorString(err));
+        }
+    }
+    if (prevDev >= 0 && activeDev != prevDev)
+        cudaSetDevice(prevDev);
+}
+
 } // namespace detail
 } // namespace anscv_gpu_ops

@@ -117,36 +176,44 @@ inline void gpu_frame_attach(cv::Mat* mat, AVFrame* nv12, int gpuIdx, int64_t pt
    }
 }

-// Attach CUDA HW frame — keeps CUDA device pointers for zero-copy inference.
+// Attach CUDA HW frame — copies NV12 from NVDEC surfaces to owned GPU memory.
 // TAKES OWNERSHIP of cudaFrame AND cpuNV12 — caller must NOT av_frame_free after.
 //
-// Primary path: yPlane/uvPlane point to CUDA device pointers from the cloned
-// AVFrame (data[0]/data[1]).  The cloned AVFrame keeps the NVDEC surface alive
-// until gpu_frame_remove() is called after inference.  With 4 cameras each
-// holding ~1 surface, this uses 4 of NVDEC's 25-32 surface pool — safe.
+// D2D copy path: cudaMemcpy2D from NVDEC surfaces to cudaMalloc'd buffers on the
+// same GPU.  This decouples the NV12 data lifetime from the NVDEC decoder, so
+// player->close() can safely destroy the decoder at any time without invalidating
+// pointers that inference engines may be reading.  The NVDEC surface is freed
+// immediately (av_frame_free), returning it to the decoder's surface pool.
+//
+// The owned GPU pointers are stored as both yPlane/uvPlane (for zero-copy reads)
+// and gpuCacheY/gpuCacheUV (for lifecycle management / cudaFree on cleanup).
+//
+// VRAM budget: if the global GPU cache budget is exceeded, falls back to CPU-only
+// NV12 snapshot (no zero-copy, but safe).
 //
 // Fallback: cpuYPlane/cpuUvPlane hold CPU-side NV12 snapshot for cross-GPU
-// inference (when decode GPU != inference GPU, CUDA device ptrs aren't
-// accessible from another GPU context).
+// inference (when decode GPU != inference GPU).
 inline void gpu_frame_attach_cuda(cv::Mat* mat, AVFrame* cudaFrame, int gpuIdx, int64_t pts,
                                   AVFrame* cpuNV12 = nullptr) {
-    if (!mat || !cudaFrame) return;
+    if (!mat || !cudaFrame) {
+        GPU_FRAME_DBG("attach_cuda: SKIP mat=%p cudaFrame=%p", (void*)mat, (void*)cudaFrame);
+        return;
+    }
+
+    const int w = cudaFrame->width;
+    const int h = cudaFrame->height;
+    GPU_FRAME_DBG("attach_cuda: START mat=%p %dx%d gpu=%d nvdecY=%p nvdecUV=%p cpuNV12=%p",
+                  (void*)mat, w, h, gpuIdx,
+                  (void*)cudaFrame->data[0], (void*)cudaFrame->data[1], (void*)cpuNV12);

    GpuFrameData data{};
    data.gpuIndex        = gpuIdx;
    data.pts             = pts;
-    data.width           = cudaFrame->width;
-    data.height          = cudaFrame->height;
-    data.pixelFormat     = 23; // AV_PIX_FMT_NV12 — the underlying sw_format
+    data.width           = w;
+    data.height          = h;
+    data.pixelFormat     = 23; // AV_PIX_FMT_NV12

-    // Primary: CUDA device pointers from NVDEC (zero-copy on same GPU)
-    data.isCudaDevicePtr = true;
-    data.yPlane          = cudaFrame->data[0];   // CUDA device ptr: Y plane
-    data.uvPlane         = cudaFrame->data[1];   // CUDA device ptr: UV plane
-    data.yLinesize       = cudaFrame->linesize[0];
-    data.uvLinesize      = cudaFrame->linesize[1];
-
-    // Fallback: snapshot CPU NV12 for cross-GPU inference
+    // Snapshot CPU NV12 for cross-GPU fallback (must do before freeing cpuNV12)
    if (cpuNV12) {
        anscv_gpu_ops::detail::snapshotNV12Planes(
            cpuNV12,
@@ -155,9 +222,98 @@ inline void gpu_frame_attach_cuda(cv::Mat* mat, AVFrame* cudaFrame, int gpuIdx,
            data.width, data.height);
    }

-    // Store AVFrames for cleanup (cudaFrame keeps NVDEC surface alive)
-    data.avframe    = cudaFrame;
-    data.cpuAvframe = cpuNV12;
+    // --- D2D copy: NVDEC surface → owned GPU memory ---
+    // Estimate VRAM needed for the owned NV12 copy
+    const size_t yBytes  = static_cast<size_t>(w) * h;
+    const size_t uvBytes = static_cast<size_t>(w) * (h / 2);
+    const size_t totalBytes = yBytes + uvBytes;
+
+    bool d2dOk = false;
+    if (ANSGpuFrameRegistry::instance().canAllocateGpuCache(totalBytes)) {
+        int prevDev = -1;
+        cudaGetDevice(&prevDev);
+        if (gpuIdx >= 0)
+            cudaSetDevice(gpuIdx);
+
+        void*  ownedY  = nullptr;
+        void*  ownedUV = nullptr;
+        size_t yPitch  = 0;
+        size_t uvPitch = 0;
+
+        cudaError_t e1 = cudaMallocPitch(&ownedY,  &yPitch,  w, h);
+        cudaError_t e2 = cudaMallocPitch(&ownedUV, &uvPitch, w, h / 2);
+
+        if (e1 == cudaSuccess && e2 == cudaSuccess) {
+            cudaError_t e3 = cudaMemcpy2D(ownedY,  yPitch,
+                                           cudaFrame->data[0], cudaFrame->linesize[0],
+                                           w, h, cudaMemcpyDeviceToDevice);
+            cudaError_t e4 = cudaMemcpy2D(ownedUV, uvPitch,
+                                           cudaFrame->data[1], cudaFrame->linesize[1],
+                                           w, h / 2, cudaMemcpyDeviceToDevice);
+
+            if (e3 == cudaSuccess && e4 == cudaSuccess) {
+                // Store owned GPU pointers as primary NV12 source
+                data.isCudaDevicePtr = true;
+                data.yPlane          = static_cast<uint8_t*>(ownedY);
+                data.uvPlane         = static_cast<uint8_t*>(ownedUV);
+                data.yLinesize       = static_cast<int>(yPitch);
+                data.uvLinesize      = static_cast<int>(uvPitch);
+
+                // Track in gpuCache for lifecycle management (cudaFree on cleanup)
+                data.gpuCacheY          = ownedY;
+                data.gpuCacheUV         = ownedUV;
+                data.gpuCacheYPitch     = yPitch;
+                data.gpuCacheUVPitch    = uvPitch;
+                data.gpuCacheDeviceIdx  = gpuIdx;
+                data.gpuCacheValid      = true;
+                data.gpuCacheBytes      = yPitch * h + uvPitch * (h / 2);
+
+                ANSGpuFrameRegistry::instance().onGpuCacheCreated(data.gpuCacheBytes);
+                d2dOk = true;
+                GPU_FRAME_DBG("attach_cuda: D2D OK ownedY=%p ownedUV=%p yPitch=%zu uvPitch=%zu bytes=%zu",
+                              ownedY, ownedUV, yPitch, uvPitch, data.gpuCacheBytes);
+            } else {
+                // D2D copy failed — free allocated memory and fall back
+                GPU_FRAME_DBG("attach_cuda: D2D COPY FAILED e3=%d e4=%d — fallback CPU",
+                              (int)e3, (int)e4);
+                cudaFree(ownedY);
+                cudaFree(ownedUV);
+            }
+        } else {
+            // Allocation failed — free any partial allocation and fall back
+            GPU_FRAME_DBG("attach_cuda: cudaMallocPitch FAILED e1=%d e2=%d — fallback CPU",
+                          (int)e1, (int)e2);
+            if (e1 == cudaSuccess) cudaFree(ownedY);
+            if (e2 == cudaSuccess) cudaFree(ownedUV);
+        }
+
+        if (prevDev >= 0)
+            cudaSetDevice(prevDev);
+    }
+
+    if (!d2dOk) {
+        // Fall back to CPU NV12 snapshot only (no zero-copy)
+        GPU_FRAME_DBG("attach_cuda: FALLBACK CPU-only cpuY=%p cpuUV=%p",
+                      (void*)data.cpuYPlane, (void*)data.cpuUvPlane);
+        data.isCudaDevicePtr = false;
+        data.yPlane          = data.cpuYPlane;
+        data.uvPlane         = data.cpuUvPlane;
+        data.yLinesize       = data.cpuYLinesize;
+        data.uvLinesize      = data.cpuUvLinesize;
+    }
+
+    // Release AVFrames immediately — NVDEC surfaces returned to pool.
+    // No longer stored in GpuFrameData (owned GPU copy is independent).
+    GPU_FRAME_DBG("attach_cuda: freeing AVFrames cudaFrame=%p cpuNV12=%p",
+                  (void*)cudaFrame, (void*)cpuNV12);
+    av_frame_free(&cudaFrame);
+    if (cpuNV12) av_frame_free(&cpuNV12);
+    data.avframe    = nullptr;
+    data.cpuAvframe = nullptr;
+
+    GPU_FRAME_DBG("attach_cuda: FINAL yPlane=%p uvPlane=%p isCuda=%d gpuCacheY=%p gpuCacheUV=%p",
+                  (void*)data.yPlane, (void*)data.uvPlane, (int)data.isCudaDevicePtr,
+                  data.gpuCacheY, data.gpuCacheUV);

    void* old = ANSGpuFrameRegistry::instance().attach(mat, std::move(data));
    if (old) {
@@ -165,17 +321,23 @@ inline void gpu_frame_attach_cuda(cv::Mat* mat, AVFrame* cudaFrame, int gpuIdx,
        av_frame_free(&oldFrame);
    }

+    // Free stale AVFrames evicted by TTL or previous attach
    auto pending = ANSGpuFrameRegistry::instance().drain_pending();
    for (void* p : pending) {
        AVFrame* stale = static_cast<AVFrame*>(p);
        av_frame_free(&stale);
    }
+
+    // Free stale GPU device pointers
+    anscv_gpu_ops::detail::drainAndFreeGpuPending();
 }

-// Release entry by cv::Mat* and free any returned AVFrames. Safe if not in map (no-op).
+// Release entry by cv::Mat* and free any returned AVFrames + GPU pointers.
+// Safe if not in map (no-op).
 inline void gpu_frame_remove(cv::Mat* mat) {
    if (!mat) return;

+    GPU_FRAME_DBG("gpu_frame_remove: mat=%p", (void*)mat);
    ANSGpuFrameRegistry::instance().release(mat);

    // Free any AVFrames that became pending from this release or prior eviction
@@ -186,13 +348,7 @@ inline void gpu_frame_remove(cv::Mat* mat) {
    }

    // Free any GPU device pointers that became pending
-    auto gpuPending = gpu_frame_drain_gpu_pending();
-    // NOTE: cudaFree requires CUDA context — caller must be on a CUDA-capable thread.
-    // If not, these will leak. In practice, gpu_frame_remove is called from ANSCV
-    // camera threads which do have CUDA context.
-    // For safety, we skip cudaFree here and let NV12PreprocessHelper handle it.
-    // The GPU pointers are tracked in the budget and will be accounted for.
-    (void)gpuPending;
+    anscv_gpu_ops::detail::drainAndFreeGpuPending();
 }

 // Alias for remove — used in ANSCV mutating functions to drop stale GPU data.
@@ -209,4 +365,7 @@ inline void gpu_frame_evict_stale() {
        AVFrame* stale = static_cast<AVFrame*>(p);
        av_frame_free(&stale);
    }
+
+    // Free any GPU device pointers from evicted frames
+    anscv_gpu_ops::detail::drainAndFreeGpuPending();
 }
--- a/modules/ANSCV/ANSMJPEG.cpp
+++ b/modules/ANSCV/ANSMJPEG.cpp
@@ -46,13 +46,19 @@ namespace ANSCENTER {
 		Destroy();
    }
    void ANSMJPEGClient::Destroy() {
-        std::lock_guard<std::recursive_mutex> lock(_mutex);
-        if (_playerClient) {
-            if (_isPlaying) {
-                _playerClient->stop();
-                _isPlaying = false;
+        // Tear down the player.  Only the state change and the hand-over of
+        // _playerClient happen under _mutex; stop() and close() run on the
+        // detached player after the lock is released, matching Stop(), which
+        // also calls stop() outside the lock.
+        // After this call _playerClient is moved-from (empty).
+        decltype(_playerClient) clientToClose;
+        bool wasPlaying = false;
+        {
+            std::lock_guard<std::recursive_mutex> lock(_mutex);
+            if (_playerClient && _isPlaying) {
+                wasPlaying = true;
+                _isPlaying = false;
            }
-            _playerClient->close();
+            clientToClose = std::move(_playerClient);
+        }
+        if (clientToClose) {
+            // We are the sole owner of the detached player here, so calling
+            // into it without _mutex cannot race with other members.
+            if (wasPlaying) {
+                clientToClose->stop();
+            }
+            clientToClose->close();
        }
    }
    static void VerifyGlobalANSMJPEGLicense(const std::string& licenseKey) {
@@ -129,8 +135,12 @@ namespace ANSCENTER {
        }
    }
    bool ANSMJPEGClient::Reconnect() {
-        std::lock_guard<std::recursive_mutex> lock(_mutex);
+        {
+            std::lock_guard<std::recursive_mutex> lock(_mutex);
+            _isPlaying = false;
+        }
        _playerClient->close();
+        std::lock_guard<std::recursive_mutex> lock(_mutex);
        Setup();
        _isPlaying = _playerClient->play();
        return _isPlaying;
@@ -143,10 +153,16 @@ namespace ANSCENTER {
        return _isPlaying;
    }
    bool ANSMJPEGClient::Stop() {
-        std::lock_guard<std::recursive_mutex> lock(_mutex);
-        if (_isPlaying) {
-            _playerClient->stop();
-            _isPlaying = false;
+        // Clear _isPlaying and capture the player pointer under _mutex, then
+        // call stop() after the lock has been released.  `player` stays
+        // nullptr when the client was not playing, so stop() is skipped.
+        // Always returns true.
+        // NOTE(review): `player` is a non-owning pointer.  Destroy() moves
+        // _playerClient into a local and lets it go out of scope, so a
+        // Destroy() running concurrently on another thread could release the
+        // object while stop() is still executing — confirm that callers
+        // serialize Stop() and Destroy().
+        decltype(_playerClient.get()) player = nullptr;
+        {
+            std::lock_guard<std::recursive_mutex> lock(_mutex);
+            if (_isPlaying) {
+                _isPlaying = false;
+                player = _playerClient.get();
+            }
+        }
+        if (player) {
+            player->stop();
        }
        return true;
    }
--- a/modules/ANSCV/ANSRTMP.cpp
+++ b/modules/ANSCV/ANSRTMP.cpp
@@ -48,13 +48,19 @@ namespace ANSCENTER {
 		Destroy();
    }
    void ANSRTMPClient::Destroy() {
-        std::lock_guard<std::recursive_mutex> lock(_mutex);
-        if (_playerClient) {
-            if (_isPlaying) {
-                _playerClient->stop();
-                _isPlaying = false;
+        // Tear down the player.  Only the state change and the hand-over of
+        // _playerClient happen under _mutex; stop() and close() run on the
+        // detached player after the lock is released, matching Stop(), which
+        // also calls stop() outside the lock.
+        // After this call _playerClient is moved-from (empty).
+        decltype(_playerClient) clientToClose;
+        bool wasPlaying = false;
+        {
+            std::lock_guard<std::recursive_mutex> lock(_mutex);
+            if (_playerClient && _isPlaying) {
+                wasPlaying = true;
+                _isPlaying = false;
            }
-            _playerClient->close();
+            clientToClose = std::move(_playerClient);
+        }
+        if (clientToClose) {
+            // We are the sole owner of the detached player here, so calling
+            // into it without _mutex cannot race with other members.
+            if (wasPlaying) {
+                clientToClose->stop();
+            }
+            clientToClose->close();
        }
    }
    static void VerifyGlobalANSRTMPLicense(const std::string& licenseKey) {
@@ -126,8 +132,12 @@ namespace ANSCENTER {
    }

    bool ANSRTMPClient::Reconnect() {
-        std::lock_guard<std::recursive_mutex> lock(_mutex);
+        {
+            std::lock_guard<std::recursive_mutex> lock(_mutex);
+            _isPlaying = false;
+        }
        _playerClient->close();
+        std::lock_guard<std::recursive_mutex> lock(_mutex);
        Setup();
        _isPlaying = _playerClient->play();
        return _isPlaying;
@@ -140,10 +150,16 @@ namespace ANSCENTER {
        return _isPlaying;
    }
    bool ANSRTMPClient::Stop() {
-        std::lock_guard<std::recursive_mutex> lock(_mutex);
-        if (_isPlaying) {
-            _playerClient->stop();
-            _isPlaying = false;
+        // Clear _isPlaying and capture the player pointer under _mutex, then
+        // call stop() after the lock has been released.  `player` stays
+        // nullptr when the client was not playing, so stop() is skipped.
+        // Always returns true.
+        // NOTE(review): `player` is a non-owning pointer.  Destroy() moves
+        // _playerClient into a local and lets it go out of scope, so a
+        // Destroy() running concurrently on another thread could release the
+        // object while stop() is still executing — confirm that callers
+        // serialize Stop() and Destroy().
+        decltype(_playerClient.get()) player = nullptr;
+        {
+            std::lock_guard<std::recursive_mutex> lock(_mutex);
+            if (_isPlaying) {
+                _isPlaying = false;
+                player = _playerClient.get();
+            }
+        }
+        if (player) {
+            player->stop();
        }
        return true;
    }
--- a/modules/ANSCV/ANSRTSP.cpp
+++ b/modules/ANSCV/ANSRTSP.cpp
@@ -2,6 +2,7 @@
 #include "ANSMatRegistry.h"
 #include "ANSGpuFrameOps.h"
 #include <memory>
+#include <format>
 #include "media_codec.h"
 #include <cstdint>
 #include <cuda_runtime.h>
@@ -21,6 +22,20 @@ extern "C"
 }
 // Note: per-instance thread safety is handled by ANSRTSPClient::_mutex
 // Mat registry thread safety is handled by anscv_mat_replace's internal registry_mutex
+
+// Debug logging for the RTSP client.
+// `fmt` must be a string literal (a trailing "\n" is appended by literal
+// concatenation); the remaining arguments are printf-style.
+// On Windows the message goes to both stderr AND OutputDebugString (DebugView)
+// and is formatted into a 512-byte stack buffer, so longer messages are
+// truncated.  On other platforms it goes to stderr only.
+// The #ifndef guard allows RTSP_DBG to be predefined elsewhere.
+#ifndef RTSP_DBG
+#ifdef _WIN32
+#define RTSP_DBG(fmt, ...) do { \
+    char _rtsp_buf[512]; \
+    snprintf(_rtsp_buf, sizeof(_rtsp_buf), fmt "\n", ##__VA_ARGS__); \
+    OutputDebugStringA(_rtsp_buf); \
+    fprintf(stderr, "%s", _rtsp_buf); \
+} while(0)
+#else
+#define RTSP_DBG(fmt, ...) fprintf(stderr, fmt "\n", ##__VA_ARGS__)
+#endif
+#endif
 static bool ansrtspLicenceValid = false;
 // Global once_flag to protect license checking
 static std::once_flag ansrtspLicenseOnceFlag;
@@ -48,19 +63,88 @@ namespace ANSCENTER {
 		Destroy();
    }
    void ANSRTSPClient::Destroy() {
-        std::lock_guard<std::recursive_mutex> lock(_mutex);
-        if (_playerClient) {
-            // Stop the stream first so the video decoder is flushed and
-            // the RTSP callback thread is no longer feeding frames into
-            // decode().  Without this, rtsp_close() can block waiting for
-            // CRtspClient::m_pMutex (held by the callback mid-decode),
-            // and the hardware decoder flush during destruction can hang
-            // on the GPU.
-            if (_isPlaying) {
-                _playerClient->stop();
-                _isPlaying = false;
+        // Tear down the player in three steps:
+        //   1. under _mutex: stop the stream, wait (up to 5 s) for in-flight
+        //      inference frames, force-release and free this client's GPU
+        //      frames, then detach _playerClient;
+        //   2. outside _mutex: close() the detached player.
+        // close() calls cuArrayDestroy / cuMemFree which acquire an EXCLUSIVE
+        // SRW lock inside nvcuda64.  If we hold _mutex during close(), and
+        // another thread holds the nvcuda64 SRW lock (e.g.
+        // cuStreamSynchronize during inference), we get a deadlock:
+        // Stop() → _mutex → nvcuda64 vs inference → nvcuda64 → (blocked by
+        // exclusive waiter).
+        // After this call _playerClient is moved-from (empty).
+        // NOTE(review): stop() and the cudaFree/cudaDeviceSynchronize calls
+        // below still run while _mutex is held — confirm they cannot hit the
+        // same nvcuda64 lock ordering described above.
+        decltype(_playerClient) clientToClose;
+        {
+            std::unique_lock<std::recursive_mutex> lock(_mutex);
+            if (_playerClient) {
+                if (_isPlaying) {
+                    _playerClient->stop();
+                    _isPlaying = false;
+                }
            }
-            _playerClient->close();
+
+            // --- Inference guard: wait for in-flight frames to finish ---
+            // GetRTSPCVImage increments _inFlightFrames when it hands out
+            // a GPU frame; the registry decrements it when the frame is
+            // released after inference completes.  wait_for() releases
+            // `lock` while waiting and gives up after 5 seconds.
+            int inFlight = _inFlightFrames.load(std::memory_order_acquire);
+            if (inFlight > 0) {
+                _logger.LogInfo("ANSRTSPClient::Destroy",
+                    std::format("waiting for {} in-flight inference frame(s)...", inFlight),
+                    __FILE__, __LINE__);
+                bool done = _inFlightDone.wait_for(lock, std::chrono::seconds(5), [this] {
+                    return _inFlightFrames.load(std::memory_order_acquire) <= 0;
+                });
+                if (!done) {
+                    _logger.LogWarn("ANSRTSPClient::Destroy",
+                        std::format("timed out waiting for in-flight frames "
+                                    "(still {} in-flight) — force-releasing GPU frames",
+                                    _inFlightFrames.load()),
+                        __FILE__, __LINE__);
+                }
+            }
+
+            // Force-release ALL GPU frames owned by this client BEFORE close().
+            // Unreleased clones (e.g. LabVIEW AI tasks still holding cloned
+            // cv::Mat*) keep gpuCacheY/gpuCacheUV allocated, so they are
+            // freed here, before the player (and its CUDA resources) goes away.
+            int forceReleased = ANSGpuFrameRegistry::instance().forceReleaseByOwner(this);
+            if (forceReleased > 0) {
+                _logger.LogWarn("ANSRTSPClient::Destroy",
+                    std::format("force-released {} GPU frame(s) with unreleased clones", forceReleased),
+                    __FILE__, __LINE__);
+                // Use the shared helper from ANSGpuFrameOps.h: it selects each
+                // entry's device and synchronizes THAT device before
+                // cudaFree, instead of synchronizing only the device that
+                // happens to be current on this thread.
+                ::anscv_gpu_ops::detail::drainAndFreeGpuPending();
+                // Also drain any pending AVFrames
+                auto avPending = ANSGpuFrameRegistry::instance().drain_pending();
+                for (void* p : avPending) {
+                    AVFrame* f = static_cast<AVFrame*>(p);
+                    av_frame_free(&f);
+                }
+            }
+            ANSGpuFrameRegistry::instance().invalidateOwner(this);
+            _inFlightFrames.store(0, std::memory_order_release);
+
+            clientToClose = std::move(_playerClient);
+        }
+        // CUDA cleanup happens here, outside the mutex — now safe.
+        // All GPU frames owned by this client have been force-freed above.
+        if (clientToClose) {
+            clientToClose->close();
        }
    }
    static void VerifyGlobalANSRTSPLicense(const std::string& licenseKey) {
@@ -146,10 +230,81 @@ namespace ANSCENTER {
        _playerClient->setCrop(crop);
    }
    bool ANSRTSPClient::Reconnect() {
-        std::lock_guard<std::recursive_mutex> lock(_mutex);
-        _playerClient->close();
+        // Close the current connection and play again.
+        // Returns the result of play(), or false when the player has already
+        // been released by Destroy() (which leaves _playerClient moved-from).
+        //
+        // 1. Mark as not-playing under the mutex FIRST.  This makes GetImage()
+        //    return the cached _pLastFrame instead of calling into the player,
+        //    preventing use-after-free when close() destroys CUDA resources.
+        decltype(_playerClient.get()) player = nullptr;
+        {
+            std::unique_lock<std::recursive_mutex> lock(_mutex);
+            _isPlaying = false;
+            player = _playerClient.get();
+            if (!player) {
+                // Nothing to reconnect: dereferencing _playerClient would crash.
+                _logger.LogWarn("ANSRTSPClient::Reconnect",
+                    "no player client (already destroyed) — nothing to reconnect",
+                    __FILE__, __LINE__);
+                return false;
+            }
+
+            // --- Inference guard: wait for in-flight frames to finish ---
+            // Same guard as Destroy(): wait_for() releases `lock` while
+            // waiting and gives up after 5 seconds.
+            int inFlight = _inFlightFrames.load(std::memory_order_acquire);
+            if (inFlight > 0) {
+                _logger.LogInfo("ANSRTSPClient::Reconnect",
+                    std::format("waiting for {} in-flight inference frame(s)...", inFlight),
+                    __FILE__, __LINE__);
+                bool done = _inFlightDone.wait_for(lock, std::chrono::seconds(5), [this] {
+                    return _inFlightFrames.load(std::memory_order_acquire) <= 0;
+                });
+                if (!done) {
+                    _logger.LogWarn("ANSRTSPClient::Reconnect",
+                        std::format("timed out waiting for in-flight frames "
+                                    "(still {} in-flight) — force-releasing GPU frames",
+                                    _inFlightFrames.load()),
+                        __FILE__, __LINE__);
+                }
+            }
+
+            // Force-release GPU frames before close() — same as Destroy().
+            int forceReleased = ANSGpuFrameRegistry::instance().forceReleaseByOwner(this);
+            if (forceReleased > 0) {
+                _logger.LogWarn("ANSRTSPClient::Reconnect",
+                    std::format("force-released {} GPU frame(s) with unreleased clones", forceReleased),
+                    __FILE__, __LINE__);
+                // Use the shared helper from ANSGpuFrameOps.h: it selects each
+                // entry's device and synchronizes THAT device before
+                // cudaFree, instead of synchronizing only the device that
+                // happens to be current on this thread.
+                ::anscv_gpu_ops::detail::drainAndFreeGpuPending();
+                auto avPending = ANSGpuFrameRegistry::instance().drain_pending();
+                for (void* p : avPending) {
+                    AVFrame* f = static_cast<AVFrame*>(p);
+                    av_frame_free(&f);
+                }
+            }
+            ANSGpuFrameRegistry::instance().invalidateOwner(this);
+            _inFlightFrames.store(0, std::memory_order_release);
+        }
+
+        // 2. close() does CUDA cleanup (cuArrayDestroy/cuMemFree) — run outside
+        //    _mutex to avoid deadlocking with nvcuda64 SRW lock held by inference.
+        //    GetNV12Frame()/GetCudaHWFrame() return nullptr while
+        //    _isPlaying == false, so they do not touch the player meanwhile.
+        _logger.LogInfo("ANSRTSPClient::Reconnect",
+            "calling close() — NVDEC decoder will be destroyed", __FILE__, __LINE__);
+        RTSP_DBG("[Reconnect] BEFORE close() this=%p", (void*)this);
+        player->close();
+        RTSP_DBG("[Reconnect] AFTER close() this=%p", (void*)this);
+
+        // 3. Re-setup and play under the mutex.
+        std::lock_guard<std::recursive_mutex> lock(_mutex);
+        if (!_playerClient) {
+            // Destroy() released the player while close() was running.
+            _logger.LogWarn("ANSRTSPClient::Reconnect",
+                "player client destroyed during reconnect — aborting", __FILE__, __LINE__);
+            return false;
+        }
+        _logger.LogInfo("ANSRTSPClient::Reconnect",
+            "calling Setup() + play()", __FILE__, __LINE__);
        Setup();
        _isPlaying = _playerClient->play();
+        RTSP_DBG("[Reconnect] DONE isPlaying=%d this=%p", (int)_isPlaying, (void*)this);
        return _isPlaying;
    }
    void ANSRTSPClient::EnableAudio(bool status) {
@@ -169,11 +324,23 @@ namespace ANSCENTER {
    }

     bool ANSRTSPClient::Stop() {
-        std::lock_guard<std::recursive_mutex> lock(_mutex);
-        if (_isPlaying) {
-            _playerClient->stop();
-            _isPlaying = false;
-		}
+        // Grab the player pointer and clear _isPlaying under the lock,
+        // then call stop() OUTSIDE the mutex.  stop() internally calls
+        // StopVideoDecoder -> decoder->flush() which does CUDA calls
+        // that can block on the nvcuda64 SRW lock.  Holding _mutex
+        // during that time blocks all other operations on this client
+        // and contributes to the convoy when many clients stop at once.
+        CRtspPlayer* player = nullptr;
+        {
+            std::lock_guard<std::recursive_mutex> lock(_mutex);
+            if (_isPlaying) {
+                _isPlaying = false;
+                player = _playerClient.get();
+            }
+        }
+        if (player) {
+            player->stop();
+        }
        return true;
    }
 	bool ANSRTSPClient::Pause() {
@@ -759,10 +926,12 @@ namespace ANSCENTER {
    }
    AVFrame* ANSRTSPClient::GetNV12Frame() {
        std::lock_guard<std::recursive_mutex> lock(_mutex);
+        if (!_isPlaying) return nullptr;  // Player may be mid-reconnect (CUDA resources freed)
        return _playerClient->getNV12Frame();  // Returns clone, caller must av_frame_free
    }
    AVFrame* ANSRTSPClient::GetCudaHWFrame() {
        std::lock_guard<std::recursive_mutex> lock(_mutex);
+        if (!_isPlaying) return nullptr;  // Player may be mid-reconnect (CUDA resources freed)
        return _playerClient->getCudaHWFrame();
    }
    bool ANSRTSPClient::IsCudaHWAccel() {
@@ -810,6 +979,11 @@ extern "C" __declspec(dllexport) int CreateANSRTSPHandle(ANSCENTER::ANSRTSPClien
        if (_username.empty() && _password.empty()) result = ptr->Init(licenseKey, url);
        else result = ptr->Init(licenseKey, username, password, url);
        if (result) {
+            // Default to CUDA/NVDEC HW decoding (mode 7) for NV12 zero-copy
+            // fast path.  LabVIEW may not call SetRTSPHWDecoding after
+            // destroy+recreate cycles, so this ensures the new handle always
+            // uses the GPU decode path instead of falling back to D3D11VA/CPU.
+            ptr->SetHWDecoding(7);  // HW_DECODING_CUDA
            *Handle = ptr.release();
            extern void anscv_unregister_handle(void*);
            extern void anscv_register_handle(void*, void(*)(void*));
@@ -830,9 +1004,37 @@ extern "C" __declspec(dllexport) int ReleaseANSRTSPHandle(ANSCENTER::ANSRTSPClie
    try {
        extern void anscv_unregister_handle(void*);
        anscv_unregister_handle(*Handle);
-        // unique_ptr destructor calls ~ANSRTSPClient which calls Destroy() — no need to call Destroy() separately
-        std::unique_ptr<ANSCENTER::ANSRTSPClient> ptr(*Handle);
+
+        // Grab the raw pointer and NULL the caller's handle immediately.
+        // This prevents the caller (LabVIEW) from issuing new calls.
+        ANSCENTER::ANSRTSPClient* raw = *Handle;
        *Handle = nullptr;
+
+        // Mark as not-playing under _mutex ONLY.  This makes
+        // GetImage()/GetNV12Frame()/GetCudaHWFrame() return empty/null
+        // on any subsequent call, and prevents NEW NV12 GPU surface
+        // pointers from being handed out.
+        //
+        // Do NOT call Destroy()/close() here — close() frees the
+        // NVDEC GPU surfaces (cuArrayDestroy/cuMemFree) which may
+        // still be in use by a CUDA inference kernel that received
+        // the NV12 pointer from a GetRTSPCVImage call that already
+        // completed before this Release was called.
+        {
+            // Stop() clears _isPlaying under the client's _mutex (the same lock
+            // GetImage/GetNV12Frame acquire), then stops the player outside it.
+            raw->Stop();  // sets _isPlaying = false, stops playback
+        }
+
+        // Defer the full cleanup (Destroy + delete) to a background thread
+        // so LabVIEW's UI thread is not blocked.  Destroy() now waits
+        // precisely for in-flight inference to finish (via _inFlightFrames
+        // counter + condition variable) instead of the old 500ms sleep hack.
+        std::thread([raw]() {
+            try { raw->Destroy(); } catch (...) {}
+            try { delete raw; } catch (...) {}
+        }).detach();
+
        return 0;
    } catch (...) {
        if (Handle) *Handle = nullptr;
@@ -882,19 +1084,56 @@ extern "C" __declspec(dllexport) int GetRTSPCVImage(

        // Attach NV12 frame for GPU fast-path inference (side-table registry)
        // attach() takes ownership — do NOT av_frame_free here
+        //
+        // CRITICAL: TryIncrementInFlight() MUST be called BEFORE GetCudaHWFrame().
+        // It atomically checks _isPlaying and increments _inFlightFrames under
+        // the same mutex, so Reconnect() cannot call close() while we're doing
+        // the D2D copy from NVDEC surfaces inside gpu_frame_attach_cuda().
        int gpuIdx = (*Handle)->GetHWDecodingGpuIndex();
-        AVFrame* cudaHW = (*Handle)->GetCudaHWFrame();
-        if (cudaHW) {
-            // CUDA zero-copy: frame data[0]/data[1] are CUDA device pointers.
-            // Also attach CPU NV12 as fallback for cross-GPU inference
-            // (when decode GPU != inference GPU, CUDA ptrs aren't accessible).
-            AVFrame* cpuNV12 = (*Handle)->GetNV12Frame();
-            gpu_frame_attach_cuda(*image, cudaHW, gpuIdx, timeStamp, cpuNV12);
-        } else {
-            AVFrame* nv12 = (*Handle)->GetNV12Frame();
-            if (nv12) {
-                gpu_frame_attach(*image, nv12, gpuIdx, timeStamp);
+        bool inFlightGuardHeld = (*Handle)->TryIncrementInFlight();
+        RTSP_DBG("[GetRTSPCVImage] mat=%p gpuIdx=%d inFlightGuard=%d",
+                (void*)*image, gpuIdx, (int)inFlightGuardHeld);
+
+        if (inFlightGuardHeld) {
+            AVFrame* cudaHW = (*Handle)->GetCudaHWFrame();
+            if (cudaHW) {
+                RTSP_DBG("[GetRTSPCVImage] cudaHW: %dx%d data[0]=%p data[1]=%p",
+                        cudaHW->width, cudaHW->height,
+                        (void*)cudaHW->data[0], (void*)cudaHW->data[1]);
+                AVFrame* cpuNV12 = (*Handle)->GetNV12Frame();
+                gpu_frame_attach_cuda(*image, cudaHW, gpuIdx, timeStamp, cpuNV12);
+            } else {
+                // HW decode not active — try CPU NV12
+                AVFrame* nv12 = (*Handle)->GetNV12Frame();
+                if (nv12) {
+                    gpu_frame_attach(*image, nv12, gpuIdx, timeStamp);
+                }
            }
+
+            // Wire up the registry callback to release the in-flight guard.
+            // TryIncrementInFlight already incremented; DecrementInFlight fires
+            // when the last clone of this frame is released after inference.
+            auto* gpuData = ANSGpuFrameRegistry::instance().lookup(*image);
+            RTSP_DBG("[GetRTSPCVImage] after attach: gpuData=%p yPlane=%p isCuda=%d gpuCacheY=%p",
+                    (void*)gpuData,
+                    gpuData ? (void*)gpuData->yPlane : nullptr,
+                    gpuData ? (int)gpuData->isCudaDevicePtr : -1,
+                    gpuData ? gpuData->gpuCacheY : nullptr);
+            if (gpuData) {
+                gpuData->ownerClient = *Handle;
+                gpuData->onReleaseFn = [](void* client) {
+                    static_cast<ANSCENTER::ANSRTSPClient*>(client)->DecrementInFlight();
+                };
+                // NOTE: Do NOT call IncrementInFlight() again here —
+                // TryIncrementInFlight() already did it above.
+            } else {
+                // No gpuData registered (attach failed?) — release the guard
+                (*Handle)->DecrementInFlight();
+            }
+        } else {
+            // Player is stopping/reconnecting — skip CUDA path entirely.
+            // GetImage() already returned a cached BGR frame, which is safe.
+            RTSP_DBG("[GetRTSPCVImage] SKIP CUDA — player not playing (reconnecting?)");
        }

        return 1;  // Success
--- a/modules/ANSCV/ANSRTSP.h
+++ b/modules/ANSCV/ANSRTSP.h
@@ -16,6 +16,8 @@
 #include <opencv2/imgproc.hpp>
 #include <opencv2/highgui.hpp>
 #include <opencv2/opencv.hpp>
+#include <atomic>
+#include <condition_variable>

 namespace ANSCENTER
 {
@@ -37,7 +39,36 @@ namespace ANSCENTER
 		int64_t  _pts;
 		bool	 _isPlaying;
 		std::recursive_mutex		_mutex;
+
+		// --- Per-client inference guard ---
+		// Tracks how many GPU frames from this client are currently in-flight
+		// (grabbed by GetRTSPCVImage but not yet released after inference).
+		// Destroy() waits for this to reach 0 before freeing NVDEC surfaces,
+		// preventing the use-after-free crash when LabVIEW stops a camera
+		// while AI inference is still reading CUDA device pointers.
+		std::atomic<int>            _inFlightFrames{0};
+		std::condition_variable_any _inFlightDone;
 	public:
+		void IncrementInFlight() { _inFlightFrames.fetch_add(1, std::memory_order_acq_rel); }
+		void DecrementInFlight() {
+			if (_inFlightFrames.fetch_sub(1, std::memory_order_acq_rel) <= 1) {
+				_inFlightDone.notify_all();
+			}
+		}
+		// Atomically check _isPlaying AND increment _inFlightFrames under the
+		// same mutex.  Returns true if the caller may proceed to access CUDA
+		// resources (GetCudaHWFrame + D2D copy).  Returns false if the player
+		// is stopping/reconnecting — caller must NOT touch CUDA resources.
+		//
+		// This closes the race window where Reconnect() sets _isPlaying=false
+		// and calls close() while GetRTSPCVImage is between GetCudaHWFrame()
+		// and the D2D copy in gpu_frame_attach_cuda().
+		bool TryIncrementInFlight() {
+			std::lock_guard<std::recursive_mutex> lock(_mutex);
+			if (!_isPlaying) return false;
+			_inFlightFrames.fetch_add(1, std::memory_order_acq_rel);
+			return true;
+		}
 		ANSRTSPClient();
 		~ANSRTSPClient() noexcept;
 		[[nodiscard]] bool Init(std::string licenseKey, std::string url);
--- a/modules/ANSCV/ANSSRT.cpp
+++ b/modules/ANSCV/ANSSRT.cpp
@@ -48,13 +48,19 @@ namespace ANSCENTER {
 		Destroy();
    }
    void ANSSRTClient::Destroy() {
-        std::lock_guard<std::recursive_mutex> lock(_mutex);
-        if (_playerClient) {
-            if (_isPlaying) {
-                _playerClient->stop();
-                _isPlaying = false;
+        decltype(_playerClient) clientToClose;
+        {
+            std::lock_guard<std::recursive_mutex> lock(_mutex);
+            if (_playerClient) {
+                if (_isPlaying) {
+                    _playerClient->stop();
+                    _isPlaying = false;
+                }
            }
-            _playerClient->close();
+            clientToClose = std::move(_playerClient);
+        }
+        if (clientToClose) {
+            clientToClose->close();
        }
    }
    static void VerifyGlobalANSSRTLicense(const std::string& licenseKey) {
@@ -124,8 +130,12 @@ namespace ANSCENTER {
        }
    }
    bool ANSSRTClient::Reconnect() {
-        std::lock_guard<std::recursive_mutex> lock(_mutex);
+        {
+            std::lock_guard<std::recursive_mutex> lock(_mutex);
+            _isPlaying = false;
+        }
        _playerClient->close();
+        std::lock_guard<std::recursive_mutex> lock(_mutex);
        Setup();
        _isPlaying = _playerClient->play();
        return _isPlaying;
@@ -155,10 +165,16 @@ namespace ANSCENTER {
    }

    bool ANSSRTClient::Stop() {
-        std::lock_guard<std::recursive_mutex> lock(_mutex);
-        if (_isPlaying) {
-            _playerClient->stop();
-            _isPlaying = false;
+        decltype(_playerClient.get()) player = nullptr;
+        {
+            std::lock_guard<std::recursive_mutex> lock(_mutex);
+            if (_isPlaying) {
+                _isPlaying = false;
+                player = _playerClient.get();
+            }
+        }
+        if (player) {
+            player->stop();
        }
        return true;
    }
--- a/modules/ANSCV/ANSVideoPlayer.cpp
+++ b/modules/ANSCV/ANSVideoPlayer.cpp
@@ -40,33 +40,34 @@ namespace ANSCENTER {
 		catch (...) {}
 	}
 	void ANSVIDEOPLAYER::Destroy() {
-		std::lock_guard<std::recursive_mutex> lock(_mutex);
-		try {
-			// --- HW decode cleanup ---
-			if (_hwPlayer) {
-				try {
-					_hwPlayer->stop();
-					_hwPlayer->close();
-				} catch (...) {}
-				_hwPlayer.reset();  // releases CFilePlayer + HWDecoderPool slot
-			}
-			_hwDecodeActive = false;
-			_hwGpuIndex = -1;
-			_hwCudaAccel = false;
-			_hwEOF = false;
-			_hwFrameCount = 0;
+		// Move HW player out of lock scope — close() does CUDA cleanup
+		// (cuArrayDestroy/cuMemFree) which must not run under _mutex
+		// to avoid deadlocking with nvcuda64 SRW lock held by inference.
+		decltype(_hwPlayer) hwPlayerToClose;
+		{
+			std::lock_guard<std::recursive_mutex> lock(_mutex);
+			try {
+				if (_hwPlayer) {
+					try { _hwPlayer->stop(); } catch (...) {}
+				}
+				hwPlayerToClose = std::move(_hwPlayer);
+				_hwDecodeActive = false;
+				_hwGpuIndex = -1;
+				_hwCudaAccel = false;
+				_hwEOF = false;
+				_hwFrameCount = 0;

-			// --- cv::VideoCapture cleanup ---
-			_previousImage.release();
-			_inferenceImage.release();
-			_inferenceCloneCurr.release();
-			_inferenceClonePrev.release();
-			_lastJpegImage = "";
-			_isPlaying = false;
-			_resWidth = 0;
-			_resHeight = 0;
-			_currentFrame = 0;
-			_previousPTS = 0;
+				// --- cv::VideoCapture cleanup ---
+				_previousImage.release();
+				_inferenceImage.release();
+				_inferenceCloneCurr.release();
+				_inferenceClonePrev.release();
+				_lastJpegImage = "";
+				_isPlaying = false;
+				_resWidth = 0;
+				_resHeight = 0;
+				_currentFrame = 0;
+				_previousPTS = 0;
 			if (cap.isOpened()) {
 				cap.release();
 			}
@@ -77,6 +78,13 @@ namespace ANSCENTER {
 		catch (...) {
 			_logger.LogError("ANSVIDEOPLAYER::Destroy.", "Unknown exception", __FILE__, __LINE__);
 		}
+		} // end lock scope
+
+		// CUDA cleanup happens here, outside the mutex
+		if (hwPlayerToClose) {
+			try { hwPlayerToClose->close(); } catch (...) {}
+			hwPlayerToClose.reset();
+		}
 	}

 	static void VerifyGlobalANSVPLicense(const std::string& licenseKey) {
@@ -187,15 +195,25 @@ namespace ANSCENTER {
 	}
 	
 	bool ANSVIDEOPLAYER::Reconnect() {
+		// HW decoder close() does CUDA cleanup — run outside _mutex
+		// to avoid deadlocking with nvcuda64 SRW lock held by inference.
+		decltype(_hwPlayer) hwPlayerToClose;
+		{
+			std::lock_guard<std::recursive_mutex> lock(_mutex);
+			_isPlaying = false;  // GetImage() returns cached frame while we reconnect
+			if (_hwPlayer) {
+				try { _hwPlayer->stop(); } catch (...) {}
+				hwPlayerToClose = std::move(_hwPlayer);
+			}
+		}
+		if (hwPlayerToClose) {
+			try { hwPlayerToClose->close(); } catch (...) {}
+			hwPlayerToClose.reset();
+		}
+
 		std::lock_guard<std::recursive_mutex> lock(_mutex);
 		try {
 			_currentFrame = 0;
-
-			// --- HW decode: destroy and re-setup ---
-			if (_hwPlayer) {
-				try { _hwPlayer->stop(); _hwPlayer->close(); } catch (...) {}
-				_hwPlayer.reset();
-			}
 			_hwDecodeActive = false;
 			_hwGpuIndex = -1;
 			_hwCudaAccel = false;
@@ -266,41 +284,48 @@ namespace ANSCENTER {
 		}
 	}
 	bool ANSVIDEOPLAYER::Stop() {
-		std::lock_guard<std::recursive_mutex> lock(_mutex);
-		try {
-			// --- HW decode path ---
-			if (_hwDecodeActive && _hwPlayer) {
-				_hwPlayer->stop();
-				_isPlaying = false;
-				return true;
-			}
-
-			// --- cv::VideoCapture fallback ---
-			if (cap.isOpened()) {
-				try {
-					double frame_pos = cap.get(cv::CAP_PROP_POS_FRAMES);
-					if (frame_pos >= 0) {
-						_currentFrame = static_cast<int64_t>(frame_pos);
-					}
-					else {
-						_currentFrame = 0;
-						this->_logger.LogError("ANSVIDEOPLAYER::Stop. Exception occurred:", "Unable to retrieve current frame position", __FILE__, __LINE__);
-					}
+		decltype(_hwPlayer.get()) hwPlayer = nullptr;
+		{
+			std::lock_guard<std::recursive_mutex> lock(_mutex);
+			try {
+				// --- HW decode path ---
+				if (_hwDecodeActive && _hwPlayer) {
+					_isPlaying = false;
+					hwPlayer = _hwPlayer.get();
+					// stop() called outside the lock below; skip cap path
 				}
-				catch (const std::exception& e) {
-					this->_logger.LogError("ANSVIDEOPLAYER::Stop. Exception occurred:", e.what(), __FILE__, __LINE__);
-					_currentFrame = 0;
+				else {
+					// --- cv::VideoCapture fallback ---
+					if (cap.isOpened()) {
+						try {
+							double frame_pos = cap.get(cv::CAP_PROP_POS_FRAMES);
+							if (frame_pos >= 0) {
+								_currentFrame = static_cast<int64_t>(frame_pos);
+							}
+							else {
+								_currentFrame = 0;
+								this->_logger.LogError("ANSVIDEOPLAYER::Stop. Exception occurred:", "Unable to retrieve current frame position", __FILE__, __LINE__);
+							}
+						}
+						catch (const std::exception& e) {
+							this->_logger.LogError("ANSVIDEOPLAYER::Stop. Exception occurred:", e.what(), __FILE__, __LINE__);
+							_currentFrame = 0;
+						}
+						cap.release();
+					}
+					_isPlaying = false;
+					return true;
 				}
-				cap.release();
 			}
-			_isPlaying = false;
-			return true;
+			catch (const std::exception& e) {
+				this->_logger.LogError("ANSVIDEOPLAYER::Stop. Exception occurred:", e.what(), __FILE__, __LINE__);
+				return false;
+			}
 		}
-		catch (const std::exception& e) {
-			this->_logger.LogError("ANSVIDEOPLAYER::Stop. Exception occurred:", e.what(), __FILE__, __LINE__);
-			return false;
+		if (hwPlayer) {
+			hwPlayer->stop();
 		}
-
+		return true;
 	}
 	void ANSVIDEOPLAYER::SetBBox(cv::Rect bbox) {
 		std::lock_guard<std::recursive_mutex> lock(_mutex);
--- a/modules/ANSLPR/ANSLPR_CPU.cpp
+++ b/modules/ANSLPR/ANSLPR_CPU.cpp
@@ -378,7 +378,7 @@ namespace ANSCENTER {
        }
    }
    std::vector<Object> ANSALPR_CPU::RunInference(const cv::Mat& input, const std::string &cameraId) {
-        std::lock_guard<std::recursive_mutex> lock(_mutex);
+        // No coarse _mutex — sub-components have their own fine-grained locks.
        std::vector<Object> output;
        output.clear();
        // Initial validation
@@ -419,17 +419,18 @@ namespace ANSCENTER {
 #ifdef FNS_DEBUG  // Corrected preprocessor directive
            cv::Mat draw = input.clone();
 #endif 
-            _detectedArea = cv::Rect(0, 0, frame.cols, frame.rows);
-            if ((_detectedArea.width > 50) && (_detectedArea.height > 50)) {
+            // Use local variable instead of shared _detectedArea for thread safety
+            cv::Rect detectedArea(0, 0, frame.cols, frame.rows);
+            if ((detectedArea.width > 50) && (detectedArea.height > 50)) {
 #ifdef FNS_DEBUG  // Corrected preprocessor directive
-                cv::rectangle(draw, _detectedArea, cv::Scalar(0, 0, 255), 2); // RED for detectedArea
-#endif 
+                cv::rectangle(draw, detectedArea, cv::Scalar(0, 0, 255), 2); // RED for detectedArea
+#endif
                // Ensure _lprDetector is valid
                if (!_lprDetector) {
                    this->_logger.LogFatal("ANSALPR_CPU::Inference", "_lprDetector is null", __FILE__, __LINE__);
                    return output;
                }
-                cv::Mat activeFrame = frame(_detectedArea).clone();
+                cv::Mat activeFrame = frame(detectedArea).clone();

                //std::vector<Object> lprOutputRaw = _lpDetector->RunInference(activeFrame, cameraId);
                //std::vector<Object> lprOutput = AdjustLicensePlateBoundingBoxes(lprOutputRaw, _detectedArea, frame.size(), 3.0);
@@ -471,8 +472,12 @@ namespace ANSCENTER {
                            lprObject.cameraId = cameraId;
                            lprObject.polygon = RectToNormalizedPolygon(lprObject.box, input.cols, input.rows);

-                            // OCR inference
-                            std::vector<PaddleOCR::OCRPredictResult> res_ocr = ppocr->ocr(alignedLPR);
+                            // OCR inference (ppocr is not thread-safe, use fine-grained lock)
+                            std::vector<PaddleOCR::OCRPredictResult> res_ocr;
+                            {
+                                std::lock_guard<std::mutex> ocrLock(_ocrMutex);
+                                res_ocr = ppocr->ocr(alignedLPR);
+                            }
                            std::string ocrText;

                            if (!res_ocr.empty() && res_ocr.size() < 3) {
@@ -515,13 +520,13 @@ namespace ANSCENTER {
        return output;
    }
    bool ANSALPR_CPU::Inference(const cv::Mat& input, std::string& lprResult) {
-        std::lock_guard<std::recursive_mutex> lock(_mutex);
+        // No coarse _mutex — delegates to Inference(input, lprResult, cameraId)
        if (input.empty()) return false;
        if ((input.cols < 5) || (input.rows < 5)) return false;
 		return Inference(input, lprResult, "CustomCam");
    }
    bool ANSALPR_CPU::Inference(const cv::Mat& input, std::string& lprResult, const std::string & cameraId) {
-        std::lock_guard<std::recursive_mutex> lock(_mutex);
+        // No coarse _mutex — sub-components have fine-grained locks.
        std::vector<Object> output;
        output.clear();
        if (!_licenseValid) {
@@ -587,10 +592,15 @@ namespace ANSCENTER {
                        cv::Mat lprImage = frame(lprPos).clone();
                        lprObject.cameraId = cameraId;
                        lprObject.polygon = RectToNormalizedPolygon(lprObject.box, input.cols, input.rows);
-                        std::vector<PaddleOCR::OCRPredictResult>  res_ocr = ppocr->ocr(lprImage);
+                        // ppocr is not thread-safe, use fine-grained lock
+                        std::vector<PaddleOCR::OCRPredictResult> res_ocr;
+                        {
+                            std::lock_guard<std::mutex> ocrLock(_ocrMutex);
+                            res_ocr = ppocr->ocr(lprImage);
+                        }
                        int detectionSize = res_ocr.size();
                        if ((detectionSize > 0) && (detectionSize < 3)) {
-                            for (int n = 0; n < res_ocr.size(); n++) { // number of detections                        
+                            for (int n = 0; n < res_ocr.size(); n++) { // number of detections
                                ocrText.append(res_ocr[n].text);
                            }
 							std::string rawText = AnalyseLicensePlateText(ocrText);
@@ -613,7 +623,7 @@ namespace ANSCENTER {
        }
    }
    bool ANSALPR_CPU::Inference(const cv::Mat& input, const std::vector<cv::Rect> & Bbox, std::string& lprResult) {
-        std::lock_guard<std::recursive_mutex> lock(_mutex);
+        // No coarse _mutex — delegates to Inference(input, Bbox, lprResult, cameraId)
        if (input.empty()) return false;
        if ((input.cols < 5) || (input.rows < 5)) return false;
        return Inference(input, Bbox, lprResult, "CustomCam");
@@ -622,7 +632,7 @@ namespace ANSCENTER {
    bool ANSALPR_CPU::Inference(const cv::Mat& input, const std::vector<cv::Rect>& Bbox,
        std::string& lprResult, const std::string& cameraId)
    {
-        std::lock_guard<std::recursive_mutex> lock(_mutex);
+        // No coarse _mutex — sub-components have fine-grained locks.

        // Early validation
        if (!_licenseValid) {
@@ -668,16 +678,12 @@ namespace ANSCENTER {
        }

        try {
-            // Convert grayscale to BGR if necessary
-            const cv::Mat* framePtr;
+            // Convert grayscale to BGR if necessary (use local buffer for thread safety)
+            cv::Mat localFrame;
            if (input.channels() == 1) {
-                cv::cvtColor(input, this->_frameBuffer, cv::COLOR_GRAY2BGR);
-                framePtr = &this->_frameBuffer;
+                cv::cvtColor(input, localFrame, cv::COLOR_GRAY2BGR);
            }
-            else {
-                framePtr = &input;
-            }
-            const cv::Mat& frame = *framePtr;
+            const cv::Mat& frame = (input.channels() == 1) ? localFrame : input;

            const int frameWidth = frame.cols;
            const int frameHeight = frame.rows;
@@ -794,7 +800,12 @@ namespace ANSCENTER {
        cv::Mat lprImage = frame(plateRect);
        cv::Mat alignedLPR = enhanceForOCR(lprImage);

-        std::vector<PaddleOCR::OCRPredictResult> res_ocr = ppocr->ocr(alignedLPR);
+        // ppocr is not thread-safe, use fine-grained lock
+        std::vector<PaddleOCR::OCRPredictResult> res_ocr;
+        {
+            std::lock_guard<std::mutex> ocrLock(_ocrMutex);
+            res_ocr = ppocr->ocr(alignedLPR);
+        }

        const size_t detectionSize = res_ocr.size();
        if (detectionSize == 0 || detectionSize >= 3) {
--- a/modules/ANSLPR/ANSLPR_CPU.h
+++ b/modules/ANSLPR/ANSLPR_CPU.h
@@ -5,6 +5,7 @@
 #include <list>
 #include <map>
 #include <string>
+#include <mutex>
 #include <utility>
 #include <vector>
 #include <include/paddleocr.h>
@@ -157,6 +158,7 @@ namespace ANSCENTER
                                                         "43B1", "68L1", "70G1", "36M1", "81N1", "90K1", "17B1", "64E1", "99D1", "60B2", "74L1", "60C1", "68M1", "63B7", "34B1", "69M1", "24B1", "15M1", "83Y1", "48C1", "95H1", "79X1", "17B6", "36E1", "38K1", "25N1", "25U1", "61B1", "36C1", "36B3", "38F1", "99G1", "69N1", "97D1", "92T1", "92B1", "88B1", "97G1", "14U1", "63A1", "26N1", "19D1", "93C1", "73B1", "84B1", "81K1", "18L1", "64D1", "35M1", "61N1", "83P1", "15S1", "82B1", "92U1", "43D1", "22L1", "63B5", "64G1", "27N1", "14X1", "62C1", "81D1", "38G1", "19F1", "34K1", "49P1", "89H1", "14T1", "19M1", "78D1", "76A1", "66K1", "66C1", "71C1", "37K1", "19G1", "15F1", "85C1", "49B1", "21B1", "89F1", "23M1", "66L1", "90B5", "93M1", "14P1", "77N1", "36B8", "86B1", "12U1", "63B3", "21L1", "36G5", "65G1", "82E1", "61H1", "65H1", "84A1", "23F1", "95C1", "99K1", "49G1", "92D1", "36K3", "92N1", "82X1", "83M1", "11N1", "14K1", "19H1", "93H1", "60A1", "79A1", "20D1", "90D1", "81C1", "66P1", "36K1", "92V1", "18B1", "37P1", "22Y1", "23H1", "26D1", "66G1", "78F1", "49C1", "26H1", "38P1", "47T1", "74H1", "63P1", "47D1", "15D1", "23D1", "68E1", "20B1", "49F1", "43K1", "65K1", "27Z1", "92S1", "79H1", "21E1", "35Y1", "14S1", "75E1", "24Y1", "12T1", "27P1", "77B1", "88H1", "60B3", "23P1", "61F1", "99H1", "23K1", "59A3", "26C1", "81B1", "74E1", "66B1", "22S1", "92P1", "93B1", "69B1", "81P1", "12H1", "62K1", "35A1", "77C1", "27V1", "68N1", "12D1", "64K1", "41A1", "12Z1", "76C1", "38B1", "78G1", "74K1", "69H1", "94A1", "61K1", "86B7", "82G1", "14N1", "82M1", "76E1", "18E1", "61C1", "15N1", "90A1", "77F1", "34D1", "47B1", "62S1", "43E1", "81M1", "92X1", "75B1", "34F1", "70H1", "62B1", "26B1", "60B4", "61A1", "12B1", "90T1", "92E1", "34C1", "47G1", "97B1", "25S1", "70E1", "93Y1", "47S1", "37F1", "28N1", "11K1", "38E1", "78M1", "74C1", "12S1", "75S1", "37A1", "28D1", "65L1", "22B1", "99B1", "74G1", "79K1", "76K1", "76H1", "23B1", "15R1", "36B1", "74D1", "62L1", "37E1", "78E1", "89K1", 
"26M1", "25F1", "48H1", "79D1", "43H1", "76F1", "36L1", "43L1", "21K1", "88L1", "27S1", "92K1", "77D1", "19N1", "66H1", "36H5", "62N1", "18G1", "75D1", "37L1", "68K1", "28C1", "26E1", "35N1", "85H1", "62D1", "27U1", "19E1", "99E1", "14Y1", "49L1", "66M1", "73F1", "70K1", "36F5", "97H1", "93E1", "68P1", "43F1", "48G1", "75K1", "62U1", "86B9", "65F1", "27L1", "70L1", "63B8", "78L1", "11Z1", "68C1", "18D1", "15L1", "99C1", "49E1", "84E1", "69E1", "38A1", "48D1", "68S1", "81E1", "84K1", "63B6", "24T1", "95A1", "86B4", "34M1", "84L1", "24V1", "14M1", "36H1", "15B1", "69F1", "47E1", "38H1", "88D1", "28E1", "60C2", "63B9", "75Y1", "21D1", "35H1", "68F1", "86B5", "15H1", "36B5", "83X1", "17B7", "12V1", "86B8", "95E1", "63B2", "74F1", "86C1", "48K1", "89M1", "85D1", "71C4", "34E1", "97C1", "88E1", "81F1", "60B5", "84M1", "92H1", "28L1", "34H1", "38X1", "82L1", "61E1", "82F1", "62P1", "93F1", "65B1", "93L1", "95B1", "15P1", "77G1", "28M1", "35B1", "68G1", "36C2", "68D1", "69K1", "14L1", "36M3", "24X1", "24Z1", "86A1", "88C1", "15E1", "77E1", "83E1", "47L1", "25T1", "89C1", "71C3", "49D1", "36L6", "48F1", "36B6", "34P1", "84D1", "15C1", "38M1", "85F1", "77K1", "86B3", "74B1", "78H1", "89G1", "64A2", "15K1", "85B1", "49K1", "21H1", "73C1", "47U1", "65E1", "18C1", "69D1", "63B1", "95G1", "19L1", "20G1", "76D1", "29A1", "68T1", "75L1", "12L1", "89L1", "37C1", "27B1", "19C1", "11H1", "81X1", "70B1", "11V1", "43G1", "22A1", "83C1", "75C1", "79C1", "22F1", "92F1", "81G1", "81T1", "28H1", "66N1", "71B1", "18H1", "76P1", "26F1", "81U1", "34N1", "64F1", "76N1", "24S1", "26P1", "63B4", "35T1", "36N1", "47F1", "81L1", "61G1", "77M1", "34G1", "26G1", "97F1", "62H1", "28F1", "62T1", "93G1", "73D1", "65A1", "47P1", "74P1", "82N1", "20E1", "36D1", "60B1", "49M1", "37H1", "37M1", "38D1", "84F1", "88F1", "36B2", "65C1", "92M1", "86B6", "75H1", "38L1", "20C1", "97E1", "85E1", "38N1", "26K1", "89B1", "99F1", "28B1", "34L1", "86B2", "66F1", "77L1", "27Y1", "68H1", "37D1", "92L1", "82K1", "99A1", 
"69L1", "76M1", "90B4", "48B1", "95D1", "20H1", "64H1", "79Z1", "92G1", "23G1", "21G1", "37G1", "35K1", "81H1", "83Z1", "76T1", "36F1", "36B4", "14B9", "47K1", "20K1", "62M1", "84H1", "62F1", "74A1", "18A1", "73H1", "37N1", "79N1", "61D1", "11P1", "15G1", "47N1", "19K1", "71C2", "81S1", "11M1", "60B7", "60B8", "62G1", "71A1", "24P1", "69A1", "38C1", "49N1", "21C1", "84G1", "37B1", "72A1", "88K1", "88G1", "83V1", "78C1", "73K1", "78K1", "73E1", "89D1", "67A1", "27X1", "62A1", "18K1", "70F1", "36K5", "19B1", "49H1", "66S1", "12P1"};
        ALPRChecker     alprChecker;
        std::vector<std::string> ValidVNCarList = { "94H", "49F", "93A", "20F", "81H", "95R", "38R", "29F", "81F", "28G", "19A", "85B", "2", "43H", "51L", "28C", "21A", "51D", "50F", "24H", "93R", "92H", "71G", "75H", "86G", "30L", "79A", "82B", "79H", "78C", "61E", "70A", "90C", "72G", "34B", "17E", "18E", "78A", "37F", "51E", "71A", "28F", "47E", "83D", "81B", "84C", "71H", "76G", "92E", "36A", "69R", "30M", "27R", "71D", "19B", "34E", "38K", "88G", "68G", "30E", "68E", "25F", "74D", "98K", "89H", "36R", "84D", "61F", "49G", "25H", "17F", "14R", "36H", "47G", "90A", "68A", "83C", "26B", "15B", "61C", "15K", "47H", "78E", "75D", "15C", "63E", "34C", "36F", "38G", "15E", "93F", "22G", "60B", "94D", "62R", "24D", "11R", "12A", "76A", "94C", "97R", "24E", "26A", "15F", "72A", "49H", "62D", "98C", "71B", "61A", "12C", "27A", "78R", "51M", "69E", "76D", "78F", "49R", "81A", "64F", "29D", "18A", "19F", "21E", "92A", "65G", "86E", "62G", "61K", "47A", "23R", "14F", "95D", "36B", "74R", "11H", "24C", "11G", "66D", "63A", "43R", "70F", "86B", "61G", "47M", "67C", "37D", "43G", "14H", "90F", "51G", "86A", "11E", "29K", "85C", "83F", "24B", "98R", "19E", "61B", "90D", "82G", "14K", "74G", "72D", "85A", "19C", "37G", "98E", "74F", "28H", "90E", "89D", "35R", "97H", "83H", "95A", "20C", "65E", "15R", "73C", "37A", "38E", "77G", "94B", "17A", "75R", "98F", "65R", "76R", "20B", "24G", "25B", "73G", "62F", "29G", "77C", "22H", "14D", "23F", "93C", "19R", "15D", "47R", "79D", "60G", "77A", "82C", "63G", "21H", "81E", "25D", "12D", "37R", "36K", "84F", "98G", "28B", "51N", "18F", "50R", "74C", "35C", "30G", "64A", "95F", "18C", "99G", "99B", "37C", "76H", "60K", "67R", "75A", "83R", "28E", "65F", "17D", "92G", "23C", "60R", "90R", "38A", "43D", "50H", "43C", "77H", "47B", "89F", "82F", "65H", "89E", "62C", "24R", "26G", "84E", "17C", "65B", "34A", "12B", "64R", "29H", "71C", "88D", "79F", "76C", "98A", "69H", "22B", "29A", "72R", "67H", "48C", "22D", "60C", "35H", "38H", "63P", 
"70D", "49D", "18H", "89A", "72E", "92D", "26H", "73R", "85G", "20E", "98H", "69C", "18B", "73B", "22E", "34G", "30K", "20D", "50A", "34D", "15H", "34H", "71E", "62E", "64C", "51R", "82D", "99E", "70R", "18D", "92F", "94R", "24A", "85H", "11C", "73E", "95E", "86C", "94F", "86R", "37K", "23B", "20H", "73D", "95H", "35A", "89B", "82H", "67F", "70H", "97F", "29E", "97A", "51K", "68D", "37B", "82E", "18R", "86H", "35B", "43E", "35F", "95B", "70E", "21D", "27F", "36E", "63D", "68C", "50E", "36G", "75F", "21G", "29B", "93B", "22A", "18G", "43F", "93G", "62A", "83B", "28D", "75C", "22C", "21R", "25E", "23G", "97C", "75E", "79E", "19H", "47K", "65C", "35E", "20R", "68B", "89R", "67A", "75G", "81R", "78B", "77D", "78G", "20K", "36D", "66C", "38F", "27G", "19D", "67B", "84G", "22F", "61D", "20G", "48A", "76F", "48H", "92B", "85R", "26C", "65A", "70B", "38D", "14C", "66A", "73A", "49C", "74E", "68R", "66B", "74A", "49E", "17B", "69D", "51C", "85F", "21F", "99C", "17G", "72H", "94E", "51F", "92R", "60H", "21B", "93D", "19G", "86F", "51A", "66R", "72B", "26D", "64E", "93H", "12H", "97E", "60E", "82A", "60A", "83E", "27D", "64B", "11B", "11D", "76B", "95G", "14A", "61R", "21C", "30F", "23H", "89C", "97G", "62B", "63R", "88B", "98B", "90B", "67G", "69F", "73H", "20A", "72C", "65D", "68H", "51H", "79G", "70C", "90G", "66G", "83A", "77F", "63B", "64G", "25A", "88E", "68F", "99D", "26E", "94A", "48F", "34R", "61H", "90H", "74B", "14G", "12F", "15A", "27E", "69A", "35D", "12E", "85E", "25C", "29M", "89G", "17R", "78D", "84R", "95C", "15G", "28R", "99A", "69G", "48D", "97D", "27C", "78H", "14E", "79R", "73F", "88A", "48E", "48B", "64H", "99R", "14B", "77R", "75B", "88F", "84B", "11A", "67E", "12R", "50M", "11F", "79C", "49A", "43A", "88R", "77E", "48G", "51B", "81D", "74H", "93E", "37H", "88C", "71F", "94G", "38C", "29C", "43B", "30H", "81G", "28A", "26R", "66H", "66E", "17H", "79B", "49B", "63C", "98D", "81C", "69B", "63H", "85D", "26F", "22R", "83G", "37E", "12G", "77B", "35G", 
"62H", "60D", "60F", "99H", "70G", "76E", "84A", "72F", "25R", "27B", "30A", "47F", "34F", "97B", "23E", "36C", "66F", "48R", "92C", "71R", "23A", "50G", "47C", "82R", "63F", "84H", "38B", "47D", "67D", "25G", "86D", "88H", "64D", "24F", "23D", "99F" };
+        std::mutex _ocrMutex;  // Fine-grained lock for PaddleOCR (not thread-safe)
        std::unique_ptr<PaddleOCR::PPOCR> ppocr = std::make_unique<PaddleOCR::PPOCR>();
        [[nodiscard]] std::string AnalyseLicensePlateText(const std::string& ocrText);
        [[nodiscard]] char convertDigitToLetter(char c);
--- a/modules/ANSLPR/ANSLPR_OD.cpp
+++ b/modules/ANSLPR/ANSLPR_OD.cpp
@@ -863,7 +863,8 @@ namespace ANSCENTER {
 		}
 	}
 	std::vector<Object> ANSALPR_OD::RunInferenceSingleFrame(const cv::Mat& input, const std::string& cameraId) {
-	std::lock_guard<std::recursive_mutex> lock(_mutex);
+	// No coarse _mutex here — sub-components (detectors, alprChecker) have their own locks.
+	// LabVIEW semaphore controls concurrency at the caller level.

 	// Early validation
 	if (!_licenseValid) {
@@ -916,18 +917,19 @@ namespace ANSCENTER {
 		cv::Mat draw = input.clone();
 #endif

-		_detectedArea = cv::Rect(0, 0, frameWidth, frameHeight);
+		// Use local variable instead of shared _detectedArea for thread safety
+		cv::Rect detectedArea(0, 0, frameWidth, frameHeight);

-		if (_detectedArea.width <= 50 || _detectedArea.height <= 50) {
+		if (detectedArea.width <= 50 || detectedArea.height <= 50) {
 			return {};
 		}

 #ifdef FNS_DEBUG
-		cv::rectangle(draw, _detectedArea, cv::Scalar(0, 0, 255), 2);
+		cv::rectangle(draw, detectedArea, cv::Scalar(0, 0, 255), 2);
 #endif

 		// Run license plate detection
-		cv::Mat activeFrame = frame(_detectedArea);
+		cv::Mat activeFrame = frame(detectedArea);
 		std::vector<Object> lprOutput = _lpDetector->RunInference(activeFrame, cameraId);

 		if (lprOutput.empty()) {
@@ -1010,7 +1012,7 @@ namespace ANSCENTER {
 		return {};
 	}
 	std::string ANSALPR_OD::DetectLicensePlateString(const cv::Mat& lprROI, const std::string& cameraId) {
-		std::lock_guard<std::recursive_mutex> lock(_mutex);
+		// No coarse _mutex — _ocrDetector has its own m_inferenceMutex
 		try {
 			// convert lprROI to greyscale if it is not already
 			if (lprROI.empty()) {
@@ -1277,8 +1279,7 @@ namespace ANSCENTER {
 			return {};
 		}

-		std::lock_guard<std::recursive_mutex> lock(_mutex);
-
+		// No coarse _mutex — _lpColourDetector has its own m_inferenceMutex
 		try {
 			std::vector<Object> colourOutputs = _lpColourDetector->RunInference(lprROI, cameraId);

@@ -1310,8 +1311,9 @@ namespace ANSCENTER {
 			return DetectLPColourDetector(lprROI, cameraId);
 		}

-		// Check cache first (no GPU work needed)
+		// Check cache first (fine-grained lock, no GPU work)
 		{
+			std::lock_guard<std::mutex> cacheLock(_colourCacheMutex);
 			auto it = _colourCache.find(plateText);
 			if (it != _colourCache.end()) {
 				it->second.hitCount++;
@@ -1319,11 +1321,12 @@ namespace ANSCENTER {
 			}
 		}

-		// Cache miss — run the actual classifier
+		// Cache miss — run the actual classifier (no lock held during GPU inference)
 		std::string colour = DetectLPColourDetector(lprROI, cameraId);

-		// Store in cache
+		// Store in cache (fine-grained lock)
 		if (!colour.empty()) {
+			std::lock_guard<std::mutex> cacheLock(_colourCacheMutex);
 			if (_colourCache.size() >= COLOUR_CACHE_MAX_SIZE) {
 				_colourCache.clear();
 			}
@@ -1334,13 +1337,14 @@ namespace ANSCENTER {
 	}

 	bool ANSALPR_OD::Inference(const cv::Mat& input, std::string& lprResult) {
-		std::lock_guard<std::recursive_mutex> lock(_mutex);
+		// No coarse _mutex — only validates the frame and delegates to Inference(input, lprResult, cameraId), which takes no coarse lock either (its sub-components lock internally)
 		if (input.empty()) return false;
 		if ((input.cols < 5) || (input.rows < 5)) return false;
 		return Inference(input, lprResult, "CustomCam");
 	}
 	bool ANSALPR_OD::Inference(const cv::Mat& input, std::string& lprResult, const std::string& cameraId) {
-		std::lock_guard<std::recursive_mutex> lock(_mutex);
+		// No coarse _mutex — sub-components have their own fine-grained locks.
+		// LabVIEW semaphore controls concurrency at the caller level.

 		// Early validation
 		if (!_licenseValid) {
@@ -1518,14 +1522,14 @@ namespace ANSCENTER {
 		}
 	}
 	bool ANSALPR_OD::Inference(const cv::Mat& input, const std::vector<cv::Rect> & Bbox, std::string& lprResult) {
-		std::lock_guard<std::recursive_mutex> lock(_mutex);
+		// No coarse _mutex — delegates to Inference(input, Bbox, lprResult, cameraId)
 		if (input.empty()) return false;
 		if ((input.cols < 5) || (input.rows < 5)) return false;
 		return Inference(input, Bbox, lprResult, "CustomCam");
 	}
 	bool ANSALPR_OD::Inference(const cv::Mat& input, const std::vector<cv::Rect>& Bbox,std::string& lprResult, const std::string& cameraId)
 	{
-		std::lock_guard<std::recursive_mutex> lock(_mutex);
+		// No coarse _mutex — sub-components have their own fine-grained locks.

 		// Early validation
 		if (!_licenseValid) {
@@ -2177,12 +2181,10 @@ namespace ANSCENTER {
 		cv::Mat unsharp;
 		cv::addWeighted(denoised, 1.8, blurred, -0.8, 0, unsharp);

-		// Step 5: CLAHE contrast enhancement
-		if (!_clahe) {
-			_clahe = cv::createCLAHE(4.0, cv::Size(8, 8));
-		}
+		// Step 5: CLAHE contrast enhancement (thread-local for thread safety)
+		thread_local cv::Ptr<cv::CLAHE> tl_clahe = cv::createCLAHE(4.0, cv::Size(8, 8));
 		cv::Mat contrastEnhanced;
-		_clahe->apply(unsharp, contrastEnhanced);
+		tl_clahe->apply(unsharp, contrastEnhanced);

 		// Step 6: Laplacian edge sharpening
 		cv::Mat lap;
@@ -2718,6 +2720,7 @@ namespace ANSCENTER {

 	void ANSALPR_OD::ensureUniquePlateText(std::vector<Object>& results, const std::string& cameraId)
 	{
+		std::lock_guard<std::mutex> plateLock(_plateIdentitiesMutex);
 		auto& identities = _plateIdentities[cameraId];

 		// Option B: Auto-detect mode by counting detections.
--- a/modules/ANSLPR/ANSLPR_OD.h
+++ b/modules/ANSLPR/ANSLPR_OD.h
@@ -24,7 +24,7 @@ namespace ANSCENTER
        ANSCENTER::ModelConfig _lpdmodelConfig;
        ANSCENTER::ModelConfig _ocrModelConfig;
 		ANSCENTER::ModelConfig _lpColourModelConfig;
-        cv::Ptr<cv::CLAHE>     _clahe;  // Reusable CLAHE instance
+        // _clahe moved to thread-local in enhanceForOCR() for thread safety
        ANSCENTER::NV12PreprocessHelper _nv12Helper;  // NV12 crop for high-res plate OCR

        std::string             _lpdLabels;
@@ -147,6 +147,7 @@ namespace ANSCENTER
            int framesSinceLastSeen = 0;
        };
        // cameraId → list of tracked plate identities
+        std::mutex _plateIdentitiesMutex;  // Fine-grained lock for plate identity tracking
        std::unordered_map<std::string, std::vector<SpatialPlateIdentity>> _plateIdentities;
        static constexpr float PLATE_SPATIAL_MATCH_THRESHOLD = 0.3f; // IoU threshold for same plate
        void ensureUniquePlateText(std::vector<Object>& results, const std::string& cameraId);
@@ -176,6 +177,7 @@ namespace ANSCENTER
            std::string colour;
            int         hitCount = 0;
        };
+        std::mutex _colourCacheMutex;  // Fine-grained lock for colour cache only
        std::unordered_map<std::string, ColourCacheEntry> _colourCache;
        static constexpr size_t COLOUR_CACHE_MAX_SIZE = 200;

--- a/modules/ANSOCR/ANSOCR.cpp
+++ b/modules/ANSOCR/ANSOCR.cpp
@@ -118,7 +118,7 @@ namespace ANSCENTER {
 	}

 	std::vector<ANSCENTER::OCRObject> ANSOCR::RunInference(const cv::Mat& input, const std::string& cameraId) {
-		std::lock_guard<std::recursive_mutex> lock(_mutex);
+		// No coarse _mutex — ppOCR->Predict() / engine has its own internal lock
 		std::vector<ANSCENTER::OCRObject> OCRObjects;
 		OCRObjects.clear();
 		if (!_licenseValid) {
@@ -177,7 +177,7 @@ namespace ANSCENTER {


 	std::vector<ANSCENTER::OCRObject> ANSOCR::RunInference(const cv::Mat& input, const std::vector<cv::Rect>& Bbox) {
-		std::lock_guard<std::recursive_mutex> lock(_mutex);
+		// No coarse _mutex — ppOCR->Predict() / engine has its own internal lock
 		std::vector<ANSCENTER::OCRObject> OCRObjects;
 		OCRObjects.clear();
 		if (!_licenseValid) {
@@ -271,7 +271,7 @@ namespace ANSCENTER {


 	std::vector<ANSCENTER::OCRObject> ANSOCR::RunInference(const cv::Mat& input, const std::vector<cv::Rect>& Bbox, const std::string& cameraId) {
-		std::lock_guard<std::recursive_mutex> lock(_mutex);
+		// No coarse _mutex — ppOCR->Predict() / engine has its own internal lock
 		std::vector<ANSCENTER::OCRObject> OCRObjects;
 		OCRObjects.clear();
 		if (!_licenseValid) {
--- a/modules/ANSOCR/ANSOnnxOCR.cpp
+++ b/modules/ANSOCR/ANSOnnxOCR.cpp
@@ -80,7 +80,7 @@ std::vector<ANSCENTER::OCRObject> ANSONNXOCR::RunInference(const cv::Mat& input)
 }

 std::vector<ANSCENTER::OCRObject> ANSONNXOCR::RunInference(const cv::Mat& input, const std::string& cameraId) {
-	std::lock_guard<std::recursive_mutex> lock(_mutex);
+	// No coarse _mutex — _engine->ocr() has its own internal lock
 	std::vector<ANSCENTER::OCRObject> OCRObjects;

 	if (!_licenseValid) {
@@ -164,7 +164,7 @@ std::vector<ANSCENTER::OCRObject> ANSONNXOCR::RunInference(const cv::Mat& input,
 }

 std::vector<ANSCENTER::OCRObject> ANSONNXOCR::RunInference(const cv::Mat& input, const std::vector<cv::Rect>& Bbox) {
-	std::lock_guard<std::recursive_mutex> lock(_mutex);
+	// No coarse _mutex — _engine->ocr() has its own internal lock
 	std::vector<ANSCENTER::OCRObject> OCRObjects;

 	if (!_licenseValid) {
@@ -268,7 +268,7 @@ std::vector<ANSCENTER::OCRObject> ANSONNXOCR::RunInference(const cv::Mat& input,
 }

 std::vector<ANSCENTER::OCRObject> ANSONNXOCR::RunInference(const cv::Mat& input, const std::vector<cv::Rect>& Bbox, const std::string& cameraId) {
-	std::lock_guard<std::recursive_mutex> lock(_mutex);
+	// No coarse _mutex — _engine->ocr() has its own internal lock
 	std::vector<ANSCENTER::OCRObject> OCRObjects;

 	if (!_licenseValid) {
@@ -385,7 +385,7 @@ bool ANSONNXOCR::Destroy() {
 }

 std::pair<std::string, float> ANSONNXOCR::RecognizeText(const cv::Mat& croppedImage) {
-	std::lock_guard<std::recursive_mutex> lock(_mutex);
+	// No coarse _mutex — _engine->recognizeOnly() has its own internal lock
 	if (!_isInitialized || !_engine || croppedImage.empty()) return {"", 0.0f};
 	auto result = _engine->recognizeOnly(croppedImage);
 	return {result.text, result.score};
--- a/modules/ANSOCR/ANSRtOCR.cpp
+++ b/modules/ANSOCR/ANSRtOCR.cpp
@@ -90,7 +90,7 @@ std::vector<ANSCENTER::OCRObject> ANSRTOCR::RunInference(const cv::Mat& input) {
 }

 std::vector<ANSCENTER::OCRObject> ANSRTOCR::RunInference(const cv::Mat& input, const std::string& cameraId) {
-    std::lock_guard<std::recursive_mutex> lock(_mutex);
+    // No coarse _mutex — _engine->ocr() has its own internal lock
    std::vector<ANSCENTER::OCRObject> OCRObjects;

    if (!_licenseValid) {
@@ -178,7 +178,7 @@ std::vector<ANSCENTER::OCRObject> ANSRTOCR::RunInference(const cv::Mat& input, c
 }

 std::vector<ANSCENTER::OCRObject> ANSRTOCR::RunInference(const cv::Mat& input, const std::vector<cv::Rect>& Bbox) {
-    std::lock_guard<std::recursive_mutex> lock(_mutex);
+    // No coarse _mutex — _engine->ocr() has its own internal lock
    std::vector<ANSCENTER::OCRObject> OCRObjects;

    if (!_licenseValid) {
@@ -282,7 +282,7 @@ std::vector<ANSCENTER::OCRObject> ANSRTOCR::RunInference(const cv::Mat& input, c
 }

 std::vector<ANSCENTER::OCRObject> ANSRTOCR::RunInference(const cv::Mat& input, const std::vector<cv::Rect>& Bbox, const std::string& cameraId) {
-    std::lock_guard<std::recursive_mutex> lock(_mutex);
+    // No coarse _mutex — _engine->ocr() has its own internal lock
    std::vector<ANSCENTER::OCRObject> OCRObjects;

    if (!_licenseValid) {
@@ -379,7 +379,7 @@ std::vector<ANSCENTER::OCRObject> ANSRTOCR::RunInference(const cv::Mat& input, c
 }

 std::pair<std::string, float> ANSRTOCR::RecognizeText(const cv::Mat& croppedImage) {
-    std::lock_guard<std::recursive_mutex> lock(_mutex);
+    // No coarse _mutex — _engine->recognizeOnly() has its own internal lock
    if (!_isInitialized || !_engine || croppedImage.empty()) return {"", 0.0f};
    auto result = _engine->recognizeOnly(croppedImage);
    return {result.text, result.score};
--- a/modules/ANSODEngine/ANSODEngine.cpp
+++ b/modules/ANSODEngine/ANSODEngine.cpp
@@ -1455,7 +1455,7 @@ namespace ANSCENTER
 		}
 	}
 	std::vector<Object> ANSODBase::RunStaticInference(const cv::Mat& input, cv::Rect Bbox, const std::string& camera_id) {
-		std::lock_guard<std::recursive_mutex> lock(_mutex);
+		// No coarse _mutex — only uses local variables and virtual RunInference() which has its own engine lock
 		std::vector<Object> output;
 		output.clear();
 		try {
@@ -2100,7 +2100,8 @@ namespace ANSCENTER
 		}
 	}
 	std::vector<Object>   ANSODBase::RunInferenceWithOption(const cv::Mat& input, const std::string& camera_id, const std::string activeROIMode) {
-		std::lock_guard<std::recursive_mutex> lock(_mutex);
+		// No coarse _mutex — sub-components (engines, trackers) have their own locks.
+		// LabVIEW semaphore controls concurrency at the caller level.
 		try {
 			int mode = 0;
 			double confidenceThreshold = 0.35;
@@ -2116,8 +2117,11 @@ namespace ANSCENTER
 			if (confidenceThreshold <= 0) confidenceThreshold = 0;
 			if (confidenceThreshold > 1) confidenceThreshold = 1;

-			// Update model configuration with the new parameters
-			if(confidenceThreshold>0)_modelConfig.detectionScoreThreshold = confidenceThreshold;
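+			// Update model configuration with the new parameters (brief lock covers this write only; NOTE(review): confirm that readers of _modelConfig also take _mutex and that a per-call threshold may change state shared with concurrent calls)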
+			if (confidenceThreshold > 0) {
+				std::lock_guard<std::recursive_mutex> cfgLock(_mutex);
+				_modelConfig.detectionScoreThreshold = confidenceThreshold;
+			}
 			switch (mode) {
 			case 0: // Normal mode
 				return RunInference(input, camera_id); //RunInference
--- a/modules/ANSODEngine/NV12PreprocessHelper.cpp
+++ b/modules/ANSODEngine/NV12PreprocessHelper.cpp
@@ -275,6 +275,26 @@ namespace ANSCENTER {
                              gpuData->gpuIndex == inferenceGpu;
        const bool useZeroCopy = isCudaDevice && gpuMatch;

+        // --- Debug: log pointer state before reading ---
+        {
+            char _nv12_dbg[512];
+            snprintf(_nv12_dbg, sizeof(_nv12_dbg),
+                "[NV12Helper] tryNV12: gpuData=%p yPlane=%p uvPlane=%p isCuda=%d "
+                "gpuIdx=%d infGpu=%d gpuMatch=%d zeroCopy=%d "
+                "gpuCacheY=%p gpuCacheUV=%p gpuCacheValid=%d refcount=%d %dx%d\n",
+                (void*)gpuData, (void*)gpuData->yPlane, (void*)gpuData->uvPlane,
+                (int)isCudaDevice, gpuData->gpuIndex, inferenceGpu,
+                (int)gpuMatch, (int)useZeroCopy,
+                gpuData->gpuCacheY, gpuData->gpuCacheUV,
+                (int)gpuData->gpuCacheValid,
+                gpuData->refcount.load(),
+                frameW, frameH);
+#ifdef _WIN32
+            OutputDebugStringA(_nv12_dbg);
+#endif
+            fprintf(stderr, "%s", _nv12_dbg);
+        }
+
        // Effective plane pointers — for zero-copy, use CUDA device ptrs;
        // for CPU upload, use the CPU snapshot buffers.
        uint8_t* effYPlane;
@@ -283,7 +303,7 @@ namespace ANSCENTER {
        int      effUvLinesize;

        if (useZeroCopy) {
-            // Same GPU: wrap NVDEC device pointers directly
+            // Same GPU: wrap owned CUDA device pointers directly
            effYPlane     = gpuData->yPlane;
            effUvPlane    = gpuData->uvPlane;
            effYLinesize  = gpuData->yLinesize;
@@ -435,6 +455,18 @@ namespace ANSCENTER {
        gpuResized.create(inputH, inputW, CV_8UC3);

        cudaStream_t rawStream = cv::cuda::StreamAccessor::getStream(stream);
+        {
+            char _nv12_dbg2[256];
+            snprintf(_nv12_dbg2, sizeof(_nv12_dbg2),
+                "[NV12Helper] KERNEL LAUNCH: gpuY=%p(%dx%d) gpuUV=%p(%dx%d) -> %dx%d zeroCopy=%d\n",
+                (void*)gpuY.data, gpuY.cols, gpuY.rows,
+                (void*)gpuUV.data, gpuUV.cols, gpuUV.rows,
+                inputW, inputH, (int)useZeroCopy);
+#ifdef _WIN32
+            OutputDebugStringA(_nv12_dbg2);
+#endif
+            fprintf(stderr, "%s", _nv12_dbg2);
+        }
        launcher(gpuY, gpuUV, gpuResized, frameW, frameH, inputW, inputH, rawStream);

        stream.waitForCompletion();
@@ -945,7 +977,15 @@ namespace ANSCENTER {
                inputW, inputH, frameW, frameH, stream);
        }

-        cudaStreamSynchronize(stream);
+        // Windows: poll rather than cudaStreamSynchronize to avoid holding nvcuda64 SRW
+        // lock continuously (WDDM deadlock prevention). Sleep() is Win32-only: block elsewhere.
+#ifdef _WIN32
+        while (cudaStreamQuery(stream) == cudaErrorNotReady) {
+            Sleep(0);
+        }
+#else
+        cudaStreamSynchronize(stream);
+#endif

        // (No registry lock to release — data kept alive by refcount)

--- a/modules/ANSODEngine/nv12_to_rgb.cu
+++ b/modules/ANSODEngine/nv12_to_rgb.cu
@@ -8,6 +8,9 @@

 #include <cuda_runtime.h>
 #include <cstdint>
+#ifdef _WIN32
+#include <windows.h>  // Sleep()
+#endif
 #include <cstdio>

 // ── Shared YUV→RGB computation ───────────────────────────────────────────
@@ -651,7 +654,24 @@ int ANSGpuNV12ToBGR(
                      width * 3, height,
                      cudaMemcpyDeviceToHost, t_bufs.stream);

-    cudaStreamSynchronize(t_bufs.stream);
+    // Wait for the work queued on t_bufs.stream to finish.
+    // Windows: poll with cudaStreamQuery instead of cudaStreamSynchronize to avoid
+    // holding nvcuda64 SRW lock continuously (WDDM deadlock prevention).
+    // The first 10 waits use Sleep(0) (fast path for sub-ms kernels); later waits
+    // use Sleep(1) to give cleanup operations (cuArrayDestroy, cuMemFree) a window
+    // to acquire the exclusive SRW lock.
+    // Other platforms: Sleep() comes from <windows.h>, which this file includes
+    // only under _WIN32, so block with cudaStreamSynchronize instead.
+    // The status returned here is not inspected; cudaGetLastError() is checked
+    // immediately after this wait.
+#ifdef _WIN32
+    for (int yields = 0; cudaStreamQuery(t_bufs.stream) == cudaErrorNotReady; ) {
+        if (yields < 10) { Sleep(0); ++yields; }
+        else             { Sleep(1); }
+    }
+#else
+    cudaStreamSynchronize(t_bufs.stream);
+#endif

    // Check for errors
    cudaError_t err = cudaGetLastError();