Fix NV12 crash when recreating camera object

2026-04-02 22:07:27 +11:00
parent 4bedf3a3a2
commit 958cab6ae3
25 changed files with 1459 additions and 393 deletions
--- a/.claude/settings.local.json
+++ b/.claude/settings.local.json
@@ -12,7 +12,29 @@
      "Bash(grep -n \"struct Object\\\\|class Object\" /c/Projects/CLionProjects/ANSCORE/modules/ANSLPR/*.h /c/Projects/CLionProjects/ANSCORE/modules/ANSLPR/include/*.h)",
      "Bash(grep -n \"cudaStream\\\\|cudaMalloc\\\\|cudaFree\\\\|queue\\\\|Task\\\\|mutex\" /c/Projects/CLionProjects/ANSCORE/engines/TensorRTAPI/include/engine/*.inl)",
      "Bash(grep -n \"~Engine\\\\|destructor\\\\|cleanup\\\\|~\" /c/Projects/CLionProjects/ANSCORE/engines/TensorRTAPI/include/engine/*.inl)",
-      "Bash(grep -n \"for.*cudaFree\\\\|m_buffers\\\\[\" /c/Projects/CLionProjects/ANSCORE/engines/TensorRTAPI/include/engine/*.inl)"
+      "Bash(grep -n \"for.*cudaFree\\\\|m_buffers\\\\[\" /c/Projects/CLionProjects/ANSCORE/engines/TensorRTAPI/include/engine/*.inl)",
+      "Bash(find /c/Projects/CLionProjects/ANSCORE -name ANSGpuFrameRegistry* -type f)",
+      "Bash(ls -la /c/Projects/CLionProjects/ANSCORE/modules/ANSLPR/*.h)",
+      "Bash(\"C:\\\\Users\\\\nghia\\\\AppData\\\\Local\\\\Programs\\\\CLion 2026.1\\\\bin\\\\cmake\\\\win\\\\x64\\\\bin\\\\cmake.exe\" --build cmake-build-release --target all -j 30)",
+      "Bash(cmake --build build --target ANSLPR-UnitTest --config Release)",
+      "Bash(ls -d C:/Projects/CLionProjects/ANSCORE/cmake-build-*)",
+      "Bash(ls -d C:/Projects/CLionProjects/ANSCORE/out/*)",
+      "Bash(cmake --build C:/Projects/CLionProjects/ANSCORE/cmake-build-release --target ANSLPR-UnitTest --config Release)",
+      "Bash(cmake --build C:/Projects/CLionProjects/ANSCORE/cmake-build-release --target ANSLPR-UnitTest)",
+      "Bash('C:/Program Files/Microsoft Visual Studio/2022/Community/Common7/Tools/VsDevCmd.bat' -arch=amd64)",
+      "Bash(cmake -B C:/Projects/CLionProjects/ANSCORE/cmake-build-release -S C:/Projects/CLionProjects/ANSCORE -G Ninja -DCMAKE_BUILD_TYPE=Release)",
+      "Bash(cmd //C \"call \"\"C:\\\\Program Files\\\\Microsoft Visual Studio\\\\2022\\\\Community\\\\VC\\\\Auxiliary\\\\Build\\\\vcvarsall.bat\"\" amd64 >nul 2>&1 && cmake --build C:\\\\Projects\\\\CLionProjects\\\\ANSCORE\\\\cmake-build-release --target ANSLPR-UnitTest\")",
+      "Bash(1 EOF cmd /C C:tmpbuild.bat)",
+      "Read(//tmp/**)",
+      "Bash(\"C:/Program Files/Microsoft Visual Studio/2022/Community/VC/Auxiliary/Build/vcvarsall.bat\" amd64)",
+      "Bash(export INCLUDE=\"C:/Program Files/Microsoft Visual Studio/2022/Community/VC/Tools/MSVC/14.44.35207/include;C:/Program Files \\(x86\\)/Windows Kits/10/Include/10.0.26100.0/ucrt;C:/Program Files \\(x86\\)/Windows Kits/10/Include/10.0.26100.0/um;C:/Program Files \\(x86\\)/Windows Kits/10/Include/10.0.26100.0/shared;C:/Program Files \\(x86\\)/Windows Kits/10/Include/10.0.26100.0/winrt;C:/Program Files \\(x86\\)/Windows Kits/10/Include/10.0.26100.0/cppwinrt\" export LIB=\"C:/Program Files/Microsoft Visual Studio/2022/Community/VC/Tools/MSVC/14.44.35207/lib/x64;C:/Program Files \\(x86\\)/Windows Kits/10/Lib/10.0.26100.0/ucrt/x64;C:/Program Files \\(x86\\)/Windows Kits/10/Lib/10.0.26100.0/um/x64\" cmake --build \"C:/Projects/CLionProjects/ANSCORE/cmake-build-release\" --target ANSLPR-UnitTest)",
+      "Bash(tasklist)",
+      "Bash(taskkill /F /IM ANSLPR-UnitTest.exe)",
+      "Bash(export \"INCLUDE=C:/Program Files/Microsoft Visual Studio/2022/Community/VC/Tools/MSVC/14.44.35207/include;C:/Program Files \\(x86\\)/Windows Kits/10/Include/10.0.26100.0/ucrt;C:/Program Files \\(x86\\)/Windows Kits/10/Include/10.0.26100.0/um;C:/Program Files \\(x86\\)/Windows Kits/10/Include/10.0.26100.0/shared;C:/Program Files \\(x86\\)/Windows Kits/10/Include/10.0.26100.0/winrt;C:/Program Files \\(x86\\)/Windows Kits/10/Include/10.0.26100.0/cppwinrt\")",
+      "Bash(export \"LIB=C:/Program Files/Microsoft Visual Studio/2022/Community/VC/Tools/MSVC/14.44.35207/lib/x64;C:/Program Files \\(x86\\)/Windows Kits/10/Lib/10.0.26100.0/ucrt/x64;C:/Program Files \\(x86\\)/Windows Kits/10/Lib/10.0.26100.0/um/x64\")",
+      "Bash(grep -E \"\\\\.\\(cpp|h|hpp\\)$\")",
+      "Bash(find /c/Projects/CLionProjects/ANSCORE -name *Logger* -type f)",
+      "Bash(find /c/Projects/CLionProjects/ANSCORE -name *SPDLogger* -o -name *ANSLogger*)"
    ]
  }
 }
--- a/MediaClient/media/video_decoder.cpp
+++ b/MediaClient/media/video_decoder.cpp
@@ -575,8 +575,13 @@ void CVideoDecoder::Start() {
 }

 void CVideoDecoder::Stop() {
-    std::lock_guard<std::recursive_mutex> lock(_mutex);
-    m_bRunning = FALSE;
+    // Atomically signal the decoder to stop WITHOUT acquiring _mutex.
+    // decode() holds _mutex while inside avcodec_send_packet / CUDA calls
+    // that can block on the nvcuda64 SRW lock for a long time.
+    // If we waited for _mutex here, Stop() would deadlock whenever a
+    // concurrent decode() is stuck waiting for a CUDA operation held by
+    // an inference thread.
+    m_bRunning.store(FALSE, std::memory_order_release);
    log_print(HT_LOG_INFO, "%s, Video decoder stopped\r\n", __FUNCTION__);
 }

--- a/MediaClient/media/video_decoder.h
+++ b/MediaClient/media/video_decoder.h
@@ -3,6 +3,7 @@
 #include "sys_inc.h"
 #include "media_format.h"
 #include <string>
+#include <atomic>
 #include <mutex>
 #include <vector>
 extern "C"
@@ -152,7 +153,7 @@ private:
 	int     hwDecoderInit(AVCodecContext* ctx, int hwMode, int preferredGpu = -1);
 private:
 	BOOL			m_bInited;
-	BOOL			m_bRunning;
+	std::atomic<BOOL>	m_bRunning;
 	BOOL			m_bHardwareDecoderEnabled;  // Track if hardware decoder is enabled
 	bool			m_bCudaHWAccel;             // true when using AV_HWDEVICE_TYPE_CUDA
 	int				m_hwGpuIndex;               // GPU index assigned by HWDecoderPool (-1 = legacy)
--- a/engines/TensorRTAPI/include/engine/EngineRunInference.inl
+++ b/engines/TensorRTAPI/include/engine/EngineRunInference.inl
@@ -1,6 +1,7 @@
 #pragma once
 #include <cstring>
 #include <filesystem>
+#include <semaphore>
 #include "TRTCompat.h"

 // Per-device mutex for CUDA graph capture.
@@ -15,6 +16,95 @@ static std::mutex& graphCaptureMutex() {
    return m;
 }

+// ============================================================================
+// GPU INFERENCE THROTTLE
+// ============================================================================
+// Global counting semaphore that limits how many Engine instances can execute
+// CUDA inference simultaneously.  Without this, N separate Engine instances
+// (one per camera) all submit GPU work at once, causing:
+//   1. SM 100% saturation → each inference takes 5-10x longer
+//   2. GPU thermal throttling at 85°C → further slowdown
+//   3. cudaStreamSynchronize blocking indefinitely → system freeze
+//
+// Auto-computed from GPU VRAM:
+//   ≤ 4 GB  → 2 concurrent     8 GB  → 4 concurrent
+//   6 GB    → 3 concurrent    12+ GB  → 6 concurrent
+//   Multi-GPU: sum across all GPUs
+//
+// Excess threads wait on CPU (nearly zero cost) while the bounded set
+// runs efficiently on the GPU without thermal throttling.
+static std::counting_semaphore<64>& gpuInferenceSemaphore() {
+    static int maxConcurrent = []() {
+        int totalSlots = 0;
+        int gpuCount = 0;
+        cudaGetDeviceCount(&gpuCount);
+        if (gpuCount <= 0) return 4;  // fallback
+
+        for (int i = 0; i < gpuCount; ++i) {
+            // Read VRAM from device properties: calling cudaSetDevice(i) here would leave
+            // the first caller's thread bound to the last GPU.  A failed query counts as 0 GB.
+            cudaDeviceProp prop{};
+            size_t totalMem = (cudaGetDeviceProperties(&prop, i) == cudaSuccess) ? prop.totalGlobalMem : 0;
+            int gbTotal = static_cast<int>(totalMem / (1024ULL * 1024ULL * 1024ULL));
+            // Scale concurrency with VRAM: ~1 slot per 2 GB, min 2, max 6 per GPU
+            int slotsThisGpu = std::clamp(gbTotal / 2, 2, 6);
+            totalSlots += slotsThisGpu;
+        }
+
+        totalSlots = std::clamp(totalSlots, 2, 64);
+        std::cout << "Info [GPU Throttle]: max concurrent inferences = "
+                  << totalSlots << " (across " << gpuCount << " GPU(s))" << std::endl;
+        return totalSlots;
+    }();
+    static std::counting_semaphore<64> sem(maxConcurrent);
+    return sem;
+}
+
+// RAII guard for the GPU inference semaphore
+struct GpuInferenceGuard {
+    GpuInferenceGuard()  { gpuInferenceSemaphore().acquire(); }
+    ~GpuInferenceGuard() { gpuInferenceSemaphore().release(); }
+    GpuInferenceGuard(const GpuInferenceGuard&) = delete;
+    GpuInferenceGuard& operator=(const GpuInferenceGuard&) = delete;
+};
+
+// ============================================================================
+// WDDM-SAFE STREAM SYNCHRONIZATION
+// ============================================================================
+// Under Windows WDDM, cudaStreamSynchronize calls cuStreamQuery in a tight
+// loop with SwitchToThread, holding nvcuda64's internal SRW lock the entire
+// time.  When the GPU is busy with inference, this spin blocks ALL other CUDA
+// operations — including HW video decode (nvcuvid), cuMemAlloc, cuArrayDestroy.
+// If a camera Reconnect or decode buffer allocation needs an exclusive SRW lock
+// while inference is spinning, the entire system deadlocks.
+//
+// This function replaces cudaStreamSynchronize with a polling loop that
+// explicitly releases the SRW lock between queries by sleeping briefly.
+// This allows other CUDA operations to interleave with the sync wait.
+static inline cudaError_t cudaStreamSynchronize_Safe(cudaStream_t stream) {
+    // Fast path: check if already done (no sleep overhead for quick kernels)
+    cudaError_t err = cudaStreamQuery(stream);
+    if (err != cudaErrorNotReady) return err;
+
+    // Short Sleep(0) phase (exactly 10 iterations) catches sub-ms kernel completions.
+    // Then switch to Sleep(1) to give cleanup operations (cuArrayDestroy, cuMemFree)
+    // a window to acquire the exclusive nvcuda64 SRW lock.
+    // Previously used 1000 Sleep(0) iterations which hogged the SRW lock and
+    // caused ~20-second stalls when concurrent cleanup needed exclusive access.
+    for (int i = 0; i < 10; ++i) {
+        Sleep(0);
+        err = cudaStreamQuery(stream);
+        if (err != cudaErrorNotReady) return err;
+    }
+
+    // Poll with Sleep(1) until the stream leaves NotReady (no timeout) to prevent SRW lock starvation.  NOTE(review): Sleep(1) may last a full timer tick (~15.6 ms by default on Windows) — confirm latency at 30 FPS.
+    while (true) {
+        Sleep(1);
+        err = cudaStreamQuery(stream);
+        if (err != cudaErrorNotReady) return err;
+    }
+}
+
 template <typename T>
 void Engine<T>::warmUp(int iterations) {
    if (m_verbose) {
@@ -163,6 +253,16 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
        return runInferenceFromPool(inputs, featureVectors);
    }

+    // ============================================================================
+    // GPU INFERENCE THROTTLE
+    // ============================================================================
+    // Limit how many Engine instances can run CUDA inference simultaneously.
+    // Without this, 12 cameras each with their own Engine all submit GPU work
+    // at once → SM 100% → thermal throttle → cudaStreamSynchronize hangs.
+    // The semaphore lets excess threads wait on CPU (nearly zero cost) while
+    // a bounded number use the GPU efficiently.
+    GpuInferenceGuard gpuThrottle;
+
    // ============================================================================
    // SINGLE-ENGINE SERIALISATION
    // ============================================================================
@@ -376,7 +476,7 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
                        std::cout << "Error: Failed to set optimization profile 0" << std::endl;
                        return false;
                    }
-                    cudaError_t syncErr = cudaStreamSynchronize(m_inferenceStream);
+                    cudaError_t syncErr = cudaStreamSynchronize_Safe(m_inferenceStream);
                    if (syncErr != cudaSuccess) {
                        std::cout << "Error: Failed to sync after profile change: "
                            << cudaGetErrorString(syncErr) << std::endl;
@@ -642,7 +742,7 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
        if (graphExec) {
            // Launch the pre-captured graph (single API call replaces many).
            cudaGraphLaunch(graphExec, m_inferenceStream);
-            cudaStreamSynchronize(m_inferenceStream);
+            cudaStreamSynchronize_Safe(m_inferenceStream);

            // CPU memcpy: pinned buffers -> featureVectors (interleaved by batch).
            for (int batch = 0; batch < batchSize; ++batch) {
@@ -705,7 +805,7 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
            }
        }

-        cudaError_t syncErr = cudaStreamSynchronize(m_inferenceStream);
+        cudaError_t syncErr = cudaStreamSynchronize_Safe(m_inferenceStream);
        if (syncErr != cudaSuccess) {
            std::string errMsg = "[Engine] runInference FAIL: cudaStreamSynchronize: "
                + std::string(cudaGetErrorString(syncErr));
--- a/include/ANSGpuFrameRegistry.h
+++ b/include/ANSGpuFrameRegistry.h
@@ -34,15 +34,40 @@
 #include <atomic>
 #include <cstdint>
 #include <cstdlib>
+#include <cstdio>
 #include <chrono>
 #include <opencv2/core/mat.hpp>

+#ifdef _WIN32
+#include <windows.h>
+#endif
+
+// Debug logging for registry operations — both stderr and OutputDebugString.
+#ifndef REG_DBG
+#ifdef _WIN32
+#define REG_DBG(fmt, ...) do { \
+    char _reg_buf[512]; \
+    snprintf(_reg_buf, sizeof(_reg_buf), "[Registry] " fmt "\n", ##__VA_ARGS__); \
+    OutputDebugStringA(_reg_buf); \
+    fprintf(stderr, "%s", _reg_buf); \
+} while(0)
+#else
+#define REG_DBG(fmt, ...) fprintf(stderr, "[Registry] " fmt "\n", ##__VA_ARGS__)
+#endif
+#endif
+
 // Safety constants
 static constexpr int    MAX_FRAME_REFCOUNT    = 64;
 static constexpr int    FRAME_TTL_SECONDS     = 3;
 static constexpr size_t GPU_CACHE_BUDGET_DEFAULT = 1ULL * 1024 * 1024 * 1024; // 1GB
 static constexpr int    EVICT_CHECK_INTERVAL_MS  = 500;

+// Entry for deferred GPU memory deallocation (tracks device index for cudaSetDevice)
+struct GpuPendingFreeEntry {
+    void* ptr       = nullptr;
+    int   deviceIdx = -1;
+};
+
 struct GpuFrameData {
    // --- CPU NV12 snapshot (OWNED malloc'd buffers, independent of decoder) ---
    uint8_t*  cpuYPlane    = nullptr;   // malloc'd Y plane copy
@@ -83,6 +108,14 @@ struct GpuFrameData {
    std::atomic<int> refcount{1};
    std::chrono::steady_clock::time_point createdAt;

+    // --- Owner callback (for per-client inference guard) ---
+    // When the last reference to this frame drops, onReleaseFn is called
+    // with ownerClient to decrement the RTSP client's in-flight counter.
+    // This lets Destroy() wait for in-flight inference to finish before
+    // freeing NVDEC surfaces (fixes LabVIEW crash).
+    void*  ownerClient   = nullptr;
+    void (*onReleaseFn)(void*) = nullptr;
+
    // Default constructor
    GpuFrameData() = default;

@@ -100,6 +133,7 @@ struct GpuFrameData {
        , yPlane(o.yPlane), uvPlane(o.uvPlane)
        , yLinesize(o.yLinesize), uvLinesize(o.uvLinesize)
        , refcount(o.refcount.load()), createdAt(o.createdAt)
+        , ownerClient(o.ownerClient), onReleaseFn(o.onReleaseFn)
    {
        // Null out source to prevent double-free of owned pointers
        o.cpuYPlane = nullptr;
@@ -111,6 +145,8 @@ struct GpuFrameData {
        o.yPlane = nullptr;
        o.uvPlane = nullptr;
        o.gpuCacheBytes = 0;
+        o.ownerClient = nullptr;
+        o.onReleaseFn = nullptr;
    }

    // No copy
@@ -140,11 +176,21 @@ public:
        if (!mat) return nullptr;
        void* oldAvframe = nullptr;

+        // Capture old frame's owner callback to invoke OUTSIDE m_mutex
+        void* oldOwner = nullptr;
+        void (*oldReleaseFn)(void*) = nullptr;
+
        data.createdAt = std::chrono::steady_clock::now();
        data.refcount.store(1);

        auto* heapData = new GpuFrameData(std::move(data));
+        REG_DBG("attach mat=%p new frame=%p yPlane=%p gpuCacheY=%p isCuda=%d %dx%d",
+                (void*)mat, (void*)heapData,
+                (void*)heapData->yPlane, heapData->gpuCacheY,
+                (int)heapData->isCudaDevicePtr,
+                heapData->width, heapData->height);

+        {
            std::lock_guard<std::mutex> lock(m_mutex);

            // If this Mat* already has an entry, release the old one
@@ -153,6 +199,8 @@ public:
                auto* oldFrame = it->second;
                int oldRef = oldFrame->refcount.fetch_sub(1);
                if (oldRef <= 1) {
+                    oldOwner = oldFrame->ownerClient;
+                    oldReleaseFn = oldFrame->onReleaseFn;
                    oldAvframe = oldFrame->avframe;
                    if (oldFrame->cpuAvframe)
                        m_pendingFree.push_back(oldFrame->cpuAvframe);
@@ -166,6 +214,12 @@ public:

            m_map[mat] = heapData;
            m_frameSet.insert(heapData);
+        }
+
+        // Notify old frame's owner OUTSIDE m_mutex
+        if (oldReleaseFn && oldOwner) {
+            oldReleaseFn(oldOwner);
+        }

        return oldAvframe;  // Caller must av_frame_free if non-null
    }
@@ -197,6 +251,11 @@ public:
    void release(cv::Mat* mat) {
        if (!mat) return;

+        // Capture owner callback to invoke OUTSIDE m_mutex (deadlock safety)
+        void* owner = nullptr;
+        void (*releaseFn)(void*) = nullptr;
+
+        {
            std::lock_guard<std::mutex> lock(m_mutex);

            auto it = m_map.find(mat);
@@ -206,7 +265,16 @@ public:
            m_map.erase(it);

            int oldRef = frame->refcount.fetch_sub(1);
+            REG_DBG("release mat=%p refcount %d->%d yPlane=%p gpuCacheY=%p owner=%p",
+                    (void*)mat, oldRef, oldRef - 1,
+                    (void*)frame->yPlane, frame->gpuCacheY, frame->ownerClient);
            if (oldRef <= 1) {
+                // Capture owner callback before deleting frame
+                owner = frame->ownerClient;
+                releaseFn = frame->onReleaseFn;
+                REG_DBG("LAST REF — freeing frame=%p cpuY=%p gpuCacheY=%p gpuCacheUV=%p bytes=%zu",
+                        (void*)frame, (void*)frame->cpuYPlane,
+                        frame->gpuCacheY, frame->gpuCacheUV, frame->gpuCacheBytes);
                // Last reference — free everything
                if (frame->avframe)
                    m_pendingFree.push_back(frame->avframe);
@@ -218,6 +286,14 @@ public:
            }
        }

+        // Notify owner OUTSIDE m_mutex — prevents lock-ordering deadlock
+        // with ANSRTSPClient::_mutex (used by Destroy's condition_variable wait)
+        if (releaseFn && owner) {
+            REG_DBG("calling onReleaseFn owner=%p", owner);
+            releaseFn(owner);
+        }
+    }
+
    // --- lookup: find GpuFrameData by cv::Mat* (locking) ---
    GpuFrameData* lookup(cv::Mat* mat) {
        std::lock_guard<std::mutex> lock(m_mutex);
@@ -267,9 +343,10 @@ public:
    }

    // --- Drain pending GPU device pointers for caller to cudaFree ---
-    std::vector<void*> drain_gpu_pending() {
+    // Each entry includes the device index for cudaSetDevice before cudaFree.
+    std::vector<GpuPendingFreeEntry> drain_gpu_pending() {
        std::lock_guard<std::mutex> lock(m_mutex);
-        std::vector<void*> result;
+        std::vector<GpuPendingFreeEntry> result;
        result.swap(m_pendingGpuFree);
        return result;
    }
@@ -287,12 +364,21 @@ public:
            m_lastEvictCheck = now;
        }

+        // Collect owner callbacks to invoke OUTSIDE m_mutex
+        struct OwnerCallback { void* client; void (*fn)(void*); };
+        std::vector<OwnerCallback> callbacks;
+
+        {
            std::lock_guard<std::mutex> lock(m_mutex);
            for (auto it = m_frameSet.begin(); it != m_frameSet.end(); ) {
                auto* frame = *it;
                auto age_s = std::chrono::duration_cast<std::chrono::seconds>(
                    now - frame->createdAt).count();
                if (age_s > FRAME_TTL_SECONDS && frame->refcount.load() > 0) {
+                    // Capture owner callback before deleting
+                    if (frame->onReleaseFn && frame->ownerClient) {
+                        callbacks.push_back({frame->ownerClient, frame->onReleaseFn});
+                    }
                    // Force cleanup — remove all Mat* keys pointing to this frame
                    for (auto jt = m_map.begin(); jt != m_map.end(); ) {
                        if (jt->second == frame)
@@ -314,6 +400,12 @@ public:
            }
        }

+        // Notify owners OUTSIDE m_mutex
+        for (auto& cb : callbacks) {
+            cb.fn(cb.client);
+        }
+    }
+
    // --- VRAM budget management ---
    bool canAllocateGpuCache(size_t bytes) const {
        return m_totalGpuCacheBytes.load(std::memory_order_relaxed) + bytes <= m_gpuCacheBudget;
@@ -340,6 +432,70 @@ public:
    void setGpuCacheBudget(size_t bytes) { m_gpuCacheBudget = bytes; }
    size_t gpuCacheBudget() const { return m_gpuCacheBudget; }

+    // --- Invalidate owner: nullify all callbacks for a client being destroyed ---
+    // Called by Destroy() on timeout to prevent callbacks into a deleted object.
+    void invalidateOwner(void* client) {
+        if (!client) return;
+        std::lock_guard<std::mutex> lock(m_mutex);
+        for (auto* frame : m_frameSet) {
+            if (frame->ownerClient == client) {
+                frame->ownerClient = nullptr;
+                frame->onReleaseFn = nullptr;
+            }
+        }
+    }
+
+    // --- Force-release all frames owned by a client ---
+    // Called by Destroy() BEFORE close() to free GPU buffers while the CUDA
+    // context is still alive.  Without this, unreleased clones (e.g. 70 cloned
+    // images held by LabVIEW AI tasks that haven't finished) keep gpuCacheY/UV
+    // allocated.  When close() destroys the CUDA context, those buffers become
+    // orphaned and later cudaFree calls crash.
+    //
+    // This force-frees ALL owned buffers for frames belonging to this client,
+    // removes all Mat* keys pointing to them, and deletes the GpuFrameData.
+    // Returns the number of frames force-released.
+    int forceReleaseByOwner(void* client) {
+        if (!client) return 0;
+        int count = 0;
+
+        std::lock_guard<std::mutex> lock(m_mutex);
+
+        for (auto it = m_frameSet.begin(); it != m_frameSet.end(); ) {
+            auto* frame = *it;
+            if (frame->ownerClient == client) {
+                REG_DBG("forceReleaseByOwner: frame=%p refcount=%d gpuCacheY=%p gpuCacheUV=%p bytes=%zu",
+                        (void*)frame, frame->refcount.load(),
+                        frame->gpuCacheY, frame->gpuCacheUV, frame->gpuCacheBytes);
+
+                // Remove all Mat* keys pointing to this frame
+                for (auto jt = m_map.begin(); jt != m_map.end(); ) {
+                    if (jt->second == frame)
+                        jt = m_map.erase(jt);
+                    else
+                        ++jt;
+                }
+
+                // Free owned buffers (CPU + GPU pending)
+                if (frame->avframe)
+                    m_pendingFree.push_back(frame->avframe);
+                if (frame->cpuAvframe)
+                    m_pendingFree.push_back(frame->cpuAvframe);
+                freeOwnedBuffers_locked(frame);
+                it = m_frameSet.erase(it);
+                delete frame;
+                ++count;
+            } else {
+                ++it;
+            }
+        }
+
+        if (count > 0) {
+            REG_DBG("forceReleaseByOwner: force-released %d frames for client=%p", count, client);
+        }
+        return count;
+    }
+
 private:
    ANSGpuFrameRegistry() = default;

@@ -350,6 +506,10 @@ private:
    // Free malloc'd CPU NV12 buffers and GPU cache (but NOT avframe/cpuAvframe —
    // those go to pendingFree for the caller to av_frame_free).
    void freeOwnedBuffers_locked(GpuFrameData* frame) {
+        REG_DBG("freeOwnedBuffers: frame=%p cpuY=%p cpuUV=%p gpuCacheY=%p gpuCacheUV=%p bytes=%zu dev=%d",
+                (void*)frame, (void*)frame->cpuYPlane, (void*)frame->cpuUvPlane,
+                frame->gpuCacheY, frame->gpuCacheUV,
+                frame->gpuCacheBytes, frame->gpuCacheDeviceIdx);
        if (frame->cpuYPlane) {
            std::free(frame->cpuYPlane);
            frame->cpuYPlane = nullptr;
@@ -358,23 +518,17 @@ private:
            std::free(frame->cpuUvPlane);
            frame->cpuUvPlane = nullptr;
        }
-        // GPU cache freed via CUDA — caller (ANSODEngine) must handle this
-        // since we can't call cudaFree from this FFmpeg-free header.
-        // The gpuCacheBytes are tracked; actual deallocation happens in
-        // NV12PreprocessHelper or a GPU-aware cleanup path.
+        // GPU cache freed via CUDA — push to deferred list with device index
+        // so the caller (ANSGpuFrameOps.h) can cudaSetDevice + cudaFree.
        if (frame->gpuCacheBytes > 0) {
            onGpuCacheFreed(frame->gpuCacheBytes);
-            // Mark as invalid so no one reads stale pointers
            frame->gpuCacheValid = false;
            frame->gpuCacheBytes = 0;
-            // NOTE: gpuCacheY/gpuCacheUV device pointers are leaked here
-            // unless the caller handles GPU cleanup. This is addressed in
-            // Step 8 (NV12PreprocessHelper) where cudaFree is available.
-            // For now, push to a separate GPU-free list.
+            int devIdx = frame->gpuCacheDeviceIdx;
            if (frame->gpuCacheY)
-                m_pendingGpuFree.push_back(frame->gpuCacheY);
+                m_pendingGpuFree.push_back({frame->gpuCacheY, devIdx});
            if (frame->gpuCacheUV)
-                m_pendingGpuFree.push_back(frame->gpuCacheUV);
+                m_pendingGpuFree.push_back({frame->gpuCacheUV, devIdx});
            frame->gpuCacheY = nullptr;
            frame->gpuCacheUV = nullptr;
        }
@@ -384,7 +538,7 @@ private:
    std::unordered_map<cv::Mat*, GpuFrameData*> m_map;
    std::unordered_set<GpuFrameData*> m_frameSet;  // All unique frames (for TTL scan)
    std::vector<void*> m_pendingFree;     // AVFrame* pointers to av_frame_free
-    std::vector<void*> m_pendingGpuFree;  // CUDA device pointers to cudaFree
+    std::vector<GpuPendingFreeEntry> m_pendingGpuFree;  // CUDA device pointers to cudaFree
    std::atomic<size_t> m_totalGpuCacheBytes{0};
    size_t m_gpuCacheBudget = GPU_CACHE_BUDGET_DEFAULT;
    std::chrono::steady_clock::time_point m_lastEvictCheck;
@@ -408,7 +562,7 @@ inline bool gpu_frame_addref(cv::Mat* src, cv::Mat* dst) {
 }

 // Drain GPU device pointers that need cudaFree.
-// Caller must cudaFree each returned pointer.
-inline std::vector<void*> gpu_frame_drain_gpu_pending() {
+// Caller must cudaSetDevice(entry.deviceIdx) + cudaFree(entry.ptr) for each.
+inline std::vector<GpuPendingFreeEntry> gpu_frame_drain_gpu_pending() {
    return ANSGpuFrameRegistry::instance().drain_gpu_pending();
 }
--- a/modules/ANSCV/ANSFLV.cpp
+++ b/modules/ANSCV/ANSFLV.cpp
@@ -46,13 +46,22 @@ namespace ANSCENTER {
 		Destroy();
    }
    void ANSFLVClient::Destroy() {
+        // Move player out of lock scope — close() does CUDA cleanup
+        // (cuArrayDestroy/cuMemFree) which must not run under _mutex
+        // to avoid deadlocking with nvcuda64 SRW lock held by inference.
+        decltype(_playerClient) clientToClose;
+        {
            std::lock_guard<std::recursive_mutex> lock(_mutex);
            if (_playerClient) {
                if (_isPlaying) {
                    _playerClient->stop();
                    _isPlaying = false;
                }
-            _playerClient->close();
+            }
+            clientToClose = std::move(_playerClient);
+        }
+        if (clientToClose) {
+            clientToClose->close();
        }
    }
    static void VerifyGlobalANSFLVLicense(const std::string& licenseKey) {
@@ -129,8 +138,12 @@ namespace ANSCENTER {
        }
    }
    bool ANSFLVClient::Reconnect() {
+        {
            std::lock_guard<std::recursive_mutex> lock(_mutex);
+            _isPlaying = false;
+        }
        _playerClient->close();
+        std::lock_guard<std::recursive_mutex> lock(_mutex);
        Setup();
        _isPlaying = _playerClient->play();
        return _isPlaying;
@@ -143,10 +156,16 @@ namespace ANSCENTER {
        return _isPlaying;
    }
    bool ANSFLVClient::Stop() {
+        decltype(_playerClient.get()) player = nullptr;
+        {
            std::lock_guard<std::recursive_mutex> lock(_mutex);
            if (_isPlaying) {
-            _playerClient->stop();
                _isPlaying = false;
+                player = _playerClient.get();
+            }
+        }
+        if (player) {
+            player->stop();
        }
        return true;
    }
--- a/modules/ANSCV/ANSFilePlayer.cpp
+++ b/modules/ANSCV/ANSFilePlayer.cpp
@@ -39,6 +39,8 @@ namespace ANSCENTER {
 		catch (...) {}
 	}
 	void ANSFILEPLAYER::Destroy() {
+		decltype(_playerClient) clientToClose;
+		{
 			std::lock_guard<std::recursive_mutex> lock(_mutex);
 			try {
 				_url = "";
@@ -46,9 +48,7 @@ namespace ANSCENTER {
 				_isPlaying = false;
 				_lastJpegImage = "";
 				_pLastFrame.release();
-			if (_playerClient) {
-				_playerClient->close();
-			}
+				clientToClose = std::move(_playerClient);
 			}
 			catch (const std::exception& e) {
 				_logger.LogError("ANSFILEPLAYER::Destroy. Exception:", e.what(), __FILE__, __LINE__);
@@ -57,6 +57,10 @@ namespace ANSCENTER {
 				_logger.LogError("ANSFILEPLAYER::Destroy.", "Unknown exception", __FILE__, __LINE__);
 			}
 		}
+		if (clientToClose) {
+			clientToClose->close();
+		}
+	}
 	void ANSFILEPLAYER::CheckLicense() {
 		std::lock_guard<std::recursive_mutex> lock(_mutex);
 		try {
@@ -94,8 +98,12 @@ namespace ANSCENTER {
 		return _playerClient->open(_url);
 	}
 	bool ANSFILEPLAYER::Reconnect() {
+		{
 			std::lock_guard<std::recursive_mutex> lock(_mutex);
+			_isPlaying = false;
+		}
 		_playerClient->close();
+		std::lock_guard<std::recursive_mutex> lock(_mutex);
 		Setup();
 		return Start();
 	}
@@ -105,15 +113,18 @@ namespace ANSCENTER {
 		return _isPlaying;
 	}
 	bool ANSFILEPLAYER::Stop() {
+		decltype(_playerClient.get()) player = nullptr;
+		{
+			std::lock_guard<std::recursive_mutex> lock(_mutex);
+			player = _playerClient.get();
+		}
+		if (player && player->pause()) {
 			std::lock_guard<std::recursive_mutex> lock(_mutex);
-		if (_playerClient->pause()) {
 			_isPlaying = false;
 			return true;
 		}
-		else {
 		return false;
 	}
-	}
 	bool ANSFILEPLAYER::IsPaused() {
 		std::lock_guard<std::recursive_mutex> lock(_mutex);
 		return _playerClient->isPaused();
--- a/modules/ANSCV/ANSGpuFrameOps.h
+++ b/modules/ANSCV/ANSGpuFrameOps.h
@@ -19,8 +19,31 @@ extern "C" {
 #include "libavutil/frame.h"
 }

+#include <cuda_runtime.h>
 #include <cstring>
 #include <cstdlib>
+#include <cstdio>
+
+#ifdef _WIN32
+#include <windows.h>
+#endif
+
+// Debug logging macro for GPU frame operations.
+// Output goes to stderr (console) AND OutputDebugString (DebugView / VS debugger).
+// Use Sysinternals DebugView (dbgview64.exe) to capture these after a crash.
+#ifndef GPU_FRAME_DBG
+#ifdef _WIN32
+#define GPU_FRAME_DBG(fmt, ...) do { \
+    char _gpu_dbg_buf[512]; \
+    snprintf(_gpu_dbg_buf, sizeof(_gpu_dbg_buf), "[GpuFrameOps] " fmt "\n", ##__VA_ARGS__); \
+    OutputDebugStringA(_gpu_dbg_buf); \
+    fprintf(stderr, "%s", _gpu_dbg_buf); \
+} while(0)
+#else
+#define GPU_FRAME_DBG(fmt, ...) \
+    fprintf(stderr, "[GpuFrameOps] " fmt "\n", ##__VA_ARGS__)
+#endif
+#endif

 namespace anscv_gpu_ops {
 namespace detail {
@@ -71,6 +94,42 @@ inline bool snapshotNV12Planes(const AVFrame* nv12,
    return true;
 }

+// Drain the registry's pending GPU device pointers and cudaFree each one.
+// Must be called from a thread with CUDA context available.  The caller's
+// current device is restored before returning.
+inline void drainAndFreeGpuPending() {
+    auto gpuPending = ANSGpuFrameRegistry::instance().drain_gpu_pending();
+    if (gpuPending.empty()) return;
+    GPU_FRAME_DBG("drainGpuPending: freeing %zu GPU ptrs", gpuPending.size());
+    int prevDev = -1;
+    if (cudaGetDevice(&prevDev) != cudaSuccess) prevDev = -1;
+
+    // cudaDeviceSynchronize() before cudaFree is CRITICAL: NV12 kernels run on
+    // cv::cuda::Stream (not the default stream), so a buffer could otherwise be
+    // freed while a kernel on another stream still reads it (error 700).
+    // The sync is issued for the first pointer and again whenever the target
+    // device changes.  Entries with deviceIdx < 0 are treated as belonging to
+    // the caller's device (prevDev), not whichever device was selected last.
+    bool synced = false;
+    int  lastSyncDev = -1;
+    for (auto& entry : gpuPending) {
+        if (!entry.ptr) continue;
+        const int targetDev = (entry.deviceIdx >= 0) ? entry.deviceIdx : prevDev;
+        if (targetDev >= 0) cudaSetDevice(targetDev);
+        if (!synced || targetDev != lastSyncDev) {
+            cudaDeviceSynchronize();
+            synced = true;
+            lastSyncDev = targetDev;
+        }
+        GPU_FRAME_DBG("drainGpuPending: cudaFree(%p) dev=%d", entry.ptr, entry.deviceIdx);
+        const cudaError_t err = cudaFree(entry.ptr);
+        if (err != cudaSuccess)
+            GPU_FRAME_DBG("drainGpuPending: cudaFree FAILED err=%d (%s)",
+                          (int)err, cudaGetErrorString(err));
+    }
+    if (prevDev >= 0) cudaSetDevice(prevDev);
+}
+
 } // namespace detail
 } // namespace anscv_gpu_ops

@@ -117,36 +176,44 @@ inline void gpu_frame_attach(cv::Mat* mat, AVFrame* nv12, int gpuIdx, int64_t pt
    }
 }

-// Attach CUDA HW frame — keeps CUDA device pointers for zero-copy inference.
+// Attach CUDA HW frame — copies NV12 from NVDEC surfaces to owned GPU memory.
 // TAKES OWNERSHIP of cudaFrame AND cpuNV12 — caller must NOT av_frame_free after.
 //
-// Primary path: yPlane/uvPlane point to CUDA device pointers from the cloned
-// AVFrame (data[0]/data[1]).  The cloned AVFrame keeps the NVDEC surface alive
-// until gpu_frame_remove() is called after inference.  With 4 cameras each
-// holding ~1 surface, this uses 4 of NVDEC's 25-32 surface pool — safe.
+// D2D copy path: cudaMemcpy2D from NVDEC surfaces to cudaMalloc'd buffers on the
+// same GPU.  This decouples the NV12 data lifetime from the NVDEC decoder, so
+// player->close() can safely destroy the decoder at any time without invalidating
+// pointers that inference engines may be reading.  The NVDEC surface is freed
+// immediately (av_frame_free), returning it to the decoder's surface pool.
+//
+// The owned GPU pointers are stored as both yPlane/uvPlane (for zero-copy reads)
+// and gpuCacheY/gpuCacheUV (for lifecycle management / cudaFree on cleanup).
+//
+// VRAM budget: if the global GPU cache budget is exceeded, falls back to CPU-only
+// NV12 snapshot (no zero-copy, but safe).
 //
 // Fallback: cpuYPlane/cpuUvPlane hold CPU-side NV12 snapshot for cross-GPU
-// inference (when decode GPU != inference GPU, CUDA device ptrs aren't
-// accessible from another GPU context).
+// inference (when decode GPU != inference GPU).
 inline void gpu_frame_attach_cuda(cv::Mat* mat, AVFrame* cudaFrame, int gpuIdx, int64_t pts,
                                   AVFrame* cpuNV12 = nullptr) {
-    if (!mat || !cudaFrame) return;
+    if (!mat || !cudaFrame) {
+        GPU_FRAME_DBG("attach_cuda: SKIP mat=%p cudaFrame=%p", (void*)mat, (void*)cudaFrame);
+        return;
+    }
+
+    const int w = cudaFrame->width;
+    const int h = cudaFrame->height;
+    GPU_FRAME_DBG("attach_cuda: START mat=%p %dx%d gpu=%d nvdecY=%p nvdecUV=%p cpuNV12=%p",
+                  (void*)mat, w, h, gpuIdx,
+                  (void*)cudaFrame->data[0], (void*)cudaFrame->data[1], (void*)cpuNV12);

    GpuFrameData data{};
    data.gpuIndex        = gpuIdx;
    data.pts             = pts;
-    data.width           = cudaFrame->width;
-    data.height          = cudaFrame->height;
-    data.pixelFormat     = 23; // AV_PIX_FMT_NV12 — the underlying sw_format
+    data.width           = w;
+    data.height          = h;
+    data.pixelFormat     = 23; // AV_PIX_FMT_NV12

-    // Primary: CUDA device pointers from NVDEC (zero-copy on same GPU)
-    data.isCudaDevicePtr = true;
-    data.yPlane          = cudaFrame->data[0];   // CUDA device ptr: Y plane
-    data.uvPlane         = cudaFrame->data[1];   // CUDA device ptr: UV plane
-    data.yLinesize       = cudaFrame->linesize[0];
-    data.uvLinesize      = cudaFrame->linesize[1];
-
-    // Fallback: snapshot CPU NV12 for cross-GPU inference
+    // Snapshot CPU NV12 for cross-GPU fallback (must do before freeing cpuNV12)
    if (cpuNV12) {
        anscv_gpu_ops::detail::snapshotNV12Planes(
            cpuNV12,
@@ -155,9 +222,98 @@ inline void gpu_frame_attach_cuda(cv::Mat* mat, AVFrame* cudaFrame, int gpuIdx,
            data.width, data.height);
    }

-    // Store AVFrames for cleanup (cudaFrame keeps NVDEC surface alive)
-    data.avframe    = cudaFrame;
-    data.cpuAvframe = cpuNV12;
+    // --- D2D copy: NVDEC surface → owned GPU memory ---
+    // Estimate VRAM needed for the owned NV12 copy
+    const size_t yBytes  = static_cast<size_t>(w) * h;
+    const size_t uvBytes = static_cast<size_t>(w) * (h / 2);
+    const size_t totalBytes = yBytes + uvBytes;
+
+    bool d2dOk = false;
+    if (ANSGpuFrameRegistry::instance().canAllocateGpuCache(totalBytes)) {
+        int prevDev = -1;
+        cudaGetDevice(&prevDev);
+        if (gpuIdx >= 0)
+            cudaSetDevice(gpuIdx);
+
+        void*  ownedY  = nullptr;
+        void*  ownedUV = nullptr;
+        size_t yPitch  = 0;
+        size_t uvPitch = 0;
+
+        cudaError_t e1 = cudaMallocPitch(&ownedY,  &yPitch,  w, h);
+        cudaError_t e2 = cudaMallocPitch(&ownedUV, &uvPitch, w, h / 2);
+
+        if (e1 == cudaSuccess && e2 == cudaSuccess) {
+            cudaError_t e3 = cudaMemcpy2D(ownedY,  yPitch,
+                                           cudaFrame->data[0], cudaFrame->linesize[0],
+                                           w, h, cudaMemcpyDeviceToDevice);
+            cudaError_t e4 = cudaMemcpy2D(ownedUV, uvPitch,
+                                           cudaFrame->data[1], cudaFrame->linesize[1],
+                                           w, h / 2, cudaMemcpyDeviceToDevice);
+
+            if (e3 == cudaSuccess && e4 == cudaSuccess) {
+                // Store owned GPU pointers as primary NV12 source
+                data.isCudaDevicePtr = true;
+                data.yPlane          = static_cast<uint8_t*>(ownedY);
+                data.uvPlane         = static_cast<uint8_t*>(ownedUV);
+                data.yLinesize       = static_cast<int>(yPitch);
+                data.uvLinesize      = static_cast<int>(uvPitch);
+
+                // Track in gpuCache for lifecycle management (cudaFree on cleanup)
+                data.gpuCacheY          = ownedY;
+                data.gpuCacheUV         = ownedUV;
+                data.gpuCacheYPitch     = yPitch;
+                data.gpuCacheUVPitch    = uvPitch;
+                data.gpuCacheDeviceIdx  = gpuIdx;
+                data.gpuCacheValid      = true;
+                data.gpuCacheBytes      = yPitch * h + uvPitch * (h / 2);
+
+                ANSGpuFrameRegistry::instance().onGpuCacheCreated(data.gpuCacheBytes);
+                d2dOk = true;
+                GPU_FRAME_DBG("attach_cuda: D2D OK ownedY=%p ownedUV=%p yPitch=%zu uvPitch=%zu bytes=%zu",
+                              ownedY, ownedUV, yPitch, uvPitch, data.gpuCacheBytes);
+            } else {
+                // D2D copy failed — free allocated memory and fall back
+                GPU_FRAME_DBG("attach_cuda: D2D COPY FAILED e3=%d e4=%d — fallback CPU",
+                              (int)e3, (int)e4);
+                cudaFree(ownedY);
+                cudaFree(ownedUV);
+            }
+        } else {
+            // Allocation failed — free any partial allocation and fall back
+            GPU_FRAME_DBG("attach_cuda: cudaMallocPitch FAILED e1=%d e2=%d — fallback CPU",
+                          (int)e1, (int)e2);
+            if (e1 == cudaSuccess) cudaFree(ownedY);
+            if (e2 == cudaSuccess) cudaFree(ownedUV);
+        }
+
+        if (prevDev >= 0)
+            cudaSetDevice(prevDev);
+    }
+
+    if (!d2dOk) {
+        // Fall back to CPU NV12 snapshot only (no zero-copy)
+        GPU_FRAME_DBG("attach_cuda: FALLBACK CPU-only cpuY=%p cpuUV=%p",
+                      (void*)data.cpuYPlane, (void*)data.cpuUvPlane);
+        data.isCudaDevicePtr = false;
+        data.yPlane          = data.cpuYPlane;
+        data.uvPlane         = data.cpuUvPlane;
+        data.yLinesize       = data.cpuYLinesize;
+        data.uvLinesize      = data.cpuUvLinesize;
+    }
+
+    // Release AVFrames immediately — NVDEC surfaces returned to pool.
+    // No longer stored in GpuFrameData (owned GPU copy is independent).
+    GPU_FRAME_DBG("attach_cuda: freeing AVFrames cudaFrame=%p cpuNV12=%p",
+                  (void*)cudaFrame, (void*)cpuNV12);
+    av_frame_free(&cudaFrame);
+    if (cpuNV12) av_frame_free(&cpuNV12);
+    data.avframe    = nullptr;
+    data.cpuAvframe = nullptr;
+
+    GPU_FRAME_DBG("attach_cuda: FINAL yPlane=%p uvPlane=%p isCuda=%d gpuCacheY=%p gpuCacheUV=%p",
+                  (void*)data.yPlane, (void*)data.uvPlane, (int)data.isCudaDevicePtr,
+                  data.gpuCacheY, data.gpuCacheUV);

    void* old = ANSGpuFrameRegistry::instance().attach(mat, std::move(data));
    if (old) {
@@ -165,17 +321,23 @@ inline void gpu_frame_attach_cuda(cv::Mat* mat, AVFrame* cudaFrame, int gpuIdx,
        av_frame_free(&oldFrame);
    }

+    // Free stale AVFrames evicted by TTL or previous attach
    auto pending = ANSGpuFrameRegistry::instance().drain_pending();
    for (void* p : pending) {
        AVFrame* stale = static_cast<AVFrame*>(p);
        av_frame_free(&stale);
    }
+
+    // Free stale GPU device pointers
+    anscv_gpu_ops::detail::drainAndFreeGpuPending();
 }

-// Release entry by cv::Mat* and free any returned AVFrames. Safe if not in map (no-op).
+// Release entry by cv::Mat* and free any returned AVFrames + GPU pointers.
+// Safe if not in map (no-op).
 inline void gpu_frame_remove(cv::Mat* mat) {
    if (!mat) return;

+    GPU_FRAME_DBG("gpu_frame_remove: mat=%p", (void*)mat);
    ANSGpuFrameRegistry::instance().release(mat);

    // Free any AVFrames that became pending from this release or prior eviction
@@ -186,13 +348,7 @@ inline void gpu_frame_remove(cv::Mat* mat) {
    }

    // Free any GPU device pointers that became pending
-    auto gpuPending = gpu_frame_drain_gpu_pending();
-    // NOTE: cudaFree requires CUDA context — caller must be on a CUDA-capable thread.
-    // If not, these will leak. In practice, gpu_frame_remove is called from ANSCV
-    // camera threads which do have CUDA context.
-    // For safety, we skip cudaFree here and let NV12PreprocessHelper handle it.
-    // The GPU pointers are tracked in the budget and will be accounted for.
-    (void)gpuPending;
+    anscv_gpu_ops::detail::drainAndFreeGpuPending();
 }

 // Alias for remove — used in ANSCV mutating functions to drop stale GPU data.
@@ -209,4 +365,7 @@ inline void gpu_frame_evict_stale() {
        AVFrame* stale = static_cast<AVFrame*>(p);
        av_frame_free(&stale);
    }
+
+    // Free any GPU device pointers from evicted frames
+    anscv_gpu_ops::detail::drainAndFreeGpuPending();
 }
--- a/modules/ANSCV/ANSMJPEG.cpp
+++ b/modules/ANSCV/ANSMJPEG.cpp
@@ -46,13 +46,19 @@ namespace ANSCENTER {
 		Destroy();
    }
     void ANSMJPEGClient::Destroy() {  // Stops playback under _mutex, then closes the player after the lock is released.
+        decltype(_playerClient) clientToClose;
+        {
             std::lock_guard<std::recursive_mutex> lock(_mutex);
             if (_playerClient) {
                 if (_isPlaying) {
                     _playerClient->stop();
                     _isPlaying = false;
                 }
-            _playerClient->close();
+            }
+            clientToClose = std::move(_playerClient);  // Ownership leaves the member while the lock is still held.
+        }
+        if (clientToClose) {
+            clientToClose->close();  // close() runs without _mutex held.
         }
     }
    static void VerifyGlobalANSMJPEGLicense(const std::string& licenseKey) {
@@ -129,8 +135,12 @@ namespace ANSCENTER {
        }
    }
    bool ANSMJPEGClient::Reconnect() {
+        {
            std::lock_guard<std::recursive_mutex> lock(_mutex);
+            _isPlaying = false;
+        }
        _playerClient->close();
+        std::lock_guard<std::recursive_mutex> lock(_mutex);
        Setup();
        _isPlaying = _playerClient->play();
        return _isPlaying;
@@ -143,10 +153,16 @@ namespace ANSCENTER {
        return _isPlaying;
    }
     bool ANSMJPEGClient::Stop() {  // Clears _isPlaying under _mutex and calls stop() on the player afterwards; always returns true.
+        decltype(_playerClient.get()) player = nullptr;
+        {
             std::lock_guard<std::recursive_mutex> lock(_mutex);
             if (_isPlaying) {
-            _playerClient->stop();
                 _isPlaying = false;
+                player = _playerClient.get();  // Only captured when playback was active, so stop() is skipped otherwise.
+            }
+        }
+        if (player) {
+            player->stop();  // Runs without _mutex held. NOTE(review): raw pointer is not kept alive here — confirm no concurrent Destroy().
         }
         return true;
     }
--- a/modules/ANSCV/ANSRTMP.cpp
+++ b/modules/ANSCV/ANSRTMP.cpp
@@ -48,13 +48,19 @@ namespace ANSCENTER {
 		Destroy();
    }
     void ANSRTMPClient::Destroy() {  // Stops playback under _mutex, then closes the player after the lock is released.
+        decltype(_playerClient) clientToClose;
+        {
             std::lock_guard<std::recursive_mutex> lock(_mutex);
             if (_playerClient) {
                 if (_isPlaying) {
                     _playerClient->stop();
                     _isPlaying = false;
                 }
-            _playerClient->close();
+            }
+            clientToClose = std::move(_playerClient);  // Ownership leaves the member while the lock is still held.
+        }
+        if (clientToClose) {
+            clientToClose->close();  // close() runs without _mutex held.
         }
     }
    static void VerifyGlobalANSRTMPLicense(const std::string& licenseKey) {
@@ -126,8 +132,12 @@ namespace ANSCENTER {
    }

    bool ANSRTMPClient::Reconnect() {
+        {
            std::lock_guard<std::recursive_mutex> lock(_mutex);
+            _isPlaying = false;
+        }
        _playerClient->close();
+        std::lock_guard<std::recursive_mutex> lock(_mutex);
        Setup();
        _isPlaying = _playerClient->play();
        return _isPlaying;
@@ -140,10 +150,16 @@ namespace ANSCENTER {
        return _isPlaying;
    }
     bool ANSRTMPClient::Stop() {  // Clears _isPlaying under _mutex and calls stop() on the player afterwards; always returns true.
+        decltype(_playerClient.get()) player = nullptr;
+        {
             std::lock_guard<std::recursive_mutex> lock(_mutex);
             if (_isPlaying) {
-            _playerClient->stop();
                 _isPlaying = false;
+                player = _playerClient.get();  // Only captured when playback was active, so stop() is skipped otherwise.
+            }
+        }
+        if (player) {
+            player->stop();  // Runs without _mutex held. NOTE(review): raw pointer is not kept alive here — confirm no concurrent Destroy().
         }
         return true;
     }
--- a/modules/ANSCV/ANSRTSP.cpp
+++ b/modules/ANSCV/ANSRTSP.cpp
@@ -2,6 +2,7 @@
 #include "ANSMatRegistry.h"
 #include "ANSGpuFrameOps.h"
 #include <memory>
+#include <format>
 #include "media_codec.h"
 #include <cstdint>
 #include <cuda_runtime.h>
@@ -21,6 +22,20 @@ extern "C"
 }
 // Note: per-instance thread safety is handled by ANSRTSPClient::_mutex
 // Mat registry thread safety is handled by anscv_mat_replace's internal registry_mutex
+
+// Debug logging (printf-style; fmt must be a string literal). Windows: 512-byte buffer (longer output is truncated) sent to OutputDebugString (DebugView) AND stderr; elsewhere: stderr only.
+#ifndef RTSP_DBG  // an earlier definition of RTSP_DBG takes precedence
+#ifdef _WIN32
+#define RTSP_DBG(fmt, ...) do { \
+    char _rtsp_buf[512]; \
+    snprintf(_rtsp_buf, sizeof(_rtsp_buf), fmt "\n", ##__VA_ARGS__); \
+    OutputDebugStringA(_rtsp_buf); \
+    fprintf(stderr, "%s", _rtsp_buf); \
+} while(0)
+#else  // !_WIN32
+#define RTSP_DBG(fmt, ...) fprintf(stderr, fmt "\n", ##__VA_ARGS__)
+#endif  // _WIN32
+#endif  // RTSP_DBG
 static bool ansrtspLicenceValid = false;
 // Global once_flag to protect license checking
 static std::once_flag ansrtspLicenseOnceFlag;
@@ -48,19 +63,88 @@ namespace ANSCENTER {
 		Destroy();
    }
     void ANSRTSPClient::Destroy() {  // Order: stop playback, wait for in-flight frames, force-release GPU frames, then close() outside _mutex.
-        std::lock_guard<std::recursive_mutex> lock(_mutex);
+        // Move the player client pointer out of the lock scope, then
+        // close it OUTSIDE the mutex.  close() calls cuArrayDestroy /
+        // cuMemFree which acquire an EXCLUSIVE SRW lock inside nvcuda64.
+        // If we hold _mutex during close(), and another thread holds
+        // the nvcuda64 SRW lock (e.g. cuStreamSynchronize during
+        // inference), we get a deadlock: Stop() → _mutex → nvcuda64
+        // vs inference → nvcuda64 → (blocked by exclusive waiter).
+        decltype(_playerClient) clientToClose;
+        {
+            std::unique_lock<std::recursive_mutex> lock(_mutex);  // unique_lock because wait_for() below must unlock/relock it
             if (_playerClient) {
-            // Stop the stream first so the video decoder is flushed and
-            // the RTSP callback thread is no longer feeding frames into
-            // decode().  Without this, rtsp_close() can block waiting for
-            // CRtspClient::m_pMutex (held by the callback mid-decode),
-            // and the hardware decoder flush during destruction can hang
-            // on the GPU.
                 if (_isPlaying) {
                     _playerClient->stop();  // NOTE(review): still executed with _mutex held — only close() was moved outside the lock
                     _isPlaying = false;
                 }
-            _playerClient->close();
+            }
+
+            // --- Inference guard: wait for in-flight frames to finish ---
+            // GetRTSPCVImage increments _inFlightFrames when it hands out
+            // a GPU frame; the registry decrements it when the frame is
+            // released after inference completes.  We wait here so that
+            // close() doesn't free NVDEC surfaces while TensorRT is
+            // still reading from them (the LabVIEW crash root cause).
+            int inFlight = _inFlightFrames.load(std::memory_order_acquire);
+            if (inFlight > 0) {
+                _logger.LogInfo("ANSRTSPClient::Destroy",
+                    std::format("waiting for {} in-flight inference frame(s)...", inFlight),
+                    __FILE__, __LINE__);
+                bool done = _inFlightDone.wait_for(lock, std::chrono::seconds(5), [this] {  // bounded wait: gives up after 5 s
+                    return _inFlightFrames.load(std::memory_order_acquire) <= 0;
+                });
+                if (!done) {
+                    _logger.LogWarn("ANSRTSPClient::Destroy",
+                        std::format("timed out waiting for in-flight frames "
+                                    "(still {} in-flight) — force-releasing GPU frames",
+                                    _inFlightFrames.load()),
+                        __FILE__, __LINE__);
+                }
+            }
+
+            // Force-release ALL GPU frames owned by this client BEFORE close().
+            // Unreleased clones (e.g. LabVIEW AI tasks still holding cloned
+            // cv::Mat*) keep gpuCacheY/gpuCacheUV allocated.  We must cudaFree
+            // them NOW while the CUDA context is still alive.  After close()
+            // destroys the context, cudaFree would crash.
+            int forceReleased = ANSGpuFrameRegistry::instance().forceReleaseByOwner(this);
+            if (forceReleased > 0) {  // the drain/free below is skipped entirely when nothing was force-released
+                _logger.LogWarn("ANSRTSPClient::Destroy",
+                    std::format("force-released {} GPU frame(s) with unreleased clones", forceReleased),
+                    __FILE__, __LINE__);
+                // Drain and cudaFree the GPU buffers while CUDA context is alive
+                // Sync all GPU streams before freeing to avoid illegal access
+                cudaDeviceSynchronize();  // NOTE(review): syncs the current device only; the frees below may switch to entry.deviceIdx
+                auto gpuPending = ANSGpuFrameRegistry::instance().drain_gpu_pending();  // NOTE(review): duplicates anscv_gpu_ops::detail::drainAndFreeGpuPending()
+                if (!gpuPending.empty()) {
+                    RTSP_DBG("[Destroy] cudaFree %zu GPU ptrs before close()", gpuPending.size());
+                    int prevDev = -1;
+                    cudaGetDevice(&prevDev);
+                    for (auto& entry : gpuPending) {
+                        if (entry.ptr) {
+                            if (entry.deviceIdx >= 0) cudaSetDevice(entry.deviceIdx);
+                            cudaFree(entry.ptr);  // return code is ignored here
+                        }
+                    }
+                    if (prevDev >= 0) cudaSetDevice(prevDev);
+                }
+                // Also drain any pending AVFrames
+                auto avPending = ANSGpuFrameRegistry::instance().drain_pending();
+                for (void* p : avPending) {
+                    AVFrame* f = static_cast<AVFrame*>(p);
+                    av_frame_free(&f);
+                }
+            }
+            ANSGpuFrameRegistry::instance().invalidateOwner(this);
+            _inFlightFrames.store(0, std::memory_order_release);  // counter is reset even when the wait above timed out
+
+            clientToClose = std::move(_playerClient);  // NOTE(review): presumably leaves _playerClient empty — Reconnect() dereferences it unguarded
+        }
+        // CUDA cleanup happens here, outside the mutex — now safe.
+        // All GPU frames owned by this client have been force-freed above.
+        if (clientToClose) {
+            clientToClose->close();
         }
     }
    static void VerifyGlobalANSRTSPLicense(const std::string& licenseKey) {
@@ -146,10 +230,81 @@ namespace ANSCENTER {
        _playerClient->setCrop(crop);
    }
     bool ANSRTSPClient::Reconnect() {  // Order: clear _isPlaying + release GPU frames (locked), close() (unlocked), Setup() + play() (locked).
-        std::lock_guard<std::recursive_mutex> lock(_mutex);
+        // 1. Mark as not-playing under the mutex FIRST.  This makes GetImage()
+        //    return the cached _pLastFrame instead of calling into the player,
+        //    preventing use-after-free when close() destroys CUDA resources.
+        {
+            std::unique_lock<std::recursive_mutex> lock(_mutex);  // unique_lock because wait_for() below must unlock/relock it
+            _isPlaying = false;
+
+            // --- Inference guard: wait for in-flight frames to finish ---
+            // Same guard as Destroy(): close() will free NVDEC surfaces, so
+            // we must wait for any inference engines still reading NV12 data
+            // via zero-copy CUDA device pointers.
+            int inFlight = _inFlightFrames.load(std::memory_order_acquire);
+            if (inFlight > 0) {
+                _logger.LogInfo("ANSRTSPClient::Reconnect",
+                    std::format("waiting for {} in-flight inference frame(s)...", inFlight),
+                    __FILE__, __LINE__);
+                bool done = _inFlightDone.wait_for(lock, std::chrono::seconds(5), [this] {  // bounded wait: gives up after 5 s
+                    return _inFlightFrames.load(std::memory_order_acquire) <= 0;
+                });
+                if (!done) {
+                    _logger.LogWarn("ANSRTSPClient::Reconnect",
+                        std::format("timed out waiting for in-flight frames "
+                                    "(still {} in-flight) — force-releasing GPU frames",
+                                    _inFlightFrames.load()),
+                        __FILE__, __LINE__);
+                }
+            }
+
+            // Force-release GPU frames before close() — same as Destroy().
+            int forceReleased = ANSGpuFrameRegistry::instance().forceReleaseByOwner(this);
+            if (forceReleased > 0) {  // the drain/free below is skipped entirely when nothing was force-released
+                _logger.LogWarn("ANSRTSPClient::Reconnect",
+                    std::format("force-released {} GPU frame(s) with unreleased clones", forceReleased),
+                    __FILE__, __LINE__);
+                // Sync all GPU streams before freeing
+                cudaDeviceSynchronize();  // NOTE(review): syncs the current device only; the frees below may switch to entry.deviceIdx
+                auto gpuPending = ANSGpuFrameRegistry::instance().drain_gpu_pending();  // NOTE(review): duplicates anscv_gpu_ops::detail::drainAndFreeGpuPending()
+                if (!gpuPending.empty()) {
+                    int prevDev = -1;
+                    cudaGetDevice(&prevDev);
+                    for (auto& entry : gpuPending) {
+                        if (entry.ptr) {
+                            if (entry.deviceIdx >= 0) cudaSetDevice(entry.deviceIdx);
+                            cudaFree(entry.ptr);  // return code is ignored here
+                        }
+                    }
+                    if (prevDev >= 0) cudaSetDevice(prevDev);
+                }
+                auto avPending = ANSGpuFrameRegistry::instance().drain_pending();
+                for (void* p : avPending) {
+                    AVFrame* f = static_cast<AVFrame*>(p);
+                    av_frame_free(&f);
+                }
+            }
+            ANSGpuFrameRegistry::instance().invalidateOwner(this);
+            _inFlightFrames.store(0, std::memory_order_release);  // counter is reset even when the wait above timed out
+        }
+
+        // 2. close() does CUDA cleanup (cuArrayDestroy/cuMemFree) — run outside
+        //    _mutex to avoid deadlocking with nvcuda64 SRW lock held by inference.
+        //    Safe now because GetImage()/GetNV12Frame() won't touch the player
+        //    while _isPlaying == false, and all in-flight frames have been released.
+        _logger.LogInfo("ANSRTSPClient::Reconnect",
+            "calling close() — NVDEC decoder will be destroyed", __FILE__, __LINE__);
+        RTSP_DBG("[Reconnect] BEFORE close() this=%p", (void*)this);
         _playerClient->close();  // NOTE(review): no null check — Destroy() moves _playerClient out; confirm Reconnect() cannot follow it
+        RTSP_DBG("[Reconnect] AFTER close() this=%p", (void*)this);
+
+        // 3. Re-setup and play under the mutex.
+        std::lock_guard<std::recursive_mutex> lock(_mutex);
+        _logger.LogInfo("ANSRTSPClient::Reconnect",
+            "calling Setup() + play()", __FILE__, __LINE__);
         Setup();
         _isPlaying = _playerClient->play();
+        RTSP_DBG("[Reconnect] DONE isPlaying=%d this=%p", (int)_isPlaying, (void*)this);
         return _isPlaying;
     }
    void ANSRTSPClient::EnableAudio(bool status) {
@@ -169,10 +324,22 @@ namespace ANSCENTER {
    }

     bool ANSRTSPClient::Stop() {
+        // Grab the player pointer and clear _isPlaying under the lock,
+        // then call stop() OUTSIDE the mutex.  stop() internally calls
+        // StopVideoDecoder -> decoder->flush() which does CUDA calls
+        // that can block on the nvcuda64 SRW lock.  Holding _mutex
+        // during that time blocks all other operations on this client
+        // and contributes to the convoy when many clients stop at once.
+        CRtspPlayer* player = nullptr;
+        {
            std::lock_guard<std::recursive_mutex> lock(_mutex);
            if (_isPlaying) {
-            _playerClient->stop();
                _isPlaying = false;
+                player = _playerClient.get();
+            }
+        }
+        if (player) {
+            player->stop();
        }
        return true;
    }
@@ -759,10 +926,12 @@ namespace ANSCENTER {
    }
     AVFrame* ANSRTSPClient::GetNV12Frame() {  // Under _mutex: nullptr while _isPlaying is false, otherwise the player's NV12 frame.
         std::lock_guard<std::recursive_mutex> lock(_mutex);
+        if (!_isPlaying) return nullptr;  // Player may be mid-reconnect (CUDA resources freed)
         return _playerClient->getNV12Frame();  // Returns clone, caller must av_frame_free
     }
     AVFrame* ANSRTSPClient::GetCudaHWFrame() {  // Under _mutex: nullptr while _isPlaying is false, otherwise forwards to the player.
         std::lock_guard<std::recursive_mutex> lock(_mutex);
+        if (!_isPlaying) return nullptr;  // Player may be mid-reconnect (CUDA resources freed)
         return _playerClient->getCudaHWFrame();  // Callers in this file hand the result to gpu_frame_attach_cuda(), which takes ownership.
     }
    bool ANSRTSPClient::IsCudaHWAccel() {
@@ -810,6 +979,11 @@ extern "C" __declspec(dllexport) int CreateANSRTSPHandle(ANSCENTER::ANSRTSPClien
        if (_username.empty() && _password.empty()) result = ptr->Init(licenseKey, url);
        else result = ptr->Init(licenseKey, username, password, url);
        if (result) {
+            // Default to CUDA/NVDEC HW decoding (mode 7) for NV12 zero-copy
+            // fast path.  LabVIEW may not call SetRTSPHWDecoding after
+            // destroy+recreate cycles, so this ensures the new handle always
+            // uses the GPU decode path instead of falling back to D3D11VA/CPU.
+            ptr->SetHWDecoding(7);  // HW_DECODING_CUDA
            *Handle = ptr.release();
            extern void anscv_unregister_handle(void*);
            extern void anscv_register_handle(void*, void(*)(void*));
@@ -830,9 +1004,37 @@ extern "C" __declspec(dllexport) int ReleaseANSRTSPHandle(ANSCENTER::ANSRTSPClie
    try {
        extern void anscv_unregister_handle(void*);
        anscv_unregister_handle(*Handle);
-        // unique_ptr destructor calls ~ANSRTSPClient which calls Destroy() — no need to call Destroy() separately
-        std::unique_ptr<ANSCENTER::ANSRTSPClient> ptr(*Handle);
+
+        // Grab the raw pointer and NULL the caller's handle immediately.
+        // This prevents the caller (LabVIEW) from issuing new calls.
+        ANSCENTER::ANSRTSPClient* raw = *Handle;
        *Handle = nullptr;
+
+        // Mark as not-playing under _mutex ONLY.  This makes
+        // GetImage()/GetNV12Frame()/GetCudaHWFrame() return empty/null
+        // on any subsequent call, and prevents NEW NV12 GPU surface
+        // pointers from being handed out.
+        //
+        // Do NOT call Destroy()/close() here — close() frees the
+        // NVDEC GPU surfaces (cuArrayDestroy/cuMemFree) which may
+        // still be in use by a CUDA inference kernel that received
+        // the NV12 pointer from a GetRTSPCVImage call that already
+        // completed before this Release was called.
+        {
+            // Use the client's _mutex to safely set _isPlaying = false.
+            // This is the same lock GetImage/GetNV12Frame acquire.
+            raw->Stop();  // sets _isPlaying = false, stops playback
+        }
+
+        // Defer the full cleanup (Destroy + delete) to a background thread
+        // so LabVIEW's UI thread is not blocked.  Destroy() now waits
+        // precisely for in-flight inference to finish (via _inFlightFrames
+        // counter + condition variable) instead of the old 500ms sleep hack.
+        std::thread([raw]() {
+            try { raw->Destroy(); } catch (...) {}
+            try { delete raw; } catch (...) {}
+        }).detach();
+
        return 0;
    } catch (...) {
        if (Handle) *Handle = nullptr;
@@ -882,21 +1084,58 @@ extern "C" __declspec(dllexport) int GetRTSPCVImage(

        // Attach NV12 frame for GPU fast-path inference (side-table registry)
        // attach() takes ownership — do NOT av_frame_free here
+        //
+        // CRITICAL: TryIncrementInFlight() MUST be called BEFORE GetCudaHWFrame().
+        // It atomically checks _isPlaying and increments _inFlightFrames under
+        // the same mutex, so Reconnect() cannot call close() while we're doing
+        // the D2D copy from NVDEC surfaces inside gpu_frame_attach_cuda().
        int gpuIdx = (*Handle)->GetHWDecodingGpuIndex();
+        bool inFlightGuardHeld = (*Handle)->TryIncrementInFlight();
+        RTSP_DBG("[GetRTSPCVImage] mat=%p gpuIdx=%d inFlightGuard=%d",
+                (void*)*image, gpuIdx, (int)inFlightGuardHeld);
+
+        if (inFlightGuardHeld) {
            AVFrame* cudaHW = (*Handle)->GetCudaHWFrame();
            if (cudaHW) {
-            // CUDA zero-copy: frame data[0]/data[1] are CUDA device pointers.
-            // Also attach CPU NV12 as fallback for cross-GPU inference
-            // (when decode GPU != inference GPU, CUDA ptrs aren't accessible).
+                RTSP_DBG("[GetRTSPCVImage] cudaHW: %dx%d data[0]=%p data[1]=%p",
+                        cudaHW->width, cudaHW->height,
+                        (void*)cudaHW->data[0], (void*)cudaHW->data[1]);
                AVFrame* cpuNV12 = (*Handle)->GetNV12Frame();
                gpu_frame_attach_cuda(*image, cudaHW, gpuIdx, timeStamp, cpuNV12);
            } else {
+                // HW decode not active — try CPU NV12
                AVFrame* nv12 = (*Handle)->GetNV12Frame();
                if (nv12) {
                    gpu_frame_attach(*image, nv12, gpuIdx, timeStamp);
                }
            }

+            // Wire up the registry callback to release the in-flight guard.
+            // TryIncrementInFlight already incremented; DecrementInFlight fires
+            // when the last clone of this frame is released after inference.
+            auto* gpuData = ANSGpuFrameRegistry::instance().lookup(*image);
+            RTSP_DBG("[GetRTSPCVImage] after attach: gpuData=%p yPlane=%p isCuda=%d gpuCacheY=%p",
+                    (void*)gpuData,
+                    gpuData ? (void*)gpuData->yPlane : nullptr,
+                    gpuData ? (int)gpuData->isCudaDevicePtr : -1,
+                    gpuData ? gpuData->gpuCacheY : nullptr);
+            if (gpuData) {
+                gpuData->ownerClient = *Handle;
+                gpuData->onReleaseFn = [](void* client) {
+                    static_cast<ANSCENTER::ANSRTSPClient*>(client)->DecrementInFlight();
+                };
+                // NOTE: Do NOT call IncrementInFlight() again here —
+                // TryIncrementInFlight() already did it above.
+            } else {
+                // No gpuData registered (attach failed?) — release the guard
+                (*Handle)->DecrementInFlight();
+            }
+        } else {
+            // Player is stopping/reconnecting — skip CUDA path entirely.
+            // GetImage() already returned a cached BGR frame, which is safe.
+            RTSP_DBG("[GetRTSPCVImage] SKIP CUDA — player not playing (reconnecting?)");
+        }
+
        return 1;  // Success
    }
    catch (const cv::Exception& e) {
--- a/modules/ANSCV/ANSRTSP.h
+++ b/modules/ANSCV/ANSRTSP.h
@@ -16,6 +16,8 @@
 #include <opencv2/imgproc.hpp>
 #include <opencv2/highgui.hpp>
 #include <opencv2/opencv.hpp>
+#include <atomic>
+#include <condition_variable>

 namespace ANSCENTER
 {
@@ -37,7 +39,36 @@ namespace ANSCENTER
 		int64_t  _pts;
 		bool	 _isPlaying;
 		std::recursive_mutex		_mutex;
+
+		// --- Per-client inference guard ---
+		// Tracks how many GPU frames from this client are currently in-flight
+		// (grabbed by GetRTSPCVImage but not yet released after inference).
+		// Destroy() waits for this to reach 0 before freeing NVDEC surfaces,
+		// preventing the use-after-free crash when LabVIEW stops a camera
+		// while AI inference is still reading CUDA device pointers.
+		std::atomic<int>            _inFlightFrames{0};
+		std::condition_variable_any _inFlightDone;
 	public:
+		void IncrementInFlight() { _inFlightFrames.fetch_add(1, std::memory_order_acq_rel); }  // unconditional: unlike TryIncrementInFlight(), takes no lock and does not check _isPlaying
+		void DecrementInFlight() {  // NOTE(review): nothing here guards against an unmatched call driving the count below 0
+			if (_inFlightFrames.fetch_sub(1, std::memory_order_acq_rel) <= 1) {  // fetch_sub returns the pre-decrement value: true when the count has just reached 0 (or was already <= 0)
+				_inFlightDone.notify_all();  // NOTE(review): sent without holding _mutex — a waiter that tests the counter under _mutex and then blocks can miss this wake-up; confirm the waiter uses a timed wait or re-checks
+			}
+		}
+		// Atomically check _isPlaying AND increment _inFlightFrames under the
+		// same mutex.  Returns true if the caller may proceed to access CUDA
+		// resources (GetCudaHWFrame + D2D copy).  Returns false if the player
+		// is stopping/reconnecting — caller must NOT touch CUDA resources.
+		//
+		// Intended to close the race window where Reconnect() sets _isPlaying=false
+		// and calls close() while GetRTSPCVImage is between GetCudaHWFrame()
+		// and the D2D copy in gpu_frame_attach_cuda().  NOTE(review): _mutex is released on return, so this only holds if close() waits for _inFlightFrames to drain — confirm for Reconnect().
+		bool TryIncrementInFlight() {
+			std::lock_guard<std::recursive_mutex> lock(_mutex);
+			if (!_isPlaying) return false;  // player stopped/reconnecting: leave the counter untouched
+			_inFlightFrames.fetch_add(1, std::memory_order_acq_rel);
+			return true;
+		}
 		ANSRTSPClient();
 		~ANSRTSPClient() noexcept;
 		[[nodiscard]] bool Init(std::string licenseKey, std::string url);
--- a/modules/ANSCV/ANSSRT.cpp
+++ b/modules/ANSCV/ANSSRT.cpp
@@ -48,13 +48,19 @@ namespace ANSCENTER {
 		Destroy();
    }
    void ANSSRTClient::Destroy() {
+        decltype(_playerClient) clientToClose;  // receives the client so close() can run after _mutex is released
+        {
            std::lock_guard<std::recursive_mutex> lock(_mutex);
            if (_playerClient) {
                if (_isPlaying) {
                    _playerClient->stop();
                    _isPlaying = false;
                }
-            _playerClient->close();
+            }
+            clientToClose = std::move(_playerClient);  // NOTE(review): leaves _playerClient moved-from (null for std smart pointers); Reconnect() calls _playerClient->close() with no null check — confirm it cannot run after or during Destroy()
+        }
+        if (clientToClose) {
+            clientToClose->close();  // executed outside the _mutex scope above
        }
    }
    static void VerifyGlobalANSSRTLicense(const std::string& licenseKey) {
@@ -124,8 +130,12 @@ namespace ANSCENTER {
        }
    }
    bool ANSSRTClient::Reconnect() {
+        {
            std::lock_guard<std::recursive_mutex> lock(_mutex);
+            _isPlaying = false;
+        }
        _playerClient->close();
+        std::lock_guard<std::recursive_mutex> lock(_mutex);
        Setup();
        _isPlaying = _playerClient->play();
        return _isPlaying;
@@ -155,10 +165,16 @@ namespace ANSCENTER {
    }

    bool ANSSRTClient::Stop() {
+        decltype(_playerClient.get()) player = nullptr;  // stays null unless the client was playing
+        {
            std::lock_guard<std::recursive_mutex> lock(_mutex);
            if (_isPlaying) {
-            _playerClient->stop();
                _isPlaying = false;
+                player = _playerClient.get();  // non-owning pointer captured under the lock
+            }
+        }
+        if (player) {
+            player->stop();  // NOTE(review): raw pointer used after _mutex is released — a concurrent Destroy() moves the client out, closes it and lets it go out of scope; confirm Stop() and Destroy() cannot overlap
        }
        return true;
    }
--- a/modules/ANSCV/ANSVideoPlayer.cpp
+++ b/modules/ANSCV/ANSVideoPlayer.cpp
@@ -40,16 +40,17 @@ namespace ANSCENTER {
 		catch (...) {}
 	}
 	void ANSVIDEOPLAYER::Destroy() {
+		// Move HW player out of lock scope — close() does CUDA cleanup
+		// (cuArrayDestroy/cuMemFree) which must not run under _mutex
+		// to avoid deadlocking with nvcuda64 SRW lock held by inference.
+		decltype(_hwPlayer) hwPlayerToClose;
+		{
 			std::lock_guard<std::recursive_mutex> lock(_mutex);
 			try {
-			// --- HW decode cleanup ---
 				if (_hwPlayer) {
-				try {
-					_hwPlayer->stop();
-					_hwPlayer->close();
-				} catch (...) {}
-				_hwPlayer.reset();  // releases CFilePlayer + HWDecoderPool slot
+					try { _hwPlayer->stop(); } catch (...) {}
 				}
+				hwPlayerToClose = std::move(_hwPlayer);
 				_hwDecodeActive = false;
 				_hwGpuIndex = -1;
 				_hwCudaAccel = false;
@@ -77,6 +78,13 @@ namespace ANSCENTER {
 		catch (...) {
 			_logger.LogError("ANSVIDEOPLAYER::Destroy.", "Unknown exception", __FILE__, __LINE__);
 		}
+		} // end lock scope
+
+		// CUDA cleanup happens here, outside the mutex
+		if (hwPlayerToClose) {
+			try { hwPlayerToClose->close(); } catch (...) {}
+			hwPlayerToClose.reset();
+		}
 	}

 	static void VerifyGlobalANSVPLicense(const std::string& licenseKey) {
@@ -187,15 +195,25 @@ namespace ANSCENTER {
 	}
 	
 	bool ANSVIDEOPLAYER::Reconnect() {
+		// HW decoder close() does CUDA cleanup — run outside _mutex
+		// to avoid deadlocking with nvcuda64 SRW lock held by inference.
+		decltype(_hwPlayer) hwPlayerToClose;
+		{
+			std::lock_guard<std::recursive_mutex> lock(_mutex);
+			_isPlaying = false;  // GetImage() returns cached frame while we reconnect
+			if (_hwPlayer) {
+				try { _hwPlayer->stop(); } catch (...) {}
+				hwPlayerToClose = std::move(_hwPlayer);
+			}
+		}
+		if (hwPlayerToClose) {
+			try { hwPlayerToClose->close(); } catch (...) {}
+			hwPlayerToClose.reset();
+		}
+
 		std::lock_guard<std::recursive_mutex> lock(_mutex);
 		try {
 			_currentFrame = 0;
-
-			// --- HW decode: destroy and re-setup ---
-			if (_hwPlayer) {
-				try { _hwPlayer->stop(); _hwPlayer->close(); } catch (...) {}
-				_hwPlayer.reset();
-			}
 			_hwDecodeActive = false;
 			_hwGpuIndex = -1;
 			_hwCudaAccel = false;
@@ -266,15 +284,17 @@ namespace ANSCENTER {
 		}
 	}
 	bool ANSVIDEOPLAYER::Stop() {
+		decltype(_hwPlayer.get()) hwPlayer = nullptr;
+		{
 			std::lock_guard<std::recursive_mutex> lock(_mutex);
 			try {
 				// --- HW decode path ---
 				if (_hwDecodeActive && _hwPlayer) {
-				_hwPlayer->stop();
 					_isPlaying = false;
-				return true;
+					hwPlayer = _hwPlayer.get();
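+					// stop() is called below, after the lock is released; skip cap path. NOTE(review): that call also sits outside this try block, so an exception from stop() is no longer caught here — confirm intended.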
 				}
-
+				else {
 					// --- cv::VideoCapture fallback ---
 					if (cap.isOpened()) {
 						try {
@@ -296,11 +316,16 @@ namespace ANSCENTER {
 					_isPlaying = false;
 					return true;
 				}
+			}
 			catch (const std::exception& e) {
 				this->_logger.LogError("ANSVIDEOPLAYER::Stop. Exception occurred:", e.what(), __FILE__, __LINE__);
 				return false;
 			}
-
+		}
+		if (hwPlayer) {
+			hwPlayer->stop();
+		}
+		return true;
 	}
 	void ANSVIDEOPLAYER::SetBBox(cv::Rect bbox) {
 		std::lock_guard<std::recursive_mutex> lock(_mutex);
--- a/modules/ANSLPR/ANSLPR_CPU.cpp
+++ b/modules/ANSLPR/ANSLPR_CPU.cpp
@@ -378,7 +378,7 @@ namespace ANSCENTER {
        }
    }
    std::vector<Object> ANSALPR_CPU::RunInference(const cv::Mat& input, const std::string &cameraId) {
-        std::lock_guard<std::recursive_mutex> lock(_mutex);
+        // No coarse _mutex — sub-components have their own fine-grained locks.
        std::vector<Object> output;
        output.clear();
        // Initial validation
@@ -419,17 +419,18 @@ namespace ANSCENTER {
 #ifdef FNS_DEBUG  // Corrected preprocessor directive
            cv::Mat draw = input.clone();
 #endif 
-            _detectedArea = cv::Rect(0, 0, frame.cols, frame.rows);
-            if ((_detectedArea.width > 50) && (_detectedArea.height > 50)) {
+            // Use local variable instead of shared _detectedArea for thread safety
+            cv::Rect detectedArea(0, 0, frame.cols, frame.rows);
+            if ((detectedArea.width > 50) && (detectedArea.height > 50)) {
 #ifdef FNS_DEBUG  // Corrected preprocessor directive
-                cv::rectangle(draw, _detectedArea, cv::Scalar(0, 0, 255), 2); // RED for detectedArea
+                cv::rectangle(draw, detectedArea, cv::Scalar(0, 0, 255), 2); // RED for detectedArea
 #endif
                // Ensure _lprDetector is valid
                if (!_lprDetector) {
                    this->_logger.LogFatal("ANSALPR_CPU::Inference", "_lprDetector is null", __FILE__, __LINE__);
                    return output;
                }
-                cv::Mat activeFrame = frame(_detectedArea).clone();
+                cv::Mat activeFrame = frame(detectedArea).clone();

                //std::vector<Object> lprOutputRaw = _lpDetector->RunInference(activeFrame, cameraId);
                //std::vector<Object> lprOutput = AdjustLicensePlateBoundingBoxes(lprOutputRaw, _detectedArea, frame.size(), 3.0);
@@ -471,8 +472,12 @@ namespace ANSCENTER {
                            lprObject.cameraId = cameraId;
                            lprObject.polygon = RectToNormalizedPolygon(lprObject.box, input.cols, input.rows);

-                            // OCR inference
-                            std::vector<PaddleOCR::OCRPredictResult> res_ocr = ppocr->ocr(alignedLPR);
+                            // OCR inference (ppocr is not thread-safe, use fine-grained lock)
+                            std::vector<PaddleOCR::OCRPredictResult> res_ocr;
+                            {
+                                std::lock_guard<std::mutex> ocrLock(_ocrMutex);
+                                res_ocr = ppocr->ocr(alignedLPR);
+                            }
                            std::string ocrText;

                            if (!res_ocr.empty() && res_ocr.size() < 3) {
@@ -515,13 +520,13 @@ namespace ANSCENTER {
        return output;
    }
    bool ANSALPR_CPU::Inference(const cv::Mat& input, std::string& lprResult) {
-        std::lock_guard<std::recursive_mutex> lock(_mutex);
+        // No coarse _mutex — delegates to Inference(input, lprResult, cameraId)
        if (input.empty()) return false;
        if ((input.cols < 5) || (input.rows < 5)) return false;
 		return Inference(input, lprResult, "CustomCam");
    }
    bool ANSALPR_CPU::Inference(const cv::Mat& input, std::string& lprResult, const std::string & cameraId) {
-        std::lock_guard<std::recursive_mutex> lock(_mutex);
+        // No coarse _mutex — sub-components have fine-grained locks.
        std::vector<Object> output;
        output.clear();
        if (!_licenseValid) {
@@ -587,7 +592,12 @@ namespace ANSCENTER {
                        cv::Mat lprImage = frame(lprPos).clone();
                        lprObject.cameraId = cameraId;
                        lprObject.polygon = RectToNormalizedPolygon(lprObject.box, input.cols, input.rows);
-                        std::vector<PaddleOCR::OCRPredictResult>  res_ocr = ppocr->ocr(lprImage);
+                        // ppocr is not thread-safe, use fine-grained lock
+                        std::vector<PaddleOCR::OCRPredictResult> res_ocr;
+                        {
+                            std::lock_guard<std::mutex> ocrLock(_ocrMutex);
+                            res_ocr = ppocr->ocr(lprImage);
+                        }
                        int detectionSize = res_ocr.size();
                        if ((detectionSize > 0) && (detectionSize < 3)) {
                            for (int n = 0; n < res_ocr.size(); n++) { // number of detections
@@ -613,7 +623,7 @@ namespace ANSCENTER {
        }
    }
    bool ANSALPR_CPU::Inference(const cv::Mat& input, const std::vector<cv::Rect> & Bbox, std::string& lprResult) {
-        std::lock_guard<std::recursive_mutex> lock(_mutex);
+        // No coarse _mutex — delegates to Inference(input, Bbox, lprResult, cameraId)
        if (input.empty()) return false;
        if ((input.cols < 5) || (input.rows < 5)) return false;
        return Inference(input, Bbox, lprResult, "CustomCam");
@@ -622,7 +632,7 @@ namespace ANSCENTER {
    bool ANSALPR_CPU::Inference(const cv::Mat& input, const std::vector<cv::Rect>& Bbox,
        std::string& lprResult, const std::string& cameraId)
    {
-        std::lock_guard<std::recursive_mutex> lock(_mutex);
+        // No coarse _mutex — sub-components have fine-grained locks.

        // Early validation
        if (!_licenseValid) {
@@ -668,16 +678,12 @@ namespace ANSCENTER {
        }

        try {
-            // Convert grayscale to BGR if necessary
-            const cv::Mat* framePtr;
+            // Convert grayscale to BGR if necessary (use local buffer for thread safety)
+            cv::Mat localFrame;
            if (input.channels() == 1) {
-                cv::cvtColor(input, this->_frameBuffer, cv::COLOR_GRAY2BGR);
-                framePtr = &this->_frameBuffer;
+                cv::cvtColor(input, localFrame, cv::COLOR_GRAY2BGR);
            }
-            else {
-                framePtr = &input;
-            }
-            const cv::Mat& frame = *framePtr;
+            const cv::Mat& frame = (input.channels() == 1) ? localFrame : input;

            const int frameWidth = frame.cols;
            const int frameHeight = frame.rows;
@@ -794,7 +800,12 @@ namespace ANSCENTER {
        cv::Mat lprImage = frame(plateRect);
        cv::Mat alignedLPR = enhanceForOCR(lprImage);

-        std::vector<PaddleOCR::OCRPredictResult> res_ocr = ppocr->ocr(alignedLPR);
+        // ppocr is not thread-safe, use fine-grained lock
+        std::vector<PaddleOCR::OCRPredictResult> res_ocr;
+        {
+            std::lock_guard<std::mutex> ocrLock(_ocrMutex);
+            res_ocr = ppocr->ocr(alignedLPR);
+        }

        const size_t detectionSize = res_ocr.size();
        if (detectionSize == 0 || detectionSize >= 3) {
--- a/modules/ANSLPR/ANSLPR_CPU.h
+++ b/modules/ANSLPR/ANSLPR_CPU.h
@@ -5,6 +5,7 @@
 #include <list>
 #include <map>
 #include <string>
+#include <mutex>
 #include <utility>
 #include <vector>
 #include <include/paddleocr.h>
@@ -157,6 +158,7 @@ namespace ANSCENTER
                                                         "43B1", "68L1", "70G1", "36M1", "81N1", "90K1", "17B1", "64E1", "99D1", "60B2", "74L1", "60C1", "68M1", "63B7", "34B1", "69M1", "24B1", "15M1", "83Y1", "48C1", "95H1", "79X1", "17B6", "36E1", "38K1", "25N1", "25U1", "61B1", "36C1", "36B3", "38F1", "99G1", "69N1", "97D1", "92T1", "92B1", "88B1", "97G1", "14U1", "63A1", "26N1", "19D1", "93C1", "73B1", "84B1", "81K1", "18L1", "64D1", "35M1", "61N1", "83P1", "15S1", "82B1", "92U1", "43D1", "22L1", "63B5", "64G1", "27N1", "14X1", "62C1", "81D1", "38G1", "19F1", "34K1", "49P1", "89H1", "14T1", "19M1", "78D1", "76A1", "66K1", "66C1", "71C1", "37K1", "19G1", "15F1", "85C1", "49B1", "21B1", "89F1", "23M1", "66L1", "90B5", "93M1", "14P1", "77N1", "36B8", "86B1", "12U1", "63B3", "21L1", "36G5", "65G1", "82E1", "61H1", "65H1", "84A1", "23F1", "95C1", "99K1", "49G1", "92D1", "36K3", "92N1", "82X1", "83M1", "11N1", "14K1", "19H1", "93H1", "60A1", "79A1", "20D1", "90D1", "81C1", "66P1", "36K1", "92V1", "18B1", "37P1", "22Y1", "23H1", "26D1", "66G1", "78F1", "49C1", "26H1", "38P1", "47T1", "74H1", "63P1", "47D1", "15D1", "23D1", "68E1", "20B1", "49F1", "43K1", "65K1", "27Z1", "92S1", "79H1", "21E1", "35Y1", "14S1", "75E1", "24Y1", "12T1", "27P1", "77B1", "88H1", "60B3", "23P1", "61F1", "99H1", "23K1", "59A3", "26C1", "81B1", "74E1", "66B1", "22S1", "92P1", "93B1", "69B1", "81P1", "12H1", "62K1", "35A1", "77C1", "27V1", "68N1", "12D1", "64K1", "41A1", "12Z1", "76C1", "38B1", "78G1", "74K1", "69H1", "94A1", "61K1", "86B7", "82G1", "14N1", "82M1", "76E1", "18E1", "61C1", "15N1", "90A1", "77F1", "34D1", "47B1", "62S1", "43E1", "81M1", "92X1", "75B1", "34F1", "70H1", "62B1", "26B1", "60B4", "61A1", "12B1", "90T1", "92E1", "34C1", "47G1", "97B1", "25S1", "70E1", "93Y1", "47S1", "37F1", "28N1", "11K1", "38E1", "78M1", "74C1", "12S1", "75S1", "37A1", "28D1", "65L1", "22B1", "99B1", "74G1", "79K1", "76K1", "76H1", "23B1", "15R1", "36B1", "74D1", "62L1", "37E1", "78E1", "89K1", 
"26M1", "25F1", "48H1", "79D1", "43H1", "76F1", "36L1", "43L1", "21K1", "88L1", "27S1", "92K1", "77D1", "19N1", "66H1", "36H5", "62N1", "18G1", "75D1", "37L1", "68K1", "28C1", "26E1", "35N1", "85H1", "62D1", "27U1", "19E1", "99E1", "14Y1", "49L1", "66M1", "73F1", "70K1", "36F5", "97H1", "93E1", "68P1", "43F1", "48G1", "75K1", "62U1", "86B9", "65F1", "27L1", "70L1", "63B8", "78L1", "11Z1", "68C1", "18D1", "15L1", "99C1", "49E1", "84E1", "69E1", "38A1", "48D1", "68S1", "81E1", "84K1", "63B6", "24T1", "95A1", "86B4", "34M1", "84L1", "24V1", "14M1", "36H1", "15B1", "69F1", "47E1", "38H1", "88D1", "28E1", "60C2", "63B9", "75Y1", "21D1", "35H1", "68F1", "86B5", "15H1", "36B5", "83X1", "17B7", "12V1", "86B8", "95E1", "63B2", "74F1", "86C1", "48K1", "89M1", "85D1", "71C4", "34E1", "97C1", "88E1", "81F1", "60B5", "84M1", "92H1", "28L1", "34H1", "38X1", "82L1", "61E1", "82F1", "62P1", "93F1", "65B1", "93L1", "95B1", "15P1", "77G1", "28M1", "35B1", "68G1", "36C2", "68D1", "69K1", "14L1", "36M3", "24X1", "24Z1", "86A1", "88C1", "15E1", "77E1", "83E1", "47L1", "25T1", "89C1", "71C3", "49D1", "36L6", "48F1", "36B6", "34P1", "84D1", "15C1", "38M1", "85F1", "77K1", "86B3", "74B1", "78H1", "89G1", "64A2", "15K1", "85B1", "49K1", "21H1", "73C1", "47U1", "65E1", "18C1", "69D1", "63B1", "95G1", "19L1", "20G1", "76D1", "29A1", "68T1", "75L1", "12L1", "89L1", "37C1", "27B1", "19C1", "11H1", "81X1", "70B1", "11V1", "43G1", "22A1", "83C1", "75C1", "79C1", "22F1", "92F1", "81G1", "81T1", "28H1", "66N1", "71B1", "18H1", "76P1", "26F1", "81U1", "34N1", "64F1", "76N1", "24S1", "26P1", "63B4", "35T1", "36N1", "47F1", "81L1", "61G1", "77M1", "34G1", "26G1", "97F1", "62H1", "28F1", "62T1", "93G1", "73D1", "65A1", "47P1", "74P1", "82N1", "20E1", "36D1", "60B1", "49M1", "37H1", "37M1", "38D1", "84F1", "88F1", "36B2", "65C1", "92M1", "86B6", "75H1", "38L1", "20C1", "97E1", "85E1", "38N1", "26K1", "89B1", "99F1", "28B1", "34L1", "86B2", "66F1", "77L1", "27Y1", "68H1", "37D1", "92L1", "82K1", "99A1", 
"69L1", "76M1", "90B4", "48B1", "95D1", "20H1", "64H1", "79Z1", "92G1", "23G1", "21G1", "37G1", "35K1", "81H1", "83Z1", "76T1", "36F1", "36B4", "14B9", "47K1", "20K1", "62M1", "84H1", "62F1", "74A1", "18A1", "73H1", "37N1", "79N1", "61D1", "11P1", "15G1", "47N1", "19K1", "71C2", "81S1", "11M1", "60B7", "60B8", "62G1", "71A1", "24P1", "69A1", "38C1", "49N1", "21C1", "84G1", "37B1", "72A1", "88K1", "88G1", "83V1", "78C1", "73K1", "78K1", "73E189D1", "67A1", "27X1", "62A1", "18K1", "70F1", "36K5", "19B1", "49H1", "66S1", "12P1"};
        ALPRChecker     alprChecker;
        std::vector<std::string> ValidVNCarList = { "94H", "49F", "93A", "20F", "81H", "95R", "38R", "29F", "81F", "28G", "19A", "85B", "2", "43H", "51L", "28C", "21A", "51D", "50F", "24H", "93R", "92H", "71G", "75H", "86G", "30L", "79A", "82B", "79H", "78C", "61E", "70A", "90C", "72G", "34B", "17E", "18E", "78A", "37F", "51E", "71A", "28F", "47E", "83D", "81B", "84C", "71H", "76G", "92E", "36A", "69R", "30M", "27R", "71D", "19B", "34E", "38K", "88G", "68G", "30E", "68E", "25F", "74D", "98K", "89H", "36R", "84D", "61F", "49G", "25H", "17F", "14R", "36H", "47G", "90A", "68A", "83C", "26B", "15B", "61C", "15K", "47H", "78E", "75D", "15C", "63E", "34C", "36F", "38G", "15E", "93F", "22G", "60B", "94D", "62R", "24D", "11R", "12A", "76A", "94C", "97R", "24E", "26A", "15F", "72A", "49H", "62D", "98C", "71B", "61A", "12C", "27A", "78R", "51M", "69E", "76D", "78F", "49R", "81A", "64F", "29D", "18A", "19F", "21E", "92A", "65G", "86E", "62G", "61K", "47A", "23R", "14F", "95D", "36B", "74R", "11H", "24C", "11G", "66D", "63A", "43R", "70F", "86B", "61G", "47M", "67C", "37D", "43G", "14H", "90F", "51G", "86A", "11E", "29K", "85C", "83F", "24B", "98R", "19E", "61B", "90D", "82G", "14K", "74G", "72D", "85A", "19C", "37G", "98E", "74F", "28H", "90E", "89D", "35R", "97H", "83H", "95A", "20C", "65E", "15R", "73C", "37A", "38E", "77G", "94B", "17A", "75R", "98F", "65R", "76R", "20B", "24G", "25B", "73G", "62F", "29G", "77C", "22H", "14D", "23F", "93C", "19R", "15D", "47R", "79D", "60G", "77A", "82C", "63G", "21H", "81E", "25D", "12D", "37R", "36K", "84F", "98G", "28B", "51N", "18F", "50R", "74C", "35C", "30G", "64A", "95F", "18C", "99G", "99B", "37C", "76H", "60K", "67R", "75A", "83R", "28E", "65F", "17D", "92G", "23C", "60R", "90R", "38A", "43D", "50H", "43C", "77H", "47B", "89F", "82F", "65H", "89E", "62C", "24R", "26G", "84E", "17C", "65B", "34A", "12B", "64R", "29H", "71C", "88D", "79F", "76C", "98A", "69H", "22B", "29A", "72R", "67H", "48C", "22D", "60C", "35H", "38H", "63P", 
"70D", "49D", "18H", "89A", "72E", "92D", "26H", "73R", "85G", "20E", "98H", "69C", "18B", "73B", "22E", "34G", "30K", "20D", "50A", "34D", "15H", "34H", "71E", "62E", "64C", "51R", "82D", "99E", "70R", "18D", "92F", "94R", "24A", "85H", "11C", "73E", "95E", "86C", "94F", "86R", "37K", "23B", "20H", "73D", "95H", "35A", "89B", "82H", "67F", "70H", "97F", "29E", "97A", "51K", "68D", "37B", "82E", "18R", "86H", "35B", "43E", "35F", "95B", "70E", "21D", "27F", "36E", "63D", "68C", "50E", "36G", "75F", "21G", "29B", "93B", "22A", "18G", "43F", "93G", "62A", "83B", "28D", "75C", "22C", "21R", "25E", "23G", "97C", "75E", "79E", "19H", "47K", "65C", "35E", "20R", "68B", "89R", "67A", "75G", "81R", "78B", "77D", "78G", "20K", "36D", "66C", "38F", "27G", "19D", "67B", "84G", "22F", "61D", "20G", "48A", "76F", "48H", "92B", "85R", "26C", "65A", "70B", "38D", "14C", "66A", "73A", "49C", "74E", "68R", "66B", "74A", "49E", "17B", "69D", "51C", "85F", "21F", "99C", "17G", "72H", "94E", "51F", "92R", "60H", "21B", "93D", "19G", "86F", "51A", "66R", "72B", "26D", "64E", "93H", "12H", "97E", "60E", "82A", "60A", "83E", "27D", "64B", "11B", "11D", "76B", "95G", "14A", "61R", "21C", "30F", "23H", "89C", "97G", "62B", "63R", "88B", "98B", "90B", "67G", "69F", "73H", "20A", "72C", "65D", "68H", "51H", "79G", "70C", "90G", "66G", "83A", "77F", "63B", "64G", "25A", "88E", "68F", "99D", "26E", "94A", "48F", "34R", "61H", "90H", "74B", "14G", "12F", "15A", "27E", "69A", "35D", "12E", "85E", "25C", "29M", "89G", "17R", "78D", "84R", "95C", "15G", "28R", "99A", "69G", "48D", "97D", "27C", "78H", "14E", "79R", "73F", "88A", "48E", "48B", "64H", "99R", "14B", "77R", "75B", "88F", "84B", "11A", "67E", "12R", "50M", "11F", "79C", "49A", "43A", "88R", "77E", "48G", "51B", "81D", "74H", "93E", "37H", "88C", "71F", "94G", "38C", "29C", "43B", "30H", "81G", "28A", "26R", "66H", "66E", "17H", "79B", "49B", "63C", "98D", "81C", "69B", "63H", "85D", "26F", "22R", "83G", "37E", "12G", "77B", "35G", 
"62H", "60D", "60F", "99H", "70G", "76E", "84A", "72F", "25R", "27B", "30A", "47F", "34F", "97B", "23E", "36C", "66F", "48R", "92C", "71R", "23A", "50G", "47C", "82R", "63F", "84H", "38B", "47D", "67D", "25G", "86D", "88H", "64D", "24F", "23D", "99F" };
+        std::mutex _ocrMutex;  // Fine-grained lock for PaddleOCR (not thread-safe)
        std::unique_ptr<PaddleOCR::PPOCR> ppocr = std::make_unique<PaddleOCR::PPOCR>();
        [[nodiscard]] std::string AnalyseLicensePlateText(const std::string& ocrText);
        [[nodiscard]] char convertDigitToLetter(char c);
--- a/modules/ANSLPR/ANSLPR_OD.cpp
+++ b/modules/ANSLPR/ANSLPR_OD.cpp
@@ -863,7 +863,8 @@ namespace ANSCENTER {
 		}
 	}
 	std::vector<Object> ANSALPR_OD::RunInferenceSingleFrame(const cv::Mat& input, const std::string& cameraId) {
-	std::lock_guard<std::recursive_mutex> lock(_mutex);
+	// No coarse _mutex here — sub-components (detectors, alprChecker) have their own locks.
+	// LabVIEW semaphore controls concurrency at the caller level.

 	// Early validation
 	if (!_licenseValid) {
@@ -916,18 +917,19 @@ namespace ANSCENTER {
 		cv::Mat draw = input.clone();
 #endif

-		_detectedArea = cv::Rect(0, 0, frameWidth, frameHeight);
+		// Use local variable instead of shared _detectedArea for thread safety
+		cv::Rect detectedArea(0, 0, frameWidth, frameHeight);

-		if (_detectedArea.width <= 50 || _detectedArea.height <= 50) {
+		if (detectedArea.width <= 50 || detectedArea.height <= 50) {
 			return {};
 		}

 #ifdef FNS_DEBUG
-		cv::rectangle(draw, _detectedArea, cv::Scalar(0, 0, 255), 2);
+		cv::rectangle(draw, detectedArea, cv::Scalar(0, 0, 255), 2);
 #endif

 		// Run license plate detection
-		cv::Mat activeFrame = frame(_detectedArea);
+		cv::Mat activeFrame = frame(detectedArea);
 		std::vector<Object> lprOutput = _lpDetector->RunInference(activeFrame, cameraId);

 		if (lprOutput.empty()) {
@@ -1010,7 +1012,7 @@ namespace ANSCENTER {
 		return {};
 	}
 	std::string ANSALPR_OD::DetectLicensePlateString(const cv::Mat& lprROI, const std::string& cameraId) {
-		std::lock_guard<std::recursive_mutex> lock(_mutex);
+		// No coarse _mutex — _ocrDetector has its own m_inferenceMutex
 		try {
 			// convert lprROI to greyscale if it is not already
 			if (lprROI.empty()) {
@@ -1277,8 +1279,7 @@ namespace ANSCENTER {
 			return {};
 		}

-		std::lock_guard<std::recursive_mutex> lock(_mutex);
-
+		// No coarse _mutex — _lpColourDetector has its own m_inferenceMutex
 		try {
 			std::vector<Object> colourOutputs = _lpColourDetector->RunInference(lprROI, cameraId);

@@ -1310,8 +1311,9 @@ namespace ANSCENTER {
 			return DetectLPColourDetector(lprROI, cameraId);
 		}

-		// Check cache first (no GPU work needed)
+		// Check cache first (fine-grained lock, no GPU work)
 		{
+			std::lock_guard<std::mutex> cacheLock(_colourCacheMutex);
 			auto it = _colourCache.find(plateText);
 			if (it != _colourCache.end()) {
 				it->second.hitCount++;
@@ -1319,11 +1321,12 @@ namespace ANSCENTER {
 			}
 		}

-		// Cache miss — run the actual classifier
+		// Cache miss — run the actual classifier (no lock held during GPU inference)
 		std::string colour = DetectLPColourDetector(lprROI, cameraId);

-		// Store in cache
+		// Store in cache (fine-grained lock)
 		if (!colour.empty()) {
+			std::lock_guard<std::mutex> cacheLock(_colourCacheMutex);
 			if (_colourCache.size() >= COLOUR_CACHE_MAX_SIZE) {
 				_colourCache.clear();
 			}
@@ -1334,13 +1337,14 @@ namespace ANSCENTER {
 	}

 	bool ANSALPR_OD::Inference(const cv::Mat& input, std::string& lprResult) {
-		std::lock_guard<std::recursive_mutex> lock(_mutex);
+		// No coarse _mutex — delegates to Inference(input, lprResult, cameraId), which likewise takes no coarse _mutex
 		if (input.empty()) return false;
 		if ((input.cols < 5) || (input.rows < 5)) return false;
 		return Inference(input, lprResult, "CustomCam");
 	}
 	bool ANSALPR_OD::Inference(const cv::Mat& input, std::string& lprResult, const std::string& cameraId) {
-		std::lock_guard<std::recursive_mutex> lock(_mutex);
+		// No coarse _mutex — sub-components have their own fine-grained locks.
+		// LabVIEW semaphore controls concurrency at the caller level.

 		// Early validation
 		if (!_licenseValid) {
@@ -1518,14 +1522,14 @@ namespace ANSCENTER {
 		}
 	}
 	bool ANSALPR_OD::Inference(const cv::Mat& input, const std::vector<cv::Rect> & Bbox, std::string& lprResult) {
-		std::lock_guard<std::recursive_mutex> lock(_mutex);
+		// No coarse _mutex — delegates to Inference(input, Bbox, lprResult, cameraId)
 		if (input.empty()) return false;
 		if ((input.cols < 5) || (input.rows < 5)) return false;
 		return Inference(input, Bbox, lprResult, "CustomCam");
 	}
 	bool ANSALPR_OD::Inference(const cv::Mat& input, const std::vector<cv::Rect>& Bbox,std::string& lprResult, const std::string& cameraId)
 	{
-		std::lock_guard<std::recursive_mutex> lock(_mutex);
+		// No coarse _mutex — sub-components have their own fine-grained locks.

 		// Early validation
 		if (!_licenseValid) {
@@ -2177,12 +2181,10 @@ namespace ANSCENTER {
 		cv::Mat unsharp;
 		cv::addWeighted(denoised, 1.8, blurred, -0.8, 0, unsharp);

-		// Step 5: CLAHE contrast enhancement
-		if (!_clahe) {
-			_clahe = cv::createCLAHE(4.0, cv::Size(8, 8));
-		}
+		// Step 5: CLAHE contrast enhancement (thread-local for thread safety)
+		thread_local cv::Ptr<cv::CLAHE> tl_clahe = cv::createCLAHE(4.0, cv::Size(8, 8));
 		cv::Mat contrastEnhanced;
-		_clahe->apply(unsharp, contrastEnhanced);
+		tl_clahe->apply(unsharp, contrastEnhanced);

 		// Step 6: Laplacian edge sharpening
 		cv::Mat lap;
@@ -2718,6 +2720,7 @@ namespace ANSCENTER {

 	void ANSALPR_OD::ensureUniquePlateText(std::vector<Object>& results, const std::string& cameraId)
 	{
+		std::lock_guard<std::mutex> plateLock(_plateIdentitiesMutex);
 		auto& identities = _plateIdentities[cameraId];

 		// Option B: Auto-detect mode by counting detections.
--- a/modules/ANSLPR/ANSLPR_OD.h
+++ b/modules/ANSLPR/ANSLPR_OD.h
@@ -24,7 +24,7 @@ namespace ANSCENTER
        ANSCENTER::ModelConfig _lpdmodelConfig;
        ANSCENTER::ModelConfig _ocrModelConfig;
 		ANSCENTER::ModelConfig _lpColourModelConfig;
-        cv::Ptr<cv::CLAHE>     _clahe;  // Reusable CLAHE instance
+        // _clahe moved to thread-local in enhanceForOCR() for thread safety
        ANSCENTER::NV12PreprocessHelper _nv12Helper;  // NV12 crop for high-res plate OCR

        std::string             _lpdLabels;
@@ -147,6 +147,7 @@ namespace ANSCENTER
            int framesSinceLastSeen = 0;
        };
        // cameraId → list of tracked plate identities
+        std::mutex _plateIdentitiesMutex;  // Fine-grained lock for plate identity tracking
        std::unordered_map<std::string, std::vector<SpatialPlateIdentity>> _plateIdentities;
        static constexpr float PLATE_SPATIAL_MATCH_THRESHOLD = 0.3f; // IoU threshold for same plate
        void ensureUniquePlateText(std::vector<Object>& results, const std::string& cameraId);
@@ -176,6 +177,7 @@ namespace ANSCENTER
            std::string colour;
            int         hitCount = 0;
        };
+        std::mutex _colourCacheMutex;  // Fine-grained lock for colour cache only
        std::unordered_map<std::string, ColourCacheEntry> _colourCache;
        static constexpr size_t COLOUR_CACHE_MAX_SIZE = 200;

--- a/modules/ANSOCR/ANSOCR.cpp
+++ b/modules/ANSOCR/ANSOCR.cpp
@@ -118,7 +118,7 @@ namespace ANSCENTER {
 	}

 	std::vector<ANSCENTER::OCRObject> ANSOCR::RunInference(const cv::Mat& input, const std::string& cameraId) {
-		std::lock_guard<std::recursive_mutex> lock(_mutex);
+		// No coarse _mutex — ppOCR->Predict() / engine has its own internal lock
 		std::vector<ANSCENTER::OCRObject> OCRObjects;
 		OCRObjects.clear();
 		if (!_licenseValid) {
@@ -177,7 +177,7 @@ namespace ANSCENTER {


 	std::vector<ANSCENTER::OCRObject> ANSOCR::RunInference(const cv::Mat& input, const std::vector<cv::Rect>& Bbox) {
-		std::lock_guard<std::recursive_mutex> lock(_mutex);
+		// No coarse _mutex — ppOCR->Predict() / engine has its own internal lock
 		std::vector<ANSCENTER::OCRObject> OCRObjects;
 		OCRObjects.clear();
 		if (!_licenseValid) {
@@ -271,7 +271,7 @@ namespace ANSCENTER {


 	std::vector<ANSCENTER::OCRObject> ANSOCR::RunInference(const cv::Mat& input, const std::vector<cv::Rect>& Bbox, const std::string& cameraId) {
-		std::lock_guard<std::recursive_mutex> lock(_mutex);
+		// No coarse _mutex — ppOCR->Predict() / engine has its own internal lock
 		std::vector<ANSCENTER::OCRObject> OCRObjects;
 		OCRObjects.clear();
 		if (!_licenseValid) {
--- a/modules/ANSOCR/ANSOnnxOCR.cpp
+++ b/modules/ANSOCR/ANSOnnxOCR.cpp
@@ -80,7 +80,7 @@ std::vector<ANSCENTER::OCRObject> ANSONNXOCR::RunInference(const cv::Mat& input)
 }

 std::vector<ANSCENTER::OCRObject> ANSONNXOCR::RunInference(const cv::Mat& input, const std::string& cameraId) {
-	std::lock_guard<std::recursive_mutex> lock(_mutex);
+	// No coarse _mutex — _engine->ocr() has its own internal lock
 	std::vector<ANSCENTER::OCRObject> OCRObjects;

 	if (!_licenseValid) {
@@ -164,7 +164,7 @@ std::vector<ANSCENTER::OCRObject> ANSONNXOCR::RunInference(const cv::Mat& input,
 }

 std::vector<ANSCENTER::OCRObject> ANSONNXOCR::RunInference(const cv::Mat& input, const std::vector<cv::Rect>& Bbox) {
-	std::lock_guard<std::recursive_mutex> lock(_mutex);
+	// No coarse _mutex — _engine->ocr() has its own internal lock
 	std::vector<ANSCENTER::OCRObject> OCRObjects;

 	if (!_licenseValid) {
@@ -268,7 +268,7 @@ std::vector<ANSCENTER::OCRObject> ANSONNXOCR::RunInference(const cv::Mat& input,
 }

 std::vector<ANSCENTER::OCRObject> ANSONNXOCR::RunInference(const cv::Mat& input, const std::vector<cv::Rect>& Bbox, const std::string& cameraId) {
-	std::lock_guard<std::recursive_mutex> lock(_mutex);
+	// No coarse _mutex — _engine->ocr() has its own internal lock
 	std::vector<ANSCENTER::OCRObject> OCRObjects;

 	if (!_licenseValid) {
@@ -385,7 +385,7 @@ bool ANSONNXOCR::Destroy() {
 }

 std::pair<std::string, float> ANSONNXOCR::RecognizeText(const cv::Mat& croppedImage) {
-	std::lock_guard<std::recursive_mutex> lock(_mutex);
+	// No coarse _mutex — _engine->recognizeOnly() has its own internal lock
 	if (!_isInitialized || !_engine || croppedImage.empty()) return {"", 0.0f};
 	auto result = _engine->recognizeOnly(croppedImage);
 	return {result.text, result.score};
--- a/modules/ANSOCR/ANSRtOCR.cpp
+++ b/modules/ANSOCR/ANSRtOCR.cpp
@@ -90,7 +90,7 @@ std::vector<ANSCENTER::OCRObject> ANSRTOCR::RunInference(const cv::Mat& input) {
 }

 std::vector<ANSCENTER::OCRObject> ANSRTOCR::RunInference(const cv::Mat& input, const std::string& cameraId) {
-    std::lock_guard<std::recursive_mutex> lock(_mutex);
+    // No coarse _mutex — _engine->ocr() has its own internal lock
    std::vector<ANSCENTER::OCRObject> OCRObjects;

    if (!_licenseValid) {
@@ -178,7 +178,7 @@ std::vector<ANSCENTER::OCRObject> ANSRTOCR::RunInference(const cv::Mat& input, c
 }

 std::vector<ANSCENTER::OCRObject> ANSRTOCR::RunInference(const cv::Mat& input, const std::vector<cv::Rect>& Bbox) {
-    std::lock_guard<std::recursive_mutex> lock(_mutex);
+    // No coarse _mutex — _engine->ocr() has its own internal lock
    std::vector<ANSCENTER::OCRObject> OCRObjects;

    if (!_licenseValid) {
@@ -282,7 +282,7 @@ std::vector<ANSCENTER::OCRObject> ANSRTOCR::RunInference(const cv::Mat& input, c
 }

 std::vector<ANSCENTER::OCRObject> ANSRTOCR::RunInference(const cv::Mat& input, const std::vector<cv::Rect>& Bbox, const std::string& cameraId) {
-    std::lock_guard<std::recursive_mutex> lock(_mutex);
+    // No coarse _mutex — _engine->ocr() has its own internal lock
    std::vector<ANSCENTER::OCRObject> OCRObjects;

    if (!_licenseValid) {
@@ -379,7 +379,7 @@ std::vector<ANSCENTER::OCRObject> ANSRTOCR::RunInference(const cv::Mat& input, c
 }

 std::pair<std::string, float> ANSRTOCR::RecognizeText(const cv::Mat& croppedImage) {
-    std::lock_guard<std::recursive_mutex> lock(_mutex);
+    // No coarse _mutex — _engine->recognizeOnly() has its own internal lock
    if (!_isInitialized || !_engine || croppedImage.empty()) return {"", 0.0f};
    auto result = _engine->recognizeOnly(croppedImage);
    return {result.text, result.score};
--- a/modules/ANSODEngine/ANSODEngine.cpp
+++ b/modules/ANSODEngine/ANSODEngine.cpp
@@ -1455,7 +1455,7 @@ namespace ANSCENTER
 		}
 	}
 	std::vector<Object> ANSODBase::RunStaticInference(const cv::Mat& input, cv::Rect Bbox, const std::string& camera_id) {
-		std::lock_guard<std::recursive_mutex> lock(_mutex);
+		// No coarse _mutex — only uses local variables and virtual RunInference() which has its own engine lock
 		std::vector<Object> output;
 		output.clear();
 		try {
@@ -2100,7 +2100,8 @@ namespace ANSCENTER
 		}
 	}
 	std::vector<Object>   ANSODBase::RunInferenceWithOption(const cv::Mat& input, const std::string& camera_id, const std::string activeROIMode) {
-		std::lock_guard<std::recursive_mutex> lock(_mutex);
+		// No coarse _mutex — sub-components (engines, trackers) have their own locks.
+		// LabVIEW semaphore controls concurrency at the caller level.
 		try {
 			int mode = 0;
 			double confidenceThreshold = 0.35;
@@ -2116,8 +2117,11 @@ namespace ANSCENTER
 			if (confidenceThreshold <= 0) confidenceThreshold = 0;
 			if (confidenceThreshold > 1) confidenceThreshold = 1;

-			// Update model configuration with the new parameters
-			if(confidenceThreshold>0)_modelConfig.detectionScoreThreshold = confidenceThreshold;
+			// Update model configuration with the new parameters (brief lock for config)
+			if (confidenceThreshold > 0) {
+				std::lock_guard<std::recursive_mutex> cfgLock(_mutex);
+				_modelConfig.detectionScoreThreshold = confidenceThreshold;
+			}
 			switch (mode) {
 			case 0: // Normal mode
 				return RunInference(input, camera_id); //RunInference
--- a/modules/ANSODEngine/NV12PreprocessHelper.cpp
+++ b/modules/ANSODEngine/NV12PreprocessHelper.cpp
@@ -275,6 +275,26 @@ namespace ANSCENTER {
                              gpuData->gpuIndex == inferenceGpu;
        const bool useZeroCopy = isCudaDevice && gpuMatch;

+        // --- Debug: log pointer state and GPU routing flags before the planes are used ---
+        // Every %p argument is cast to void* (the only type %p is defined for).
+        {
+            char _nv12_dbg[512];
+            snprintf(_nv12_dbg, sizeof(_nv12_dbg),
+                "[NV12Helper] tryNV12: gpuData=%p yPlane=%p uvPlane=%p isCuda=%d "
+                "gpuIdx=%d infGpu=%d gpuMatch=%d zeroCopy=%d "
+                "gpuCacheY=%p gpuCacheUV=%p gpuCacheValid=%d refcount=%d %dx%d\n",
+                (void*)gpuData, (void*)gpuData->yPlane, (void*)gpuData->uvPlane,
+                (int)isCudaDevice, gpuData->gpuIndex, inferenceGpu, (int)gpuMatch, (int)useZeroCopy,
+                (void*)gpuData->gpuCacheY, (void*)gpuData->gpuCacheUV,
+                (int)gpuData->gpuCacheValid,
+                gpuData->refcount.load(),
+                frameW, frameH);
+#ifdef _WIN32
+            OutputDebugStringA(_nv12_dbg);  // debugger output exists only on Windows
+#endif
+            fprintf(stderr, "%s", _nv12_dbg);
+        }
+
        // Effective plane pointers — for zero-copy, use CUDA device ptrs;
        // for CPU upload, use the CPU snapshot buffers.
        uint8_t* effYPlane;
@@ -283,7 +303,7 @@ namespace ANSCENTER {
        int      effUvLinesize;

        if (useZeroCopy) {
-            // Same GPU: wrap NVDEC device pointers directly
+            // Same GPU: wrap owned CUDA device pointers directly
            effYPlane     = gpuData->yPlane;
            effUvPlane    = gpuData->uvPlane;
            effYLinesize  = gpuData->yLinesize;
@@ -435,6 +455,18 @@ namespace ANSCENTER {
        gpuResized.create(inputH, inputW, CV_8UC3);

        cudaStream_t rawStream = cv::cuda::StreamAccessor::getStream(stream);
+        {
+            // Debug trace: record both source planes and the target size right before the launch.
+            char launchMsg[256];
+            snprintf(launchMsg, sizeof(launchMsg),
+                "[NV12Helper] KERNEL LAUNCH: gpuY=%p(%dx%d) gpuUV=%p(%dx%d) -> %dx%d zeroCopy=%d\n",
+                (void*)gpuY.data, gpuY.cols, gpuY.rows, (void*)gpuUV.data, gpuUV.cols, gpuUV.rows,
+                inputW, inputH, (int)useZeroCopy);
+#ifdef _WIN32
+            OutputDebugStringA(launchMsg);
+#endif
+            fputs(launchMsg, stderr);
+        }
        launcher(gpuY, gpuUV, gpuResized, frameW, frameH, inputW, inputH, rawStream);

        stream.waitForCompletion();
@@ -945,7 +977,15 @@ namespace ANSCENTER {
                inputW, inputH, frameW, frameH, stream);
        }

-        cudaStreamSynchronize(stream);
+        // On Windows, poll instead of cudaStreamSynchronize to avoid holding the
+        // nvcuda64 SRW lock continuously (WDDM deadlock prevention).
+#ifdef _WIN32
+        while (cudaStreamQuery(stream) == cudaErrorNotReady) {
+            Sleep(0);  // yield the time slice between polls
+        }
+#else
+        cudaStreamSynchronize(stream);  // Sleep() is Win32-only; WDDM does not apply here
+#endif

        // (No registry lock to release — data kept alive by refcount)

--- a/modules/ANSODEngine/nv12_to_rgb.cu
+++ b/modules/ANSODEngine/nv12_to_rgb.cu
@@ -8,6 +8,9 @@

 #include <cuda_runtime.h>
 #include <cstdint>
+#ifdef _WIN32
+#include <windows.h>  // Sleep()
+#endif
 #include <cstdio>

 // ── Shared YUV→RGB computation ───────────────────────────────────────────
@@ -651,7 +654,24 @@ int ANSGpuNV12ToBGR(
                      width * 3, height,
                      cudaMemcpyDeviceToHost, t_bufs.stream);

-    cudaStreamSynchronize(t_bufs.stream);
+    // Poll instead of calling cudaStreamSynchronize so the nvcuda64 SRW lock is
+    // not held continuously (WDDM deadlock prevention).  The first 10 polls
+    // yield with Sleep(0) (fast path for sub-ms kernels); after that Sleep(1)
+    // gives cleanup operations (cuArrayDestroy, cuMemFree) a window to acquire
+    // the exclusive SRW lock.
+#ifdef _WIN32
+    {
+        cudaError_t qerr = cudaStreamQuery(t_bufs.stream);
+        for (int polls = 0; qerr == cudaErrorNotReady; ++polls) {
+            Sleep(polls < 10 ? 0 : 1);
+            qerr = cudaStreamQuery(t_bufs.stream);
+        }
+    }
+#else
+    // <windows.h> (and therefore Sleep) is only included under _WIN32, and WDDM
+    // is Windows-specific, so other platforms keep the blocking synchronize.
+    cudaStreamSynchronize(t_bufs.stream);
+#endif

    // Check for errors
    cudaError_t err = cudaGetLastError();
--- a/tests/ANSLPR-UnitTest/ANSLPR-UnitTest.cpp
+++ b/tests/ANSLPR-UnitTest/ANSLPR-UnitTest.cpp
@@ -23,6 +23,7 @@
 #include <thread>
 #include <mutex>
 #include <atomic>
+#include <random>
 #include <chrono>
 #include <deque>
 #include <set>
@@ -751,8 +752,11 @@ static void LogGpuInfo() {
 // Worker thread: reads RTSP frames and runs ALPR inference
 // RTSP client and ALPR engine are pre-created on the main thread to avoid
 // race conditions in CreateANSRTSPHandle / CreateANSALPRHandle.
+// Takes rtspClientPtr (pointer to array slot) + streamGuard mutex so the
+// CHAOS thread can safely destroy+recreate the stream without use-after-free.
 static void ALPRWorkerThread(int taskId,
-                              ANSCENTER::ANSRTSPClient* rtspClient,
+                              ANSCENTER::ANSRTSPClient** rtspClientPtr,
+                              std::mutex* streamGuard,
                              ANSCENTER::ANSALPR* alprHandle,
                              TaskState& state) {
    char tag[32];
@@ -780,6 +784,23 @@ static void ALPRWorkerThread(int taskId,
    bool hwDecodeLogged = false;

    while (g_running.load()) {
+        // Lock the stream guard to prevent CHAOS from destroying the client
+        // while we're mid-frame-grab or mid-inference.
+        std::unique_lock<std::mutex> streamLock(*streamGuard);
+
+        // Re-read the client pointer each iteration — CHAOS may have
+        // destroyed+recreated it, so our old pointer could be dangling.
+        ANSCENTER::ANSRTSPClient* rtspClient = *rtspClientPtr;
+        if (rtspClient == nullptr) {
+            streamLock.unlock();
+            emptyFrames++;
+            if (emptyFrames % 100 == 1) {
+                g_log.add(prefix + " Stream destroyed by CHAOS, waiting... (count=" + std::to_string(emptyFrames) + ")");
+            }
+            std::this_thread::sleep_for(std::chrono::milliseconds(50));
+            continue;
+        }
+
        // Read frame from RTSP via ANSCV
        auto grabStart = std::chrono::steady_clock::now();
        cv::Mat* framePtr = nullptr;
@@ -797,6 +818,7 @@ static void ALPRWorkerThread(int taskId,
                ReconnectRTSP(&rtspClient);
                emptyFrames = 0;
            }
+            streamLock.unlock();
            if (framePtr) delete framePtr;
            std::this_thread::sleep_for(std::chrono::milliseconds(10));
            continue;
@@ -829,6 +851,9 @@ static void ALPRWorkerThread(int taskId,
        // matches by cv::Mat* pointer, so `new cv::Mat(*framePtr)` would create
        // a different pointer the registry doesn't know, breaking NV12 zero-copy.
        ANSALPR_RunInferenceComplete_CPP(&alprHandle, &framePtr, cameraId.c_str(), 0, 0, lpnResult, jpegImage);
+
+        // Release stream lock — inference is done, CHAOS can now safely destroy.
+        streamLock.unlock();
        auto infEnd = std::chrono::steady_clock::now();
        double infMs = std::chrono::duration<double, std::milli>(infEnd - infStart).count();
        totalInfMs += infMs;
@@ -933,19 +958,20 @@ int ANSLPR_MultiGPU_StressTest() {

    printf("\n");
    printf("============================================================\n");
-    printf("  ANSLPR Multi-GPU Stress Test — 4 Parallel ALPR Tasks\n");
+    printf("  ANSLPR Multi-GPU Stress Test — 5 Parallel ALPR Tasks\n");
+    printf("  (4 cameras, 5 AI tasks — Task 4 shares Stream 2)\n");
    printf("  Press ESC to stop\n");
    printf("  Log file: %s\n", LOG_FILE_PATH);
    printf("============================================================\n\n");

    g_log.add("============================================================");
-    g_log.add("  ANSLPR Multi-GPU Stress Test — 4 Parallel ALPR Tasks");
+    g_log.add("  ANSLPR Multi-GPU Stress Test — 5 Parallel ALPR Tasks");
    g_log.add("============================================================");

    // --- Log GPU info for diagnostics ---
    LogGpuInfo();

-    // --- RTSP URLs (4 independent streams, one per task) ---
+    // --- RTSP URLs (4 independent camera streams) ---
    const std::string rtspUrl0 = "rtsp://admin:admin123@103.156.0.133:8010/cam/realmonitor?channel=1&subtype=0";
    const std::string rtspUrl1 = "rtsp://cafe2471.ddns.net:600/rtsp/streaming?channel=01&subtype=0";
    const std::string rtspUrl2 = "rtsp://nhathuocngoclinh.zapto.org:600/rtsp/streaming?channel=01&subtype=0";
@@ -956,18 +982,39 @@ int ANSLPR_MultiGPU_StressTest() {
    g_log.add("Stream 2: " + rtspUrl2);
    g_log.add("Stream 3: " + rtspUrl3);

+    // =========================================================================
+    //  Architecture: Camera Process + AI Task Process (mimics LabVIEW)
+    //  -----------------------------------------------------------------------
+    //  Camera Process:  4 independent RTSP streams acquire frames from cameras.
+    //  AI Task Process: 5 AI tasks subscribe to camera streams and run inference
+    //                   in parallel.  Multiple tasks can share one camera stream.
+    //  Task 4 subscribes to Stream 2 (nhathuocngoclinh) to demonstrate the
+    //  shared-camera subscription model used in LabVIEW.
+    // =========================================================================
+
+    const int NUM_STREAMS = 4;
+    const int NUM_TASKS   = 5;
+
    // --- Task states ---
-    TaskState taskStates[4];
+    TaskState taskStates[NUM_TASKS];

    // =========================================================================
-    //  Create 4 INDEPENDENT RTSP readers — one per task, each with its own
-    //  camera stream.  Each task gets a dedicated RTSP connection.
+    //  CAMERA PROCESS: Create 4 independent RTSP readers (one per camera).
+    //  These form the camera acquisition layer that AI tasks subscribe to.
    // =========================================================================
-    const int NUM_STREAMS = 4;
    ANSCENTER::ANSRTSPClient* rtspClients[NUM_STREAMS] = {};
    const std::string streamUrls[NUM_STREAMS] = { rtspUrl0, rtspUrl1, rtspUrl2, rtspUrl3 };
-    // Map: task index -> stream index (1:1 mapping)
-    const int taskStreamMap[4] = { 0, 1, 2, 3 };
+    // Map: task index -> stream index
+    // Tasks 0-3 map 1:1 to streams 0-3.
+    // Task 4 subscribes to Stream 2 (nhathuocngoclinh) — shared camera.
+    const int taskStreamMap[NUM_TASKS] = { 0, 1, 2, 3, 2 };
+
+    // Log task-to-stream subscription mapping
+    g_log.add("--- AI Task -> Camera Stream subscription ---");
+    for (int i = 0; i < NUM_TASKS; i++) {
+        g_log.add("  Task " + std::to_string(i) + " -> Stream " + std::to_string(taskStreamMap[i])
+                   + " (" + streamUrls[taskStreamMap[i]] + ")");
+    }

    for (int s = 0; s < NUM_STREAMS; s++) {
        printf("[Stream%d] Creating RTSP handle for %s...\n", s, streamUrls[s].c_str());
@@ -986,14 +1033,17 @@ int ANSLPR_MultiGPU_StressTest() {
    }

    // =========================================================================
-    //  Create 4 ALPR engines sequentially
+    //  AI TASK PROCESS: Create 5 ALPR engines sequentially.
+    //  Each AI task gets its own engine and subscribes to a camera stream.
+    //  Task 4 shares Stream 2 (nhathuocngoclinh) with Task 2 — demonstrating
+    //  the LabVIEW pattern where multiple AI tasks subscribe to one camera.
    // =========================================================================
-    ANSCENTER::ANSALPR* alprHandles[4] = {};
+    ANSCENTER::ANSALPR* alprHandles[NUM_TASKS] = {};
    std::string modelZipFile = "C:\\ProgramData\\ANSCENTER\\ANSVIS Server\\ANSALPR\\ANS_ALPR_v1.2.zip";
    int engineType = 1; // NVIDIA_GPU
    double detThresh = 0.5, ocrThresh = 0.5, colThresh = 0.5;

-    for (int i = 0; i < 4; i++) {
+    for (int i = 0; i < NUM_TASKS; i++) {
        char tag[32];
        snprintf(tag, sizeof(tag), "[Task%d]", i);

@@ -1109,7 +1159,7 @@ int ANSLPR_MultiGPU_StressTest() {

            // Count votes: how many tasks on this stream use each GPU
            std::map<int, int> gpuVotes;
-            for (int i = 0; i < 4; i++) {
+            for (int i = 0; i < NUM_TASKS; i++) {
                if (taskStreamMap[i] == s && alprHandles[i]) {
                    gpuVotes[taskStates[i].gpuDeviceId]++;
                }
@@ -1194,30 +1244,132 @@ int ANSLPR_MultiGPU_StressTest() {
    }

    // --- Enable deep pipeline benchmarking on all ALPR handles ---
-    for (int i = 0; i < 4; i++) {
+    for (int i = 0; i < NUM_TASKS; i++) {
        if (alprHandles[i]) {
            alprHandles[i]->ActivateDebugger(true);
        }
    }
    g_log.add("Debug benchmarking ENABLED on all ALPR handles");

-    // --- Launch worker threads — tasks sharing a stream get the same RTSP client ---
-    g_log.add("Launching worker threads...");
-    std::thread workers[4];
-    for (int i = 0; i < 4; i++) {
+    // --- Per-stream mutex: prevents CHAOS from destroying a stream while a
+    //     worker is mid-frame-grab or mid-inference (use-after-free fix). ---
+    std::mutex streamGuards[NUM_STREAMS];
+
+    // --- Launch worker threads ---
+    // Each AI task subscribes to its camera stream via taskStreamMap.
+    // Tasks sharing a stream (e.g. Task 2 & Task 4 on Stream 2) both get
+    // the same RTSP client pointer and share the stream's mutex guard.
+    g_log.add("Launching " + std::to_string(NUM_TASKS) + " worker threads...");
+    std::thread workers[NUM_TASKS];
+    for (int i = 0; i < NUM_TASKS; i++) {
        int streamIdx = taskStreamMap[i];
        if (rtspClients[streamIdx] && alprHandles[i]) {
            workers[i] = std::thread(ALPRWorkerThread, i,
-                                     rtspClients[streamIdx], alprHandles[i],
+                                     &rtspClients[streamIdx],
+                                     &streamGuards[streamIdx],
+                                     alprHandles[i],
                                     std::ref(taskStates[i]));
        }
    }

+    // =========================================================================
+    //  Camera Chaos Thread — simulates camera errors / reconnects
+    //  Mimics LabVIEW behavior: cameras randomly go into Error/Recovering
+    //  state, triggering Stop/Reconnect/Destroy+Recreate cycles that cause
+    //  CUDA cleanup (cuArrayDestroy, cuMemFree) while inference is running.
+    //  This is the exact scenario that triggers the nvcuda64 SRW lock deadlock.
+    // =========================================================================
+    std::atomic<bool> chaosEnabled{true};
+    std::thread chaosThread([&]() {
+        // Runs until g_running or chaosEnabled is cleared; all test state is captured by reference.
+        std::mt19937 rng(std::random_device{}());
+        // Wait 10 seconds (100 x 100 ms) for system to stabilize before starting chaos
+        for (int i = 0; i < 100 && g_running.load() && chaosEnabled.load(); i++) {
+            std::this_thread::sleep_for(std::chrono::milliseconds(100));
+        }
+
+        g_log.add("[CHAOS] Camera chaos thread started — every 10s, stop/destroy/recreate one camera (round-robin)");
+        printf("[CHAOS] Camera chaos thread started — 10s interval, round-robin across %d streams\n", NUM_STREAMS);
+
+        int chaosCount = 0;
+        int nextStream = 0;  // Round-robin: cycle through streams 0,1,2,3,0,1,...
+        while (g_running.load() && chaosEnabled.load()) {
+            // Fixed 10-second interval between chaos events, polled in 100 ms steps so either stop flag is noticed quickly
+            for (int s = 0; s < 100 && g_running.load() && chaosEnabled.load(); s++) {
+                std::this_thread::sleep_for(std::chrono::milliseconds(100));
+            }
+            if (!g_running.load() || !chaosEnabled.load()) break;
+
+            int streamIdx = nextStream;
+            nextStream = (nextStream + 1) % NUM_STREAMS;
+            chaosCount++;
+
+            char buf[512];
+            auto chaosStart = std::chrono::steady_clock::now();
+
+            // Lock stream guard: wait for any in-flight inference to finish
+            // before touching the RTSP client.  This prevents use-after-free
+            // when CHAOS destroys a stream while a worker is mid-inference.
+            std::unique_lock<std::mutex> chaosLock(streamGuards[streamIdx]);
+
+            // Always use full DESTROY + RECREATE cycle.
+            // Reconnect() reuses internal player state which can leave stale
+            // CUDA resources and cause freezes.  A clean destroy + recreate
+            // guarantees a fresh decoder/player with no leftover state.
+            {
+                bool wasAlive = (rtspClients[streamIdx] != nullptr);
+
+                snprintf(buf, sizeof(buf), "[CHAOS #%d] Stream%d: DESTROY + RECREATE (%s)",
+                    chaosCount, streamIdx,
+                    wasAlive ? "camera was running" : "camera was already offline");
+                g_log.add(buf);
+                printf("%s\n", buf);
+
+                // Stop and release old handle if it exists
+                if (rtspClients[streamIdx]) {
+                    StopRTSP(&rtspClients[streamIdx]);
+                    ReleaseANSRTSPHandle(&rtspClients[streamIdx]);
+                    rtspClients[streamIdx] = nullptr;
+                }
+
+                // Release lock during offline sleep — worker sees nullptr and skips
+                int offlineMs = 500 + static_cast<int>(rng() % 2500);  // 500-2999 ms offline
+                chaosLock.unlock();
+                std::this_thread::sleep_for(std::chrono::milliseconds(offlineMs));
+                chaosLock.lock();
+
+                // Recreate the RTSP handle (under lock again)
+                int result = CreateANSRTSPHandle(&rtspClients[streamIdx], "", "", "",
+                    streamUrls[streamIdx].c_str());
+                if (result == 1 && rtspClients[streamIdx]) {
+                    SetRTSPImageQuality(&rtspClients[streamIdx], 0);
+                    SetRTSPHWDecoding(&rtspClients[streamIdx], 7);
+                    StartRTSP(&rtspClients[streamIdx]);
+
+                    auto chaosEnd = std::chrono::steady_clock::now();
+                    double chaosMs = std::chrono::duration<double, std::milli>(chaosEnd - chaosStart).count();
+                    snprintf(buf, sizeof(buf), "[CHAOS #%d] Stream%d: RECREATED in %.0f ms (offline %d ms)",
+                        chaosCount, streamIdx, chaosMs, offlineMs);
+                } else {
+                    snprintf(buf, sizeof(buf), "[CHAOS #%d] Stream%d: RECREATE FAILED (result=%d)",
+                        chaosCount, streamIdx, result);
+                }
+                g_log.add(buf);
+                printf("%s\n", buf);
+            }
+        }
+
+        g_log.add("[CHAOS] Camera chaos thread stopped (total events: " + std::to_string(chaosCount) + ")");
+        printf("[CHAOS] Camera chaos thread stopped (total events: %d)\n", chaosCount);
+    });
+
    // --- Display loop (main thread) ---
-    const int cellW = 640, cellH = 480;
-    const int logPanelH = 200;
+    // 3x2 grid layout: 5 tasks displayed in 3 columns x 2 rows
+    const int cellW = 480, cellH = 360;  // Smaller cells for 3-column layout
+    const int logPanelH = 220;
+    const int gridCols = 3, gridRows = 2;
    cv::namedWindow("ANSLPR Multi-GPU Stress Test", cv::WINDOW_NORMAL);
-    cv::resizeWindow("ANSLPR Multi-GPU Stress Test", cellW * 2, cellH * 2 + logPanelH);
+    cv::resizeWindow("ANSLPR Multi-GPU Stress Test", cellW * gridCols, cellH * gridRows + logPanelH);

    auto testStart = std::chrono::steady_clock::now();
    auto lastGpuSnapshot = std::chrono::steady_clock::now();
@@ -1244,12 +1396,12 @@ int ANSLPR_MultiGPU_StressTest() {
            }
            // Per-task stats
            double totalFpsSnap = 0;
-            for (int t = 0; t < 4; t++) {
+            for (int t = 0; t < NUM_TASKS; t++) {
                std::lock_guard<std::mutex> lk(taskStates[t].mtx);
                char buf[256];
                snprintf(buf, sizeof(buf),
-                    "  T%d: GPU[%d] VRAM=%zuMiB FPS=%.1f GrabMs=%.0f InfMs=%.0f Frames=%d Det=%d",
-                    t, taskStates[t].gpuDeviceId,
+                    "  T%d(S%d): GPU[%d] VRAM=%zuMiB FPS=%.1f GrabMs=%.0f InfMs=%.0f Frames=%d Det=%d",
+                    t, taskStreamMap[t], taskStates[t].gpuDeviceId,
                    taskStates[t].vramUsedBytes / (1024 * 1024),
                    taskStates[t].fps, taskStates[t].lastGrabMs, taskStates[t].inferenceMs,
                    taskStates[t].frameCount, taskStates[t].detectionCount);
@@ -1261,7 +1413,7 @@ int ANSLPR_MultiGPU_StressTest() {
            g_log.add(buf);
            // Multi-GPU check
            std::set<int> gpusUsed;
-            for (int t = 0; t < 4; t++) {
+            for (int t = 0; t < NUM_TASKS; t++) {
                if (taskStates[t].gpuDeviceId >= 0) gpusUsed.insert(taskStates[t].gpuDeviceId);
            }
            if (gpusUsed.size() > 1) {
@@ -1271,12 +1423,12 @@ int ANSLPR_MultiGPU_StressTest() {
            }
            g_log.add("---- END SNAPSHOT ----");
        }
-        // Build 2x2 grid + log panel
-        cv::Mat canvas(cellH * 2 + logPanelH, cellW * 2, CV_8UC3, cv::Scalar(30, 30, 30));
+        // Build 3x2 grid + log panel (5 tasks: 3 cols x 2 rows, cell [1][2] empty)
+        cv::Mat canvas(cellH * gridRows + logPanelH, cellW * gridCols, CV_8UC3, cv::Scalar(30, 30, 30));

-        // Place each task's frame in its quadrant
-        for (int i = 0; i < 4; i++) {
-            int row = i / 2, col = i % 2;
+        // Place each task's frame in its cell
+        for (int i = 0; i < NUM_TASKS; i++) {
+            int row = i / gridCols, col = i % gridCols;
            cv::Rect roi(col * cellW, row * cellH, cellW, cellH);

            cv::Mat cell;
@@ -1313,8 +1465,8 @@ int ANSLPR_MultiGPU_StressTest() {
            // Draw status bar at bottom of each cell (2 lines)
            cv::rectangle(cell, cv::Rect(0, cellH - 50, cellW, 50), cv::Scalar(0, 0, 0), cv::FILLED);
            char bar1[256], bar2[256];
-            snprintf(bar1, sizeof(bar1), "T%d | %.1f FPS | %.0fms | Frames:%d | Det:%d | %s",
-                     i, fps, infMs, fCount, dCount,
+            snprintf(bar1, sizeof(bar1), "T%d(S%d) | %.1f FPS | %.0fms | F:%d | D:%d | %s",
+                     i, taskStreamMap[i], fps, infMs, fCount, dCount,
                     lastPlate.empty() ? "-" : lastPlate.c_str());
            if (gpuId >= 0) {
                snprintf(bar2, sizeof(bar2), "GPU[%d] | VRAM: %zu MiB", gpuId, vramMiB);
@@ -1323,45 +1475,53 @@ int ANSLPR_MultiGPU_StressTest() {
            }
            cv::Scalar barColor = engineLoaded ? cv::Scalar(0, 255, 0) : cv::Scalar(0, 100, 255);
            cv::putText(cell, bar1, cv::Point(5, cellH - 28),
-                        cv::FONT_HERSHEY_SIMPLEX, 0.45, barColor, 1);
+                        cv::FONT_HERSHEY_SIMPLEX, 0.4, barColor, 1);
            cv::putText(cell, bar2, cv::Point(5, cellH - 8),
-                        cv::FONT_HERSHEY_SIMPLEX, 0.45, cv::Scalar(0, 200, 255), 1);
+                        cv::FONT_HERSHEY_SIMPLEX, 0.4, cv::Scalar(0, 200, 255), 1);

            cell.copyTo(canvas(roi));
-
-            // Draw grid lines
-            cv::line(canvas, cv::Point(cellW, 0), cv::Point(cellW, cellH * 2),
-                     cv::Scalar(100, 100, 100), 1);
-            cv::line(canvas, cv::Point(0, cellH), cv::Point(cellW * 2, cellH),
-                     cv::Scalar(100, 100, 100), 1);
        }

+        // Draw grid lines: vertical separators at column edges, horizontal ones at row edges
+        const cv::Scalar gridLineColor(100, 100, 100);
+        const int gridSpanW = cellW * gridCols, gridSpanH = cellH * gridRows;
+        for (int x = cellW; x < gridSpanW; x += cellW)
+            cv::line(canvas, cv::Point(x, 0), cv::Point(x, gridSpanH), gridLineColor, 1);
+        for (int y = cellH; y < gridSpanH; y += cellH)
+            cv::line(canvas, cv::Point(0, y), cv::Point(gridSpanW, y), gridLineColor, 1);
+
        // --- Log panel at bottom ---
-        cv::Rect logRoi(0, cellH * 2, cellW * 2, logPanelH);
+        cv::Rect logRoi(0, cellH * gridRows, cellW * gridCols, logPanelH);
        cv::Mat logPanel = canvas(logRoi);
        logPanel.setTo(cv::Scalar(20, 20, 20));

        // Elapsed time header
        auto elapsed = std::chrono::duration<double>(std::chrono::steady_clock::now() - testStart).count();
-        char header[128];
+        char header[256];
        snprintf(header, sizeof(header),
-                 "Elapsed: %.0fs | Press ESC to stop | Resize window freely", elapsed);
+                 "Elapsed: %.0fs | %d cameras, %d AI tasks | Press ESC to stop",
+                 elapsed, NUM_STREAMS, NUM_TASKS);
        cv::putText(logPanel, header, cv::Point(10, 18),
                    cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(200, 200, 0), 1);

        // Aggregate stats + per-task GPU summary
        double totalFps = 0;
-        for (int i = 0; i < 4; i++) {
+        for (int i = 0; i < NUM_TASKS; i++) {
            std::lock_guard<std::mutex> lk(taskStates[i].mtx);
            totalFps += taskStates[i].fps;
        }
-        char aggLine[256];
-        snprintf(aggLine, sizeof(aggLine), "Total throughput: %.1f FPS | T0:GPU%d T1:GPU%d T2:GPU%d T3:GPU%d",
-                 totalFps,
-                 taskStates[0].gpuDeviceId, taskStates[1].gpuDeviceId,
-                 taskStates[2].gpuDeviceId, taskStates[3].gpuDeviceId);
+        // Build dynamic task-GPU summary string; each task's state is read under its own
+        // mutex, matching the other per-task loops in this display pass.
+        std::string taskGpuStr;
+        for (int i = 0; i < NUM_TASKS; i++) {
+            std::lock_guard<std::mutex> lk(taskStates[i].mtx);
+            taskGpuStr += std::string(i > 0 ? " " : "") + "T" + std::to_string(i) + "(S" + std::to_string(taskStreamMap[i]) + "):GPU" + std::to_string(taskStates[i].gpuDeviceId);
+        }
+        char aggLine[512];
+        snprintf(aggLine, sizeof(aggLine), "Total: %.1f FPS | %s",
+                 totalFps, taskGpuStr.c_str());
        cv::putText(logPanel, aggLine, cv::Point(10, 38),
-                    cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 255, 255), 1);
+                    cv::FONT_HERSHEY_SIMPLEX, 0.45, cv::Scalar(0, 255, 255), 1);

        // Real-time GPU VRAM monitor (query every frame — cheap call)
        auto gpuSnaps = QueryGpuVram();
@@ -1370,7 +1530,7 @@ int ANSLPR_MultiGPU_StressTest() {
            // Count tasks on this GPU and their total VRAM
            int tasksOnGpu = 0;
            size_t taskVramMiB = 0;
-            for (int i = 0; i < 4; i++) {
+            for (int i = 0; i < NUM_TASKS; i++) {
                std::lock_guard<std::mutex> lk(taskStates[i].mtx);
                if (taskStates[i].gpuDeviceId == gs.deviceId) {
                    tasksOnGpu++;
@@ -1387,13 +1547,13 @@ int ANSLPR_MultiGPU_StressTest() {
            gpuLineY += 18;
        }

-        // Per-task resource line
-        for (int i = 0; i < 4; i++) {
+        // Per-task resource line (shows which stream each task subscribes to)
+        for (int i = 0; i < NUM_TASKS; i++) {
            std::lock_guard<std::mutex> lk(taskStates[i].mtx);
            char tLine[256];
            snprintf(tLine, sizeof(tLine),
-                "T%d: GPU[%d] VRAM=%zuMiB FPS=%.1f Inf=%.0fms Frames=%d Det=%d",
-                i, taskStates[i].gpuDeviceId,
+                "T%d(S%d): GPU[%d] VRAM=%zuMiB FPS=%.1f Inf=%.0fms Frames=%d Det=%d",
+                i, taskStreamMap[i], taskStates[i].gpuDeviceId,
                taskStates[i].vramUsedBytes / (1024 * 1024),
                taskStates[i].fps, taskStates[i].inferenceMs,
                taskStates[i].frameCount, taskStates[i].detectionCount);
@@ -1421,9 +1581,13 @@ int ANSLPR_MultiGPU_StressTest() {
        }
    }

+    // --- Stop chaos thread ---
+    chaosEnabled.store(false);
+    if (chaosThread.joinable()) chaosThread.join();
+
    // --- Wait for all workers ---
-    printf("Waiting for worker threads to finish...\n");
-    for (int i = 0; i < 4; i++) {
+    printf("Waiting for %d worker threads to finish...\n", NUM_TASKS);
+    for (int i = 0; i < NUM_TASKS; i++) {
        if (workers[i].joinable()) workers[i].join();
    }

@@ -1433,19 +1597,21 @@ int ANSLPR_MultiGPU_StressTest() {

    g_log.add("================================================================");
    g_log.add("  FINAL PERFORMANCE SUMMARY");
+    g_log.add("  " + std::to_string(NUM_STREAMS) + " cameras, " + std::to_string(NUM_TASKS) + " AI tasks");
    g_log.add("  Total runtime: " + std::to_string((int)totalElapsed) + " seconds");
    g_log.add("================================================================");

    printf("\n============================================================\n");
    printf("  FINAL PERFORMANCE SUMMARY (runtime: %.0fs)\n", totalElapsed);
+    printf("  %d cameras, %d AI tasks\n", NUM_STREAMS, NUM_TASKS);
    printf("============================================================\n");

    double totalFpsFinal = 0;
-    for (int i = 0; i < 4; i++) {
+    for (int i = 0; i < NUM_TASKS; i++) {
        char buf[512];
        snprintf(buf, sizeof(buf),
-            "  Task %d: GPU[%d] | VRAM=%zuMiB | %d frames, %d detections, FPS=%.1f, InfMs=%.0f",
-            i, taskStates[i].gpuDeviceId,
+            "  Task %d (Stream %d): GPU[%d] | VRAM=%zuMiB | %d frames, %d detections, FPS=%.1f, InfMs=%.0f",
+            i, taskStreamMap[i], taskStates[i].gpuDeviceId,
            taskStates[i].vramUsedBytes / (1024 * 1024),
            taskStates[i].frameCount, taskStates[i].detectionCount,
            taskStates[i].fps, taskStates[i].inferenceMs);
@@ -1466,12 +1632,13 @@ int ANSLPR_MultiGPU_StressTest() {

    // Multi-GPU verdict
    std::set<int> finalGpusUsed;
-    for (int i = 0; i < 4; i++) {
+    for (int i = 0; i < NUM_TASKS; i++) {
        if (taskStates[i].gpuDeviceId >= 0) finalGpusUsed.insert(taskStates[i].gpuDeviceId);
    }
    {
        char buf[256];
-        snprintf(buf, sizeof(buf), "  Total throughput: %.1f FPS across 4 tasks", totalFpsFinal);
+        snprintf(buf, sizeof(buf), "  Total throughput: %.1f FPS across %d tasks (%d cameras)",
+                 totalFpsFinal, NUM_TASKS, NUM_STREAMS);
        printf("%s\n", buf);
        g_log.add(buf);
    }
@@ -1491,13 +1658,16 @@ int ANSLPR_MultiGPU_StressTest() {
        g_log.add("    3. No CUDA_VISIBLE_DEVICES env var restricting GPU access");
    }

+    // Log shared-camera subscription info
+    g_log.add("  Camera subscription: Task 2 and Task 4 both subscribe to Stream 2 (nhathuocngoclinh)");
+
    printf("============================================================\n");
    g_log.add("================================================================");
    g_log.add("  Log saved to: " + std::string(LOG_FILE_PATH));
    g_log.add("================================================================");

    // --- Release all handles (sequentially on main thread) ---
-    for (int i = 0; i < 4; i++) {
+    for (int i = 0; i < NUM_TASKS; i++) {
        if (alprHandles[i]) {
            ReleaseANSALPRHandle(&alprHandles[i]);
        }
@@ -2770,9 +2940,9 @@ int main()
    //for (int i = 0; i < 100; i++) {
    //    ANSLPR_CPU_Inferences_FileTest();
    //}
-    //ANSLPR_MultiGPU_StressTest();
+    ANSLPR_MultiGPU_StressTest();
    //ANSLPR_MultiGPU_StressTest_SimulatedCam();
-    ANSLPR_MultiGPU_StressTest_FilePlayer();
+   // ANSLPR_MultiGPU_StressTest_FilePlayer();
    return 0;

 }