Fix NV12 crash when recreating the camera object
(new structure) does not work
@@ -14,6 +14,7 @@
 // gpu_frame_lookup() + the GpuFrameData plane pointers.
 
 #include "ANSGpuFrameRegistry.h"
+#include "GpuNV12SlotPool.h"
 
 extern "C" {
 #include "libavutil/frame.h"
@@ -29,9 +30,9 @@ extern "C" {
 #endif
 
 // Debug logging macro for GPU frame operations.
-// Output goes to stderr (console) AND OutputDebugString (DebugView / VS debugger).
-// Use Sysinternals DebugView (dbgview64.exe) to capture these after a crash.
+// Define ANSCORE_GPU_DEBUG=1 to enable verbose per-frame GPU logging.
 #ifndef GPU_FRAME_DBG
+#if defined(ANSCORE_GPU_DEBUG) && ANSCORE_GPU_DEBUG
 #ifdef _WIN32
 #define GPU_FRAME_DBG(fmt, ...) do { \
     char _gpu_dbg_buf[512]; \
@@ -43,6 +44,9 @@ extern "C" {
 #define GPU_FRAME_DBG(fmt, ...) \
     fprintf(stderr, "[GpuFrameOps] " fmt "\n", ##__VA_ARGS__)
 #endif
+#else
+#define GPU_FRAME_DBG(fmt, ...) ((void)0)
+#endif
 #endif
 
 namespace anscv_gpu_ops {
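A minimal usage sketch of the macro above (the call site and variables are hypothetical; the build flag is the one this diff introduces):

    // With ANSCORE_GPU_DEBUG undefined (the default), the entire statement
    // compiles to ((void)0); the format string and arguments cost nothing.
    GPU_FRAME_DBG("attach: mat=%p %dx%d gpu=%d", (void*)mat, w, h, gpuIdx);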
@@ -94,31 +98,29 @@ inline bool snapshotNV12Planes(const AVFrame* nv12,
     return true;
 }
 
-// Drain pending GPU device pointers and actually cudaFree them.
-// Must be called from a thread with CUDA context available.
-inline void drainAndFreeGpuPending() {
-    auto gpuPending = ANSGpuFrameRegistry::instance().drain_gpu_pending();
+// Drain pending GPU device pointers and cudaFree them.
+// Uses time-based safety: only frees entries queued >100ms ago, guaranteeing
+// all CUDA kernels reading from them have completed (kernels take <10ms).
+// NO cudaDeviceSynchronize — zero blocking of GPU pipeline.
+//
+// If forceAll=true, drains ALL entries with cudaDeviceSynchronize first
+// (used only by Destroy/Reconnect for final cleanup).
+inline void drainAndFreeGpuPending(bool forceAll = false) {
+    static constexpr int SAFE_AGE_MS = 100; // 100ms >> 10ms kernel duration
+    auto gpuPending = ANSGpuFrameRegistry::instance().drain_gpu_pending(
+        forceAll ? 0 : SAFE_AGE_MS);
     if (gpuPending.empty()) return;
-    GPU_FRAME_DBG("drainGpuPending: freeing %zu GPU ptrs", gpuPending.size());
+    GPU_FRAME_DBG("drainGpuPending: freeing %zu GPU ptrs (force=%d)", gpuPending.size(), (int)forceAll);
     int prevDev = -1;
     cudaGetDevice(&prevDev);
 
-    // Group by device to minimize cudaSetDevice calls and synchronize once per device.
-    // cudaDeviceSynchronize() is CRITICAL: NV12 kernels run on cv::cuda::Stream
-    // (not the default stream). cudaFree on stream 0 doesn't wait for other
-    // streams, so without this sync, cudaFree can free a buffer while a kernel
-    // on another stream is still reading from it → cudaErrorIllegalAddress (700)
-    // which permanently corrupts the CUDA context.
-    int lastSyncDev = -1;
+    if (forceAll) {
+        // Final cleanup — sync all devices first
+        cudaDeviceSynchronize();
+    }
     for (auto& entry : gpuPending) {
         if (entry.ptr) {
             if (entry.deviceIdx >= 0)
                 cudaSetDevice(entry.deviceIdx);
-            if (entry.deviceIdx != lastSyncDev) {
-                cudaDeviceSynchronize();
-                lastSyncDev = entry.deviceIdx;
-            }
             GPU_FRAME_DBG("drainGpuPending: cudaFree(%p) dev=%d", entry.ptr, entry.deviceIdx);
             cudaError_t err = cudaFree(entry.ptr);
             if (err != cudaSuccess) {
                 GPU_FRAME_DBG("drainGpuPending: cudaFree FAILED err=%d (%s)",
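The drain_gpu_pending(minAgeMs) overload called above is not shown in this commit; a self-contained sketch of the age-gated free list it implies (type and member names are assumptions):

    #include <chrono>
    #include <mutex>
    #include <vector>

    struct PendingGpuPtr {
        void* ptr = nullptr;
        int deviceIdx = -1;
        std::chrono::steady_clock::time_point queuedAt;
    };

    class PendingGpuList {
    public:
        void push(void* p, int dev) {
            std::lock_guard<std::mutex> g(m_);
            items_.push_back({p, dev, std::chrono::steady_clock::now()});
        }
        // Returns entries queued at least minAgeMs ago; minAgeMs == 0 drains all.
        std::vector<PendingGpuPtr> drain(int minAgeMs) {
            std::lock_guard<std::mutex> g(m_);
            const auto cutoff = std::chrono::steady_clock::now()
                              - std::chrono::milliseconds(minAgeMs);
            std::vector<PendingGpuPtr> out, keep;
            for (const auto& e : items_)
                (e.queuedAt <= cutoff ? out : keep).push_back(e);
            items_.swap(keep);
            return out;
        }
    private:
        std::mutex m_;
        std::vector<PendingGpuPtr> items_;
    };

With a 100ms threshold and kernels that finish in under 10ms, any pointer old enough to be drained can no longer be referenced by a running kernel, which is what lets the hot path skip cudaDeviceSynchronize entirely.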
@@ -179,22 +181,23 @@ inline void gpu_frame_attach(cv::Mat* mat, AVFrame* nv12, int gpuIdx, int64_t pt
 // Attach CUDA HW frame — copies NV12 from NVDEC surfaces to owned GPU memory.
 // TAKES OWNERSHIP of cudaFrame AND cpuNV12 — caller must NOT av_frame_free after.
 //
-// D2D copy path: cudaMemcpy2D from NVDEC surfaces to cudaMalloc'd buffers on the
-// same GPU. This decouples the NV12 data lifetime from the NVDEC decoder, so
-// player->close() can safely destroy the decoder at any time without invalidating
-// pointers that inference engines may be reading. The NVDEC surface is freed
-// immediately (av_frame_free), returning it to the decoder's surface pool.
+// D2D copy: SYNCHRONOUS cudaMemcpy2D from NVDEC surfaces into a GpuNV12Slot
+// buffer from the global pool. Data is valid immediately after the call returns.
+// AVFrame is freed immediately (NVDEC surface returned to decoder pool).
 //
-// The owned GPU pointers are stored as both yPlane/uvPlane (for zero-copy reads)
-// and gpuCacheY/gpuCacheUV (for lifecycle management / cudaFree on cleanup).
+// The slot is protected by a 200ms cooldown after the GpuFrameData's refcount
+// drops to 0, guaranteeing that all in-flight GPU kernels (which complete in
+// <10ms) have finished reading from the buffer before it can be reused.
 //
-// VRAM budget: if the global GPU cache budget is exceeded, falls back to CPU-only
-// NV12 snapshot (no zero-copy, but safe).
+// slot: pre-acquired from GpuNV12SlotPool::instance().acquire().
+//     If non-null, D2D copy goes into slot buffers (no per-frame alloc).
+//     If nullptr, falls back to per-frame cudaMallocPitch (legacy path).
 //
 // Fallback: cpuYPlane/cpuUvPlane hold CPU-side NV12 snapshot for cross-GPU
 // inference (when decode GPU != inference GPU).
 inline void gpu_frame_attach_cuda(cv::Mat* mat, AVFrame* cudaFrame, int gpuIdx, int64_t pts,
-                                  AVFrame* cpuNV12 = nullptr) {
+                                  AVFrame* cpuNV12 = nullptr,
+                                  GpuNV12Slot* slot = nullptr) {
     if (!mat || !cudaFrame) {
         GPU_FRAME_DBG("attach_cuda: SKIP mat=%p cudaFrame=%p", (void*)mat, (void*)cudaFrame);
         return;
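The calling contract this signature implies, mirroring the GetRTSPCVImage call site later in this commit (variable names are taken from that call site):

    // acquire() may return nullptr (pool full); attach then falls back to the
    // legacy per-frame allocation or the CPU-only snapshot.
    GpuNV12Slot* slot = GpuNV12SlotPool::instance().acquire(
        gpuIdx, cudaHW->width, cudaHW->height);
    // attach takes ownership of BOTH AVFrames; the caller must not
    // av_frame_free them afterwards.
    gpu_frame_attach_cuda(*image, cudaHW, gpuIdx, timeStamp, cpuNV12, slot);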
@@ -202,9 +205,9 @@ inline void gpu_frame_attach_cuda(cv::Mat* mat, AVFrame* cudaFrame, int gpuIdx,
 
     const int w = cudaFrame->width;
     const int h = cudaFrame->height;
-    GPU_FRAME_DBG("attach_cuda: START mat=%p %dx%d gpu=%d nvdecY=%p nvdecUV=%p cpuNV12=%p",
+    GPU_FRAME_DBG("attach_cuda: START mat=%p %dx%d gpu=%d nvdecY=%p nvdecUV=%p slot=%p",
                   (void*)mat, w, h, gpuIdx,
-                  (void*)cudaFrame->data[0], (void*)cudaFrame->data[1], (void*)cpuNV12);
+                  (void*)cudaFrame->data[0], (void*)cudaFrame->data[1], (void*)slot);
 
     GpuFrameData data{};
     data.gpuIndex = gpuIdx;
@@ -213,86 +216,145 @@ inline void gpu_frame_attach_cuda(cv::Mat* mat, AVFrame* cudaFrame, int gpuIdx,
     data.height = h;
     data.pixelFormat = 23; // AV_PIX_FMT_NV12
 
-    // Snapshot CPU NV12 for cross-GPU fallback (must do before freeing cpuNV12)
-    if (cpuNV12) {
-        anscv_gpu_ops::detail::snapshotNV12Planes(
-            cpuNV12,
-            data.cpuYPlane, data.cpuYLinesize,
-            data.cpuUvPlane, data.cpuUvLinesize,
-            data.width, data.height);
-    }
-
-    // --- D2D copy: NVDEC surface → owned GPU memory ---
-    // Estimate VRAM needed for the owned NV12 copy
-    const size_t yBytes = static_cast<size_t>(w) * h;
-    const size_t uvBytes = static_cast<size_t>(w) * (h / 2);
-    const size_t totalBytes = yBytes + uvBytes;
+    // NOTE: CPU NV12 snapshot is DEFERRED — only taken if pool D2D fails.
+    // For 4K frames, the snapshot is ~12MB malloc+memcpy+free per frame.
+    // Skipping it when the pool path succeeds (the common case) eliminates
+    // ~276MB/s of CPU heap allocation churn that causes process-level stalls.
+
+    // --- D2D copy: NVDEC surface → GPU buffer ---
     bool d2dOk = false;
-    if (ANSGpuFrameRegistry::instance().canAllocateGpuCache(totalBytes)) {
-
+    if (slot && slot->bufY && slot->bufUV && slot->pitchY > 0 && slot->pitchUV > 0) {
+        // --- Global pool path: D2D copy on per-slot non-blocking stream ---
+        // CRITICAL: Using the NULL stream (cudaMemcpy2D without stream) causes
+        // 1-2 second stalls on WDDM because it implicitly synchronizes with
+        // ALL other streams before executing. By using cudaMemcpy2DAsync on
+        // the slot's own non-blocking stream + cudaStreamSynchronize, we:
+        //   1. Submit the copy immediately (no wait for inference kernels)
+        //   2. Wait ONLY for this copy to finish (~0.3ms 1080p, ~1.2ms 4K)
+        //   3. Data is valid after sync — av_frame_free is safe
         int prevDev = -1;
         cudaGetDevice(&prevDev);
-        if (gpuIdx >= 0)
-            cudaSetDevice(gpuIdx);
+        if (gpuIdx >= 0) cudaSetDevice(gpuIdx);
 
-        void* ownedY = nullptr;
-        void* ownedUV = nullptr;
-        size_t yPitch = 0;
-        size_t uvPitch = 0;
-
-        cudaError_t e1 = cudaMallocPitch(&ownedY, &yPitch, w, h);
-        cudaError_t e2 = cudaMallocPitch(&ownedUV, &uvPitch, w, h / 2);
-
-        if (e1 == cudaSuccess && e2 == cudaSuccess) {
-            cudaError_t e3 = cudaMemcpy2D(ownedY, yPitch,
-                                          cudaFrame->data[0], cudaFrame->linesize[0],
-                                          w, h, cudaMemcpyDeviceToDevice);
-            cudaError_t e4 = cudaMemcpy2D(ownedUV, uvPitch,
-                                          cudaFrame->data[1], cudaFrame->linesize[1],
-                                          w, h / 2, cudaMemcpyDeviceToDevice);
+        cudaStream_t copyStream = static_cast<cudaStream_t>(slot->copyStream);
+        cudaError_t e3, e4;
+
+        if (copyStream) {
+            e3 = cudaMemcpy2DAsync(slot->bufY, slot->pitchY,
+                                   cudaFrame->data[0], cudaFrame->linesize[0],
+                                   w, h, cudaMemcpyDeviceToDevice, copyStream);
+            e4 = cudaMemcpy2DAsync(slot->bufUV, slot->pitchUV,
+                                   cudaFrame->data[1], cudaFrame->linesize[1],
+                                   w, h / 2, cudaMemcpyDeviceToDevice, copyStream);
-            if (e3 == cudaSuccess && e4 == cudaSuccess) {
-                // Store owned GPU pointers as primary NV12 source
-                data.isCudaDevicePtr = true;
-                data.yPlane = static_cast<uint8_t*>(ownedY);
-                data.uvPlane = static_cast<uint8_t*>(ownedUV);
-                data.yLinesize = static_cast<int>(yPitch);
-                data.uvLinesize = static_cast<int>(uvPitch);
-
-                // Track in gpuCache for lifecycle management (cudaFree on cleanup)
-                data.gpuCacheY = ownedY;
-                data.gpuCacheUV = ownedUV;
-                data.gpuCacheYPitch = yPitch;
-                data.gpuCacheUVPitch = uvPitch;
-                data.gpuCacheDeviceIdx = gpuIdx;
-                data.gpuCacheValid = true;
-                data.gpuCacheBytes = yPitch * h + uvPitch * (h / 2);
-
-                ANSGpuFrameRegistry::instance().onGpuCacheCreated(data.gpuCacheBytes);
-                d2dOk = true;
-                GPU_FRAME_DBG("attach_cuda: D2D OK ownedY=%p ownedUV=%p yPitch=%zu uvPitch=%zu bytes=%zu",
-                              ownedY, ownedUV, yPitch, uvPitch, data.gpuCacheBytes);
-            } else {
-                // D2D copy failed — free allocated memory and fall back
-                GPU_FRAME_DBG("attach_cuda: D2D COPY FAILED e3=%d e4=%d — fallback CPU",
-                              (int)e3, (int)e4);
-                cudaFree(ownedY);
-                cudaFree(ownedUV);
+            // Wait ONLY for this stream's 2 copies (~0.3-1.2ms).
+            // Does NOT wait for inference kernels on other streams.
+            cudaStreamSynchronize(copyStream);
            }
         } else {
-            // Allocation failed — free any partial allocation and fall back
-            GPU_FRAME_DBG("attach_cuda: cudaMallocPitch FAILED e1=%d e2=%d — fallback CPU",
-                          (int)e1, (int)e2);
-            if (e1 == cudaSuccess) cudaFree(ownedY);
-            if (e2 == cudaSuccess) cudaFree(ownedUV);
+            // Fallback if stream creation failed — NULL stream (may stall)
+            e3 = cudaMemcpy2D(slot->bufY, slot->pitchY,
+                              cudaFrame->data[0], cudaFrame->linesize[0],
+                              w, h, cudaMemcpyDeviceToDevice);
+            e4 = cudaMemcpy2D(slot->bufUV, slot->pitchUV,
+                              cudaFrame->data[1], cudaFrame->linesize[1],
+                              w, h / 2, cudaMemcpyDeviceToDevice);
         }
 
-        if (prevDev >= 0)
-            cudaSetDevice(prevDev);
+        if (prevDev >= 0) cudaSetDevice(prevDev);
 
+        if (e3 == cudaSuccess && e4 == cudaSuccess) {
+            data.isCudaDevicePtr = true;
+            data.yPlane = static_cast<uint8_t*>(slot->bufY);
+            data.uvPlane = static_cast<uint8_t*>(slot->bufUV);
+            data.yLinesize = static_cast<int>(slot->pitchY);
+            data.uvLinesize = static_cast<int>(slot->pitchUV);
+            data.poolSlot = slot; // Track for deferred release
+            // gpuCacheY/UV stay nullptr — global pool owns the buffers
+            d2dOk = true;
+            GPU_FRAME_DBG("attach_cuda: D2D OK (global pool) Y=%p UV=%p yPitch=%zu uvPitch=%zu",
+                          slot->bufY, slot->bufUV, slot->pitchY, slot->pitchUV);
+        } else {
+            GPU_FRAME_DBG("attach_cuda: D2D COPY FAILED (pool) e3=%d e4=%d — fallback",
+                          (int)e3, (int)e4);
+            // Release slot back to pool on failure (immediate, no cooldown needed)
+            slot->state.store(GpuNV12Slot::STATE_FREE, std::memory_order_release);
+        }
+    }
 
+    if (!d2dOk && !slot) {
+        // --- Legacy path: per-frame cudaMallocPitch (for modules without pool) ---
+        const size_t yBytes = static_cast<size_t>(w) * h;
+        const size_t uvBytes = static_cast<size_t>(w) * (h / 2);
+        const size_t totalBytes = yBytes + uvBytes;
+
+        if (ANSGpuFrameRegistry::instance().canAllocateGpuCache(totalBytes)) {
+            int prevDev = -1;
+            cudaGetDevice(&prevDev);
+            if (gpuIdx >= 0) cudaSetDevice(gpuIdx);
+
+            void* ownedY = nullptr;
+            void* ownedUV = nullptr;
+            size_t yPitch = 0;
+            size_t uvPitch = 0;
+
+            cudaError_t e1 = cudaMallocPitch(&ownedY, &yPitch, w, h);
+            cudaError_t e2 = cudaMallocPitch(&ownedUV, &uvPitch, w, h / 2);
+
+            if (e1 == cudaSuccess && e2 == cudaSuccess) {
+                cudaError_t e3 = cudaMemcpy2D(ownedY, yPitch,
+                                              cudaFrame->data[0], cudaFrame->linesize[0],
+                                              w, h, cudaMemcpyDeviceToDevice);
+                cudaError_t e4 = cudaMemcpy2D(ownedUV, uvPitch,
+                                              cudaFrame->data[1], cudaFrame->linesize[1],
+                                              w, h / 2, cudaMemcpyDeviceToDevice);
+
+                if (e3 == cudaSuccess && e4 == cudaSuccess) {
+                    data.isCudaDevicePtr = true;
+                    data.yPlane = static_cast<uint8_t*>(ownedY);
+                    data.uvPlane = static_cast<uint8_t*>(ownedUV);
+                    data.yLinesize = static_cast<int>(yPitch);
+                    data.uvLinesize = static_cast<int>(uvPitch);
+                    data.gpuCacheY = ownedY;
+                    data.gpuCacheUV = ownedUV;
+                    data.gpuCacheYPitch = yPitch;
+                    data.gpuCacheUVPitch = uvPitch;
+                    data.gpuCacheDeviceIdx = gpuIdx;
+                    data.gpuCacheValid = true;
+                    data.gpuCacheBytes = yPitch * h + uvPitch * (h / 2);
+                    ANSGpuFrameRegistry::instance().onGpuCacheCreated(data.gpuCacheBytes);
+                    d2dOk = true;
+                    GPU_FRAME_DBG("attach_cuda: D2D OK ownedY=%p ownedUV=%p yPitch=%zu uvPitch=%zu bytes=%zu",
+                                  ownedY, ownedUV, yPitch, uvPitch, data.gpuCacheBytes);
+                } else {
+                    GPU_FRAME_DBG("attach_cuda: D2D COPY FAILED e3=%d e4=%d — fallback CPU",
+                                  (int)e3, (int)e4);
+                    cudaFree(ownedY);
+                    cudaFree(ownedUV);
+                }
+            } else {
+                GPU_FRAME_DBG("attach_cuda: cudaMallocPitch FAILED e1=%d e2=%d — fallback CPU",
+                              (int)e1, (int)e2);
+                if (e1 == cudaSuccess) cudaFree(ownedY);
+                if (e2 == cudaSuccess) cudaFree(ownedUV);
+            }
+
+            if (prevDev >= 0) cudaSetDevice(prevDev);
+        }
+    }
 
     if (!d2dOk) {
-        // Fall back to CPU NV12 snapshot only (no zero-copy)
+        // D2D failed or no slot — take CPU NV12 snapshot now (before freeing cpuNV12).
+        // This is the ONLY path where the CPU snapshot is needed. Skipping it
+        // on the pool-success path avoids ~12MB malloc+memcpy+free per 4K frame.
+        if (cpuNV12) {
+            anscv_gpu_ops::detail::snapshotNV12Planes(
+                cpuNV12,
+                data.cpuYPlane, data.cpuYLinesize,
+                data.cpuUvPlane, data.cpuUvLinesize,
+                data.width, data.height);
+        }
         GPU_FRAME_DBG("attach_cuda: FALLBACK CPU-only cpuY=%p cpuUV=%p",
                       (void*)data.cpuYPlane, (void*)data.cpuUvPlane);
         data.isCudaDevicePtr = false;
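As a back-of-envelope check on the figures quoted in the comments: NV12 stores w × h bytes of Y plus w × h/2 bytes of interleaved UV, i.e. 1.5 bytes per pixel, so a 4K frame is 3840 × 2160 × 1.5 ≈ 12.4 MB; at roughly 23 fps that matches the ~276 MB/s churn cited above. A self-contained sketch of the non-blocking-stream copy pattern (function and parameter names are illustrative; the CUDA calls are the ones used in the diff):

    #include <cuda_runtime.h>

    // Copy a pitched NV12 surface into a pooled buffer on a dedicated
    // non-blocking stream (created once per slot with
    // cudaStreamCreateWithFlags(&s, cudaStreamNonBlocking)), then wait
    // only for that stream, never for kernels on other streams.
    cudaError_t copyNV12ToSlot(void* dstY, size_t dstPitchY,
                               void* dstUV, size_t dstPitchUV,
                               const void* srcY, const void* srcUV,
                               size_t srcPitch, int w, int h,
                               cudaStream_t stream) {
        cudaError_t e = cudaMemcpy2DAsync(dstY, dstPitchY, srcY, srcPitch,
                                          w, h, cudaMemcpyDeviceToDevice, stream);
        if (e != cudaSuccess) return e;
        e = cudaMemcpy2DAsync(dstUV, dstPitchUV, srcUV, srcPitch,
                              w, h / 2, cudaMemcpyDeviceToDevice, stream);
        if (e != cudaSuccess) return e;
        // Data is valid once this returns; freeing the source is then safe.
        return cudaStreamSynchronize(stream);
    }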
@@ -302,8 +364,8 @@ inline void gpu_frame_attach_cuda(cv::Mat* mat, AVFrame* cudaFrame, int gpuIdx,
         data.uvLinesize = data.cpuUvLinesize;
     }
 
-    // Release AVFrames immediately — NVDEC surfaces returned to pool.
-    // No longer stored in GpuFrameData (owned GPU copy is independent).
+    // Free AVFrames immediately — synchronous D2D copy has completed,
+    // so NVDEC surfaces can be returned to the decoder's surface pool.
     GPU_FRAME_DBG("attach_cuda: freeing AVFrames cudaFrame=%p cpuNV12=%p",
                   (void*)cudaFrame, (void*)cpuNV12);
     av_frame_free(&cudaFrame);
@@ -311,9 +373,9 @@ inline void gpu_frame_attach_cuda(cv::Mat* mat, AVFrame* cudaFrame, int gpuIdx,
     data.avframe = nullptr;
     data.cpuAvframe = nullptr;
 
-    GPU_FRAME_DBG("attach_cuda: FINAL yPlane=%p uvPlane=%p isCuda=%d gpuCacheY=%p gpuCacheUV=%p",
+    GPU_FRAME_DBG("attach_cuda: FINAL yPlane=%p uvPlane=%p isCuda=%d poolSlot=%p",
                   (void*)data.yPlane, (void*)data.uvPlane, (int)data.isCudaDevicePtr,
-                  data.gpuCacheY, data.gpuCacheUV);
+                  (void*)data.poolSlot);
 
     void* old = ANSGpuFrameRegistry::instance().attach(mat, std::move(data));
     if (old) {
@@ -327,12 +389,10 @@ inline void gpu_frame_attach_cuda(cv::Mat* mat, AVFrame* cudaFrame, int gpuIdx,
         AVFrame* stale = static_cast<AVFrame*>(p);
         av_frame_free(&stale);
     }
-
-    // Free stale GPU device pointers
-    anscv_gpu_ops::detail::drainAndFreeGpuPending();
 }
 
-// Release entry by cv::Mat* and free any returned AVFrames + GPU pointers.
+// Release entry by cv::Mat* and free any returned AVFrames.
+// GPU device pointers are deferred to TTL eviction or explicit cleanup.
 // Safe if not in map (no-op).
 inline void gpu_frame_remove(cv::Mat* mat) {
     if (!mat) return;
@@ -347,8 +407,7 @@ inline void gpu_frame_remove(cv::Mat* mat) {
         av_frame_free(&stale);
     }
 
-    // Free any GPU device pointers that became pending
-    anscv_gpu_ops::detail::drainAndFreeGpuPending();
+    // GPU device pointers deferred — see gpu_frame_evict_stale() / Destroy()
 }
 
 // Alias for remove — used in ANSCV mutating functions to drop stale GPU data.
@@ -357,6 +416,12 @@ inline void gpu_frame_invalidate(cv::Mat* mat) {
 }
 
 // Run TTL eviction + drain pending. Call periodically from camera threads.
+// TTL eviction is throttled to every 500ms (EVICT_CHECK_INTERVAL_MS).
+// GPU buffer cleanup is safe here because:
+//  1. Only frames >3 seconds old are evicted (kernels take <10ms)
+//  2. cudaDeviceSynchronize() ensures all in-flight kernels are done
+//  3. At 500ms interval, one sync per 500ms is ~0.1ms cost (acceptable)
+//     vs per-frame sync which caused 900ms spikes
 inline void gpu_frame_evict_stale() {
     ANSGpuFrameRegistry::instance().evictStaleFrames();
 
@@ -366,6 +431,7 @@ inline void gpu_frame_evict_stale() {
         av_frame_free(&stale);
     }
 
-    // Free any GPU device pointers from evicted frames
+    // Free GPU device pointers from evicted/released frames (legacy path).
+    // Pool-backed frames (ANSRTSP) don't add to this list (gpuCacheY=nullptr).
     anscv_gpu_ops::detail::drainAndFreeGpuPending();
 }
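The 500ms throttle referenced above lives inside ANSGpuFrameRegistry and is not part of this diff; a hypothetical sketch of the pattern:

    #include <chrono>
    #include <mutex>

    // Returns true at most once per 500ms window (EVICT_CHECK_INTERVAL_MS in
    // the registry). The mutex makes the check safe from multiple camera threads.
    inline bool shouldRunEviction() {
        using clock = std::chrono::steady_clock;
        static std::mutex m;
        static clock::time_point last{};
        std::lock_guard<std::mutex> g(m);
        const auto now = clock::now();
        if (now - last < std::chrono::milliseconds(500)) return false;
        last = now;
        return true;
    }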
@@ -1,6 +1,7 @@
 #include "ANSRTSP.h"
 #include "ANSMatRegistry.h"
 #include "ANSGpuFrameOps.h"
+#include "GpuNV12SlotPool.h"
 #include <memory>
 #include <format>
 #include "media_codec.h"
@@ -23,8 +24,9 @@ extern "C"
 // Note: per-instance thread safety is handled by ANSRTSPClient::_mutex
 // Mat registry thread safety is handled by anscv_mat_replace's internal registry_mutex
 
-// Debug logging — goes to both stderr AND OutputDebugString (DebugView).
+// Debug logging. Define ANSCORE_GPU_DEBUG=1 to enable verbose per-frame logging.
 #ifndef RTSP_DBG
+#if defined(ANSCORE_GPU_DEBUG) && ANSCORE_GPU_DEBUG
 #ifdef _WIN32
 #define RTSP_DBG(fmt, ...) do { \
     char _rtsp_buf[512]; \
@@ -35,6 +37,9 @@ extern "C"
 #else
 #define RTSP_DBG(fmt, ...) fprintf(stderr, fmt "\n", ##__VA_ARGS__)
 #endif
+#else
+#define RTSP_DBG(fmt, ...) ((void)0)
+#endif
 #endif
 static bool ansrtspLicenceValid = false;
 // Global once_flag to protect license checking
@@ -62,6 +67,7 @@ namespace ANSCENTER {
     ANSRTSPClient::~ANSRTSPClient() noexcept {
         Destroy();
     }
+
     void ANSRTSPClient::Destroy() {
         // Move the player client pointer out of the lock scope, then
         // close it OUTSIDE the mutex. close() calls cuArrayDestroy /
@@ -80,69 +86,44 @@ namespace ANSCENTER {
             }
         }
 
-        // --- Inference guard: wait for in-flight frames to finish ---
-        // GetRTSPCVImage increments _inFlightFrames when it hands out
-        // a GPU frame; the registry decrements it when the frame is
-        // released after inference completes. We wait here so that
-        // close() doesn't free NVDEC surfaces while TensorRT is
-        // still reading from them (the LabVIEW crash root cause).
+        // --- Inference guard: wait for in-flight D2D copies to finish ---
+        // With synchronous D2D copy, in-flight means "currently inside
+        // GetRTSPCVImage between TryIncrementInFlight and attach_cuda".
+        // This is typically <1ms, so the wait is very fast.
         int inFlight = _inFlightFrames.load(std::memory_order_acquire);
         if (inFlight > 0) {
             _logger.LogInfo("ANSRTSPClient::Destroy",
-                std::format("waiting for {} in-flight inference frame(s)...", inFlight),
+                std::format("waiting for {} in-flight frame(s)...", inFlight),
                 __FILE__, __LINE__);
             bool done = _inFlightDone.wait_for(lock, std::chrono::seconds(5), [this] {
                 return _inFlightFrames.load(std::memory_order_acquire) <= 0;
            });
             if (!done) {
                 _logger.LogWarn("ANSRTSPClient::Destroy",
-                    std::format("timed out waiting for in-flight frames "
-                                "(still {} in-flight) — force-releasing GPU frames",
-                                _inFlightFrames.load()),
+                    std::format("timed out — still {} in-flight", _inFlightFrames.load()),
                     __FILE__, __LINE__);
             }
         }
 
-        // Force-release ALL GPU frames owned by this client BEFORE close().
-        // Unreleased clones (e.g. LabVIEW AI tasks still holding cloned
-        // cv::Mat*) keep gpuCacheY/gpuCacheUV allocated. We must cudaFree
-        // them NOW while the CUDA context is still alive. After close()
-        // destroys the context, cudaFree would crash.
-        int forceReleased = ANSGpuFrameRegistry::instance().forceReleaseByOwner(this);
-        if (forceReleased > 0) {
-            _logger.LogWarn("ANSRTSPClient::Destroy",
-                std::format("force-released {} GPU frame(s) with unreleased clones", forceReleased),
-                __FILE__, __LINE__);
-            // Drain and cudaFree the GPU buffers while CUDA context is alive
-            // Sync all GPU streams before freeing to avoid illegal access
-            cudaDeviceSynchronize();
-            auto gpuPending = ANSGpuFrameRegistry::instance().drain_gpu_pending();
-            if (!gpuPending.empty()) {
-                RTSP_DBG("[Destroy] cudaFree %zu GPU ptrs before close()", gpuPending.size());
-                int prevDev = -1;
-                cudaGetDevice(&prevDev);
-                for (auto& entry : gpuPending) {
-                    if (entry.ptr) {
-                        if (entry.deviceIdx >= 0) cudaSetDevice(entry.deviceIdx);
-                        cudaFree(entry.ptr);
-                    }
-                }
-                if (prevDev >= 0) cudaSetDevice(prevDev);
-            }
-            // Also drain any pending AVFrames
-            auto avPending = ANSGpuFrameRegistry::instance().drain_pending();
-            for (void* p : avPending) {
-                AVFrame* f = static_cast<AVFrame*>(p);
-                av_frame_free(&f);
-            }
-        }
+        // Invalidate owner callbacks so stale GpuFrameData don't try to
+        // call DecrementInFlight on this (soon-to-be-deleted) object.
+        // The GpuFrameData and their global pool slots remain alive —
+        // inference engines can safely keep reading from them.
+        ANSGpuFrameRegistry::instance().invalidateOwner(this);
+        _inFlightFrames.store(0, std::memory_order_release);
+
+        // NO forceReleaseByOwner — frames survive camera deletion.
+        // Pool slot buffers are global (GpuNV12SlotPool) — NOT owned
+        // by this camera. They are recycled when inference finishes
+        // (GpuFrameData refcount → 0 → slot.inUse = false).
+        // NO cudaDeviceSynchronize — no GPU buffers to free here.
+        // NO DestroyGpuPool — per-camera pool has been removed.
 
         clientToClose = std::move(_playerClient);
     }
     // CUDA cleanup happens here, outside the mutex — now safe.
-    // All GPU frames owned by this client have been force-freed above.
+    // close() destroys the NVDEC decoder ONLY. Pool slot buffers
+    // (regular cudaMallocPitch allocations) are untouched — they
+    // belong to the global GpuNV12SlotPool, not the decoder.
     if (clientToClose) {
         clientToClose->close();
     }
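The guard members and helpers used in Destroy() are declared elsewhere; a hypothetical sketch consistent with how they are used above (note condition_variable_any, since the wait takes a unique_lock over a recursive_mutex):

    #include <atomic>
    #include <condition_variable>

    struct InFlightGuard {
        std::atomic<int> _inFlightFrames{0};
        std::condition_variable_any _inFlightDone;

        // Frame-grab path: refuse new work once the client is stopping.
        bool TryIncrementInFlight(bool isPlaying) {
            if (!isPlaying) return false;
            _inFlightFrames.fetch_add(1, std::memory_order_acq_rel);
            return true;
        }

        // Fired when the last clone of a frame is released after inference.
        void DecrementInFlight() {
            if (_inFlightFrames.fetch_sub(1, std::memory_order_acq_rel) == 1)
                _inFlightDone.notify_all();
        }
    };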
@@ -232,66 +213,44 @@ namespace ANSCENTER {
     bool ANSRTSPClient::Reconnect() {
         // 1. Mark as not-playing under the mutex FIRST. This makes GetImage()
         //    return the cached _pLastFrame instead of calling into the player,
-        //    preventing use-after-free when close() destroys CUDA resources.
+        //    and blocks new TryIncrementInFlight calls.
         {
             std::unique_lock<std::recursive_mutex> lock(_mutex);
             _isPlaying = false;
 
-            // --- Inference guard: wait for in-flight frames to finish ---
-            // Same guard as Destroy(): close() will free NVDEC surfaces, so
-            // we must wait for any inference engines still reading NV12 data
-            // via zero-copy CUDA device pointers.
+            // --- Inference guard: wait for in-flight D2D copies to finish ---
+            // With synchronous D2D copy, in-flight means "currently inside
+            // GetRTSPCVImage between TryIncrementInFlight and attach_cuda".
+            // This is typically <1ms, so the wait is very fast.
             int inFlight = _inFlightFrames.load(std::memory_order_acquire);
             if (inFlight > 0) {
                 _logger.LogInfo("ANSRTSPClient::Reconnect",
-                    std::format("waiting for {} in-flight inference frame(s)...", inFlight),
+                    std::format("waiting for {} in-flight frame(s)...", inFlight),
                     __FILE__, __LINE__);
                 bool done = _inFlightDone.wait_for(lock, std::chrono::seconds(5), [this] {
                     return _inFlightFrames.load(std::memory_order_acquire) <= 0;
                 });
                 if (!done) {
                     _logger.LogWarn("ANSRTSPClient::Reconnect",
-                        std::format("timed out waiting for in-flight frames "
-                                    "(still {} in-flight) — force-releasing GPU frames",
-                                    _inFlightFrames.load()),
+                        std::format("timed out — still {} in-flight", _inFlightFrames.load()),
                         __FILE__, __LINE__);
                 }
             }
 
-            // Force-release GPU frames before close() — same as Destroy().
-            int forceReleased = ANSGpuFrameRegistry::instance().forceReleaseByOwner(this);
-            if (forceReleased > 0) {
-                _logger.LogWarn("ANSRTSPClient::Reconnect",
-                    std::format("force-released {} GPU frame(s) with unreleased clones", forceReleased),
-                    __FILE__, __LINE__);
-                // Sync all GPU streams before freeing
-                cudaDeviceSynchronize();
-                auto gpuPending = ANSGpuFrameRegistry::instance().drain_gpu_pending();
-                if (!gpuPending.empty()) {
-                    int prevDev = -1;
-                    cudaGetDevice(&prevDev);
-                    for (auto& entry : gpuPending) {
-                        if (entry.ptr) {
-                            if (entry.deviceIdx >= 0) cudaSetDevice(entry.deviceIdx);
-                            cudaFree(entry.ptr);
-                        }
-                    }
-                    if (prevDev >= 0) cudaSetDevice(prevDev);
-                }
-                auto avPending = ANSGpuFrameRegistry::instance().drain_pending();
-                for (void* p : avPending) {
-                    AVFrame* f = static_cast<AVFrame*>(p);
-                    av_frame_free(&f);
-                }
-            }
+            // Invalidate owner callbacks — prevents stale DecrementInFlight
+            // calls after Reconnect re-creates the decoder.
+            // Frames and their global pool slots remain alive for inference.
+            ANSGpuFrameRegistry::instance().invalidateOwner(this);
+            _inFlightFrames.store(0, std::memory_order_release);
 
+            // NO forceReleaseByOwner — frames survive reconnect.
+            // NO cudaDeviceSynchronize — no GPU buffers to free.
+            // NO DestroyGpuPool — per-camera pool has been removed.
         }
 
-        // 2. close() does CUDA cleanup (cuArrayDestroy/cuMemFree) — run outside
-        //    _mutex to avoid deadlocking with nvcuda64 SRW lock held by inference.
-        //    Safe now because GetImage()/GetNV12Frame() won't touch the player
-        //    while _isPlaying == false, and all in-flight frames have been released.
+        // 2. close() destroys NVDEC decoder ONLY — run outside _mutex to
+        //    avoid deadlocking with nvcuda64 SRW lock held by inference.
+        //    Pool slot buffers are global and untouched.
         _logger.LogInfo("ANSRTSPClient::Reconnect",
             "calling close() — NVDEC decoder will be destroyed", __FILE__, __LINE__);
         RTSP_DBG("[Reconnect] BEFORE close() this=%p", (void*)this);
@@ -1071,6 +1030,8 @@ extern "C" __declspec(dllexport) int GetRTSPCVImage(
     }
 
     try {
+        auto t0 = std::chrono::steady_clock::now();
+
         // Get image (shallow copy - reference counted, fast)
         cv::Mat img = (*Handle)->GetImage(width, height, timeStamp);
 
@@ -1082,6 +1043,8 @@ extern "C" __declspec(dllexport) int GetRTSPCVImage(
         // Thread-safe Mat pointer swap (anscv_mat_replace has its own internal lock)
         anscv_mat_replace(image, std::move(img));
 
+        auto t1 = std::chrono::steady_clock::now();
+
         // Attach NV12 frame for GPU fast-path inference (side-table registry)
         // attach() takes ownership — do NOT av_frame_free here
         //
@@ -1101,7 +1064,11 @@ extern "C" __declspec(dllexport) int GetRTSPCVImage(
                      cudaHW->width, cudaHW->height,
                      (void*)cudaHW->data[0], (void*)cudaHW->data[1]);
             AVFrame* cpuNV12 = (*Handle)->GetNV12Frame();
-            gpu_frame_attach_cuda(*image, cudaHW, gpuIdx, timeStamp, cpuNV12);
+
+            // Acquire a slot from the global pool — survives camera Destroy.
+            GpuNV12Slot* slot = GpuNV12SlotPool::instance().acquire(
+                gpuIdx, cudaHW->width, cudaHW->height);
+            gpu_frame_attach_cuda(*image, cudaHW, gpuIdx, timeStamp, cpuNV12, slot);
         } else {
             // HW decode not active — try CPU NV12
             AVFrame* nv12 = (*Handle)->GetNV12Frame();
@@ -1114,11 +1081,11 @@ extern "C" __declspec(dllexport) int GetRTSPCVImage(
             // TryIncrementInFlight already incremented; DecrementInFlight fires
             // when the last clone of this frame is released after inference.
             auto* gpuData = ANSGpuFrameRegistry::instance().lookup(*image);
-            RTSP_DBG("[GetRTSPCVImage] after attach: gpuData=%p yPlane=%p isCuda=%d gpuCacheY=%p",
+            RTSP_DBG("[GetRTSPCVImage] after attach: gpuData=%p yPlane=%p isCuda=%d poolSlot=%p",
                      (void*)gpuData,
                      gpuData ? (void*)gpuData->yPlane : nullptr,
                      gpuData ? (int)gpuData->isCudaDevicePtr : -1,
-                     gpuData ? gpuData->gpuCacheY : nullptr);
+                     gpuData ? (void*)gpuData->poolSlot : nullptr);
             if (gpuData) {
                 gpuData->ownerClient = *Handle;
                 gpuData->onReleaseFn = [](void* client) {
@@ -1136,6 +1103,20 @@ extern "C" __declspec(dllexport) int GetRTSPCVImage(
             RTSP_DBG("[GetRTSPCVImage] SKIP CUDA — player not playing (reconnecting?)");
         }
 
+        // Lightweight timing via spdlog (no OutputDebugString).
+        // Logs only when the frame grab + D2D exceeds 50ms — helps diagnose stalls
+        // without the overhead of per-frame debug logging.
+        auto t2 = std::chrono::steady_clock::now();
+        double getImageMs = std::chrono::duration<double, std::milli>(t1 - t0).count();
+        double cudaMs = std::chrono::duration<double, std::milli>(t2 - t1).count();
+        double totalMs = getImageMs + cudaMs;
+        if (totalMs > 50.0) {
+            (*Handle)->_logger.LogWarn("GetRTSPCVImage",
+                std::format("SLOW FRAME: total={:.1f}ms (getImage={:.1f}ms cuda={:.1f}ms) {}x{}",
+                    totalMs, getImageMs, cudaMs, width, height),
+                __FILE__, __LINE__);
+        }
+
         return 1; // Success
     }
     catch (const cv::Exception& e) {

@@ -40,7 +40,7 @@ namespace ANSCENTER
         bool _isPlaying;
         std::recursive_mutex _mutex;
 
-        // --- Per-client inference guard ---
+        // --- Per-client inference guard ---
         // Tracks how many GPU frames from this client are currently in-flight
         // (grabbed by GetRTSPCVImage but not yet released after inference).
         // Destroy() waits for this to reach 0 before freeing NVDEC surfaces,
modules/ANSCV/GpuNV12SlotPool.cpp (new file, 107 lines)
@@ -0,0 +1,107 @@
+// GpuNV12SlotPool.cpp — Process-wide singleton, compiled into ANSCV.dll.
+//
+// ANSCV.dll owns the canonical GpuNV12SlotPool instance. Other DLLs
+// (ANSODEngine, etc.) find it via GetProcAddress at runtime.
+
+#define WIN32_LEAN_AND_MEAN
+#define NOMINMAX
+#include <windows.h>
+#include "GpuNV12SlotPool.h"
+
+#include <cuda_runtime.h>
+
+// ANSCV.dll owns the process-wide singleton.
+GpuNV12SlotPool* GpuNV12SlotPool::resolveProcessWide() {
+    static GpuNV12SlotPool pool;
+    return &pool;
+}
+
+// Exported so other DLLs (ANSODEngine, etc.) can find this instance at runtime.
+extern "C" __declspec(dllexport)
+GpuNV12SlotPool* GpuNV12SlotPool_GetInstance() {
+    return &GpuNV12SlotPool::instance();
+}
+
+// Transition all COOLING slots past the cooldown threshold to FREE.
+void GpuNV12SlotPool::drainCooledSlots_locked() {
+    auto now = std::chrono::steady_clock::now();
+    auto threshold = std::chrono::milliseconds(SLOT_COOLDOWN_MS);
+    for (auto& s : m_slots) {
+        if (s->state.load(std::memory_order_acquire) == GpuNV12Slot::STATE_COOLING) {
+            if (now - s->cooldownStart >= threshold) {
+                s->state.store(GpuNV12Slot::STATE_FREE, std::memory_order_release);
+            }
+        }
+    }
+}
+
+// Acquire a free slot matching (gpuIdx, w, h), or allocate a new one.
+GpuNV12Slot* GpuNV12SlotPool::acquire(int gpuIdx, int w, int h) {
+    std::lock_guard<std::mutex> lock(m_mutex);
+
+    // 1. Drain cooled-down slots to make them available
+    drainCooledSlots_locked();
+
+    // 2. Try to find an existing FREE slot that matches the resolution
+    for (auto& s : m_slots) {
+        if (s->state.load(std::memory_order_acquire) == GpuNV12Slot::STATE_FREE &&
+            s->gpuIdx == gpuIdx && s->width == w && s->height == h) {
+            s->state.store(GpuNV12Slot::STATE_ACTIVE, std::memory_order_release);
+            NV12POOL_DBG("acquire: reuse slot Y=%p UV=%p %dx%d gpu=%d (total=%zu)",
+                         s->bufY, s->bufUV, w, h, gpuIdx, m_slots.size());
+            return s.get();
+        }
+    }
+
+    // 3. No matching free slot — allocate a new one if under the limit
+    if (static_cast<int>(m_slots.size()) >= GPU_NV12_POOL_MAX_SLOTS) {
+        NV12POOL_DBG("acquire: POOL FULL (%zu slots) — fallback to CPU path",
+                     m_slots.size());
+        return nullptr;
+    }
+
+    // Allocate CUDA buffers on the target GPU
+    int prevDev = -1;
+    cudaGetDevice(&prevDev);
+    if (gpuIdx >= 0) cudaSetDevice(gpuIdx);
+
+    auto slot = std::make_unique<GpuNV12Slot>();
+    cudaError_t e1 = cudaMallocPitch(&slot->bufY, &slot->pitchY, w, h);
+    cudaError_t e2 = cudaMallocPitch(&slot->bufUV, &slot->pitchUV, w, h / 2);
+
+    // Non-blocking stream avoids NULL-stream implicit sync with inference.
+    // On WDDM, the NULL stream must wait for ALL other streams to finish
+    // before executing — this caused 1-2 second stalls when inference
+    // kernels were running. A non-blocking stream runs independently.
+    cudaStream_t stream = nullptr;
+    cudaError_t e3 = cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
+
+    if (prevDev >= 0) cudaSetDevice(prevDev);
+
+    if (e1 != cudaSuccess || e2 != cudaSuccess) {
+        NV12POOL_DBG("acquire: cudaMallocPitch FAILED %dx%d gpu=%d e1=%d e2=%d",
+                     w, h, gpuIdx, (int)e1, (int)e2);
+        // Clean up partial allocation
+        int prev2 = -1; cudaGetDevice(&prev2);
+        if (gpuIdx >= 0) cudaSetDevice(gpuIdx);
+        if (e1 == cudaSuccess && slot->bufY) cudaFree(slot->bufY);
+        if (e2 == cudaSuccess && slot->bufUV) cudaFree(slot->bufUV);
+        if (e3 == cudaSuccess && stream) cudaStreamDestroy(stream);
+        if (prev2 >= 0) cudaSetDevice(prev2);
+        return nullptr;
+    }
+
+    slot->width = w;
+    slot->height = h;
+    slot->gpuIdx = gpuIdx;
+    slot->copyStream = (e3 == cudaSuccess) ? stream : nullptr;
+    slot->state.store(GpuNV12Slot::STATE_ACTIVE, std::memory_order_release);
+
+    GpuNV12Slot* raw = slot.get();
+    m_slots.push_back(std::move(slot));
+
+    NV12POOL_DBG("acquire: NEW slot Y=%p UV=%p pitchY=%zu pitchUV=%zu %dx%d gpu=%d stream=%p (total=%zu)",
+                 raw->bufY, raw->bufUV, raw->pitchY, raw->pitchUV,
+                 w, h, gpuIdx, raw->copyStream, m_slots.size());
+    return raw;
+}
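The matching release path (refcount reaching zero) is not in this file; a hypothetical sketch consistent with the state machine above, using the 200ms cooldown described in the ANSGpuFrameOps.h comments:

    // Instead of going straight to FREE, a released slot COOLS for
    // SLOT_COOLDOWN_MS, so any kernel still reading it (<10ms) finishes
    // long before acquire() can hand the buffer out again.
    void releaseSlot(GpuNV12Slot* s) {
        if (!s) return;
        s->cooldownStart = std::chrono::steady_clock::now();
        s->state.store(GpuNV12Slot::STATE_COOLING, std::memory_order_release);
    }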
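A sketch of how a consumer DLL such as ANSODEngine might resolve the export above (module name is from the comments in this file; the error handling is illustrative):

    #include <windows.h>

    using GetPoolFn = GpuNV12SlotPool* (*)();

    GpuNV12SlotPool* resolveSharedPool() {
        // ANSCV.dll is already loaded by the host process, so no LoadLibrary.
        HMODULE mod = GetModuleHandleA("ANSCV.dll");
        if (!mod) return nullptr;
        auto fn = reinterpret_cast<GetPoolFn>(
            GetProcAddress(mod, "GpuNV12SlotPool_GetInstance"));
        return fn ? fn() : nullptr;
    }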