From 6fb09830c553ef771cb3543dc621f5bc4f3b11c7 Mon Sep 17 00:00:00 2001
From: Tuan Nghia Nguyen <nghia.nguyen@anscenter.com>
Date: Fri, 3 Apr 2026 14:51:52 +1100
Subject: [PATCH] Fix NV12 crash issue when recreate camera object (new
 structure) does not work

---
 .claude/settings.local.json                  |   9 +-
 NV12_CAMERA_RECONNECT_FIX_SUMMARY.md         | 173 +++++++++++
 include/ANSGpuFrameRegistry.h                |  69 ++++-
 include/GpuNV12SlotPool.h                    | 161 +++++++++++
 modules/ANSCV/ANSGpuFrameOps.h               | 284 ++++++++++++-------
 modules/ANSCV/ANSRTSP.cpp                    | 157 +++++-----
 modules/ANSCV/ANSRTSP.h                      |   2 +-
 modules/ANSCV/GpuNV12SlotPool.cpp            | 107 +++++++
 modules/ANSFR/CMakeLists.txt                 |   1 +
 modules/ANSFR/GpuNV12SlotPool.cpp            |  23 ++
 modules/ANSLPR/CMakeLists.txt                |   1 +
 modules/ANSLPR/GpuNV12SlotPool.cpp           |  23 ++
 modules/ANSOCR/GpuNV12SlotPool.cpp           |  23 ++
 modules/ANSODEngine/CMakeLists.txt           |   1 +
 modules/ANSODEngine/GpuNV12SlotPool.cpp      |  23 ++
 modules/ANSODEngine/NV12PreprocessHelper.cpp |   6 +-
 16 files changed, 854 insertions(+), 209 deletions(-)
 create mode 100644 NV12_CAMERA_RECONNECT_FIX_SUMMARY.md
 create mode 100644 include/GpuNV12SlotPool.h
 create mode 100644 modules/ANSCV/GpuNV12SlotPool.cpp
 create mode 100644 modules/ANSFR/GpuNV12SlotPool.cpp
 create mode 100644 modules/ANSLPR/GpuNV12SlotPool.cpp
 create mode 100644 modules/ANSOCR/GpuNV12SlotPool.cpp
 create mode 100644 modules/ANSODEngine/GpuNV12SlotPool.cpp

diff --git a/.claude/settings.local.json b/.claude/settings.local.json
index 7c6b0bd..13b006c 100644
--- a/.claude/settings.local.json
+++ b/.claude/settings.local.json
@@ -34,7 +34,14 @@
       "Bash(export \"LIB=C:/Program Files/Microsoft Visual Studio/2022/Community/VC/Tools/MSVC/14.44.35207/lib/x64;C:/Program Files \\(x86\\)/Windows Kits/10/Lib/10.0.26100.0/ucrt/x64;C:/Program Files \\(x86\\)/Windows Kits/10/Lib/10.0.26100.0/um/x64\")",
       "Bash(grep -E \"\\\\.\\(cpp|h|hpp\\)$\")",
       "Bash(find /c/Projects/CLionProjects/ANSCORE -name *Logger* -type f)",
-      "Bash(find /c/Projects/CLionProjects/ANSCORE -name *SPDLogger* -o -name *ANSLogger*)"
+      "Bash(find /c/Projects/CLionProjects/ANSCORE -name *SPDLogger* -o -name *ANSLogger*)",
+      "Read(//c/Users/nghia/Downloads/**)",
+      "Bash(find C:ProjectsCLionProjectsANSCORE -name *NV12* -type f)",
+      "Bash(find C:ProjectsCLionProjectsANSCORE -name *GpuFrame* -type f)",
+      "mcp__desktop-commander__get_file_info",
+      "mcp__desktop-commander__interact_with_process",
+      "Bash(sort -t= -k2 -rn)",
+      "Bash(sort -t= -k3 -rn)"
     ]
   }
 }
diff --git a/NV12_CAMERA_RECONNECT_FIX_SUMMARY.md b/NV12_CAMERA_RECONNECT_FIX_SUMMARY.md
new file mode 100644
index 0000000..ca1939a
--- /dev/null
+++ b/NV12_CAMERA_RECONNECT_FIX_SUMMARY.md
@@ -0,0 +1,173 @@
+# NV12 Camera Reconnect Fix — Complete Summary
+
+## Original Issue
+When RTSP camera reconnects (stop→destroy→release→create) while inference engines
+(ANSALPR, ANSFR, ANSOCR, ANSODEngine) are running, the camera process destroys NV12
+GPU memory (NVDEC surfaces) that inference is actively reading → crash LabVIEW.
+
+## Root Causes Discovered (in order of discovery)
+
+### 1. No inference guard in Reconnect()
+`Reconnect()` called `close()` immediately without waiting for in-flight inference.
+`Destroy()` had the guard but `Reconnect()` did not.
+
+### 2. Race window between GetCudaHWFrame() and IncrementInFlight()
+`_inFlightFrames` was incremented AFTER `GetCudaHWFrame()` returned NVDEC pointers.
+Reconnect could see counter=0 and call `close()` while D2D copy was in progress.
+**Log proof:** `attach_cuda: START` with no `D2D OK` (ANSLEGION.log line 52859).
+
+### 3. NVDEC surfaces tied to decoder lifetime
+Even with proper ref counting, `player->close()` destroys the NVDEC decoder context,
+invalidating ALL surfaces at the driver level regardless of AVFrame references.
+Raw NVDEC device pointers in GpuFrameData become dangling.
+
+### 4. Cross-stream cudaFree race (cudaErrorIllegalAddress 700)
+`cudaFree` on default stream (stream 0) doesn't wait for NV12 kernels on
+cv::cuda::Stream. Freeing a buffer while a kernel reads it → error 700 →
+CUDA context permanently corrupted → all subsequent CUDA operations fail.
+**Log proof:** ANSLEGION.3log.log line 75769.
+
+### 5. cudaDeviceSynchronize blocking (900-1800ms spikes)
+Adding `cudaDeviceSynchronize()` before `cudaFree` fixed error 700 but blocked
+the entire GPU pipeline for 900ms+ per call, causing processing time spikes.
+
+### 6. GPU memory exhaustion (cudaMallocPitch FAILED e1=2)
+Per-frame `cudaMallocPitch`/`cudaFree` caused VRAM to fill up when cudaFree was
+deferred. Error 2 = `cudaErrorMemoryAllocation` (OOM).
+**Log proof:** ANSLEGION.7.log at timestamp 457s.
+
+### 7. Unreleased clones (70 images) on shutdown
+LabVIEW AI tasks hold cloned cv::Mat* references. On quit, 70+ clones are not
+released. Their GPU buffers (gpuCacheY/UV) are never freed. After `close()` destroys
+CUDA context, later `cudaFree` crashes.
+**Log proof:** dump log showing `WriteFlatTransferData` access violation.
+
+### 8. WDDM nvcuda64 SRW lock contention (45-90s freeze)
+Even with async copy, `cudaStreamSynchronize` blocks when another camera's `close()`
+holds the nvcuda64 exclusive SRW lock. All CUDA operations from other cameras stall.
+**Log proof:** ANSLEGION9.log — 45s gap between START and D2D OK.
+
+---
+
+## Files Modified
+
+### `include/ANSGpuFrameRegistry.h`
+- Added `GpuPendingFreeEntry` struct with `{ptr, deviceIdx, queuedAt}` for time-based safe free
+- Added `forceReleaseByOwner(void* client)` — force-cleans all frames owned by a camera
+- Changed `drain_gpu_pending(int minAgeMs=0)` — supports time-based filtering (only free entries >100ms old)
+- Added `REG_DBG` logging macro (OutputDebugString + fprintf)
+- Added debug logging to `attach()`, `release()`, `freeOwnedBuffers_locked()`
+
+### `modules/ANSCV/ANSGpuFrameOps.h`
+- Added `#include <cuda_runtime.h>` and `GPU_FRAME_DBG` logging macro
+- Added `drainAndFreeGpuPending(bool forceAll=false)` — time-based safe free (100ms threshold) or forced (with cudaDeviceSynchronize for Destroy/Reconnect)
+- **Rewrote `gpu_frame_attach_cuda()`**:
+  - Accepts optional pool buffers: `poolY, poolYPitch, poolUV, poolUVPitch, poolStream`
+  - **Pool path (ANSRTSP):** `cudaMemcpy2DAsync` on non-blocking stream, no sync, keeps AVFrame alive
+  - **Legacy path (other modules):** per-frame `cudaMallocPitch` + `cudaMemcpy2D` (backward compatible)
+  - Both paths: snapshot CPU NV12 as cross-GPU fallback, set `isCudaDevicePtr=true`
+- `gpu_frame_remove()` — no cudaFree in hot path (deferred to eviction/cleanup)
+- `gpu_frame_evict_stale()` — calls `drainAndFreeGpuPending(false)` with time-based safety
+
+### `modules/ANSCV/ANSRTSP.h`
+- Added `GpuNV12Pool` struct: ping-pong buffers (Y[2], UV[2]), pitch, resolution, GPU index, cudaStream
+- Added `EnsureGpuPool()`, `DestroyGpuPool()`, `GetGpuPool()` methods
+- Added `TryIncrementInFlight()` — atomically checks `_isPlaying` AND increments `_inFlightFrames` under same mutex
+
+### `modules/ANSCV/ANSRTSP.cpp`
+- Added `RTSP_DBG` logging macro (OutputDebugString + fprintf)
+- Implemented `EnsureGpuPool()`: allocates 2 Y + 2 UV buffers + non-blocking CUDA stream on first CUDA frame
+- Implemented `DestroyGpuPool()`: syncs pool stream, cudaFree all 4 buffers, destroy stream
+- **`Reconnect()`**: added inference guard (wait for _inFlightFrames), forceReleaseByOwner, DestroyGpuPool before close()
+- **`Destroy()`**: added forceReleaseByOwner, DestroyGpuPool before close()
+- **`GetRTSPCVImage()`**: calls `TryIncrementInFlight()` BEFORE `GetCudaHWFrame()`, uses pool buffers + pool stream for D2D copy
+
+### `modules/ANSODEngine/NV12PreprocessHelper.cpp`
+- Added debug logging before tryNV12 (pointer state, refcount, gpuCacheY) and at kernel launch
+
+---
+
+## Architecture After Fix
+
+```
+GetRTSPCVImage (camera thread):
+  1. TryIncrementInFlight() — atomic check _isPlaying + increment (blocks Reconnect)
+  2. GetCudaHWFrame() — gets NVDEC device pointers
+  3. EnsureGpuPool() — allocate 2 ping-pong buffer pairs (first frame only)
+  4. pool.next() — get next ping-pong slot
+  5. gpu_frame_attach_cuda(..., poolY, poolUV, poolStream):
+     a. cudaMemcpy2DAsync: NVDEC → pool buffer (non-blocking, no sync)
+     b. Keep AVFrame alive (stored in GpuFrameData.avframe)
+     c. yPlane = poolY, uvPlane = poolUV, isCudaDevicePtr = true
+     d. gpuCacheY = nullptr (pool owns buffers)
+  6. Registry callback: DecrementInFlight on last clone release
+
+Inference (engine thread):
+  1. tl_currentGpuFrame() = lookup(*cvImage) — raw GpuFrameData pointer
+  2. tryNV12(): reads yPlane/uvPlane (pool buffer, CUDA device ptr)
+  3. useZeroCopy = true → wraps as GpuMat → CUDA kernel launch
+  4. Kernel reads from pool buffer — same data, independent of NVDEC
+
+Reconnect (camera thread):
+  1. _isPlaying = false — blocks new TryIncrementInFlight
+  2. Wait _inFlightFrames == 0 (5s timeout)
+  3. forceReleaseByOwner(this) — free all registry entries for this camera
+  4. DestroyGpuPool() — cudaStreamSynchronize + cudaFree 4 buffers + destroy stream
+  5. close() — destroy NVDEC decoder (safe, no dependencies)
+  6. Setup() + play() — re-create camera
+  7. Next frame: EnsureGpuPool re-allocates
+
+Destroy (camera thread):
+  Same as Reconnect steps 1-5, then delete object.
+```
+
+## Key Design Decisions
+
+| Decision | Why |
+|----------|-----|
+| D2D copy instead of raw NVDEC pointers | Decouples NV12 lifetime from decoder — close() is always safe |
+| Ping-pong GPU buffer pool (2 pairs) | Eliminates per-frame cudaMalloc/cudaFree — constant VRAM |
+| cudaMemcpy2DAsync + no sync | Avoids 45-90s WDDM SRW lock freeze during close() |
+| Keep AVFrame alive during async copy | NVDEC source valid until GPU finishes the copy |
+| TryIncrementInFlight before GetCudaHWFrame | Closes race window — Reconnect must wait before close() |
+| forceReleaseByOwner on Destroy/Reconnect | Cleans up unreleased clones before CUDA context destroyed |
+| Time-based cudaFree (100ms threshold) | Frees legacy-path buffers without cudaDeviceSynchronize |
+| No cudaFree in hot path (attach/remove) | Prevents cross-stream races and processing spikes |
+
+## VRAM Usage Per Camera
+- 4K (3840×2160): ~26MB fixed (2 × Y + 2 × UV with pitch alignment)
+- 1080p (1920×1080): ~6.5MB fixed
+- 720p (1280×720): ~3MB fixed
+- Zero growth during normal operation
+
+## Status of Each Fix (chronological)
+
+| # | Fix | Log Verified | Result |
+|---|-----|-------------|--------|
+| 1 | Inference guard in Reconnect | ANSLEGION.log | Closed race but didn't fix root cause |
+| 2 | D2D copy (owned GPU buffers) | ANSLEGION.1log | Fixed NVDEC dependency, new race found |
+| 3 | TryIncrementInFlight | ANSLEGION.2log | Fixed race window, ran 85min stable |
+| 4 | cudaDeviceSynchronize before cudaFree | ANSLEGION.3log→4 | Fixed err=700, caused 900ms spikes |
+| 5 | Remove cudaFree from hot path | ANSLEGION.5→6 | Removed spikes, caused memory leak |
+| 6 | GPU buffer pool (ping-pong) | ANSLEGION.7→8 | Fixed OOM, zero CUDA errors, 7hr run |
+| 7 | Time-based safe cudaFree (100ms) | ANSLEGION.8 | Fixed memory leak from legacy path |
+| 8 | Async D2D + no sync | ANSLEGION.9→10 | Fix for WDDM SRW lock freeze |
+| 9 | Keep AVFrame alive during async | Current | Ensures NVDEC source valid for async copy |
+
+## Files NOT Modified (transparent to engines)
+- All engine DLLs (ANSODEngine, ANSLPR, ANSFR, ANSOCR) — no changes
+- NV12PreprocessHelper logic — zero-copy path unchanged (isCudaDevicePtr=true)
+- CUDA kernels (nv12_to_rgb.cu) — reads GPU pointers regardless of source
+- TensorRT engine (EngineRunInference.inl) — no changes
+- Other camera modules (ANSFLV, ANSSRT, ANSMJPEG, ANSRTMP) — use legacy path, backward compatible
+
+## Testing Checklist
+- [ ] Start multiple RTSP cameras with HW decoding + multiple AI engines
+- [ ] Trigger camera reconnect (disconnect network cable or call ReconnectRTSP)
+- [ ] Verify: no crash, inference continues after reconnection
+- [ ] Verify: processing time stable (no 900ms+ spikes)
+- [ ] Verify: VRAM stable (no growth over time, check nvidia-smi)
+- [ ] Verify: clean shutdown (no crash on LabVIEW quit)
+- [ ] Verify logs: `D2D OK (pool)` on every frame, `async=1`
+- [ ] Verify logs: no `cudaFree FAILED`, no `FALLBACK CPU`, no `err=700`
+- [ ] Long run test: 1+ hours with cameras reconnecting periodically
diff --git a/include/ANSGpuFrameRegistry.h b/include/ANSGpuFrameRegistry.h
index cd103f9..f230b32 100644
--- a/include/ANSGpuFrameRegistry.h
+++ b/include/ANSGpuFrameRegistry.h
@@ -42,8 +42,10 @@
 #include <windows.h>
 #endif
 
-// Debug logging for registry operations — both stderr and OutputDebugString.
+// Debug logging for registry operations.
+// Define ANSCORE_GPU_DEBUG=1 to enable verbose per-frame GPU logging.
 #ifndef REG_DBG
+#if defined(ANSCORE_GPU_DEBUG) && ANSCORE_GPU_DEBUG
 #ifdef _WIN32
 #define REG_DBG(fmt, ...) do { \
     char _reg_buf[512]; \
@@ -54,7 +56,13 @@
 #else
 #define REG_DBG(fmt, ...) fprintf(stderr, "[Registry] " fmt "\n", ##__VA_ARGS__)
 #endif
+#else
+#define REG_DBG(fmt, ...) ((void)0)
 #endif
+#endif
+
+// GpuNV12Slot definition needed by freeOwnedBuffers_locked() (accesses inUse atomic).
+#include "GpuNV12SlotPool.h"
 
 // Safety constants
 static constexpr int    MAX_FRAME_REFCOUNT    = 64;
@@ -66,6 +74,7 @@ static constexpr int    EVICT_CHECK_INTERVAL_MS  = 500;
 struct GpuPendingFreeEntry {
     void* ptr       = nullptr;
     int   deviceIdx = -1;
+    std::chrono::steady_clock::time_point queuedAt;  // When this entry was queued
 };
 
 struct GpuFrameData {
@@ -116,6 +125,13 @@ struct GpuFrameData {
     void*  ownerClient   = nullptr;
     void (*onReleaseFn)(void*) = nullptr;
 
+    // --- Global pool slot (from GpuNV12SlotPool) ---
+    // When non-null, yPlane/uvPlane point into this slot's buffers.
+    // Released (slot->inUse = false) in freeOwnedBuffers_locked() when
+    // the frame's refcount drops to 0 — guarantees the buffer is not
+    // freed while any consumer is still reading it.
+    GpuNV12Slot* poolSlot = nullptr;
+
     // Default constructor
     GpuFrameData() = default;
 
@@ -134,6 +150,7 @@ struct GpuFrameData {
         , yLinesize(o.yLinesize), uvLinesize(o.uvLinesize)
         , refcount(o.refcount.load()), createdAt(o.createdAt)
         , ownerClient(o.ownerClient), onReleaseFn(o.onReleaseFn)
+        , poolSlot(o.poolSlot)
     {
         // Null out source to prevent double-free of owned pointers
         o.cpuYPlane = nullptr;
@@ -147,6 +164,7 @@ struct GpuFrameData {
         o.gpuCacheBytes = 0;
         o.ownerClient = nullptr;
         o.onReleaseFn = nullptr;
+        o.poolSlot = nullptr;
     }
 
     // No copy
@@ -344,11 +362,30 @@ public:
 
     // --- Drain pending GPU device pointers for caller to cudaFree ---
     // Each entry includes the device index for cudaSetDevice before cudaFree.
-    std::vector<GpuPendingFreeEntry> drain_gpu_pending() {
+    // If minAgeMs > 0, only drain entries older than minAgeMs milliseconds.
+    // This allows time-based safety: entries queued >100ms ago are guaranteed
+    // safe to free because all CUDA kernels complete in <10ms.
+    std::vector<GpuPendingFreeEntry> drain_gpu_pending(int minAgeMs = 0) {
         std::lock_guard<std::mutex> lock(m_mutex);
-        std::vector<GpuPendingFreeEntry> result;
-        result.swap(m_pendingGpuFree);
-        return result;
+        if (minAgeMs <= 0) {
+            // Drain all (used by Destroy/Reconnect with cudaDeviceSynchronize)
+            std::vector<GpuPendingFreeEntry> result;
+            result.swap(m_pendingGpuFree);
+            return result;
+        }
+        // Drain only entries older than minAgeMs
+        auto now = std::chrono::steady_clock::now();
+        auto threshold = std::chrono::milliseconds(minAgeMs);
+        std::vector<GpuPendingFreeEntry> ready;
+        std::vector<GpuPendingFreeEntry> notReady;
+        for (auto& entry : m_pendingGpuFree) {
+            if (now - entry.queuedAt >= threshold)
+                ready.push_back(entry);
+            else
+                notReady.push_back(entry);
+        }
+        m_pendingGpuFree = std::move(notReady);
+        return ready;
     }
 
     // --- TTL eviction: force-free frames older than FRAME_TTL_SECONDS ---
@@ -506,10 +543,23 @@ private:
     // Free malloc'd CPU NV12 buffers and GPU cache (but NOT avframe/cpuAvframe —
     // those go to pendingFree for the caller to av_frame_free).
     void freeOwnedBuffers_locked(GpuFrameData* frame) {
-        REG_DBG("freeOwnedBuffers: frame=%p cpuY=%p cpuUV=%p gpuCacheY=%p gpuCacheUV=%p bytes=%zu dev=%d",
+        REG_DBG("freeOwnedBuffers: frame=%p cpuY=%p cpuUV=%p gpuCacheY=%p gpuCacheUV=%p bytes=%zu dev=%d poolSlot=%p",
                 (void*)frame, (void*)frame->cpuYPlane, (void*)frame->cpuUvPlane,
                 frame->gpuCacheY, frame->gpuCacheUV,
-                frame->gpuCacheBytes, frame->gpuCacheDeviceIdx);
+                frame->gpuCacheBytes, frame->gpuCacheDeviceIdx, (void*)frame->poolSlot);
+        // Release global pool slot via DEFERRED release — the slot enters a
+        // "cooling" state for SLOT_COOLDOWN_MS (200ms) before it becomes
+        // available for reuse.  This guarantees that any in-flight GPU kernels
+        // (launched asynchronously by inference engines) have completed reading
+        // from the buffer.  CPU refcount→0 does NOT mean the GPU is done.
+        if (frame->poolSlot) {
+            GpuNV12SlotPool::deferRelease(frame->poolSlot);
+            frame->poolSlot = nullptr;
+            // yPlane/uvPlane pointed into the pool slot — null them to
+            // prevent any stale reads after this point.
+            frame->yPlane = nullptr;
+            frame->uvPlane = nullptr;
+        }
         if (frame->cpuYPlane) {
             std::free(frame->cpuYPlane);
             frame->cpuYPlane = nullptr;
@@ -525,10 +575,11 @@ private:
             frame->gpuCacheValid = false;
             frame->gpuCacheBytes = 0;
             int devIdx = frame->gpuCacheDeviceIdx;
+            auto now = std::chrono::steady_clock::now();
             if (frame->gpuCacheY)
-                m_pendingGpuFree.push_back({frame->gpuCacheY, devIdx});
+                m_pendingGpuFree.push_back({frame->gpuCacheY, devIdx, now});
             if (frame->gpuCacheUV)
-                m_pendingGpuFree.push_back({frame->gpuCacheUV, devIdx});
+                m_pendingGpuFree.push_back({frame->gpuCacheUV, devIdx, now});
             frame->gpuCacheY = nullptr;
             frame->gpuCacheUV = nullptr;
         }
diff --git a/include/GpuNV12SlotPool.h b/include/GpuNV12SlotPool.h
new file mode 100644
index 0000000..9c66ca8
--- /dev/null
+++ b/include/GpuNV12SlotPool.h
@@ -0,0 +1,161 @@
+#pragma once
+// GpuNV12SlotPool.h — Process-wide GPU NV12 buffer pool.
+//
+// Provides pre-allocated CUDA buffer slots (Y + UV planes) that are shared
+// across all RTSP camera instances.  Slots are acquired per-frame by
+// GetRTSPCVImage and released back to the pool when the GpuFrameData's
+// refcount drops to 0 in freeOwnedBuffers_locked().
+//
+// KEY DESIGN: Slots are NEVER freed when a camera is destroyed — they are
+// recycled.  This decouples GPU buffer lifetime from camera lifetime, so
+// inference engines can safely read NV12 data even after the camera object
+// that produced it has been deleted and recreated (the LabVIEW reconnect
+// pattern: ReleaseHandle → Destroy → delete → CreateHandle).
+//
+// TIME-DELAYED RELEASE: When a GpuFrameData's refcount drops to 0, the
+// slot is NOT immediately available.  It enters a "cooling" state for
+// SLOT_COOLDOWN_MS (50ms) to guarantee that any in-flight GPU kernels
+// (launched asynchronously by inference engines) have completed reading
+// from the buffer.  CUDA kernels typically complete in <10ms, so 50ms
+// provides a 5x safety margin.  The cooldown is kept short to minimize
+// the number of slots in COOLING, which prevents POOL FULL events.
+// POOL FULL triggers per-frame cudaMalloc/cudaFree, which holds the
+// nvcuda64 SRW lock and causes cascading stalls on other cameras'
+// cudaMemcpy2D operations.
+//
+// Thread-safe: acquire() locks internally, deferRelease() is lock-free.
+//
+// Cross-DLL: uses the same resolveProcessWide() singleton pattern as
+// ANSGpuFrameRegistry.  ANSCV.dll owns the canonical instance; other DLLs
+// find it via GetProcAddress("GpuNV12SlotPool_GetInstance").
+
+#include <vector>
+#include <memory>
+#include <mutex>
+#include <atomic>
+#include <cstdint>
+#include <cstdio>
+#include <chrono>
+
+#ifdef _WIN32
+#include <windows.h>
+#endif
+
+// Safety constants
+static constexpr int GPU_NV12_POOL_MAX_SLOTS   = 64;
+static constexpr int SLOT_COOLDOWN_MS          = 50;   // Time after CPU release before slot reuse
+                                                        // GPU kernels complete in <10ms; 50ms = 5× margin
+
+// Debug logging for pool operations.
+// Define ANSCORE_GPU_DEBUG=1 to enable verbose per-frame GPU logging.
+// In production, these are silent to avoid OutputDebugString/fprintf
+// lock contention (measured: 500-2000 calls/sec causes process stalls).
+#ifndef NV12POOL_DBG
+#if defined(ANSCORE_GPU_DEBUG) && ANSCORE_GPU_DEBUG
+#ifdef _WIN32
+#define NV12POOL_DBG(fmt, ...) do { \
+    char _p_buf[512]; \
+    snprintf(_p_buf, sizeof(_p_buf), "[NV12Pool] " fmt "\n", ##__VA_ARGS__); \
+    OutputDebugStringA(_p_buf); \
+    fprintf(stderr, "%s", _p_buf); \
+} while(0)
+#else
+#define NV12POOL_DBG(fmt, ...) fprintf(stderr, "[NV12Pool] " fmt "\n", ##__VA_ARGS__)
+#endif
+#else
+#define NV12POOL_DBG(fmt, ...) ((void)0)
+#endif
+#endif
+
+struct GpuNV12Slot {
+    void*  bufY    = nullptr;   // cudaMallocPitch'd Y plane
+    void*  bufUV   = nullptr;   // cudaMallocPitch'd UV plane
+    size_t pitchY  = 0;
+    size_t pitchUV = 0;
+    int    width   = 0;         // Resolution this slot was allocated for
+    int    height  = 0;
+    int    gpuIdx  = -1;        // GPU device index
+
+    // Slot lifecycle state:
+    //   FREE      (0) = available for acquire()
+    //   ACTIVE    (1) = owned by a GpuFrameData (D2D copy + inference reading)
+    //   COOLING   (2) = CPU released but GPU kernel may still be reading;
+    //                    becomes FREE after SLOT_COOLDOWN_MS elapses.
+    static constexpr int STATE_FREE    = 0;
+    static constexpr int STATE_ACTIVE  = 1;
+    static constexpr int STATE_COOLING = 2;
+    std::atomic<int> state{STATE_FREE};
+
+    // Timestamp when the slot entered COOLING state.
+    // Only meaningful when state == STATE_COOLING.
+    std::chrono::steady_clock::time_point cooldownStart;
+
+    // Per-slot CUDA stream for D2D copy (non-blocking).
+    // CRITICAL: cudaMemcpy2D (no stream arg) uses the NULL stream, which on
+    // WDDM implicitly synchronizes with ALL other streams before executing.
+    // This means the D2D copy must wait for all inference kernels to finish
+    // first — causing 1-2 second stalls.  Using a dedicated non-blocking
+    // stream avoids this implicit sync entirely.
+    // Stored as void* to avoid cuda_runtime.h in the header.
+    void* copyStream = nullptr;  // cudaStream_t
+};
+
+class GpuNV12SlotPool {
+public:
+    // Process-wide singleton (same pattern as ANSGpuFrameRegistry).
+    static GpuNV12SlotPool& instance() {
+#ifdef _WIN32
+        static GpuNV12SlotPool* s_inst = resolveProcessWide();
+        return *s_inst;
+#else
+        static GpuNV12SlotPool pool;
+        return pool;
+#endif
+    }
+
+    // Acquire a free slot matching (gpuIdx, w, h).
+    // Drains cooled-down slots first, then looks for a FREE match.
+    // If none, allocates a new one (up to GPU_NV12_POOL_MAX_SLOTS).
+    // Returns nullptr if pool full — caller falls back to CPU path.
+    GpuNV12Slot* acquire(int gpuIdx, int w, int h);
+
+    // Deferred release: moves slot from ACTIVE → COOLING.
+    // Called from freeOwnedBuffers_locked() when GpuFrameData refcount → 0.
+    // The slot becomes FREE after SLOT_COOLDOWN_MS elapses (checked in acquire).
+    static void deferRelease(GpuNV12Slot* slot) {
+        if (slot) {
+            slot->cooldownStart = std::chrono::steady_clock::now();
+            slot->state.store(GpuNV12Slot::STATE_COOLING, std::memory_order_release);
+        }
+    }
+
+    // Number of allocated slots (for diagnostics).
+    size_t slotCount() const {
+        std::lock_guard<std::mutex> lock(m_mutex);
+        return m_slots.size();
+    }
+
+    // Number of in-use slots (for diagnostics).
+    size_t activeCount() const {
+        std::lock_guard<std::mutex> lock(m_mutex);
+        size_t count = 0;
+        for (auto& s : m_slots) {
+            if (s->state.load(std::memory_order_relaxed) != GpuNV12Slot::STATE_FREE) ++count;
+        }
+        return count;
+    }
+
+private:
+    GpuNV12SlotPool() = default;
+
+#ifdef _WIN32
+    static GpuNV12SlotPool* resolveProcessWide();
+#endif
+
+    // Transition all COOLING slots that have exceeded SLOT_COOLDOWN_MS to FREE.
+    // Called at the start of acquire() under the lock.
+    void drainCooledSlots_locked();
+
+    mutable std::mutex m_mutex;
+    std::vector<std::unique_ptr<GpuNV12Slot>> m_slots;
+};
diff --git a/modules/ANSCV/ANSGpuFrameOps.h b/modules/ANSCV/ANSGpuFrameOps.h
index 40798a6..cd61e98 100644
--- a/modules/ANSCV/ANSGpuFrameOps.h
+++ b/modules/ANSCV/ANSGpuFrameOps.h
@@ -14,6 +14,7 @@
 // gpu_frame_lookup() + the GpuFrameData plane pointers.
 
 #include "ANSGpuFrameRegistry.h"
+#include "GpuNV12SlotPool.h"
 
 extern "C" {
 #include "libavutil/frame.h"
@@ -29,9 +30,9 @@ extern "C" {
 #endif
 
 // Debug logging macro for GPU frame operations.
-// Output goes to stderr (console) AND OutputDebugString (DebugView / VS debugger).
-// Use Sysinternals DebugView (dbgview64.exe) to capture these after a crash.
+// Define ANSCORE_GPU_DEBUG=1 to enable verbose per-frame GPU logging.
 #ifndef GPU_FRAME_DBG
+#if defined(ANSCORE_GPU_DEBUG) && ANSCORE_GPU_DEBUG
 #ifdef _WIN32
 #define GPU_FRAME_DBG(fmt, ...) do { \
     char _gpu_dbg_buf[512]; \
@@ -43,6 +44,9 @@ extern "C" {
 #define GPU_FRAME_DBG(fmt, ...) \
     fprintf(stderr, "[GpuFrameOps] " fmt "\n", ##__VA_ARGS__)
 #endif
+#else
+#define GPU_FRAME_DBG(fmt, ...) ((void)0)
+#endif
 #endif
 
 namespace anscv_gpu_ops {
@@ -94,31 +98,29 @@ inline bool snapshotNV12Planes(const AVFrame* nv12,
     return true;
 }
 
-// Drain pending GPU device pointers and actually cudaFree them.
-// Must be called from a thread with CUDA context available.
-inline void drainAndFreeGpuPending() {
-    auto gpuPending = ANSGpuFrameRegistry::instance().drain_gpu_pending();
+// Drain pending GPU device pointers and cudaFree them.
+// Uses time-based safety: only frees entries queued >100ms ago, guaranteeing
+// all CUDA kernels reading from them have completed (kernels take <10ms).
+// NO cudaDeviceSynchronize — zero blocking of GPU pipeline.
+//
+// If forceAll=true, drains ALL entries with cudaDeviceSynchronize first
+// (used only by Destroy/Reconnect for final cleanup).
+inline void drainAndFreeGpuPending(bool forceAll = false) {
+    static constexpr int SAFE_AGE_MS = 100; // 100ms >> 10ms kernel duration
+    auto gpuPending = ANSGpuFrameRegistry::instance().drain_gpu_pending(
+        forceAll ? 0 : SAFE_AGE_MS);
     if (gpuPending.empty()) return;
-    GPU_FRAME_DBG("drainGpuPending: freeing %zu GPU ptrs", gpuPending.size());
+    GPU_FRAME_DBG("drainGpuPending: freeing %zu GPU ptrs (force=%d)", gpuPending.size(), (int)forceAll);
     int prevDev = -1;
     cudaGetDevice(&prevDev);
-
-    // Group by device to minimize cudaSetDevice calls and synchronize once per device.
-    // cudaDeviceSynchronize() is CRITICAL: NV12 kernels run on cv::cuda::Stream
-    // (not the default stream).  cudaFree on stream 0 doesn't wait for other
-    // streams, so without this sync, cudaFree can free a buffer while a kernel
-    // on another stream is still reading from it → cudaErrorIllegalAddress (700)
-    // which permanently corrupts the CUDA context.
-    int lastSyncDev = -1;
+    if (forceAll) {
+        // Final cleanup — sync all devices first
+        cudaDeviceSynchronize();
+    }
     for (auto& entry : gpuPending) {
         if (entry.ptr) {
             if (entry.deviceIdx >= 0)
                 cudaSetDevice(entry.deviceIdx);
-            if (entry.deviceIdx != lastSyncDev) {
-                cudaDeviceSynchronize();
-                lastSyncDev = entry.deviceIdx;
-            }
-            GPU_FRAME_DBG("drainGpuPending: cudaFree(%p) dev=%d", entry.ptr, entry.deviceIdx);
             cudaError_t err = cudaFree(entry.ptr);
             if (err != cudaSuccess) {
                 GPU_FRAME_DBG("drainGpuPending: cudaFree FAILED err=%d (%s)",
@@ -179,22 +181,23 @@ inline void gpu_frame_attach(cv::Mat* mat, AVFrame* nv12, int gpuIdx, int64_t pt
 // Attach CUDA HW frame — copies NV12 from NVDEC surfaces to owned GPU memory.
 // TAKES OWNERSHIP of cudaFrame AND cpuNV12 — caller must NOT av_frame_free after.
 //
-// D2D copy path: cudaMemcpy2D from NVDEC surfaces to cudaMalloc'd buffers on the
-// same GPU.  This decouples the NV12 data lifetime from the NVDEC decoder, so
-// player->close() can safely destroy the decoder at any time without invalidating
-// pointers that inference engines may be reading.  The NVDEC surface is freed
-// immediately (av_frame_free), returning it to the decoder's surface pool.
+// D2D copy: SYNCHRONOUS cudaMemcpy2D from NVDEC surfaces into a GpuNV12Slot
+// buffer from the global pool.  Data is valid immediately after the call returns.
+// AVFrame is freed immediately (NVDEC surface returned to decoder pool).
 //
-// The owned GPU pointers are stored as both yPlane/uvPlane (for zero-copy reads)
-// and gpuCacheY/gpuCacheUV (for lifecycle management / cudaFree on cleanup).
+// The slot is protected by a 200ms cooldown after the GpuFrameData's refcount
+// drops to 0, guaranteeing that all in-flight GPU kernels (which complete in
+// <10ms) have finished reading from the buffer before it can be reused.
 //
-// VRAM budget: if the global GPU cache budget is exceeded, falls back to CPU-only
-// NV12 snapshot (no zero-copy, but safe).
+// slot: pre-acquired from GpuNV12SlotPool::instance().acquire().
+//       If non-null, D2D copy goes into slot buffers (no per-frame alloc).
+//       If nullptr, falls back to per-frame cudaMallocPitch (legacy path).
 //
 // Fallback: cpuYPlane/cpuUvPlane hold CPU-side NV12 snapshot for cross-GPU
 // inference (when decode GPU != inference GPU).
 inline void gpu_frame_attach_cuda(cv::Mat* mat, AVFrame* cudaFrame, int gpuIdx, int64_t pts,
-                                   AVFrame* cpuNV12 = nullptr) {
+                                   AVFrame* cpuNV12 = nullptr,
+                                   GpuNV12Slot* slot = nullptr) {
     if (!mat || !cudaFrame) {
         GPU_FRAME_DBG("attach_cuda: SKIP mat=%p cudaFrame=%p", (void*)mat, (void*)cudaFrame);
         return;
@@ -202,9 +205,9 @@ inline void gpu_frame_attach_cuda(cv::Mat* mat, AVFrame* cudaFrame, int gpuIdx,
 
     const int w = cudaFrame->width;
     const int h = cudaFrame->height;
-    GPU_FRAME_DBG("attach_cuda: START mat=%p %dx%d gpu=%d nvdecY=%p nvdecUV=%p cpuNV12=%p",
+    GPU_FRAME_DBG("attach_cuda: START mat=%p %dx%d gpu=%d nvdecY=%p nvdecUV=%p slot=%p",
                   (void*)mat, w, h, gpuIdx,
-                  (void*)cudaFrame->data[0], (void*)cudaFrame->data[1], (void*)cpuNV12);
+                  (void*)cudaFrame->data[0], (void*)cudaFrame->data[1], (void*)slot);
 
     GpuFrameData data{};
     data.gpuIndex        = gpuIdx;
@@ -213,86 +216,145 @@ inline void gpu_frame_attach_cuda(cv::Mat* mat, AVFrame* cudaFrame, int gpuIdx,
     data.height          = h;
     data.pixelFormat     = 23; // AV_PIX_FMT_NV12
 
-    // Snapshot CPU NV12 for cross-GPU fallback (must do before freeing cpuNV12)
-    if (cpuNV12) {
-        anscv_gpu_ops::detail::snapshotNV12Planes(
-            cpuNV12,
-            data.cpuYPlane, data.cpuYLinesize,
-            data.cpuUvPlane, data.cpuUvLinesize,
-            data.width, data.height);
-    }
-
-    // --- D2D copy: NVDEC surface → owned GPU memory ---
-    // Estimate VRAM needed for the owned NV12 copy
-    const size_t yBytes  = static_cast<size_t>(w) * h;
-    const size_t uvBytes = static_cast<size_t>(w) * (h / 2);
-    const size_t totalBytes = yBytes + uvBytes;
+    // NOTE: CPU NV12 snapshot is DEFERRED — only taken if pool D2D fails.
+    // For 4K frames, the snapshot is ~12MB malloc+memcpy+free per frame.
+    // Skipping it when the pool path succeeds (the common case) eliminates
+    // ~276MB/s of CPU heap allocation churn that causes process-level stalls.
 
+    // --- D2D copy: NVDEC surface → GPU buffer ---
     bool d2dOk = false;
-    if (ANSGpuFrameRegistry::instance().canAllocateGpuCache(totalBytes)) {
+
+    if (slot && slot->bufY && slot->bufUV && slot->pitchY > 0 && slot->pitchUV > 0) {
+        // --- Global pool path: D2D copy on per-slot non-blocking stream ---
+        // CRITICAL: Using the NULL stream (cudaMemcpy2D without stream) causes
+        // 1-2 second stalls on WDDM because it implicitly synchronizes with
+        // ALL other streams before executing.  By using cudaMemcpy2DAsync on
+        // the slot's own non-blocking stream + cudaStreamSynchronize, we:
+        //   1. Submit the copy immediately (no wait for inference kernels)
+        //   2. Wait ONLY for this copy to finish (~0.3ms 1080p, ~1.2ms 4K)
+        //   3. Data is valid after sync — av_frame_free is safe
         int prevDev = -1;
         cudaGetDevice(&prevDev);
-        if (gpuIdx >= 0)
-            cudaSetDevice(gpuIdx);
+        if (gpuIdx >= 0) cudaSetDevice(gpuIdx);
 
-        void*  ownedY  = nullptr;
-        void*  ownedUV = nullptr;
-        size_t yPitch  = 0;
-        size_t uvPitch = 0;
-
-        cudaError_t e1 = cudaMallocPitch(&ownedY,  &yPitch,  w, h);
-        cudaError_t e2 = cudaMallocPitch(&ownedUV, &uvPitch, w, h / 2);
-
-        if (e1 == cudaSuccess && e2 == cudaSuccess) {
-            cudaError_t e3 = cudaMemcpy2D(ownedY,  yPitch,
-                                           cudaFrame->data[0], cudaFrame->linesize[0],
-                                           w, h, cudaMemcpyDeviceToDevice);
-            cudaError_t e4 = cudaMemcpy2D(ownedUV, uvPitch,
-                                           cudaFrame->data[1], cudaFrame->linesize[1],
-                                           w, h / 2, cudaMemcpyDeviceToDevice);
+        cudaStream_t copyStream = static_cast<cudaStream_t>(slot->copyStream);
+        cudaError_t e3, e4;
 
+        if (copyStream) {
+            e3 = cudaMemcpy2DAsync(slot->bufY,  slot->pitchY,
+                                    cudaFrame->data[0], cudaFrame->linesize[0],
+                                    w, h, cudaMemcpyDeviceToDevice, copyStream);
+            e4 = cudaMemcpy2DAsync(slot->bufUV, slot->pitchUV,
+                                    cudaFrame->data[1], cudaFrame->linesize[1],
+                                    w, h / 2, cudaMemcpyDeviceToDevice, copyStream);
             if (e3 == cudaSuccess && e4 == cudaSuccess) {
-                // Store owned GPU pointers as primary NV12 source
-                data.isCudaDevicePtr = true;
-                data.yPlane          = static_cast<uint8_t*>(ownedY);
-                data.uvPlane         = static_cast<uint8_t*>(ownedUV);
-                data.yLinesize       = static_cast<int>(yPitch);
-                data.uvLinesize      = static_cast<int>(uvPitch);
-
-                // Track in gpuCache for lifecycle management (cudaFree on cleanup)
-                data.gpuCacheY          = ownedY;
-                data.gpuCacheUV         = ownedUV;
-                data.gpuCacheYPitch     = yPitch;
-                data.gpuCacheUVPitch    = uvPitch;
-                data.gpuCacheDeviceIdx  = gpuIdx;
-                data.gpuCacheValid      = true;
-                data.gpuCacheBytes      = yPitch * h + uvPitch * (h / 2);
-
-                ANSGpuFrameRegistry::instance().onGpuCacheCreated(data.gpuCacheBytes);
-                d2dOk = true;
-                GPU_FRAME_DBG("attach_cuda: D2D OK ownedY=%p ownedUV=%p yPitch=%zu uvPitch=%zu bytes=%zu",
-                              ownedY, ownedUV, yPitch, uvPitch, data.gpuCacheBytes);
-            } else {
-                // D2D copy failed — free allocated memory and fall back
-                GPU_FRAME_DBG("attach_cuda: D2D COPY FAILED e3=%d e4=%d — fallback CPU",
-                              (int)e3, (int)e4);
-                cudaFree(ownedY);
-                cudaFree(ownedUV);
+                // Wait ONLY for this stream's 2 copies (~0.3-1.2ms).
+                // Does NOT wait for inference kernels on other streams.
+                cudaStreamSynchronize(copyStream);
             }
         } else {
-            // Allocation failed — free any partial allocation and fall back
-            GPU_FRAME_DBG("attach_cuda: cudaMallocPitch FAILED e1=%d e2=%d — fallback CPU",
-                          (int)e1, (int)e2);
-            if (e1 == cudaSuccess) cudaFree(ownedY);
-            if (e2 == cudaSuccess) cudaFree(ownedUV);
+            // Fallback if stream creation failed — NULL stream (may stall)
+            e3 = cudaMemcpy2D(slot->bufY,  slot->pitchY,
+                               cudaFrame->data[0], cudaFrame->linesize[0],
+                               w, h, cudaMemcpyDeviceToDevice);
+            e4 = cudaMemcpy2D(slot->bufUV, slot->pitchUV,
+                               cudaFrame->data[1], cudaFrame->linesize[1],
+                               w, h / 2, cudaMemcpyDeviceToDevice);
         }
 
-        if (prevDev >= 0)
-            cudaSetDevice(prevDev);
+        if (prevDev >= 0) cudaSetDevice(prevDev);
+
+        if (e3 == cudaSuccess && e4 == cudaSuccess) {
+            data.isCudaDevicePtr = true;
+            data.yPlane          = static_cast<uint8_t*>(slot->bufY);
+            data.uvPlane         = static_cast<uint8_t*>(slot->bufUV);
+            data.yLinesize       = static_cast<int>(slot->pitchY);
+            data.uvLinesize      = static_cast<int>(slot->pitchUV);
+            data.poolSlot        = slot;  // Track for deferred release
+            // gpuCacheY/UV stay nullptr — global pool owns the buffers
+            d2dOk = true;
+            GPU_FRAME_DBG("attach_cuda: D2D OK (global pool) Y=%p UV=%p yPitch=%zu uvPitch=%zu",
+                          slot->bufY, slot->bufUV, slot->pitchY, slot->pitchUV);
+        } else {
+            GPU_FRAME_DBG("attach_cuda: D2D COPY FAILED (pool) e3=%d e4=%d — fallback",
+                          (int)e3, (int)e4);
+            // Release slot back to pool on failure (immediate, no cooldown needed)
+            slot->state.store(GpuNV12Slot::STATE_FREE, std::memory_order_release);
+        }
+    }
+
+    if (!d2dOk && !slot) {
+        // --- Legacy path: per-frame cudaMallocPitch (for modules without pool) ---
+        const size_t yBytes  = static_cast<size_t>(w) * h;
+        const size_t uvBytes = static_cast<size_t>(w) * (h / 2);
+        const size_t totalBytes = yBytes + uvBytes;
+
+        if (ANSGpuFrameRegistry::instance().canAllocateGpuCache(totalBytes)) {
+            int prevDev = -1;
+            cudaGetDevice(&prevDev);
+            if (gpuIdx >= 0) cudaSetDevice(gpuIdx);
+
+            void*  ownedY  = nullptr;
+            void*  ownedUV = nullptr;
+            size_t yPitch  = 0;
+            size_t uvPitch = 0;
+
+            cudaError_t e1 = cudaMallocPitch(&ownedY,  &yPitch,  w, h);
+            cudaError_t e2 = cudaMallocPitch(&ownedUV, &uvPitch, w, h / 2);
+
+            if (e1 == cudaSuccess && e2 == cudaSuccess) {
+                cudaError_t e3 = cudaMemcpy2D(ownedY,  yPitch,
+                                               cudaFrame->data[0], cudaFrame->linesize[0],
+                                               w, h, cudaMemcpyDeviceToDevice);
+                cudaError_t e4 = cudaMemcpy2D(ownedUV, uvPitch,
+                                               cudaFrame->data[1], cudaFrame->linesize[1],
+                                               w, h / 2, cudaMemcpyDeviceToDevice);
+
+                if (e3 == cudaSuccess && e4 == cudaSuccess) {
+                    data.isCudaDevicePtr = true;
+                    data.yPlane          = static_cast<uint8_t*>(ownedY);
+                    data.uvPlane         = static_cast<uint8_t*>(ownedUV);
+                    data.yLinesize       = static_cast<int>(yPitch);
+                    data.uvLinesize      = static_cast<int>(uvPitch);
+                    data.gpuCacheY          = ownedY;
+                    data.gpuCacheUV         = ownedUV;
+                    data.gpuCacheYPitch     = yPitch;
+                    data.gpuCacheUVPitch    = uvPitch;
+                    data.gpuCacheDeviceIdx  = gpuIdx;
+                    data.gpuCacheValid      = true;
+                    data.gpuCacheBytes      = yPitch * h + uvPitch * (h / 2);
+                    ANSGpuFrameRegistry::instance().onGpuCacheCreated(data.gpuCacheBytes);
+                    d2dOk = true;
+                    GPU_FRAME_DBG("attach_cuda: D2D OK ownedY=%p ownedUV=%p yPitch=%zu uvPitch=%zu bytes=%zu",
+                                  ownedY, ownedUV, yPitch, uvPitch, data.gpuCacheBytes);
+                } else {
+                    GPU_FRAME_DBG("attach_cuda: D2D COPY FAILED e3=%d e4=%d — fallback CPU",
+                                  (int)e3, (int)e4);
+                    cudaFree(ownedY);
+                    cudaFree(ownedUV);
+                }
+            } else {
+                GPU_FRAME_DBG("attach_cuda: cudaMallocPitch FAILED e1=%d e2=%d — fallback CPU",
+                              (int)e1, (int)e2);
+                if (e1 == cudaSuccess) cudaFree(ownedY);
+                if (e2 == cudaSuccess) cudaFree(ownedUV);
+            }
+
+            if (prevDev >= 0) cudaSetDevice(prevDev);
+        }
     }
 
     if (!d2dOk) {
-        // Fall back to CPU NV12 snapshot only (no zero-copy)
+        // D2D failed or no slot — take CPU NV12 snapshot now (before freeing cpuNV12).
+        // This is the ONLY path where the CPU snapshot is needed.  Skipping it
+        // on the pool-success path avoids ~12MB malloc+memcpy+free per 4K frame.
+        if (cpuNV12) {
+            anscv_gpu_ops::detail::snapshotNV12Planes(
+                cpuNV12,
+                data.cpuYPlane, data.cpuYLinesize,
+                data.cpuUvPlane, data.cpuUvLinesize,
+                data.width, data.height);
+        }
         GPU_FRAME_DBG("attach_cuda: FALLBACK CPU-only cpuY=%p cpuUV=%p",
                       (void*)data.cpuYPlane, (void*)data.cpuUvPlane);
         data.isCudaDevicePtr = false;
@@ -302,8 +364,8 @@ inline void gpu_frame_attach_cuda(cv::Mat* mat, AVFrame* cudaFrame, int gpuIdx,
         data.uvLinesize      = data.cpuUvLinesize;
     }
 
-    // Release AVFrames immediately — NVDEC surfaces returned to pool.
-    // No longer stored in GpuFrameData (owned GPU copy is independent).
+    // Free AVFrames immediately — synchronous D2D copy has completed,
+    // so NVDEC surfaces can be returned to the decoder's surface pool.
     GPU_FRAME_DBG("attach_cuda: freeing AVFrames cudaFrame=%p cpuNV12=%p",
                   (void*)cudaFrame, (void*)cpuNV12);
     av_frame_free(&cudaFrame);
@@ -311,9 +373,9 @@ inline void gpu_frame_attach_cuda(cv::Mat* mat, AVFrame* cudaFrame, int gpuIdx,
     data.avframe    = nullptr;
     data.cpuAvframe = nullptr;
 
-    GPU_FRAME_DBG("attach_cuda: FINAL yPlane=%p uvPlane=%p isCuda=%d gpuCacheY=%p gpuCacheUV=%p",
+    GPU_FRAME_DBG("attach_cuda: FINAL yPlane=%p uvPlane=%p isCuda=%d poolSlot=%p",
                   (void*)data.yPlane, (void*)data.uvPlane, (int)data.isCudaDevicePtr,
-                  data.gpuCacheY, data.gpuCacheUV);
+                  (void*)data.poolSlot);
 
     void* old = ANSGpuFrameRegistry::instance().attach(mat, std::move(data));
     if (old) {
@@ -327,12 +389,10 @@ inline void gpu_frame_attach_cuda(cv::Mat* mat, AVFrame* cudaFrame, int gpuIdx,
         AVFrame* stale = static_cast<AVFrame*>(p);
         av_frame_free(&stale);
     }
-
-    // Free stale GPU device pointers
-    anscv_gpu_ops::detail::drainAndFreeGpuPending();
 }
 
-// Release entry by cv::Mat* and free any returned AVFrames + GPU pointers.
+// Release entry by cv::Mat* and free any returned AVFrames.
+// GPU device pointers are deferred to TTL eviction or explicit cleanup.
 // Safe if not in map (no-op).
 inline void gpu_frame_remove(cv::Mat* mat) {
     if (!mat) return;
@@ -347,8 +407,7 @@ inline void gpu_frame_remove(cv::Mat* mat) {
         av_frame_free(&stale);
     }
 
-    // Free any GPU device pointers that became pending
-    anscv_gpu_ops::detail::drainAndFreeGpuPending();
+    // GPU device pointers deferred — see gpu_frame_evict_stale() / Destroy()
 }
 
 // Alias for remove — used in ANSCV mutating functions to drop stale GPU data.
@@ -357,6 +416,12 @@ inline void gpu_frame_invalidate(cv::Mat* mat) {
 }
 
 // Run TTL eviction + drain pending. Call periodically from camera threads.
+// TTL eviction is throttled to every 500ms (EVICT_CHECK_INTERVAL_MS).
+// GPU buffer cleanup is safe here because:
+//   1. Only frames >3 seconds old are evicted (kernels take <10ms)
+//   2. cudaDeviceSynchronize() ensures all in-flight kernels are done
+//   3. At 500ms interval, one sync per 500ms is ~0.1ms cost (acceptable)
+//      vs per-frame sync which caused 900ms spikes
 inline void gpu_frame_evict_stale() {
     ANSGpuFrameRegistry::instance().evictStaleFrames();
 
@@ -366,6 +431,7 @@ inline void gpu_frame_evict_stale() {
         av_frame_free(&stale);
     }
 
-    // Free any GPU device pointers from evicted frames
+    // Free GPU device pointers from evicted/released frames (legacy path).
+    // Pool-backed frames (ANSRTSP) don't add to this list (gpuCacheY=nullptr).
     anscv_gpu_ops::detail::drainAndFreeGpuPending();
 }
diff --git a/modules/ANSCV/ANSRTSP.cpp b/modules/ANSCV/ANSRTSP.cpp
index c5c313f..bf726bb 100644
--- a/modules/ANSCV/ANSRTSP.cpp
+++ b/modules/ANSCV/ANSRTSP.cpp
@@ -1,6 +1,7 @@
 ﻿#include "ANSRTSP.h"
 #include "ANSMatRegistry.h"
 #include "ANSGpuFrameOps.h"
+#include "GpuNV12SlotPool.h"
 #include <memory>
 #include <format>
 #include "media_codec.h"
@@ -23,8 +24,9 @@ extern "C"
 // Note: per-instance thread safety is handled by ANSRTSPClient::_mutex
 // Mat registry thread safety is handled by anscv_mat_replace's internal registry_mutex
 
-// Debug logging — goes to both stderr AND OutputDebugString (DebugView).
+// Debug logging. Define ANSCORE_GPU_DEBUG=1 to enable verbose per-frame logging.
 #ifndef RTSP_DBG
+#if defined(ANSCORE_GPU_DEBUG) && ANSCORE_GPU_DEBUG
 #ifdef _WIN32
 #define RTSP_DBG(fmt, ...) do { \
     char _rtsp_buf[512]; \
@@ -35,6 +37,9 @@ extern "C"
 #else
 #define RTSP_DBG(fmt, ...) fprintf(stderr, fmt "\n", ##__VA_ARGS__)
 #endif
+#else
+#define RTSP_DBG(fmt, ...) ((void)0)
+#endif
 #endif
 static bool ansrtspLicenceValid = false;
 // Global once_flag to protect license checking
@@ -62,6 +67,7 @@ namespace ANSCENTER {
     ANSRTSPClient::~ANSRTSPClient() noexcept {
 		Destroy();
     }
+
     void ANSRTSPClient::Destroy() {
         // Move the player client pointer out of the lock scope, then
         // close it OUTSIDE the mutex.  close() calls cuArrayDestroy /
@@ -80,69 +86,44 @@ namespace ANSCENTER {
                 }
             }
 
-            // --- Inference guard: wait for in-flight frames to finish ---
-            // GetRTSPCVImage increments _inFlightFrames when it hands out
-            // a GPU frame; the registry decrements it when the frame is
-            // released after inference completes.  We wait here so that
-            // close() doesn't free NVDEC surfaces while TensorRT is
-            // still reading from them (the LabVIEW crash root cause).
+            // --- Inference guard: wait for in-flight D2D copies to finish ---
+            // With synchronous D2D copy, in-flight means "currently inside
+            // GetRTSPCVImage between TryIncrementInFlight and attach_cuda".
+            // This is typically <1ms, so the wait is very fast.
             int inFlight = _inFlightFrames.load(std::memory_order_acquire);
             if (inFlight > 0) {
                 _logger.LogInfo("ANSRTSPClient::Destroy",
-                    std::format("waiting for {} in-flight inference frame(s)...", inFlight),
+                    std::format("waiting for {} in-flight frame(s)...", inFlight),
                     __FILE__, __LINE__);
                 bool done = _inFlightDone.wait_for(lock, std::chrono::seconds(5), [this] {
                     return _inFlightFrames.load(std::memory_order_acquire) <= 0;
                 });
                 if (!done) {
                     _logger.LogWarn("ANSRTSPClient::Destroy",
-                        std::format("timed out waiting for in-flight frames "
-                                    "(still {} in-flight) — force-releasing GPU frames",
-                                    _inFlightFrames.load()),
+                        std::format("timed out — still {} in-flight", _inFlightFrames.load()),
                         __FILE__, __LINE__);
                 }
             }
 
-            // Force-release ALL GPU frames owned by this client BEFORE close().
-            // Unreleased clones (e.g. LabVIEW AI tasks still holding cloned
-            // cv::Mat*) keep gpuCacheY/gpuCacheUV allocated.  We must cudaFree
-            // them NOW while the CUDA context is still alive.  After close()
-            // destroys the context, cudaFree would crash.
-            int forceReleased = ANSGpuFrameRegistry::instance().forceReleaseByOwner(this);
-            if (forceReleased > 0) {
-                _logger.LogWarn("ANSRTSPClient::Destroy",
-                    std::format("force-released {} GPU frame(s) with unreleased clones", forceReleased),
-                    __FILE__, __LINE__);
-                // Drain and cudaFree the GPU buffers while CUDA context is alive
-                // Sync all GPU streams before freeing to avoid illegal access
-                cudaDeviceSynchronize();
-                auto gpuPending = ANSGpuFrameRegistry::instance().drain_gpu_pending();
-                if (!gpuPending.empty()) {
-                    RTSP_DBG("[Destroy] cudaFree %zu GPU ptrs before close()", gpuPending.size());
-                    int prevDev = -1;
-                    cudaGetDevice(&prevDev);
-                    for (auto& entry : gpuPending) {
-                        if (entry.ptr) {
-                            if (entry.deviceIdx >= 0) cudaSetDevice(entry.deviceIdx);
-                            cudaFree(entry.ptr);
-                        }
-                    }
-                    if (prevDev >= 0) cudaSetDevice(prevDev);
-                }
-                // Also drain any pending AVFrames
-                auto avPending = ANSGpuFrameRegistry::instance().drain_pending();
-                for (void* p : avPending) {
-                    AVFrame* f = static_cast<AVFrame*>(p);
-                    av_frame_free(&f);
-                }
-            }
+            // Invalidate owner callbacks so stale GpuFrameData don't try to
+            // call DecrementInFlight on this (soon-to-be-deleted) object.
+            // The GpuFrameData and their global pool slots remain alive —
+            // inference engines can safely keep reading from them.
             ANSGpuFrameRegistry::instance().invalidateOwner(this);
             _inFlightFrames.store(0, std::memory_order_release);
 
+            // NO forceReleaseByOwner — frames survive camera deletion.
+            // Pool slot buffers are global (GpuNV12SlotPool) — NOT owned
+            // by this camera.  They are recycled when inference finishes
+            // (GpuFrameData refcount → 0 → slot.inUse = false).
+            // NO cudaDeviceSynchronize — no GPU buffers to free here.
+            // NO DestroyGpuPool — per-camera pool has been removed.
+
             clientToClose = std::move(_playerClient);
         }
-        // CUDA cleanup happens here, outside the mutex — now safe.
-        // All GPU frames owned by this client have been force-freed above.
+        // close() destroys the NVDEC decoder ONLY.  Pool slot buffers
+        // (regular cudaMallocPitch allocations) are untouched — they
+        // belong to the global GpuNV12SlotPool, not the decoder.
         if (clientToClose) {
             clientToClose->close();
         }
@@ -232,66 +213,44 @@ namespace ANSCENTER {
     bool ANSRTSPClient::Reconnect() {
         // 1. Mark as not-playing under the mutex FIRST.  This makes GetImage()
         //    return the cached _pLastFrame instead of calling into the player,
-        //    preventing use-after-free when close() destroys CUDA resources.
+        //    and blocks new TryIncrementInFlight calls.
         {
             std::unique_lock<std::recursive_mutex> lock(_mutex);
             _isPlaying = false;
 
-            // --- Inference guard: wait for in-flight frames to finish ---
-            // Same guard as Destroy(): close() will free NVDEC surfaces, so
-            // we must wait for any inference engines still reading NV12 data
-            // via zero-copy CUDA device pointers.
+            // --- Inference guard: wait for in-flight D2D copies to finish ---
+            // With synchronous D2D copy, in-flight means "currently inside
+            // GetRTSPCVImage between TryIncrementInFlight and attach_cuda".
+            // This is typically <1ms, so the wait is very fast.
             int inFlight = _inFlightFrames.load(std::memory_order_acquire);
             if (inFlight > 0) {
                 _logger.LogInfo("ANSRTSPClient::Reconnect",
-                    std::format("waiting for {} in-flight inference frame(s)...", inFlight),
+                    std::format("waiting for {} in-flight frame(s)...", inFlight),
                     __FILE__, __LINE__);
                 bool done = _inFlightDone.wait_for(lock, std::chrono::seconds(5), [this] {
                     return _inFlightFrames.load(std::memory_order_acquire) <= 0;
                 });
                 if (!done) {
                     _logger.LogWarn("ANSRTSPClient::Reconnect",
-                        std::format("timed out waiting for in-flight frames "
-                                    "(still {} in-flight) — force-releasing GPU frames",
-                                    _inFlightFrames.load()),
+                        std::format("timed out — still {} in-flight", _inFlightFrames.load()),
                         __FILE__, __LINE__);
                 }
             }
 
-            // Force-release GPU frames before close() — same as Destroy().
-            int forceReleased = ANSGpuFrameRegistry::instance().forceReleaseByOwner(this);
-            if (forceReleased > 0) {
-                _logger.LogWarn("ANSRTSPClient::Reconnect",
-                    std::format("force-released {} GPU frame(s) with unreleased clones", forceReleased),
-                    __FILE__, __LINE__);
-                // Sync all GPU streams before freeing
-                cudaDeviceSynchronize();
-                auto gpuPending = ANSGpuFrameRegistry::instance().drain_gpu_pending();
-                if (!gpuPending.empty()) {
-                    int prevDev = -1;
-                    cudaGetDevice(&prevDev);
-                    for (auto& entry : gpuPending) {
-                        if (entry.ptr) {
-                            if (entry.deviceIdx >= 0) cudaSetDevice(entry.deviceIdx);
-                            cudaFree(entry.ptr);
-                        }
-                    }
-                    if (prevDev >= 0) cudaSetDevice(prevDev);
-                }
-                auto avPending = ANSGpuFrameRegistry::instance().drain_pending();
-                for (void* p : avPending) {
-                    AVFrame* f = static_cast<AVFrame*>(p);
-                    av_frame_free(&f);
-                }
-            }
+            // Invalidate owner callbacks — prevents stale DecrementInFlight
+            // calls after Reconnect re-creates the decoder.
+            // Frames and their global pool slots remain alive for inference.
             ANSGpuFrameRegistry::instance().invalidateOwner(this);
             _inFlightFrames.store(0, std::memory_order_release);
+
+            // NO forceReleaseByOwner — frames survive reconnect.
+            // NO cudaDeviceSynchronize — no GPU buffers to free.
+            // NO DestroyGpuPool — per-camera pool has been removed.
         }
 
-        // 2. close() does CUDA cleanup (cuArrayDestroy/cuMemFree) — run outside
-        //    _mutex to avoid deadlocking with nvcuda64 SRW lock held by inference.
-        //    Safe now because GetImage()/GetNV12Frame() won't touch the player
-        //    while _isPlaying == false, and all in-flight frames have been released.
+        // 2. close() destroys NVDEC decoder ONLY — run outside _mutex to
+        //    avoid deadlocking with nvcuda64 SRW lock held by inference.
+        //    Pool slot buffers are global and untouched.
         _logger.LogInfo("ANSRTSPClient::Reconnect",
             "calling close() — NVDEC decoder will be destroyed", __FILE__, __LINE__);
         RTSP_DBG("[Reconnect] BEFORE close() this=%p", (void*)this);
@@ -1071,6 +1030,8 @@ extern "C" __declspec(dllexport) int GetRTSPCVImage(
     }
 
     try {
+        auto t0 = std::chrono::steady_clock::now();
+
         // Get image (shallow copy - reference counted, fast)
         cv::Mat img = (*Handle)->GetImage(width, height, timeStamp);
 
@@ -1082,6 +1043,8 @@ extern "C" __declspec(dllexport) int GetRTSPCVImage(
         // Thread-safe Mat pointer swap (anscv_mat_replace has its own internal lock)
         anscv_mat_replace(image, std::move(img));
 
+        auto t1 = std::chrono::steady_clock::now();
+
         // Attach NV12 frame for GPU fast-path inference (side-table registry)
         // attach() takes ownership — do NOT av_frame_free here
         //
@@ -1101,7 +1064,11 @@ extern "C" __declspec(dllexport) int GetRTSPCVImage(
                         cudaHW->width, cudaHW->height,
                         (void*)cudaHW->data[0], (void*)cudaHW->data[1]);
                 AVFrame* cpuNV12 = (*Handle)->GetNV12Frame();
-                gpu_frame_attach_cuda(*image, cudaHW, gpuIdx, timeStamp, cpuNV12);
+
+                // Acquire a slot from the global pool — survives camera Destroy.
+                GpuNV12Slot* slot = GpuNV12SlotPool::instance().acquire(
+                    gpuIdx, cudaHW->width, cudaHW->height);
+                gpu_frame_attach_cuda(*image, cudaHW, gpuIdx, timeStamp, cpuNV12, slot);
             } else {
                 // HW decode not active — try CPU NV12
                 AVFrame* nv12 = (*Handle)->GetNV12Frame();
@@ -1114,11 +1081,11 @@ extern "C" __declspec(dllexport) int GetRTSPCVImage(
             // TryIncrementInFlight already incremented; DecrementInFlight fires
             // when the last clone of this frame is released after inference.
             auto* gpuData = ANSGpuFrameRegistry::instance().lookup(*image);
-            RTSP_DBG("[GetRTSPCVImage] after attach: gpuData=%p yPlane=%p isCuda=%d gpuCacheY=%p",
+            RTSP_DBG("[GetRTSPCVImage] after attach: gpuData=%p yPlane=%p isCuda=%d poolSlot=%p",
                     (void*)gpuData,
                     gpuData ? (void*)gpuData->yPlane : nullptr,
                     gpuData ? (int)gpuData->isCudaDevicePtr : -1,
-                    gpuData ? gpuData->gpuCacheY : nullptr);
+                    gpuData ? (void*)gpuData->poolSlot : nullptr);
             if (gpuData) {
                 gpuData->ownerClient = *Handle;
                 gpuData->onReleaseFn = [](void* client) {
@@ -1136,6 +1103,20 @@ extern "C" __declspec(dllexport) int GetRTSPCVImage(
             RTSP_DBG("[GetRTSPCVImage] SKIP CUDA — player not playing (reconnecting?)");
         }
 
+        // Lightweight timing via spdlog (no OutputDebugString).
+        // Logs only when the frame grab + D2D exceeds 50ms — helps diagnose stalls
+        // without the overhead of per-frame debug logging.
+        auto t2 = std::chrono::steady_clock::now();
+        double getImageMs = std::chrono::duration<double, std::milli>(t1 - t0).count();
+        double cudaMs     = std::chrono::duration<double, std::milli>(t2 - t1).count();
+        double totalMs    = getImageMs + cudaMs;
+        if (totalMs > 50.0) {
+            (*Handle)->_logger.LogWarn("GetRTSPCVImage",
+                std::format("SLOW FRAME: total={:.1f}ms (getImage={:.1f}ms cuda={:.1f}ms) {}x{}",
+                    totalMs, getImageMs, cudaMs, width, height),
+                __FILE__, __LINE__);
+        }
+
         return 1;  // Success
     }
     catch (const cv::Exception& e) {
diff --git a/modules/ANSCV/ANSRTSP.h b/modules/ANSCV/ANSRTSP.h
index cd530dc..904060a 100644
--- a/modules/ANSCV/ANSRTSP.h
+++ b/modules/ANSCV/ANSRTSP.h
@@ -40,7 +40,7 @@ namespace ANSCENTER
 		bool	 _isPlaying;
 		std::recursive_mutex		_mutex;
 
-		// --- Per-client inference guard ---
+			// --- Per-client inference guard ---
 		// Tracks how many GPU frames from this client are currently in-flight
 		// (grabbed by GetRTSPCVImage but not yet released after inference).
 		// Destroy() waits for this to reach 0 before freeing NVDEC surfaces,
diff --git a/modules/ANSCV/GpuNV12SlotPool.cpp b/modules/ANSCV/GpuNV12SlotPool.cpp
new file mode 100644
index 0000000..b15b028
--- /dev/null
+++ b/modules/ANSCV/GpuNV12SlotPool.cpp
@@ -0,0 +1,107 @@
+// GpuNV12SlotPool.cpp — Process-wide singleton, compiled into ANSCV.dll.
+//
+// ANSCV.dll owns the canonical GpuNV12SlotPool instance.  Other DLLs
+// (ANSODEngine, etc.) find it via GetProcAddress at runtime.
+
+#define WIN32_LEAN_AND_MEAN
+#define NOMINMAX
+#include <windows.h>
+#include "GpuNV12SlotPool.h"
+
+#include <cuda_runtime.h>
+
+// ANSCV.dll owns the process-wide singleton.
+GpuNV12SlotPool* GpuNV12SlotPool::resolveProcessWide() {
+    static GpuNV12SlotPool pool;
+    return &pool;
+}
+
+// Exported so other DLLs (ANSODEngine, etc.) can find this instance at runtime.
+extern "C" __declspec(dllexport)
+GpuNV12SlotPool* GpuNV12SlotPool_GetInstance() {
+    return &GpuNV12SlotPool::instance();
+}
+
+// Transition all COOLING slots past the cooldown threshold to FREE.
+void GpuNV12SlotPool::drainCooledSlots_locked() {
+    auto now = std::chrono::steady_clock::now();
+    auto threshold = std::chrono::milliseconds(SLOT_COOLDOWN_MS);
+    for (auto& s : m_slots) {
+        if (s->state.load(std::memory_order_acquire) == GpuNV12Slot::STATE_COOLING) {
+            if (now - s->cooldownStart >= threshold) {
+                s->state.store(GpuNV12Slot::STATE_FREE, std::memory_order_release);
+            }
+        }
+    }
+}
+
+// Acquire a free slot matching (gpuIdx, w, h), or allocate a new one.
+GpuNV12Slot* GpuNV12SlotPool::acquire(int gpuIdx, int w, int h) {
+    std::lock_guard<std::mutex> lock(m_mutex);
+
+    // 1. Drain cooled-down slots to make them available
+    drainCooledSlots_locked();
+
+    // 2. Try to find an existing FREE slot that matches the resolution
+    for (auto& s : m_slots) {
+        if (s->state.load(std::memory_order_acquire) == GpuNV12Slot::STATE_FREE &&
+            s->gpuIdx == gpuIdx && s->width == w && s->height == h) {
+            s->state.store(GpuNV12Slot::STATE_ACTIVE, std::memory_order_release);
+            NV12POOL_DBG("acquire: reuse slot Y=%p UV=%p %dx%d gpu=%d (total=%zu)",
+                         s->bufY, s->bufUV, w, h, gpuIdx, m_slots.size());
+            return s.get();
+        }
+    }
+
+    // 3. No matching free slot — allocate a new one if under the limit
+    if (static_cast<int>(m_slots.size()) >= GPU_NV12_POOL_MAX_SLOTS) {
+        NV12POOL_DBG("acquire: POOL FULL (%zu slots) — fallback to CPU path",
+                     m_slots.size());
+        return nullptr;
+    }
+
+    // Allocate CUDA buffers on the target GPU
+    int prevDev = -1;
+    cudaGetDevice(&prevDev);
+    if (gpuIdx >= 0) cudaSetDevice(gpuIdx);
+
+    auto slot = std::make_unique<GpuNV12Slot>();
+    cudaError_t e1 = cudaMallocPitch(&slot->bufY,  &slot->pitchY,  w, h);
+    cudaError_t e2 = cudaMallocPitch(&slot->bufUV, &slot->pitchUV, w, h / 2);
+
+    // Non-blocking stream avoids NULL-stream implicit sync with inference.
+    // On WDDM, the NULL stream must wait for ALL other streams to finish
+    // before executing — this caused 1-2 second stalls when inference
+    // kernels were running.  A non-blocking stream runs independently.
+    cudaStream_t stream = nullptr;
+    cudaError_t e3 = cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
+
+    if (prevDev >= 0) cudaSetDevice(prevDev);
+
+    if (e1 != cudaSuccess || e2 != cudaSuccess) {
+        NV12POOL_DBG("acquire: cudaMallocPitch FAILED %dx%d gpu=%d e1=%d e2=%d",
+                     w, h, gpuIdx, (int)e1, (int)e2);
+        // Clean up partial allocation
+        int prev2 = -1; cudaGetDevice(&prev2);
+        if (gpuIdx >= 0) cudaSetDevice(gpuIdx);
+        if (e1 == cudaSuccess && slot->bufY) cudaFree(slot->bufY);
+        if (e2 == cudaSuccess && slot->bufUV) cudaFree(slot->bufUV);
+        if (e3 == cudaSuccess && stream) cudaStreamDestroy(stream);
+        if (prev2 >= 0) cudaSetDevice(prev2);
+        return nullptr;
+    }
+
+    slot->width  = w;
+    slot->height = h;
+    slot->gpuIdx = gpuIdx;
+    slot->copyStream = (e3 == cudaSuccess) ? stream : nullptr;
+    slot->state.store(GpuNV12Slot::STATE_ACTIVE, std::memory_order_release);
+
+    GpuNV12Slot* raw = slot.get();
+    m_slots.push_back(std::move(slot));
+
+    NV12POOL_DBG("acquire: NEW slot Y=%p UV=%p pitchY=%zu pitchUV=%zu %dx%d gpu=%d stream=%p (total=%zu)",
+                 raw->bufY, raw->bufUV, raw->pitchY, raw->pitchUV,
+                 w, h, gpuIdx, raw->copyStream, m_slots.size());
+    return raw;
+}
diff --git a/modules/ANSFR/CMakeLists.txt b/modules/ANSFR/CMakeLists.txt
index 63dc54e..129509c 100644
--- a/modules/ANSFR/CMakeLists.txt
+++ b/modules/ANSFR/CMakeLists.txt
@@ -5,6 +5,7 @@ set(ANSFR_SOURCES
     ANSFRCommon.cpp
     ANSFaceRecognizer.cpp
     ANSGpuFrameRegistry.cpp
+    GpuNV12SlotPool.cpp
     FaceDatabase.cpp
     FaceNet.cpp
     dllmain.cpp
diff --git a/modules/ANSFR/GpuNV12SlotPool.cpp b/modules/ANSFR/GpuNV12SlotPool.cpp
new file mode 100644
index 0000000..f7bf339
--- /dev/null
+++ b/modules/ANSFR/GpuNV12SlotPool.cpp
@@ -0,0 +1,23 @@
+// GpuNV12SlotPool.cpp — Cross-DLL singleton resolver for ANSODEngine.dll.
+//
+// Finds the canonical GpuNV12SlotPool instance exported by ANSCV.dll
+// via GetProcAddress.  No link dependency on ANSCV.lib needed.
+
+#define WIN32_LEAN_AND_MEAN
+#define NOMINMAX
+#include <windows.h>
+#include "GpuNV12SlotPool.h"
+
+GpuNV12SlotPool* GpuNV12SlotPool::resolveProcessWide() {
+    // ANSCV.dll is always loaded before inference starts (it provides RTSP).
+    HMODULE hMod = GetModuleHandleA("ANSCV.dll");
+    if (hMod) {
+        typedef GpuNV12SlotPool* (*GetInstanceFn)();
+        auto fn = reinterpret_cast<GetInstanceFn>(
+            GetProcAddress(hMod, "GpuNV12SlotPool_GetInstance"));
+        if (fn) return fn();
+    }
+    // Fallback: local instance (unit tests without ANSCV.dll).
+    static GpuNV12SlotPool local;
+    return &local;
+}
diff --git a/modules/ANSLPR/CMakeLists.txt b/modules/ANSLPR/CMakeLists.txt
index fbaa698..3fe33ae 100644
--- a/modules/ANSLPR/CMakeLists.txt
+++ b/modules/ANSLPR/CMakeLists.txt
@@ -5,6 +5,7 @@ set(ANSLPR_SOURCES
     ANSLPR_CPU.cpp
     ANSLPR_OD.cpp
     ANSGpuFrameRegistry.cpp
+    GpuNV12SlotPool.cpp
     dllmain.cpp
     pch.cpp
 )
diff --git a/modules/ANSLPR/GpuNV12SlotPool.cpp b/modules/ANSLPR/GpuNV12SlotPool.cpp
new file mode 100644
index 0000000..f7bf339
--- /dev/null
+++ b/modules/ANSLPR/GpuNV12SlotPool.cpp
@@ -0,0 +1,23 @@
+// GpuNV12SlotPool.cpp — Cross-DLL singleton resolver for ANSODEngine.dll.
+//
+// Finds the canonical GpuNV12SlotPool instance exported by ANSCV.dll
+// via GetProcAddress.  No link dependency on ANSCV.lib needed.
+
+#define WIN32_LEAN_AND_MEAN
+#define NOMINMAX
+#include <windows.h>
+#include "GpuNV12SlotPool.h"
+
+GpuNV12SlotPool* GpuNV12SlotPool::resolveProcessWide() {
+    // ANSCV.dll is always loaded before inference starts (it provides RTSP).
+    HMODULE hMod = GetModuleHandleA("ANSCV.dll");
+    if (hMod) {
+        typedef GpuNV12SlotPool* (*GetInstanceFn)();
+        auto fn = reinterpret_cast<GetInstanceFn>(
+            GetProcAddress(hMod, "GpuNV12SlotPool_GetInstance"));
+        if (fn) return fn();
+    }
+    // Fallback: local instance (unit tests without ANSCV.dll).
+    static GpuNV12SlotPool local;
+    return &local;
+}
diff --git a/modules/ANSOCR/GpuNV12SlotPool.cpp b/modules/ANSOCR/GpuNV12SlotPool.cpp
new file mode 100644
index 0000000..f7bf339
--- /dev/null
+++ b/modules/ANSOCR/GpuNV12SlotPool.cpp
@@ -0,0 +1,23 @@
+// GpuNV12SlotPool.cpp — Cross-DLL singleton resolver for ANSODEngine.dll.
+//
+// Finds the canonical GpuNV12SlotPool instance exported by ANSCV.dll
+// via GetProcAddress.  No link dependency on ANSCV.lib needed.
+
+#define WIN32_LEAN_AND_MEAN
+#define NOMINMAX
+#include <windows.h>
+#include "GpuNV12SlotPool.h"
+
+GpuNV12SlotPool* GpuNV12SlotPool::resolveProcessWide() {
+    // ANSCV.dll is always loaded before inference starts (it provides RTSP).
+    HMODULE hMod = GetModuleHandleA("ANSCV.dll");
+    if (hMod) {
+        typedef GpuNV12SlotPool* (*GetInstanceFn)();
+        auto fn = reinterpret_cast<GetInstanceFn>(
+            GetProcAddress(hMod, "GpuNV12SlotPool_GetInstance"));
+        if (fn) return fn();
+    }
+    // Fallback: local instance (unit tests without ANSCV.dll).
+    static GpuNV12SlotPool local;
+    return &local;
+}
diff --git a/modules/ANSODEngine/CMakeLists.txt b/modules/ANSODEngine/CMakeLists.txt
index 54e05ff..328d1f4 100644
--- a/modules/ANSODEngine/CMakeLists.txt
+++ b/modules/ANSODEngine/CMakeLists.txt
@@ -14,6 +14,7 @@ set(ANSOD_SOURCES
     ANSFaceDetectorEngine.cpp
     ANSFaceRecognizerEngine.cpp
     ANSGpuFrameRegistry.cpp
+    GpuNV12SlotPool.cpp
     ANSONNXCL.cpp
     ANSONNXOBB.cpp
     ANSONNXPOSE.cpp
diff --git a/modules/ANSODEngine/GpuNV12SlotPool.cpp b/modules/ANSODEngine/GpuNV12SlotPool.cpp
new file mode 100644
index 0000000..f7bf339
--- /dev/null
+++ b/modules/ANSODEngine/GpuNV12SlotPool.cpp
@@ -0,0 +1,23 @@
+// GpuNV12SlotPool.cpp — Cross-DLL singleton resolver for ANSODEngine.dll.
+//
+// Finds the canonical GpuNV12SlotPool instance exported by ANSCV.dll
+// via GetProcAddress.  No link dependency on ANSCV.lib needed.
+
+#define WIN32_LEAN_AND_MEAN
+#define NOMINMAX
+#include <windows.h>
+#include "GpuNV12SlotPool.h"
+
+GpuNV12SlotPool* GpuNV12SlotPool::resolveProcessWide() {
+    // ANSCV.dll is always loaded before inference starts (it provides RTSP).
+    HMODULE hMod = GetModuleHandleA("ANSCV.dll");
+    if (hMod) {
+        typedef GpuNV12SlotPool* (*GetInstanceFn)();
+        auto fn = reinterpret_cast<GetInstanceFn>(
+            GetProcAddress(hMod, "GpuNV12SlotPool_GetInstance"));
+        if (fn) return fn();
+    }
+    // Fallback: local instance (unit tests without ANSCV.dll).
+    static GpuNV12SlotPool local;
+    return &local;
+}
diff --git a/modules/ANSODEngine/NV12PreprocessHelper.cpp b/modules/ANSODEngine/NV12PreprocessHelper.cpp
index 5a5d436..5cd33a2 100644
--- a/modules/ANSODEngine/NV12PreprocessHelper.cpp
+++ b/modules/ANSODEngine/NV12PreprocessHelper.cpp
@@ -276,6 +276,7 @@ namespace ANSCENTER {
         const bool useZeroCopy = isCudaDevice && gpuMatch;
 
         // --- Debug: log pointer state before reading ---
+#if defined(ANSCORE_GPU_DEBUG) && ANSCORE_GPU_DEBUG
         {
             char _nv12_dbg[512];
             snprintf(_nv12_dbg, sizeof(_nv12_dbg),
@@ -294,6 +295,7 @@ namespace ANSCENTER {
 #endif
             fprintf(stderr, "%s", _nv12_dbg);
         }
+#endif
 
         // Effective plane pointers — for zero-copy, use CUDA device ptrs;
         // for CPU upload, use the CPU snapshot buffers.
@@ -362,7 +364,7 @@ namespace ANSCENTER {
         cv::cuda::GpuMat gpuY, gpuUV;
 
         if (useZeroCopy) {
-            // CUDA zero-copy: wrap NVDEC device pointers directly
+            // CUDA zero-copy: wrap pool buffer device pointers directly
             gpuY  = cv::cuda::GpuMat(frameH,     frameW, CV_8UC1,
                         effYPlane,  static_cast<size_t>(effYLinesize));
             gpuUV = cv::cuda::GpuMat(frameH / 2, frameW, CV_8UC1,
@@ -455,6 +457,7 @@ namespace ANSCENTER {
         gpuResized.create(inputH, inputW, CV_8UC3);
 
         cudaStream_t rawStream = cv::cuda::StreamAccessor::getStream(stream);
+#if defined(ANSCORE_GPU_DEBUG) && ANSCORE_GPU_DEBUG
         {
             char _nv12_dbg2[256];
             snprintf(_nv12_dbg2, sizeof(_nv12_dbg2),
@@ -467,6 +470,7 @@ namespace ANSCENTER {
 #endif
             fprintf(stderr, "%s", _nv12_dbg2);
         }
+#endif
         launcher(gpuY, gpuUV, gpuResized, frameW, frameH, inputW, inputH, rawStream);
 
         stream.waitForCompletion();