#pragma once
// GpuNV12SlotPool.h — Process-wide GPU NV12 buffer pool.
//
// Provides pre-allocated CUDA buffer slots (Y + UV planes) that are shared
// across all RTSP camera instances. Slots are acquired per-frame by
// GetRTSPCVImage and released back to the pool when the GpuFrameData's
// refcount drops to 0 in freeOwnedBuffers_locked().
//
// KEY DESIGN: Slots are NEVER freed when a camera is destroyed — they are
// recycled. This decouples GPU buffer lifetime from camera lifetime, so
// inference engines can safely read NV12 data even after the camera object
// that produced it has been deleted and recreated (the LabVIEW reconnect
// pattern: ReleaseHandle → Destroy → delete → CreateHandle).
//
// TIME-DELAYED RELEASE: When a GpuFrameData's refcount drops to 0, the
// slot is NOT immediately available. It enters a "cooling" state for
// SLOT_COOLDOWN_MS (50ms) to guarantee that any in-flight GPU kernels
// (launched asynchronously by inference engines) have completed reading
// from the buffer. CUDA kernels typically complete in <10ms, so 50ms
// provides a 5x safety margin. The cooldown is kept short to minimize
// the number of slots in COOLING, which prevents POOL FULL events.
// POOL FULL triggers per-frame cudaMalloc/cudaFree, which holds the
// nvcuda64 SRW lock and causes cascading stalls on other cameras'
// cudaMemcpy2D operations.
//
// Thread-safe: acquire() locks internally, deferRelease() is lock-free.
//
// Cross-DLL: uses the same resolveProcessWide() singleton pattern as
// ANSGpuFrameRegistry. ANSCV.dll owns the canonical instance; other DLLs
// find it via GetProcAddress("GpuNV12SlotPool_GetInstance").

#include <atomic>   // GpuNV12Slot::state
#include <chrono>   // cooldown timestamps (steady_clock)
#include <cstddef>  // size_t
#include <cstdio>   // snprintf/fprintf in NV12POOL_DBG
#include <memory>   // std::unique_ptr slot ownership
#include <mutex>    // pool mutex
#include <vector>   // slot storage

#ifdef _WIN32
#include <windows.h>  // OutputDebugStringA in NV12POOL_DBG
#endif

// Safety constants
static constexpr int GPU_NV12_POOL_MAX_SLOTS = 64;
static constexpr int SLOT_COOLDOWN_MS = 50;  // Time after CPU release before slot reuse
                                             // GPU kernels complete in <10ms; 50ms = 5× margin

// Debug logging for pool operations.
// Define ANSCORE_GPU_DEBUG=1 to enable verbose per-frame GPU logging.
// In production, these are silent to avoid OutputDebugString/fprintf
// lock contention (measured: 500-2000 calls/sec causes process stalls).
#ifndef NV12POOL_DBG
#if defined(ANSCORE_GPU_DEBUG) && ANSCORE_GPU_DEBUG
#ifdef _WIN32
#define NV12POOL_DBG(fmt, ...) do { \
        char _p_buf[512]; \
        snprintf(_p_buf, sizeof(_p_buf), "[NV12Pool] " fmt "\n", ##__VA_ARGS__); \
        OutputDebugStringA(_p_buf); \
        fprintf(stderr, "%s", _p_buf); \
    } while(0)
#else
#define NV12POOL_DBG(fmt, ...) fprintf(stderr, "[NV12Pool] " fmt "\n", ##__VA_ARGS__)
#endif
#else
#define NV12POOL_DBG(fmt, ...) ((void)0)
#endif
#endif

// One pre-allocated NV12 frame buffer (pitched Y plane + pitched UV plane)
// plus the bookkeeping needed to recycle it safely across camera lifetimes.
struct GpuNV12Slot {
    void*  bufY    = nullptr;  // cudaMallocPitch'd Y plane
    void*  bufUV   = nullptr;  // cudaMallocPitch'd UV plane
    size_t pitchY  = 0;
    size_t pitchUV = 0;
    int    width   = 0;        // Resolution this slot was allocated for
    int    height  = 0;
    int    gpuIdx  = -1;       // GPU device index

    // Slot lifecycle state:
    //   FREE    (0) = available for acquire()
    //   ACTIVE  (1) = owned by a GpuFrameData (D2D copy + inference reading)
    //   COOLING (2) = CPU released but GPU kernel may still be reading;
    //                 becomes FREE after SLOT_COOLDOWN_MS elapses.
    static constexpr int STATE_FREE    = 0;
    static constexpr int STATE_ACTIVE  = 1;
    static constexpr int STATE_COOLING = 2;
    std::atomic<int> state{STATE_FREE};

    // Timestamp when the slot entered COOLING state.
    // Only meaningful when state == STATE_COOLING (the release-store of
    // `state` in deferRelease() publishes this write to readers that
    // acquire-load the state).
    std::chrono::steady_clock::time_point cooldownStart;

    // Per-slot CUDA stream for D2D copy (non-blocking).
    // CRITICAL: cudaMemcpy2D (no stream arg) uses the NULL stream, which on
    // WDDM implicitly synchronizes with ALL other streams before executing.
    // This means the D2D copy must wait for all inference kernels to finish
    // first — causing 1-2 second stalls. Using a dedicated non-blocking
    // stream avoids this implicit sync entirely.
    // Stored as void* to avoid cuda_runtime.h in the header.
    void* copyStream = nullptr;  // cudaStream_t
};

// Process-wide pool of GpuNV12Slot. See the file header for the lifecycle
// and cross-DLL singleton rationale.
class GpuNV12SlotPool {
public:
    // Process-wide singleton (same pattern as ANSGpuFrameRegistry).
    static GpuNV12SlotPool& instance() {
#ifdef _WIN32
        static GpuNV12SlotPool* s_inst = resolveProcessWide();
        return *s_inst;
#else
        static GpuNV12SlotPool pool;
        return pool;
#endif
    }

    // Acquire a free slot matching (gpuIdx, w, h).
    // Drains cooled-down slots first, then looks for a FREE match.
    // If none, allocates a new one (up to GPU_NV12_POOL_MAX_SLOTS).
    // Returns nullptr if pool full — caller falls back to CPU path.
    GpuNV12Slot* acquire(int gpuIdx, int w, int h);

    // Deferred release: moves slot from ACTIVE → COOLING.
    // Called from freeOwnedBuffers_locked() when GpuFrameData refcount → 0.
    // The slot becomes FREE after SLOT_COOLDOWN_MS elapses (checked in acquire).
    // Lock-free; the release-store on `state` publishes cooldownStart.
    static void deferRelease(GpuNV12Slot* slot) {
        if (slot) {
            slot->cooldownStart = std::chrono::steady_clock::now();
            slot->state.store(GpuNV12Slot::STATE_COOLING, std::memory_order_release);
        }
    }

    // Number of allocated slots (for diagnostics).
    size_t slotCount() const {
        std::lock_guard<std::mutex> lock(m_mutex);
        return m_slots.size();
    }

    // Number of in-use slots (ACTIVE or COOLING, for diagnostics).
    size_t activeCount() const {
        std::lock_guard<std::mutex> lock(m_mutex);
        size_t count = 0;
        for (auto& s : m_slots) {
            if (s->state.load(std::memory_order_relaxed) != GpuNV12Slot::STATE_FREE)
                ++count;
        }
        return count;
    }

private:
    GpuNV12SlotPool() = default;

#ifdef _WIN32
    static GpuNV12SlotPool* resolveProcessWide();
#endif

    // Transition all COOLING slots that have exceeded SLOT_COOLDOWN_MS to FREE.
    // Called at the start of acquire() under the lock.
    void drainCooledSlots_locked();

    mutable std::mutex m_mutex;
    std::vector<std::unique_ptr<GpuNV12Slot>> m_slots;
};