162 lines
6.2 KiB
C
162 lines
6.2 KiB
C
|
|
#pragma once
|
|||
|
|
// GpuNV12SlotPool.h — Process-wide GPU NV12 buffer pool.
|
|||
|
|
//
|
|||
|
|
// Provides pre-allocated CUDA buffer slots (Y + UV planes) that are shared
|
|||
|
|
// across all RTSP camera instances. Slots are acquired per-frame by
|
|||
|
|
// GetRTSPCVImage and released back to the pool when the GpuFrameData's
|
|||
|
|
// refcount drops to 0 in freeOwnedBuffers_locked().
|
|||
|
|
//
|
|||
|
|
// KEY DESIGN: Slots are NEVER freed when a camera is destroyed — they are
|
|||
|
|
// recycled. This decouples GPU buffer lifetime from camera lifetime, so
|
|||
|
|
// inference engines can safely read NV12 data even after the camera object
|
|||
|
|
// that produced it has been deleted and recreated (the LabVIEW reconnect
|
|||
|
|
// pattern: ReleaseHandle → Destroy → delete → CreateHandle).
|
|||
|
|
//
|
|||
|
|
// TIME-DELAYED RELEASE: When a GpuFrameData's refcount drops to 0, the
|
|||
|
|
// slot is NOT immediately available. It enters a "cooling" state for
|
|||
|
|
// SLOT_COOLDOWN_MS (50ms) to guarantee that any in-flight GPU kernels
|
|||
|
|
// (launched asynchronously by inference engines) have completed reading
|
|||
|
|
// from the buffer. CUDA kernels typically complete in <10ms, so 50ms
|
|||
|
|
// provides a 5x safety margin. The cooldown is kept short to minimize
|
|||
|
|
// the number of slots in COOLING, which prevents POOL FULL events.
|
|||
|
|
// POOL FULL triggers per-frame cudaMalloc/cudaFree, which holds the
|
|||
|
|
// nvcuda64 SRW lock and causes cascading stalls on other cameras'
|
|||
|
|
// cudaMemcpy2D operations.
|
|||
|
|
//
|
|||
|
|
// Thread-safe: acquire() locks internally, deferRelease() is lock-free.
|
|||
|
|
//
|
|||
|
|
// Cross-DLL: uses the same resolveProcessWide() singleton pattern as
|
|||
|
|
// ANSGpuFrameRegistry. ANSCV.dll owns the canonical instance; other DLLs
|
|||
|
|
// find it via GetProcAddress("GpuNV12SlotPool_GetInstance").
|
|||
|
|
|
|||
|
|
#include <vector>
|
|||
|
|
#include <memory>
|
|||
|
|
#include <mutex>
|
|||
|
|
#include <atomic>
|
|||
|
|
#include <cstdint>
|
|||
|
|
#include <cstdio>
|
|||
|
|
#include <chrono>
|
|||
|
|
|
|||
|
|
#ifdef _WIN32
|
|||
|
|
#include <windows.h>
|
|||
|
|
#endif
|
|||
|
|
|
|||
|
|
// Safety constants.
// NOTE: `inline constexpr` (C++17) gives each constant a single definition
// shared by every translation unit that includes this header, instead of
// one internal-linkage copy per TU as `static constexpr` would — avoids
// ODR hazards if these are referenced from inline functions.
inline constexpr int GPU_NV12_POOL_MAX_SLOTS = 64;   // Hard cap on pooled slots
inline constexpr int SLOT_COOLDOWN_MS = 50;          // Time after CPU release before slot reuse
                                                     // GPU kernels complete in <10ms; 50ms = 5× margin
|
|||
|
|
|
|||
|
|
// Debug logging for pool operations.
// Define ANSCORE_GPU_DEBUG=1 to enable verbose per-frame GPU logging.
// In production, these are silent to avoid OutputDebugString/fprintf
// lock contention (measured: 500-2000 calls/sec causes process stalls).
// Guarded by #ifndef so a build system / including TU may pre-define
// its own NV12POOL_DBG without conflict.
#ifndef NV12POOL_DBG
#if defined(ANSCORE_GPU_DEBUG) && ANSCORE_GPU_DEBUG
#ifdef _WIN32
// Windows debug build: emit to both the debugger (OutputDebugStringA)
// and stderr, so logs are visible with or without an attached debugger.
#define NV12POOL_DBG(fmt, ...) do { \
    char _p_buf[512]; \
    snprintf(_p_buf, sizeof(_p_buf), "[NV12Pool] " fmt "\n", ##__VA_ARGS__); \
    OutputDebugStringA(_p_buf); \
    fprintf(stderr, "%s", _p_buf); \
    } while(0)
#else
// Non-Windows debug build: stderr only.
#define NV12POOL_DBG(fmt, ...) fprintf(stderr, "[NV12Pool] " fmt "\n", ##__VA_ARGS__)
#endif
#else
// Production: compiles to a no-op expression; arguments are not evaluated.
#define NV12POOL_DBG(fmt, ...) ((void)0)
#endif
#endif
|
|||
|
|
|
|||
|
|
// One pooled GPU NV12 buffer: a Y plane and an interleaved UV plane,
// plus the lifecycle state machine described in the file header.
// Passive aggregate — field layout is the cross-DLL contract; do not reorder.
struct GpuNV12Slot {
    void* bufY = nullptr;   // cudaMallocPitch'd Y plane
    void* bufUV = nullptr;  // cudaMallocPitch'd UV plane
    size_t pitchY = 0;      // Row pitch (bytes) of the Y plane
    size_t pitchUV = 0;     // Row pitch (bytes) of the UV plane
    int width = 0;          // Resolution this slot was allocated for
    int height = 0;
    int gpuIdx = -1;        // GPU device index

    // Slot lifecycle state:
    //   FREE    (0) = available for acquire()
    //   ACTIVE  (1) = owned by a GpuFrameData (D2D copy + inference reading)
    //   COOLING (2) = CPU released but GPU kernel may still be reading;
    //                 becomes FREE after SLOT_COOLDOWN_MS elapses.
    static constexpr int STATE_FREE = 0;
    static constexpr int STATE_ACTIVE = 1;
    static constexpr int STATE_COOLING = 2;
    std::atomic<int> state{STATE_FREE};

    // Timestamp when the slot entered COOLING state.
    // Only meaningful when state == STATE_COOLING. Written in
    // deferRelease() BEFORE the release-store of state, so readers that
    // acquire-load state == COOLING see a fully written timestamp.
    std::chrono::steady_clock::time_point cooldownStart;

    // Per-slot CUDA stream for D2D copy (non-blocking).
    // CRITICAL: cudaMemcpy2D (no stream arg) uses the NULL stream, which on
    // WDDM implicitly synchronizes with ALL other streams before executing.
    // This means the D2D copy must wait for all inference kernels to finish
    // first — causing 1-2 second stalls. Using a dedicated non-blocking
    // stream avoids this implicit sync entirely.
    // Stored as void* to avoid cuda_runtime.h in the header.
    void* copyStream = nullptr; // cudaStream_t
};
|
|||
|
|
|
|||
|
|
class GpuNV12SlotPool {
|
|||
|
|
public:
|
|||
|
|
// Process-wide singleton (same pattern as ANSGpuFrameRegistry).
|
|||
|
|
static GpuNV12SlotPool& instance() {
|
|||
|
|
#ifdef _WIN32
|
|||
|
|
static GpuNV12SlotPool* s_inst = resolveProcessWide();
|
|||
|
|
return *s_inst;
|
|||
|
|
#else
|
|||
|
|
static GpuNV12SlotPool pool;
|
|||
|
|
return pool;
|
|||
|
|
#endif
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Acquire a free slot matching (gpuIdx, w, h).
|
|||
|
|
// Drains cooled-down slots first, then looks for a FREE match.
|
|||
|
|
// If none, allocates a new one (up to GPU_NV12_POOL_MAX_SLOTS).
|
|||
|
|
// Returns nullptr if pool full — caller falls back to CPU path.
|
|||
|
|
GpuNV12Slot* acquire(int gpuIdx, int w, int h);
|
|||
|
|
|
|||
|
|
// Deferred release: moves slot from ACTIVE → COOLING.
|
|||
|
|
// Called from freeOwnedBuffers_locked() when GpuFrameData refcount → 0.
|
|||
|
|
// The slot becomes FREE after SLOT_COOLDOWN_MS elapses (checked in acquire).
|
|||
|
|
static void deferRelease(GpuNV12Slot* slot) {
|
|||
|
|
if (slot) {
|
|||
|
|
slot->cooldownStart = std::chrono::steady_clock::now();
|
|||
|
|
slot->state.store(GpuNV12Slot::STATE_COOLING, std::memory_order_release);
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Number of allocated slots (for diagnostics).
|
|||
|
|
size_t slotCount() const {
|
|||
|
|
std::lock_guard<std::mutex> lock(m_mutex);
|
|||
|
|
return m_slots.size();
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Number of in-use slots (for diagnostics).
|
|||
|
|
size_t activeCount() const {
|
|||
|
|
std::lock_guard<std::mutex> lock(m_mutex);
|
|||
|
|
size_t count = 0;
|
|||
|
|
for (auto& s : m_slots) {
|
|||
|
|
if (s->state.load(std::memory_order_relaxed) != GpuNV12Slot::STATE_FREE) ++count;
|
|||
|
|
}
|
|||
|
|
return count;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
private:
|
|||
|
|
GpuNV12SlotPool() = default;
|
|||
|
|
|
|||
|
|
#ifdef _WIN32
|
|||
|
|
static GpuNV12SlotPool* resolveProcessWide();
|
|||
|
|
#endif
|
|||
|
|
|
|||
|
|
// Transition all COOLING slots that have exceeded SLOT_COOLDOWN_MS to FREE.
|
|||
|
|
// Called at the start of acquire() under the lock.
|
|||
|
|
void drainCooledSlots_locked();
|
|||
|
|
|
|||
|
|
mutable std::mutex m_mutex;
|
|||
|
|
std::vector<std::unique_ptr<GpuNV12Slot>> m_slots;
|
|||
|
|
};
|