#pragma once
// GpuNV12SlotPool.h — Process-wide GPU NV12 buffer pool.
//
// Provides pre-allocated CUDA buffer slots (Y + UV planes) that are shared
// across all RTSP camera instances. Slots are acquired per-frame by
// GetRTSPCVImage and released back to the pool when the GpuFrameData's
// refcount drops to 0 in freeOwnedBuffers_locked().
//
// KEY DESIGN: Slots are NEVER freed when a camera is destroyed — they are
// recycled. This decouples GPU buffer lifetime from camera lifetime, so
// inference engines can safely read NV12 data even after the camera object
// that produced it has been deleted and recreated (the LabVIEW reconnect
// pattern: ReleaseHandle → Destroy → delete → CreateHandle).
//
// TIME-DELAYED RELEASE: When a GpuFrameData's refcount drops to 0, the
// slot is NOT immediately available. It enters a "cooling" state for
// SLOT_COOLDOWN_MS (50ms) to guarantee that any in-flight GPU kernels
// (launched asynchronously by inference engines) have completed reading
// from the buffer. CUDA kernels typically complete in <10ms, so 50ms
// provides a 5x safety margin. The cooldown is kept short to minimize
// the number of slots in COOLING, which prevents POOL FULL events.
// POOL FULL triggers per-frame cudaMalloc/cudaFree, which holds the
// nvcuda64 SRW lock and causes cascading stalls on other cameras'
// cudaMemcpy2D operations.
//
// Thread-safe: acquire() locks internally, deferRelease() is lock-free.
//
// Cross-DLL: uses the same resolveProcessWide() singleton pattern as
// ANSGpuFrameRegistry. ANSCV.dll owns the canonical instance; other DLLs
// find it via GetProcAddress("GpuNV12SlotPool_GetInstance").
#include <vector>
#include <memory>
#include <mutex>
#include <atomic>
#include <cstdint>
#include <cstdio>
#include <chrono>
#ifdef _WIN32
#include <windows.h>
#endif
// Safety constants
// Hard cap on total pooled slots process-wide; acquire() returns nullptr
// once this is reached and the caller falls back to the CPU path.
// NOTE(review): 64 is presumably sized for the maximum expected number of
// concurrent cameras × in-flight frames — confirm against deployment configs.
static constexpr int GPU_NV12_POOL_MAX_SLOTS = 64;
static constexpr int SLOT_COOLDOWN_MS = 50; // Time after CPU release before slot reuse
                                            // GPU kernels complete in <10ms; 50ms = 5× margin
// Debug logging for pool operations.
// Define ANSCORE_GPU_DEBUG=1 to enable verbose per-frame GPU logging.
// In production, these are silent to avoid OutputDebugString/fprintf
// lock contention (measured: 500-2000 calls/sec causes process stalls).
// The #ifndef guard lets a build system pre-define its own NV12POOL_DBG.
#ifndef NV12POOL_DBG
#if defined(ANSCORE_GPU_DEBUG) && ANSCORE_GPU_DEBUG
#ifdef _WIN32
// Windows debug build: mirror each message to the attached debugger
// (OutputDebugStringA) and to stderr. snprintf truncates safely at 512
// bytes; the do { } while(0) wrapper makes the macro statement-safe in
// unbraced if/else. ##__VA_ARGS__ (GNU/MSVC extension) swallows the
// trailing comma when the macro is invoked with no varargs.
#define NV12POOL_DBG(fmt, ...) do { \
char _p_buf[512]; \
snprintf(_p_buf, sizeof(_p_buf), "[NV12Pool] " fmt "\n", ##__VA_ARGS__); \
OutputDebugStringA(_p_buf); \
fprintf(stderr, "%s", _p_buf); \
} while(0)
#else
// Non-Windows debug build: stderr only.
#define NV12POOL_DBG(fmt, ...) fprintf(stderr, "[NV12Pool] " fmt "\n", ##__VA_ARGS__)
#endif
#else
// Production build: expands to nothing — the invocation's arguments are
// discarded textually and never evaluated.
#define NV12POOL_DBG(fmt, ...) ((void)0)
#endif
#endif
// One pooled NV12 GPU frame buffer: a pitched Y plane plus a pitched
// interleaved UV plane, allocated for a fixed (gpuIdx, width, height).
// Plain data holder — all lifecycle transitions are driven externally by
// GpuNV12SlotPool::acquire() and GpuNV12SlotPool::deferRelease().
struct GpuNV12Slot {
    void* bufY = nullptr;   // cudaMallocPitch'd Y plane (device memory)
    void* bufUV = nullptr;  // cudaMallocPitch'd UV plane (device memory)
    size_t pitchY = 0;      // Row pitch in bytes of the Y plane
    size_t pitchUV = 0;     // Row pitch in bytes of the UV plane
    int width = 0;          // Resolution this slot was allocated for
    int height = 0;
    int gpuIdx = -1;        // GPU device index (-1 = not yet allocated)

    // Slot lifecycle state machine:
    //   FREE (0)    = available for acquire()
    //   ACTIVE (1)  = owned by a GpuFrameData (D2D copy + inference reading)
    //   COOLING (2) = CPU released but GPU kernel may still be reading;
    //                 becomes FREE after SLOT_COOLDOWN_MS elapses.
    static constexpr int STATE_FREE = 0;
    static constexpr int STATE_ACTIVE = 1;
    static constexpr int STATE_COOLING = 2;
    std::atomic<int> state{STATE_FREE};

    // Timestamp when the slot entered COOLING state.
    // Only meaningful while state == STATE_COOLING. Written (non-atomically)
    // in deferRelease() BEFORE the release-store of `state`, so a reader
    // that observes STATE_COOLING sees a coherent timestamp — assumes the
    // acquire() side loads `state` with acquire (or stronger) ordering;
    // confirm in the .cpp.
    std::chrono::steady_clock::time_point cooldownStart;

    // Per-slot CUDA stream for the D2D copy (non-blocking).
    // CRITICAL: cudaMemcpy2D (no stream arg) uses the NULL stream, which on
    // WDDM implicitly synchronizes with ALL other streams before executing.
    // This means the D2D copy must wait for all inference kernels to finish
    // first — causing 1-2 second stalls. Using a dedicated non-blocking
    // stream avoids this implicit sync entirely.
    // Stored as void* to keep cuda_runtime.h out of this header.
    void* copyStream = nullptr; // cudaStream_t
};
// Process-wide pool of pre-allocated GPU NV12 slots (see file header for
// the full design rationale: slot recycling, time-delayed release, and
// cross-DLL singleton resolution).
//
// Thread-safety: acquire()/slotCount()/activeCount() lock m_mutex;
// deferRelease() is lock-free (one release-store on the slot's state).
class GpuNV12SlotPool {
public:
    // Process-wide singleton (same pattern as ANSGpuFrameRegistry).
    // On Windows the canonical instance is resolved across DLLs via
    // resolveProcessWide(); elsewhere a function-local static suffices.
    static GpuNV12SlotPool& instance() {
#ifdef _WIN32
        static GpuNV12SlotPool* s_inst = resolveProcessWide();
        return *s_inst;
#else
        static GpuNV12SlotPool pool;
        return pool;
#endif
    }

    // Singleton: copying/moving would duplicate or tear pool state.
    // (std::mutex already makes these implicitly deleted; deleting them
    // explicitly documents intent and gives clearer diagnostics.)
    GpuNV12SlotPool(const GpuNV12SlotPool&) = delete;
    GpuNV12SlotPool& operator=(const GpuNV12SlotPool&) = delete;

    // Acquire a free slot matching (gpuIdx, w, h).
    // Drains cooled-down slots first, then looks for a FREE match.
    // If none, allocates a new one (up to GPU_NV12_POOL_MAX_SLOTS).
    // Returns nullptr if pool full — caller falls back to CPU path.
    GpuNV12Slot* acquire(int gpuIdx, int w, int h);

    // Deferred release: moves slot from ACTIVE → COOLING.
    // Called from freeOwnedBuffers_locked() when GpuFrameData refcount → 0.
    // The slot becomes FREE after SLOT_COOLDOWN_MS elapses (checked in
    // acquire()). cooldownStart is written BEFORE the release-store of
    // `state` so a reader observing COOLING sees a valid timestamp.
    // Safe to call with nullptr (no-op).
    static void deferRelease(GpuNV12Slot* slot) {
        if (slot) {
            slot->cooldownStart = std::chrono::steady_clock::now();
            slot->state.store(GpuNV12Slot::STATE_COOLING, std::memory_order_release);
        }
    }

    // Number of allocated slots (for diagnostics).
    size_t slotCount() const {
        std::lock_guard<std::mutex> lock(m_mutex);
        return m_slots.size();
    }

    // Number of in-use slots (for diagnostics). Counts everything that is
    // not immediately reusable, i.e. both ACTIVE and COOLING slots.
    size_t activeCount() const {
        std::lock_guard<std::mutex> lock(m_mutex);
        size_t count = 0;
        for (const auto& s : m_slots) {
            if (s->state.load(std::memory_order_relaxed) != GpuNV12Slot::STATE_FREE) ++count;
        }
        return count;
    }

private:
    GpuNV12SlotPool() = default;

#ifdef _WIN32
    // Resolves the canonical cross-DLL instance (implemented in the .cpp;
    // ANSCV.dll owns it, found via GetProcAddress — see file header).
    static GpuNV12SlotPool* resolveProcessWide();
#endif

    // Transition all COOLING slots that have exceeded SLOT_COOLDOWN_MS to
    // FREE. Called at the start of acquire() under m_mutex.
    void drainCooledSlots_locked();

    mutable std::mutex m_mutex;                         // guards m_slots
    std::vector<std::unique_ptr<GpuNV12Slot>> m_slots;  // grows, never shrinks (slots are recycled)
};