162 lines
6.2 KiB
C
162 lines
6.2 KiB
C
|
|
#pragma once
|
|||
|
|
// GpuNV12SlotPool.h — Process-wide GPU NV12 buffer pool.
|
|||
|
|
//
|
|||
|
|
// Provides pre-allocated CUDA buffer slots (Y + UV planes) that are shared
|
|||
|
|
// across all RTSP camera instances. Slots are acquired per-frame by
|
|||
|
|
// GetRTSPCVImage and released back to the pool when the GpuFrameData's
|
|||
|
|
// refcount drops to 0 in freeOwnedBuffers_locked().
|
|||
|
|
//
|
|||
|
|
// KEY DESIGN: Slots are NEVER freed when a camera is destroyed — they are
|
|||
|
|
// recycled. This decouples GPU buffer lifetime from camera lifetime, so
|
|||
|
|
// inference engines can safely read NV12 data even after the camera object
|
|||
|
|
// that produced it has been deleted and recreated (the LabVIEW reconnect
|
|||
|
|
// pattern: ReleaseHandle → Destroy → delete → CreateHandle).
|
|||
|
|
//
|
|||
|
|
// TIME-DELAYED RELEASE: When a GpuFrameData's refcount drops to 0, the
|
|||
|
|
// slot is NOT immediately available. It enters a "cooling" state for
|
|||
|
|
// SLOT_COOLDOWN_MS (50ms) to guarantee that any in-flight GPU kernels
|
|||
|
|
// (launched asynchronously by inference engines) have completed reading
|
|||
|
|
// from the buffer. CUDA kernels typically complete in <10ms, so 50ms
|
|||
|
|
// provides a 5x safety margin. The cooldown is kept short to minimize
|
|||
|
|
// the number of slots in COOLING, which prevents POOL FULL events.
|
|||
|
|
// POOL FULL triggers per-frame cudaMalloc/cudaFree, which holds the
|
|||
|
|
// nvcuda64 SRW lock and causes cascading stalls on other cameras'
|
|||
|
|
// cudaMemcpy2D operations.
|
|||
|
|
//
|
|||
|
|
// Thread-safe: acquire() locks internally, deferRelease() is lock-free.
|
|||
|
|
//
|
|||
|
|
// Cross-DLL: uses the same resolveProcessWide() singleton pattern as
|
|||
|
|
// ANSGpuFrameRegistry. ANSCV.dll owns the canonical instance; other DLLs
|
|||
|
|
// find it via GetProcAddress("GpuNV12SlotPool_GetInstance").
|
|||
|
|
|
|||
|
|
#include <vector>
|
|||
|
|
#include <memory>
|
|||
|
|
#include <mutex>
|
|||
|
|
#include <atomic>
|
|||
|
|
#include <cstdint>
|
|||
|
|
#include <cstdio>
|
|||
|
|
#include <chrono>
|
|||
|
|
|
|||
|
|
#ifdef _WIN32
|
|||
|
|
#include <windows.h>
|
|||
|
|
#endif
|
|||
|
|
|
|||
|
|
// Safety constants.
// NOTE: `inline constexpr` (C++17) gives each constant a single definition
// shared by every translation unit that includes this header, instead of
// one internal-linkage copy per TU as `static constexpr` would — avoids
// ODR hazards if these are referenced from inline functions.
inline constexpr int GPU_NV12_POOL_MAX_SLOTS = 64;   // Hard cap on pooled slots
inline constexpr int SLOT_COOLDOWN_MS = 50;          // Time after CPU release before slot reuse
                                                     // GPU kernels complete in <10ms; 50ms = 5× margin
|
|||
|
|
|
|||
|
|
// Debug logging for pool operations.
// Define ANSCORE_GPU_DEBUG=1 to enable verbose per-frame GPU logging.
// In production, these are silent to avoid OutputDebugString/fprintf
// lock contention (measured: 500-2000 calls/sec causes process stalls).
// Guarded by #ifndef so a build system / including TU may pre-define
// its own NV12POOL_DBG without conflict.
#ifndef NV12POOL_DBG
#if defined(ANSCORE_GPU_DEBUG) && ANSCORE_GPU_DEBUG
#ifdef _WIN32
// Windows debug build: emit to both the debugger (OutputDebugStringA)
// and stderr, so logs are visible with or without an attached debugger.
#define NV12POOL_DBG(fmt, ...) do { \
    char _p_buf[512]; \
    snprintf(_p_buf, sizeof(_p_buf), "[NV12Pool] " fmt "\n", ##__VA_ARGS__); \
    OutputDebugStringA(_p_buf); \
    fprintf(stderr, "%s", _p_buf); \
    } while(0)
#else
// Non-Windows debug build: stderr only.
#define NV12POOL_DBG(fmt, ...) fprintf(stderr, "[NV12Pool] " fmt "\n", ##__VA_ARGS__)
#endif
#else
// Production: compiles to a no-op expression; arguments are not evaluated.
#define NV12POOL_DBG(fmt, ...) ((void)0)
#endif
#endif
|
|||
|
|
|
|||
|
|
// One pooled GPU NV12 buffer: a Y plane and an interleaved UV plane,
// plus the lifecycle state machine described in the file header.
// Passive aggregate — field layout is the cross-DLL contract; do not reorder.
struct GpuNV12Slot {
    void* bufY = nullptr;   // cudaMallocPitch'd Y plane
    void* bufUV = nullptr;  // cudaMallocPitch'd UV plane
    size_t pitchY = 0;      // Row pitch (bytes) of the Y plane
    size_t pitchUV = 0;     // Row pitch (bytes) of the UV plane
    int width = 0;          // Resolution this slot was allocated for
    int height = 0;
    int gpuIdx = -1;        // GPU device index

    // Slot lifecycle state:
    //   FREE    (0) = available for acquire()
    //   ACTIVE  (1) = owned by a GpuFrameData (D2D copy + inference reading)
    //   COOLING (2) = CPU released but GPU kernel may still be reading;
    //                 becomes FREE after SLOT_COOLDOWN_MS elapses.
    static constexpr int STATE_FREE = 0;
    static constexpr int STATE_ACTIVE = 1;
    static constexpr int STATE_COOLING = 2;
    std::atomic<int> state{STATE_FREE};

    // Timestamp when the slot entered COOLING state.
    // Only meaningful when state == STATE_COOLING. Written in
    // deferRelease() BEFORE the release-store of state, so readers that
    // acquire-load state == COOLING see a fully written timestamp.
    std::chrono::steady_clock::time_point cooldownStart;

    // Per-slot CUDA stream for D2D copy (non-blocking).
    // CRITICAL: cudaMemcpy2D (no stream arg) uses the NULL stream, which on
    // WDDM implicitly synchronizes with ALL other streams before executing.
    // This means the D2D copy must wait for all inference kernels to finish
    // first — causing 1-2 second stalls. Using a dedicated non-blocking
    // stream avoids this implicit sync entirely.
    // Stored as void* to avoid cuda_runtime.h in the header.
    void* copyStream = nullptr; // cudaStream_t
};
|
|||
|
|
|
|||
|
|
class GpuNV12SlotPool {
|
|||
|
|
public:
|
|||
|
|
// Process-wide singleton (same pattern as ANSGpuFrameRegistry).
|
|||
|
|
static GpuNV12SlotPool& instance() {
|
|||
|
|
#ifdef _WIN32
|
|||
|
|
static GpuNV12SlotPool* s_inst = resolveProcessWide();
|
|||
|
|
return *s_inst;
|
|||
|
|
#else
|
|||
|
|
static GpuNV12SlotPool pool;
|
|||
|
|
return pool;
|
|||
|
|
#endif
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Acquire a free slot matching (gpuIdx, w, h).
|
|||
|
|
// Drains cooled-down slots first, then looks for a FREE match.
|
|||
|
|
// If none, allocates a new one (up to GPU_NV12_POOL_MAX_SLOTS).
|
|||
|
|
// Returns nullptr if pool full — caller falls back to CPU path.
|
|||
|
|
GpuNV12Slot* acquire(int gpuIdx, int w, int h);
|
|||
|
|
|
|||
|
|
// Deferred release: moves slot from ACTIVE → COOLING.
|
|||
|
|
// Called from freeOwnedBuffers_locked() when GpuFrameData refcount → 0.
|
|||
|
|
// The slot becomes FREE after SLOT_COOLDOWN_MS elapses (checked in acquire).
|
|||
|
|
static void deferRelease(GpuNV12Slot* slot) {
|
|||
|
|
if (slot) {
|
|||
|
|
slot->cooldownStart = std::chrono::steady_clock::now();
|
|||
|
|
slot->state.store(GpuNV12Slot::STATE_COOLING, std::memory_order_release);
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Number of allocated slots (for diagnostics).
|
|||
|
|
size_t slotCount() const {
|
|||
|
|
std::lock_guard<std::mutex> lock(m_mutex);
|
|||
|
|
return m_slots.size();
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Number of in-use slots (for diagnostics).
|
|||
|
|
size_t activeCount() const {
|
|||
|
|
std::lock_guard<std::mutex> lock(m_mutex);
|
|||
|
|
size_t count = 0;
|
|||
|
|
for (auto& s : m_slots) {
|
|||
|
|
if (s->state.load(std::memory_order_relaxed) != GpuNV12Slot::STATE_FREE) ++count;
|
|||
|
|
}
|
|||
|
|
return count;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
private:
|
|||
|
|
GpuNV12SlotPool() = default;
|
|||
|
|
|
|||
|
|
#ifdef _WIN32
|
|||
|
|
static GpuNV12SlotPool* resolveProcessWide();
|
|||
|
|
#endif
|
|||
|
|
|
|||
|
|
// Transition all COOLING slots that have exceeded SLOT_COOLDOWN_MS to FREE.
|
|||
|
|
// Called at the start of acquire() under the lock.
|
|||
|
|
void drainCooledSlots_locked();
|
|||
|
|
|
|||
|
|
mutable std::mutex m_mutex;
|
|||
|
|
std::vector<std::unique_ptr<GpuNV12Slot>> m_slots;
|
|||
|
|
};
|