Files
ANSCORE/modules/ANSCV/GpuNV12SlotPool.cpp
2026-04-03 15:16:26 +11:00

128 lines
4.9 KiB
C++

// GpuNV12SlotPool.cpp — Process-wide singleton, compiled into ANSCV.dll.
//
// ANSCV.dll owns the canonical GpuNV12SlotPool instance. Other DLLs
// (ANSODEngine, etc.) find it via GetProcAddress at runtime.
#define WIN32_LEAN_AND_MEAN
#define NOMINMAX
#include <windows.h>
#include "GpuNV12SlotPool.h"
#include <cuda_runtime.h>
// Canonical accessor for the process-wide pool; ANSCV.dll owns the instance.
// The pool is a function-local static, so it is constructed the first time
// this function runs and every call hands back the same address.
GpuNV12SlotPool* GpuNV12SlotPool::resolveProcessWide() {
    static GpuNV12SlotPool s_processPool;
    GpuNV12SlotPool* const canonical = &s_processPool;
    return canonical;
}
// C-linkage DLL export that returns the address of the pool obtained from
// GpuNV12SlotPool::instance(). Other DLLs (ANSODEngine, etc.) look this
// symbol up at runtime so that they share ANSCV.dll's pool.
extern "C" __declspec(dllexport)
GpuNV12SlotPool* GpuNV12SlotPool_GetInstance() {
    GpuNV12SlotPool& shared = GpuNV12SlotPool::instance();
    return &shared;
}
// Transition all COOLING slots past the cooldown threshold to FREE.
void GpuNV12SlotPool::drainCooledSlots_locked() {
auto now = std::chrono::steady_clock::now();
auto threshold = std::chrono::milliseconds(SLOT_COOLDOWN_MS);
for (auto& s : m_slots) {
if (s->state.load(std::memory_order_acquire) == GpuNV12Slot::STATE_COOLING) {
if (now - s->cooldownStart >= threshold) {
s->state.store(GpuNV12Slot::STATE_FREE, std::memory_order_release);
}
}
}
}
// Acquire a slot for a w x h NV12 frame on GPU `gpuIdx`, reusing a pooled slot
// when possible and allocating a new one otherwise.
//
// Steps (all performed while holding m_mutex):
//   1. COOLING slots whose cooldown has elapsed are moved to FREE.
//   2. The first FREE slot whose (gpuIdx, width, height) match is marked
//      ACTIVE and returned.
//   3. Otherwise, if the pool holds fewer than GPU_NV12_POOL_MAX_SLOTS slots,
//      a Y plane (w bytes x h rows) and a UV plane (w bytes x h/2 rows) are
//      allocated with cudaMallocPitch, a non-blocking stream is created, and
//      the new slot is appended to the pool in the ACTIVE state.
//
// Parameters:
//   gpuIdx  CUDA device ordinal to allocate on. A negative value means "do not
//           switch device": buffers are created on the calling thread's
//           current device.
//   w, h    Plane width in bytes and number of rows; both must be > 0.
//
// Returns a pointer to a slot owned by the pool (m_slots), or nullptr when the
// size is invalid, the pool is full, the target device cannot be selected, or
// a buffer allocation fails. The POOL FULL message describes nullptr as the
// signal to fall back to the CPU path.
//
// The calling thread's current CUDA device is restored before returning, and
// any error left by a failed CUDA call here is cleared so it does not leak
// into unrelated cudaGetLastError() checks on this thread.
GpuNV12Slot* GpuNV12SlotPool::acquire(int gpuIdx, int w, int h) {
    // Reject non-positive sizes up front: cudaMallocPitch takes size_t, so a
    // negative int would silently turn into an enormous allocation request.
    if (w <= 0 || h <= 0) {
        NV12POOL_DBG("acquire: invalid size %dx%d gpu=%d", w, h, gpuIdx);
        return nullptr;
    }

    std::lock_guard<std::mutex> lock(m_mutex);

    // 1. Drain cooled-down slots to make them available.
    drainCooledSlots_locked();

    // 2. Try to find an existing FREE slot that matches device and resolution.
    for (auto& s : m_slots) {
        if (s->state.load(std::memory_order_acquire) == GpuNV12Slot::STATE_FREE &&
            s->gpuIdx == gpuIdx && s->width == w && s->height == h) {
            s->state.store(GpuNV12Slot::STATE_ACTIVE, std::memory_order_release);
            NV12POOL_DBG("acquire: reuse slot Y=%p UV=%p %dx%d gpu=%d (total=%zu)",
                         (void*)s->bufY, (void*)s->bufUV, w, h, gpuIdx, m_slots.size());
            return s.get();
        }
    }

    // 3. No matching free slot — allocate a new one if under the limit.
    if (static_cast<int>(m_slots.size()) >= GPU_NV12_POOL_MAX_SLOTS) {
        // Always log POOL FULL to DebugView — this is a critical diagnostic.
        char msg[128];
        snprintf(msg, sizeof(msg), "[NV12Pool] POOL FULL (%zu slots) — fallback to CPU\n", m_slots.size());
#ifdef _WIN32
        OutputDebugStringA(msg);
#endif
        fprintf(stderr, "%s", msg);
        return nullptr;
    }

    // Remember the caller's device so it can be restored on every exit path.
    int prevDev = -1;
    if (cudaGetDevice(&prevDev) != cudaSuccess) {
        prevDev = -1;               // unknown: nothing to restore later
        (void)cudaGetLastError();   // do not leave the failure pending
    }

    // Switch to the target GPU. If that fails, bail out instead of allocating
    // on whatever device happens to be current and mislabelling the slot.
    if (gpuIdx >= 0) {
        const cudaError_t eDev = cudaSetDevice(gpuIdx);
        if (eDev != cudaSuccess) {
            NV12POOL_DBG("acquire: cudaSetDevice(%d) FAILED e=%d", gpuIdx, (int)eDev);
            (void)cudaGetLastError();
            if (prevDev >= 0) cudaSetDevice(prevDev);
            return nullptr;
        }
    }

    auto slot = std::make_unique<GpuNV12Slot>();
    const cudaError_t e1 = cudaMallocPitch(&slot->bufY, &slot->pitchY,
                                           static_cast<size_t>(w), static_cast<size_t>(h));
    const cudaError_t e2 = cudaMallocPitch(&slot->bufUV, &slot->pitchUV,
                                           static_cast<size_t>(w), static_cast<size_t>(h / 2));

    if (e1 != cudaSuccess || e2 != cudaSuccess) {
        NV12POOL_DBG("acquire: cudaMallocPitch FAILED %dx%d gpu=%d e1=%d e2=%d",
                     w, h, gpuIdx, (int)e1, (int)e2);
        // Release whichever plane did get allocated (still on the target
        // device) and null the pointers so the discarded slot object holds
        // nothing dangling.
        if (e1 == cudaSuccess && slot->bufY) {
            cudaFree(slot->bufY);
            slot->bufY = nullptr;
        }
        if (e2 == cudaSuccess && slot->bufUV) {
            cudaFree(slot->bufUV);
            slot->bufUV = nullptr;
        }
        (void)cudaGetLastError();   // clear the allocation error for this thread
        if (prevDev >= 0) cudaSetDevice(prevDev);
        return nullptr;
    }

    // Non-blocking stream avoids NULL-stream implicit sync with inference.
    // On WDDM, the NULL stream must wait for ALL other streams to finish
    // before executing — this caused 1-2 second stalls when inference
    // kernels were running. A non-blocking stream runs independently.
    // Created only after both planes exist, so a failed allocation never
    // creates a stream that must be torn down again.
    cudaStream_t stream = nullptr;
    const cudaError_t e3 = cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
    if (e3 != cudaSuccess) {
        // Best effort: the slot is still usable, but copyStream stays null.
        stream = nullptr;
        NV12POOL_DBG("acquire: cudaStreamCreateWithFlags FAILED gpu=%d e3=%d (copyStream left null)",
                     gpuIdx, (int)e3);
        (void)cudaGetLastError();
    }

    if (prevDev >= 0) cudaSetDevice(prevDev);

    slot->width = w;
    slot->height = h;
    slot->gpuIdx = gpuIdx;
    slot->copyStream = stream;
    slot->state.store(GpuNV12Slot::STATE_ACTIVE, std::memory_order_release);

    GpuNV12Slot* raw = slot.get();
    m_slots.push_back(std::move(slot));

    // Always log new slot allocation to DebugView (rare event — once per resolution per camera).
    {
        char msg[256];
        snprintf(msg, sizeof(msg),
                 "[NV12Pool] NEW slot #%zu: %dx%d gpu=%d Y=%p UV=%p pitchY=%zu stream=%p\n",
                 m_slots.size(), w, h, gpuIdx, (void*)raw->bufY, (void*)raw->bufUV,
                 raw->pitchY, (void*)raw->copyStream);
#ifdef _WIN32
        OutputDebugStringA(msg);
#endif
        fprintf(stderr, "%s", msg);
    }

    // Additional NV12POOL_DBG trace of the same allocation, including both pitches.
    NV12POOL_DBG("acquire: NEW slot Y=%p UV=%p pitchY=%zu pitchUV=%zu %dx%d gpu=%d stream=%p (total=%zu)",
                 (void*)raw->bufY, (void*)raw->bufUV, raw->pitchY, raw->pitchUV,
                 w, h, gpuIdx, (void*)raw->copyStream, m_slots.size());
    return raw;
}