160 lines
6.3 KiB
C++
160 lines
6.3 KiB
C++
// GpuNV12SlotPool.cpp — Process-wide singleton, compiled into ANSCV.dll.
|
|
//
|
|
// ANSCV.dll owns the canonical GpuNV12SlotPool instance. Other DLLs
|
|
// (ANSODEngine, etc.) find it via GetProcAddress at runtime.
|
|
|
|
#define WIN32_LEAN_AND_MEAN
#define NOMINMAX
#include <windows.h>

#include "GpuNV12SlotPool.h"
#include "ANSLicense.h" // ANS_DBG macro for [Pool_Leak] heartbeat

#include <cuda_runtime.h>

#include <atomic>
#include <chrono>
#include <cstdio> // snprintf / fprintf used by the always-on pool diagnostics
|
|
|
|
// ANSCV.dll owns the process-wide singleton.
|
|
GpuNV12SlotPool* GpuNV12SlotPool::resolveProcessWide() {
|
|
static GpuNV12SlotPool pool;
|
|
return &pool;
|
|
}
|
|
|
|
// Exported so other DLLs (ANSODEngine, etc.) can find this instance at runtime.
|
|
extern "C" __declspec(dllexport)
|
|
GpuNV12SlotPool* GpuNV12SlotPool_GetInstance() {
|
|
return &GpuNV12SlotPool::instance();
|
|
}
|
|
|
|
// Transition all COOLING slots past the cooldown threshold to FREE.
//
// NOTE(review): stale comment in an earlier revision claimed this "collects
// pending AVFrames for the caller to av_frame_free" — the current body only
// flips slot state and collects/returns nothing. Confirm no caller still
// expects a drain list.
//
// Caller must hold m_mutex (the _locked suffix; acquire() calls this under
// its lock_guard).
void GpuNV12SlotPool::drainCooledSlots_locked() {
    auto now = std::chrono::steady_clock::now();
    auto threshold = std::chrono::milliseconds(SLOT_COOLDOWN_MS);
    for (auto& s : m_slots) {
        // Acquire load pairs with the release store made when the slot
        // entered COOLING, so cooldownStart is visible before we read it.
        if (s->state.load(std::memory_order_acquire) == GpuNV12Slot::STATE_COOLING) {
            if (now - s->cooldownStart >= threshold) {
                s->state.store(GpuNV12Slot::STATE_FREE, std::memory_order_release);
            }
        }
    }
}
|
|
|
|
// Acquire a free slot matching (gpuIdx, w, h), or allocate a new one.
//
// Returns nullptr when the pool is at GPU_NV12_POOL_MAX_SLOTS or when CUDA
// allocation fails; per the log text, callers then fall back to the CPU path.
// Thread-safe: the entire body runs under m_mutex.
GpuNV12Slot* GpuNV12SlotPool::acquire(int gpuIdx, int w, int h) {
    std::lock_guard<std::mutex> lock(m_mutex);

    // Leak diagnostic — [Pool_Leak] heartbeat fires at most once per 60 s.
    // Reports current slot count and rough VRAM footprint. Slot count is
    // bounded by GPU_NV12_POOL_MAX_SLOTS; if it persists near the cap we
    // also see ACTIVE/COOLING state distribution which can hint at slots
    // not being released.
    {
        using clk = std::chrono::steady_clock;
        // Next-allowed-log time in raw steady_clock ticks; the CAS below
        // lets exactly one thread claim each 60 s logging window.
        static std::atomic<long long> s_nextLog{0};
        const long long tick = clk::now().time_since_epoch().count();
        long long expected = s_nextLog.load(std::memory_order_relaxed);
        if (tick >= expected) {
            const long long deadline = tick +
                std::chrono::duration_cast<clk::duration>(
                    std::chrono::seconds(60)).count();
            if (s_nextLog.compare_exchange_strong(expected, deadline,
                                                  std::memory_order_relaxed)) {
                // CAS winner emits the heartbeat; losers skip silently.
                size_t totalBytes = 0;
                size_t active = 0, cooling = 0, free_ = 0;
                for (const auto& sp : m_slots) {
                    // Rough footprint: pitched Y plane + pitched UV plane
                    // (UV uses height/2 rows — NV12 layout, presumably;
                    // pitches come from cudaMallocPitch below).
                    totalBytes += sp->pitchY * sp->height
                                + sp->pitchUV * (sp->height / 2);
                    const int st = sp->state.load(std::memory_order_relaxed);
                    if (st == GpuNV12Slot::STATE_ACTIVE) ++active;
                    else if (st == GpuNV12Slot::STATE_COOLING) ++cooling;
                    else ++free_;
                }
                ANS_DBG("Pool_Leak",
                    "NV12Pool slots=%zu (active=%zu cooling=%zu free=%zu) bytesMB=%.1f (max=%d)",
                    m_slots.size(), active, cooling, free_,
                    (double)totalBytes / (1024.0 * 1024.0),
                    GPU_NV12_POOL_MAX_SLOTS);
            }
        }
    }

    // 1. Drain cooled-down slots to make them available
    drainCooledSlots_locked();

    // 2. Try to find an existing FREE slot that matches the resolution
    //    (exact gpu/width/height match only — no partial reuse).
    for (auto& s : m_slots) {
        if (s->state.load(std::memory_order_acquire) == GpuNV12Slot::STATE_FREE &&
            s->gpuIdx == gpuIdx && s->width == w && s->height == h) {
            s->state.store(GpuNV12Slot::STATE_ACTIVE, std::memory_order_release);
            NV12POOL_DBG("acquire: reuse slot Y=%p UV=%p %dx%d gpu=%d (total=%zu)",
                s->bufY, s->bufUV, w, h, gpuIdx, m_slots.size());
            return s.get();
        }
    }

    // 3. No matching free slot — allocate a new one if under the limit
    if (static_cast<int>(m_slots.size()) >= GPU_NV12_POOL_MAX_SLOTS) {
        // Always log POOL FULL to DebugView — this is a critical diagnostic.
        {
            char _buf[128];
            snprintf(_buf, sizeof(_buf), "[NV12Pool] POOL FULL (%zu slots) — fallback to CPU\n", m_slots.size());
#ifdef _WIN32
            OutputDebugStringA(_buf);
#endif
            fprintf(stderr, "%s", _buf);
        }
        return nullptr;
    }

    // Allocate CUDA buffers + stream + event on the target GPU
    // NOTE(review): no CUDA event is created in this body — the "+ event"
    // above looks stale, or the event is created elsewhere; confirm.
    // Save the caller's current CUDA device so we can restore it afterwards.
    int prevDev = -1;
    cudaGetDevice(&prevDev);
    if (gpuIdx >= 0) cudaSetDevice(gpuIdx);

    auto slot = std::make_unique<GpuNV12Slot>();
    // Pitched allocations: Y plane is w bytes × h rows; UV plane is w bytes
    // × h/2 rows (interleaved CbCr — assumes NV12; TODO confirm odd-h input
    // is rejected upstream, since h/2 truncates).
    cudaError_t e1 = cudaMallocPitch(&slot->bufY, &slot->pitchY, w, h);
    cudaError_t e2 = cudaMallocPitch(&slot->bufUV, &slot->pitchUV, w, h / 2);

    // Non-blocking stream: avoids NULL-stream implicit sync with inference.
    cudaStream_t stream = nullptr;
    cudaError_t e3 = cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);

    // Restore the caller's device before any early return below.
    if (prevDev >= 0) cudaSetDevice(prevDev);

    if (e1 != cudaSuccess || e2 != cudaSuccess) {
        NV12POOL_DBG("acquire: cudaMallocPitch FAILED %dx%d gpu=%d e1=%d e2=%d",
            w, h, gpuIdx, (int)e1, (int)e2);
        // Partial-failure cleanup: free whichever pieces DID succeed, on the
        // device they were allocated on, then restore the current device.
        int prev2 = -1; cudaGetDevice(&prev2);
        if (gpuIdx >= 0) cudaSetDevice(gpuIdx);
        if (e1 == cudaSuccess && slot->bufY) cudaFree(slot->bufY);
        if (e2 == cudaSuccess && slot->bufUV) cudaFree(slot->bufUV);
        if (e3 == cudaSuccess && stream) cudaStreamDestroy(stream);
        if (prev2 >= 0) cudaSetDevice(prev2);
        return nullptr;
    }

    slot->width = w;
    slot->height = h;
    slot->gpuIdx = gpuIdx;
    // Stream-creation failure is non-fatal: the slot still works, it just
    // has no dedicated copy stream (copyStream stays nullptr).
    slot->copyStream = (e3 == cudaSuccess) ? stream : nullptr;
    slot->state.store(GpuNV12Slot::STATE_ACTIVE, std::memory_order_release);

    // Grab the raw pointer before ownership moves into the pool vector.
    GpuNV12Slot* raw = slot.get();
    m_slots.push_back(std::move(slot));

    // Always log new slot allocation to DebugView (rare event).
    {
        char _buf[256];
        snprintf(_buf, sizeof(_buf),
            "[NV12Pool] NEW slot #%zu: %dx%d gpu=%d Y=%p UV=%p pitchY=%zu stream=%p\n",
            m_slots.size(), w, h, gpuIdx, raw->bufY, raw->bufUV, raw->pitchY,
            raw->copyStream);
#ifdef _WIN32
        OutputDebugStringA(_buf);
#endif
        fprintf(stderr, "%s", _buf);
    }

    return raw;
}
|