// GpuNV12SlotPool.cpp — Process-wide singleton, compiled into ANSCV.dll. // // ANSCV.dll owns the canonical GpuNV12SlotPool instance. Other DLLs // (ANSODEngine, etc.) find it via GetProcAddress at runtime. #define WIN32_LEAN_AND_MEAN #define NOMINMAX #include #include "GpuNV12SlotPool.h" #include // ANSCV.dll owns the process-wide singleton. GpuNV12SlotPool* GpuNV12SlotPool::resolveProcessWide() { static GpuNV12SlotPool pool; return &pool; } // Exported so other DLLs (ANSODEngine, etc.) can find this instance at runtime. extern "C" __declspec(dllexport) GpuNV12SlotPool* GpuNV12SlotPool_GetInstance() { return &GpuNV12SlotPool::instance(); } // Transition all COOLING slots past the cooldown threshold to FREE. // Collects pending AVFrames for the caller to av_frame_free. void GpuNV12SlotPool::drainCooledSlots_locked() { auto now = std::chrono::steady_clock::now(); auto threshold = std::chrono::milliseconds(SLOT_COOLDOWN_MS); for (auto& s : m_slots) { if (s->state.load(std::memory_order_acquire) == GpuNV12Slot::STATE_COOLING) { if (now - s->cooldownStart >= threshold) { s->state.store(GpuNV12Slot::STATE_FREE, std::memory_order_release); } } } } // Acquire a free slot matching (gpuIdx, w, h), or allocate a new one. GpuNV12Slot* GpuNV12SlotPool::acquire(int gpuIdx, int w, int h) { std::lock_guard lock(m_mutex); // 1. Drain cooled-down slots to make them available drainCooledSlots_locked(); // 2. Try to find an existing FREE slot that matches the resolution for (auto& s : m_slots) { if (s->state.load(std::memory_order_acquire) == GpuNV12Slot::STATE_FREE && s->gpuIdx == gpuIdx && s->width == w && s->height == h) { s->state.store(GpuNV12Slot::STATE_ACTIVE, std::memory_order_release); NV12POOL_DBG("acquire: reuse slot Y=%p UV=%p %dx%d gpu=%d (total=%zu)", s->bufY, s->bufUV, w, h, gpuIdx, m_slots.size()); return s.get(); } } // 3. No matching free slot — allocate a new one if under the limit if (static_cast(m_slots.size()) >= GPU_NV12_POOL_MAX_SLOTS) { // Always log POOL FULL to DebugView — this is a critical diagnostic. { char _buf[128]; snprintf(_buf, sizeof(_buf), "[NV12Pool] POOL FULL (%zu slots) — fallback to CPU\n", m_slots.size()); #ifdef _WIN32 OutputDebugStringA(_buf); #endif fprintf(stderr, "%s", _buf); } return nullptr; } // Allocate CUDA buffers + stream + event on the target GPU int prevDev = -1; cudaGetDevice(&prevDev); if (gpuIdx >= 0) cudaSetDevice(gpuIdx); auto slot = std::make_unique(); cudaError_t e1 = cudaMallocPitch(&slot->bufY, &slot->pitchY, w, h); cudaError_t e2 = cudaMallocPitch(&slot->bufUV, &slot->pitchUV, w, h / 2); // Non-blocking stream: avoids NULL-stream implicit sync with inference. cudaStream_t stream = nullptr; cudaError_t e3 = cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking); if (prevDev >= 0) cudaSetDevice(prevDev); if (e1 != cudaSuccess || e2 != cudaSuccess) { NV12POOL_DBG("acquire: cudaMallocPitch FAILED %dx%d gpu=%d e1=%d e2=%d", w, h, gpuIdx, (int)e1, (int)e2); int prev2 = -1; cudaGetDevice(&prev2); if (gpuIdx >= 0) cudaSetDevice(gpuIdx); if (e1 == cudaSuccess && slot->bufY) cudaFree(slot->bufY); if (e2 == cudaSuccess && slot->bufUV) cudaFree(slot->bufUV); if (e3 == cudaSuccess && stream) cudaStreamDestroy(stream); if (prev2 >= 0) cudaSetDevice(prev2); return nullptr; } slot->width = w; slot->height = h; slot->gpuIdx = gpuIdx; slot->copyStream = (e3 == cudaSuccess) ? stream : nullptr; slot->state.store(GpuNV12Slot::STATE_ACTIVE, std::memory_order_release); GpuNV12Slot* raw = slot.get(); m_slots.push_back(std::move(slot)); // Always log new slot allocation to DebugView (rare event). { char _buf[256]; snprintf(_buf, sizeof(_buf), "[NV12Pool] NEW slot #%zu: %dx%d gpu=%d Y=%p UV=%p pitchY=%zu stream=%p\n", m_slots.size(), w, h, gpuIdx, raw->bufY, raw->bufUV, raw->pitchY, raw->copyStream); #ifdef _WIN32 OutputDebugStringA(_buf); #endif fprintf(stderr, "%s", _buf); } return raw; }