modules/ANSCV/GpuNV12SlotPool.cpp

// GpuNV12SlotPool.cpp — Process-wide singleton, compiled into ANSCV.dll.
//
// ANSCV.dll owns the canonical GpuNV12SlotPool instance.  Other DLLs
// (ANSODEngine, etc.) find it via GetProcAddress at runtime.

#define WIN32_LEAN_AND_MEAN
#define NOMINMAX
#include <windows.h>
#include "GpuNV12SlotPool.h"

#include <cuda_runtime.h>

// Returns the address of the pool instance that lives inside this module.
// The object is a function-local static, so it is constructed the first
// time this function runs and the same address is handed out on every
// subsequent call.
GpuNV12SlotPool* GpuNV12SlotPool::resolveProcessWide() {
    static GpuNV12SlotPool s_processPool;
    GpuNV12SlotPool* const canonical = &s_processPool;
    return canonical;
}

// C-linkage DLL export that hands out the pool instance by pointer.
// Per the file header, other DLLs (ANSODEngine, etc.) look this symbol up
// with GetProcAddress at runtime; the unmangled extern "C" name is what
// makes that lookup possible.
extern "C" __declspec(dllexport)
GpuNV12SlotPool* GpuNV12SlotPool_GetInstance() {
    auto& pool = GpuNV12SlotPool::instance();
    return &pool;
}

// Promote every slot that has been COOLING for at least SLOT_COOLDOWN_MS
// milliseconds to FREE.  Slots in any other state are left untouched.
// NOTE(review): the "_locked" suffix suggests the caller must already hold
// m_mutex (acquire() does) — confirm for any other call sites.
void GpuNV12SlotPool::drainCooledSlots_locked() {
    const auto cooldown = std::chrono::milliseconds(SLOT_COOLDOWN_MS);
    const auto sampledAt = std::chrono::steady_clock::now();

    for (auto& entry : m_slots) {
        // Only slots currently cooling down are candidates.
        if (entry->state.load(std::memory_order_acquire) != GpuNV12Slot::STATE_COOLING) {
            continue;
        }
        // Still inside the cooldown window: leave it alone for now.
        const auto elapsed = sampledAt - entry->cooldownStart;
        if (elapsed < cooldown) {
            continue;
        }
        entry->state.store(GpuNV12Slot::STATE_FREE, std::memory_order_release);
    }
}

// Emit a diagnostic line independently of the NV12POOL_DBG macro: to the
// debugger output (DebugView) on Windows and to stderr everywhere.
static void nv12PoolLogAlways(const char* msg) {
#ifdef _WIN32
    OutputDebugStringA(msg);
#endif
    fprintf(stderr, "%s", msg);
}

// Acquire a slot matching (gpuIdx, w, h): reuse a FREE one if available,
// otherwise allocate a new one while the pool is below
// GPU_NV12_POOL_MAX_SLOTS.
//
// Parameters:
//   gpuIdx  CUDA device to allocate on.  A negative value means "do not
//           switch devices" — buffers are created on the calling thread's
//           current device.
//   w, h    Plane dimensions.  The Y plane is allocated as w bytes x h rows
//           and the UV plane as w bytes x ceil(h/2) rows.
//
// Returns a pointer to a slot in STATE_ACTIVE.  The slot stays owned by the
// pool (it lives in m_slots); the caller must not delete it.  Returns
// nullptr when the pool is full, the dimensions are not positive, the
// target device cannot be selected, or the device allocation fails.  The
// POOL FULL log message says callers are expected to fall back to CPU then.
//
// The whole call runs under m_mutex.
GpuNV12Slot* GpuNV12SlotPool::acquire(int gpuIdx, int w, int h) {
    std::lock_guard<std::mutex> lock(m_mutex);

    // 1. Drain cooled-down slots to make them available
    drainCooledSlots_locked();

    // 2. Try to find an existing FREE slot that matches device and resolution
    for (auto& s : m_slots) {
        if (s->state.load(std::memory_order_acquire) == GpuNV12Slot::STATE_FREE &&
            s->gpuIdx == gpuIdx && s->width == w && s->height == h) {
            s->state.store(GpuNV12Slot::STATE_ACTIVE, std::memory_order_release);
            NV12POOL_DBG("acquire: reuse slot Y=%p UV=%p %dx%d gpu=%d (total=%zu)",
                         s->bufY, s->bufUV, w, h, gpuIdx, m_slots.size());
            return s.get();
        }
    }

    // 3. No matching free slot — allocate a new one if under the limit
    if (static_cast<int>(m_slots.size()) >= GPU_NV12_POOL_MAX_SLOTS) {
        // Always log POOL FULL to DebugView — this is a critical diagnostic.
        char msg[128];
        snprintf(msg, sizeof(msg), "[NV12Pool] POOL FULL (%zu slots) — fallback to CPU\n", m_slots.size());
        nv12PoolLogAlways(msg);
        return nullptr;
    }

    // Reject degenerate sizes up front: a negative int converted to size_t
    // becomes an enormous allocation request, and a zero-sized plane is
    // never a usable slot.
    if (w <= 0 || h <= 0) {
        NV12POOL_DBG("acquire: invalid size %dx%d gpu=%d", w, h, gpuIdx);
        return nullptr;
    }

    // Switch to the target GPU, remembering the current device so it can be
    // restored afterwards.  If the current device cannot be queried there is
    // nothing to restore.
    int prevDev = -1;
    if (cudaGetDevice(&prevDev) != cudaSuccess) prevDev = -1;

    if (gpuIdx >= 0) {
        const cudaError_t eDev = cudaSetDevice(gpuIdx);
        if (eDev != cudaSuccess) {
            // Allocating anyway would put the buffers on whatever device is
            // current while the slot claims gpuIdx — refuse instead.
            NV12POOL_DBG("acquire: cudaSetDevice FAILED gpu=%d err=%d", gpuIdx, (int)eDev);
            (void)cudaGetLastError();  // do not leave a stale error for later callers
            if (prevDev >= 0) cudaSetDevice(prevDev);
            return nullptr;
        }
    }

    // Allocate the two pitched planes.  The UV row count is rounded up so an
    // odd height still gets its final chroma row.
    auto slot = std::make_unique<GpuNV12Slot>();
    const size_t rowBytes = static_cast<size_t>(w);
    const size_t rowsY    = static_cast<size_t>(h);
    const size_t rowsUV   = (static_cast<size_t>(h) + 1) / 2;
    const cudaError_t e1 = cudaMallocPitch(&slot->bufY,  &slot->pitchY,  rowBytes, rowsY);
    const cudaError_t e2 = cudaMallocPitch(&slot->bufUV, &slot->pitchUV, rowBytes, rowsUV);
    const bool planesOk = (e1 == cudaSuccess && e2 == cudaSuccess);

    cudaStream_t stream = nullptr;
    cudaError_t e3 = cudaSuccess;
    if (planesOk) {
        // Non-blocking stream avoids NULL-stream implicit sync with inference.
        // On WDDM, the NULL stream must wait for ALL other streams to finish
        // before executing — this caused 1-2 second stalls when inference
        // kernels were running.  A non-blocking stream runs independently.
        e3 = cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
        if (e3 != cudaSuccess) {
            // Best effort: the slot is still usable, just without its own stream.
            NV12POOL_DBG("acquire: cudaStreamCreateWithFlags FAILED gpu=%d err=%d (copyStream=null)",
                         gpuIdx, (int)e3);
            stream = nullptr;
        }
    } else {
        // Clean up a partial allocation while the target device is still current.
        if (e1 == cudaSuccess && slot->bufY)  { cudaFree(slot->bufY);  slot->bufY  = nullptr; }
        if (e2 == cudaSuccess && slot->bufUV) { cudaFree(slot->bufUV); slot->bufUV = nullptr; }
    }

    // Reset the runtime's per-thread "last error" after any failure above so
    // an unrelated cudaGetLastError() check later on does not report it.
    if (!planesOk || e3 != cudaSuccess) (void)cudaGetLastError();

    if (prevDev >= 0) cudaSetDevice(prevDev);

    if (!planesOk) {
        NV12POOL_DBG("acquire: cudaMallocPitch FAILED %dx%d gpu=%d e1=%d e2=%d",
                     w, h, gpuIdx, (int)e1, (int)e2);
        return nullptr;
    }

    slot->width  = w;
    slot->height = h;
    slot->gpuIdx = gpuIdx;
    slot->copyStream = stream;  // nullptr when stream creation failed
    slot->state.store(GpuNV12Slot::STATE_ACTIVE, std::memory_order_release);

    GpuNV12Slot* raw = slot.get();
    m_slots.push_back(std::move(slot));

    // Always log new slot allocation to DebugView (rare event — once per resolution per camera).
    {
        char msg[256];
        snprintf(msg, sizeof(msg),
            "[NV12Pool] NEW slot #%zu: %dx%d gpu=%d Y=%p UV=%p pitchY=%zu stream=%p\n",
            m_slots.size(), w, h, gpuIdx, raw->bufY, raw->bufUV, raw->pitchY, raw->copyStream);
        nv12PoolLogAlways(msg);
    }

    // Same event through the debug macro, with the UV pitch included.
    NV12POOL_DBG("acquire: NEW slot Y=%p UV=%p pitchY=%zu pitchUV=%zu %dx%d gpu=%d stream=%p (total=%zu)",
                 raw->bufY, raw->bufUV, raw->pitchY, raw->pitchUV,
                 w, h, gpuIdx, raw->copyStream, m_slots.size());
    return raw;
}
Fix NV12 crash issue when recreate camera object (new structure) does not work 2026-04-03 14:51:52 +11:00			`// GpuNV12SlotPool.cpp — Process-wide singleton, compiled into ANSCV.dll.`
			`//`
			`// ANSCV.dll owns the canonical GpuNV12SlotPool instance. Other DLLs`
			`// (ANSODEngine, etc.) find it via GetProcAddress at runtime.`

			`#define WIN32_LEAN_AND_MEAN`
			`#define NOMINMAX`
			`#include <windows.h>`
			`#include "GpuNV12SlotPool.h"`

			`#include <cuda_runtime.h>`

			`// ANSCV.dll owns the process-wide singleton.`
			`GpuNV12SlotPool* GpuNV12SlotPool::resolveProcessWide() {`
			`static GpuNV12SlotPool pool;`
			`return &pool;`
			`}`

			`// Exported so other DLLs (ANSODEngine, etc.) can find this instance at runtime.`
			`extern "C" __declspec(dllexport)`
			`GpuNV12SlotPool* GpuNV12SlotPool_GetInstance() {`
			`return &GpuNV12SlotPool::instance();`
			`}`

			`// Transition all COOLING slots past the cooldown threshold to FREE.`
			`void GpuNV12SlotPool::drainCooledSlots_locked() {`
			`auto now = std::chrono::steady_clock::now();`
			`auto threshold = std::chrono::milliseconds(SLOT_COOLDOWN_MS);`
			`for (auto& s : m_slots) {`
			`if (s->state.load(std::memory_order_acquire) == GpuNV12Slot::STATE_COOLING) {`
			`if (now - s->cooldownStart >= threshold) {`
			`s->state.store(GpuNV12Slot::STATE_FREE, std::memory_order_release);`
			`}`
			`}`
			`}`
			`}`

			`// Acquire a free slot matching (gpuIdx, w, h), or allocate a new one.`
			`GpuNV12Slot* GpuNV12SlotPool::acquire(int gpuIdx, int w, int h) {`
			`std::lock_guard<std::mutex> lock(m_mutex);`

			`// 1. Drain cooled-down slots to make them available`
			`drainCooledSlots_locked();`

			`// 2. Try to find an existing FREE slot that matches the resolution`
			`for (auto& s : m_slots) {`
			`if (s->state.load(std::memory_order_acquire) == GpuNV12Slot::STATE_FREE &&`
			`s->gpuIdx == gpuIdx && s->width == w && s->height == h) {`
			`s->state.store(GpuNV12Slot::STATE_ACTIVE, std::memory_order_release);`
			`NV12POOL_DBG("acquire: reuse slot Y=%p UV=%p %dx%d gpu=%d (total=%zu)",`
			`s->bufY, s->bufUV, w, h, gpuIdx, m_slots.size());`
			`return s.get();`
			`}`
			`}`

			`// 3. No matching free slot — allocate a new one if under the limit`
			`if (static_cast<int>(m_slots.size()) >= GPU_NV12_POOL_MAX_SLOTS) {`
Add log info 2026-04-03 15:16:26 +11:00			`// Always log POOL FULL to DebugView — this is a critical diagnostic.`
			`{`
			`char _buf[128];`
			`snprintf(_buf, sizeof(_buf), "[NV12Pool] POOL FULL (%zu slots) — fallback to CPU\n", m_slots.size());`
			`#ifdef _WIN32`
			`OutputDebugStringA(_buf);`
			`#endif`
			`fprintf(stderr, "%s", _buf);`
			`}`
Fix NV12 crash issue when recreate camera object (new structure) does not work 2026-04-03 14:51:52 +11:00			`return nullptr;`
			`}`

			`// Allocate CUDA buffers on the target GPU`
			`int prevDev = -1;`
			`cudaGetDevice(&prevDev);`
			`if (gpuIdx >= 0) cudaSetDevice(gpuIdx);`

			`auto slot = std::make_unique<GpuNV12Slot>();`
			`cudaError_t e1 = cudaMallocPitch(&slot->bufY, &slot->pitchY, w, h);`
			`cudaError_t e2 = cudaMallocPitch(&slot->bufUV, &slot->pitchUV, w, h / 2);`

			`// Non-blocking stream avoids NULL-stream implicit sync with inference.`
			`// On WDDM, the NULL stream must wait for ALL other streams to finish`
			`// before executing — this caused 1-2 second stalls when inference`
			`// kernels were running. A non-blocking stream runs independently.`
			`cudaStream_t stream = nullptr;`
			`cudaError_t e3 = cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);`

			`if (prevDev >= 0) cudaSetDevice(prevDev);`

			`if (e1 != cudaSuccess \|\| e2 != cudaSuccess) {`
			`NV12POOL_DBG("acquire: cudaMallocPitch FAILED %dx%d gpu=%d e1=%d e2=%d",`
			`w, h, gpuIdx, (int)e1, (int)e2);`
			`// Clean up partial allocation`
			`int prev2 = -1; cudaGetDevice(&prev2);`
			`if (gpuIdx >= 0) cudaSetDevice(gpuIdx);`
			`if (e1 == cudaSuccess && slot->bufY) cudaFree(slot->bufY);`
			`if (e2 == cudaSuccess && slot->bufUV) cudaFree(slot->bufUV);`
			`if (e3 == cudaSuccess && stream) cudaStreamDestroy(stream);`
			`if (prev2 >= 0) cudaSetDevice(prev2);`
			`return nullptr;`
			`}`

			`slot->width = w;`
			`slot->height = h;`
			`slot->gpuIdx = gpuIdx;`
			`slot->copyStream = (e3 == cudaSuccess) ? stream : nullptr;`
			`slot->state.store(GpuNV12Slot::STATE_ACTIVE, std::memory_order_release);`

			`GpuNV12Slot* raw = slot.get();`
			`m_slots.push_back(std::move(slot));`

Add log info 2026-04-03 15:16:26 +11:00			`// Always log new slot allocation to DebugView (rare event — once per resolution per camera).`
			`{`
			`char _buf[256];`
			`snprintf(_buf, sizeof(_buf),`
			`"[NV12Pool] NEW slot #%zu: %dx%d gpu=%d Y=%p UV=%p pitchY=%zu stream=%p\n",`
			`m_slots.size(), w, h, gpuIdx, raw->bufY, raw->bufUV, raw->pitchY, raw->copyStream);`
			`#ifdef _WIN32`
			`OutputDebugStringA(_buf);`
			`#endif`
			`fprintf(stderr, "%s", _buf);`
			`}`

			`// Same event through the debug macro, with the UV pitch included.`
Fix NV12 crash issue when recreate camera object (new structure) does not work 2026-04-03 14:51:52 +11:00			`NV12POOL_DBG("acquire: NEW slot Y=%p UV=%p pitchY=%zu pitchUV=%zu %dx%d gpu=%d stream=%p (total=%zu)",`
			`raw->bufY, raw->bufUV, raw->pitchY, raw->pitchUV,`
			`w, h, gpuIdx, raw->copyStream, m_slots.size());`
			`return raw;`
			`}`