Fix NV12 crash when recreating the camera object

(new structure) does not work
2026-04-03 14:51:52 +11:00
parent 958cab6ae3
commit 6fb09830c5
16 changed files with 854 additions and 209 deletions
--- a/modules/ANSCV/GpuNV12SlotPool.cpp
+++ b/modules/ANSCV/GpuNV12SlotPool.cpp
@@ -0,0 +1,107 @@
+// GpuNV12SlotPool.cpp — Process-wide singleton, compiled into ANSCV.dll.
+//
+// ANSCV.dll owns the canonical GpuNV12SlotPool instance.  Other DLLs
+// (ANSODEngine, etc.) find it via GetProcAddress at runtime.
+
+#define WIN32_LEAN_AND_MEAN
+#define NOMINMAX
+#include <windows.h>
+#include "GpuNV12SlotPool.h"
+
+#include <cuda_runtime.h>
+
+// Returns the pool instance owned by this module (ANSCV.dll). The object is a
+// function-local static, so it is constructed on the first call and the same
+// address is handed back on every later call.
+GpuNV12SlotPool* GpuNV12SlotPool::resolveProcessWide() {
+    static GpuNV12SlotPool s_processPool;
+    GpuNV12SlotPool* const poolPtr = &s_processPool;
+    return poolPtr;
+}
+
+// C-linkage export: other DLLs (ANSODEngine, etc.) look this symbol up by name
+// at runtime and call it to obtain the pool returned by
+// GpuNV12SlotPool::instance().
+extern "C" __declspec(dllexport)
+GpuNV12SlotPool* GpuNV12SlotPool_GetInstance() {
+    auto& pool = GpuNV12SlotPool::instance();
+    return &pool;
+}
+
+// Promote every COOLING slot whose cooldown (SLOT_COOLDOWN_MS, measured from
+// the slot's cooldownStart) has fully elapsed back to FREE.
+// No lock is taken here; acquire() calls this while holding m_mutex.
+void GpuNV12SlotPool::drainCooledSlots_locked() {
+    const auto nowTp    = std::chrono::steady_clock::now();
+    const auto cooldown = std::chrono::milliseconds(SLOT_COOLDOWN_MS);
+    for (auto& slotPtr : m_slots) {
+        // Only slots currently cooling are candidates.
+        if (slotPtr->state.load(std::memory_order_acquire) != GpuNV12Slot::STATE_COOLING) {
+            continue;
+        }
+        // Still inside the cooldown window: leave it alone for now.
+        if (nowTp - slotPtr->cooldownStart < cooldown) {
+            continue;
+        }
+        slotPtr->state.store(GpuNV12Slot::STATE_FREE, std::memory_order_release);
+    }
+}
+
+// Acquire a slot for (gpuIdx, w, h): reuse a matching FREE slot or allocate one.
+//
+// Runs under m_mutex. COOLING slots whose cooldown has elapsed are first
+// released to FREE. The returned slot is already marked STATE_ACTIVE and stays
+// owned by the pool (m_slots). gpuIdx < 0 allocates on the current device.
+//
+// Returns nullptr when the dimensions are not positive, the pool already holds
+// GPU_NV12_POOL_MAX_SLOTS slots, the target GPU cannot be selected, or either
+// plane allocation fails.
+GpuNV12Slot* GpuNV12SlotPool::acquire(int gpuIdx, int w, int h) {
+    // Non-positive sizes would wrap to huge size_t values in cudaMallocPitch.
+    if (w <= 0 || h <= 0) {
+        NV12POOL_DBG("acquire: invalid size %dx%d gpu=%d", w, h, gpuIdx);
+        return nullptr;
+    }
+
+    std::lock_guard<std::mutex> lock(m_mutex);
+
+    // 1. Drain cooled-down slots to make them available
+    drainCooledSlots_locked();
+
+    // 2. Try to find an existing FREE slot that matches the resolution
+    for (auto& s : m_slots) {
+        if (s->state.load(std::memory_order_acquire) == GpuNV12Slot::STATE_FREE &&
+            s->gpuIdx == gpuIdx && s->width == w && s->height == h) {
+            s->state.store(GpuNV12Slot::STATE_ACTIVE, std::memory_order_release);
+            NV12POOL_DBG("acquire: reuse slot Y=%p UV=%p %dx%d gpu=%d (total=%zu)",
+                         s->bufY, s->bufUV, w, h, gpuIdx, m_slots.size());
+            return s.get();
+        }
+    }
+
+    // 3. No matching free slot — allocate a new one if under the limit
+    if (static_cast<int>(m_slots.size()) >= GPU_NV12_POOL_MAX_SLOTS) {
+        NV12POOL_DBG("acquire: POOL FULL (%zu slots) — fallback to CPU path",
+                     m_slots.size());
+        return nullptr;
+    }
+
+    // Remember the caller's device so it can be restored on every exit path.
+    int prevDev = -1;
+    if (cudaGetDevice(&prevDev) != cudaSuccess) {
+        prevDev = -1;
+        cudaGetLastError();  // reset the runtime's last-error state
+    }
+
+    // Switch to the target GPU. If that fails, bail out instead of silently
+    // allocating on the wrong device while tagging the slot with gpuIdx.
+    if (gpuIdx >= 0) {
+        const cudaError_t eDev = cudaSetDevice(gpuIdx);
+        if (eDev != cudaSuccess) {
+            NV12POOL_DBG("acquire: cudaSetDevice FAILED gpu=%d err=%d",
+                         gpuIdx, (int)eDev);
+            cudaGetLastError();  // do not leak the error into later checks
+            return nullptr;
+        }
+    }
+
+    // Y plane: w x h bytes; UV plane: w x h/2 bytes, both pitched.
+    auto slot = std::make_unique<GpuNV12Slot>();
+    cudaError_t e1 = cudaMallocPitch(&slot->bufY,  &slot->pitchY,  w, h);
+    cudaError_t e2 = cudaMallocPitch(&slot->bufUV, &slot->pitchUV, w, h / 2);
+
+    if (e1 != cudaSuccess || e2 != cudaSuccess) {
+        NV12POOL_DBG("acquire: cudaMallocPitch FAILED %dx%d gpu=%d e1=%d e2=%d",
+                     w, h, gpuIdx, (int)e1, (int)e2);
+        // Still on the target device: release whichever plane was allocated
+        // and null the pointers so the slot object holds nothing dangling.
+        if (e1 == cudaSuccess && slot->bufY)  { cudaFree(slot->bufY);  slot->bufY  = nullptr; }
+        if (e2 == cudaSuccess && slot->bufUV) { cudaFree(slot->bufUV); slot->bufUV = nullptr; }
+        // A failed allocation stays recorded as the thread's last error until
+        // it is read; clear it so unrelated later checks do not report it.
+        cudaGetLastError();
+        if (prevDev >= 0) cudaSetDevice(prevDev);
+        return nullptr;
+    }
+
+    // Non-blocking stream avoids NULL-stream implicit sync with inference.
+    // On WDDM, the NULL stream must wait for ALL other streams to finish
+    // before executing — this caused 1-2 second stalls when inference
+    // kernels were running.  A non-blocking stream runs independently.
+    // Created only after both planes exist, so a failed allocation never
+    // creates and then destroys a stream.
+    cudaStream_t stream = nullptr;
+    const cudaError_t e3 = cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
+    if (e3 != cudaSuccess) {
+        // Best effort: the slot is still usable, copyStream just stays null.
+        NV12POOL_DBG("acquire: cudaStreamCreateWithFlags FAILED gpu=%d e3=%d",
+                     gpuIdx, (int)e3);
+        cudaGetLastError();
+        stream = nullptr;
+    }
+
+    if (prevDev >= 0) cudaSetDevice(prevDev);
+
+    slot->width  = w;
+    slot->height = h;
+    slot->gpuIdx = gpuIdx;
+    slot->copyStream = stream;
+    slot->state.store(GpuNV12Slot::STATE_ACTIVE, std::memory_order_release);
+
+    // The pool keeps ownership; the caller only gets a raw pointer.
+    GpuNV12Slot* raw = slot.get();
+    m_slots.push_back(std::move(slot));
+
+    NV12POOL_DBG("acquire: NEW slot Y=%p UV=%p pitchY=%zu pitchUV=%zu %dx%d gpu=%d stream=%p (total=%zu)",
+                 raw->bufY, raw->bufUV, raw->pitchY, raw->pitchUV,
+                 w, h, gpuIdx, raw->copyStream, m_slots.size());
+    return raw;
+}