Fix NV12 crash issue when recreating the camera object

(new structure) does not work
This commit is contained in:
2026-04-03 14:51:52 +11:00
parent 958cab6ae3
commit 6fb09830c5
16 changed files with 854 additions and 209 deletions

View File

@@ -42,8 +42,10 @@
#include <windows.h>
#endif
// Debug logging for registry operations — both stderr and OutputDebugString.
// Debug logging for registry operations.
// Define ANSCORE_GPU_DEBUG=1 to enable verbose per-frame GPU logging.
#ifndef REG_DBG
#if defined(ANSCORE_GPU_DEBUG) && ANSCORE_GPU_DEBUG
#ifdef _WIN32
#define REG_DBG(fmt, ...) do { \
char _reg_buf[512]; \
@@ -54,7 +56,13 @@
#else
#define REG_DBG(fmt, ...) fprintf(stderr, "[Registry] " fmt "\n", ##__VA_ARGS__)
#endif
#else
#define REG_DBG(fmt, ...) ((void)0)
#endif
#endif
// GpuNV12Slot definition needed by freeOwnedBuffers_locked() (accesses inUse atomic).
#include "GpuNV12SlotPool.h"
// Safety constants
static constexpr int MAX_FRAME_REFCOUNT = 64;
@@ -66,6 +74,7 @@ static constexpr int EVICT_CHECK_INTERVAL_MS = 500;
// One GPU device pointer awaiting cudaFree by the caller (the registry does
// not free it itself — entries are handed out via drain_gpu_pending()).
struct GpuPendingFreeEntry {
void* ptr = nullptr; // Device pointer the caller must cudaFree
int deviceIdx = -1; // Device index for cudaSetDevice before cudaFree; -1 = unset
std::chrono::steady_clock::time_point queuedAt; // When this entry was queued
};
struct GpuFrameData {
@@ -116,6 +125,13 @@ struct GpuFrameData {
void* ownerClient = nullptr;
void (*onReleaseFn)(void*) = nullptr;
// --- Global pool slot (from GpuNV12SlotPool) ---
// When non-null, yPlane/uvPlane point into this slot's buffers.
// Released (slot->inUse = false) in freeOwnedBuffers_locked() when
// the frame's refcount drops to 0 — guarantees the buffer is not
// freed while any consumer is still reading it.
GpuNV12Slot* poolSlot = nullptr;
// Default constructor
GpuFrameData() = default;
@@ -134,6 +150,7 @@ struct GpuFrameData {
, yLinesize(o.yLinesize), uvLinesize(o.uvLinesize)
, refcount(o.refcount.load()), createdAt(o.createdAt)
, ownerClient(o.ownerClient), onReleaseFn(o.onReleaseFn)
, poolSlot(o.poolSlot)
{
// Null out source to prevent double-free of owned pointers
o.cpuYPlane = nullptr;
@@ -147,6 +164,7 @@ struct GpuFrameData {
o.gpuCacheBytes = 0;
o.ownerClient = nullptr;
o.onReleaseFn = nullptr;
o.poolSlot = nullptr;
}
// No copy
@@ -344,11 +362,30 @@ public:
// --- Drain pending GPU device pointers for caller to cudaFree ---
// Each entry includes the device index for cudaSetDevice before cudaFree.
std::vector<GpuPendingFreeEntry> drain_gpu_pending() {
// If minAgeMs > 0, only drain entries older than minAgeMs milliseconds.
// This allows time-based safety: entries queued >100ms ago are guaranteed
// safe to free because all CUDA kernels complete in <10ms.
std::vector<GpuPendingFreeEntry> drain_gpu_pending(int minAgeMs = 0) {
std::lock_guard<std::mutex> lock(m_mutex);
std::vector<GpuPendingFreeEntry> result;
result.swap(m_pendingGpuFree);
return result;
if (minAgeMs <= 0) {
// Drain all (used by Destroy/Reconnect with cudaDeviceSynchronize)
std::vector<GpuPendingFreeEntry> result;
result.swap(m_pendingGpuFree);
return result;
}
// Drain only entries older than minAgeMs
auto now = std::chrono::steady_clock::now();
auto threshold = std::chrono::milliseconds(minAgeMs);
std::vector<GpuPendingFreeEntry> ready;
std::vector<GpuPendingFreeEntry> notReady;
for (auto& entry : m_pendingGpuFree) {
if (now - entry.queuedAt >= threshold)
ready.push_back(entry);
else
notReady.push_back(entry);
}
m_pendingGpuFree = std::move(notReady);
return ready;
}
// --- TTL eviction: force-free frames older than FRAME_TTL_SECONDS ---
@@ -506,10 +543,23 @@ private:
// Free malloc'd CPU NV12 buffers and GPU cache (but NOT avframe/cpuAvframe —
// those go to pendingFree for the caller to av_frame_free).
void freeOwnedBuffers_locked(GpuFrameData* frame) {
REG_DBG("freeOwnedBuffers: frame=%p cpuY=%p cpuUV=%p gpuCacheY=%p gpuCacheUV=%p bytes=%zu dev=%d",
REG_DBG("freeOwnedBuffers: frame=%p cpuY=%p cpuUV=%p gpuCacheY=%p gpuCacheUV=%p bytes=%zu dev=%d poolSlot=%p",
(void*)frame, (void*)frame->cpuYPlane, (void*)frame->cpuUvPlane,
frame->gpuCacheY, frame->gpuCacheUV,
frame->gpuCacheBytes, frame->gpuCacheDeviceIdx);
frame->gpuCacheBytes, frame->gpuCacheDeviceIdx, (void*)frame->poolSlot);
// Release global pool slot via DEFERRED release — the slot enters a
// "cooling" state for SLOT_COOLDOWN_MS (50ms) before it becomes
// available for reuse. This guarantees that any in-flight GPU kernels
// (launched asynchronously by inference engines) have completed reading
// from the buffer. CPU refcount→0 does NOT mean the GPU is done.
if (frame->poolSlot) {
GpuNV12SlotPool::deferRelease(frame->poolSlot);
frame->poolSlot = nullptr;
// yPlane/uvPlane pointed into the pool slot — null them to
// prevent any stale reads after this point.
frame->yPlane = nullptr;
frame->uvPlane = nullptr;
}
if (frame->cpuYPlane) {
std::free(frame->cpuYPlane);
frame->cpuYPlane = nullptr;
@@ -525,10 +575,11 @@ private:
frame->gpuCacheValid = false;
frame->gpuCacheBytes = 0;
int devIdx = frame->gpuCacheDeviceIdx;
auto now = std::chrono::steady_clock::now();
if (frame->gpuCacheY)
m_pendingGpuFree.push_back({frame->gpuCacheY, devIdx});
m_pendingGpuFree.push_back({frame->gpuCacheY, devIdx, now});
if (frame->gpuCacheUV)
m_pendingGpuFree.push_back({frame->gpuCacheUV, devIdx});
m_pendingGpuFree.push_back({frame->gpuCacheUV, devIdx, now});
frame->gpuCacheY = nullptr;
frame->gpuCacheUV = nullptr;
}

161
include/GpuNV12SlotPool.h Normal file
View File

@@ -0,0 +1,161 @@
#pragma once
// GpuNV12SlotPool.h — Process-wide GPU NV12 buffer pool.
//
// Provides pre-allocated CUDA buffer slots (Y + UV planes) that are shared
// across all RTSP camera instances. Slots are acquired per-frame by
// GetRTSPCVImage and released back to the pool when the GpuFrameData's
// refcount drops to 0 in freeOwnedBuffers_locked().
//
// KEY DESIGN: Slots are NEVER freed when a camera is destroyed — they are
// recycled. This decouples GPU buffer lifetime from camera lifetime, so
// inference engines can safely read NV12 data even after the camera object
// that produced it has been deleted and recreated (the LabVIEW reconnect
// pattern: ReleaseHandle → Destroy → delete → CreateHandle).
//
// TIME-DELAYED RELEASE: When a GpuFrameData's refcount drops to 0, the
// slot is NOT immediately available. It enters a "cooling" state for
// SLOT_COOLDOWN_MS (50ms) to guarantee that any in-flight GPU kernels
// (launched asynchronously by inference engines) have completed reading
// from the buffer. CUDA kernels typically complete in <10ms, so 50ms
// provides a 5x safety margin. The cooldown is kept short to minimize
// the number of slots in COOLING, which prevents POOL FULL events.
// POOL FULL triggers per-frame cudaMalloc/cudaFree, which holds the
// nvcuda64 SRW lock and causes cascading stalls on other cameras'
// cudaMemcpy2D operations.
//
// Thread-safe: acquire() locks internally, deferRelease() is lock-free.
//
// Cross-DLL: uses the same resolveProcessWide() singleton pattern as
// ANSGpuFrameRegistry. ANSCV.dll owns the canonical instance; other DLLs
// find it via GetProcAddress("GpuNV12SlotPool_GetInstance").
#include <vector>
#include <memory>
#include <mutex>
#include <atomic>
#include <cstdint>
#include <cstdio>
#include <chrono>
#ifdef _WIN32
#include <windows.h>
#endif
// Safety constants
// Hard cap on pool size: acquire() returns nullptr once this many slots
// exist and none are FREE, and the caller falls back to the CPU path.
static constexpr int GPU_NV12_POOL_MAX_SLOTS = 64;
static constexpr int SLOT_COOLDOWN_MS = 50; // Time after CPU release before slot reuse
// GPU kernels complete in <10ms; 50ms = 5× margin
// Debug logging for pool operations.
// Define ANSCORE_GPU_DEBUG=1 to enable verbose per-frame GPU logging.
// In production, these are silent to avoid OutputDebugString/fprintf
// lock contention (measured: 500-2000 calls/sec causes process stalls).
#ifndef NV12POOL_DBG
#if defined(ANSCORE_GPU_DEBUG) && ANSCORE_GPU_DEBUG
#ifdef _WIN32
#define NV12POOL_DBG(fmt, ...) do { \
char _p_buf[512]; \
snprintf(_p_buf, sizeof(_p_buf), "[NV12Pool] " fmt "\n", ##__VA_ARGS__); \
OutputDebugStringA(_p_buf); \
fprintf(stderr, "%s", _p_buf); \
} while(0)
#else
#define NV12POOL_DBG(fmt, ...) fprintf(stderr, "[NV12Pool] " fmt "\n", ##__VA_ARGS__)
#endif
#else
#define NV12POOL_DBG(fmt, ...) ((void)0)
#endif
#endif
// One pre-allocated GPU NV12 buffer (Y + UV planes) owned by GpuNV12SlotPool.
// Lifecycle: FREE → ACTIVE (acquire) → COOLING (deferRelease) → FREE
// (drainCooledSlots_locked, after SLOT_COOLDOWN_MS).
struct GpuNV12Slot {
void* bufY = nullptr; // cudaMallocPitch'd Y plane
void* bufUV = nullptr; // cudaMallocPitch'd UV plane
size_t pitchY = 0;
size_t pitchUV = 0;
int width = 0; // Resolution this slot was allocated for
int height = 0;
int gpuIdx = -1; // GPU device index
// Slot lifecycle state:
// FREE (0) = available for acquire()
// ACTIVE (1) = owned by a GpuFrameData (D2D copy + inference reading)
// COOLING (2) = CPU released but GPU kernel may still be reading;
// becomes FREE after SLOT_COOLDOWN_MS elapses.
static constexpr int STATE_FREE = 0;
static constexpr int STATE_ACTIVE = 1;
static constexpr int STATE_COOLING = 2;
std::atomic<int> state{STATE_FREE};
// Timestamp when the slot entered COOLING state.
// Only meaningful when state == STATE_COOLING.
// NOTE(review): this field is non-atomic; deferRelease() writes it before the
// release-store on `state`, so readers are only guaranteed to see a consistent
// value if they acquire-load `state` first (or hold the pool mutex) — confirm
// drainCooledSlots_locked() does one of the two.
std::chrono::steady_clock::time_point cooldownStart;
// Per-slot CUDA stream for D2D copy (non-blocking).
// CRITICAL: cudaMemcpy2D (no stream arg) uses the NULL stream, which on
// WDDM implicitly synchronizes with ALL other streams before executing.
// This means the D2D copy must wait for all inference kernels to finish
// first — causing 1-2 second stalls. Using a dedicated non-blocking
// stream avoids this implicit sync entirely.
// Stored as void* to avoid cuda_runtime.h in the header.
// NOTE(review): creation/destruction of this stream is not visible in this
// header — presumably the pool owns it; confirm it is destroyed (or leaked
// deliberately, since slots are never freed) alongside the slot.
void* copyStream = nullptr; // cudaStream_t
};
class GpuNV12SlotPool {
public:
    // Process-wide singleton (same pattern as ANSGpuFrameRegistry).
    // On Windows the canonical instance is resolved once across DLLs via
    // resolveProcessWide(); elsewhere a plain function-local static suffices.
    static GpuNV12SlotPool& instance() {
#ifdef _WIN32
        static GpuNV12SlotPool* s_inst = resolveProcessWide();
        return *s_inst;
#else
        static GpuNV12SlotPool s_pool;
        return s_pool;
#endif
    }

    // Acquire a free slot matching (gpuIdx, w, h).
    // Drains cooled-down slots first, then looks for a FREE match.
    // If none, allocates a new one (up to GPU_NV12_POOL_MAX_SLOTS).
    // Returns nullptr if pool full — caller falls back to CPU path.
    GpuNV12Slot* acquire(int gpuIdx, int w, int h);

    // Deferred release: moves slot from ACTIVE → COOLING without taking the
    // pool lock. Called from freeOwnedBuffers_locked() when a GpuFrameData's
    // refcount reaches 0. The slot becomes FREE once SLOT_COOLDOWN_MS has
    // elapsed (checked inside acquire()). The timestamp is written before the
    // release-store on `state` so an acquire-load of `state` sees it.
    static void deferRelease(GpuNV12Slot* slot) {
        if (!slot)
            return;
        slot->cooldownStart = std::chrono::steady_clock::now();
        slot->state.store(GpuNV12Slot::STATE_COOLING, std::memory_order_release);
    }

    // Total slots ever allocated (diagnostics only).
    size_t slotCount() const {
        std::lock_guard<std::mutex> guard(m_mutex);
        return m_slots.size();
    }

    // Slots not currently FREE, i.e. ACTIVE or COOLING (diagnostics only).
    size_t activeCount() const {
        std::lock_guard<std::mutex> guard(m_mutex);
        size_t busy = 0;
        for (const auto& slot : m_slots) {
            const int st = slot->state.load(std::memory_order_relaxed);
            if (st != GpuNV12Slot::STATE_FREE)
                ++busy;
        }
        return busy;
    }

private:
    GpuNV12SlotPool() = default;
#ifdef _WIN32
    static GpuNV12SlotPool* resolveProcessWide();
#endif
    // Transition all COOLING slots that have exceeded SLOT_COOLDOWN_MS to FREE.
    // Called at the start of acquire() under the lock.
    void drainCooledSlots_locked();

    mutable std::mutex m_mutex;
    std::vector<std::unique_ptr<GpuNV12Slot>> m_slots;
};