Fix NV12 crash issue when recreate camera object

2026-04-02 22:07:27 +11:00
parent 4bedf3a3a2
commit 958cab6ae3
25 changed files with 1459 additions and 393 deletions
--- a/modules/ANSCV/ANSGpuFrameOps.h
+++ b/modules/ANSCV/ANSGpuFrameOps.h
@@ -19,8 +19,31 @@ extern "C" {
 #include "libavutil/frame.h"
 }

+#include <cuda_runtime.h>
 #include <cstring>
 #include <cstdlib>
+#include <cstdio>
+
+#ifdef _WIN32
+#include <windows.h>
+#endif
+
+// Debug logging macro for GPU frame operations.
+// Output goes to stderr (console) AND OutputDebugString (DebugView / VS debugger).
+// Use Sysinternals DebugView (dbgview64.exe) to capture these after a crash.
+#ifndef GPU_FRAME_DBG
+#ifdef _WIN32
+#define GPU_FRAME_DBG(fmt, ...) do { \
+    char _gpu_dbg_buf[512]; \
+    snprintf(_gpu_dbg_buf, sizeof(_gpu_dbg_buf), "[GpuFrameOps] " fmt "\n", ##__VA_ARGS__); \
+    OutputDebugStringA(_gpu_dbg_buf); \
+    fprintf(stderr, "%s", _gpu_dbg_buf); \
+} while(0)
+#else
+#define GPU_FRAME_DBG(fmt, ...) \
+    fprintf(stderr, "[GpuFrameOps] " fmt "\n", ##__VA_ARGS__)
+#endif
+#endif

 namespace anscv_gpu_ops {
 namespace detail {
@@ -71,6 +94,42 @@ inline bool snapshotNV12Planes(const AVFrame* nv12,
    return true;
 }

+// Drain pending GPU device pointers and actually cudaFree them.
+// Must be called from a thread with CUDA context available.
+inline void drainAndFreeGpuPending() {
+    auto gpuPending = ANSGpuFrameRegistry::instance().drain_gpu_pending();
+    if (gpuPending.empty()) return;
+    GPU_FRAME_DBG("drainGpuPending: freeing %zu GPU ptrs", gpuPending.size());
+    int prevDev = -1;
+    cudaGetDevice(&prevDev);
+
+    // Group by device to minimize cudaSetDevice calls and synchronize once per device.
+    // cudaDeviceSynchronize() is CRITICAL: NV12 kernels run on cv::cuda::Stream
+    // (not the default stream).  cudaFree on stream 0 doesn't wait for other
+    // streams, so without this sync, cudaFree can free a buffer while a kernel
+    // on another stream is still reading from it → cudaErrorIllegalAddress (700)
+    // which permanently corrupts the CUDA context.
+    int lastSyncDev = -1;
+    for (auto& entry : gpuPending) {
+        if (entry.ptr) {
+            if (entry.deviceIdx >= 0)
+                cudaSetDevice(entry.deviceIdx);
+            if (entry.deviceIdx != lastSyncDev) {
+                cudaDeviceSynchronize();
+                lastSyncDev = entry.deviceIdx;
+            }
+            GPU_FRAME_DBG("drainGpuPending: cudaFree(%p) dev=%d", entry.ptr, entry.deviceIdx);
+            cudaError_t err = cudaFree(entry.ptr);
+            if (err != cudaSuccess) {
+                GPU_FRAME_DBG("drainGpuPending: cudaFree FAILED err=%d (%s)",
+                              (int)err, cudaGetErrorString(err));
+            }
+        }
+    }
+    if (prevDev >= 0)
+        cudaSetDevice(prevDev);
+}
+
 } // namespace detail
 } // namespace anscv_gpu_ops

@@ -117,36 +176,44 @@ inline void gpu_frame_attach(cv::Mat* mat, AVFrame* nv12, int gpuIdx, int64_t pt
    }
 }

-// Attach CUDA HW frame — keeps CUDA device pointers for zero-copy inference.
+// Attach CUDA HW frame — copies NV12 from NVDEC surfaces to owned GPU memory.
 // TAKES OWNERSHIP of cudaFrame AND cpuNV12 — caller must NOT av_frame_free after.
 //
-// Primary path: yPlane/uvPlane point to CUDA device pointers from the cloned
-// AVFrame (data[0]/data[1]).  The cloned AVFrame keeps the NVDEC surface alive
-// until gpu_frame_remove() is called after inference.  With 4 cameras each
-// holding ~1 surface, this uses 4 of NVDEC's 25-32 surface pool — safe.
+// D2D copy path: cudaMemcpy2D from NVDEC surfaces to cudaMalloc'd buffers on the
+// same GPU.  This decouples the NV12 data lifetime from the NVDEC decoder, so
+// player->close() can safely destroy the decoder at any time without invalidating
+// pointers that inference engines may be reading.  The NVDEC surface is freed
+// immediately (av_frame_free), returning it to the decoder's surface pool.
+//
+// The owned GPU pointers are stored as both yPlane/uvPlane (for zero-copy reads)
+// and gpuCacheY/gpuCacheUV (for lifecycle management / cudaFree on cleanup).
+//
+// VRAM budget: if the global GPU cache budget is exceeded, falls back to CPU-only
+// NV12 snapshot (no zero-copy, but safe).
 //
 // Fallback: cpuYPlane/cpuUvPlane hold CPU-side NV12 snapshot for cross-GPU
-// inference (when decode GPU != inference GPU, CUDA device ptrs aren't
-// accessible from another GPU context).
+// inference (when decode GPU != inference GPU).
 inline void gpu_frame_attach_cuda(cv::Mat* mat, AVFrame* cudaFrame, int gpuIdx, int64_t pts,
                                   AVFrame* cpuNV12 = nullptr) {
-    if (!mat || !cudaFrame) return;
+    if (!mat || !cudaFrame) {
+        GPU_FRAME_DBG("attach_cuda: SKIP mat=%p cudaFrame=%p", (void*)mat, (void*)cudaFrame);
+        return;
+    }
+
+    const int w = cudaFrame->width;
+    const int h = cudaFrame->height;
+    GPU_FRAME_DBG("attach_cuda: START mat=%p %dx%d gpu=%d nvdecY=%p nvdecUV=%p cpuNV12=%p",
+                  (void*)mat, w, h, gpuIdx,
+                  (void*)cudaFrame->data[0], (void*)cudaFrame->data[1], (void*)cpuNV12);

    GpuFrameData data{};
    data.gpuIndex        = gpuIdx;
    data.pts             = pts;
-    data.width           = cudaFrame->width;
-    data.height          = cudaFrame->height;
-    data.pixelFormat     = 23; // AV_PIX_FMT_NV12 — the underlying sw_format
+    data.width           = w;
+    data.height          = h;
+    data.pixelFormat     = 23; // AV_PIX_FMT_NV12

-    // Primary: CUDA device pointers from NVDEC (zero-copy on same GPU)
-    data.isCudaDevicePtr = true;
-    data.yPlane          = cudaFrame->data[0];   // CUDA device ptr: Y plane
-    data.uvPlane         = cudaFrame->data[1];   // CUDA device ptr: UV plane
-    data.yLinesize       = cudaFrame->linesize[0];
-    data.uvLinesize      = cudaFrame->linesize[1];
-
-    // Fallback: snapshot CPU NV12 for cross-GPU inference
+    // Snapshot CPU NV12 for cross-GPU fallback (must do before freeing cpuNV12)
    if (cpuNV12) {
        anscv_gpu_ops::detail::snapshotNV12Planes(
            cpuNV12,
@@ -155,9 +222,98 @@ inline void gpu_frame_attach_cuda(cv::Mat* mat, AVFrame* cudaFrame, int gpuIdx,
            data.width, data.height);
    }

-    // Store AVFrames for cleanup (cudaFrame keeps NVDEC surface alive)
-    data.avframe    = cudaFrame;
-    data.cpuAvframe = cpuNV12;
+    // --- D2D copy: NVDEC surface → owned GPU memory ---
+    // Estimate VRAM needed for the owned NV12 copy
+    const size_t yBytes  = static_cast<size_t>(w) * h;
+    const size_t uvBytes = static_cast<size_t>(w) * (h / 2);
+    const size_t totalBytes = yBytes + uvBytes;
+
+    bool d2dOk = false;
+    if (ANSGpuFrameRegistry::instance().canAllocateGpuCache(totalBytes)) {
+        int prevDev = -1;
+        cudaGetDevice(&prevDev);
+        if (gpuIdx >= 0)
+            cudaSetDevice(gpuIdx);
+
+        void*  ownedY  = nullptr;
+        void*  ownedUV = nullptr;
+        size_t yPitch  = 0;
+        size_t uvPitch = 0;
+
+        cudaError_t e1 = cudaMallocPitch(&ownedY,  &yPitch,  w, h);
+        cudaError_t e2 = cudaMallocPitch(&ownedUV, &uvPitch, w, h / 2);
+
+        if (e1 == cudaSuccess && e2 == cudaSuccess) {
+            cudaError_t e3 = cudaMemcpy2D(ownedY,  yPitch,
+                                           cudaFrame->data[0], cudaFrame->linesize[0],
+                                           w, h, cudaMemcpyDeviceToDevice);
+            cudaError_t e4 = cudaMemcpy2D(ownedUV, uvPitch,
+                                           cudaFrame->data[1], cudaFrame->linesize[1],
+                                           w, h / 2, cudaMemcpyDeviceToDevice);
+
+            if (e3 == cudaSuccess && e4 == cudaSuccess) {
+                // Store owned GPU pointers as primary NV12 source
+                data.isCudaDevicePtr = true;
+                data.yPlane          = static_cast<uint8_t*>(ownedY);
+                data.uvPlane         = static_cast<uint8_t*>(ownedUV);
+                data.yLinesize       = static_cast<int>(yPitch);
+                data.uvLinesize      = static_cast<int>(uvPitch);
+
+                // Track in gpuCache for lifecycle management (cudaFree on cleanup)
+                data.gpuCacheY          = ownedY;
+                data.gpuCacheUV         = ownedUV;
+                data.gpuCacheYPitch     = yPitch;
+                data.gpuCacheUVPitch    = uvPitch;
+                data.gpuCacheDeviceIdx  = gpuIdx;
+                data.gpuCacheValid      = true;
+                data.gpuCacheBytes      = yPitch * h + uvPitch * (h / 2);
+
+                ANSGpuFrameRegistry::instance().onGpuCacheCreated(data.gpuCacheBytes);
+                d2dOk = true;
+                GPU_FRAME_DBG("attach_cuda: D2D OK ownedY=%p ownedUV=%p yPitch=%zu uvPitch=%zu bytes=%zu",
+                              ownedY, ownedUV, yPitch, uvPitch, data.gpuCacheBytes);
+            } else {
+                // D2D copy failed — free allocated memory and fall back
+                GPU_FRAME_DBG("attach_cuda: D2D COPY FAILED e3=%d e4=%d — fallback CPU",
+                              (int)e3, (int)e4);
+                cudaFree(ownedY);
+                cudaFree(ownedUV);
+            }
+        } else {
+            // Allocation failed — free any partial allocation and fall back
+            GPU_FRAME_DBG("attach_cuda: cudaMallocPitch FAILED e1=%d e2=%d — fallback CPU",
+                          (int)e1, (int)e2);
+            if (e1 == cudaSuccess) cudaFree(ownedY);
+            if (e2 == cudaSuccess) cudaFree(ownedUV);
+        }
+
+        if (prevDev >= 0)
+            cudaSetDevice(prevDev);
+    }
+
+    if (!d2dOk) {
+        // Fall back to CPU NV12 snapshot only (no zero-copy)
+        GPU_FRAME_DBG("attach_cuda: FALLBACK CPU-only cpuY=%p cpuUV=%p",
+                      (void*)data.cpuYPlane, (void*)data.cpuUvPlane);
+        data.isCudaDevicePtr = false;
+        data.yPlane          = data.cpuYPlane;
+        data.uvPlane         = data.cpuUvPlane;
+        data.yLinesize       = data.cpuYLinesize;
+        data.uvLinesize      = data.cpuUvLinesize;
+    }
+
+    // Release AVFrames immediately — NVDEC surfaces returned to pool.
+    // No longer stored in GpuFrameData (owned GPU copy is independent).
+    GPU_FRAME_DBG("attach_cuda: freeing AVFrames cudaFrame=%p cpuNV12=%p",
+                  (void*)cudaFrame, (void*)cpuNV12);
+    av_frame_free(&cudaFrame);
+    if (cpuNV12) av_frame_free(&cpuNV12);
+    data.avframe    = nullptr;
+    data.cpuAvframe = nullptr;
+
+    GPU_FRAME_DBG("attach_cuda: FINAL yPlane=%p uvPlane=%p isCuda=%d gpuCacheY=%p gpuCacheUV=%p",
+                  (void*)data.yPlane, (void*)data.uvPlane, (int)data.isCudaDevicePtr,
+                  data.gpuCacheY, data.gpuCacheUV);

    void* old = ANSGpuFrameRegistry::instance().attach(mat, std::move(data));
    if (old) {
@@ -165,17 +321,23 @@ inline void gpu_frame_attach_cuda(cv::Mat* mat, AVFrame* cudaFrame, int gpuIdx,
        av_frame_free(&oldFrame);
    }

+    // Free stale AVFrames evicted by TTL or previous attach
    auto pending = ANSGpuFrameRegistry::instance().drain_pending();
    for (void* p : pending) {
        AVFrame* stale = static_cast<AVFrame*>(p);
        av_frame_free(&stale);
    }
+
+    // Free stale GPU device pointers
+    anscv_gpu_ops::detail::drainAndFreeGpuPending();
 }

-// Release entry by cv::Mat* and free any returned AVFrames. Safe if not in map (no-op).
+// Release entry by cv::Mat* and free any returned AVFrames + GPU pointers.
+// Safe if not in map (no-op).
 inline void gpu_frame_remove(cv::Mat* mat) {
    if (!mat) return;

+    GPU_FRAME_DBG("gpu_frame_remove: mat=%p", (void*)mat);
    ANSGpuFrameRegistry::instance().release(mat);

    // Free any AVFrames that became pending from this release or prior eviction
@@ -186,13 +348,7 @@ inline void gpu_frame_remove(cv::Mat* mat) {
    }

    // Free any GPU device pointers that became pending
-    auto gpuPending = gpu_frame_drain_gpu_pending();
-    // NOTE: cudaFree requires CUDA context — caller must be on a CUDA-capable thread.
-    // If not, these will leak. In practice, gpu_frame_remove is called from ANSCV
-    // camera threads which do have CUDA context.
-    // For safety, we skip cudaFree here and let NV12PreprocessHelper handle it.
-    // The GPU pointers are tracked in the budget and will be accounted for.
-    (void)gpuPending;
+    anscv_gpu_ops::detail::drainAndFreeGpuPending();
 }

 // Alias for remove — used in ANSCV mutating functions to drop stale GPU data.
@@ -209,4 +365,7 @@ inline void gpu_frame_evict_stale() {
        AVFrame* stale = static_cast<AVFrame*>(p);
        av_frame_free(&stale);
    }
+
+    // Free any GPU device pointers from evicted frames
+    anscv_gpu_ops::detail::drainAndFreeGpuPending();
 }