Fix NV12 crash when recreating the camera object

(new structure) does not work
2026-04-03 14:51:52 +11:00
parent 958cab6ae3
commit 6fb09830c5
16 changed files with 854 additions and 209 deletions
--- a/modules/ANSCV/ANSRTSP.cpp
+++ b/modules/ANSCV/ANSRTSP.cpp
@@ -1,6 +1,7 @@
 #include "ANSRTSP.h"
 #include "ANSMatRegistry.h"
 #include "ANSGpuFrameOps.h"
+#include "GpuNV12SlotPool.h"
 #include <memory>
 #include <format>
 #include "media_codec.h"
@@ -23,8 +24,9 @@ extern "C"
 // Note: per-instance thread safety is handled by ANSRTSPClient::_mutex
 // Mat registry thread safety is handled by anscv_mat_replace's internal registry_mutex

-// Debug logging — goes to both stderr AND OutputDebugString (DebugView).
+// Debug logging. Define ANSCORE_GPU_DEBUG=1 to enable verbose per-frame logging.
 #ifndef RTSP_DBG
+#if defined(ANSCORE_GPU_DEBUG) && ANSCORE_GPU_DEBUG
 #ifdef _WIN32
 #define RTSP_DBG(fmt, ...) do { \
    char _rtsp_buf[512]; \
@@ -35,6 +37,9 @@ extern "C"
 #else
 #define RTSP_DBG(fmt, ...) fprintf(stderr, fmt "\n", ##__VA_ARGS__)
 #endif
+#else
+#define RTSP_DBG(fmt, ...) ((void)0)
+#endif
 #endif
 static bool ansrtspLicenceValid = false;
 // Global once_flag to protect license checking
@@ -62,6 +67,7 @@ namespace ANSCENTER {
    ANSRTSPClient::~ANSRTSPClient() noexcept {
 		Destroy();
    }
+
    void ANSRTSPClient::Destroy() {
        // Move the player client pointer out of the lock scope, then
        // close it OUTSIDE the mutex.  close() calls cuArrayDestroy /
@@ -80,69 +86,44 @@ namespace ANSCENTER {
                }
            }

-            // --- Inference guard: wait for in-flight frames to finish ---
-            // GetRTSPCVImage increments _inFlightFrames when it hands out
-            // a GPU frame; the registry decrements it when the frame is
-            // released after inference completes.  We wait here so that
-            // close() doesn't free NVDEC surfaces while TensorRT is
-            // still reading from them (the LabVIEW crash root cause).
+            // --- Inference guard: wait for in-flight D2D copies to finish ---
+            // With synchronous D2D copy, in-flight means "currently inside
+            // GetRTSPCVImage between TryIncrementInFlight and attach_cuda".
+            // This is typically <1ms, so the wait is very fast.
            int inFlight = _inFlightFrames.load(std::memory_order_acquire);
            if (inFlight > 0) {
                _logger.LogInfo("ANSRTSPClient::Destroy",
-                    std::format("waiting for {} in-flight inference frame(s)...", inFlight),
+                    std::format("waiting for {} in-flight frame(s)...", inFlight),
                    __FILE__, __LINE__);
                bool done = _inFlightDone.wait_for(lock, std::chrono::seconds(5), [this] {
                    return _inFlightFrames.load(std::memory_order_acquire) <= 0;
                });
                if (!done) {
                    _logger.LogWarn("ANSRTSPClient::Destroy",
-                        std::format("timed out waiting for in-flight frames "
-                                    "(still {} in-flight) — force-releasing GPU frames",
-                                    _inFlightFrames.load()),
+                        std::format("timed out — still {} in-flight", _inFlightFrames.load()),
                        __FILE__, __LINE__);
                }
            }

-            // Force-release ALL GPU frames owned by this client BEFORE close().
-            // Unreleased clones (e.g. LabVIEW AI tasks still holding cloned
-            // cv::Mat*) keep gpuCacheY/gpuCacheUV allocated.  We must cudaFree
-            // them NOW while the CUDA context is still alive.  After close()
-            // destroys the context, cudaFree would crash.
-            int forceReleased = ANSGpuFrameRegistry::instance().forceReleaseByOwner(this);
-            if (forceReleased > 0) {
-                _logger.LogWarn("ANSRTSPClient::Destroy",
-                    std::format("force-released {} GPU frame(s) with unreleased clones", forceReleased),
-                    __FILE__, __LINE__);
-                // Drain and cudaFree the GPU buffers while CUDA context is alive
-                // Sync all GPU streams before freeing to avoid illegal access
-                cudaDeviceSynchronize();
-                auto gpuPending = ANSGpuFrameRegistry::instance().drain_gpu_pending();
-                if (!gpuPending.empty()) {
-                    RTSP_DBG("[Destroy] cudaFree %zu GPU ptrs before close()", gpuPending.size());
-                    int prevDev = -1;
-                    cudaGetDevice(&prevDev);
-                    for (auto& entry : gpuPending) {
-                        if (entry.ptr) {
-                            if (entry.deviceIdx >= 0) cudaSetDevice(entry.deviceIdx);
-                            cudaFree(entry.ptr);
-                        }
-                    }
-                    if (prevDev >= 0) cudaSetDevice(prevDev);
-                }
-                // Also drain any pending AVFrames
-                auto avPending = ANSGpuFrameRegistry::instance().drain_pending();
-                for (void* p : avPending) {
-                    AVFrame* f = static_cast<AVFrame*>(p);
-                    av_frame_free(&f);
-                }
-            }
+            // Invalidate owner callbacks so stale GpuFrameData don't try to
+            // call DecrementInFlight on this (soon-to-be-deleted) object.
+            // The GpuFrameData and their global pool slots remain alive —
+            // inference engines can safely keep reading from them.
            ANSGpuFrameRegistry::instance().invalidateOwner(this);
            _inFlightFrames.store(0, std::memory_order_release);

+            // NO forceReleaseByOwner — frames survive camera deletion.
+            // Pool slot buffers are global (GpuNV12SlotPool) — NOT owned
+            // by this camera.  They are recycled when inference finishes
+            // (GpuFrameData refcount → 0 → slot.inUse = false).
+            // NO cudaDeviceSynchronize — no GPU buffers to free here.
+            // NO DestroyGpuPool — per-camera pool has been removed.
+
            clientToClose = std::move(_playerClient);
        }
-        // CUDA cleanup happens here, outside the mutex — now safe.
-        // All GPU frames owned by this client have been force-freed above.
+        // close() destroys the NVDEC decoder ONLY.  Pool slot buffers
+        // (regular cudaMallocPitch allocations) are untouched — they
+        // belong to the global GpuNV12SlotPool, not the decoder.
        if (clientToClose) {
            clientToClose->close();
        }
@@ -232,66 +213,44 @@ namespace ANSCENTER {
    bool ANSRTSPClient::Reconnect() {
        // 1. Mark as not-playing under the mutex FIRST.  This makes GetImage()
        //    return the cached _pLastFrame instead of calling into the player,
-        //    preventing use-after-free when close() destroys CUDA resources.
+        //    and blocks new TryIncrementInFlight calls.
        {
            std::unique_lock<std::recursive_mutex> lock(_mutex);
            _isPlaying = false;

-            // --- Inference guard: wait for in-flight frames to finish ---
-            // Same guard as Destroy(): close() will free NVDEC surfaces, so
-            // we must wait for any inference engines still reading NV12 data
-            // via zero-copy CUDA device pointers.
+            // --- Inference guard: wait for in-flight D2D copies to finish ---
+            // With synchronous D2D copy, in-flight means "currently inside
+            // GetRTSPCVImage between TryIncrementInFlight and attach_cuda".
+            // This is typically <1ms, so the wait is very fast.
            int inFlight = _inFlightFrames.load(std::memory_order_acquire);
            if (inFlight > 0) {
                _logger.LogInfo("ANSRTSPClient::Reconnect",
-                    std::format("waiting for {} in-flight inference frame(s)...", inFlight),
+                    std::format("waiting for {} in-flight frame(s)...", inFlight),
                    __FILE__, __LINE__);
                bool done = _inFlightDone.wait_for(lock, std::chrono::seconds(5), [this] {
                    return _inFlightFrames.load(std::memory_order_acquire) <= 0;
                });
                if (!done) {
                    _logger.LogWarn("ANSRTSPClient::Reconnect",
-                        std::format("timed out waiting for in-flight frames "
-                                    "(still {} in-flight) — force-releasing GPU frames",
-                                    _inFlightFrames.load()),
+                        std::format("timed out — still {} in-flight", _inFlightFrames.load()),
                        __FILE__, __LINE__);
                }
            }

-            // Force-release GPU frames before close() — same as Destroy().
-            int forceReleased = ANSGpuFrameRegistry::instance().forceReleaseByOwner(this);
-            if (forceReleased > 0) {
-                _logger.LogWarn("ANSRTSPClient::Reconnect",
-                    std::format("force-released {} GPU frame(s) with unreleased clones", forceReleased),
-                    __FILE__, __LINE__);
-                // Sync all GPU streams before freeing
-                cudaDeviceSynchronize();
-                auto gpuPending = ANSGpuFrameRegistry::instance().drain_gpu_pending();
-                if (!gpuPending.empty()) {
-                    int prevDev = -1;
-                    cudaGetDevice(&prevDev);
-                    for (auto& entry : gpuPending) {
-                        if (entry.ptr) {
-                            if (entry.deviceIdx >= 0) cudaSetDevice(entry.deviceIdx);
-                            cudaFree(entry.ptr);
-                        }
-                    }
-                    if (prevDev >= 0) cudaSetDevice(prevDev);
-                }
-                auto avPending = ANSGpuFrameRegistry::instance().drain_pending();
-                for (void* p : avPending) {
-                    AVFrame* f = static_cast<AVFrame*>(p);
-                    av_frame_free(&f);
-                }
-            }
+            // Invalidate owner callbacks — prevents stale DecrementInFlight
+            // calls after Reconnect re-creates the decoder.
+            // Frames and their global pool slots remain alive for inference.
            ANSGpuFrameRegistry::instance().invalidateOwner(this);
            _inFlightFrames.store(0, std::memory_order_release);
+
+            // NO forceReleaseByOwner — frames survive reconnect.
+            // NO cudaDeviceSynchronize — no GPU buffers to free.
+            // NO DestroyGpuPool — per-camera pool has been removed.
        }

-        // 2. close() does CUDA cleanup (cuArrayDestroy/cuMemFree) — run outside
-        //    _mutex to avoid deadlocking with nvcuda64 SRW lock held by inference.
-        //    Safe now because GetImage()/GetNV12Frame() won't touch the player
-        //    while _isPlaying == false, and all in-flight frames have been released.
+        // 2. close() destroys NVDEC decoder ONLY — run outside _mutex to
+        //    avoid deadlocking with nvcuda64 SRW lock held by inference.
+        //    Pool slot buffers are global and untouched.
        _logger.LogInfo("ANSRTSPClient::Reconnect",
            "calling close() — NVDEC decoder will be destroyed", __FILE__, __LINE__);
        RTSP_DBG("[Reconnect] BEFORE close() this=%p", (void*)this);
@@ -1071,6 +1030,8 @@ extern "C" __declspec(dllexport) int GetRTSPCVImage(
    }

    try {
+        auto t0 = std::chrono::steady_clock::now();
+
        // Get image (shallow copy - reference counted, fast)
        cv::Mat img = (*Handle)->GetImage(width, height, timeStamp);

@@ -1082,6 +1043,8 @@ extern "C" __declspec(dllexport) int GetRTSPCVImage(
        // Thread-safe Mat pointer swap (anscv_mat_replace has its own internal lock)
        anscv_mat_replace(image, std::move(img));

+        auto t1 = std::chrono::steady_clock::now();
+
        // Attach NV12 frame for GPU fast-path inference (side-table registry)
        // attach() takes ownership — do NOT av_frame_free here
        //
@@ -1101,7 +1064,11 @@ extern "C" __declspec(dllexport) int GetRTSPCVImage(
                        cudaHW->width, cudaHW->height,
                        (void*)cudaHW->data[0], (void*)cudaHW->data[1]);
                AVFrame* cpuNV12 = (*Handle)->GetNV12Frame();
-                gpu_frame_attach_cuda(*image, cudaHW, gpuIdx, timeStamp, cpuNV12);
+
+                // Acquire a slot from the global pool — survives camera Destroy.
+                GpuNV12Slot* slot = GpuNV12SlotPool::instance().acquire(
+                    gpuIdx, cudaHW->width, cudaHW->height);
+                gpu_frame_attach_cuda(*image, cudaHW, gpuIdx, timeStamp, cpuNV12, slot);
            } else {
                // HW decode not active — try CPU NV12
                AVFrame* nv12 = (*Handle)->GetNV12Frame();
@@ -1114,11 +1081,11 @@ extern "C" __declspec(dllexport) int GetRTSPCVImage(
            // TryIncrementInFlight already incremented; DecrementInFlight fires
            // when the last clone of this frame is released after inference.
            auto* gpuData = ANSGpuFrameRegistry::instance().lookup(*image);
-            RTSP_DBG("[GetRTSPCVImage] after attach: gpuData=%p yPlane=%p isCuda=%d gpuCacheY=%p",
+            RTSP_DBG("[GetRTSPCVImage] after attach: gpuData=%p yPlane=%p isCuda=%d poolSlot=%p",
                    (void*)gpuData,
                    gpuData ? (void*)gpuData->yPlane : nullptr,
                    gpuData ? (int)gpuData->isCudaDevicePtr : -1,
-                    gpuData ? gpuData->gpuCacheY : nullptr);
+                    gpuData ? (void*)gpuData->poolSlot : nullptr);
            if (gpuData) {
                gpuData->ownerClient = *Handle;
                gpuData->onReleaseFn = [](void* client) {
@@ -1136,6 +1103,20 @@ extern "C" __declspec(dllexport) int GetRTSPCVImage(
            RTSP_DBG("[GetRTSPCVImage] SKIP CUDA — player not playing (reconnecting?)");
        }

+        // Lightweight timing via spdlog (no OutputDebugString).
+        // Logs only when the frame grab + D2D exceeds 50ms — helps diagnose stalls
+        // without the overhead of per-frame debug logging.
+        auto t2 = std::chrono::steady_clock::now();
+        double getImageMs = std::chrono::duration<double, std::milli>(t1 - t0).count();
+        double cudaMs     = std::chrono::duration<double, std::milli>(t2 - t1).count();
+        double totalMs    = getImageMs + cudaMs;
+        if (totalMs > 50.0) {
+            (*Handle)->_logger.LogWarn("GetRTSPCVImage",
+                std::format("SLOW FRAME: total={:.1f}ms (getImage={:.1f}ms cuda={:.1f}ms) {}x{}",
+                    totalMs, getImageMs, cudaMs, width, height),
+                __FILE__, __LINE__);
+        }
+
        return 1;  // Success
    }
    catch (const cv::Exception& e) {