Fix NV12 crash when recreating the camera object

2026-04-02 22:07:27 +11:00
parent 4bedf3a3a2
commit 958cab6ae3
25 changed files with 1459 additions and 393 deletions
--- a/modules/ANSCV/ANSRTSP.cpp
+++ b/modules/ANSCV/ANSRTSP.cpp
@@ -2,6 +2,7 @@
 #include "ANSMatRegistry.h"
 #include "ANSGpuFrameOps.h"
 #include <memory>
+#include <format>
 #include "media_codec.h"
 #include <cstdint>
 #include <cuda_runtime.h>
@@ -21,6 +22,20 @@ extern "C"
 }
 // Note: per-instance thread safety is handled by ANSRTSPClient::_mutex
 // Mat registry thread safety is handled by anscv_mat_replace's internal registry_mutex
+
+// Debug logging: on Windows the message goes to both OutputDebugStringA (DebugView) and stderr, formatted into a 512-byte buffer; elsewhere it goes to stderr only.  fmt must be a string literal.
+#ifndef RTSP_DBG
+#ifdef _WIN32
+#define RTSP_DBG(fmt, ...) do { \
+    char _rtsp_buf[512]; \
+    snprintf(_rtsp_buf, sizeof(_rtsp_buf), fmt "\n", ##__VA_ARGS__); \
+    OutputDebugStringA(_rtsp_buf); \
+    fprintf(stderr, "%s", _rtsp_buf); \
+} while(0)
+#else
+#define RTSP_DBG(fmt, ...) fprintf(stderr, fmt "\n", ##__VA_ARGS__)
+#endif
+#endif
 static bool ansrtspLicenceValid = false;
 // Global once_flag to protect license checking
 static std::once_flag ansrtspLicenseOnceFlag;
@@ -48,19 +63,88 @@ namespace ANSCENTER {
 		Destroy();
    }
    void ANSRTSPClient::Destroy() {
-        std::lock_guard<std::recursive_mutex> lock(_mutex);
-        if (_playerClient) {
-            // Stop the stream first so the video decoder is flushed and
-            // the RTSP callback thread is no longer feeding frames into
-            // decode().  Without this, rtsp_close() can block waiting for
-            // CRtspClient::m_pMutex (held by the callback mid-decode),
-            // and the hardware decoder flush during destruction can hang
-            // on the GPU.
-            if (_isPlaying) {
-                _playerClient->stop();
-                _isPlaying = false;
+        // Phase 1 (under _mutex): stop playback, wait for in-flight frames,
+        // force-release GPU frames, then move the player client out.
+        // Phase 2 (outside _mutex): close() it.  close() calls cuArrayDestroy /
+        // cuMemFree, which acquire an EXCLUSIVE SRW lock inside nvcuda64.  If
+        // we held _mutex during close() while another thread holds that SRW
+        // lock (e.g. cuStreamSynchronize during inference), we could deadlock:
+        // Stop() → _mutex → nvcuda64 vs inference → nvcuda64 → (exclusive waiter).
+        decltype(_playerClient) clientToClose;
+        {
+            std::unique_lock<std::recursive_mutex> lock(_mutex);
+            if (_playerClient) {
+                if (_isPlaying) {
+                    _playerClient->stop();
+                    _isPlaying = false;
+                }
             }
-            _playerClient->close();
+
+            // --- Inference guard: wait for in-flight frames to finish ---
+            // GetRTSPCVImage increments _inFlightFrames when it hands out
+            // a GPU frame; the registry decrements it when the frame is
+            // released after inference completes.  We wait here, for at most
+            // 5 seconds, so that close() doesn't free NVDEC surfaces while
+            // TensorRT is still reading from them (the LabVIEW crash root cause).
+            int inFlight = _inFlightFrames.load(std::memory_order_acquire);
+            if (inFlight > 0) {
+                _logger.LogInfo("ANSRTSPClient::Destroy",
+                    std::format("waiting for {} in-flight inference frame(s)...", inFlight),
+                    __FILE__, __LINE__);
+                bool done = _inFlightDone.wait_for(lock, std::chrono::seconds(5), [this] {
+                    return _inFlightFrames.load(std::memory_order_acquire) <= 0;
+                });
+                if (!done) {
+                    _logger.LogWarn("ANSRTSPClient::Destroy",
+                        std::format("timed out waiting for in-flight frames "
+                                    "(still {} in-flight) — force-releasing GPU frames",
+                                    _inFlightFrames.load()),
+                        __FILE__, __LINE__);
+                }
+            }
+
+            // Force-release ALL GPU frames owned by this client BEFORE close().
+            // Unreleased clones (e.g. LabVIEW AI tasks still holding cloned
+            // cv::Mat*) keep gpuCacheY/gpuCacheUV allocated.  We must cudaFree
+            // them NOW while the CUDA context is still alive.  After close()
+            // destroys the context, cudaFree would crash.
+            int forceReleased = ANSGpuFrameRegistry::instance().forceReleaseByOwner(this);
+            if (forceReleased > 0) {
+                _logger.LogWarn("ANSRTSPClient::Destroy",
+                    std::format("force-released {} GPU frame(s) with unreleased clones", forceReleased),
+                    __FILE__, __LINE__);
+                // Sync all GPU streams, then drain and cudaFree the GPU buffers while the CUDA context is alive.
+                // NOTE(review): unlike close(), these CUDA calls run with _mutex held — confirm this cannot deadlock.
+                cudaDeviceSynchronize();
+                auto gpuPending = ANSGpuFrameRegistry::instance().drain_gpu_pending();
+                if (!gpuPending.empty()) {
+                    RTSP_DBG("[Destroy] cudaFree %zu GPU ptrs before close()", gpuPending.size());
+                    int prevDev = -1;
+                    cudaGetDevice(&prevDev);
+                    for (auto& entry : gpuPending) {
+                        if (entry.ptr) {
+                            if (entry.deviceIdx >= 0) cudaSetDevice(entry.deviceIdx);
+                            cudaFree(entry.ptr);
+                        }
+                    }
+                    if (prevDev >= 0) cudaSetDevice(prevDev);
+                }
+                // Also drain any pending AVFrames
+                auto avPending = ANSGpuFrameRegistry::instance().drain_pending();
+                for (void* p : avPending) {
+                    AVFrame* f = static_cast<AVFrame*>(p);
+                    av_frame_free(&f);
+                }
+            }
+            ANSGpuFrameRegistry::instance().invalidateOwner(this);
+            _inFlightFrames.store(0, std::memory_order_release);
+
+            clientToClose = std::move(_playerClient);
+        }
+        // close() performs its CUDA cleanup here, outside the mutex.
+        // GPU frames owned by this client were already force-released above.
+        if (clientToClose) {
+            clientToClose->close();
         }
    }
    static void VerifyGlobalANSRTSPLicense(const std::string& licenseKey) {
@@ -146,10 +230,81 @@ namespace ANSCENTER {
        _playerClient->setCrop(crop);
    }
    bool ANSRTSPClient::Reconnect() {
-        std::lock_guard<std::recursive_mutex> lock(_mutex);
+        // 1. Mark as not-playing under the mutex FIRST.  GetNV12Frame() and
+        //    GetCudaHWFrame() then return nullptr (and, per the original note, GetImage() serves the
+        //    cached _pLastFrame), preventing use-after-free when close() destroys CUDA resources.
+        {
+            std::unique_lock<std::recursive_mutex> lock(_mutex);
+            _isPlaying = false;
+
+            // --- Inference guard: wait for in-flight frames to finish ---
+            // Same guard as Destroy(): close() will free NVDEC surfaces, so
+            // we wait (up to 5 seconds) for any inference engines still reading
+            // NV12 data via zero-copy CUDA device pointers.
+            int inFlight = _inFlightFrames.load(std::memory_order_acquire);
+            if (inFlight > 0) {
+                _logger.LogInfo("ANSRTSPClient::Reconnect",
+                    std::format("waiting for {} in-flight inference frame(s)...", inFlight),
+                    __FILE__, __LINE__);
+                bool done = _inFlightDone.wait_for(lock, std::chrono::seconds(5), [this] {
+                    return _inFlightFrames.load(std::memory_order_acquire) <= 0;
+                });
+                if (!done) {
+                    _logger.LogWarn("ANSRTSPClient::Reconnect",
+                        std::format("timed out waiting for in-flight frames "
+                                    "(still {} in-flight) — force-releasing GPU frames",
+                                    _inFlightFrames.load()),
+                        __FILE__, __LINE__);
+                }
+            }
+
+            // Force-release GPU frames before close() — same as Destroy().
+            int forceReleased = ANSGpuFrameRegistry::instance().forceReleaseByOwner(this);
+            if (forceReleased > 0) {
+                _logger.LogWarn("ANSRTSPClient::Reconnect",
+                    std::format("force-released {} GPU frame(s) with unreleased clones", forceReleased),
+                    __FILE__, __LINE__);
+                // Sync all GPU streams before freeing
+                cudaDeviceSynchronize();
+                auto gpuPending = ANSGpuFrameRegistry::instance().drain_gpu_pending();
+                if (!gpuPending.empty()) {
+                    int prevDev = -1;
+                    cudaGetDevice(&prevDev);
+                    for (auto& entry : gpuPending) {
+                        if (entry.ptr) {
+                            if (entry.deviceIdx >= 0) cudaSetDevice(entry.deviceIdx);
+                            cudaFree(entry.ptr);
+                        }
+                    }
+                    if (prevDev >= 0) cudaSetDevice(prevDev);
+                }
+                auto avPending = ANSGpuFrameRegistry::instance().drain_pending();
+                for (void* p : avPending) {
+                    AVFrame* f = static_cast<AVFrame*>(p);
+                    av_frame_free(&f);
+                }
+            }
+            ANSGpuFrameRegistry::instance().invalidateOwner(this);
+            _inFlightFrames.store(0, std::memory_order_release);
+        }
+
+        // 2. close() does CUDA cleanup (cuArrayDestroy/cuMemFree) — run outside
+        //    _mutex to avoid deadlocking with nvcuda64 SRW lock held by inference.
+        //    GetNV12Frame()/GetCudaHWFrame() won't touch the player while _isPlaying == false.
+        //    NOTE(review): _playerClient is dereferenced here unlocked and without a null check — confirm Destroy() cannot race.
+        _logger.LogInfo("ANSRTSPClient::Reconnect",
+            "calling close() — NVDEC decoder will be destroyed", __FILE__, __LINE__);
+        RTSP_DBG("[Reconnect] BEFORE close() this=%p", (void*)this);
         _playerClient->close();
+        RTSP_DBG("[Reconnect] AFTER close() this=%p", (void*)this);
+
+        // 3. Re-setup and play under the mutex; the result of play() becomes _isPlaying and is returned.
+        std::lock_guard<std::recursive_mutex> lock(_mutex);
+        _logger.LogInfo("ANSRTSPClient::Reconnect",
+            "calling Setup() + play()", __FILE__, __LINE__);
         Setup();
         _isPlaying = _playerClient->play();
+        RTSP_DBG("[Reconnect] DONE isPlaying=%d this=%p", (int)_isPlaying, (void*)this);
         return _isPlaying;
    }
    void ANSRTSPClient::EnableAudio(bool status) {
@@ -169,11 +324,23 @@ namespace ANSCENTER {
    }

     bool ANSRTSPClient::Stop() {
-        std::lock_guard<std::recursive_mutex> lock(_mutex);
-        if (_isPlaying) {
-            _playerClient->stop();
-            _isPlaying = false;
-		}
+        // Clear _isPlaying and grab the player pointer under the lock, then
+        // call stop() OUTSIDE the mutex.  stop() internally calls
+        // StopVideoDecoder -> decoder->flush() which does CUDA calls
+        // that can block on the nvcuda64 SRW lock; holding _mutex meanwhile
+        // would block all other operations on this client.  Always returns true.
+        // NOTE(review): `player` is a raw pointer used after _mutex is released — confirm Destroy() cannot free it concurrently.
+        CRtspPlayer* player = nullptr;
+        {
+            std::lock_guard<std::recursive_mutex> lock(_mutex);
+            if (_isPlaying) {
+                _isPlaying = false;
+                player = _playerClient.get();
+            }
+        }
+        if (player) {
+            player->stop();
+        }
         return true;
    }
 	bool ANSRTSPClient::Pause() {
@@ -759,10 +926,12 @@ namespace ANSCENTER {
    }
    AVFrame* ANSRTSPClient::GetNV12Frame() {
        std::lock_guard<std::recursive_mutex> lock(_mutex);
+        if (!_isPlaying) return nullptr;  // Not playing (stopped, mid-reconnect or being destroyed): the player's CUDA resources may already be freed
        return _playerClient->getNV12Frame();  // Returns clone, caller must av_frame_free
    }
    AVFrame* ANSRTSPClient::GetCudaHWFrame() {
        std::lock_guard<std::recursive_mutex> lock(_mutex);
+        if (!_isPlaying) return nullptr;  // Not playing (stopped, mid-reconnect or being destroyed): the player's CUDA resources may already be freed
        return _playerClient->getCudaHWFrame();
    }
    bool ANSRTSPClient::IsCudaHWAccel() {
@@ -810,6 +979,11 @@ extern "C" __declspec(dllexport) int CreateANSRTSPHandle(ANSCENTER::ANSRTSPClien
        if (_username.empty() && _password.empty()) result = ptr->Init(licenseKey, url);
        else result = ptr->Init(licenseKey, username, password, url);
        if (result) {
+            // Default to CUDA/NVDEC HW decoding (mode 7) for NV12 zero-copy
+            // fast path.  LabVIEW may not call SetRTSPHWDecoding after
+            // destroy+recreate cycles, so this ensures the new handle always
+            // uses the GPU decode path instead of falling back to D3D11VA/CPU.
+            ptr->SetHWDecoding(7);  // HW_DECODING_CUDA
            *Handle = ptr.release();
            extern void anscv_unregister_handle(void*);
            extern void anscv_register_handle(void*, void(*)(void*));
@@ -830,9 +1004,37 @@ extern "C" __declspec(dllexport) int ReleaseANSRTSPHandle(ANSCENTER::ANSRTSPClie
    try {
        extern void anscv_unregister_handle(void*);
        anscv_unregister_handle(*Handle);
-        // unique_ptr destructor calls ~ANSRTSPClient which calls Destroy() — no need to call Destroy() separately
-        std::unique_ptr<ANSCENTER::ANSRTSPClient> ptr(*Handle);
+
+        // Grab the raw pointer and NULL the caller's handle immediately.
+        // This prevents the caller (LabVIEW) from issuing new calls.
+        ANSCENTER::ANSRTSPClient* raw = *Handle;
        *Handle = nullptr;
+
+        // Mark as not-playing under _mutex ONLY.  This makes
+        // GetImage()/GetNV12Frame()/GetCudaHWFrame() return empty/null
+        // on any subsequent call, and prevents NEW NV12 GPU surface
+        // pointers from being handed out.
+        //
+        // Do NOT call Destroy()/close() here — close() frees the
+        // NVDEC GPU surfaces (cuArrayDestroy/cuMemFree) which may
+        // still be in use by a CUDA inference kernel that received
+        // the NV12 pointer from a GetRTSPCVImage call that already
+        // completed before this Release was called.
+        {
+            // Stop() takes the client's _mutex (the same lock GetNV12Frame/GetCudaHWFrame
+            // acquire) to clear _isPlaying, then stops the player outside that lock.
+            raw->Stop();  // sets _isPlaying = false, stops playback
+        }
+
+        // Defer the full cleanup (Destroy + delete) to a background thread
+        // so LabVIEW's UI thread is not blocked.  Destroy() waits up to 5 s
+        // for in-flight inference to finish (via the _inFlightFrames counter +
+        // condition variable), then force-releases, instead of the old 500ms sleep hack.
+        std::thread([raw]() {
+            try { raw->Destroy(); } catch (...) {}
+            try { delete raw; } catch (...) {}
+        }).detach();
+
        return 0;
    } catch (...) {
        if (Handle) *Handle = nullptr;
@@ -882,19 +1084,56 @@ extern "C" __declspec(dllexport) int GetRTSPCVImage(

        // Attach NV12 frame for GPU fast-path inference (side-table registry)
        // attach() takes ownership — do NOT av_frame_free here
+        //
+        // CRITICAL: TryIncrementInFlight() MUST be called BEFORE GetCudaHWFrame().
+        // It atomically checks _isPlaying and increments _inFlightFrames under
+        // the same mutex, so Reconnect() cannot call close() while we're doing
+        // the D2D copy from NVDEC surfaces inside gpu_frame_attach_cuda().
        int gpuIdx = (*Handle)->GetHWDecodingGpuIndex();
-        AVFrame* cudaHW = (*Handle)->GetCudaHWFrame();
-        if (cudaHW) {
-            // CUDA zero-copy: frame data[0]/data[1] are CUDA device pointers.
-            // Also attach CPU NV12 as fallback for cross-GPU inference
-            // (when decode GPU != inference GPU, CUDA ptrs aren't accessible).
-            AVFrame* cpuNV12 = (*Handle)->GetNV12Frame();
-            gpu_frame_attach_cuda(*image, cudaHW, gpuIdx, timeStamp, cpuNV12);
-        } else {
-            AVFrame* nv12 = (*Handle)->GetNV12Frame();
-            if (nv12) {
-                gpu_frame_attach(*image, nv12, gpuIdx, timeStamp);
+        bool inFlightGuardHeld = (*Handle)->TryIncrementInFlight();
+        RTSP_DBG("[GetRTSPCVImage] mat=%p gpuIdx=%d inFlightGuard=%d",
+                (void*)*image, gpuIdx, (int)inFlightGuardHeld);
+
+        if (inFlightGuardHeld) {
+            AVFrame* cudaHW = (*Handle)->GetCudaHWFrame();
+            if (cudaHW) {
+                RTSP_DBG("[GetRTSPCVImage] cudaHW: %dx%d data[0]=%p data[1]=%p",
+                        cudaHW->width, cudaHW->height,
+                        (void*)cudaHW->data[0], (void*)cudaHW->data[1]);
+                AVFrame* cpuNV12 = (*Handle)->GetNV12Frame();
+                gpu_frame_attach_cuda(*image, cudaHW, gpuIdx, timeStamp, cpuNV12);
+            } else {
+                // HW decode not active — try CPU NV12
+                AVFrame* nv12 = (*Handle)->GetNV12Frame();
+                if (nv12) {
+                    gpu_frame_attach(*image, nv12, gpuIdx, timeStamp);
+                }
            }
+
+            // Wire up the registry callback to release the in-flight guard.
+            // TryIncrementInFlight already incremented; DecrementInFlight fires
+            // when the last clone of this frame is released after inference.
+            auto* gpuData = ANSGpuFrameRegistry::instance().lookup(*image);
+            RTSP_DBG("[GetRTSPCVImage] after attach: gpuData=%p yPlane=%p isCuda=%d gpuCacheY=%p",
+                    (void*)gpuData,
+                    gpuData ? (void*)gpuData->yPlane : nullptr,
+                    gpuData ? (int)gpuData->isCudaDevicePtr : -1,
+                    gpuData ? gpuData->gpuCacheY : nullptr);
+            if (gpuData) {
+                gpuData->ownerClient = *Handle;
+                gpuData->onReleaseFn = [](void* client) {
+                    static_cast<ANSCENTER::ANSRTSPClient*>(client)->DecrementInFlight();
+                };
+                // NOTE: Do NOT call IncrementInFlight() again here —
+                // TryIncrementInFlight() already did it above.
+            } else {
+                // No gpuData registered (attach failed?) — release the guard
+                (*Handle)->DecrementInFlight();
+            }
+        } else {
+            // Player is stopping/reconnecting — skip CUDA path entirely.
+            // GetImage() already returned a cached BGR frame, which is safe.
+            RTSP_DBG("[GetRTSPCVImage] SKIP CUDA — player not playing (reconnecting?)");
        }

        return 1;  // Success