Fix NV12 crash when recreating the camera object

2026-04-02 22:07:27 +11:00
parent 4bedf3a3a2
commit 958cab6ae3
25 changed files with 1459 additions and 393 deletions
--- a/include/ANSGpuFrameRegistry.h
+++ b/include/ANSGpuFrameRegistry.h
@@ -34,15 +34,40 @@
 #include <atomic>
 #include <cstdint>
 #include <cstdlib>
+#include <cstdio>
 #include <chrono>
 #include <opencv2/core/mat.hpp>

+#ifdef _WIN32
+#include <windows.h>
+#endif
+
+// Debug logging for registry operations — stderr on all platforms, plus OutputDebugString on Windows.
+#ifndef REG_DBG
+#ifdef _WIN32
+#define REG_DBG(fmt, ...) do { \
+    char _reg_buf[512]; \
+    snprintf(_reg_buf, sizeof(_reg_buf), "[Registry] " fmt "\n", ##__VA_ARGS__); \
+    OutputDebugStringA(_reg_buf); \
+    fprintf(stderr, "%s", _reg_buf); \
+} while(0)
+#else
+#define REG_DBG(fmt, ...) fprintf(stderr, "[Registry] " fmt "\n", ##__VA_ARGS__)
+#endif
+#endif
+
 // Safety constants
 static constexpr int    MAX_FRAME_REFCOUNT    = 64;
 static constexpr int    FRAME_TTL_SECONDS     = 3;
 static constexpr size_t GPU_CACHE_BUDGET_DEFAULT = 1ULL * 1024 * 1024 * 1024; // 1GB
 static constexpr int    EVICT_CHECK_INTERVAL_MS  = 500;

+// Entry for deferred GPU memory deallocation (tracks device index for cudaSetDevice)
+struct GpuPendingFreeEntry {
+    void* ptr       = nullptr;  // CUDA device pointer queued for cudaFree
+    int   deviceIdx = -1;       // device to select (cudaSetDevice) before freeing ptr
+};
+
 struct GpuFrameData {
    // --- CPU NV12 snapshot (OWNED malloc'd buffers, independent of decoder) ---
    uint8_t*  cpuYPlane    = nullptr;   // malloc'd Y plane copy
@@ -83,6 +108,14 @@ struct GpuFrameData {
    std::atomic<int> refcount{1};
    std::chrono::steady_clock::time_point createdAt;

+    // --- Owner callback (for per-client inference guard) ---
+    // When the last reference to this frame drops, onReleaseFn is called
+    // with ownerClient to decrement the RTSP client's in-flight counter.
+    // This lets Destroy() wait for in-flight inference to finish before
+    // freeing NVDEC surfaces (fixes LabVIEW crash).
+    void*  ownerClient   = nullptr;
+    void (*onReleaseFn)(void*) = nullptr;
+
    // Default constructor
    GpuFrameData() = default;

@@ -100,6 +133,7 @@ struct GpuFrameData {
        , yPlane(o.yPlane), uvPlane(o.uvPlane)
        , yLinesize(o.yLinesize), uvLinesize(o.uvLinesize)
        , refcount(o.refcount.load()), createdAt(o.createdAt)
+        , ownerClient(o.ownerClient), onReleaseFn(o.onReleaseFn)
    {
        // Null out source to prevent double-free of owned pointers
        o.cpuYPlane = nullptr;
@@ -111,6 +145,8 @@ struct GpuFrameData {
        o.yPlane = nullptr;
        o.uvPlane = nullptr;
        o.gpuCacheBytes = 0;
+        o.ownerClient = nullptr;
+        o.onReleaseFn = nullptr;
    }

    // No copy
@@ -140,32 +176,50 @@ public:
        if (!mat) return nullptr;
        void* oldAvframe = nullptr;

+        // Capture old frame's owner callback to invoke OUTSIDE m_mutex
+        void* oldOwner = nullptr;
+        void (*oldReleaseFn)(void*) = nullptr;
+
        data.createdAt = std::chrono::steady_clock::now();
        data.refcount.store(1);

        auto* heapData = new GpuFrameData(std::move(data));
+        REG_DBG("attach mat=%p new frame=%p yPlane=%p gpuCacheY=%p isCuda=%d %dx%d",
+                (void*)mat, (void*)heapData,
+                (void*)heapData->yPlane, heapData->gpuCacheY,
+                (int)heapData->isCudaDevicePtr,
+                heapData->width, heapData->height);

-        std::lock_guard<std::mutex> lock(m_mutex);
+        {
+            std::lock_guard<std::mutex> lock(m_mutex);

-        // If this Mat* already has an entry, release the old one
-        auto it = m_map.find(mat);
-        if (it != m_map.end()) {
-            auto* oldFrame = it->second;
-            int oldRef = oldFrame->refcount.fetch_sub(1);
-            if (oldRef <= 1) {
-                oldAvframe = oldFrame->avframe;
-                if (oldFrame->cpuAvframe)
-                    m_pendingFree.push_back(oldFrame->cpuAvframe);
-                freeOwnedBuffers_locked(oldFrame);
-                m_frameSet.erase(oldFrame);
-                delete oldFrame;
+            // If this Mat* already has an entry, release the old one
+            auto it = m_map.find(mat);
+            if (it != m_map.end()) {
+                auto* oldFrame = it->second;
+                int oldRef = oldFrame->refcount.fetch_sub(1);
+                if (oldRef <= 1) {
+                    oldOwner = oldFrame->ownerClient;
+                    oldReleaseFn = oldFrame->onReleaseFn;
+                    oldAvframe = oldFrame->avframe;
+                    if (oldFrame->cpuAvframe)
+                        m_pendingFree.push_back(oldFrame->cpuAvframe);
+                    freeOwnedBuffers_locked(oldFrame);
+                    m_frameSet.erase(oldFrame);
+                    delete oldFrame;
+                }
+                // If oldRef > 1, other clones still reference it — just unlink this Mat*
+                m_map.erase(it);
            }
-            // If oldRef > 1, other clones still reference it — just unlink this Mat*
-            m_map.erase(it);
+
+            m_map[mat] = heapData;
+            m_frameSet.insert(heapData);
        }

-        m_map[mat] = heapData;
-        m_frameSet.insert(heapData);
+        // Notify old frame's owner OUTSIDE m_mutex
+        if (oldReleaseFn && oldOwner) {
+            oldReleaseFn(oldOwner);
+        }

        return oldAvframe;  // Caller must av_frame_free if non-null
    }
@@ -197,24 +251,46 @@ public:
    void release(cv::Mat* mat) {
        if (!mat) return;

-        std::lock_guard<std::mutex> lock(m_mutex);
+        // Capture owner callback to invoke OUTSIDE m_mutex (deadlock safety)
+        void* owner = nullptr;
+        void (*releaseFn)(void*) = nullptr;

-        auto it = m_map.find(mat);
-        if (it == m_map.end()) return;
+        {
+            std::lock_guard<std::mutex> lock(m_mutex);

-        auto* frame = it->second;
-        m_map.erase(it);
+            auto it = m_map.find(mat);
+            if (it == m_map.end()) return;

-        int oldRef = frame->refcount.fetch_sub(1);
-        if (oldRef <= 1) {
-            // Last reference — free everything
-            if (frame->avframe)
-                m_pendingFree.push_back(frame->avframe);
-            if (frame->cpuAvframe)
-                m_pendingFree.push_back(frame->cpuAvframe);
-            freeOwnedBuffers_locked(frame);
-            m_frameSet.erase(frame);
-            delete frame;
+            auto* frame = it->second;
+            m_map.erase(it);
+
+            int oldRef = frame->refcount.fetch_sub(1);
+            REG_DBG("release mat=%p refcount %d->%d yPlane=%p gpuCacheY=%p owner=%p",
+                    (void*)mat, oldRef, oldRef - 1,
+                    (void*)frame->yPlane, frame->gpuCacheY, frame->ownerClient);
+            if (oldRef <= 1) {
+                // Capture owner callback before deleting frame
+                owner = frame->ownerClient;
+                releaseFn = frame->onReleaseFn;
+                REG_DBG("LAST REF — freeing frame=%p cpuY=%p gpuCacheY=%p gpuCacheUV=%p bytes=%zu",
+                        (void*)frame, (void*)frame->cpuYPlane,
+                        frame->gpuCacheY, frame->gpuCacheUV, frame->gpuCacheBytes);
+                // Last reference — free everything
+                if (frame->avframe)
+                    m_pendingFree.push_back(frame->avframe);
+                if (frame->cpuAvframe)
+                    m_pendingFree.push_back(frame->cpuAvframe);
+                freeOwnedBuffers_locked(frame);
+                m_frameSet.erase(frame);
+                delete frame;
+            }
+        }
+
+        // Notify owner OUTSIDE m_mutex — prevents lock-ordering deadlock
+        // with ANSRTSPClient::_mutex (used by Destroy's condition_variable wait)
+        if (releaseFn && owner) {
+            REG_DBG("calling onReleaseFn owner=%p", owner);
+            releaseFn(owner);
        }
    }

@@ -267,9 +343,10 @@ public:
    }

    // --- Drain pending GPU device pointers for caller to cudaFree ---
-    std::vector<void*> drain_gpu_pending() {
+    // Each entry includes the device index for cudaSetDevice before cudaFree.
+    std::vector<GpuPendingFreeEntry> drain_gpu_pending() {
        std::lock_guard<std::mutex> lock(m_mutex);
-        std::vector<void*> result;
+        std::vector<GpuPendingFreeEntry> result;
        result.swap(m_pendingGpuFree);
        return result;
    }
@@ -287,31 +364,46 @@ public:
            m_lastEvictCheck = now;
        }

-        std::lock_guard<std::mutex> lock(m_mutex);
-        for (auto it = m_frameSet.begin(); it != m_frameSet.end(); ) {
-            auto* frame = *it;
-            auto age_s = std::chrono::duration_cast<std::chrono::seconds>(
-                now - frame->createdAt).count();
-            if (age_s > FRAME_TTL_SECONDS && frame->refcount.load() > 0) {
-                // Force cleanup — remove all Mat* keys pointing to this frame
-                for (auto jt = m_map.begin(); jt != m_map.end(); ) {
-                    if (jt->second == frame)
-                        jt = m_map.erase(jt);
-                    else
-                        ++jt;
+        // Collect owner callbacks to invoke OUTSIDE m_mutex
+        struct OwnerCallback { void* client; void (*fn)(void*); };
+        std::vector<OwnerCallback> callbacks;
+
+        {
+            std::lock_guard<std::mutex> lock(m_mutex);
+            for (auto it = m_frameSet.begin(); it != m_frameSet.end(); ) {
+                auto* frame = *it;
+                auto age_s = std::chrono::duration_cast<std::chrono::seconds>(
+                    now - frame->createdAt).count();
+                if (age_s > FRAME_TTL_SECONDS && frame->refcount.load() > 0) {
+                    // Capture owner callback before deleting
+                    if (frame->onReleaseFn && frame->ownerClient) {
+                        callbacks.push_back({frame->ownerClient, frame->onReleaseFn});
+                    }
+                    // Force cleanup — remove all Mat* keys pointing to this frame
+                    for (auto jt = m_map.begin(); jt != m_map.end(); ) {
+                        if (jt->second == frame)
+                            jt = m_map.erase(jt);
+                        else
+                            ++jt;
+                    }
+                    // Push avframes to pendingFree
+                    if (frame->avframe)
+                        m_pendingFree.push_back(frame->avframe);
+                    if (frame->cpuAvframe)
+                        m_pendingFree.push_back(frame->cpuAvframe);
+                    freeOwnedBuffers_locked(frame);
+                    it = m_frameSet.erase(it);
+                    delete frame;
+                } else {
+                    ++it;
                }
-                // Push avframes to pendingFree
-                if (frame->avframe)
-                    m_pendingFree.push_back(frame->avframe);
-                if (frame->cpuAvframe)
-                    m_pendingFree.push_back(frame->cpuAvframe);
-                freeOwnedBuffers_locked(frame);
-                it = m_frameSet.erase(it);
-                delete frame;
-            } else {
-                ++it;
            }
        }
+
+        // Notify owners OUTSIDE m_mutex
+        for (auto& cb : callbacks) {
+            cb.fn(cb.client);
+        }
    }

    // --- VRAM budget management ---
@@ -340,6 +432,70 @@ public:
    void setGpuCacheBudget(size_t bytes) { m_gpuCacheBudget = bytes; }
    size_t gpuCacheBudget() const { return m_gpuCacheBudget; }

+    // --- Invalidate owner: clear ownerClient/onReleaseFn on every frame owned by a client ---
+    // Called by Destroy() on timeout to prevent callbacks into a deleted object.
+    void invalidateOwner(void* client) {
+        if (!client) return;  // nothing to invalidate
+        std::lock_guard<std::mutex> lock(m_mutex);  // NOTE(review): release() copies the callback under
+        for (auto* frame : m_frameSet) {            // this lock but calls it after unlocking, so a callback
+            if (frame->ownerClient == client) {     // captured just before this call can still fire.
+                frame->ownerClient = nullptr;  // cut the callback link only;
+                frame->onReleaseFn = nullptr;  // the frame itself stays registered
+            }
+        }
+    }
+
+    // --- Force-release all frames owned by a client ---
+    // Called by Destroy() BEFORE close() to free GPU buffers while the CUDA
+    // context is still alive.  Without this, unreleased clones (e.g. 70 cloned
+    // images held by LabVIEW AI tasks that haven't finished) keep gpuCacheY/UV
+    // allocated.  When close() destroys the CUDA context, those buffers become
+    // orphaned and later cudaFree calls crash.
+    //
+    // For every frame with ownerClient == client (any refcount): removes its Mat*
+    // keys, frees CPU buffers, queues AVFrame/GPU pointers for the caller to drain,
+    // and deletes it without invoking onReleaseFn.  Returns the frame count.
+    int forceReleaseByOwner(void* client) {
+        if (!client) return 0;
+        int count = 0;
+
+        std::lock_guard<std::mutex> lock(m_mutex);
+
+        for (auto it = m_frameSet.begin(); it != m_frameSet.end(); ) {
+            auto* frame = *it;
+            if (frame->ownerClient == client) {
+                REG_DBG("forceReleaseByOwner: frame=%p refcount=%d gpuCacheY=%p gpuCacheUV=%p bytes=%zu",
+                        (void*)frame, frame->refcount.load(),
+                        frame->gpuCacheY, frame->gpuCacheUV, frame->gpuCacheBytes);
+
+                // Remove all Mat* keys pointing to this frame (full scan of m_map)
+                for (auto jt = m_map.begin(); jt != m_map.end(); ) {
+                    if (jt->second == frame)
+                        jt = m_map.erase(jt);
+                    else
+                        ++jt;
+                }
+
+                // Free CPU buffers now; queue AVFrame and GPU pointers for deferred free
+                if (frame->avframe)
+                    m_pendingFree.push_back(frame->avframe);
+                if (frame->cpuAvframe)
+                    m_pendingFree.push_back(frame->cpuAvframe);
+                freeOwnedBuffers_locked(frame);
+                it = m_frameSet.erase(it);
+                delete frame;
+                ++count;
+            } else {
+                ++it;
+            }
+        }
+
+        if (count > 0) {
+            REG_DBG("forceReleaseByOwner: force-released %d frames for client=%p", count, client);
+        }
+        return count;
+    }
+
 private:
    ANSGpuFrameRegistry() = default;

@@ -350,6 +506,10 @@ private:
    // Free malloc'd CPU NV12 buffers and GPU cache (but NOT avframe/cpuAvframe —
    // those go to pendingFree for the caller to av_frame_free).
    void freeOwnedBuffers_locked(GpuFrameData* frame) {
+        REG_DBG("freeOwnedBuffers: frame=%p cpuY=%p cpuUV=%p gpuCacheY=%p gpuCacheUV=%p bytes=%zu dev=%d",
+                (void*)frame, (void*)frame->cpuYPlane, (void*)frame->cpuUvPlane,
+                frame->gpuCacheY, frame->gpuCacheUV,
+                frame->gpuCacheBytes, frame->gpuCacheDeviceIdx);
        if (frame->cpuYPlane) {
            std::free(frame->cpuYPlane);
            frame->cpuYPlane = nullptr;
@@ -358,23 +518,17 @@ private:
            std::free(frame->cpuUvPlane);
            frame->cpuUvPlane = nullptr;
        }
-        // GPU cache freed via CUDA — caller (ANSODEngine) must handle this
-        // since we can't call cudaFree from this FFmpeg-free header.
-        // The gpuCacheBytes are tracked; actual deallocation happens in
-        // NV12PreprocessHelper or a GPU-aware cleanup path.
+        // GPU cache freed via CUDA — push to deferred list with device index
+        // so the caller (ANSGpuFrameOps.h) can cudaSetDevice + cudaFree.
        if (frame->gpuCacheBytes > 0) {
            onGpuCacheFreed(frame->gpuCacheBytes);
-            // Mark as invalid so no one reads stale pointers
            frame->gpuCacheValid = false;
            frame->gpuCacheBytes = 0;
-            // NOTE: gpuCacheY/gpuCacheUV device pointers are leaked here
-            // unless the caller handles GPU cleanup. This is addressed in
-            // Step 8 (NV12PreprocessHelper) where cudaFree is available.
-            // For now, push to a separate GPU-free list.
+            int devIdx = frame->gpuCacheDeviceIdx;
            if (frame->gpuCacheY)
-                m_pendingGpuFree.push_back(frame->gpuCacheY);
+                m_pendingGpuFree.push_back({frame->gpuCacheY, devIdx});
            if (frame->gpuCacheUV)
-                m_pendingGpuFree.push_back(frame->gpuCacheUV);
+                m_pendingGpuFree.push_back({frame->gpuCacheUV, devIdx});
            frame->gpuCacheY = nullptr;
            frame->gpuCacheUV = nullptr;
        }
@@ -384,7 +538,7 @@ private:
    std::unordered_map<cv::Mat*, GpuFrameData*> m_map;
    std::unordered_set<GpuFrameData*> m_frameSet;  // All unique frames (for TTL scan)
    std::vector<void*> m_pendingFree;     // AVFrame* pointers to av_frame_free
-    std::vector<void*> m_pendingGpuFree;  // CUDA device pointers to cudaFree
+    std::vector<GpuPendingFreeEntry> m_pendingGpuFree;  // CUDA device pointers to cudaFree
    std::atomic<size_t> m_totalGpuCacheBytes{0};
    size_t m_gpuCacheBudget = GPU_CACHE_BUDGET_DEFAULT;
    std::chrono::steady_clock::time_point m_lastEvictCheck;
@@ -408,7 +562,7 @@ inline bool gpu_frame_addref(cv::Mat* src, cv::Mat* dst) {
 }

 // Drain GPU device pointers that need cudaFree.
-// Caller must cudaFree each returned pointer.
-inline std::vector<void*> gpu_frame_drain_gpu_pending() {
+// Caller must cudaSetDevice(entry.deviceIdx) + cudaFree(entry.ptr) for each.
+inline std::vector<GpuPendingFreeEntry> gpu_frame_drain_gpu_pending() {
    return ANSGpuFrameRegistry::instance().drain_gpu_pending();
 }