#pragma once // ANSGpuFrameRegistry.h — Side-table registry associating cv::Mat pointers // with GPU-friendly NV12 frame data for fast-path inference. // // Key: cv::Mat* (the heap-allocated pointer from anscv_mat_new), NOT datastart. // This survives deep copies (CloneImage_S) because each clone gets its own key // pointing to the same shared GpuFrameData via reference counting. // // When RTSP HW decode produces an NV12 AVFrame, we snapshot the CPU NV12 planes // into owned buffers and register them keyed by the cv::Mat*. When CloneImage_S // is called, addRef() links the new Mat* to the same GpuFrameData (refcount++). // When inference runs, it reads the NV12 data via a thread-local pointer set by // RunInferenceComplete_LV — no registry lookup needed in the engine hot path. // // Cleanup: // - anscv_mat_delete() calls release() → refcount--; frees when 0 // - anscv_mat_replace() calls release() on old Mat* → same // - TTL eviction catches stuck tasks (frames older than 3s with refcount > 0) // // Safety layers: // 1. Refcount cap (64) — prevents runaway refs from bugs // 2. Frame TTL (3s) — force-frees frames held by stuck tasks // 3. Global VRAM budget (1GB) — caps GPU cache allocation // // Thread-safe: all methods lock internally. // // NOTE: This header is FFmpeg-free. CPU NV12 snapshots are owned malloc'd buffers. // The opaque `avframe`/`cpuAvframe` pointers are retained for ANSCV to free via av_frame_free. #include #include #include #include #include #include #include #include #include #include #ifdef _WIN32 #include #endif // Debug logging for registry operations. // Define ANSCORE_GPU_DEBUG=1 to enable verbose per-frame GPU logging. #ifndef REG_DBG #if defined(ANSCORE_GPU_DEBUG) && ANSCORE_GPU_DEBUG #ifdef _WIN32 #define REG_DBG(fmt, ...) do { \ char _reg_buf[512]; \ snprintf(_reg_buf, sizeof(_reg_buf), "[Registry] " fmt "\n", ##__VA_ARGS__); \ OutputDebugStringA(_reg_buf); \ fprintf(stderr, "%s", _reg_buf); \ } while(0) #else #define REG_DBG(fmt, ...) fprintf(stderr, "[Registry] " fmt "\n", ##__VA_ARGS__) #endif #else #define REG_DBG(fmt, ...) ((void)0) #endif #endif // GpuNV12Slot definition needed by freeOwnedBuffers_locked() (accesses inUse atomic). #include "GpuNV12SlotPool.h" // Safety constants static constexpr int MAX_FRAME_REFCOUNT = 64; static constexpr int FRAME_TTL_SECONDS = 3; static constexpr size_t GPU_CACHE_BUDGET_DEFAULT = 1ULL * 1024 * 1024 * 1024; // 1GB static constexpr int EVICT_CHECK_INTERVAL_MS = 500; // Entry for deferred GPU memory deallocation (tracks device index for cudaSetDevice) struct GpuPendingFreeEntry { void* ptr = nullptr; int deviceIdx = -1; std::chrono::steady_clock::time_point queuedAt; // When this entry was queued }; struct GpuFrameData { // --- CPU NV12 snapshot (OWNED malloc'd buffers, independent of decoder) --- uint8_t* cpuYPlane = nullptr; // malloc'd Y plane copy uint8_t* cpuUvPlane = nullptr; // malloc'd UV plane copy int cpuYLinesize = 0; // Bytes per row in Y plane int cpuUvLinesize = 0; // Bytes per row in UV plane // --- GPU upload cache (created on first inference, shared across tasks) --- void* gpuCacheY = nullptr; // cudaMalloc'd Y on inference GPU void* gpuCacheUV = nullptr; // cudaMalloc'd UV on inference GPU size_t gpuCacheYPitch = 0; // Pitch of cached Y plane size_t gpuCacheUVPitch = 0; // Pitch of cached UV plane size_t gpuCacheBytes = 0; // Total VRAM bytes (for budget tracking) int gpuCacheDeviceIdx = -1; // GPU index where cache lives bool gpuCacheValid = false; // true after first upload // gpuCacheMutex is NOT here — use the registry mutex for cache creation // --- Legacy opaque AVFrame pointers (freed by ANSCV via av_frame_free) --- void* avframe = nullptr; // Original CUDA or CPU AVFrame (owned) void* cpuAvframe = nullptr; // CPU fallback AVFrame (owned, may be nullptr) // --- Frame metadata --- int width = 0; int height = 0; int pixelFormat = 0; // 23=NV12, 1000=BGR full-res int gpuIndex = -1; // GPU that decoded this frame int64_t pts = 0; // Presentation timestamp bool isCudaDevicePtr = false; // Legacy: true if original was CUDA zero-copy // --- Legacy NV12 plane pointers (point into avframe, used during transition) --- // TODO: Remove once all consumers use cpuYPlane/cpuUvPlane via thread-local uint8_t* yPlane = nullptr; uint8_t* uvPlane = nullptr; int yLinesize = 0; int uvLinesize = 0; // --- Lifecycle --- std::atomic refcount{1}; std::chrono::steady_clock::time_point createdAt; // --- Owner callback (for per-client inference guard) --- // When the last reference to this frame drops, onReleaseFn is called // with ownerClient to decrement the RTSP client's in-flight counter. // This lets Destroy() wait for in-flight inference to finish before // freeing NVDEC surfaces (fixes LabVIEW crash). void* ownerClient = nullptr; void (*onReleaseFn)(void*) = nullptr; // --- Global pool slot (from GpuNV12SlotPool) --- // When non-null, yPlane/uvPlane point into this slot's buffers. // Released (slot->inUse = false) in freeOwnedBuffers_locked() when // the frame's refcount drops to 0 — guarantees the buffer is not // freed while any consumer is still reading it. GpuNV12Slot* poolSlot = nullptr; // Default constructor GpuFrameData() = default; // Move constructor (std::atomic is neither copyable nor movable) GpuFrameData(GpuFrameData&& o) noexcept : cpuYPlane(o.cpuYPlane), cpuUvPlane(o.cpuUvPlane) , cpuYLinesize(o.cpuYLinesize), cpuUvLinesize(o.cpuUvLinesize) , gpuCacheY(o.gpuCacheY), gpuCacheUV(o.gpuCacheUV) , gpuCacheYPitch(o.gpuCacheYPitch), gpuCacheUVPitch(o.gpuCacheUVPitch) , gpuCacheBytes(o.gpuCacheBytes), gpuCacheDeviceIdx(o.gpuCacheDeviceIdx) , gpuCacheValid(o.gpuCacheValid) , avframe(o.avframe), cpuAvframe(o.cpuAvframe) , width(o.width), height(o.height), pixelFormat(o.pixelFormat) , gpuIndex(o.gpuIndex), pts(o.pts), isCudaDevicePtr(o.isCudaDevicePtr) , yPlane(o.yPlane), uvPlane(o.uvPlane) , yLinesize(o.yLinesize), uvLinesize(o.uvLinesize) , refcount(o.refcount.load()), createdAt(o.createdAt) , ownerClient(o.ownerClient), onReleaseFn(o.onReleaseFn) , poolSlot(o.poolSlot) { // Null out source to prevent double-free of owned pointers o.cpuYPlane = nullptr; o.cpuUvPlane = nullptr; o.gpuCacheY = nullptr; o.gpuCacheUV = nullptr; o.avframe = nullptr; o.cpuAvframe = nullptr; o.yPlane = nullptr; o.uvPlane = nullptr; o.gpuCacheBytes = 0; o.ownerClient = nullptr; o.onReleaseFn = nullptr; o.poolSlot = nullptr; } // No copy GpuFrameData(const GpuFrameData&) = delete; GpuFrameData& operator=(const GpuFrameData&) = delete; }; class ANSGpuFrameRegistry { public: // Process-wide singleton. On Windows, header-only static locals are per-DLL. // ANSCV.dll exports ANSGpuFrameRegistry_GetInstance() (defined in // ANSGpuFrameRegistry.cpp); other DLLs find it via GetProcAddress at runtime. static ANSGpuFrameRegistry& instance() { #ifdef _WIN32 static ANSGpuFrameRegistry* s_inst = resolveProcessWide(); return *s_inst; #else static ANSGpuFrameRegistry reg; return reg; #endif } // --- Attach: register a new GpuFrameData keyed by cv::Mat* --- // Allocates GpuFrameData on heap. Takes ownership of avframe/cpuAvframe. // Returns old avframe pointer if this Mat* was already registered (caller must av_frame_free). void* attach(cv::Mat* mat, GpuFrameData&& data) { if (!mat) return nullptr; void* oldAvframe = nullptr; // Capture old frame's owner callback to invoke OUTSIDE m_mutex void* oldOwner = nullptr; void (*oldReleaseFn)(void*) = nullptr; data.createdAt = std::chrono::steady_clock::now(); data.refcount.store(1); auto* heapData = new GpuFrameData(std::move(data)); REG_DBG("attach mat=%p new frame=%p yPlane=%p gpuCacheY=%p isCuda=%d %dx%d", (void*)mat, (void*)heapData, (void*)heapData->yPlane, heapData->gpuCacheY, (int)heapData->isCudaDevicePtr, heapData->width, heapData->height); { std::lock_guard lock(m_mutex); // If this Mat* already has an entry, release the old one auto it = m_map.find(mat); if (it != m_map.end()) { auto* oldFrame = it->second; int oldRef = oldFrame->refcount.fetch_sub(1); if (oldRef <= 1) { oldOwner = oldFrame->ownerClient; oldReleaseFn = oldFrame->onReleaseFn; oldAvframe = oldFrame->avframe; if (oldFrame->cpuAvframe) m_pendingFree.push_back(oldFrame->cpuAvframe); freeOwnedBuffers_locked(oldFrame); m_frameSet.erase(oldFrame); delete oldFrame; } // If oldRef > 1, other clones still reference it — just unlink this Mat* m_map.erase(it); } m_map[mat] = heapData; m_frameSet.insert(heapData); } // Notify old frame's owner OUTSIDE m_mutex if (oldReleaseFn && oldOwner) { oldReleaseFn(oldOwner); } return oldAvframe; // Caller must av_frame_free if non-null } // --- addRef: link a cloned cv::Mat* to the same GpuFrameData as src --- // Returns true if successful, false if src not found or refcount cap reached. bool addRef(cv::Mat* src, cv::Mat* dst) { if (!src || !dst || src == dst) return false; std::lock_guard lock(m_mutex); auto it = m_map.find(src); if (it == m_map.end()) return false; auto* frame = it->second; int current = frame->refcount.load(); if (current >= MAX_FRAME_REFCOUNT) { return false; // Cap reached — caller falls back to BGR } frame->refcount.fetch_add(1); m_map[dst] = frame; return true; } // --- release: decrement refcount for this Mat*, free if 0 --- // Returns avframe pointer to free (or nullptr) via pendingFree. // Caller must drain_pending() and av_frame_free each returned pointer. void release(cv::Mat* mat) { if (!mat) return; // Capture owner callback to invoke OUTSIDE m_mutex (deadlock safety) void* owner = nullptr; void (*releaseFn)(void*) = nullptr; { std::lock_guard lock(m_mutex); auto it = m_map.find(mat); if (it == m_map.end()) return; auto* frame = it->second; m_map.erase(it); int oldRef = frame->refcount.fetch_sub(1); REG_DBG("release mat=%p refcount %d->%d yPlane=%p gpuCacheY=%p owner=%p", (void*)mat, oldRef, oldRef - 1, (void*)frame->yPlane, frame->gpuCacheY, frame->ownerClient); if (oldRef <= 1) { // Capture owner callback before deleting frame owner = frame->ownerClient; releaseFn = frame->onReleaseFn; REG_DBG("LAST REF — freeing frame=%p cpuY=%p gpuCacheY=%p gpuCacheUV=%p bytes=%zu", (void*)frame, (void*)frame->cpuYPlane, frame->gpuCacheY, frame->gpuCacheUV, frame->gpuCacheBytes); // Last reference — free everything if (frame->avframe) m_pendingFree.push_back(frame->avframe); if (frame->cpuAvframe) m_pendingFree.push_back(frame->cpuAvframe); freeOwnedBuffers_locked(frame); m_frameSet.erase(frame); delete frame; } } // Notify owner OUTSIDE m_mutex — prevents lock-ordering deadlock // with ANSRTSPClient::_mutex (used by Destroy's condition_variable wait) if (releaseFn && owner) { REG_DBG("calling onReleaseFn owner=%p", owner); releaseFn(owner); } } // --- lookup: find GpuFrameData by cv::Mat* (locking) --- GpuFrameData* lookup(cv::Mat* mat) { std::lock_guard lock(m_mutex); auto it = m_map.find(mat); return (it != m_map.end()) ? it->second : nullptr; } // --- lookup_unlocked: caller MUST hold lock via acquire_lock() --- GpuFrameData* lookup_unlocked(cv::Mat* mat) { auto it = m_map.find(mat); return (it != m_map.end()) ? it->second : nullptr; } // --- Backward-compat: lookup by datastart (for transition period) --- // Searches all entries for matching datastart. O(n) — avoid in hot path. GpuFrameData* lookup_by_datastart(const uchar* datastart) { std::lock_guard lock(m_mutex); return lookup_by_datastart_unlocked(datastart); } GpuFrameData* lookup_by_datastart_unlocked(const uchar* datastart) { if (!datastart) return nullptr; for (auto& [mat, frame] : m_map) { if (mat && mat->datastart == datastart) return frame; } return nullptr; } // Acquire the registry lock explicitly. std::unique_lock acquire_lock() { return std::unique_lock(m_mutex); } // Number of map entries (Mat* keys) — caller MUST hold lock. size_t size_unlocked() const { return m_map.size(); } // Number of unique frames alive — caller MUST hold lock. size_t frame_count_unlocked() const { return m_frameSet.size(); } // --- Drain pending avframe pointers for caller to av_frame_free --- std::vector drain_pending() { std::lock_guard lock(m_mutex); std::vector result; result.swap(m_pendingFree); return result; } // --- Drain pending GPU device pointers for caller to cudaFree --- // Each entry includes the device index for cudaSetDevice before cudaFree. // If minAgeMs > 0, only drain entries older than minAgeMs milliseconds. // This allows time-based safety: entries queued >100ms ago are guaranteed // safe to free because all CUDA kernels complete in <10ms. std::vector drain_gpu_pending(int minAgeMs = 0) { std::lock_guard lock(m_mutex); if (minAgeMs <= 0) { // Drain all (used by Destroy/Reconnect with cudaDeviceSynchronize) std::vector result; result.swap(m_pendingGpuFree); return result; } // Drain only entries older than minAgeMs auto now = std::chrono::steady_clock::now(); auto threshold = std::chrono::milliseconds(minAgeMs); std::vector ready; std::vector notReady; for (auto& entry : m_pendingGpuFree) { if (now - entry.queuedAt >= threshold) ready.push_back(entry); else notReady.push_back(entry); } m_pendingGpuFree = std::move(notReady); return ready; } // --- TTL eviction: force-free frames older than FRAME_TTL_SECONDS --- // Call periodically from camera threads (piggybacked on mat_replace). void evictStaleFrames() { auto now = std::chrono::steady_clock::now(); // Throttle: skip if called too frequently { std::lock_guard lock(m_mutex); if (now - m_lastEvictCheck < std::chrono::milliseconds(EVICT_CHECK_INTERVAL_MS)) return; m_lastEvictCheck = now; } // Collect owner callbacks to invoke OUTSIDE m_mutex struct OwnerCallback { void* client; void (*fn)(void*); }; std::vector callbacks; { std::lock_guard lock(m_mutex); for (auto it = m_frameSet.begin(); it != m_frameSet.end(); ) { auto* frame = *it; auto age_s = std::chrono::duration_cast( now - frame->createdAt).count(); if (age_s > FRAME_TTL_SECONDS && frame->refcount.load() > 0) { // Capture owner callback before deleting if (frame->onReleaseFn && frame->ownerClient) { callbacks.push_back({frame->ownerClient, frame->onReleaseFn}); } // Force cleanup — remove all Mat* keys pointing to this frame for (auto jt = m_map.begin(); jt != m_map.end(); ) { if (jt->second == frame) jt = m_map.erase(jt); else ++jt; } // Push avframes to pendingFree if (frame->avframe) m_pendingFree.push_back(frame->avframe); if (frame->cpuAvframe) m_pendingFree.push_back(frame->cpuAvframe); freeOwnedBuffers_locked(frame); it = m_frameSet.erase(it); delete frame; } else { ++it; } } } // Notify owners OUTSIDE m_mutex for (auto& cb : callbacks) { cb.fn(cb.client); } } // --- VRAM budget management --- bool canAllocateGpuCache(size_t bytes) const { return m_totalGpuCacheBytes.load(std::memory_order_relaxed) + bytes <= m_gpuCacheBudget; } void onGpuCacheCreated(size_t bytes) { m_totalGpuCacheBytes.fetch_add(bytes, std::memory_order_relaxed); } void onGpuCacheFreed(size_t bytes) { // Prevent underflow size_t old = m_totalGpuCacheBytes.load(std::memory_order_relaxed); while (old >= bytes) { if (m_totalGpuCacheBytes.compare_exchange_weak(old, old - bytes, std::memory_order_relaxed)) break; } } size_t totalGpuCacheBytes() const { return m_totalGpuCacheBytes.load(std::memory_order_relaxed); } void setGpuCacheBudget(size_t bytes) { m_gpuCacheBudget = bytes; } size_t gpuCacheBudget() const { return m_gpuCacheBudget; } // --- Invalidate owner: nullify all callbacks for a client being destroyed --- // Called by Destroy() on timeout to prevent callbacks into a deleted object. void invalidateOwner(void* client) { if (!client) return; std::lock_guard lock(m_mutex); for (auto* frame : m_frameSet) { if (frame->ownerClient == client) { frame->ownerClient = nullptr; frame->onReleaseFn = nullptr; } } } // --- Force-release all frames owned by a client --- // Called by Destroy() BEFORE close() to free GPU buffers while the CUDA // context is still alive. Without this, unreleased clones (e.g. 70 cloned // images held by LabVIEW AI tasks that haven't finished) keep gpuCacheY/UV // allocated. When close() destroys the CUDA context, those buffers become // orphaned and later cudaFree calls crash. // // This force-frees ALL owned buffers for frames belonging to this client, // removes all Mat* keys pointing to them, and deletes the GpuFrameData. // Returns the number of frames force-released. int forceReleaseByOwner(void* client) { if (!client) return 0; int count = 0; std::lock_guard lock(m_mutex); for (auto it = m_frameSet.begin(); it != m_frameSet.end(); ) { auto* frame = *it; if (frame->ownerClient == client) { REG_DBG("forceReleaseByOwner: frame=%p refcount=%d gpuCacheY=%p gpuCacheUV=%p bytes=%zu", (void*)frame, frame->refcount.load(), frame->gpuCacheY, frame->gpuCacheUV, frame->gpuCacheBytes); // Remove all Mat* keys pointing to this frame for (auto jt = m_map.begin(); jt != m_map.end(); ) { if (jt->second == frame) jt = m_map.erase(jt); else ++jt; } // Free owned buffers (CPU + GPU pending) if (frame->avframe) m_pendingFree.push_back(frame->avframe); if (frame->cpuAvframe) m_pendingFree.push_back(frame->cpuAvframe); freeOwnedBuffers_locked(frame); it = m_frameSet.erase(it); delete frame; ++count; } else { ++it; } } if (count > 0) { REG_DBG("forceReleaseByOwner: force-released %d frames for client=%p", count, client); } return count; } private: ANSGpuFrameRegistry() = default; #ifdef _WIN32 static ANSGpuFrameRegistry* resolveProcessWide(); #endif // Free malloc'd CPU NV12 buffers and GPU cache (but NOT avframe/cpuAvframe — // those go to pendingFree for the caller to av_frame_free). void freeOwnedBuffers_locked(GpuFrameData* frame) { REG_DBG("freeOwnedBuffers: frame=%p cpuY=%p cpuUV=%p gpuCacheY=%p gpuCacheUV=%p bytes=%zu dev=%d poolSlot=%p", (void*)frame, (void*)frame->cpuYPlane, (void*)frame->cpuUvPlane, frame->gpuCacheY, frame->gpuCacheUV, frame->gpuCacheBytes, frame->gpuCacheDeviceIdx, (void*)frame->poolSlot); // Release global pool slot via DEFERRED release — the slot enters a // "cooling" state for SLOT_COOLDOWN_MS (200ms) before it becomes // available for reuse. This guarantees that any in-flight GPU kernels // (launched asynchronously by inference engines) have completed reading // from the buffer. CPU refcount→0 does NOT mean the GPU is done. if (frame->poolSlot) { GpuNV12SlotPool::deferRelease(frame->poolSlot); frame->poolSlot = nullptr; // yPlane/uvPlane pointed into the pool slot — null them to // prevent any stale reads after this point. frame->yPlane = nullptr; frame->uvPlane = nullptr; } if (frame->cpuYPlane) { std::free(frame->cpuYPlane); frame->cpuYPlane = nullptr; } if (frame->cpuUvPlane) { std::free(frame->cpuUvPlane); frame->cpuUvPlane = nullptr; } // GPU cache freed via CUDA — push to deferred list with device index // so the caller (ANSGpuFrameOps.h) can cudaSetDevice + cudaFree. if (frame->gpuCacheBytes > 0) { onGpuCacheFreed(frame->gpuCacheBytes); frame->gpuCacheValid = false; frame->gpuCacheBytes = 0; int devIdx = frame->gpuCacheDeviceIdx; auto now = std::chrono::steady_clock::now(); if (frame->gpuCacheY) m_pendingGpuFree.push_back({frame->gpuCacheY, devIdx, now}); if (frame->gpuCacheUV) m_pendingGpuFree.push_back({frame->gpuCacheUV, devIdx, now}); frame->gpuCacheY = nullptr; frame->gpuCacheUV = nullptr; } } std::mutex m_mutex; std::unordered_map m_map; std::unordered_set m_frameSet; // All unique frames (for TTL scan) std::vector m_pendingFree; // AVFrame* pointers to av_frame_free std::vector m_pendingGpuFree; // CUDA device pointers to cudaFree std::atomic m_totalGpuCacheBytes{0}; size_t m_gpuCacheBudget = GPU_CACHE_BUDGET_DEFAULT; std::chrono::steady_clock::time_point m_lastEvictCheck; }; // ── Convenience free functions (FFmpeg-agnostic) ──────────────────────── // Lookup by cv::Mat* pointer (primary key) inline GpuFrameData* gpu_frame_lookup(cv::Mat* mat) { return ANSGpuFrameRegistry::instance().lookup(mat); } // Backward-compat: lookup by datastart (O(n) — avoid in hot path) inline GpuFrameData* gpu_frame_lookup(const uchar* datastart) { return ANSGpuFrameRegistry::instance().lookup_by_datastart(datastart); } // Add ref: link clone Mat* to same GpuFrameData as src Mat* inline bool gpu_frame_addref(cv::Mat* src, cv::Mat* dst) { return ANSGpuFrameRegistry::instance().addRef(src, dst); } // Drain GPU device pointers that need cudaFree. // Caller must cudaSetDevice(entry.deviceIdx) + cudaFree(entry.ptr) for each. inline std::vector gpu_frame_drain_gpu_pending() { return ANSGpuFrameRegistry::instance().drain_gpu_pending(); }