#pragma once // ANSGpuFrameRegistry.h — Side-table registry associating cv::Mat pointers // with GPU-friendly NV12 frame data for fast-path inference. // // Key: cv::Mat* (the heap-allocated pointer from anscv_mat_new), NOT datastart. // This survives deep copies (CloneImage_S) because each clone gets its own key // pointing to the same shared GpuFrameData via reference counting. // // When RTSP HW decode produces an NV12 AVFrame, we snapshot the CPU NV12 planes // into owned buffers and register them keyed by the cv::Mat*. When CloneImage_S // is called, addRef() links the new Mat* to the same GpuFrameData (refcount++). // When inference runs, it reads the NV12 data via a thread-local pointer set by // RunInferenceComplete_LV — no registry lookup needed in the engine hot path. // // Cleanup: // - anscv_mat_delete() calls release() → refcount--; frees when 0 // - anscv_mat_replace() calls release() on old Mat* → same // - TTL eviction catches stuck tasks (frames older than 3s with refcount > 0) // // Safety layers: // 1. Refcount cap (64) — prevents runaway refs from bugs // 2. Frame TTL (3s) — force-frees frames held by stuck tasks // 3. Global VRAM budget (1GB) — caps GPU cache allocation // // Thread-safe: all methods lock internally. // // NOTE: This header is FFmpeg-free. CPU NV12 snapshots are owned malloc'd buffers. // The opaque `avframe`/`cpuAvframe` pointers are retained for ANSCV to free via av_frame_free. #include #include #include #include #include #include #include #include #include // Safety constants static constexpr int MAX_FRAME_REFCOUNT = 64; static constexpr int FRAME_TTL_SECONDS = 3; static constexpr size_t GPU_CACHE_BUDGET_DEFAULT = 1ULL * 1024 * 1024 * 1024; // 1GB static constexpr int EVICT_CHECK_INTERVAL_MS = 500; struct GpuFrameData { // --- CPU NV12 snapshot (OWNED malloc'd buffers, independent of decoder) --- uint8_t* cpuYPlane = nullptr; // malloc'd Y plane copy uint8_t* cpuUvPlane = nullptr; // malloc'd UV plane copy int cpuYLinesize = 0; // Bytes per row in Y plane int cpuUvLinesize = 0; // Bytes per row in UV plane // --- GPU upload cache (created on first inference, shared across tasks) --- void* gpuCacheY = nullptr; // cudaMalloc'd Y on inference GPU void* gpuCacheUV = nullptr; // cudaMalloc'd UV on inference GPU size_t gpuCacheYPitch = 0; // Pitch of cached Y plane size_t gpuCacheUVPitch = 0; // Pitch of cached UV plane size_t gpuCacheBytes = 0; // Total VRAM bytes (for budget tracking) int gpuCacheDeviceIdx = -1; // GPU index where cache lives bool gpuCacheValid = false; // true after first upload // gpuCacheMutex is NOT here — use the registry mutex for cache creation // --- Legacy opaque AVFrame pointers (freed by ANSCV via av_frame_free) --- void* avframe = nullptr; // Original CUDA or CPU AVFrame (owned) void* cpuAvframe = nullptr; // CPU fallback AVFrame (owned, may be nullptr) // --- Frame metadata --- int width = 0; int height = 0; int pixelFormat = 0; // 23=NV12, 1000=BGR full-res int gpuIndex = -1; // GPU that decoded this frame int64_t pts = 0; // Presentation timestamp bool isCudaDevicePtr = false; // Legacy: true if original was CUDA zero-copy // --- Legacy NV12 plane pointers (point into avframe, used during transition) --- // TODO: Remove once all consumers use cpuYPlane/cpuUvPlane via thread-local uint8_t* yPlane = nullptr; uint8_t* uvPlane = nullptr; int yLinesize = 0; int uvLinesize = 0; // --- Lifecycle --- std::atomic refcount{1}; std::chrono::steady_clock::time_point createdAt; // Default constructor GpuFrameData() = default; // Move constructor (std::atomic is neither copyable nor movable) GpuFrameData(GpuFrameData&& o) noexcept : cpuYPlane(o.cpuYPlane), cpuUvPlane(o.cpuUvPlane) , cpuYLinesize(o.cpuYLinesize), cpuUvLinesize(o.cpuUvLinesize) , gpuCacheY(o.gpuCacheY), gpuCacheUV(o.gpuCacheUV) , gpuCacheYPitch(o.gpuCacheYPitch), gpuCacheUVPitch(o.gpuCacheUVPitch) , gpuCacheBytes(o.gpuCacheBytes), gpuCacheDeviceIdx(o.gpuCacheDeviceIdx) , gpuCacheValid(o.gpuCacheValid) , avframe(o.avframe), cpuAvframe(o.cpuAvframe) , width(o.width), height(o.height), pixelFormat(o.pixelFormat) , gpuIndex(o.gpuIndex), pts(o.pts), isCudaDevicePtr(o.isCudaDevicePtr) , yPlane(o.yPlane), uvPlane(o.uvPlane) , yLinesize(o.yLinesize), uvLinesize(o.uvLinesize) , refcount(o.refcount.load()), createdAt(o.createdAt) { // Null out source to prevent double-free of owned pointers o.cpuYPlane = nullptr; o.cpuUvPlane = nullptr; o.gpuCacheY = nullptr; o.gpuCacheUV = nullptr; o.avframe = nullptr; o.cpuAvframe = nullptr; o.yPlane = nullptr; o.uvPlane = nullptr; o.gpuCacheBytes = 0; } // No copy GpuFrameData(const GpuFrameData&) = delete; GpuFrameData& operator=(const GpuFrameData&) = delete; }; class ANSGpuFrameRegistry { public: // Process-wide singleton. On Windows, header-only static locals are per-DLL. // ANSCV.dll exports ANSGpuFrameRegistry_GetInstance() (defined in // ANSGpuFrameRegistry.cpp); other DLLs find it via GetProcAddress at runtime. static ANSGpuFrameRegistry& instance() { #ifdef _WIN32 static ANSGpuFrameRegistry* s_inst = resolveProcessWide(); return *s_inst; #else static ANSGpuFrameRegistry reg; return reg; #endif } // --- Attach: register a new GpuFrameData keyed by cv::Mat* --- // Allocates GpuFrameData on heap. Takes ownership of avframe/cpuAvframe. // Returns old avframe pointer if this Mat* was already registered (caller must av_frame_free). void* attach(cv::Mat* mat, GpuFrameData&& data) { if (!mat) return nullptr; void* oldAvframe = nullptr; data.createdAt = std::chrono::steady_clock::now(); data.refcount.store(1); auto* heapData = new GpuFrameData(std::move(data)); std::lock_guard lock(m_mutex); // If this Mat* already has an entry, release the old one auto it = m_map.find(mat); if (it != m_map.end()) { auto* oldFrame = it->second; int oldRef = oldFrame->refcount.fetch_sub(1); if (oldRef <= 1) { oldAvframe = oldFrame->avframe; if (oldFrame->cpuAvframe) m_pendingFree.push_back(oldFrame->cpuAvframe); freeOwnedBuffers_locked(oldFrame); m_frameSet.erase(oldFrame); delete oldFrame; } // If oldRef > 1, other clones still reference it — just unlink this Mat* m_map.erase(it); } m_map[mat] = heapData; m_frameSet.insert(heapData); return oldAvframe; // Caller must av_frame_free if non-null } // --- addRef: link a cloned cv::Mat* to the same GpuFrameData as src --- // Returns true if successful, false if src not found or refcount cap reached. bool addRef(cv::Mat* src, cv::Mat* dst) { if (!src || !dst || src == dst) return false; std::lock_guard lock(m_mutex); auto it = m_map.find(src); if (it == m_map.end()) return false; auto* frame = it->second; int current = frame->refcount.load(); if (current >= MAX_FRAME_REFCOUNT) { return false; // Cap reached — caller falls back to BGR } frame->refcount.fetch_add(1); m_map[dst] = frame; return true; } // --- release: decrement refcount for this Mat*, free if 0 --- // Returns avframe pointer to free (or nullptr) via pendingFree. // Caller must drain_pending() and av_frame_free each returned pointer. void release(cv::Mat* mat) { if (!mat) return; std::lock_guard lock(m_mutex); auto it = m_map.find(mat); if (it == m_map.end()) return; auto* frame = it->second; m_map.erase(it); int oldRef = frame->refcount.fetch_sub(1); if (oldRef <= 1) { // Last reference — free everything if (frame->avframe) m_pendingFree.push_back(frame->avframe); if (frame->cpuAvframe) m_pendingFree.push_back(frame->cpuAvframe); freeOwnedBuffers_locked(frame); m_frameSet.erase(frame); delete frame; } } // --- lookup: find GpuFrameData by cv::Mat* (locking) --- GpuFrameData* lookup(cv::Mat* mat) { std::lock_guard lock(m_mutex); auto it = m_map.find(mat); return (it != m_map.end()) ? it->second : nullptr; } // --- lookup_unlocked: caller MUST hold lock via acquire_lock() --- GpuFrameData* lookup_unlocked(cv::Mat* mat) { auto it = m_map.find(mat); return (it != m_map.end()) ? it->second : nullptr; } // --- Backward-compat: lookup by datastart (for transition period) --- // Searches all entries for matching datastart. O(n) — avoid in hot path. GpuFrameData* lookup_by_datastart(const uchar* datastart) { std::lock_guard lock(m_mutex); return lookup_by_datastart_unlocked(datastart); } GpuFrameData* lookup_by_datastart_unlocked(const uchar* datastart) { if (!datastart) return nullptr; for (auto& [mat, frame] : m_map) { if (mat && mat->datastart == datastart) return frame; } return nullptr; } // Acquire the registry lock explicitly. std::unique_lock acquire_lock() { return std::unique_lock(m_mutex); } // Number of map entries (Mat* keys) — caller MUST hold lock. size_t size_unlocked() const { return m_map.size(); } // Number of unique frames alive — caller MUST hold lock. size_t frame_count_unlocked() const { return m_frameSet.size(); } // --- Drain pending avframe pointers for caller to av_frame_free --- std::vector drain_pending() { std::lock_guard lock(m_mutex); std::vector result; result.swap(m_pendingFree); return result; } // --- Drain pending GPU device pointers for caller to cudaFree --- std::vector drain_gpu_pending() { std::lock_guard lock(m_mutex); std::vector result; result.swap(m_pendingGpuFree); return result; } // --- TTL eviction: force-free frames older than FRAME_TTL_SECONDS --- // Call periodically from camera threads (piggybacked on mat_replace). void evictStaleFrames() { auto now = std::chrono::steady_clock::now(); // Throttle: skip if called too frequently { std::lock_guard lock(m_mutex); if (now - m_lastEvictCheck < std::chrono::milliseconds(EVICT_CHECK_INTERVAL_MS)) return; m_lastEvictCheck = now; } std::lock_guard lock(m_mutex); for (auto it = m_frameSet.begin(); it != m_frameSet.end(); ) { auto* frame = *it; auto age_s = std::chrono::duration_cast( now - frame->createdAt).count(); if (age_s > FRAME_TTL_SECONDS && frame->refcount.load() > 0) { // Force cleanup — remove all Mat* keys pointing to this frame for (auto jt = m_map.begin(); jt != m_map.end(); ) { if (jt->second == frame) jt = m_map.erase(jt); else ++jt; } // Push avframes to pendingFree if (frame->avframe) m_pendingFree.push_back(frame->avframe); if (frame->cpuAvframe) m_pendingFree.push_back(frame->cpuAvframe); freeOwnedBuffers_locked(frame); it = m_frameSet.erase(it); delete frame; } else { ++it; } } } // --- VRAM budget management --- bool canAllocateGpuCache(size_t bytes) const { return m_totalGpuCacheBytes.load(std::memory_order_relaxed) + bytes <= m_gpuCacheBudget; } void onGpuCacheCreated(size_t bytes) { m_totalGpuCacheBytes.fetch_add(bytes, std::memory_order_relaxed); } void onGpuCacheFreed(size_t bytes) { // Prevent underflow size_t old = m_totalGpuCacheBytes.load(std::memory_order_relaxed); while (old >= bytes) { if (m_totalGpuCacheBytes.compare_exchange_weak(old, old - bytes, std::memory_order_relaxed)) break; } } size_t totalGpuCacheBytes() const { return m_totalGpuCacheBytes.load(std::memory_order_relaxed); } void setGpuCacheBudget(size_t bytes) { m_gpuCacheBudget = bytes; } size_t gpuCacheBudget() const { return m_gpuCacheBudget; } private: ANSGpuFrameRegistry() = default; #ifdef _WIN32 static ANSGpuFrameRegistry* resolveProcessWide(); #endif // Free malloc'd CPU NV12 buffers and GPU cache (but NOT avframe/cpuAvframe — // those go to pendingFree for the caller to av_frame_free). void freeOwnedBuffers_locked(GpuFrameData* frame) { if (frame->cpuYPlane) { std::free(frame->cpuYPlane); frame->cpuYPlane = nullptr; } if (frame->cpuUvPlane) { std::free(frame->cpuUvPlane); frame->cpuUvPlane = nullptr; } // GPU cache freed via CUDA — caller (ANSODEngine) must handle this // since we can't call cudaFree from this FFmpeg-free header. // The gpuCacheBytes are tracked; actual deallocation happens in // NV12PreprocessHelper or a GPU-aware cleanup path. if (frame->gpuCacheBytes > 0) { onGpuCacheFreed(frame->gpuCacheBytes); // Mark as invalid so no one reads stale pointers frame->gpuCacheValid = false; frame->gpuCacheBytes = 0; // NOTE: gpuCacheY/gpuCacheUV device pointers are leaked here // unless the caller handles GPU cleanup. This is addressed in // Step 8 (NV12PreprocessHelper) where cudaFree is available. // For now, push to a separate GPU-free list. if (frame->gpuCacheY) m_pendingGpuFree.push_back(frame->gpuCacheY); if (frame->gpuCacheUV) m_pendingGpuFree.push_back(frame->gpuCacheUV); frame->gpuCacheY = nullptr; frame->gpuCacheUV = nullptr; } } std::mutex m_mutex; std::unordered_map m_map; std::unordered_set m_frameSet; // All unique frames (for TTL scan) std::vector m_pendingFree; // AVFrame* pointers to av_frame_free std::vector m_pendingGpuFree; // CUDA device pointers to cudaFree std::atomic m_totalGpuCacheBytes{0}; size_t m_gpuCacheBudget = GPU_CACHE_BUDGET_DEFAULT; std::chrono::steady_clock::time_point m_lastEvictCheck; }; // ── Convenience free functions (FFmpeg-agnostic) ──────────────────────── // Lookup by cv::Mat* pointer (primary key) inline GpuFrameData* gpu_frame_lookup(cv::Mat* mat) { return ANSGpuFrameRegistry::instance().lookup(mat); } // Backward-compat: lookup by datastart (O(n) — avoid in hot path) inline GpuFrameData* gpu_frame_lookup(const uchar* datastart) { return ANSGpuFrameRegistry::instance().lookup_by_datastart(datastart); } // Add ref: link clone Mat* to same GpuFrameData as src Mat* inline bool gpu_frame_addref(cv::Mat* src, cv::Mat* dst) { return ANSGpuFrameRegistry::instance().addRef(src, dst); } // Drain GPU device pointers that need cudaFree. // Caller must cudaFree each returned pointer. inline std::vector gpu_frame_drain_gpu_pending() { return ANSGpuFrameRegistry::instance().drain_gpu_pending(); }