Update MediaClient

2026-03-28 11:39:04 +11:00
parent 24dc6c7cd0
commit f3266566eb
1284 changed files with 462406 additions and 0 deletions
--- a/include/ANSGpuFrameRegistry.h
+++ b/include/ANSGpuFrameRegistry.h
@@ -0,0 +1,414 @@
+#pragma once
+// ANSGpuFrameRegistry.h — Side-table registry associating cv::Mat pointers
+// with GPU-friendly NV12 frame data for fast-path inference.
+//
+// Key: cv::Mat* (the heap-allocated pointer from anscv_mat_new), NOT datastart.
+// This survives deep copies (CloneImage_S) because each clone gets its own key
+// pointing to the same shared GpuFrameData via reference counting.
+//
+// When RTSP HW decode produces an NV12 AVFrame, we snapshot the CPU NV12 planes
+// into owned buffers and register them keyed by the cv::Mat*. When CloneImage_S
+// is called, addRef() links the new Mat* to the same GpuFrameData (refcount++).
+// When inference runs, it reads the NV12 data via a thread-local pointer set by
+// RunInferenceComplete_LV — no registry lookup needed in the engine hot path.
+//
+// Cleanup:
+//   - anscv_mat_delete() calls release() → refcount--; frees when 0
+//   - anscv_mat_replace() calls release() on old Mat* → same
+//   - TTL eviction catches stuck tasks (frames older than 3s with refcount > 0)
+//
+// Safety layers:
+//   1. Refcount cap (64) — prevents runaway refs from bugs
+//   2. Frame TTL (3s) — force-frees frames held by stuck tasks
+//   3. Global VRAM budget (1GB) — caps GPU cache allocation
+//
+// Thread-safe: map/queue methods lock internally; *_unlocked variants need acquire_lock().
+//
+// NOTE: This header is FFmpeg-free. CPU NV12 snapshots are owned malloc'd buffers.
+// The opaque `avframe`/`cpuAvframe` pointers are retained for ANSCV to free via av_frame_free.
+
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+#include <mutex>
+#include <atomic>
+#include <cstdint>
+#include <cstdlib>
+#include <chrono>
+#include <opencv2/core/mat.hpp>
+
+// Safety constants.
+// Declared `inline constexpr` (C++17) so every translation unit that includes
+// this header refers to the same entity; the inline member functions below
+// bind some of these by const reference (std::chrono constructors).
+inline constexpr int    MAX_FRAME_REFCOUNT    = 64;   // Max Mat* keys sharing one frame
+inline constexpr int    FRAME_TTL_SECONDS     = 3;    // Frame age that triggers forced eviction
+inline constexpr size_t GPU_CACHE_BUDGET_DEFAULT = 1ULL * 1024 * 1024 * 1024; // 1GB
+inline constexpr int    EVICT_CHECK_INTERVAL_MS  = 500; // Min spacing between eviction scans
+
+// Per-frame record shared by every cv::Mat* key that refers to the same
+// decoded frame. Move-only: a move transfers every owned pointer and leaves
+// the source holding no buffers and no valid GPU cache.
+// The struct has no destructor; ANSGpuFrameRegistry releases the owned
+// buffers before deleting a registered frame.
+struct GpuFrameData {
+    // --- CPU NV12 snapshot (OWNED malloc'd buffers, independent of decoder) ---
+    uint8_t*  cpuYPlane    = nullptr;   // malloc'd Y plane copy
+    uint8_t*  cpuUvPlane   = nullptr;   // malloc'd UV plane copy
+    int       cpuYLinesize = 0;         // Bytes per row in Y plane
+    int       cpuUvLinesize = 0;        // Bytes per row in UV plane
+
+    // --- GPU upload cache (created on first inference, shared across tasks) ---
+    void*     gpuCacheY    = nullptr;   // cudaMalloc'd Y on inference GPU
+    void*     gpuCacheUV   = nullptr;   // cudaMalloc'd UV on inference GPU
+    size_t    gpuCacheYPitch  = 0;      // Pitch of cached Y plane
+    size_t    gpuCacheUVPitch = 0;      // Pitch of cached UV plane
+    size_t    gpuCacheBytes   = 0;      // Total VRAM bytes (for budget tracking)
+    int       gpuCacheDeviceIdx = -1;   // GPU index where cache lives
+    bool      gpuCacheValid = false;    // true after first upload
+    // gpuCacheMutex is NOT here — use the registry mutex for cache creation
+
+    // --- Legacy opaque AVFrame pointers (freed by ANSCV via av_frame_free) ---
+    void*     avframe      = nullptr;   // Original CUDA or CPU AVFrame (owned)
+    void*     cpuAvframe   = nullptr;   // CPU fallback AVFrame (owned, may be nullptr)
+
+    // --- Frame metadata ---
+    int       width        = 0;
+    int       height       = 0;
+    int       pixelFormat  = 0;         // 23=NV12, 1000=BGR full-res
+    int       gpuIndex     = -1;        // GPU that decoded this frame
+    int64_t   pts          = 0;         // Presentation timestamp
+    bool      isCudaDevicePtr = false;  // Legacy: true if original was CUDA zero-copy
+
+    // --- Legacy NV12 plane pointers (point into avframe, used during transition) ---
+    // TODO: Remove once all consumers use cpuYPlane/cpuUvPlane via thread-local
+    uint8_t*  yPlane       = nullptr;
+    uint8_t*  uvPlane      = nullptr;
+    int       yLinesize    = 0;
+    int       uvLinesize   = 0;
+
+    // --- Lifecycle ---
+    std::atomic<int> refcount{1};       // Number of cv::Mat* keys linked to this frame
+    std::chrono::steady_clock::time_point createdAt;  // Stamped by the registry on attach
+
+    // Default constructor
+    GpuFrameData() = default;
+
+    // Move constructor (std::atomic is neither copyable nor movable, so the
+    // refcount value is copied by hand).
+    GpuFrameData(GpuFrameData&& o) noexcept
+        : cpuYPlane(o.cpuYPlane), cpuUvPlane(o.cpuUvPlane)
+        , cpuYLinesize(o.cpuYLinesize), cpuUvLinesize(o.cpuUvLinesize)
+        , gpuCacheY(o.gpuCacheY), gpuCacheUV(o.gpuCacheUV)
+        , gpuCacheYPitch(o.gpuCacheYPitch), gpuCacheUVPitch(o.gpuCacheUVPitch)
+        , gpuCacheBytes(o.gpuCacheBytes), gpuCacheDeviceIdx(o.gpuCacheDeviceIdx)
+        , gpuCacheValid(o.gpuCacheValid)
+        , avframe(o.avframe), cpuAvframe(o.cpuAvframe)
+        , width(o.width), height(o.height), pixelFormat(o.pixelFormat)
+        , gpuIndex(o.gpuIndex), pts(o.pts), isCudaDevicePtr(o.isCudaDevicePtr)
+        , yPlane(o.yPlane), uvPlane(o.uvPlane)
+        , yLinesize(o.yLinesize), uvLinesize(o.uvLinesize)
+        , refcount(o.refcount.load()), createdAt(o.createdAt)
+    {
+        // Null out source to prevent double-free of owned pointers
+        o.cpuYPlane = nullptr;
+        o.cpuUvPlane = nullptr;
+        o.gpuCacheY = nullptr;
+        o.gpuCacheUV = nullptr;
+        o.avframe = nullptr;
+        o.cpuAvframe = nullptr;
+        o.yPlane = nullptr;
+        o.uvPlane = nullptr;
+        o.gpuCacheBytes = 0;
+        // The cache pointers are gone, so the source must not claim a valid cache
+        o.gpuCacheValid = false;
+    }
+
+    // No copy
+    GpuFrameData(const GpuFrameData&) = delete;
+    GpuFrameData& operator=(const GpuFrameData&) = delete;
+};
+
+// Registry mapping cv::Mat* keys to reference-counted GpuFrameData records.
+//
+// Several keys may share one record (see addRef); the record is destroyed when
+// its last key is released, replaced, or when it outlives FRAME_TTL_SECONDS.
+// The registry never calls av_frame_free or cudaFree itself: pointers needing
+// those calls are queued and handed out by drain_pending()/drain_gpu_pending().
+//
+// Locking: every public method takes m_mutex internally except the *_unlocked
+// accessors (caller must hold the lock from acquire_lock()) and the VRAM budget
+// functions, which use atomics only.
+class ANSGpuFrameRegistry {
+public:
+    // Process-wide singleton. On Windows, header-only static locals are per-DLL.
+    // ANSCV.dll exports ANSGpuFrameRegistry_GetInstance() (defined in
+    // ANSGpuFrameRegistry.cpp); other DLLs find it via GetProcAddress at runtime.
+    static ANSGpuFrameRegistry& instance() {
+#ifdef _WIN32
+        static ANSGpuFrameRegistry* s_inst = resolveProcessWide();
+        return *s_inst;
+#else
+        static ANSGpuFrameRegistry reg;
+        return reg;
+#endif
+    }
+
+    // --- attach: register a new GpuFrameData keyed by cv::Mat* ---
+    // Moves `data` into a heap-allocated record owned by the registry; createdAt
+    // is stamped with the current time and refcount is reset to 1.
+    // If `mat` was already registered, that link is dropped first. When it was
+    // the old record's last reference the old record is destroyed: its
+    // cpuAvframe is queued for drain_pending() and its avframe is RETURNED
+    // instead of queued.
+    // Returns: the old avframe the caller must av_frame_free, or nullptr.
+    // A null `mat` returns nullptr and leaves `data` untouched.
+    void* attach(cv::Mat* mat, GpuFrameData&& data) {
+        if (!mat) return nullptr;
+
+        data.createdAt = std::chrono::steady_clock::now();
+        data.refcount.store(1);
+
+        auto* heapData = new GpuFrameData(std::move(data));
+
+        std::lock_guard<std::mutex> lock(m_mutex);
+
+        // If this Mat* already has an entry, drop that link (and possibly the frame)
+        void* oldAvframe = nullptr;
+        auto it = m_map.find(mat);
+        if (it != m_map.end())
+            oldAvframe = unlink_locked(it, /*queueAvframe=*/false);
+
+        m_map[mat] = heapData;
+        m_frameSet.insert(heapData);
+
+        return oldAvframe;  // Caller must av_frame_free if non-null
+    }
+
+    // --- addRef: link a cloned cv::Mat* to the same GpuFrameData as src ---
+    // Returns true if dst is linked to src's frame on return; false if either
+    // pointer is null, src == dst, src is not registered, or the refcount cap
+    // (MAX_FRAME_REFCOUNT) is reached.
+    // If dst was already linked to a different frame, that link is released
+    // exactly as release(dst) would do, so the old frame's refcount stays
+    // accurate. If dst is already linked to src's frame nothing changes.
+    bool addRef(cv::Mat* src, cv::Mat* dst) {
+        if (!src || !dst || src == dst) return false;
+
+        std::lock_guard<std::mutex> lock(m_mutex);
+
+        auto srcIt = m_map.find(src);
+        if (srcIt == m_map.end()) return false;
+        GpuFrameData* frame = srcIt->second;
+
+        auto dstIt = m_map.find(dst);
+        if (dstIt != m_map.end() && dstIt->second == frame)
+            return true;  // Already linked — do not count the same key twice
+
+        if (frame->refcount.load() >= MAX_FRAME_REFCOUNT) {
+            return false;  // Cap reached — caller falls back to BGR
+        }
+
+        // Drop dst's previous link first so that frame is not orphaned.
+        // Erasing dstIt leaves other iterators and `frame` itself valid.
+        if (dstIt != m_map.end())
+            unlink_locked(dstIt, /*queueAvframe=*/true);
+
+        frame->refcount.fetch_add(1);
+        m_map[dst] = frame;
+        return true;
+    }
+
+    // --- release: decrement refcount for this Mat*, free if 0 ---
+    // Removes the key. On the last reference the frame is destroyed and its
+    // avframe/cpuAvframe pointers are queued; the caller must drain_pending()
+    // and av_frame_free each returned pointer. Unknown or null keys are ignored.
+    void release(cv::Mat* mat) {
+        if (!mat) return;
+
+        std::lock_guard<std::mutex> lock(m_mutex);
+
+        auto it = m_map.find(mat);
+        if (it == m_map.end()) return;
+
+        unlink_locked(it, /*queueAvframe=*/true);
+    }
+
+    // --- lookup: find GpuFrameData by cv::Mat* (locking) ---
+    // Returns nullptr when the key is not registered. The lock is released
+    // before returning, so the pointer is not protected against a concurrent
+    // release() or eviction.
+    GpuFrameData* lookup(cv::Mat* mat) {
+        std::lock_guard<std::mutex> lock(m_mutex);
+        return lookup_unlocked(mat);
+    }
+
+    // --- lookup_unlocked: caller MUST hold lock via acquire_lock() ---
+    GpuFrameData* lookup_unlocked(cv::Mat* mat) {
+        auto it = m_map.find(mat);
+        return (it != m_map.end()) ? it->second : nullptr;
+    }
+
+    // --- Backward-compat: lookup by datastart (for transition period) ---
+    // Searches all entries for matching datastart. O(n) — avoid in hot path.
+    GpuFrameData* lookup_by_datastart(const uchar* datastart) {
+        std::lock_guard<std::mutex> lock(m_mutex);
+        return lookup_by_datastart_unlocked(datastart);
+    }
+
+    // Unlocked variant — caller MUST hold lock via acquire_lock().
+    // Dereferences every registered cv::Mat* key to read its datastart, so
+    // every key in the map must still point to a live cv::Mat.
+    GpuFrameData* lookup_by_datastart_unlocked(const uchar* datastart) {
+        if (!datastart) return nullptr;
+        for (auto& [mat, frame] : m_map) {
+            if (mat && mat->datastart == datastart)
+                return frame;
+        }
+        return nullptr;
+    }
+
+    // Acquire the registry lock explicitly (for use with the *_unlocked methods).
+    std::unique_lock<std::mutex> acquire_lock() {
+        return std::unique_lock<std::mutex>(m_mutex);
+    }
+
+    // Number of map entries (Mat* keys) — caller MUST hold lock.
+    size_t size_unlocked() const { return m_map.size(); }
+
+    // Number of unique frames alive — caller MUST hold lock.
+    size_t frame_count_unlocked() const { return m_frameSet.size(); }
+
+    // --- Drain pending avframe pointers for caller to av_frame_free ---
+    // Returns the queued pointers and leaves the queue empty.
+    std::vector<void*> drain_pending() {
+        std::lock_guard<std::mutex> lock(m_mutex);
+        std::vector<void*> result;
+        result.swap(m_pendingFree);
+        return result;
+    }
+
+    // --- Drain pending GPU device pointers for caller to cudaFree ---
+    // Returns the queued pointers and leaves the queue empty.
+    std::vector<void*> drain_gpu_pending() {
+        std::lock_guard<std::mutex> lock(m_mutex);
+        std::vector<void*> result;
+        result.swap(m_pendingGpuFree);
+        return result;
+    }
+
+    // --- TTL eviction: force-free frames older than FRAME_TTL_SECONDS ---
+    // Call periodically from camera threads (piggybacked on mat_replace).
+    // Scans at most once per EVICT_CHECK_INTERVAL_MS. An expired frame is
+    // destroyed regardless of its refcount: all keys pointing at it are removed
+    // and its AVFrame/GPU pointers are queued for the drain_* functions.
+    void evictStaleFrames() {
+        const auto now = std::chrono::steady_clock::now();
+
+        std::lock_guard<std::mutex> lock(m_mutex);
+
+        // Throttle: skip if called too frequently
+        if (now - m_lastEvictCheck < std::chrono::milliseconds(EVICT_CHECK_INTERVAL_MS))
+            return;
+        m_lastEvictCheck = now;
+
+        // Compare durations directly: truncating the age to whole seconds first
+        // would keep frames alive until a full extra second has elapsed.
+        const auto ttl = std::chrono::seconds(FRAME_TTL_SECONDS);
+        std::unordered_set<GpuFrameData*> stale;
+        for (GpuFrameData* frame : m_frameSet) {
+            if (now - frame->createdAt > ttl && frame->refcount.load() > 0)
+                stale.insert(frame);
+        }
+        if (stale.empty()) return;
+
+        // One pass over the map removes every key that points at a stale frame
+        for (auto it = m_map.begin(); it != m_map.end(); ) {
+            if (stale.count(it->second) != 0)
+                it = m_map.erase(it);
+            else
+                ++it;
+        }
+
+        for (GpuFrameData* frame : stale)
+            destroyFrame_locked(frame, /*queueAvframe=*/true);
+    }
+
+    // --- VRAM budget management (atomics only, no mutex) ---
+
+    // True if `bytes` more would still fit in the budget. This only checks; it
+    // does not reserve. Written so that used + bytes cannot wrap around.
+    bool canAllocateGpuCache(size_t bytes) const {
+        const size_t budget = m_gpuCacheBudget.load(std::memory_order_relaxed);
+        const size_t used   = m_totalGpuCacheBytes.load(std::memory_order_relaxed);
+        return used <= budget && bytes <= budget - used;
+    }
+
+    // Record that `bytes` of GPU cache were allocated.
+    void onGpuCacheCreated(size_t bytes) {
+        m_totalGpuCacheBytes.fetch_add(bytes, std::memory_order_relaxed);
+    }
+
+    // Record that `bytes` of GPU cache were released. If `bytes` exceeds the
+    // tracked total the counter is left unchanged rather than wrapping.
+    void onGpuCacheFreed(size_t bytes) {
+        // Prevent underflow
+        size_t old = m_totalGpuCacheBytes.load(std::memory_order_relaxed);
+        while (old >= bytes) {
+            // On failure compare_exchange_weak reloads `old`; re-test and retry
+            if (m_totalGpuCacheBytes.compare_exchange_weak(old, old - bytes,
+                    std::memory_order_relaxed))
+                break;
+        }
+    }
+
+    // Bytes currently accounted to GPU caches.
+    size_t totalGpuCacheBytes() const {
+        return m_totalGpuCacheBytes.load(std::memory_order_relaxed);
+    }
+
+    void setGpuCacheBudget(size_t bytes) {
+        m_gpuCacheBudget.store(bytes, std::memory_order_relaxed);
+    }
+    size_t gpuCacheBudget() const {
+        return m_gpuCacheBudget.load(std::memory_order_relaxed);
+    }
+
+private:
+    using MatMap = std::unordered_map<cv::Mat*, GpuFrameData*>;
+
+    ANSGpuFrameRegistry() = default;
+
+#ifdef _WIN32
+    static ANSGpuFrameRegistry* resolveProcessWide();
+#endif
+
+    // Remove the key at `it` and drop its reference; caller holds m_mutex.
+    // Destroys the frame when that was the last reference.
+    // Returns the frame's avframe only when the frame was destroyed and
+    // queueAvframe is false (the caller then owns it); otherwise nullptr.
+    void* unlink_locked(MatMap::iterator it, bool queueAvframe) {
+        GpuFrameData* frame = it->second;
+        m_map.erase(it);
+
+        if (frame->refcount.fetch_sub(1) > 1)
+            return nullptr;  // Other clones still reference it
+
+        void* handedBack = queueAvframe ? nullptr : frame->avframe;
+        destroyFrame_locked(frame, queueAvframe);
+        return handedBack;
+    }
+
+    // Destroy a frame that no map entry refers to any more; caller holds m_mutex.
+    // Queues cpuAvframe (and avframe when queueAvframe is true) for
+    // drain_pending(), frees the owned buffers, removes the frame from
+    // m_frameSet and deletes it.
+    void destroyFrame_locked(GpuFrameData* frame, bool queueAvframe) {
+        if (queueAvframe && frame->avframe)
+            m_pendingFree.push_back(frame->avframe);
+        if (frame->cpuAvframe)
+            m_pendingFree.push_back(frame->cpuAvframe);
+        freeOwnedBuffers_locked(frame);
+        m_frameSet.erase(frame);
+        delete frame;
+    }
+
+    // Free malloc'd CPU NV12 buffers and GPU cache (but NOT avframe/cpuAvframe —
+    // those go to pendingFree for the caller to av_frame_free).
+    void freeOwnedBuffers_locked(GpuFrameData* frame) {
+        std::free(frame->cpuYPlane);   // std::free(nullptr) is a no-op
+        frame->cpuYPlane = nullptr;
+        std::free(frame->cpuUvPlane);
+        frame->cpuUvPlane = nullptr;
+
+        // This header cannot call cudaFree, so a tracked GPU cache is removed
+        // from the budget here and its device pointers are queued for
+        // drain_gpu_pending(); the caller performs the actual cudaFree.
+        if (frame->gpuCacheBytes > 0) {
+            onGpuCacheFreed(frame->gpuCacheBytes);
+            // Mark as invalid so no one reads stale pointers
+            frame->gpuCacheValid = false;
+            frame->gpuCacheBytes = 0;
+            if (frame->gpuCacheY)
+                m_pendingGpuFree.push_back(frame->gpuCacheY);
+            if (frame->gpuCacheUV)
+                m_pendingGpuFree.push_back(frame->gpuCacheUV);
+            frame->gpuCacheY = nullptr;
+            frame->gpuCacheUV = nullptr;
+        }
+    }
+
+    std::mutex m_mutex;
+    MatMap m_map;                                  // Mat* key -> shared frame
+    std::unordered_set<GpuFrameData*> m_frameSet;  // All unique frames (for TTL scan)
+    std::vector<void*> m_pendingFree;     // AVFrame* pointers to av_frame_free
+    std::vector<void*> m_pendingGpuFree;  // CUDA device pointers to cudaFree
+    std::atomic<size_t> m_totalGpuCacheBytes{0};
+    std::atomic<size_t> m_gpuCacheBudget{GPU_CACHE_BUDGET_DEFAULT};
+    std::chrono::steady_clock::time_point m_lastEvictCheck;
+};
+
+// ── Convenience free functions (FFmpeg-agnostic) ────────────────────────
+
+// Find the GpuFrameData registered under a cv::Mat* (the primary key).
+// Forwards to the process-wide registry's locking lookup(); nullptr if absent.
+inline GpuFrameData* gpu_frame_lookup(cv::Mat* mat) {
+    ANSGpuFrameRegistry& registry = ANSGpuFrameRegistry::instance();
+    return registry.lookup(mat);
+}
+
+// Backward-compat overload: find the frame whose registered cv::Mat has this
+// datastart. Linear in the number of registered keys — avoid in hot paths.
+inline GpuFrameData* gpu_frame_lookup(const uchar* datastart) {
+    ANSGpuFrameRegistry& registry = ANSGpuFrameRegistry::instance();
+    return registry.lookup_by_datastart(datastart);
+}
+
+// Link a cloned Mat* (dst) to the GpuFrameData already registered for src.
+// Returns the registry's addRef() result: true when the link was made.
+inline bool gpu_frame_addref(cv::Mat* src, cv::Mat* dst) {
+    ANSGpuFrameRegistry& registry = ANSGpuFrameRegistry::instance();
+    const bool linked = registry.addRef(src, dst);
+    return linked;
+}
+
+// Take every GPU device pointer the registry has queued for release.
+// The queue is emptied; the caller must cudaFree each returned pointer.
+inline std::vector<void*> gpu_frame_drain_gpu_pending() {
+    ANSGpuFrameRegistry& registry = ANSGpuFrameRegistry::instance();
+    return registry.drain_gpu_pending();
+}