Refactor project structure

2026-03-28 19:56:39 +11:00
parent 1d267378b2
commit 8a2e721058
511 changed files with 59 additions and 48 deletions
--- a/modules/ANSCV/ANSGpuFrameOps.h
+++ b/modules/ANSCV/ANSGpuFrameOps.h
@@ -0,0 +1,212 @@
+#pragma once
+// ANSGpuFrameOps.h — FFmpeg-aware convenience functions for ANSGpuFrameRegistry.
+//
+// This header requires FFmpeg headers (libavutil/frame.h) and provides
+// typed attach/invalidate/remove operations that handle av_frame_clone/free.
+//
+// NEW DESIGN: Instead of storing AVFrame* references (which lock NVDEC surfaces),
+// we snapshot the CPU NV12 planes into malloc'd buffers and release the AVFrames
+// immediately. This prevents decoder surface pool exhaustion when many clones
+// hold references to the same frame.
+//
+// Include this in ANSCV/ANSRTSP (which link FFmpeg). For projects without
+// FFmpeg (ANSODEngine), include ANSGpuFrameRegistry.h directly and use
+// gpu_frame_lookup() + the GpuFrameData plane pointers.
+
+#include "ANSGpuFrameRegistry.h"
+
+extern "C" {
+#include "libavutil/frame.h"
+}
+
+#include <cstring>
+#include <cstdlib>
+
+namespace anscv_gpu_ops {
+namespace detail {
+
+// Snapshot the Y and interleaved-UV planes of a CPU-side NV12 AVFrame into two
+// tightly packed malloc'd buffers (row stride == bytes per row, no padding).
+//
+// Parameters:
+//   nv12                  Source frame. data[0]/linesize[0] are read as the Y
+//                         plane, data[1]/linesize[1] as the UV plane. The
+//                         frame's format field is NOT checked here.
+//   outY / outUV          Receive the newly allocated buffers. The caller owns
+//                         them and must release each one with std::free().
+//   outYLinesize          Receives the Y row stride in bytes (== width).
+//   outUVLinesize         Receives the UV row stride in bytes (width rounded up
+//                         to an even number: one U/V byte pair per 2 pixels).
+//   outWidth / outHeight  Receive the frame dimensions in pixels.
+//
+// Returns true on success. Returns false when the frame or either plane pointer
+// is null, when a dimension is not positive, or when malloc fails. Nothing the
+// caller has to free is left behind on failure; the size outputs are written
+// only on success, and on allocation failure outY/outUV are reset to nullptr.
+inline bool snapshotNV12Planes(const AVFrame* nv12,
+                                uint8_t*& outY, int& outYLinesize,
+                                uint8_t*& outUV, int& outUVLinesize,
+                                int& outWidth, int& outHeight) {
+    if (!nv12 || !nv12->data[0] || !nv12->data[1])
+        return false;
+
+    const int width  = nv12->width;
+    const int height = nv12->height;
+
+    // A non-positive dimension would turn into a huge size_t below.
+    if (width <= 0 || height <= 0)
+        return false;
+
+    // NV12 chroma is subsampled 2x in both directions and FFmpeg rounds the
+    // chroma plane size UP. An odd height therefore has (height + 1) / 2 UV
+    // rows, and an odd width still carries a full U/V byte pair for the last
+    // column.
+    const int uvRows     = height / 2 + height % 2;
+    const int uvRowBytes = width + width % 2;
+
+    const size_t yBytes  = static_cast<size_t>(width) * static_cast<size_t>(height);
+    const size_t uvBytes = static_cast<size_t>(uvRowBytes) * static_cast<size_t>(uvRows);
+
+    uint8_t* yBuf  = static_cast<uint8_t*>(std::malloc(yBytes));
+    uint8_t* uvBuf = static_cast<uint8_t*>(std::malloc(uvBytes));
+
+    if (!yBuf || !uvBuf) {
+        std::free(yBuf);
+        std::free(uvBuf);
+        outY = nullptr;
+        outUV = nullptr;
+        return false;
+    }
+
+    // Copy row by row: the source stride (linesize) may be larger than the
+    // payload width. Pointers are advanced instead of computing row * stride
+    // so no int multiplication can overflow on very large frames.
+    const auto copyRows = [](uint8_t* dst, int dstStride,
+                             const uint8_t* src, int srcStride,
+                             int rows, size_t rowBytes) {
+        for (int row = 0; row < rows; ++row) {
+            std::memcpy(dst, src, rowBytes);
+            dst += dstStride;
+            src += srcStride;
+        }
+    };
+
+    copyRows(yBuf, width, nv12->data[0], nv12->linesize[0],
+             height, static_cast<size_t>(width));
+    copyRows(uvBuf, uvRowBytes, nv12->data[1], nv12->linesize[1],
+             uvRows, static_cast<size_t>(uvRowBytes));
+
+    // Publish results only once everything has succeeded.
+    outY          = yBuf;
+    outUV         = uvBuf;
+    outYLinesize  = width;        // Packed (no alignment padding)
+    outUVLinesize = uvRowBytes;   // Packed interleaved U/V pairs
+    outWidth      = width;
+    outHeight     = height;
+    return true;
+}
+
+} // namespace detail
+} // namespace anscv_gpu_ops
+
+// Attach an NV12/YUV frame to the registry, keyed by cv::Mat* pointer.
+//
+// The frame's CPU planes are snapshotted into owned malloc'd buffers
+// (cpuYPlane/cpuUvPlane); the legacy yPlane/uvPlane fields alias the same
+// buffers. The AVFrame itself is stored in the entry (data.avframe) and is
+// freed once the registry hands it back, either as the return value of
+// attach() or through drain_pending().
+//
+// TAKES OWNERSHIP of nv12 — caller must NOT av_frame_free after this call.
+// This holds even when mat is null: the frame is freed here instead of leaked.
+//
+// Parameters:
+//   mat     Registry key. If null, nothing is registered.
+//   nv12    Frame to attach. If null, the call is a no-op.
+//   gpuIdx  Stored as the entry's gpuIndex.
+//   pts     Stored as the entry's pts.
+inline void gpu_frame_attach(cv::Mat* mat, AVFrame* nv12, int gpuIdx, int64_t pts) {
+    if (!nv12) return;
+    if (!mat) {
+        // Ownership was transferred to us, so release it rather than leak.
+        av_frame_free(&nv12);
+        return;
+    }
+
+    GpuFrameData data{};
+    data.gpuIndex    = gpuIdx;
+    data.pts         = pts;
+    data.pixelFormat = nv12->format;
+    data.width       = nv12->width;
+    data.height      = nv12->height;
+
+    // Snapshot NV12 planes to owned buffers
+    const bool snapshotOk = anscv_gpu_ops::detail::snapshotNV12Planes(
+        nv12,
+        data.cpuYPlane, data.cpuYLinesize,
+        data.cpuUvPlane, data.cpuUvLinesize,
+        data.width, data.height);
+
+    if (!snapshotOk) {
+        // Bad frame or out of memory. The entry is still registered so the
+        // AVFrame's ownership stays tracked, but make the "no CPU planes"
+        // state unambiguous: null pointers and zero strides.
+        data.cpuYPlane     = nullptr;
+        data.cpuUvPlane    = nullptr;
+        data.cpuYLinesize  = 0;
+        data.cpuUvLinesize = 0;
+    }
+
+    // Keep legacy pointers for backward compat during transition
+    data.yPlane     = data.cpuYPlane;
+    data.uvPlane    = data.cpuUvPlane;
+    data.yLinesize  = data.cpuYLinesize;
+    data.uvLinesize = data.cpuUvLinesize;
+
+    // Store AVFrame for legacy cleanup (will be freed by drain_pending)
+    data.avframe = nv12;
+
+    // attach() returns the AVFrame of the entry it replaced, if any.
+    void* old = ANSGpuFrameRegistry::instance().attach(mat, std::move(data));
+    if (old) {
+        AVFrame* oldFrame = static_cast<AVFrame*>(old);
+        av_frame_free(&oldFrame);
+    }
+
+    // Free stale entries evicted by TTL or previous attach
+    auto pending = ANSGpuFrameRegistry::instance().drain_pending();
+    for (void* p : pending) {
+        AVFrame* stale = static_cast<AVFrame*>(p);
+        av_frame_free(&stale);
+    }
+}
+
+// Attach CUDA HW frame — keeps CUDA device pointers for zero-copy inference.
+// TAKES OWNERSHIP of cudaFrame AND cpuNV12 — caller must NOT av_frame_free after.
+// This holds on the early-return path too: if mat or cudaFrame is null, both
+// frames are freed here instead of being leaked.
+//
+// Primary path: yPlane/uvPlane point to CUDA device pointers from the cloned
+// AVFrame (data[0]/data[1]).  The cloned AVFrame keeps the NVDEC surface alive
+// until gpu_frame_remove() is called after inference.  With 4 cameras each
+// holding ~1 surface, this uses 4 of NVDEC's 25-32 surface pool — safe.
+//
+// Fallback: cpuYPlane/cpuUvPlane hold CPU-side NV12 snapshot for cross-GPU
+// inference (when decode GPU != inference GPU, CUDA device ptrs aren't
+// accessible from another GPU context). If the snapshot fails, the CPU plane
+// pointers are null and their linesizes are 0.
+//
+// Parameters:
+//   mat        Registry key. If null, nothing is registered.
+//   cudaFrame  Frame whose data[0]/data[1] are stored as the Y/UV planes.
+//   gpuIdx     Stored as the entry's gpuIndex.
+//   pts        Stored as the entry's pts.
+//   cpuNV12    Optional CPU copy of the frame used for the fallback snapshot.
+inline void gpu_frame_attach_cuda(cv::Mat* mat, AVFrame* cudaFrame, int gpuIdx, int64_t pts,
+                                   AVFrame* cpuNV12 = nullptr) {
+    if (!mat || !cudaFrame) {
+        // Ownership was transferred to us; av_frame_free is a no-op on null.
+        av_frame_free(&cudaFrame);
+        av_frame_free(&cpuNV12);
+        return;
+    }
+
+    GpuFrameData data{};
+    data.gpuIndex        = gpuIdx;
+    data.pts             = pts;
+    data.width           = cudaFrame->width;
+    data.height          = cudaFrame->height;
+    data.pixelFormat     = AV_PIX_FMT_NV12; // the underlying sw_format
+
+    // Primary: CUDA device pointers from NVDEC (zero-copy on same GPU)
+    data.isCudaDevicePtr = true;
+    data.yPlane          = cudaFrame->data[0];   // CUDA device ptr: Y plane
+    data.uvPlane         = cudaFrame->data[1];   // CUDA device ptr: UV plane
+    data.yLinesize       = cudaFrame->linesize[0];
+    data.uvLinesize      = cudaFrame->linesize[1];
+
+    // Fallback: snapshot CPU NV12 for cross-GPU inference
+    if (cpuNV12) {
+        const bool snapshotOk = anscv_gpu_ops::detail::snapshotNV12Planes(
+            cpuNV12,
+            data.cpuYPlane, data.cpuYLinesize,
+            data.cpuUvPlane, data.cpuUvLinesize,
+            data.width, data.height);
+
+        if (!snapshotOk) {
+            // No usable CPU fallback: leave an unambiguous empty state so
+            // consumers can test the pointers instead of trusting a stride.
+            data.cpuYPlane     = nullptr;
+            data.cpuUvPlane    = nullptr;
+            data.cpuYLinesize  = 0;
+            data.cpuUvLinesize = 0;
+        }
+    }
+
+    // Store AVFrames for cleanup (cudaFrame keeps NVDEC surface alive)
+    data.avframe    = cudaFrame;
+    data.cpuAvframe = cpuNV12;
+
+    // attach() returns the AVFrame of the entry it replaced, if any.
+    void* old = ANSGpuFrameRegistry::instance().attach(mat, std::move(data));
+    if (old) {
+        AVFrame* oldFrame = static_cast<AVFrame*>(old);
+        av_frame_free(&oldFrame);
+    }
+
+    // Free AVFrames the registry has queued for release.
+    auto pending = ANSGpuFrameRegistry::instance().drain_pending();
+    for (void* p : pending) {
+        AVFrame* stale = static_cast<AVFrame*>(p);
+        av_frame_free(&stale);
+    }
+}
+
+// Drop the registry entry keyed by `mat`, then free every AVFrame the registry
+// has queued for release. A null `mat` is ignored.
+inline void gpu_frame_remove(cv::Mat* mat) {
+    if (mat == nullptr) {
+        return;
+    }
+
+    auto&& registry = ANSGpuFrameRegistry::instance();
+    registry.release(mat);
+
+    // AVFrames queued by this release (or by an earlier eviction) are freed here.
+    auto staleFrames = registry.drain_pending();
+    for (void* entry : staleFrames) {
+        AVFrame* frame = static_cast<AVFrame*>(entry);
+        av_frame_free(&frame);
+    }
+
+    // The pending GPU device pointers are drained but deliberately NOT freed:
+    // cudaFree needs a CUDA context on the calling thread, so the actual
+    // release is left to NV12PreprocessHelper. The pointers stay tracked in
+    // the budget.
+    auto pendingDevicePtrs = gpu_frame_drain_gpu_pending();
+    (void)pendingDevicePtrs;
+}
+
+// Same operation as gpu_frame_remove(); this name is used by ANSCV functions
+// that mutate a cv::Mat and must discard the GPU data attached to it.
+inline void gpu_frame_invalidate(cv::Mat* mat) {
+    return gpu_frame_remove(mat);
+}
+
+// Ask the registry to evict stale frames, then free every AVFrame it has
+// queued for release. Intended to be called periodically from camera threads.
+inline void gpu_frame_evict_stale() {
+    auto&& registry = ANSGpuFrameRegistry::instance();
+    registry.evictStaleFrames();
+
+    auto staleFrames = registry.drain_pending();
+    for (void* entry : staleFrames) {
+        AVFrame* frame = static_cast<AVFrame*>(entry);
+        av_frame_free(&frame);
+    }
+}