#pragma once
// ANSGpuFrameOps.h — FFmpeg-aware convenience functions for ANSGpuFrameRegistry.
//
// This header requires FFmpeg headers (libavutil/frame.h) and provides
// typed attach/invalidate/remove operations that handle av_frame_clone/free.
//
// NEW DESIGN: Instead of storing AVFrame* references (which lock NVDEC surfaces),
// we snapshot the CPU NV12 planes into malloc'd buffers and release the AVFrames
// immediately. This prevents decoder surface pool exhaustion when many clones
// hold references to the same frame.
//
// Include this in ANSCV/ANSRTSP (which link FFmpeg). For projects without
// FFmpeg (ANSODEngine), include ANSGpuFrameRegistry.h directly and use
// gpu_frame_lookup() + the GpuFrameData plane pointers.

#include "ANSGpuFrameRegistry.h"

extern "C" {
#include "libavutil/frame.h"
}

#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <utility>

namespace anscv_gpu_ops {
namespace detail {

// Snapshot NV12 Y and UV planes from an AVFrame into malloc'd buffers.
// Returns true on success. Caller owns the output buffers.
inline bool snapshotNV12Planes(const AVFrame* nv12, uint8_t*& outY, int& outYLinesize, uint8_t*& outUV, int& outUVLinesize, int& outWidth, int& outHeight) { if (!nv12 || !nv12->data[0] || !nv12->data[1]) return false; outWidth = nv12->width; outHeight = nv12->height; outYLinesize = nv12->width; // Packed (no alignment padding) outUVLinesize = nv12->width; // UV interleaved: width bytes per row size_t yBytes = static_cast(outYLinesize) * outHeight; size_t uvBytes = static_cast(outUVLinesize) * (outHeight / 2); outY = static_cast(std::malloc(yBytes)); outUV = static_cast(std::malloc(uvBytes)); if (!outY || !outUV) { std::free(outY); std::free(outUV); outY = nullptr; outUV = nullptr; return false; } // Copy line-by-line (source may have padding via linesize > width) const int srcYLinesize = nv12->linesize[0]; const int srcUVLinesize = nv12->linesize[1]; for (int row = 0; row < outHeight; ++row) { std::memcpy(outY + row * outYLinesize, nv12->data[0] + row * srcYLinesize, outWidth); } for (int row = 0; row < outHeight / 2; ++row) { std::memcpy(outUV + row * outUVLinesize, nv12->data[1] + row * srcUVLinesize, outWidth); } return true; } } // namespace detail } // namespace anscv_gpu_ops // Attach NV12/YUV frame keyed by cv::Mat* pointer. // Snapshots CPU NV12 planes into owned malloc'd buffers, then releases the AVFrame. // TAKES OWNERSHIP of nv12 — caller must NOT av_frame_free after this call. 
inline void gpu_frame_attach(cv::Mat* mat, AVFrame* nv12, int gpuIdx, int64_t pts) { if (!mat || !nv12) return; GpuFrameData data{}; data.gpuIndex = gpuIdx; data.pts = pts; data.pixelFormat = nv12->format; data.width = nv12->width; data.height = nv12->height; // Snapshot NV12 planes to owned buffers bool ok = anscv_gpu_ops::detail::snapshotNV12Planes( nv12, data.cpuYPlane, data.cpuYLinesize, data.cpuUvPlane, data.cpuUvLinesize, data.width, data.height); // Keep legacy pointers for backward compat during transition data.yPlane = data.cpuYPlane; data.uvPlane = data.cpuUvPlane; data.yLinesize = data.cpuYLinesize; data.uvLinesize = data.cpuUvLinesize; // Store AVFrame for legacy cleanup (will be freed by drain_pending) data.avframe = nv12; void* old = ANSGpuFrameRegistry::instance().attach(mat, std::move(data)); if (old) { AVFrame* oldFrame = static_cast(old); av_frame_free(&oldFrame); } // Free stale entries evicted by TTL or previous attach auto pending = ANSGpuFrameRegistry::instance().drain_pending(); for (void* p : pending) { AVFrame* stale = static_cast(p); av_frame_free(&stale); } } // Attach CUDA HW frame — keeps CUDA device pointers for zero-copy inference. // TAKES OWNERSHIP of cudaFrame AND cpuNV12 — caller must NOT av_frame_free after. // // Primary path: yPlane/uvPlane point to CUDA device pointers from the cloned // AVFrame (data[0]/data[1]). The cloned AVFrame keeps the NVDEC surface alive // until gpu_frame_remove() is called after inference. With 4 cameras each // holding ~1 surface, this uses 4 of NVDEC's 25-32 surface pool — safe. // // Fallback: cpuYPlane/cpuUvPlane hold CPU-side NV12 snapshot for cross-GPU // inference (when decode GPU != inference GPU, CUDA device ptrs aren't // accessible from another GPU context). 
inline void gpu_frame_attach_cuda(cv::Mat* mat, AVFrame* cudaFrame, int gpuIdx, int64_t pts, AVFrame* cpuNV12 = nullptr) { if (!mat || !cudaFrame) return; GpuFrameData data{}; data.gpuIndex = gpuIdx; data.pts = pts; data.width = cudaFrame->width; data.height = cudaFrame->height; data.pixelFormat = 23; // AV_PIX_FMT_NV12 — the underlying sw_format // Primary: CUDA device pointers from NVDEC (zero-copy on same GPU) data.isCudaDevicePtr = true; data.yPlane = cudaFrame->data[0]; // CUDA device ptr: Y plane data.uvPlane = cudaFrame->data[1]; // CUDA device ptr: UV plane data.yLinesize = cudaFrame->linesize[0]; data.uvLinesize = cudaFrame->linesize[1]; // Fallback: snapshot CPU NV12 for cross-GPU inference if (cpuNV12) { anscv_gpu_ops::detail::snapshotNV12Planes( cpuNV12, data.cpuYPlane, data.cpuYLinesize, data.cpuUvPlane, data.cpuUvLinesize, data.width, data.height); } // Store AVFrames for cleanup (cudaFrame keeps NVDEC surface alive) data.avframe = cudaFrame; data.cpuAvframe = cpuNV12; void* old = ANSGpuFrameRegistry::instance().attach(mat, std::move(data)); if (old) { AVFrame* oldFrame = static_cast(old); av_frame_free(&oldFrame); } auto pending = ANSGpuFrameRegistry::instance().drain_pending(); for (void* p : pending) { AVFrame* stale = static_cast(p); av_frame_free(&stale); } } // Release entry by cv::Mat* and free any returned AVFrames. Safe if not in map (no-op). inline void gpu_frame_remove(cv::Mat* mat) { if (!mat) return; ANSGpuFrameRegistry::instance().release(mat); // Free any AVFrames that became pending from this release or prior eviction auto pending = ANSGpuFrameRegistry::instance().drain_pending(); for (void* p : pending) { AVFrame* stale = static_cast(p); av_frame_free(&stale); } // Free any GPU device pointers that became pending auto gpuPending = gpu_frame_drain_gpu_pending(); // NOTE: cudaFree requires CUDA context — caller must be on a CUDA-capable thread. // If not, these will leak. 
In practice, gpu_frame_remove is called from ANSCV // camera threads which do have CUDA context. // For safety, we skip cudaFree here and let NV12PreprocessHelper handle it. // The GPU pointers are tracked in the budget and will be accounted for. (void)gpuPending; } // Alias for remove — used in ANSCV mutating functions to drop stale GPU data. inline void gpu_frame_invalidate(cv::Mat* mat) { gpu_frame_remove(mat); } // Run TTL eviction + drain pending. Call periodically from camera threads. inline void gpu_frame_evict_stale() { ANSGpuFrameRegistry::instance().evictStaleFrames(); auto pending = ANSGpuFrameRegistry::instance().drain_pending(); for (void* p : pending) { AVFrame* stale = static_cast(p); av_frame_free(&stale); } }