213 lines
7.8 KiB
C
213 lines
7.8 KiB
C
|
|
#pragma once
|
||
|
|
// ANSGpuFrameOps.h — FFmpeg-aware convenience functions for ANSGpuFrameRegistry.
|
||
|
|
//
|
||
|
|
// This header requires FFmpeg headers (libavutil/frame.h) and provides
|
||
|
|
// typed attach/invalidate/remove operations that handle av_frame_clone/free.
|
||
|
|
//
|
||
|
|
// NEW DESIGN: Instead of storing AVFrame* references (which lock NVDEC surfaces),
|
||
|
|
// we snapshot the CPU NV12 planes into malloc'd buffers and release the AVFrames
|
||
|
|
// immediately. This prevents decoder surface pool exhaustion when many clones
|
||
|
|
// hold references to the same frame.
|
||
|
|
//
|
||
|
|
// Include this in ANSCV/ANSRTSP (which link FFmpeg). For projects without
|
||
|
|
// FFmpeg (ANSODEngine), include ANSGpuFrameRegistry.h directly and use
|
||
|
|
// gpu_frame_lookup() + the GpuFrameData plane pointers.
|
||
|
|
|
||
|
|
#include "ANSGpuFrameRegistry.h"
|
||
|
|
|
||
|
|
extern "C" {
|
||
|
|
#include "libavutil/frame.h"
|
||
|
|
}
|
||
|
|
|
||
|
|
#include <cstring>
|
||
|
|
#include <cstdlib>
|
||
|
|
|
||
|
|
namespace anscv_gpu_ops {
|
||
|
|
namespace detail {
|
||
|
|
|
||
|
|
// Snapshot NV12 Y and UV planes from an AVFrame into malloc'd buffers.
|
||
|
|
// Returns true on success. Caller owns the output buffers.
|
||
|
|
inline bool snapshotNV12Planes(const AVFrame* nv12,
|
||
|
|
uint8_t*& outY, int& outYLinesize,
|
||
|
|
uint8_t*& outUV, int& outUVLinesize,
|
||
|
|
int& outWidth, int& outHeight) {
|
||
|
|
if (!nv12 || !nv12->data[0] || !nv12->data[1])
|
||
|
|
return false;
|
||
|
|
|
||
|
|
outWidth = nv12->width;
|
||
|
|
outHeight = nv12->height;
|
||
|
|
outYLinesize = nv12->width; // Packed (no alignment padding)
|
||
|
|
outUVLinesize = nv12->width; // UV interleaved: width bytes per row
|
||
|
|
|
||
|
|
size_t yBytes = static_cast<size_t>(outYLinesize) * outHeight;
|
||
|
|
size_t uvBytes = static_cast<size_t>(outUVLinesize) * (outHeight / 2);
|
||
|
|
|
||
|
|
outY = static_cast<uint8_t*>(std::malloc(yBytes));
|
||
|
|
outUV = static_cast<uint8_t*>(std::malloc(uvBytes));
|
||
|
|
|
||
|
|
if (!outY || !outUV) {
|
||
|
|
std::free(outY);
|
||
|
|
std::free(outUV);
|
||
|
|
outY = nullptr;
|
||
|
|
outUV = nullptr;
|
||
|
|
return false;
|
||
|
|
}
|
||
|
|
|
||
|
|
// Copy line-by-line (source may have padding via linesize > width)
|
||
|
|
const int srcYLinesize = nv12->linesize[0];
|
||
|
|
const int srcUVLinesize = nv12->linesize[1];
|
||
|
|
|
||
|
|
for (int row = 0; row < outHeight; ++row) {
|
||
|
|
std::memcpy(outY + row * outYLinesize,
|
||
|
|
nv12->data[0] + row * srcYLinesize,
|
||
|
|
outWidth);
|
||
|
|
}
|
||
|
|
for (int row = 0; row < outHeight / 2; ++row) {
|
||
|
|
std::memcpy(outUV + row * outUVLinesize,
|
||
|
|
nv12->data[1] + row * srcUVLinesize,
|
||
|
|
outWidth);
|
||
|
|
}
|
||
|
|
|
||
|
|
return true;
|
||
|
|
}
|
||
|
|
|
||
|
|
} // namespace detail
|
||
|
|
} // namespace anscv_gpu_ops
|
||
|
|
|
||
|
|
// Attach NV12/YUV frame keyed by cv::Mat* pointer.
|
||
|
|
// Snapshots CPU NV12 planes into owned malloc'd buffers, then releases the AVFrame.
|
||
|
|
// TAKES OWNERSHIP of nv12 — caller must NOT av_frame_free after this call.
|
||
|
|
inline void gpu_frame_attach(cv::Mat* mat, AVFrame* nv12, int gpuIdx, int64_t pts) {
|
||
|
|
if (!mat || !nv12) return;
|
||
|
|
|
||
|
|
GpuFrameData data{};
|
||
|
|
data.gpuIndex = gpuIdx;
|
||
|
|
data.pts = pts;
|
||
|
|
data.pixelFormat = nv12->format;
|
||
|
|
data.width = nv12->width;
|
||
|
|
data.height = nv12->height;
|
||
|
|
|
||
|
|
// Snapshot NV12 planes to owned buffers
|
||
|
|
bool ok = anscv_gpu_ops::detail::snapshotNV12Planes(
|
||
|
|
nv12,
|
||
|
|
data.cpuYPlane, data.cpuYLinesize,
|
||
|
|
data.cpuUvPlane, data.cpuUvLinesize,
|
||
|
|
data.width, data.height);
|
||
|
|
|
||
|
|
// Keep legacy pointers for backward compat during transition
|
||
|
|
data.yPlane = data.cpuYPlane;
|
||
|
|
data.uvPlane = data.cpuUvPlane;
|
||
|
|
data.yLinesize = data.cpuYLinesize;
|
||
|
|
data.uvLinesize = data.cpuUvLinesize;
|
||
|
|
|
||
|
|
// Store AVFrame for legacy cleanup (will be freed by drain_pending)
|
||
|
|
data.avframe = nv12;
|
||
|
|
|
||
|
|
void* old = ANSGpuFrameRegistry::instance().attach(mat, std::move(data));
|
||
|
|
if (old) {
|
||
|
|
AVFrame* oldFrame = static_cast<AVFrame*>(old);
|
||
|
|
av_frame_free(&oldFrame);
|
||
|
|
}
|
||
|
|
|
||
|
|
// Free stale entries evicted by TTL or previous attach
|
||
|
|
auto pending = ANSGpuFrameRegistry::instance().drain_pending();
|
||
|
|
for (void* p : pending) {
|
||
|
|
AVFrame* stale = static_cast<AVFrame*>(p);
|
||
|
|
av_frame_free(&stale);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
// Attach CUDA HW frame — keeps CUDA device pointers for zero-copy inference.
|
||
|
|
// TAKES OWNERSHIP of cudaFrame AND cpuNV12 — caller must NOT av_frame_free after.
|
||
|
|
//
|
||
|
|
// Primary path: yPlane/uvPlane point to CUDA device pointers from the cloned
|
||
|
|
// AVFrame (data[0]/data[1]). The cloned AVFrame keeps the NVDEC surface alive
|
||
|
|
// until gpu_frame_remove() is called after inference. With 4 cameras each
|
||
|
|
// holding ~1 surface, this uses 4 of NVDEC's 25-32 surface pool — safe.
|
||
|
|
//
|
||
|
|
// Fallback: cpuYPlane/cpuUvPlane hold CPU-side NV12 snapshot for cross-GPU
|
||
|
|
// inference (when decode GPU != inference GPU, CUDA device ptrs aren't
|
||
|
|
// accessible from another GPU context).
|
||
|
|
inline void gpu_frame_attach_cuda(cv::Mat* mat, AVFrame* cudaFrame, int gpuIdx, int64_t pts,
|
||
|
|
AVFrame* cpuNV12 = nullptr) {
|
||
|
|
if (!mat || !cudaFrame) return;
|
||
|
|
|
||
|
|
GpuFrameData data{};
|
||
|
|
data.gpuIndex = gpuIdx;
|
||
|
|
data.pts = pts;
|
||
|
|
data.width = cudaFrame->width;
|
||
|
|
data.height = cudaFrame->height;
|
||
|
|
data.pixelFormat = 23; // AV_PIX_FMT_NV12 — the underlying sw_format
|
||
|
|
|
||
|
|
// Primary: CUDA device pointers from NVDEC (zero-copy on same GPU)
|
||
|
|
data.isCudaDevicePtr = true;
|
||
|
|
data.yPlane = cudaFrame->data[0]; // CUDA device ptr: Y plane
|
||
|
|
data.uvPlane = cudaFrame->data[1]; // CUDA device ptr: UV plane
|
||
|
|
data.yLinesize = cudaFrame->linesize[0];
|
||
|
|
data.uvLinesize = cudaFrame->linesize[1];
|
||
|
|
|
||
|
|
// Fallback: snapshot CPU NV12 for cross-GPU inference
|
||
|
|
if (cpuNV12) {
|
||
|
|
anscv_gpu_ops::detail::snapshotNV12Planes(
|
||
|
|
cpuNV12,
|
||
|
|
data.cpuYPlane, data.cpuYLinesize,
|
||
|
|
data.cpuUvPlane, data.cpuUvLinesize,
|
||
|
|
data.width, data.height);
|
||
|
|
}
|
||
|
|
|
||
|
|
// Store AVFrames for cleanup (cudaFrame keeps NVDEC surface alive)
|
||
|
|
data.avframe = cudaFrame;
|
||
|
|
data.cpuAvframe = cpuNV12;
|
||
|
|
|
||
|
|
void* old = ANSGpuFrameRegistry::instance().attach(mat, std::move(data));
|
||
|
|
if (old) {
|
||
|
|
AVFrame* oldFrame = static_cast<AVFrame*>(old);
|
||
|
|
av_frame_free(&oldFrame);
|
||
|
|
}
|
||
|
|
|
||
|
|
auto pending = ANSGpuFrameRegistry::instance().drain_pending();
|
||
|
|
for (void* p : pending) {
|
||
|
|
AVFrame* stale = static_cast<AVFrame*>(p);
|
||
|
|
av_frame_free(&stale);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
// Release entry by cv::Mat* and free any returned AVFrames. Safe if not in map (no-op).
|
||
|
|
inline void gpu_frame_remove(cv::Mat* mat) {
|
||
|
|
if (!mat) return;
|
||
|
|
|
||
|
|
ANSGpuFrameRegistry::instance().release(mat);
|
||
|
|
|
||
|
|
// Free any AVFrames that became pending from this release or prior eviction
|
||
|
|
auto pending = ANSGpuFrameRegistry::instance().drain_pending();
|
||
|
|
for (void* p : pending) {
|
||
|
|
AVFrame* stale = static_cast<AVFrame*>(p);
|
||
|
|
av_frame_free(&stale);
|
||
|
|
}
|
||
|
|
|
||
|
|
// Free any GPU device pointers that became pending
|
||
|
|
auto gpuPending = gpu_frame_drain_gpu_pending();
|
||
|
|
// NOTE: cudaFree requires CUDA context — caller must be on a CUDA-capable thread.
|
||
|
|
// If not, these will leak. In practice, gpu_frame_remove is called from ANSCV
|
||
|
|
// camera threads which do have CUDA context.
|
||
|
|
// For safety, we skip cudaFree here and let NV12PreprocessHelper handle it.
|
||
|
|
// The GPU pointers are tracked in the budget and will be accounted for.
|
||
|
|
(void)gpuPending;
|
||
|
|
}
|
||
|
|
|
||
|
|
// Alias for remove — used in ANSCV mutating functions to drop stale GPU data.
|
||
|
|
inline void gpu_frame_invalidate(cv::Mat* mat) {
|
||
|
|
gpu_frame_remove(mat);
|
||
|
|
}
|
||
|
|
|
||
|
|
// Run TTL eviction + drain pending. Call periodically from camera threads.
|
||
|
|
inline void gpu_frame_evict_stale() {
|
||
|
|
ANSGpuFrameRegistry::instance().evictStaleFrames();
|
||
|
|
|
||
|
|
auto pending = ANSGpuFrameRegistry::instance().drain_pending();
|
||
|
|
for (void* p : pending) {
|
||
|
|
AVFrame* stale = static_cast<AVFrame*>(p);
|
||
|
|
av_frame_free(&stale);
|
||
|
|
}
|
||
|
|
}
|