// File: ANSCORE/modules/ANSCV/ANSGpuFrameOps.h
// (File-viewer metadata captured with this copy: 372 lines, 14 KiB, C++.)

#pragma once
// ANSGpuFrameOps.h — FFmpeg-aware convenience functions for ANSGpuFrameRegistry.
//
// This header requires FFmpeg headers (libavutil/frame.h) and provides
// typed attach/invalidate/remove operations that handle av_frame_clone/free.
//
// NEW DESIGN: Instead of storing AVFrame* references (which lock NVDEC surfaces),
// we snapshot the CPU NV12 planes into malloc'd buffers and release the AVFrames
// immediately. This prevents decoder surface pool exhaustion when many clones
// hold references to the same frame.
//
// Include this in ANSCV/ANSRTSP (which link FFmpeg). For projects without
// FFmpeg (ANSODEngine), include ANSGpuFrameRegistry.h directly and use
// gpu_frame_lookup() + the GpuFrameData plane pointers.
#include "ANSGpuFrameRegistry.h"
extern "C" {
#include "libavutil/frame.h"
}
#include <cuda_runtime.h>
#include <cstring>
#include <cstdlib>
#include <cstdio>
#ifdef _WIN32
#include <windows.h>
#endif
// Debug logging macro for GPU frame operations.
//
// Usage: GPU_FRAME_DBG("printf-style format", args...). `fmt` must be a string
// literal because it is concatenated at compile time with the "[GpuFrameOps] "
// prefix and a trailing newline.
//
// Windows: the message is formatted into a 512-byte stack buffer (snprintf
// truncates anything longer), then sent to BOTH OutputDebugStringA
// (DebugView / VS debugger) and stderr (console).
// Other platforms: the message goes to stderr only.
// Use Sysinternals DebugView (dbgview64.exe) to capture these after a crash.
//
// The #ifndef guard lets a translation unit define its own GPU_FRAME_DBG
// (e.g. a no-op) before including this header.
// `##__VA_ARGS__` removes the preceding comma when the macro is invoked with
// no variadic arguments.
#ifndef GPU_FRAME_DBG
#ifdef _WIN32
#define GPU_FRAME_DBG(fmt, ...) do { \
char _gpu_dbg_buf[512]; \
snprintf(_gpu_dbg_buf, sizeof(_gpu_dbg_buf), "[GpuFrameOps] " fmt "\n", ##__VA_ARGS__); \
OutputDebugStringA(_gpu_dbg_buf); \
fprintf(stderr, "%s", _gpu_dbg_buf); \
} while(0)
#else
#define GPU_FRAME_DBG(fmt, ...) \
fprintf(stderr, "[GpuFrameOps] " fmt "\n", ##__VA_ARGS__)
#endif
#endif
namespace anscv_gpu_ops {
namespace detail {
// Snapshot the Y and interleaved-UV planes of an NV12 AVFrame into freshly
// malloc'd, tightly packed buffers (destination row stride == width).
//
// Parameters:
//   nv12                  Source frame. data[0]/linesize[0] are read as the Y
//                         plane, data[1]/linesize[1] as the interleaved UV plane.
//   outY, outYLinesize    Receive the packed Y buffer and its stride (== width).
//   outUV, outUVLinesize  Receive the packed UV buffer and its stride (== width).
//   outWidth, outHeight   Receive the frame dimensions.
//
// The Y plane is copied as `height` rows and the UV plane as `height / 2` rows,
// `width` bytes per row in both cases.
//
// Returns true on success; the caller then owns outY/outUV and must release
// them with std::free(). Returns false when the frame or one of its planes is
// null, a dimension is not positive, a source stride is shorter than `width`
// (the frame does not have NV12 layout and copying would read out of bounds),
// or allocation fails. On failure outY/outUV are set to nullptr and the other
// outputs are left unchanged.
inline bool snapshotNV12Planes(const AVFrame* nv12,
                               uint8_t*& outY, int& outYLinesize,
                               uint8_t*& outUV, int& outUVLinesize,
                               int& outWidth, int& outHeight) {
    // Give the caller a well-defined "no buffers" state on every failure path.
    outY = nullptr;
    outUV = nullptr;

    if (!nv12 || !nv12->data[0] || !nv12->data[1])
        return false;

    const int width = nv12->width;
    const int height = nv12->height;
    if (width <= 0 || height <= 0)
        return false;

    // Every row copy reads `width` bytes from the source row, so each source
    // stride must span at least that many bytes. std::abs() keeps negative
    // (bottom-up) strides usable.
    const int srcYLinesize = nv12->linesize[0];
    const int srcUVLinesize = nv12->linesize[1];
    if (std::abs(srcYLinesize) < width || std::abs(srcUVLinesize) < width)
        return false;

    const int uvRows = height / 2;
    const size_t rowBytes = static_cast<size_t>(width);
    const size_t yBytes = rowBytes * static_cast<size_t>(height);
    const size_t uvBytes = rowBytes * static_cast<size_t>(uvRows);

    uint8_t* yBuf = static_cast<uint8_t*>(std::malloc(yBytes));
    // malloc(0) may legitimately return nullptr (height == 1 gives zero UV
    // rows); request at least one byte so that case is not reported as OOM.
    uint8_t* uvBuf = static_cast<uint8_t*>(std::malloc(uvBytes != 0 ? uvBytes : 1));
    if (!yBuf || !uvBuf) {
        std::free(yBuf);
        std::free(uvBuf);
        return false;
    }

    // Copy line-by-line (source may have padding via linesize > width).
    // Offsets are computed in 64-bit so row * stride cannot overflow int.
    for (int row = 0; row < height; ++row) {
        std::memcpy(yBuf + static_cast<size_t>(row) * rowBytes,
                    nv12->data[0] + static_cast<long long>(row) * srcYLinesize,
                    rowBytes);
    }
    for (int row = 0; row < uvRows; ++row) {
        std::memcpy(uvBuf + static_cast<size_t>(row) * rowBytes,
                    nv12->data[1] + static_cast<long long>(row) * srcUVLinesize,
                    rowBytes);
    }

    // Publish results only once everything has succeeded.
    outY = yBuf;
    outUV = uvBuf;
    outYLinesize = width;   // Packed (no alignment padding)
    outUVLinesize = width;  // UV interleaved: width bytes per row
    outWidth = width;
    outHeight = height;
    return true;
}
// Drain pending GPU device pointers and actually cudaFree them.
// Must be called from a thread with CUDA context available.
inline void drainAndFreeGpuPending() {
auto gpuPending = ANSGpuFrameRegistry::instance().drain_gpu_pending();
if (gpuPending.empty()) return;
GPU_FRAME_DBG("drainGpuPending: freeing %zu GPU ptrs", gpuPending.size());
int prevDev = -1;
cudaGetDevice(&prevDev);
// Group by device to minimize cudaSetDevice calls and synchronize once per device.
// cudaDeviceSynchronize() is CRITICAL: NV12 kernels run on cv::cuda::Stream
// (not the default stream). cudaFree on stream 0 doesn't wait for other
// streams, so without this sync, cudaFree can free a buffer while a kernel
// on another stream is still reading from it → cudaErrorIllegalAddress (700)
// which permanently corrupts the CUDA context.
int lastSyncDev = -1;
for (auto& entry : gpuPending) {
if (entry.ptr) {
if (entry.deviceIdx >= 0)
cudaSetDevice(entry.deviceIdx);
if (entry.deviceIdx != lastSyncDev) {
cudaDeviceSynchronize();
lastSyncDev = entry.deviceIdx;
}
GPU_FRAME_DBG("drainGpuPending: cudaFree(%p) dev=%d", entry.ptr, entry.deviceIdx);
cudaError_t err = cudaFree(entry.ptr);
if (err != cudaSuccess) {
GPU_FRAME_DBG("drainGpuPending: cudaFree FAILED err=%d (%s)",
(int)err, cudaGetErrorString(err));
}
}
}
if (prevDev >= 0)
cudaSetDevice(prevDev);
}
} // namespace detail
} // namespace anscv_gpu_ops
// Attach NV12/YUV frame keyed by cv::Mat* pointer.
// Snapshots CPU NV12 planes into owned malloc'd buffers and registers them,
// together with the AVFrame itself, under `mat`.
//
// TAKES OWNERSHIP of nv12 — caller must NOT av_frame_free after this call.
// That holds on every path: when `mat` is null nothing can be attached, so the
// frame is freed here instead of being leaked.
//
// Parameters:
//   mat     Registry key. If null, nv12 is released and the call returns.
//   nv12    Frame to attach. If null, the call is a no-op.
//   gpuIdx  Stored as GpuFrameData::gpuIndex.
//   pts     Stored as GpuFrameData::pts.
//
// If the plane snapshot fails, the entry is still attached but with null plane
// pointers and zero linesizes, so consumers see a consistent "no CPU planes"
// state.
inline void gpu_frame_attach(cv::Mat* mat, AVFrame* nv12, int gpuIdx, int64_t pts) {
    if (!nv12) return;
    if (!mat) {
        // Ownership was transferred to us; honour it even though we bail out.
        av_frame_free(&nv12);
        return;
    }

    GpuFrameData data{};
    data.gpuIndex = gpuIdx;
    data.pts = pts;
    data.pixelFormat = nv12->format;
    data.width = nv12->width;
    data.height = nv12->height;

    // Snapshot NV12 planes to owned buffers
    const bool ok = anscv_gpu_ops::detail::snapshotNV12Planes(
        nv12,
        data.cpuYPlane, data.cpuYLinesize,
        data.cpuUvPlane, data.cpuUvLinesize,
        data.width, data.height);
    if (!ok) {
        // Normalise the failure state: no planes, no strides, frame dimensions.
        data.cpuYPlane = nullptr;
        data.cpuUvPlane = nullptr;
        data.cpuYLinesize = 0;
        data.cpuUvLinesize = 0;
        data.width = nv12->width;
        data.height = nv12->height;
    }

    // Keep legacy pointers for backward compat during transition
    data.yPlane = data.cpuYPlane;
    data.uvPlane = data.cpuUvPlane;
    data.yLinesize = data.cpuYLinesize;
    data.uvLinesize = data.cpuUvLinesize;

    // Store AVFrame for legacy cleanup (will be freed by drain_pending)
    data.avframe = nv12;

    // attach() hands back a previously stored AVFrame for this key, if any.
    void* old = ANSGpuFrameRegistry::instance().attach(mat, std::move(data));
    if (old) {
        AVFrame* oldFrame = static_cast<AVFrame*>(old);
        av_frame_free(&oldFrame);
    }

    // Free stale entries evicted by TTL or previous attach
    auto pending = ANSGpuFrameRegistry::instance().drain_pending();
    for (void* p : pending) {
        AVFrame* stale = static_cast<AVFrame*>(p);
        av_frame_free(&stale);
    }
}
// Attach CUDA HW frame — copies NV12 from NVDEC surfaces to owned GPU memory.
// TAKES OWNERSHIP of cudaFrame AND cpuNV12 — caller must NOT av_frame_free after.
// Both frames are released on every path, including the early return taken
// when `mat` or `cudaFrame` is null.
//
// D2D copy path: cudaMemcpy2D from NVDEC surfaces to cudaMallocPitch'd buffers
// on the same GPU. This decouples the NV12 data lifetime from the NVDEC decoder,
// so player->close() can safely destroy the decoder at any time without
// invalidating pointers that inference engines may be reading. The NVDEC
// surface is freed (av_frame_free) as soon as the copy has completed, returning
// it to the decoder's surface pool.
//
// The owned GPU pointers are stored as both yPlane/uvPlane (for zero-copy reads)
// and gpuCacheY/gpuCacheUV (for lifecycle management / cudaFree on cleanup).
//
// VRAM budget: if the global GPU cache budget is exceeded, falls back to CPU-only
// NV12 snapshot (no zero-copy, but safe). The same fallback is used when the
// frame has non-positive dimensions or null device planes, or when any CUDA call
// (device selection, allocation, copy, synchronization) fails.
//
// Fallback: cpuYPlane/cpuUvPlane hold CPU-side NV12 snapshot for cross-GPU
// inference (when decode GPU != inference GPU).
//
// Parameters:
//   mat        Registry key.
//   cudaFrame  Frame whose data[0]/data[1] are device pointers to the Y / UV planes.
//   gpuIdx     Device to allocate on; if negative the current device is used.
//   pts        Stored as GpuFrameData::pts.
//   cpuNV12    Optional CPU-side NV12 copy of the same frame.
inline void gpu_frame_attach_cuda(cv::Mat* mat, AVFrame* cudaFrame, int gpuIdx, int64_t pts,
                                  AVFrame* cpuNV12 = nullptr) {
    if (!mat || !cudaFrame) {
        GPU_FRAME_DBG("attach_cuda: SKIP mat=%p cudaFrame=%p", (void*)mat, (void*)cudaFrame);
        // Ownership was transferred to us; release whatever we were given so
        // the frames (and the NVDEC surface) are not leaked.
        if (cudaFrame) av_frame_free(&cudaFrame);
        if (cpuNV12) av_frame_free(&cpuNV12);
        return;
    }

    const int w = cudaFrame->width;
    const int h = cudaFrame->height;
    GPU_FRAME_DBG("attach_cuda: START mat=%p %dx%d gpu=%d nvdecY=%p nvdecUV=%p cpuNV12=%p",
                  (void*)mat, w, h, gpuIdx,
                  (void*)cudaFrame->data[0], (void*)cudaFrame->data[1], (void*)cpuNV12);

    GpuFrameData data{};
    data.gpuIndex = gpuIdx;
    data.pts = pts;
    data.width = w;
    data.height = h;
    data.pixelFormat = 23; // AV_PIX_FMT_NV12

    // Snapshot CPU NV12 for cross-GPU fallback (must do before freeing cpuNV12)
    if (cpuNV12) {
        const bool cpuOk = anscv_gpu_ops::detail::snapshotNV12Planes(
            cpuNV12,
            data.cpuYPlane, data.cpuYLinesize,
            data.cpuUvPlane, data.cpuUvLinesize,
            data.width, data.height);
        if (!cpuOk) {
            GPU_FRAME_DBG("attach_cuda: CPU NV12 snapshot FAILED cpuNV12=%p", (void*)cpuNV12);
            // Keep the entry consistent: no CPU planes, dimensions of the CUDA frame.
            data.cpuYPlane = nullptr;
            data.cpuUvPlane = nullptr;
            data.cpuYLinesize = 0;
            data.cpuUvLinesize = 0;
            data.width = w;
            data.height = h;
        }
    }

    // --- D2D copy: NVDEC surface → owned GPU memory ---
    // Only attempted for a sane frame; otherwise go straight to the CPU fallback.
    const bool frameUsable = (w > 0 && h > 0 && cudaFrame->data[0] && cudaFrame->data[1]);
    // Estimate VRAM needed for the owned NV12 copy
    const size_t yBytes = frameUsable ? static_cast<size_t>(w) * h : 0;
    const size_t uvBytes = frameUsable ? static_cast<size_t>(w) * (h / 2) : 0;
    const size_t totalBytes = yBytes + uvBytes;

    bool d2dOk = false;
    if (frameUsable && ANSGpuFrameRegistry::instance().canAllocateGpuCache(totalBytes)) {
        int prevDev = -1;
        if (cudaGetDevice(&prevDev) != cudaSuccess) {
            prevDev = -1;
            (void)cudaGetLastError();
        }

        cudaError_t devErr = cudaSuccess;
        if (gpuIdx >= 0)
            devErr = cudaSetDevice(gpuIdx);

        if (devErr != cudaSuccess) {
            // Allocating now would land on the wrong device while the entry
            // claims gpuIdx, so skip the GPU path entirely.
            GPU_FRAME_DBG("attach_cuda: cudaSetDevice(%d) FAILED err=%d (%s) — fallback CPU",
                          gpuIdx, (int)devErr, cudaGetErrorString(devErr));
            (void)cudaGetLastError();
        } else {
            // Device the buffers really live on (gpuIdx < 0 means "current device").
            const int allocDev = (gpuIdx >= 0) ? gpuIdx : prevDev;

            void* ownedY = nullptr;
            void* ownedUV = nullptr;
            size_t yPitch = 0;
            size_t uvPitch = 0;
            cudaError_t e1 = cudaMallocPitch(&ownedY, &yPitch, w, h);
            cudaError_t e2 = cudaMallocPitch(&ownedUV, &uvPitch, w, h / 2);
            if (e1 == cudaSuccess && e2 == cudaSuccess) {
                cudaError_t e3 = cudaMemcpy2D(ownedY, yPitch,
                                              cudaFrame->data[0], cudaFrame->linesize[0],
                                              w, h, cudaMemcpyDeviceToDevice);
                cudaError_t e4 = cudaMemcpy2D(ownedUV, uvPitch,
                                              cudaFrame->data[1], cudaFrame->linesize[1],
                                              w, h / 2, cudaMemcpyDeviceToDevice);
                // Device-to-device copies are not host-synchronous: the call can
                // return before the data has moved. The source surface is handed
                // back to the decoder right after this block, so wait for the
                // copies to finish first.
                cudaError_t e5 = cudaSuccess;
                if (e3 == cudaSuccess && e4 == cudaSuccess)
                    e5 = cudaStreamSynchronize(0);

                if (e3 == cudaSuccess && e4 == cudaSuccess && e5 == cudaSuccess) {
                    // Store owned GPU pointers as primary NV12 source
                    data.isCudaDevicePtr = true;
                    data.yPlane = static_cast<uint8_t*>(ownedY);
                    data.uvPlane = static_cast<uint8_t*>(ownedUV);
                    data.yLinesize = static_cast<int>(yPitch);
                    data.uvLinesize = static_cast<int>(uvPitch);
                    // Track in gpuCache for lifecycle management (cudaFree on cleanup)
                    data.gpuCacheY = ownedY;
                    data.gpuCacheUV = ownedUV;
                    data.gpuCacheYPitch = yPitch;
                    data.gpuCacheUVPitch = uvPitch;
                    data.gpuCacheDeviceIdx = allocDev;
                    data.gpuCacheValid = true;
                    data.gpuCacheBytes = yPitch * h + uvPitch * (h / 2);
                    ANSGpuFrameRegistry::instance().onGpuCacheCreated(data.gpuCacheBytes);
                    d2dOk = true;
                    GPU_FRAME_DBG("attach_cuda: D2D OK ownedY=%p ownedUV=%p yPitch=%zu uvPitch=%zu bytes=%zu",
                                  ownedY, ownedUV, yPitch, uvPitch, data.gpuCacheBytes);
                } else {
                    // D2D copy failed — free allocated memory and fall back
                    GPU_FRAME_DBG("attach_cuda: D2D COPY FAILED e3=%d e4=%d sync=%d — fallback CPU",
                                  (int)e3, (int)e4, (int)e5);
                    cudaFree(ownedY);
                    cudaFree(ownedUV);
                    // Clear the recorded error so unrelated cudaGetLastError()
                    // checks on this thread don't report it later.
                    (void)cudaGetLastError();
                }
            } else {
                // Allocation failed — free any partial allocation and fall back
                GPU_FRAME_DBG("attach_cuda: cudaMallocPitch FAILED e1=%d e2=%d — fallback CPU",
                              (int)e1, (int)e2);
                if (e1 == cudaSuccess) cudaFree(ownedY);
                if (e2 == cudaSuccess) cudaFree(ownedUV);
                (void)cudaGetLastError();
            }
        }

        if (prevDev >= 0)
            cudaSetDevice(prevDev);
    }

    if (!d2dOk) {
        // Fall back to CPU NV12 snapshot only (no zero-copy)
        GPU_FRAME_DBG("attach_cuda: FALLBACK CPU-only cpuY=%p cpuUV=%p",
                      (void*)data.cpuYPlane, (void*)data.cpuUvPlane);
        data.isCudaDevicePtr = false;
        data.yPlane = data.cpuYPlane;
        data.uvPlane = data.cpuUvPlane;
        data.yLinesize = data.cpuYLinesize;
        data.uvLinesize = data.cpuUvLinesize;
    }

    // Release AVFrames immediately — NVDEC surfaces returned to pool.
    // No longer stored in GpuFrameData (owned GPU copy is independent).
    GPU_FRAME_DBG("attach_cuda: freeing AVFrames cudaFrame=%p cpuNV12=%p",
                  (void*)cudaFrame, (void*)cpuNV12);
    av_frame_free(&cudaFrame);
    if (cpuNV12) av_frame_free(&cpuNV12);
    data.avframe = nullptr;
    data.cpuAvframe = nullptr;

    GPU_FRAME_DBG("attach_cuda: FINAL yPlane=%p uvPlane=%p isCuda=%d gpuCacheY=%p gpuCacheUV=%p",
                  (void*)data.yPlane, (void*)data.uvPlane, (int)data.isCudaDevicePtr,
                  data.gpuCacheY, data.gpuCacheUV);

    // attach() hands back a previously stored AVFrame for this key, if any.
    void* old = ANSGpuFrameRegistry::instance().attach(mat, std::move(data));
    if (old) {
        AVFrame* oldFrame = static_cast<AVFrame*>(old);
        av_frame_free(&oldFrame);
    }

    // Free stale AVFrames evicted by TTL or previous attach
    auto pending = ANSGpuFrameRegistry::instance().drain_pending();
    for (void* p : pending) {
        AVFrame* stale = static_cast<AVFrame*>(p);
        av_frame_free(&stale);
    }

    // Free stale GPU device pointers
    anscv_gpu_ops::detail::drainAndFreeGpuPending();
}
// Drop the registry entry keyed by `mat`, then reclaim everything the registry
// has queued for deferred destruction: AVFrames first, GPU device pointers
// second. A null `mat` is ignored; a key that is not in the map is expected to
// be a no-op inside release().
inline void gpu_frame_remove(cv::Mat* mat) {
    if (mat == nullptr) {
        return;
    }
    GPU_FRAME_DBG("gpu_frame_remove: mat=%p", (void*)mat);

    auto&& registry = ANSGpuFrameRegistry::instance();
    registry.release(mat);

    // AVFrames made pending by this release or by an earlier eviction.
    auto staleFrames = registry.drain_pending();
    for (auto it = staleFrames.begin(); it != staleFrames.end(); ++it) {
        AVFrame* frame = static_cast<AVFrame*>(*it);
        av_frame_free(&frame);
    }

    // GPU device pointers made pending the same way.
    anscv_gpu_ops::detail::drainAndFreeGpuPending();
}
// Same operation as gpu_frame_remove() under a different name. ANSCV functions
// that mutate a cv::Mat call this to discard GPU data that no longer matches it.
inline void gpu_frame_invalidate(cv::Mat* mat) {
    return gpu_frame_remove(mat);
}
// Run TTL eviction + drain pending. Call periodically from camera threads.
inline void gpu_frame_evict_stale() {
ANSGpuFrameRegistry::instance().evictStaleFrames();
auto pending = ANSGpuFrameRegistry::instance().drain_pending();
for (void* p : pending) {
AVFrame* stale = static_cast<AVFrame*>(p);
av_frame_free(&stale);
}
// Free any GPU device pointers from evicted frames
anscv_gpu_ops::detail::drainAndFreeGpuPending();
}