2026-03-28 16:54:11 +11:00
|
|
|
#pragma once
|
|
|
|
|
// ANSGpuFrameOps.h — FFmpeg-aware convenience functions for ANSGpuFrameRegistry.
|
|
|
|
|
//
|
|
|
|
|
// This header requires FFmpeg headers (libavutil/frame.h) and provides
|
|
|
|
|
// typed attach/invalidate/remove operations that handle av_frame_clone/free.
|
|
|
|
|
//
|
|
|
|
|
// NEW DESIGN: Instead of storing AVFrame* references (which lock NVDEC surfaces),
|
|
|
|
|
// we snapshot the CPU NV12 planes into malloc'd buffers and release the AVFrames
|
|
|
|
|
// immediately. This prevents decoder surface pool exhaustion when many clones
|
|
|
|
|
// hold references to the same frame.
|
|
|
|
|
//
|
|
|
|
|
// Include this in ANSCV/ANSRTSP (which link FFmpeg). For projects without
|
|
|
|
|
// FFmpeg (ANSODEngine), include ANSGpuFrameRegistry.h directly and use
|
|
|
|
|
// gpu_frame_lookup() + the GpuFrameData plane pointers.
|
|
|
|
|
|
|
|
|
|
#include "ANSGpuFrameRegistry.h"
|
2026-04-03 14:51:52 +11:00
|
|
|
#include "GpuNV12SlotPool.h"
|
2026-03-28 16:54:11 +11:00
|
|
|
|
|
|
|
|
extern "C" {
|
|
|
|
|
#include "libavutil/frame.h"
|
|
|
|
|
}
|
|
|
|
|
|
2026-04-02 22:07:27 +11:00
|
|
|
#include <cuda_runtime.h>
|
2026-03-28 16:54:11 +11:00
|
|
|
#include <cstring>
|
|
|
|
|
#include <cstdlib>
|
2026-04-02 22:07:27 +11:00
|
|
|
#include <cstdio>
|
|
|
|
|
|
|
|
|
|
#ifdef _WIN32
|
|
|
|
|
#include <windows.h>
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
// GPU_FRAME_DBG(fmt, ...) — printf-style debug logging for GPU frame operations.
//
// Verbose per-frame logging is compiled in only when ANSCORE_GPU_DEBUG is
// defined to a non-zero value. In every other build the macro expands to a
// no-op expression, so its arguments are never evaluated.
//
// Each message is prefixed with "[GpuFrameOps] " and terminated with '\n'.
// On Windows the formatted line (truncated to 512 bytes) goes to both
// OutputDebugStringA and stderr; elsewhere it goes to stderr only.
//
// A translation unit may define GPU_FRAME_DBG before including this header
// to substitute its own sink.
#ifndef GPU_FRAME_DBG
#  if !defined(ANSCORE_GPU_DEBUG) || !ANSCORE_GPU_DEBUG
#    define GPU_FRAME_DBG(fmt, ...) ((void)0)
#  elif defined(_WIN32)
#    define GPU_FRAME_DBG(fmt, ...) do { \
         char gpuFrameDbgLine_[512]; \
         snprintf(gpuFrameDbgLine_, sizeof(gpuFrameDbgLine_), "[GpuFrameOps] " fmt "\n", ##__VA_ARGS__); \
         OutputDebugStringA(gpuFrameDbgLine_); \
         fprintf(stderr, "%s", gpuFrameDbgLine_); \
     } while(0)
#  else
#    define GPU_FRAME_DBG(fmt, ...) \
         fprintf(stderr, "[GpuFrameOps] " fmt "\n", ##__VA_ARGS__)
#  endif
#endif
|
2026-03-28 16:54:11 +11:00
|
|
|
|
|
|
|
|
namespace anscv_gpu_ops {
|
|
|
|
|
namespace detail {
|
|
|
|
|
|
|
|
|
|
// Snapshot NV12 Y and UV planes from an AVFrame into malloc'd buffers.
|
|
|
|
|
// Returns true on success. Caller owns the output buffers.
|
|
|
|
|
inline bool snapshotNV12Planes(const AVFrame* nv12,
|
|
|
|
|
uint8_t*& outY, int& outYLinesize,
|
|
|
|
|
uint8_t*& outUV, int& outUVLinesize,
|
|
|
|
|
int& outWidth, int& outHeight) {
|
|
|
|
|
if (!nv12 || !nv12->data[0] || !nv12->data[1])
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
outWidth = nv12->width;
|
|
|
|
|
outHeight = nv12->height;
|
|
|
|
|
outYLinesize = nv12->width; // Packed (no alignment padding)
|
|
|
|
|
outUVLinesize = nv12->width; // UV interleaved: width bytes per row
|
|
|
|
|
|
|
|
|
|
size_t yBytes = static_cast<size_t>(outYLinesize) * outHeight;
|
|
|
|
|
size_t uvBytes = static_cast<size_t>(outUVLinesize) * (outHeight / 2);
|
|
|
|
|
|
|
|
|
|
outY = static_cast<uint8_t*>(std::malloc(yBytes));
|
|
|
|
|
outUV = static_cast<uint8_t*>(std::malloc(uvBytes));
|
|
|
|
|
|
|
|
|
|
if (!outY || !outUV) {
|
|
|
|
|
std::free(outY);
|
|
|
|
|
std::free(outUV);
|
|
|
|
|
outY = nullptr;
|
|
|
|
|
outUV = nullptr;
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Copy line-by-line (source may have padding via linesize > width)
|
|
|
|
|
const int srcYLinesize = nv12->linesize[0];
|
|
|
|
|
const int srcUVLinesize = nv12->linesize[1];
|
|
|
|
|
|
|
|
|
|
for (int row = 0; row < outHeight; ++row) {
|
|
|
|
|
std::memcpy(outY + row * outYLinesize,
|
|
|
|
|
nv12->data[0] + row * srcYLinesize,
|
|
|
|
|
outWidth);
|
|
|
|
|
}
|
|
|
|
|
for (int row = 0; row < outHeight / 2; ++row) {
|
|
|
|
|
std::memcpy(outUV + row * outUVLinesize,
|
|
|
|
|
nv12->data[1] + row * srcUVLinesize,
|
|
|
|
|
outWidth);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
2026-04-03 14:51:52 +11:00
|
|
|
// Drain pending GPU device pointers and cudaFree them.
|
|
|
|
|
// Uses time-based safety: only frees entries queued >100ms ago, guaranteeing
|
|
|
|
|
// all CUDA kernels reading from them have completed (kernels take <10ms).
|
|
|
|
|
// NO cudaDeviceSynchronize — zero blocking of GPU pipeline.
|
|
|
|
|
//
|
|
|
|
|
// If forceAll=true, drains ALL entries with cudaDeviceSynchronize first
|
|
|
|
|
// (used only by Destroy/Reconnect for final cleanup).
|
|
|
|
|
inline void drainAndFreeGpuPending(bool forceAll = false) {
|
|
|
|
|
static constexpr int SAFE_AGE_MS = 100; // 100ms >> 10ms kernel duration
|
|
|
|
|
auto gpuPending = ANSGpuFrameRegistry::instance().drain_gpu_pending(
|
|
|
|
|
forceAll ? 0 : SAFE_AGE_MS);
|
2026-04-02 22:07:27 +11:00
|
|
|
if (gpuPending.empty()) return;
|
2026-04-03 14:51:52 +11:00
|
|
|
GPU_FRAME_DBG("drainGpuPending: freeing %zu GPU ptrs (force=%d)", gpuPending.size(), (int)forceAll);
|
2026-04-02 22:07:27 +11:00
|
|
|
int prevDev = -1;
|
|
|
|
|
cudaGetDevice(&prevDev);
|
2026-04-03 14:51:52 +11:00
|
|
|
if (forceAll) {
|
|
|
|
|
// Final cleanup — sync all devices first
|
|
|
|
|
cudaDeviceSynchronize();
|
|
|
|
|
}
|
2026-04-02 22:07:27 +11:00
|
|
|
for (auto& entry : gpuPending) {
|
|
|
|
|
if (entry.ptr) {
|
|
|
|
|
if (entry.deviceIdx >= 0)
|
|
|
|
|
cudaSetDevice(entry.deviceIdx);
|
|
|
|
|
cudaError_t err = cudaFree(entry.ptr);
|
|
|
|
|
if (err != cudaSuccess) {
|
|
|
|
|
GPU_FRAME_DBG("drainGpuPending: cudaFree FAILED err=%d (%s)",
|
|
|
|
|
(int)err, cudaGetErrorString(err));
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
if (prevDev >= 0)
|
|
|
|
|
cudaSetDevice(prevDev);
|
|
|
|
|
}
|
|
|
|
|
|
2026-03-28 16:54:11 +11:00
|
|
|
} // namespace detail
|
|
|
|
|
} // namespace anscv_gpu_ops
|
|
|
|
|
|
|
|
|
|
// Attach NV12/YUV frame keyed by cv::Mat* pointer.
|
|
|
|
|
// Snapshots CPU NV12 planes into owned malloc'd buffers, then releases the AVFrame.
|
|
|
|
|
// TAKES OWNERSHIP of nv12 — caller must NOT av_frame_free after this call.
|
|
|
|
|
inline void gpu_frame_attach(cv::Mat* mat, AVFrame* nv12, int gpuIdx, int64_t pts) {
|
|
|
|
|
if (!mat || !nv12) return;
|
|
|
|
|
|
|
|
|
|
GpuFrameData data{};
|
|
|
|
|
data.gpuIndex = gpuIdx;
|
|
|
|
|
data.pts = pts;
|
|
|
|
|
data.pixelFormat = nv12->format;
|
|
|
|
|
data.width = nv12->width;
|
|
|
|
|
data.height = nv12->height;
|
|
|
|
|
|
|
|
|
|
// Snapshot NV12 planes to owned buffers
|
|
|
|
|
bool ok = anscv_gpu_ops::detail::snapshotNV12Planes(
|
|
|
|
|
nv12,
|
|
|
|
|
data.cpuYPlane, data.cpuYLinesize,
|
|
|
|
|
data.cpuUvPlane, data.cpuUvLinesize,
|
|
|
|
|
data.width, data.height);
|
|
|
|
|
|
|
|
|
|
// Keep legacy pointers for backward compat during transition
|
|
|
|
|
data.yPlane = data.cpuYPlane;
|
|
|
|
|
data.uvPlane = data.cpuUvPlane;
|
|
|
|
|
data.yLinesize = data.cpuYLinesize;
|
|
|
|
|
data.uvLinesize = data.cpuUvLinesize;
|
|
|
|
|
|
|
|
|
|
// Store AVFrame for legacy cleanup (will be freed by drain_pending)
|
|
|
|
|
data.avframe = nv12;
|
|
|
|
|
|
|
|
|
|
void* old = ANSGpuFrameRegistry::instance().attach(mat, std::move(data));
|
|
|
|
|
if (old) {
|
|
|
|
|
AVFrame* oldFrame = static_cast<AVFrame*>(old);
|
|
|
|
|
av_frame_free(&oldFrame);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Free stale entries evicted by TTL or previous attach
|
|
|
|
|
auto pending = ANSGpuFrameRegistry::instance().drain_pending();
|
|
|
|
|
for (void* p : pending) {
|
|
|
|
|
AVFrame* stale = static_cast<AVFrame*>(p);
|
|
|
|
|
av_frame_free(&stale);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2026-04-02 22:07:27 +11:00
|
|
|
// Attach CUDA HW frame — copies NV12 from NVDEC surfaces to owned GPU memory.
|
2026-03-28 16:54:11 +11:00
|
|
|
// TAKES OWNERSHIP of cudaFrame AND cpuNV12 — caller must NOT av_frame_free after.
|
|
|
|
|
//
|
2026-04-03 14:51:52 +11:00
|
|
|
// D2D copy: SYNCHRONOUS cudaMemcpy2D from NVDEC surfaces into a GpuNV12Slot
|
|
|
|
|
// buffer from the global pool. Data is valid immediately after the call returns.
|
|
|
|
|
// AVFrame is freed immediately (NVDEC surface returned to decoder pool).
|
2026-04-02 22:07:27 +11:00
|
|
|
//
|
2026-04-03 14:51:52 +11:00
|
|
|
// The slot is protected by a 200ms cooldown after the GpuFrameData's refcount
|
|
|
|
|
// drops to 0, guaranteeing that all in-flight GPU kernels (which complete in
|
|
|
|
|
// <10ms) have finished reading from the buffer before it can be reused.
|
2026-04-02 22:07:27 +11:00
|
|
|
//
|
2026-04-03 14:51:52 +11:00
|
|
|
// slot: pre-acquired from GpuNV12SlotPool::instance().acquire().
|
|
|
|
|
// If non-null, D2D copy goes into slot buffers (no per-frame alloc).
|
|
|
|
|
// If nullptr, falls back to per-frame cudaMallocPitch (legacy path).
|
2026-03-28 16:54:11 +11:00
|
|
|
//
|
|
|
|
|
// Fallback: cpuYPlane/cpuUvPlane hold CPU-side NV12 snapshot for cross-GPU
|
2026-04-02 22:07:27 +11:00
|
|
|
// inference (when decode GPU != inference GPU).
|
2026-03-28 16:54:11 +11:00
|
|
|
inline void gpu_frame_attach_cuda(cv::Mat* mat, AVFrame* cudaFrame, int gpuIdx, int64_t pts,
|
2026-04-03 14:51:52 +11:00
|
|
|
AVFrame* cpuNV12 = nullptr,
|
|
|
|
|
GpuNV12Slot* slot = nullptr) {
|
2026-04-02 22:07:27 +11:00
|
|
|
if (!mat || !cudaFrame) {
|
|
|
|
|
GPU_FRAME_DBG("attach_cuda: SKIP mat=%p cudaFrame=%p", (void*)mat, (void*)cudaFrame);
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
const int w = cudaFrame->width;
|
|
|
|
|
const int h = cudaFrame->height;
|
2026-04-03 14:51:52 +11:00
|
|
|
GPU_FRAME_DBG("attach_cuda: START mat=%p %dx%d gpu=%d nvdecY=%p nvdecUV=%p slot=%p",
|
2026-04-02 22:07:27 +11:00
|
|
|
(void*)mat, w, h, gpuIdx,
|
2026-04-03 14:51:52 +11:00
|
|
|
(void*)cudaFrame->data[0], (void*)cudaFrame->data[1], (void*)slot);
|
2026-03-28 16:54:11 +11:00
|
|
|
|
|
|
|
|
GpuFrameData data{};
|
|
|
|
|
data.gpuIndex = gpuIdx;
|
|
|
|
|
data.pts = pts;
|
2026-04-02 22:07:27 +11:00
|
|
|
data.width = w;
|
|
|
|
|
data.height = h;
|
|
|
|
|
data.pixelFormat = 23; // AV_PIX_FMT_NV12
|
|
|
|
|
|
2026-04-03 14:51:52 +11:00
|
|
|
// NOTE: CPU NV12 snapshot is DEFERRED — only taken if pool D2D fails.
|
|
|
|
|
// For 4K frames, the snapshot is ~12MB malloc+memcpy+free per frame.
|
|
|
|
|
// Skipping it when the pool path succeeds (the common case) eliminates
|
|
|
|
|
// ~276MB/s of CPU heap allocation churn that causes process-level stalls.
|
2026-04-02 22:07:27 +11:00
|
|
|
|
2026-04-03 14:51:52 +11:00
|
|
|
// --- D2D copy: NVDEC surface → GPU buffer ---
|
2026-04-02 22:07:27 +11:00
|
|
|
bool d2dOk = false;
|
2026-04-03 14:51:52 +11:00
|
|
|
|
|
|
|
|
if (slot && slot->bufY && slot->bufUV && slot->pitchY > 0 && slot->pitchUV > 0) {
|
|
|
|
|
// --- Global pool path: D2D copy on per-slot non-blocking stream ---
|
|
|
|
|
// CRITICAL: Using the NULL stream (cudaMemcpy2D without stream) causes
|
|
|
|
|
// 1-2 second stalls on WDDM because it implicitly synchronizes with
|
|
|
|
|
// ALL other streams before executing. By using cudaMemcpy2DAsync on
|
|
|
|
|
// the slot's own non-blocking stream + cudaStreamSynchronize, we:
|
|
|
|
|
// 1. Submit the copy immediately (no wait for inference kernels)
|
|
|
|
|
// 2. Wait ONLY for this copy to finish (~0.3ms 1080p, ~1.2ms 4K)
|
|
|
|
|
// 3. Data is valid after sync — av_frame_free is safe
|
2026-04-02 22:07:27 +11:00
|
|
|
int prevDev = -1;
|
|
|
|
|
cudaGetDevice(&prevDev);
|
2026-04-03 14:51:52 +11:00
|
|
|
if (gpuIdx >= 0) cudaSetDevice(gpuIdx);
|
|
|
|
|
|
|
|
|
|
cudaStream_t copyStream = static_cast<cudaStream_t>(slot->copyStream);
|
|
|
|
|
cudaError_t e3, e4;
|
|
|
|
|
|
|
|
|
|
if (copyStream) {
|
|
|
|
|
e3 = cudaMemcpy2DAsync(slot->bufY, slot->pitchY,
|
|
|
|
|
cudaFrame->data[0], cudaFrame->linesize[0],
|
|
|
|
|
w, h, cudaMemcpyDeviceToDevice, copyStream);
|
|
|
|
|
e4 = cudaMemcpy2DAsync(slot->bufUV, slot->pitchUV,
|
|
|
|
|
cudaFrame->data[1], cudaFrame->linesize[1],
|
|
|
|
|
w, h / 2, cudaMemcpyDeviceToDevice, copyStream);
|
2026-04-02 22:07:27 +11:00
|
|
|
if (e3 == cudaSuccess && e4 == cudaSuccess) {
|
2026-04-03 14:51:52 +11:00
|
|
|
// Wait ONLY for this stream's 2 copies (~0.3-1.2ms).
|
|
|
|
|
// Does NOT wait for inference kernels on other streams.
|
|
|
|
|
cudaStreamSynchronize(copyStream);
|
2026-04-02 22:07:27 +11:00
|
|
|
}
|
|
|
|
|
} else {
|
2026-04-03 14:51:52 +11:00
|
|
|
// Fallback if stream creation failed — NULL stream (may stall)
|
|
|
|
|
e3 = cudaMemcpy2D(slot->bufY, slot->pitchY,
|
|
|
|
|
cudaFrame->data[0], cudaFrame->linesize[0],
|
|
|
|
|
w, h, cudaMemcpyDeviceToDevice);
|
|
|
|
|
e4 = cudaMemcpy2D(slot->bufUV, slot->pitchUV,
|
|
|
|
|
cudaFrame->data[1], cudaFrame->linesize[1],
|
|
|
|
|
w, h / 2, cudaMemcpyDeviceToDevice);
|
2026-04-02 22:07:27 +11:00
|
|
|
}
|
|
|
|
|
|
2026-04-03 14:51:52 +11:00
|
|
|
if (prevDev >= 0) cudaSetDevice(prevDev);
|
|
|
|
|
|
|
|
|
|
if (e3 == cudaSuccess && e4 == cudaSuccess) {
|
|
|
|
|
data.isCudaDevicePtr = true;
|
|
|
|
|
data.yPlane = static_cast<uint8_t*>(slot->bufY);
|
|
|
|
|
data.uvPlane = static_cast<uint8_t*>(slot->bufUV);
|
|
|
|
|
data.yLinesize = static_cast<int>(slot->pitchY);
|
|
|
|
|
data.uvLinesize = static_cast<int>(slot->pitchUV);
|
|
|
|
|
data.poolSlot = slot; // Track for deferred release
|
|
|
|
|
// gpuCacheY/UV stay nullptr — global pool owns the buffers
|
|
|
|
|
d2dOk = true;
|
|
|
|
|
GPU_FRAME_DBG("attach_cuda: D2D OK (global pool) Y=%p UV=%p yPitch=%zu uvPitch=%zu",
|
|
|
|
|
slot->bufY, slot->bufUV, slot->pitchY, slot->pitchUV);
|
|
|
|
|
} else {
|
|
|
|
|
GPU_FRAME_DBG("attach_cuda: D2D COPY FAILED (pool) e3=%d e4=%d — fallback",
|
|
|
|
|
(int)e3, (int)e4);
|
|
|
|
|
// Release slot back to pool on failure (immediate, no cooldown needed)
|
|
|
|
|
slot->state.store(GpuNV12Slot::STATE_FREE, std::memory_order_release);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (!d2dOk && !slot) {
|
|
|
|
|
// --- Legacy path: per-frame cudaMallocPitch (for modules without pool) ---
|
|
|
|
|
const size_t yBytes = static_cast<size_t>(w) * h;
|
|
|
|
|
const size_t uvBytes = static_cast<size_t>(w) * (h / 2);
|
|
|
|
|
const size_t totalBytes = yBytes + uvBytes;
|
|
|
|
|
|
|
|
|
|
if (ANSGpuFrameRegistry::instance().canAllocateGpuCache(totalBytes)) {
|
|
|
|
|
int prevDev = -1;
|
|
|
|
|
cudaGetDevice(&prevDev);
|
|
|
|
|
if (gpuIdx >= 0) cudaSetDevice(gpuIdx);
|
|
|
|
|
|
|
|
|
|
void* ownedY = nullptr;
|
|
|
|
|
void* ownedUV = nullptr;
|
|
|
|
|
size_t yPitch = 0;
|
|
|
|
|
size_t uvPitch = 0;
|
|
|
|
|
|
|
|
|
|
cudaError_t e1 = cudaMallocPitch(&ownedY, &yPitch, w, h);
|
|
|
|
|
cudaError_t e2 = cudaMallocPitch(&ownedUV, &uvPitch, w, h / 2);
|
|
|
|
|
|
|
|
|
|
if (e1 == cudaSuccess && e2 == cudaSuccess) {
|
|
|
|
|
cudaError_t e3 = cudaMemcpy2D(ownedY, yPitch,
|
|
|
|
|
cudaFrame->data[0], cudaFrame->linesize[0],
|
|
|
|
|
w, h, cudaMemcpyDeviceToDevice);
|
|
|
|
|
cudaError_t e4 = cudaMemcpy2D(ownedUV, uvPitch,
|
|
|
|
|
cudaFrame->data[1], cudaFrame->linesize[1],
|
|
|
|
|
w, h / 2, cudaMemcpyDeviceToDevice);
|
|
|
|
|
|
|
|
|
|
if (e3 == cudaSuccess && e4 == cudaSuccess) {
|
|
|
|
|
data.isCudaDevicePtr = true;
|
|
|
|
|
data.yPlane = static_cast<uint8_t*>(ownedY);
|
|
|
|
|
data.uvPlane = static_cast<uint8_t*>(ownedUV);
|
|
|
|
|
data.yLinesize = static_cast<int>(yPitch);
|
|
|
|
|
data.uvLinesize = static_cast<int>(uvPitch);
|
|
|
|
|
data.gpuCacheY = ownedY;
|
|
|
|
|
data.gpuCacheUV = ownedUV;
|
|
|
|
|
data.gpuCacheYPitch = yPitch;
|
|
|
|
|
data.gpuCacheUVPitch = uvPitch;
|
|
|
|
|
data.gpuCacheDeviceIdx = gpuIdx;
|
|
|
|
|
data.gpuCacheValid = true;
|
|
|
|
|
data.gpuCacheBytes = yPitch * h + uvPitch * (h / 2);
|
|
|
|
|
ANSGpuFrameRegistry::instance().onGpuCacheCreated(data.gpuCacheBytes);
|
|
|
|
|
d2dOk = true;
|
|
|
|
|
GPU_FRAME_DBG("attach_cuda: D2D OK ownedY=%p ownedUV=%p yPitch=%zu uvPitch=%zu bytes=%zu",
|
|
|
|
|
ownedY, ownedUV, yPitch, uvPitch, data.gpuCacheBytes);
|
|
|
|
|
} else {
|
|
|
|
|
GPU_FRAME_DBG("attach_cuda: D2D COPY FAILED e3=%d e4=%d — fallback CPU",
|
|
|
|
|
(int)e3, (int)e4);
|
|
|
|
|
cudaFree(ownedY);
|
|
|
|
|
cudaFree(ownedUV);
|
|
|
|
|
}
|
|
|
|
|
} else {
|
|
|
|
|
GPU_FRAME_DBG("attach_cuda: cudaMallocPitch FAILED e1=%d e2=%d — fallback CPU",
|
|
|
|
|
(int)e1, (int)e2);
|
|
|
|
|
if (e1 == cudaSuccess) cudaFree(ownedY);
|
|
|
|
|
if (e2 == cudaSuccess) cudaFree(ownedUV);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (prevDev >= 0) cudaSetDevice(prevDev);
|
|
|
|
|
}
|
2026-04-02 22:07:27 +11:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (!d2dOk) {
|
2026-04-03 14:51:52 +11:00
|
|
|
// D2D failed or no slot — take CPU NV12 snapshot now (before freeing cpuNV12).
|
|
|
|
|
// This is the ONLY path where the CPU snapshot is needed. Skipping it
|
|
|
|
|
// on the pool-success path avoids ~12MB malloc+memcpy+free per 4K frame.
|
|
|
|
|
if (cpuNV12) {
|
|
|
|
|
anscv_gpu_ops::detail::snapshotNV12Planes(
|
|
|
|
|
cpuNV12,
|
|
|
|
|
data.cpuYPlane, data.cpuYLinesize,
|
|
|
|
|
data.cpuUvPlane, data.cpuUvLinesize,
|
|
|
|
|
data.width, data.height);
|
|
|
|
|
}
|
2026-04-02 22:07:27 +11:00
|
|
|
GPU_FRAME_DBG("attach_cuda: FALLBACK CPU-only cpuY=%p cpuUV=%p",
|
|
|
|
|
(void*)data.cpuYPlane, (void*)data.cpuUvPlane);
|
|
|
|
|
data.isCudaDevicePtr = false;
|
|
|
|
|
data.yPlane = data.cpuYPlane;
|
|
|
|
|
data.uvPlane = data.cpuUvPlane;
|
|
|
|
|
data.yLinesize = data.cpuYLinesize;
|
|
|
|
|
data.uvLinesize = data.cpuUvLinesize;
|
|
|
|
|
}
|
|
|
|
|
|
2026-04-03 14:51:52 +11:00
|
|
|
// Free AVFrames immediately — synchronous D2D copy has completed,
|
|
|
|
|
// so NVDEC surfaces can be returned to the decoder's surface pool.
|
2026-04-02 22:07:27 +11:00
|
|
|
GPU_FRAME_DBG("attach_cuda: freeing AVFrames cudaFrame=%p cpuNV12=%p",
|
|
|
|
|
(void*)cudaFrame, (void*)cpuNV12);
|
|
|
|
|
av_frame_free(&cudaFrame);
|
|
|
|
|
if (cpuNV12) av_frame_free(&cpuNV12);
|
|
|
|
|
data.avframe = nullptr;
|
|
|
|
|
data.cpuAvframe = nullptr;
|
|
|
|
|
|
2026-04-03 14:51:52 +11:00
|
|
|
GPU_FRAME_DBG("attach_cuda: FINAL yPlane=%p uvPlane=%p isCuda=%d poolSlot=%p",
|
2026-04-02 22:07:27 +11:00
|
|
|
(void*)data.yPlane, (void*)data.uvPlane, (int)data.isCudaDevicePtr,
|
2026-04-03 14:51:52 +11:00
|
|
|
(void*)data.poolSlot);
|
2026-03-28 16:54:11 +11:00
|
|
|
|
|
|
|
|
void* old = ANSGpuFrameRegistry::instance().attach(mat, std::move(data));
|
|
|
|
|
if (old) {
|
|
|
|
|
AVFrame* oldFrame = static_cast<AVFrame*>(old);
|
|
|
|
|
av_frame_free(&oldFrame);
|
|
|
|
|
}
|
|
|
|
|
|
2026-04-02 22:07:27 +11:00
|
|
|
// Free stale AVFrames evicted by TTL or previous attach
|
2026-03-28 16:54:11 +11:00
|
|
|
auto pending = ANSGpuFrameRegistry::instance().drain_pending();
|
|
|
|
|
for (void* p : pending) {
|
|
|
|
|
AVFrame* stale = static_cast<AVFrame*>(p);
|
|
|
|
|
av_frame_free(&stale);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2026-04-03 14:51:52 +11:00
|
|
|
// Release entry by cv::Mat* and free any returned AVFrames.
|
|
|
|
|
// GPU device pointers are deferred to TTL eviction or explicit cleanup.
|
2026-04-02 22:07:27 +11:00
|
|
|
// Safe if not in map (no-op).
|
2026-03-28 16:54:11 +11:00
|
|
|
inline void gpu_frame_remove(cv::Mat* mat) {
|
|
|
|
|
if (!mat) return;
|
|
|
|
|
|
2026-04-02 22:07:27 +11:00
|
|
|
GPU_FRAME_DBG("gpu_frame_remove: mat=%p", (void*)mat);
|
2026-03-28 16:54:11 +11:00
|
|
|
ANSGpuFrameRegistry::instance().release(mat);
|
|
|
|
|
|
|
|
|
|
// Free any AVFrames that became pending from this release or prior eviction
|
|
|
|
|
auto pending = ANSGpuFrameRegistry::instance().drain_pending();
|
|
|
|
|
for (void* p : pending) {
|
|
|
|
|
AVFrame* stale = static_cast<AVFrame*>(p);
|
|
|
|
|
av_frame_free(&stale);
|
|
|
|
|
}
|
|
|
|
|
|
2026-04-03 14:51:52 +11:00
|
|
|
// GPU device pointers deferred — see gpu_frame_evict_stale() / Destroy()
|
2026-03-28 16:54:11 +11:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Alias for remove — used in ANSCV mutating functions to drop stale GPU data.
|
|
|
|
|
inline void gpu_frame_invalidate(cv::Mat* mat) {
|
|
|
|
|
gpu_frame_remove(mat);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Run TTL eviction + drain pending. Call periodically from camera threads.
|
2026-04-03 14:51:52 +11:00
|
|
|
// TTL eviction is throttled to every 500ms (EVICT_CHECK_INTERVAL_MS).
|
|
|
|
|
// GPU buffer cleanup is safe here because:
|
|
|
|
|
// 1. Only frames >3 seconds old are evicted (kernels take <10ms)
|
|
|
|
|
// 2. cudaDeviceSynchronize() ensures all in-flight kernels are done
|
|
|
|
|
// 3. At 500ms interval, one sync per 500ms is ~0.1ms cost (acceptable)
|
|
|
|
|
// vs per-frame sync which caused 900ms spikes
|
2026-03-28 16:54:11 +11:00
|
|
|
inline void gpu_frame_evict_stale() {
|
|
|
|
|
ANSGpuFrameRegistry::instance().evictStaleFrames();
|
|
|
|
|
|
|
|
|
|
auto pending = ANSGpuFrameRegistry::instance().drain_pending();
|
|
|
|
|
for (void* p : pending) {
|
|
|
|
|
AVFrame* stale = static_cast<AVFrame*>(p);
|
|
|
|
|
av_frame_free(&stale);
|
|
|
|
|
}
|
2026-04-02 22:07:27 +11:00
|
|
|
|
2026-04-03 14:51:52 +11:00
|
|
|
// Free GPU device pointers from evicted/released frames (legacy path).
|
|
|
|
|
// Pool-backed frames (ANSRTSP) don't add to this list (gpuCacheY=nullptr).
|
2026-04-02 22:07:27 +11:00
|
|
|
anscv_gpu_ops::detail::drainAndFreeGpuPending();
|
2026-03-28 16:54:11 +11:00
|
|
|
}
|