Fix NV12 crash when the camera object is recreated
This commit is contained in:
@@ -19,8 +19,31 @@ extern "C" {
|
||||
#include "libavutil/frame.h"
|
||||
}
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
#include <cstring>
|
||||
#include <cstdlib>
|
||||
#include <cstdio>
|
||||
|
||||
#ifdef _WIN32
|
||||
#include <windows.h>
|
||||
#endif
|
||||
|
||||
// GPU_FRAME_DBG(fmt, ...) -- printf-style trace helper for GPU frame operations.
//
// Each message is prefixed with "[GpuFrameOps] " and terminated with '\n'.
//   * Windows : the line is formatted into a 512-byte stack buffer (longer
//               messages are truncated by snprintf), then handed to
//               OutputDebugStringA (readable in Sysinternals DebugView /
//               the VS debugger, including after a crash) and echoed to stderr.
//   * Others  : the line is written to stderr only.
//
// The #if guard lets a definition supplied earlier take precedence.
#if !defined(GPU_FRAME_DBG)
#  if defined(_WIN32)
#    define GPU_FRAME_DBG(fmt, ...)                                             \
        do {                                                                    \
            char _gpu_dbg_buf[512];                                             \
            snprintf(_gpu_dbg_buf, sizeof(_gpu_dbg_buf),                        \
                     "[GpuFrameOps] " fmt "\n", ##__VA_ARGS__);                 \
            OutputDebugStringA(_gpu_dbg_buf);                                   \
            fputs(_gpu_dbg_buf, stderr);                                        \
        } while (0)
#  else
#    define GPU_FRAME_DBG(fmt, ...)                                             \
        fprintf(stderr, "[GpuFrameOps] " fmt "\n", ##__VA_ARGS__)
#  endif
#endif
|
||||
|
||||
namespace anscv_gpu_ops {
|
||||
namespace detail {
|
||||
@@ -71,6 +94,42 @@ inline bool snapshotNV12Planes(const AVFrame* nv12,
|
||||
return true;
|
||||
}
|
||||
|
||||
// Drain pending GPU device pointers and actually cudaFree them.
|
||||
// Must be called from a thread with CUDA context available.
|
||||
inline void drainAndFreeGpuPending() {
|
||||
auto gpuPending = ANSGpuFrameRegistry::instance().drain_gpu_pending();
|
||||
if (gpuPending.empty()) return;
|
||||
GPU_FRAME_DBG("drainGpuPending: freeing %zu GPU ptrs", gpuPending.size());
|
||||
int prevDev = -1;
|
||||
cudaGetDevice(&prevDev);
|
||||
|
||||
// Group by device to minimize cudaSetDevice calls and synchronize once per device.
|
||||
// cudaDeviceSynchronize() is CRITICAL: NV12 kernels run on cv::cuda::Stream
|
||||
// (not the default stream). cudaFree on stream 0 doesn't wait for other
|
||||
// streams, so without this sync, cudaFree can free a buffer while a kernel
|
||||
// on another stream is still reading from it → cudaErrorIllegalAddress (700)
|
||||
// which permanently corrupts the CUDA context.
|
||||
int lastSyncDev = -1;
|
||||
for (auto& entry : gpuPending) {
|
||||
if (entry.ptr) {
|
||||
if (entry.deviceIdx >= 0)
|
||||
cudaSetDevice(entry.deviceIdx);
|
||||
if (entry.deviceIdx != lastSyncDev) {
|
||||
cudaDeviceSynchronize();
|
||||
lastSyncDev = entry.deviceIdx;
|
||||
}
|
||||
GPU_FRAME_DBG("drainGpuPending: cudaFree(%p) dev=%d", entry.ptr, entry.deviceIdx);
|
||||
cudaError_t err = cudaFree(entry.ptr);
|
||||
if (err != cudaSuccess) {
|
||||
GPU_FRAME_DBG("drainGpuPending: cudaFree FAILED err=%d (%s)",
|
||||
(int)err, cudaGetErrorString(err));
|
||||
}
|
||||
}
|
||||
}
|
||||
if (prevDev >= 0)
|
||||
cudaSetDevice(prevDev);
|
||||
}
|
||||
|
||||
} // namespace detail
|
||||
} // namespace anscv_gpu_ops
|
||||
|
||||
@@ -117,36 +176,44 @@ inline void gpu_frame_attach(cv::Mat* mat, AVFrame* nv12, int gpuIdx, int64_t pt
|
||||
}
|
||||
}
|
||||
|
||||
// Attach CUDA HW frame — keeps CUDA device pointers for zero-copy inference.
|
||||
// Attach CUDA HW frame — copies NV12 from NVDEC surfaces to owned GPU memory.
|
||||
// TAKES OWNERSHIP of cudaFrame AND cpuNV12 — caller must NOT av_frame_free after.
|
||||
//
|
||||
// Primary path: yPlane/uvPlane point to CUDA device pointers from the cloned
|
||||
// AVFrame (data[0]/data[1]). The cloned AVFrame keeps the NVDEC surface alive
|
||||
// until gpu_frame_remove() is called after inference. With 4 cameras each
|
||||
// holding ~1 surface, this uses 4 of NVDEC's 25-32 surface pool — safe.
|
||||
// D2D copy path: cudaMemcpy2D from NVDEC surfaces to cudaMalloc'd buffers on the
|
||||
// same GPU. This decouples the NV12 data lifetime from the NVDEC decoder, so
|
||||
// player->close() can safely destroy the decoder at any time without invalidating
|
||||
// pointers that inference engines may be reading. The NVDEC surface is freed
|
||||
// immediately (av_frame_free), returning it to the decoder's surface pool.
|
||||
//
|
||||
// The owned GPU pointers are stored as both yPlane/uvPlane (for zero-copy reads)
|
||||
// and gpuCacheY/gpuCacheUV (for lifecycle management / cudaFree on cleanup).
|
||||
//
|
||||
// VRAM budget: if the global GPU cache budget is exceeded, falls back to CPU-only
|
||||
// NV12 snapshot (no zero-copy, but safe).
|
||||
//
|
||||
// Fallback: cpuYPlane/cpuUvPlane hold CPU-side NV12 snapshot for cross-GPU
|
||||
// inference (when decode GPU != inference GPU, CUDA device ptrs aren't
|
||||
// accessible from another GPU context).
|
||||
// inference (when decode GPU != inference GPU).
|
||||
inline void gpu_frame_attach_cuda(cv::Mat* mat, AVFrame* cudaFrame, int gpuIdx, int64_t pts,
|
||||
AVFrame* cpuNV12 = nullptr) {
|
||||
if (!mat || !cudaFrame) return;
|
||||
if (!mat || !cudaFrame) {
|
||||
GPU_FRAME_DBG("attach_cuda: SKIP mat=%p cudaFrame=%p", (void*)mat, (void*)cudaFrame);
|
||||
return;
|
||||
}
|
||||
|
||||
const int w = cudaFrame->width;
|
||||
const int h = cudaFrame->height;
|
||||
GPU_FRAME_DBG("attach_cuda: START mat=%p %dx%d gpu=%d nvdecY=%p nvdecUV=%p cpuNV12=%p",
|
||||
(void*)mat, w, h, gpuIdx,
|
||||
(void*)cudaFrame->data[0], (void*)cudaFrame->data[1], (void*)cpuNV12);
|
||||
|
||||
GpuFrameData data{};
|
||||
data.gpuIndex = gpuIdx;
|
||||
data.pts = pts;
|
||||
data.width = cudaFrame->width;
|
||||
data.height = cudaFrame->height;
|
||||
data.pixelFormat = 23; // AV_PIX_FMT_NV12 — the underlying sw_format
|
||||
data.width = w;
|
||||
data.height = h;
|
||||
data.pixelFormat = 23; // AV_PIX_FMT_NV12
|
||||
|
||||
// Primary: CUDA device pointers from NVDEC (zero-copy on same GPU)
|
||||
data.isCudaDevicePtr = true;
|
||||
data.yPlane = cudaFrame->data[0]; // CUDA device ptr: Y plane
|
||||
data.uvPlane = cudaFrame->data[1]; // CUDA device ptr: UV plane
|
||||
data.yLinesize = cudaFrame->linesize[0];
|
||||
data.uvLinesize = cudaFrame->linesize[1];
|
||||
|
||||
// Fallback: snapshot CPU NV12 for cross-GPU inference
|
||||
// Snapshot CPU NV12 for cross-GPU fallback (must do before freeing cpuNV12)
|
||||
if (cpuNV12) {
|
||||
anscv_gpu_ops::detail::snapshotNV12Planes(
|
||||
cpuNV12,
|
||||
@@ -155,9 +222,98 @@ inline void gpu_frame_attach_cuda(cv::Mat* mat, AVFrame* cudaFrame, int gpuIdx,
|
||||
data.width, data.height);
|
||||
}
|
||||
|
||||
// Store AVFrames for cleanup (cudaFrame keeps NVDEC surface alive)
|
||||
data.avframe = cudaFrame;
|
||||
data.cpuAvframe = cpuNV12;
|
||||
// --- D2D copy: NVDEC surface → owned GPU memory ---
|
||||
// Estimate VRAM needed for the owned NV12 copy
|
||||
const size_t yBytes = static_cast<size_t>(w) * h;
|
||||
const size_t uvBytes = static_cast<size_t>(w) * (h / 2);
|
||||
const size_t totalBytes = yBytes + uvBytes;
|
||||
|
||||
bool d2dOk = false;
|
||||
if (ANSGpuFrameRegistry::instance().canAllocateGpuCache(totalBytes)) {
|
||||
int prevDev = -1;
|
||||
cudaGetDevice(&prevDev);
|
||||
if (gpuIdx >= 0)
|
||||
cudaSetDevice(gpuIdx);
|
||||
|
||||
void* ownedY = nullptr;
|
||||
void* ownedUV = nullptr;
|
||||
size_t yPitch = 0;
|
||||
size_t uvPitch = 0;
|
||||
|
||||
cudaError_t e1 = cudaMallocPitch(&ownedY, &yPitch, w, h);
|
||||
cudaError_t e2 = cudaMallocPitch(&ownedUV, &uvPitch, w, h / 2);
|
||||
|
||||
if (e1 == cudaSuccess && e2 == cudaSuccess) {
|
||||
cudaError_t e3 = cudaMemcpy2D(ownedY, yPitch,
|
||||
cudaFrame->data[0], cudaFrame->linesize[0],
|
||||
w, h, cudaMemcpyDeviceToDevice);
|
||||
cudaError_t e4 = cudaMemcpy2D(ownedUV, uvPitch,
|
||||
cudaFrame->data[1], cudaFrame->linesize[1],
|
||||
w, h / 2, cudaMemcpyDeviceToDevice);
|
||||
|
||||
if (e3 == cudaSuccess && e4 == cudaSuccess) {
|
||||
// Store owned GPU pointers as primary NV12 source
|
||||
data.isCudaDevicePtr = true;
|
||||
data.yPlane = static_cast<uint8_t*>(ownedY);
|
||||
data.uvPlane = static_cast<uint8_t*>(ownedUV);
|
||||
data.yLinesize = static_cast<int>(yPitch);
|
||||
data.uvLinesize = static_cast<int>(uvPitch);
|
||||
|
||||
// Track in gpuCache for lifecycle management (cudaFree on cleanup)
|
||||
data.gpuCacheY = ownedY;
|
||||
data.gpuCacheUV = ownedUV;
|
||||
data.gpuCacheYPitch = yPitch;
|
||||
data.gpuCacheUVPitch = uvPitch;
|
||||
data.gpuCacheDeviceIdx = gpuIdx;
|
||||
data.gpuCacheValid = true;
|
||||
data.gpuCacheBytes = yPitch * h + uvPitch * (h / 2);
|
||||
|
||||
ANSGpuFrameRegistry::instance().onGpuCacheCreated(data.gpuCacheBytes);
|
||||
d2dOk = true;
|
||||
GPU_FRAME_DBG("attach_cuda: D2D OK ownedY=%p ownedUV=%p yPitch=%zu uvPitch=%zu bytes=%zu",
|
||||
ownedY, ownedUV, yPitch, uvPitch, data.gpuCacheBytes);
|
||||
} else {
|
||||
// D2D copy failed — free allocated memory and fall back
|
||||
GPU_FRAME_DBG("attach_cuda: D2D COPY FAILED e3=%d e4=%d — fallback CPU",
|
||||
(int)e3, (int)e4);
|
||||
cudaFree(ownedY);
|
||||
cudaFree(ownedUV);
|
||||
}
|
||||
} else {
|
||||
// Allocation failed — free any partial allocation and fall back
|
||||
GPU_FRAME_DBG("attach_cuda: cudaMallocPitch FAILED e1=%d e2=%d — fallback CPU",
|
||||
(int)e1, (int)e2);
|
||||
if (e1 == cudaSuccess) cudaFree(ownedY);
|
||||
if (e2 == cudaSuccess) cudaFree(ownedUV);
|
||||
}
|
||||
|
||||
if (prevDev >= 0)
|
||||
cudaSetDevice(prevDev);
|
||||
}
|
||||
|
||||
if (!d2dOk) {
|
||||
// Fall back to CPU NV12 snapshot only (no zero-copy)
|
||||
GPU_FRAME_DBG("attach_cuda: FALLBACK CPU-only cpuY=%p cpuUV=%p",
|
||||
(void*)data.cpuYPlane, (void*)data.cpuUvPlane);
|
||||
data.isCudaDevicePtr = false;
|
||||
data.yPlane = data.cpuYPlane;
|
||||
data.uvPlane = data.cpuUvPlane;
|
||||
data.yLinesize = data.cpuYLinesize;
|
||||
data.uvLinesize = data.cpuUvLinesize;
|
||||
}
|
||||
|
||||
// Release AVFrames immediately — NVDEC surfaces returned to pool.
|
||||
// No longer stored in GpuFrameData (owned GPU copy is independent).
|
||||
GPU_FRAME_DBG("attach_cuda: freeing AVFrames cudaFrame=%p cpuNV12=%p",
|
||||
(void*)cudaFrame, (void*)cpuNV12);
|
||||
av_frame_free(&cudaFrame);
|
||||
if (cpuNV12) av_frame_free(&cpuNV12);
|
||||
data.avframe = nullptr;
|
||||
data.cpuAvframe = nullptr;
|
||||
|
||||
GPU_FRAME_DBG("attach_cuda: FINAL yPlane=%p uvPlane=%p isCuda=%d gpuCacheY=%p gpuCacheUV=%p",
|
||||
(void*)data.yPlane, (void*)data.uvPlane, (int)data.isCudaDevicePtr,
|
||||
data.gpuCacheY, data.gpuCacheUV);
|
||||
|
||||
void* old = ANSGpuFrameRegistry::instance().attach(mat, std::move(data));
|
||||
if (old) {
|
||||
@@ -165,17 +321,23 @@ inline void gpu_frame_attach_cuda(cv::Mat* mat, AVFrame* cudaFrame, int gpuIdx,
|
||||
av_frame_free(&oldFrame);
|
||||
}
|
||||
|
||||
// Free stale AVFrames evicted by TTL or previous attach
|
||||
auto pending = ANSGpuFrameRegistry::instance().drain_pending();
|
||||
for (void* p : pending) {
|
||||
AVFrame* stale = static_cast<AVFrame*>(p);
|
||||
av_frame_free(&stale);
|
||||
}
|
||||
|
||||
// Free stale GPU device pointers
|
||||
anscv_gpu_ops::detail::drainAndFreeGpuPending();
|
||||
}
|
||||
|
||||
// Release entry by cv::Mat* and free any returned AVFrames. Safe if not in map (no-op).
|
||||
// Release entry by cv::Mat* and free any returned AVFrames + GPU pointers.
|
||||
// Safe if not in map (no-op).
|
||||
inline void gpu_frame_remove(cv::Mat* mat) {
|
||||
if (!mat) return;
|
||||
|
||||
GPU_FRAME_DBG("gpu_frame_remove: mat=%p", (void*)mat);
|
||||
ANSGpuFrameRegistry::instance().release(mat);
|
||||
|
||||
// Free any AVFrames that became pending from this release or prior eviction
|
||||
@@ -186,13 +348,7 @@ inline void gpu_frame_remove(cv::Mat* mat) {
|
||||
}
|
||||
|
||||
// Free any GPU device pointers that became pending
|
||||
auto gpuPending = gpu_frame_drain_gpu_pending();
|
||||
// NOTE: cudaFree requires CUDA context — caller must be on a CUDA-capable thread.
|
||||
// If not, these will leak. In practice, gpu_frame_remove is called from ANSCV
|
||||
// camera threads which do have CUDA context.
|
||||
// For safety, we skip cudaFree here and let NV12PreprocessHelper handle it.
|
||||
// The GPU pointers are tracked in the budget and will be accounted for.
|
||||
(void)gpuPending;
|
||||
anscv_gpu_ops::detail::drainAndFreeGpuPending();
|
||||
}
|
||||
|
||||
// Alias for remove — used in ANSCV mutating functions to drop stale GPU data.
|
||||
@@ -209,4 +365,7 @@ inline void gpu_frame_evict_stale() {
|
||||
AVFrame* stale = static_cast<AVFrame*>(p);
|
||||
av_frame_free(&stale);
|
||||
}
|
||||
|
||||
// Free any GPU device pointers from evicted frames
|
||||
anscv_gpu_ops::detail::drainAndFreeGpuPending();
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user