Fix NV12 crash when recreating the camera object
(new structure) does not work
@@ -14,6 +14,7 @@
 // gpu_frame_lookup() + the GpuFrameData plane pointers.
 
 #include "ANSGpuFrameRegistry.h"
+#include "GpuNV12SlotPool.h"
 
 extern "C" {
 #include "libavutil/frame.h"
@@ -29,9 +30,9 @@ extern "C" {
 #endif
 
 // Debug logging macro for GPU frame operations.
-// Output goes to stderr (console) AND OutputDebugString (DebugView / VS debugger).
-// Use Sysinternals DebugView (dbgview64.exe) to capture these after a crash.
+// Define ANSCORE_GPU_DEBUG=1 to enable verbose per-frame GPU logging.
 #ifndef GPU_FRAME_DBG
+#if defined(ANSCORE_GPU_DEBUG) && ANSCORE_GPU_DEBUG
 #ifdef _WIN32
 #define GPU_FRAME_DBG(fmt, ...) do { \
     char _gpu_dbg_buf[512]; \
@@ -43,6 +44,9 @@ extern "C" {
 #define GPU_FRAME_DBG(fmt, ...) \
     fprintf(stderr, "[GpuFrameOps] " fmt "\n", ##__VA_ARGS__)
 #endif
+#else
+#define GPU_FRAME_DBG(fmt, ...) ((void)0)
+#endif
 #endif
 
 namespace anscv_gpu_ops {
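A minimal usage sketch of the macro above (the call site and variables are hypothetical; the build flag is the one this diff introduces):

    // With ANSCORE_GPU_DEBUG undefined (the default), the entire statement
    // compiles to ((void)0); the format string and arguments cost nothing.
    GPU_FRAME_DBG("attach: mat=%p %dx%d gpu=%d", (void*)mat, w, h, gpuIdx);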
@@ -94,31 +98,29 @@ inline bool snapshotNV12Planes(const AVFrame* nv12,
     return true;
 }
 
-// Drain pending GPU device pointers and actually cudaFree them.
-// Must be called from a thread with CUDA context available.
-inline void drainAndFreeGpuPending() {
-    auto gpuPending = ANSGpuFrameRegistry::instance().drain_gpu_pending();
+// Drain pending GPU device pointers and cudaFree them.
+// Uses time-based safety: only frees entries queued >100ms ago, guaranteeing
+// all CUDA kernels reading from them have completed (kernels take <10ms).
+// NO cudaDeviceSynchronize — zero blocking of GPU pipeline.
+//
+// If forceAll=true, drains ALL entries with cudaDeviceSynchronize first
+// (used only by Destroy/Reconnect for final cleanup).
+inline void drainAndFreeGpuPending(bool forceAll = false) {
+    static constexpr int SAFE_AGE_MS = 100; // 100ms >> 10ms kernel duration
+    auto gpuPending = ANSGpuFrameRegistry::instance().drain_gpu_pending(
+        forceAll ? 0 : SAFE_AGE_MS);
     if (gpuPending.empty()) return;
-    GPU_FRAME_DBG("drainGpuPending: freeing %zu GPU ptrs", gpuPending.size());
+    GPU_FRAME_DBG("drainGpuPending: freeing %zu GPU ptrs (force=%d)", gpuPending.size(), (int)forceAll);
     int prevDev = -1;
     cudaGetDevice(&prevDev);
 
-    // Group by device to minimize cudaSetDevice calls and synchronize once per device.
-    // cudaDeviceSynchronize() is CRITICAL: NV12 kernels run on cv::cuda::Stream
-    // (not the default stream). cudaFree on stream 0 doesn't wait for other
-    // streams, so without this sync, cudaFree can free a buffer while a kernel
-    // on another stream is still reading from it → cudaErrorIllegalAddress (700)
-    // which permanently corrupts the CUDA context.
-    int lastSyncDev = -1;
+    if (forceAll) {
+        // Final cleanup — sync all devices first
+        cudaDeviceSynchronize();
+    }
     for (auto& entry : gpuPending) {
         if (entry.ptr) {
             if (entry.deviceIdx >= 0)
                 cudaSetDevice(entry.deviceIdx);
-            if (entry.deviceIdx != lastSyncDev) {
-                cudaDeviceSynchronize();
-                lastSyncDev = entry.deviceIdx;
-            }
             GPU_FRAME_DBG("drainGpuPending: cudaFree(%p) dev=%d", entry.ptr, entry.deviceIdx);
             cudaError_t err = cudaFree(entry.ptr);
             if (err != cudaSuccess) {
                 GPU_FRAME_DBG("drainGpuPending: cudaFree FAILED err=%d (%s)",
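The drain_gpu_pending(minAgeMs) overload called above is not shown in this commit; a self-contained sketch of the age-gated free list it implies (type and member names are assumptions):

    #include <chrono>
    #include <mutex>
    #include <vector>

    struct PendingGpuPtr {
        void* ptr = nullptr;
        int deviceIdx = -1;
        std::chrono::steady_clock::time_point queuedAt;
    };

    class PendingGpuList {
    public:
        void push(void* p, int dev) {
            std::lock_guard<std::mutex> g(m_);
            items_.push_back({p, dev, std::chrono::steady_clock::now()});
        }
        // Returns entries queued at least minAgeMs ago; minAgeMs == 0 drains all.
        std::vector<PendingGpuPtr> drain(int minAgeMs) {
            std::lock_guard<std::mutex> g(m_);
            const auto cutoff = std::chrono::steady_clock::now()
                              - std::chrono::milliseconds(minAgeMs);
            std::vector<PendingGpuPtr> out, keep;
            for (const auto& e : items_)
                (e.queuedAt <= cutoff ? out : keep).push_back(e);
            items_.swap(keep);
            return out;
        }
    private:
        std::mutex m_;
        std::vector<PendingGpuPtr> items_;
    };

With a 100ms threshold and kernels that finish in under 10ms, any pointer old enough to be drained can no longer be referenced by a running kernel, which is what lets the hot path skip cudaDeviceSynchronize entirely.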
@@ -179,22 +181,23 @@ inline void gpu_frame_attach(cv::Mat* mat, AVFrame* nv12, int gpuIdx, int64_t pt
 // Attach CUDA HW frame — copies NV12 from NVDEC surfaces to owned GPU memory.
 // TAKES OWNERSHIP of cudaFrame AND cpuNV12 — caller must NOT av_frame_free after.
 //
-// D2D copy path: cudaMemcpy2D from NVDEC surfaces to cudaMalloc'd buffers on the
-// same GPU. This decouples the NV12 data lifetime from the NVDEC decoder, so
-// player->close() can safely destroy the decoder at any time without invalidating
-// pointers that inference engines may be reading. The NVDEC surface is freed
-// immediately (av_frame_free), returning it to the decoder's surface pool.
+// D2D copy: SYNCHRONOUS cudaMemcpy2D from NVDEC surfaces into a GpuNV12Slot
+// buffer from the global pool. Data is valid immediately after the call returns.
+// AVFrame is freed immediately (NVDEC surface returned to decoder pool).
 //
-// The owned GPU pointers are stored as both yPlane/uvPlane (for zero-copy reads)
-// and gpuCacheY/gpuCacheUV (for lifecycle management / cudaFree on cleanup).
+// The slot is protected by a 200ms cooldown after the GpuFrameData's refcount
+// drops to 0, guaranteeing that all in-flight GPU kernels (which complete in
+// <10ms) have finished reading from the buffer before it can be reused.
 //
-// VRAM budget: if the global GPU cache budget is exceeded, falls back to CPU-only
-// NV12 snapshot (no zero-copy, but safe).
+// slot: pre-acquired from GpuNV12SlotPool::instance().acquire().
+//     If non-null, D2D copy goes into slot buffers (no per-frame alloc).
+//     If nullptr, falls back to per-frame cudaMallocPitch (legacy path).
 //
 // Fallback: cpuYPlane/cpuUvPlane hold CPU-side NV12 snapshot for cross-GPU
 // inference (when decode GPU != inference GPU).
 inline void gpu_frame_attach_cuda(cv::Mat* mat, AVFrame* cudaFrame, int gpuIdx, int64_t pts,
-                                  AVFrame* cpuNV12 = nullptr) {
+                                  AVFrame* cpuNV12 = nullptr,
+                                  GpuNV12Slot* slot = nullptr) {
     if (!mat || !cudaFrame) {
         GPU_FRAME_DBG("attach_cuda: SKIP mat=%p cudaFrame=%p", (void*)mat, (void*)cudaFrame);
         return;
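The calling contract this signature implies, mirroring the GetRTSPCVImage call site later in this commit (variable names are taken from that call site):

    // acquire() may return nullptr (pool full); attach then falls back to the
    // legacy per-frame allocation or the CPU-only snapshot.
    GpuNV12Slot* slot = GpuNV12SlotPool::instance().acquire(
        gpuIdx, cudaHW->width, cudaHW->height);
    // attach takes ownership of BOTH AVFrames; the caller must not
    // av_frame_free them afterwards.
    gpu_frame_attach_cuda(*image, cudaHW, gpuIdx, timeStamp, cpuNV12, slot);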
@@ -202,9 +205,9 @@ inline void gpu_frame_attach_cuda(cv::Mat* mat, AVFrame* cudaFrame, int gpuIdx,
 
     const int w = cudaFrame->width;
     const int h = cudaFrame->height;
-    GPU_FRAME_DBG("attach_cuda: START mat=%p %dx%d gpu=%d nvdecY=%p nvdecUV=%p cpuNV12=%p",
+    GPU_FRAME_DBG("attach_cuda: START mat=%p %dx%d gpu=%d nvdecY=%p nvdecUV=%p slot=%p",
                   (void*)mat, w, h, gpuIdx,
-                  (void*)cudaFrame->data[0], (void*)cudaFrame->data[1], (void*)cpuNV12);
+                  (void*)cudaFrame->data[0], (void*)cudaFrame->data[1], (void*)slot);
 
     GpuFrameData data{};
     data.gpuIndex = gpuIdx;
@@ -213,86 +216,145 @@ inline void gpu_frame_attach_cuda(cv::Mat* mat, AVFrame* cudaFrame, int gpuIdx,
     data.height = h;
     data.pixelFormat = 23; // AV_PIX_FMT_NV12
 
-    // Snapshot CPU NV12 for cross-GPU fallback (must do before freeing cpuNV12)
-    if (cpuNV12) {
-        anscv_gpu_ops::detail::snapshotNV12Planes(
-            cpuNV12,
-            data.cpuYPlane, data.cpuYLinesize,
-            data.cpuUvPlane, data.cpuUvLinesize,
-            data.width, data.height);
-    }
-
-    // --- D2D copy: NVDEC surface → owned GPU memory ---
-    // Estimate VRAM needed for the owned NV12 copy
-    const size_t yBytes = static_cast<size_t>(w) * h;
-    const size_t uvBytes = static_cast<size_t>(w) * (h / 2);
-    const size_t totalBytes = yBytes + uvBytes;
+    // NOTE: CPU NV12 snapshot is DEFERRED — only taken if pool D2D fails.
+    // For 4K frames, the snapshot is ~12MB malloc+memcpy+free per frame.
+    // Skipping it when the pool path succeeds (the common case) eliminates
+    // ~276MB/s of CPU heap allocation churn that causes process-level stalls.
+
+    // --- D2D copy: NVDEC surface → GPU buffer ---
     bool d2dOk = false;
-    if (ANSGpuFrameRegistry::instance().canAllocateGpuCache(totalBytes)) {
-
+    if (slot && slot->bufY && slot->bufUV && slot->pitchY > 0 && slot->pitchUV > 0) {
+        // --- Global pool path: D2D copy on per-slot non-blocking stream ---
+        // CRITICAL: Using the NULL stream (cudaMemcpy2D without stream) causes
+        // 1-2 second stalls on WDDM because it implicitly synchronizes with
+        // ALL other streams before executing. By using cudaMemcpy2DAsync on
+        // the slot's own non-blocking stream + cudaStreamSynchronize, we:
+        //   1. Submit the copy immediately (no wait for inference kernels)
+        //   2. Wait ONLY for this copy to finish (~0.3ms 1080p, ~1.2ms 4K)
+        //   3. Data is valid after sync — av_frame_free is safe
         int prevDev = -1;
         cudaGetDevice(&prevDev);
-        if (gpuIdx >= 0)
-            cudaSetDevice(gpuIdx);
+        if (gpuIdx >= 0) cudaSetDevice(gpuIdx);
 
-        void* ownedY = nullptr;
-        void* ownedUV = nullptr;
-        size_t yPitch = 0;
-        size_t uvPitch = 0;
-
-        cudaError_t e1 = cudaMallocPitch(&ownedY, &yPitch, w, h);
-        cudaError_t e2 = cudaMallocPitch(&ownedUV, &uvPitch, w, h / 2);
-
-        if (e1 == cudaSuccess && e2 == cudaSuccess) {
-            cudaError_t e3 = cudaMemcpy2D(ownedY, yPitch,
-                                          cudaFrame->data[0], cudaFrame->linesize[0],
-                                          w, h, cudaMemcpyDeviceToDevice);
-            cudaError_t e4 = cudaMemcpy2D(ownedUV, uvPitch,
-                                          cudaFrame->data[1], cudaFrame->linesize[1],
-                                          w, h / 2, cudaMemcpyDeviceToDevice);
+        cudaStream_t copyStream = static_cast<cudaStream_t>(slot->copyStream);
+        cudaError_t e3, e4;
+
+        if (copyStream) {
+            e3 = cudaMemcpy2DAsync(slot->bufY, slot->pitchY,
+                                   cudaFrame->data[0], cudaFrame->linesize[0],
+                                   w, h, cudaMemcpyDeviceToDevice, copyStream);
+            e4 = cudaMemcpy2DAsync(slot->bufUV, slot->pitchUV,
+                                   cudaFrame->data[1], cudaFrame->linesize[1],
+                                   w, h / 2, cudaMemcpyDeviceToDevice, copyStream);
-            if (e3 == cudaSuccess && e4 == cudaSuccess) {
-                // Store owned GPU pointers as primary NV12 source
-                data.isCudaDevicePtr = true;
-                data.yPlane = static_cast<uint8_t*>(ownedY);
-                data.uvPlane = static_cast<uint8_t*>(ownedUV);
-                data.yLinesize = static_cast<int>(yPitch);
-                data.uvLinesize = static_cast<int>(uvPitch);
-
-                // Track in gpuCache for lifecycle management (cudaFree on cleanup)
-                data.gpuCacheY = ownedY;
-                data.gpuCacheUV = ownedUV;
-                data.gpuCacheYPitch = yPitch;
-                data.gpuCacheUVPitch = uvPitch;
-                data.gpuCacheDeviceIdx = gpuIdx;
-                data.gpuCacheValid = true;
-                data.gpuCacheBytes = yPitch * h + uvPitch * (h / 2);
-
-                ANSGpuFrameRegistry::instance().onGpuCacheCreated(data.gpuCacheBytes);
-                d2dOk = true;
-                GPU_FRAME_DBG("attach_cuda: D2D OK ownedY=%p ownedUV=%p yPitch=%zu uvPitch=%zu bytes=%zu",
-                              ownedY, ownedUV, yPitch, uvPitch, data.gpuCacheBytes);
-            } else {
-                // D2D copy failed — free allocated memory and fall back
-                GPU_FRAME_DBG("attach_cuda: D2D COPY FAILED e3=%d e4=%d — fallback CPU",
-                              (int)e3, (int)e4);
-                cudaFree(ownedY);
-                cudaFree(ownedUV);
+            // Wait ONLY for this stream's 2 copies (~0.3-1.2ms).
+            // Does NOT wait for inference kernels on other streams.
+            cudaStreamSynchronize(copyStream);
            }
         } else {
-            // Allocation failed — free any partial allocation and fall back
-            GPU_FRAME_DBG("attach_cuda: cudaMallocPitch FAILED e1=%d e2=%d — fallback CPU",
-                          (int)e1, (int)e2);
-            if (e1 == cudaSuccess) cudaFree(ownedY);
-            if (e2 == cudaSuccess) cudaFree(ownedUV);
+            // Fallback if stream creation failed — NULL stream (may stall)
+            e3 = cudaMemcpy2D(slot->bufY, slot->pitchY,
+                              cudaFrame->data[0], cudaFrame->linesize[0],
+                              w, h, cudaMemcpyDeviceToDevice);
+            e4 = cudaMemcpy2D(slot->bufUV, slot->pitchUV,
+                              cudaFrame->data[1], cudaFrame->linesize[1],
+                              w, h / 2, cudaMemcpyDeviceToDevice);
         }
 
-        if (prevDev >= 0)
-            cudaSetDevice(prevDev);
+        if (prevDev >= 0) cudaSetDevice(prevDev);
 
+        if (e3 == cudaSuccess && e4 == cudaSuccess) {
+            data.isCudaDevicePtr = true;
+            data.yPlane = static_cast<uint8_t*>(slot->bufY);
+            data.uvPlane = static_cast<uint8_t*>(slot->bufUV);
+            data.yLinesize = static_cast<int>(slot->pitchY);
+            data.uvLinesize = static_cast<int>(slot->pitchUV);
+            data.poolSlot = slot; // Track for deferred release
+            // gpuCacheY/UV stay nullptr — global pool owns the buffers
+            d2dOk = true;
+            GPU_FRAME_DBG("attach_cuda: D2D OK (global pool) Y=%p UV=%p yPitch=%zu uvPitch=%zu",
+                          slot->bufY, slot->bufUV, slot->pitchY, slot->pitchUV);
+        } else {
+            GPU_FRAME_DBG("attach_cuda: D2D COPY FAILED (pool) e3=%d e4=%d — fallback",
+                          (int)e3, (int)e4);
+            // Release slot back to pool on failure (immediate, no cooldown needed)
+            slot->state.store(GpuNV12Slot::STATE_FREE, std::memory_order_release);
+        }
+    }
 
+    if (!d2dOk && !slot) {
+        // --- Legacy path: per-frame cudaMallocPitch (for modules without pool) ---
+        const size_t yBytes = static_cast<size_t>(w) * h;
+        const size_t uvBytes = static_cast<size_t>(w) * (h / 2);
+        const size_t totalBytes = yBytes + uvBytes;
+
+        if (ANSGpuFrameRegistry::instance().canAllocateGpuCache(totalBytes)) {
+            int prevDev = -1;
+            cudaGetDevice(&prevDev);
+            if (gpuIdx >= 0) cudaSetDevice(gpuIdx);
+
+            void* ownedY = nullptr;
+            void* ownedUV = nullptr;
+            size_t yPitch = 0;
+            size_t uvPitch = 0;
+
+            cudaError_t e1 = cudaMallocPitch(&ownedY, &yPitch, w, h);
+            cudaError_t e2 = cudaMallocPitch(&ownedUV, &uvPitch, w, h / 2);
+
+            if (e1 == cudaSuccess && e2 == cudaSuccess) {
+                cudaError_t e3 = cudaMemcpy2D(ownedY, yPitch,
+                                              cudaFrame->data[0], cudaFrame->linesize[0],
+                                              w, h, cudaMemcpyDeviceToDevice);
+                cudaError_t e4 = cudaMemcpy2D(ownedUV, uvPitch,
+                                              cudaFrame->data[1], cudaFrame->linesize[1],
+                                              w, h / 2, cudaMemcpyDeviceToDevice);
+
+                if (e3 == cudaSuccess && e4 == cudaSuccess) {
+                    data.isCudaDevicePtr = true;
+                    data.yPlane = static_cast<uint8_t*>(ownedY);
+                    data.uvPlane = static_cast<uint8_t*>(ownedUV);
+                    data.yLinesize = static_cast<int>(yPitch);
+                    data.uvLinesize = static_cast<int>(uvPitch);
+                    data.gpuCacheY = ownedY;
+                    data.gpuCacheUV = ownedUV;
+                    data.gpuCacheYPitch = yPitch;
+                    data.gpuCacheUVPitch = uvPitch;
+                    data.gpuCacheDeviceIdx = gpuIdx;
+                    data.gpuCacheValid = true;
+                    data.gpuCacheBytes = yPitch * h + uvPitch * (h / 2);
+                    ANSGpuFrameRegistry::instance().onGpuCacheCreated(data.gpuCacheBytes);
+                    d2dOk = true;
+                    GPU_FRAME_DBG("attach_cuda: D2D OK ownedY=%p ownedUV=%p yPitch=%zu uvPitch=%zu bytes=%zu",
+                                  ownedY, ownedUV, yPitch, uvPitch, data.gpuCacheBytes);
+                } else {
+                    GPU_FRAME_DBG("attach_cuda: D2D COPY FAILED e3=%d e4=%d — fallback CPU",
+                                  (int)e3, (int)e4);
+                    cudaFree(ownedY);
+                    cudaFree(ownedUV);
+                }
+            } else {
+                GPU_FRAME_DBG("attach_cuda: cudaMallocPitch FAILED e1=%d e2=%d — fallback CPU",
+                              (int)e1, (int)e2);
+                if (e1 == cudaSuccess) cudaFree(ownedY);
+                if (e2 == cudaSuccess) cudaFree(ownedUV);
+            }
+
+            if (prevDev >= 0) cudaSetDevice(prevDev);
+        }
+    }
 
     if (!d2dOk) {
-        // Fall back to CPU NV12 snapshot only (no zero-copy)
+        // D2D failed or no slot — take CPU NV12 snapshot now (before freeing cpuNV12).
+        // This is the ONLY path where the CPU snapshot is needed. Skipping it
+        // on the pool-success path avoids ~12MB malloc+memcpy+free per 4K frame.
+        if (cpuNV12) {
+            anscv_gpu_ops::detail::snapshotNV12Planes(
+                cpuNV12,
+                data.cpuYPlane, data.cpuYLinesize,
+                data.cpuUvPlane, data.cpuUvLinesize,
+                data.width, data.height);
+        }
         GPU_FRAME_DBG("attach_cuda: FALLBACK CPU-only cpuY=%p cpuUV=%p",
                       (void*)data.cpuYPlane, (void*)data.cpuUvPlane);
         data.isCudaDevicePtr = false;
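As a back-of-envelope check on the figures quoted in the comments: NV12 stores w × h bytes of Y plus w × h/2 bytes of interleaved UV, i.e. 1.5 bytes per pixel, so a 4K frame is 3840 × 2160 × 1.5 ≈ 12.4 MB; at roughly 23 fps that matches the ~276 MB/s churn cited above. A self-contained sketch of the non-blocking-stream copy pattern (function and parameter names are illustrative; the CUDA calls are the ones used in the diff):

    #include <cuda_runtime.h>

    // Copy a pitched NV12 surface into a pooled buffer on a dedicated
    // non-blocking stream (created once per slot with
    // cudaStreamCreateWithFlags(&s, cudaStreamNonBlocking)), then wait
    // only for that stream, never for kernels on other streams.
    cudaError_t copyNV12ToSlot(void* dstY, size_t dstPitchY,
                               void* dstUV, size_t dstPitchUV,
                               const void* srcY, const void* srcUV,
                               size_t srcPitch, int w, int h,
                               cudaStream_t stream) {
        cudaError_t e = cudaMemcpy2DAsync(dstY, dstPitchY, srcY, srcPitch,
                                          w, h, cudaMemcpyDeviceToDevice, stream);
        if (e != cudaSuccess) return e;
        e = cudaMemcpy2DAsync(dstUV, dstPitchUV, srcUV, srcPitch,
                              w, h / 2, cudaMemcpyDeviceToDevice, stream);
        if (e != cudaSuccess) return e;
        // Data is valid once this returns; freeing the source is then safe.
        return cudaStreamSynchronize(stream);
    }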
@@ -302,8 +364,8 @@ inline void gpu_frame_attach_cuda(cv::Mat* mat, AVFrame* cudaFrame, int gpuIdx,
         data.uvLinesize = data.cpuUvLinesize;
     }
 
-    // Release AVFrames immediately — NVDEC surfaces returned to pool.
-    // No longer stored in GpuFrameData (owned GPU copy is independent).
+    // Free AVFrames immediately — synchronous D2D copy has completed,
+    // so NVDEC surfaces can be returned to the decoder's surface pool.
     GPU_FRAME_DBG("attach_cuda: freeing AVFrames cudaFrame=%p cpuNV12=%p",
                   (void*)cudaFrame, (void*)cpuNV12);
     av_frame_free(&cudaFrame);
@@ -311,9 +373,9 @@ inline void gpu_frame_attach_cuda(cv::Mat* mat, AVFrame* cudaFrame, int gpuIdx,
     data.avframe = nullptr;
     data.cpuAvframe = nullptr;
 
-    GPU_FRAME_DBG("attach_cuda: FINAL yPlane=%p uvPlane=%p isCuda=%d gpuCacheY=%p gpuCacheUV=%p",
+    GPU_FRAME_DBG("attach_cuda: FINAL yPlane=%p uvPlane=%p isCuda=%d poolSlot=%p",
                   (void*)data.yPlane, (void*)data.uvPlane, (int)data.isCudaDevicePtr,
-                  data.gpuCacheY, data.gpuCacheUV);
+                  (void*)data.poolSlot);
 
     void* old = ANSGpuFrameRegistry::instance().attach(mat, std::move(data));
     if (old) {
@@ -327,12 +389,10 @@ inline void gpu_frame_attach_cuda(cv::Mat* mat, AVFrame* cudaFrame, int gpuIdx,
         AVFrame* stale = static_cast<AVFrame*>(p);
         av_frame_free(&stale);
     }
-
-    // Free stale GPU device pointers
-    anscv_gpu_ops::detail::drainAndFreeGpuPending();
 }
 
-// Release entry by cv::Mat* and free any returned AVFrames + GPU pointers.
+// Release entry by cv::Mat* and free any returned AVFrames.
+// GPU device pointers are deferred to TTL eviction or explicit cleanup.
 // Safe if not in map (no-op).
 inline void gpu_frame_remove(cv::Mat* mat) {
     if (!mat) return;
@@ -347,8 +407,7 @@ inline void gpu_frame_remove(cv::Mat* mat) {
         av_frame_free(&stale);
     }
 
-    // Free any GPU device pointers that became pending
-    anscv_gpu_ops::detail::drainAndFreeGpuPending();
+    // GPU device pointers deferred — see gpu_frame_evict_stale() / Destroy()
 }
 
 // Alias for remove — used in ANSCV mutating functions to drop stale GPU data.
@@ -357,6 +416,12 @@ inline void gpu_frame_invalidate(cv::Mat* mat) {
 }
 
 // Run TTL eviction + drain pending. Call periodically from camera threads.
+// TTL eviction is throttled to every 500ms (EVICT_CHECK_INTERVAL_MS).
+// GPU buffer cleanup is safe here because:
+//  1. Only frames >3 seconds old are evicted (kernels take <10ms)
+//  2. cudaDeviceSynchronize() ensures all in-flight kernels are done
+//  3. At 500ms interval, one sync per 500ms is ~0.1ms cost (acceptable)
+//     vs per-frame sync which caused 900ms spikes
 inline void gpu_frame_evict_stale() {
     ANSGpuFrameRegistry::instance().evictStaleFrames();
 
@@ -366,6 +431,7 @@ inline void gpu_frame_evict_stale() {
         av_frame_free(&stale);
     }
 
-    // Free any GPU device pointers from evicted frames
+    // Free GPU device pointers from evicted/released frames (legacy path).
+    // Pool-backed frames (ANSRTSP) don't add to this list (gpuCacheY=nullptr).
     anscv_gpu_ops::detail::drainAndFreeGpuPending();
 }
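The 500ms throttle referenced above lives inside ANSGpuFrameRegistry and is not part of this diff; a hypothetical sketch of the pattern:

    #include <chrono>
    #include <mutex>

    // Returns true at most once per 500ms window (EVICT_CHECK_INTERVAL_MS in
    // the registry). The mutex makes the check safe from multiple camera threads.
    inline bool shouldRunEviction() {
        using clock = std::chrono::steady_clock;
        static std::mutex m;
        static clock::time_point last{};
        std::lock_guard<std::mutex> g(m);
        const auto now = clock::now();
        if (now - last < std::chrono::milliseconds(500)) return false;
        last = now;
        return true;
    }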
@@ -1,6 +1,7 @@
 #include "ANSRTSP.h"
 #include "ANSMatRegistry.h"
 #include "ANSGpuFrameOps.h"
+#include "GpuNV12SlotPool.h"
 #include <memory>
 #include <format>
 #include "media_codec.h"
@@ -23,8 +24,9 @@ extern "C"
 // Note: per-instance thread safety is handled by ANSRTSPClient::_mutex
 // Mat registry thread safety is handled by anscv_mat_replace's internal registry_mutex
 
-// Debug logging — goes to both stderr AND OutputDebugString (DebugView).
+// Debug logging. Define ANSCORE_GPU_DEBUG=1 to enable verbose per-frame logging.
 #ifndef RTSP_DBG
+#if defined(ANSCORE_GPU_DEBUG) && ANSCORE_GPU_DEBUG
 #ifdef _WIN32
 #define RTSP_DBG(fmt, ...) do { \
     char _rtsp_buf[512]; \
@@ -35,6 +37,9 @@ extern "C"
 #else
 #define RTSP_DBG(fmt, ...) fprintf(stderr, fmt "\n", ##__VA_ARGS__)
 #endif
+#else
+#define RTSP_DBG(fmt, ...) ((void)0)
+#endif
 #endif
 static bool ansrtspLicenceValid = false;
 // Global once_flag to protect license checking
@@ -62,6 +67,7 @@ namespace ANSCENTER {
     ANSRTSPClient::~ANSRTSPClient() noexcept {
         Destroy();
     }
+
     void ANSRTSPClient::Destroy() {
         // Move the player client pointer out of the lock scope, then
         // close it OUTSIDE the mutex. close() calls cuArrayDestroy /
@@ -80,69 +86,44 @@ namespace ANSCENTER {
             }
         }
 
-        // --- Inference guard: wait for in-flight frames to finish ---
-        // GetRTSPCVImage increments _inFlightFrames when it hands out
-        // a GPU frame; the registry decrements it when the frame is
-        // released after inference completes. We wait here so that
-        // close() doesn't free NVDEC surfaces while TensorRT is
-        // still reading from them (the LabVIEW crash root cause).
+        // --- Inference guard: wait for in-flight D2D copies to finish ---
+        // With synchronous D2D copy, in-flight means "currently inside
+        // GetRTSPCVImage between TryIncrementInFlight and attach_cuda".
+        // This is typically <1ms, so the wait is very fast.
         int inFlight = _inFlightFrames.load(std::memory_order_acquire);
         if (inFlight > 0) {
             _logger.LogInfo("ANSRTSPClient::Destroy",
-                std::format("waiting for {} in-flight inference frame(s)...", inFlight),
+                std::format("waiting for {} in-flight frame(s)...", inFlight),
                 __FILE__, __LINE__);
             bool done = _inFlightDone.wait_for(lock, std::chrono::seconds(5), [this] {
                 return _inFlightFrames.load(std::memory_order_acquire) <= 0;
            });
             if (!done) {
                 _logger.LogWarn("ANSRTSPClient::Destroy",
-                    std::format("timed out waiting for in-flight frames "
-                                "(still {} in-flight) — force-releasing GPU frames",
-                                _inFlightFrames.load()),
+                    std::format("timed out — still {} in-flight", _inFlightFrames.load()),
                     __FILE__, __LINE__);
             }
         }
 
-        // Force-release ALL GPU frames owned by this client BEFORE close().
-        // Unreleased clones (e.g. LabVIEW AI tasks still holding cloned
-        // cv::Mat*) keep gpuCacheY/gpuCacheUV allocated. We must cudaFree
-        // them NOW while the CUDA context is still alive. After close()
-        // destroys the context, cudaFree would crash.
-        int forceReleased = ANSGpuFrameRegistry::instance().forceReleaseByOwner(this);
-        if (forceReleased > 0) {
-            _logger.LogWarn("ANSRTSPClient::Destroy",
-                std::format("force-released {} GPU frame(s) with unreleased clones", forceReleased),
-                __FILE__, __LINE__);
-            // Drain and cudaFree the GPU buffers while CUDA context is alive
-            // Sync all GPU streams before freeing to avoid illegal access
-            cudaDeviceSynchronize();
-            auto gpuPending = ANSGpuFrameRegistry::instance().drain_gpu_pending();
-            if (!gpuPending.empty()) {
-                RTSP_DBG("[Destroy] cudaFree %zu GPU ptrs before close()", gpuPending.size());
-                int prevDev = -1;
-                cudaGetDevice(&prevDev);
-                for (auto& entry : gpuPending) {
-                    if (entry.ptr) {
-                        if (entry.deviceIdx >= 0) cudaSetDevice(entry.deviceIdx);
-                        cudaFree(entry.ptr);
-                    }
-                }
-                if (prevDev >= 0) cudaSetDevice(prevDev);
-            }
-            // Also drain any pending AVFrames
-            auto avPending = ANSGpuFrameRegistry::instance().drain_pending();
-            for (void* p : avPending) {
-                AVFrame* f = static_cast<AVFrame*>(p);
-                av_frame_free(&f);
-            }
-        }
+        // Invalidate owner callbacks so stale GpuFrameData don't try to
+        // call DecrementInFlight on this (soon-to-be-deleted) object.
+        // The GpuFrameData and their global pool slots remain alive —
+        // inference engines can safely keep reading from them.
+        ANSGpuFrameRegistry::instance().invalidateOwner(this);
+        _inFlightFrames.store(0, std::memory_order_release);
+
+        // NO forceReleaseByOwner — frames survive camera deletion.
+        // Pool slot buffers are global (GpuNV12SlotPool) — NOT owned
+        // by this camera. They are recycled when inference finishes
+        // (GpuFrameData refcount → 0 → slot.inUse = false).
+        // NO cudaDeviceSynchronize — no GPU buffers to free here.
+        // NO DestroyGpuPool — per-camera pool has been removed.
 
         clientToClose = std::move(_playerClient);
     }
     // CUDA cleanup happens here, outside the mutex — now safe.
-    // All GPU frames owned by this client have been force-freed above.
+    // close() destroys the NVDEC decoder ONLY. Pool slot buffers
+    // (regular cudaMallocPitch allocations) are untouched — they
+    // belong to the global GpuNV12SlotPool, not the decoder.
     if (clientToClose) {
         clientToClose->close();
     }
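The guard members and helpers used in Destroy() are declared elsewhere; a hypothetical sketch consistent with how they are used above (note condition_variable_any, since the wait takes a unique_lock over a recursive_mutex):

    #include <atomic>
    #include <condition_variable>

    struct InFlightGuard {
        std::atomic<int> _inFlightFrames{0};
        std::condition_variable_any _inFlightDone;

        // Frame-grab path: refuse new work once the client is stopping.
        bool TryIncrementInFlight(bool isPlaying) {
            if (!isPlaying) return false;
            _inFlightFrames.fetch_add(1, std::memory_order_acq_rel);
            return true;
        }

        // Fired when the last clone of a frame is released after inference.
        void DecrementInFlight() {
            if (_inFlightFrames.fetch_sub(1, std::memory_order_acq_rel) == 1)
                _inFlightDone.notify_all();
        }
    };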
@@ -232,66 +213,44 @@ namespace ANSCENTER {
     bool ANSRTSPClient::Reconnect() {
         // 1. Mark as not-playing under the mutex FIRST. This makes GetImage()
         //    return the cached _pLastFrame instead of calling into the player,
-        //    preventing use-after-free when close() destroys CUDA resources.
+        //    and blocks new TryIncrementInFlight calls.
         {
             std::unique_lock<std::recursive_mutex> lock(_mutex);
             _isPlaying = false;
 
-            // --- Inference guard: wait for in-flight frames to finish ---
-            // Same guard as Destroy(): close() will free NVDEC surfaces, so
-            // we must wait for any inference engines still reading NV12 data
-            // via zero-copy CUDA device pointers.
+            // --- Inference guard: wait for in-flight D2D copies to finish ---
+            // With synchronous D2D copy, in-flight means "currently inside
+            // GetRTSPCVImage between TryIncrementInFlight and attach_cuda".
+            // This is typically <1ms, so the wait is very fast.
             int inFlight = _inFlightFrames.load(std::memory_order_acquire);
             if (inFlight > 0) {
                 _logger.LogInfo("ANSRTSPClient::Reconnect",
-                    std::format("waiting for {} in-flight inference frame(s)...", inFlight),
+                    std::format("waiting for {} in-flight frame(s)...", inFlight),
                     __FILE__, __LINE__);
                 bool done = _inFlightDone.wait_for(lock, std::chrono::seconds(5), [this] {
                     return _inFlightFrames.load(std::memory_order_acquire) <= 0;
                 });
                 if (!done) {
                     _logger.LogWarn("ANSRTSPClient::Reconnect",
-                        std::format("timed out waiting for in-flight frames "
-                                    "(still {} in-flight) — force-releasing GPU frames",
-                                    _inFlightFrames.load()),
+                        std::format("timed out — still {} in-flight", _inFlightFrames.load()),
                         __FILE__, __LINE__);
                 }
             }
 
-            // Force-release GPU frames before close() — same as Destroy().
-            int forceReleased = ANSGpuFrameRegistry::instance().forceReleaseByOwner(this);
-            if (forceReleased > 0) {
-                _logger.LogWarn("ANSRTSPClient::Reconnect",
-                    std::format("force-released {} GPU frame(s) with unreleased clones", forceReleased),
-                    __FILE__, __LINE__);
-                // Sync all GPU streams before freeing
-                cudaDeviceSynchronize();
-                auto gpuPending = ANSGpuFrameRegistry::instance().drain_gpu_pending();
-                if (!gpuPending.empty()) {
-                    int prevDev = -1;
-                    cudaGetDevice(&prevDev);
-                    for (auto& entry : gpuPending) {
-                        if (entry.ptr) {
-                            if (entry.deviceIdx >= 0) cudaSetDevice(entry.deviceIdx);
-                            cudaFree(entry.ptr);
-                        }
-                    }
-                    if (prevDev >= 0) cudaSetDevice(prevDev);
-                }
-                auto avPending = ANSGpuFrameRegistry::instance().drain_pending();
-                for (void* p : avPending) {
-                    AVFrame* f = static_cast<AVFrame*>(p);
-                    av_frame_free(&f);
-                }
-            }
+            // Invalidate owner callbacks — prevents stale DecrementInFlight
+            // calls after Reconnect re-creates the decoder.
+            // Frames and their global pool slots remain alive for inference.
+            ANSGpuFrameRegistry::instance().invalidateOwner(this);
+            _inFlightFrames.store(0, std::memory_order_release);
 
+            // NO forceReleaseByOwner — frames survive reconnect.
+            // NO cudaDeviceSynchronize — no GPU buffers to free.
+            // NO DestroyGpuPool — per-camera pool has been removed.
         }
 
-        // 2. close() does CUDA cleanup (cuArrayDestroy/cuMemFree) — run outside
-        //    _mutex to avoid deadlocking with nvcuda64 SRW lock held by inference.
-        //    Safe now because GetImage()/GetNV12Frame() won't touch the player
-        //    while _isPlaying == false, and all in-flight frames have been released.
+        // 2. close() destroys NVDEC decoder ONLY — run outside _mutex to
+        //    avoid deadlocking with nvcuda64 SRW lock held by inference.
+        //    Pool slot buffers are global and untouched.
         _logger.LogInfo("ANSRTSPClient::Reconnect",
             "calling close() — NVDEC decoder will be destroyed", __FILE__, __LINE__);
         RTSP_DBG("[Reconnect] BEFORE close() this=%p", (void*)this);
@@ -1071,6 +1030,8 @@ extern "C" __declspec(dllexport) int GetRTSPCVImage(
     }
 
     try {
+        auto t0 = std::chrono::steady_clock::now();
+
         // Get image (shallow copy - reference counted, fast)
         cv::Mat img = (*Handle)->GetImage(width, height, timeStamp);
 
@@ -1082,6 +1043,8 @@ extern "C" __declspec(dllexport) int GetRTSPCVImage(
         // Thread-safe Mat pointer swap (anscv_mat_replace has its own internal lock)
         anscv_mat_replace(image, std::move(img));
 
+        auto t1 = std::chrono::steady_clock::now();
+
         // Attach NV12 frame for GPU fast-path inference (side-table registry)
         // attach() takes ownership — do NOT av_frame_free here
         //
@@ -1101,7 +1064,11 @@ extern "C" __declspec(dllexport) int GetRTSPCVImage(
                      cudaHW->width, cudaHW->height,
                      (void*)cudaHW->data[0], (void*)cudaHW->data[1]);
             AVFrame* cpuNV12 = (*Handle)->GetNV12Frame();
-            gpu_frame_attach_cuda(*image, cudaHW, gpuIdx, timeStamp, cpuNV12);
+
+            // Acquire a slot from the global pool — survives camera Destroy.
+            GpuNV12Slot* slot = GpuNV12SlotPool::instance().acquire(
+                gpuIdx, cudaHW->width, cudaHW->height);
+            gpu_frame_attach_cuda(*image, cudaHW, gpuIdx, timeStamp, cpuNV12, slot);
         } else {
             // HW decode not active — try CPU NV12
             AVFrame* nv12 = (*Handle)->GetNV12Frame();
@@ -1114,11 +1081,11 @@ extern "C" __declspec(dllexport) int GetRTSPCVImage(
             // TryIncrementInFlight already incremented; DecrementInFlight fires
             // when the last clone of this frame is released after inference.
             auto* gpuData = ANSGpuFrameRegistry::instance().lookup(*image);
-            RTSP_DBG("[GetRTSPCVImage] after attach: gpuData=%p yPlane=%p isCuda=%d gpuCacheY=%p",
+            RTSP_DBG("[GetRTSPCVImage] after attach: gpuData=%p yPlane=%p isCuda=%d poolSlot=%p",
                      (void*)gpuData,
                      gpuData ? (void*)gpuData->yPlane : nullptr,
                      gpuData ? (int)gpuData->isCudaDevicePtr : -1,
-                     gpuData ? gpuData->gpuCacheY : nullptr);
+                     gpuData ? (void*)gpuData->poolSlot : nullptr);
             if (gpuData) {
                 gpuData->ownerClient = *Handle;
                 gpuData->onReleaseFn = [](void* client) {
@@ -1136,6 +1103,20 @@ extern "C" __declspec(dllexport) int GetRTSPCVImage(
             RTSP_DBG("[GetRTSPCVImage] SKIP CUDA — player not playing (reconnecting?)");
         }
 
+        // Lightweight timing via spdlog (no OutputDebugString).
+        // Logs only when the frame grab + D2D exceeds 50ms — helps diagnose stalls
+        // without the overhead of per-frame debug logging.
+        auto t2 = std::chrono::steady_clock::now();
+        double getImageMs = std::chrono::duration<double, std::milli>(t1 - t0).count();
+        double cudaMs = std::chrono::duration<double, std::milli>(t2 - t1).count();
+        double totalMs = getImageMs + cudaMs;
+        if (totalMs > 50.0) {
+            (*Handle)->_logger.LogWarn("GetRTSPCVImage",
+                std::format("SLOW FRAME: total={:.1f}ms (getImage={:.1f}ms cuda={:.1f}ms) {}x{}",
+                    totalMs, getImageMs, cudaMs, width, height),
+                __FILE__, __LINE__);
+        }
+
         return 1; // Success
     }
     catch (const cv::Exception& e) {

@@ -40,7 +40,7 @@ namespace ANSCENTER
         bool _isPlaying;
         std::recursive_mutex _mutex;
 
-        // --- Per-client inference guard ---
+        // --- Per-client inference guard ---
         // Tracks how many GPU frames from this client are currently in-flight
         // (grabbed by GetRTSPCVImage but not yet released after inference).
         // Destroy() waits for this to reach 0 before freeing NVDEC surfaces,
modules/ANSCV/GpuNV12SlotPool.cpp (new file, 107 lines)
@@ -0,0 +1,107 @@
+// GpuNV12SlotPool.cpp — Process-wide singleton, compiled into ANSCV.dll.
+//
+// ANSCV.dll owns the canonical GpuNV12SlotPool instance. Other DLLs
+// (ANSODEngine, etc.) find it via GetProcAddress at runtime.
+
+#define WIN32_LEAN_AND_MEAN
+#define NOMINMAX
+#include <windows.h>
+#include "GpuNV12SlotPool.h"
+
+#include <cuda_runtime.h>
+
+// ANSCV.dll owns the process-wide singleton.
+GpuNV12SlotPool* GpuNV12SlotPool::resolveProcessWide() {
+    static GpuNV12SlotPool pool;
+    return &pool;
+}
+
+// Exported so other DLLs (ANSODEngine, etc.) can find this instance at runtime.
+extern "C" __declspec(dllexport)
+GpuNV12SlotPool* GpuNV12SlotPool_GetInstance() {
+    return &GpuNV12SlotPool::instance();
+}
+
+// Transition all COOLING slots past the cooldown threshold to FREE.
+void GpuNV12SlotPool::drainCooledSlots_locked() {
+    auto now = std::chrono::steady_clock::now();
+    auto threshold = std::chrono::milliseconds(SLOT_COOLDOWN_MS);
+    for (auto& s : m_slots) {
+        if (s->state.load(std::memory_order_acquire) == GpuNV12Slot::STATE_COOLING) {
+            if (now - s->cooldownStart >= threshold) {
+                s->state.store(GpuNV12Slot::STATE_FREE, std::memory_order_release);
+            }
+        }
+    }
+}
+
+// Acquire a free slot matching (gpuIdx, w, h), or allocate a new one.
+GpuNV12Slot* GpuNV12SlotPool::acquire(int gpuIdx, int w, int h) {
+    std::lock_guard<std::mutex> lock(m_mutex);
+
+    // 1. Drain cooled-down slots to make them available
+    drainCooledSlots_locked();
+
+    // 2. Try to find an existing FREE slot that matches the resolution
+    for (auto& s : m_slots) {
+        if (s->state.load(std::memory_order_acquire) == GpuNV12Slot::STATE_FREE &&
+            s->gpuIdx == gpuIdx && s->width == w && s->height == h) {
+            s->state.store(GpuNV12Slot::STATE_ACTIVE, std::memory_order_release);
+            NV12POOL_DBG("acquire: reuse slot Y=%p UV=%p %dx%d gpu=%d (total=%zu)",
+                         s->bufY, s->bufUV, w, h, gpuIdx, m_slots.size());
+            return s.get();
+        }
+    }
+
+    // 3. No matching free slot — allocate a new one if under the limit
+    if (static_cast<int>(m_slots.size()) >= GPU_NV12_POOL_MAX_SLOTS) {
+        NV12POOL_DBG("acquire: POOL FULL (%zu slots) — fallback to CPU path",
+                     m_slots.size());
+        return nullptr;
+    }
+
+    // Allocate CUDA buffers on the target GPU
+    int prevDev = -1;
+    cudaGetDevice(&prevDev);
+    if (gpuIdx >= 0) cudaSetDevice(gpuIdx);
+
+    auto slot = std::make_unique<GpuNV12Slot>();
+    cudaError_t e1 = cudaMallocPitch(&slot->bufY, &slot->pitchY, w, h);
+    cudaError_t e2 = cudaMallocPitch(&slot->bufUV, &slot->pitchUV, w, h / 2);
+
+    // Non-blocking stream avoids NULL-stream implicit sync with inference.
+    // On WDDM, the NULL stream must wait for ALL other streams to finish
+    // before executing — this caused 1-2 second stalls when inference
+    // kernels were running. A non-blocking stream runs independently.
+    cudaStream_t stream = nullptr;
+    cudaError_t e3 = cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
+
+    if (prevDev >= 0) cudaSetDevice(prevDev);
+
+    if (e1 != cudaSuccess || e2 != cudaSuccess) {
+        NV12POOL_DBG("acquire: cudaMallocPitch FAILED %dx%d gpu=%d e1=%d e2=%d",
+                     w, h, gpuIdx, (int)e1, (int)e2);
+        // Clean up partial allocation
+        int prev2 = -1; cudaGetDevice(&prev2);
+        if (gpuIdx >= 0) cudaSetDevice(gpuIdx);
+        if (e1 == cudaSuccess && slot->bufY) cudaFree(slot->bufY);
+        if (e2 == cudaSuccess && slot->bufUV) cudaFree(slot->bufUV);
+        if (e3 == cudaSuccess && stream) cudaStreamDestroy(stream);
+        if (prev2 >= 0) cudaSetDevice(prev2);
+        return nullptr;
+    }
+
+    slot->width = w;
+    slot->height = h;
+    slot->gpuIdx = gpuIdx;
+    slot->copyStream = (e3 == cudaSuccess) ? stream : nullptr;
+    slot->state.store(GpuNV12Slot::STATE_ACTIVE, std::memory_order_release);
+
+    GpuNV12Slot* raw = slot.get();
+    m_slots.push_back(std::move(slot));
+
+    NV12POOL_DBG("acquire: NEW slot Y=%p UV=%p pitchY=%zu pitchUV=%zu %dx%d gpu=%d stream=%p (total=%zu)",
+                 raw->bufY, raw->bufUV, raw->pitchY, raw->pitchUV,
+                 w, h, gpuIdx, raw->copyStream, m_slots.size());
+    return raw;
+}
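The matching release path (refcount reaching zero) is not in this file; a hypothetical sketch consistent with the state machine above, using the 200ms cooldown described in the ANSGpuFrameOps.h comments:

    // Instead of going straight to FREE, a released slot COOLS for
    // SLOT_COOLDOWN_MS, so any kernel still reading it (<10ms) finishes
    // long before acquire() can hand the buffer out again.
    void releaseSlot(GpuNV12Slot* s) {
        if (!s) return;
        s->cooldownStart = std::chrono::steady_clock::now();
        s->state.store(GpuNV12Slot::STATE_COOLING, std::memory_order_release);
    }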
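A sketch of how a consumer DLL such as ANSODEngine might resolve the export above (module name is from the comments in this file; the error handling is illustrative):

    #include <windows.h>

    using GetPoolFn = GpuNV12SlotPool* (*)();

    GpuNV12SlotPool* resolveSharedPool() {
        // ANSCV.dll is already loaded by the host process, so no LoadLibrary.
        HMODULE mod = GetModuleHandleA("ANSCV.dll");
        if (!mod) return nullptr;
        auto fn = reinterpret_cast<GetPoolFn>(
            GetProcAddress(mod, "GpuNV12SlotPool_GetInstance"));
        return fn ? fn() : nullptr;
    }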