Disable NV12 path for ANSCV by default. Currently, cv::Mat** is used directly.
This commit is contained in:
@@ -23,6 +23,8 @@ extern "C" {
|
||||
#include <cuda_runtime.h>
|
||||
#include <cstring>
|
||||
#include <cstdlib>
|
||||
#include <thread>
|
||||
#include <mutex>
|
||||
#include <cstdio>
|
||||
|
||||
#ifdef _WIN32
|
||||
@@ -166,16 +168,13 @@ inline void gpu_frame_attach(cv::Mat* mat, AVFrame* nv12, int gpuIdx, int64_t pt
|
||||
|
||||
void* old = ANSGpuFrameRegistry::instance().attach(mat, std::move(data));
|
||||
if (old) {
|
||||
AVFrame* oldFrame = static_cast<AVFrame*>(old);
|
||||
av_frame_free(&oldFrame);
|
||||
// Defer old frame's AVFrame free
|
||||
auto& reg = ANSGpuFrameRegistry::instance();
|
||||
auto lk = reg.acquire_lock();
|
||||
reg.pushPendingFree_locked(old);
|
||||
}
|
||||
|
||||
// Free stale entries evicted by TTL or previous attach
|
||||
auto pending = ANSGpuFrameRegistry::instance().drain_pending();
|
||||
for (void* p : pending) {
|
||||
AVFrame* stale = static_cast<AVFrame*>(p);
|
||||
av_frame_free(&stale);
|
||||
}
|
||||
// NOTE: No drain_pending() here (hot path). Freed by evict_stale.
|
||||
}
|
||||
|
||||
// Attach CUDA HW frame — copies NV12 from NVDEC surfaces to owned GPU memory.
|
||||
@@ -226,13 +225,10 @@ inline void gpu_frame_attach_cuda(cv::Mat* mat, AVFrame* cudaFrame, int gpuIdx,
|
||||
|
||||
if (slot && slot->bufY && slot->bufUV && slot->pitchY > 0 && slot->pitchUV > 0) {
|
||||
// --- Global pool path: D2D copy on per-slot non-blocking stream ---
|
||||
// CRITICAL: Using the NULL stream (cudaMemcpy2D without stream) causes
|
||||
// 1-2 second stalls on WDDM because it implicitly synchronizes with
|
||||
// ALL other streams before executing. By using cudaMemcpy2DAsync on
|
||||
// the slot's own non-blocking stream + cudaStreamSynchronize, we:
|
||||
// 1. Submit the copy immediately (no wait for inference kernels)
|
||||
// 2. Wait ONLY for this copy to finish (~0.3ms 1080p, ~1.2ms 4K)
|
||||
// 3. Data is valid after sync — av_frame_free is safe
|
||||
// cudaMemcpy2DAsync + cudaStreamSynchronize(slotStream):
|
||||
// - Non-blocking stream avoids NULL-stream implicit sync with inference
|
||||
// - Sync waits ONLY for the 2 copies (~1.5ms for 4K, ~0.3ms for 1080p)
|
||||
// - Data valid after sync — av_frame_free is safe
|
||||
int prevDev = -1;
|
||||
cudaGetDevice(&prevDev);
|
||||
if (gpuIdx >= 0) cudaSetDevice(gpuIdx);
|
||||
@@ -247,13 +243,13 @@ inline void gpu_frame_attach_cuda(cv::Mat* mat, AVFrame* cudaFrame, int gpuIdx,
|
||||
e4 = cudaMemcpy2DAsync(slot->bufUV, slot->pitchUV,
|
||||
cudaFrame->data[1], cudaFrame->linesize[1],
|
||||
w, h / 2, cudaMemcpyDeviceToDevice, copyStream);
|
||||
if (e3 == cudaSuccess && e4 == cudaSuccess) {
|
||||
// Wait ONLY for this stream's 2 copies (~0.3-1.2ms).
|
||||
// Does NOT wait for inference kernels on other streams.
|
||||
cudaStreamSynchronize(copyStream);
|
||||
}
|
||||
// NO cudaStreamSynchronize here — let the copy run asynchronously.
|
||||
// The camera thread is NOT blocked by the WDDM SRW lock.
|
||||
// Inference will call cudaStreamSynchronize(d2dCopyStream) in tryNV12()
|
||||
// before reading the buffer. By that time (~50-200ms later), the copy
|
||||
// (~0.3ms for 1080p, ~1.5ms for 4K) has long completed, so the sync
|
||||
// returns immediately with zero blocking.
|
||||
} else {
|
||||
// Fallback if stream creation failed — NULL stream (may stall)
|
||||
e3 = cudaMemcpy2D(slot->bufY, slot->pitchY,
|
||||
cudaFrame->data[0], cudaFrame->linesize[0],
|
||||
w, h, cudaMemcpyDeviceToDevice);
|
||||
@@ -270,15 +266,14 @@ inline void gpu_frame_attach_cuda(cv::Mat* mat, AVFrame* cudaFrame, int gpuIdx,
|
||||
data.uvPlane = static_cast<uint8_t*>(slot->bufUV);
|
||||
data.yLinesize = static_cast<int>(slot->pitchY);
|
||||
data.uvLinesize = static_cast<int>(slot->pitchUV);
|
||||
data.poolSlot = slot; // Track for deferred release
|
||||
// gpuCacheY/UV stay nullptr — global pool owns the buffers
|
||||
data.poolSlot = slot;
|
||||
data.d2dCopyStream = copyStream; // Inference syncs on this before reading
|
||||
d2dOk = true;
|
||||
GPU_FRAME_DBG("attach_cuda: D2D OK (global pool) Y=%p UV=%p yPitch=%zu uvPitch=%zu",
|
||||
slot->bufY, slot->bufUV, slot->pitchY, slot->pitchUV);
|
||||
GPU_FRAME_DBG("attach_cuda: D2D OK (global pool, async) Y=%p UV=%p yPitch=%zu uvPitch=%zu stream=%p",
|
||||
slot->bufY, slot->bufUV, slot->pitchY, slot->pitchUV, copyStream);
|
||||
} else {
|
||||
GPU_FRAME_DBG("attach_cuda: D2D COPY FAILED (pool) e3=%d e4=%d — fallback",
|
||||
(int)e3, (int)e4);
|
||||
// Release slot back to pool on failure (immediate, no cooldown needed)
|
||||
slot->state.store(GpuNV12Slot::STATE_FREE, std::memory_order_release);
|
||||
}
|
||||
}
|
||||
@@ -364,13 +359,34 @@ inline void gpu_frame_attach_cuda(cv::Mat* mat, AVFrame* cudaFrame, int gpuIdx,
|
||||
data.uvLinesize = data.cpuUvLinesize;
|
||||
}
|
||||
|
||||
// Free AVFrames immediately — synchronous D2D copy has completed,
|
||||
// so NVDEC surfaces can be returned to the decoder's surface pool.
|
||||
GPU_FRAME_DBG("attach_cuda: freeing AVFrames cudaFrame=%p cpuNV12=%p",
|
||||
(void*)cudaFrame, (void*)cpuNV12);
|
||||
av_frame_free(&cudaFrame);
|
||||
if (cpuNV12) av_frame_free(&cpuNV12);
|
||||
data.avframe = nullptr;
|
||||
// AVFrame lifetime management:
|
||||
// - If D2D was ASYNC (d2dCopyStream != null): keep cudaFrame alive in
|
||||
// GpuFrameData.avframe so the NVDEC surface (copy source) remains valid
|
||||
// until the async copy completes. The AVFrame is freed when GpuFrameData
|
||||
// is released (after inference), by which time the 0.3ms copy is long done.
|
||||
// - If D2D was SYNC or failed: push to pending free immediately (old behavior).
|
||||
if (data.d2dCopyStream && cudaFrame) {
|
||||
// Async D2D — keep AVFrame alive, inference will outlive the copy
|
||||
data.avframe = cudaFrame;
|
||||
GPU_FRAME_DBG("attach_cuda: keeping AVFrame alive for async D2D cudaFrame=%p",
|
||||
(void*)cudaFrame);
|
||||
} else {
|
||||
// Sync D2D or fallback — safe to defer free now
|
||||
GPU_FRAME_DBG("attach_cuda: deferring AVFrame free cudaFrame=%p",
|
||||
(void*)cudaFrame);
|
||||
if (cudaFrame) {
|
||||
auto& reg = ANSGpuFrameRegistry::instance();
|
||||
auto lk = reg.acquire_lock();
|
||||
reg.pushPendingFree_locked(cudaFrame);
|
||||
}
|
||||
data.avframe = nullptr;
|
||||
}
|
||||
// cpuNV12 is always safe to defer — CPU snapshot (if taken) is already copied
|
||||
if (cpuNV12) {
|
||||
auto& reg = ANSGpuFrameRegistry::instance();
|
||||
auto lk = reg.acquire_lock();
|
||||
reg.pushPendingFree_locked(cpuNV12);
|
||||
}
|
||||
data.cpuAvframe = nullptr;
|
||||
|
||||
GPU_FRAME_DBG("attach_cuda: FINAL yPlane=%p uvPlane=%p isCuda=%d poolSlot=%p",
|
||||
@@ -379,16 +395,16 @@ inline void gpu_frame_attach_cuda(cv::Mat* mat, AVFrame* cudaFrame, int gpuIdx,
|
||||
|
||||
void* old = ANSGpuFrameRegistry::instance().attach(mat, std::move(data));
|
||||
if (old) {
|
||||
AVFrame* oldFrame = static_cast<AVFrame*>(old);
|
||||
av_frame_free(&oldFrame);
|
||||
// Old frame's AVFrame returned — defer its free too
|
||||
auto& reg = ANSGpuFrameRegistry::instance();
|
||||
auto lk = reg.acquire_lock();
|
||||
reg.pushPendingFree_locked(old);
|
||||
}
|
||||
|
||||
// Free stale AVFrames evicted by TTL or previous attach
|
||||
auto pending = ANSGpuFrameRegistry::instance().drain_pending();
|
||||
for (void* p : pending) {
|
||||
AVFrame* stale = static_cast<AVFrame*>(p);
|
||||
av_frame_free(&stale);
|
||||
}
|
||||
// NOTE: No drain_pending() here (hot path). AVFrames accumulate in
|
||||
// m_pendingFree and are freed by gpu_frame_evict_stale() which runs
|
||||
// every 500ms from anscv_mat_replace. This removes av_frame_free
|
||||
// (5-20ms SRW lock per call) from the camera frame-grabbing path.
|
||||
}
|
||||
|
||||
// Release entry by cv::Mat* and free any returned AVFrames.
|
||||
@@ -400,14 +416,7 @@ inline void gpu_frame_remove(cv::Mat* mat) {
|
||||
GPU_FRAME_DBG("gpu_frame_remove: mat=%p", (void*)mat);
|
||||
ANSGpuFrameRegistry::instance().release(mat);
|
||||
|
||||
// Free any AVFrames that became pending from this release or prior eviction
|
||||
auto pending = ANSGpuFrameRegistry::instance().drain_pending();
|
||||
for (void* p : pending) {
|
||||
AVFrame* stale = static_cast<AVFrame*>(p);
|
||||
av_frame_free(&stale);
|
||||
}
|
||||
|
||||
// GPU device pointers deferred — see gpu_frame_evict_stale() / Destroy()
|
||||
// NOTE: No drain_pending() here (hot path). AVFrames freed by evict_stale.
|
||||
}
|
||||
|
||||
// Alias for remove — used in ANSCV mutating functions to drop stale GPU data.
|
||||
@@ -425,10 +434,39 @@ inline void gpu_frame_invalidate(cv::Mat* mat) {
|
||||
inline void gpu_frame_evict_stale() {
|
||||
ANSGpuFrameRegistry::instance().evictStaleFrames();
|
||||
|
||||
auto pending = ANSGpuFrameRegistry::instance().drain_pending();
|
||||
for (void* p : pending) {
|
||||
AVFrame* stale = static_cast<AVFrame*>(p);
|
||||
av_frame_free(&stale);
|
||||
// Drain and free AVFrames on a background thread to avoid blocking the
|
||||
// camera hot path. av_frame_free on CUDA-mapped frames can take 5-20ms
|
||||
// per call due to nvcuda64 SRW lock. The background thread frees them
|
||||
// periodically (every 50ms) in batches.
|
||||
{
|
||||
static std::once_flag s_initOnce;
|
||||
static std::mutex s_avFreeMutex;
|
||||
static std::vector<void*> s_avFreeQueue;
|
||||
|
||||
// Move pending AVFrames to the background queue
|
||||
auto pending = ANSGpuFrameRegistry::instance().drain_pending();
|
||||
if (!pending.empty()) {
|
||||
std::lock_guard<std::mutex> lock(s_avFreeMutex);
|
||||
s_avFreeQueue.insert(s_avFreeQueue.end(), pending.begin(), pending.end());
|
||||
}
|
||||
|
||||
// Start background free thread on first call
|
||||
std::call_once(s_initOnce, []() {
|
||||
std::thread([]() {
|
||||
while (true) {
|
||||
std::vector<void*> batch;
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(s_avFreeMutex);
|
||||
batch.swap(s_avFreeQueue);
|
||||
}
|
||||
for (void* p : batch) {
|
||||
AVFrame* f = static_cast<AVFrame*>(p);
|
||||
av_frame_free(&f);
|
||||
}
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(50));
|
||||
}
|
||||
}).detach();
|
||||
});
|
||||
}
|
||||
|
||||
// Free GPU device pointers from evicted/released frames (legacy path).
|
||||
|
||||
Reference in New Issue
Block a user