#pragma once // ANSGpuFrameOps.h — FFmpeg-aware convenience functions for ANSGpuFrameRegistry. // // This header requires FFmpeg headers (libavutil/frame.h) and provides // typed attach/invalidate/remove operations that handle av_frame_clone/free. // // NEW DESIGN: Instead of storing AVFrame* references (which lock NVDEC surfaces), // we snapshot the CPU NV12 planes into malloc'd buffers and release the AVFrames // immediately. This prevents decoder surface pool exhaustion when many clones // hold references to the same frame. // // Include this in ANSCV/ANSRTSP (which link FFmpeg). For projects without // FFmpeg (ANSODEngine), include ANSGpuFrameRegistry.h directly and use // gpu_frame_lookup() + the GpuFrameData plane pointers. #include "ANSGpuFrameRegistry.h" #include "GpuNV12SlotPool.h" extern "C" { #include "libavutil/frame.h" } #include #include #include #include #include #include #ifdef _WIN32 #include #endif // Debug logging macro for GPU frame operations. // Define ANSCORE_GPU_DEBUG=1 to enable verbose per-frame GPU logging. #ifndef GPU_FRAME_DBG #if defined(ANSCORE_GPU_DEBUG) && ANSCORE_GPU_DEBUG #ifdef _WIN32 #define GPU_FRAME_DBG(fmt, ...) do { \ char _gpu_dbg_buf[512]; \ snprintf(_gpu_dbg_buf, sizeof(_gpu_dbg_buf), "[GpuFrameOps] " fmt "\n", ##__VA_ARGS__); \ OutputDebugStringA(_gpu_dbg_buf); \ fprintf(stderr, "%s", _gpu_dbg_buf); \ } while(0) #else #define GPU_FRAME_DBG(fmt, ...) \ fprintf(stderr, "[GpuFrameOps] " fmt "\n", ##__VA_ARGS__) #endif #else #define GPU_FRAME_DBG(fmt, ...) ((void)0) #endif #endif namespace anscv_gpu_ops { namespace detail { // Snapshot NV12 Y and UV planes from an AVFrame into malloc'd buffers. // Returns true on success. Caller owns the output buffers. inline bool snapshotNV12Planes(const AVFrame* nv12, uint8_t*& outY, int& outYLinesize, uint8_t*& outUV, int& outUVLinesize, int& outWidth, int& outHeight) { if (!nv12 || !nv12->data[0] || !nv12->data[1]) return false; outWidth = nv12->width; outHeight = nv12->height; outYLinesize = nv12->width; // Packed (no alignment padding) outUVLinesize = nv12->width; // UV interleaved: width bytes per row size_t yBytes = static_cast(outYLinesize) * outHeight; size_t uvBytes = static_cast(outUVLinesize) * (outHeight / 2); outY = static_cast(std::malloc(yBytes)); outUV = static_cast(std::malloc(uvBytes)); if (!outY || !outUV) { std::free(outY); std::free(outUV); outY = nullptr; outUV = nullptr; return false; } // Copy line-by-line (source may have padding via linesize > width) const int srcYLinesize = nv12->linesize[0]; const int srcUVLinesize = nv12->linesize[1]; for (int row = 0; row < outHeight; ++row) { std::memcpy(outY + row * outYLinesize, nv12->data[0] + row * srcYLinesize, outWidth); } for (int row = 0; row < outHeight / 2; ++row) { std::memcpy(outUV + row * outUVLinesize, nv12->data[1] + row * srcUVLinesize, outWidth); } return true; } // Drain pending GPU device pointers and cudaFree them. // Uses time-based safety: only frees entries queued >100ms ago, guaranteeing // all CUDA kernels reading from them have completed (kernels take <10ms). // NO cudaDeviceSynchronize — zero blocking of GPU pipeline. // // If forceAll=true, drains ALL entries with cudaDeviceSynchronize first // (used only by Destroy/Reconnect for final cleanup). inline void drainAndFreeGpuPending(bool forceAll = false) { static constexpr int SAFE_AGE_MS = 100; // 100ms >> 10ms kernel duration auto gpuPending = ANSGpuFrameRegistry::instance().drain_gpu_pending( forceAll ? 0 : SAFE_AGE_MS); if (gpuPending.empty()) return; GPU_FRAME_DBG("drainGpuPending: freeing %zu GPU ptrs (force=%d)", gpuPending.size(), (int)forceAll); int prevDev = -1; cudaGetDevice(&prevDev); if (forceAll) { // Final cleanup — sync all devices first cudaDeviceSynchronize(); } for (auto& entry : gpuPending) { if (entry.ptr) { if (entry.deviceIdx >= 0) cudaSetDevice(entry.deviceIdx); cudaError_t err = cudaFree(entry.ptr); if (err != cudaSuccess) { GPU_FRAME_DBG("drainGpuPending: cudaFree FAILED err=%d (%s)", (int)err, cudaGetErrorString(err)); } } } if (prevDev >= 0) cudaSetDevice(prevDev); } } // namespace detail } // namespace anscv_gpu_ops // Attach NV12/YUV frame keyed by cv::Mat* pointer. // Snapshots CPU NV12 planes into owned malloc'd buffers, then releases the AVFrame. // TAKES OWNERSHIP of nv12 — caller must NOT av_frame_free after this call. inline void gpu_frame_attach(cv::Mat* mat, AVFrame* nv12, int gpuIdx, int64_t pts) { if (!mat || !nv12) return; GpuFrameData data{}; data.gpuIndex = gpuIdx; data.pts = pts; data.pixelFormat = nv12->format; data.width = nv12->width; data.height = nv12->height; // Snapshot NV12 planes to owned buffers bool ok = anscv_gpu_ops::detail::snapshotNV12Planes( nv12, data.cpuYPlane, data.cpuYLinesize, data.cpuUvPlane, data.cpuUvLinesize, data.width, data.height); // Keep legacy pointers for backward compat during transition data.yPlane = data.cpuYPlane; data.uvPlane = data.cpuUvPlane; data.yLinesize = data.cpuYLinesize; data.uvLinesize = data.cpuUvLinesize; // Store AVFrame for legacy cleanup (will be freed by drain_pending) data.avframe = nv12; void* old = ANSGpuFrameRegistry::instance().attach(mat, std::move(data)); if (old) { // Defer old frame's AVFrame free auto& reg = ANSGpuFrameRegistry::instance(); auto lk = reg.acquire_lock(); reg.pushPendingFree_locked(old); } // NOTE: No drain_pending() here (hot path). Freed by evict_stale. } // Attach CUDA HW frame — copies NV12 from NVDEC surfaces to owned GPU memory. // TAKES OWNERSHIP of cudaFrame AND cpuNV12 — caller must NOT av_frame_free after. // // D2D copy: SYNCHRONOUS cudaMemcpy2D from NVDEC surfaces into a GpuNV12Slot // buffer from the global pool. Data is valid immediately after the call returns. // AVFrame is freed immediately (NVDEC surface returned to decoder pool). // // The slot is protected by a 200ms cooldown after the GpuFrameData's refcount // drops to 0, guaranteeing that all in-flight GPU kernels (which complete in // <10ms) have finished reading from the buffer before it can be reused. // // slot: pre-acquired from GpuNV12SlotPool::instance().acquire(). // If non-null, D2D copy goes into slot buffers (no per-frame alloc). // If nullptr, falls back to per-frame cudaMallocPitch (legacy path). // // Fallback: cpuYPlane/cpuUvPlane hold CPU-side NV12 snapshot for cross-GPU // inference (when decode GPU != inference GPU). inline void gpu_frame_attach_cuda(cv::Mat* mat, AVFrame* cudaFrame, int gpuIdx, int64_t pts, AVFrame* cpuNV12 = nullptr, GpuNV12Slot* slot = nullptr) { if (!mat || !cudaFrame) { GPU_FRAME_DBG("attach_cuda: SKIP mat=%p cudaFrame=%p", (void*)mat, (void*)cudaFrame); return; } const int w = cudaFrame->width; const int h = cudaFrame->height; GPU_FRAME_DBG("attach_cuda: START mat=%p %dx%d gpu=%d nvdecY=%p nvdecUV=%p slot=%p", (void*)mat, w, h, gpuIdx, (void*)cudaFrame->data[0], (void*)cudaFrame->data[1], (void*)slot); GpuFrameData data{}; data.gpuIndex = gpuIdx; data.pts = pts; data.width = w; data.height = h; data.pixelFormat = 23; // AV_PIX_FMT_NV12 // NOTE: CPU NV12 snapshot is DEFERRED — only taken if pool D2D fails. // For 4K frames, the snapshot is ~12MB malloc+memcpy+free per frame. // Skipping it when the pool path succeeds (the common case) eliminates // ~276MB/s of CPU heap allocation churn that causes process-level stalls. // --- D2D copy: NVDEC surface → GPU buffer --- bool d2dOk = false; if (slot && slot->bufY && slot->bufUV && slot->pitchY > 0 && slot->pitchUV > 0) { // --- Global pool path: D2D copy on per-slot non-blocking stream --- // cudaMemcpy2DAsync + cudaStreamSynchronize(slotStream): // - Non-blocking stream avoids NULL-stream implicit sync with inference // - Sync waits ONLY for the 2 copies (~1.5ms for 4K, ~0.3ms for 1080p) // - Data valid after sync — av_frame_free is safe int prevDev = -1; cudaGetDevice(&prevDev); if (gpuIdx >= 0) cudaSetDevice(gpuIdx); cudaStream_t copyStream = static_cast(slot->copyStream); cudaError_t e3, e4; if (copyStream) { e3 = cudaMemcpy2DAsync(slot->bufY, slot->pitchY, cudaFrame->data[0], cudaFrame->linesize[0], w, h, cudaMemcpyDeviceToDevice, copyStream); e4 = cudaMemcpy2DAsync(slot->bufUV, slot->pitchUV, cudaFrame->data[1], cudaFrame->linesize[1], w, h / 2, cudaMemcpyDeviceToDevice, copyStream); // NO cudaStreamSynchronize here — let the copy run asynchronously. // The camera thread is NOT blocked by the WDDM SRW lock. // Inference will call cudaStreamSynchronize(d2dCopyStream) in tryNV12() // before reading the buffer. By that time (~50-200ms later), the copy // (~0.3ms for 1080p, ~1.5ms for 4K) has long completed, so the sync // returns immediately with zero blocking. } else { e3 = cudaMemcpy2D(slot->bufY, slot->pitchY, cudaFrame->data[0], cudaFrame->linesize[0], w, h, cudaMemcpyDeviceToDevice); e4 = cudaMemcpy2D(slot->bufUV, slot->pitchUV, cudaFrame->data[1], cudaFrame->linesize[1], w, h / 2, cudaMemcpyDeviceToDevice); } if (prevDev >= 0) cudaSetDevice(prevDev); if (e3 == cudaSuccess && e4 == cudaSuccess) { data.isCudaDevicePtr = true; data.yPlane = static_cast(slot->bufY); data.uvPlane = static_cast(slot->bufUV); data.yLinesize = static_cast(slot->pitchY); data.uvLinesize = static_cast(slot->pitchUV); data.poolSlot = slot; data.d2dCopyStream = copyStream; // Inference syncs on this before reading d2dOk = true; GPU_FRAME_DBG("attach_cuda: D2D OK (global pool, async) Y=%p UV=%p yPitch=%zu uvPitch=%zu stream=%p", slot->bufY, slot->bufUV, slot->pitchY, slot->pitchUV, copyStream); } else { GPU_FRAME_DBG("attach_cuda: D2D COPY FAILED (pool) e3=%d e4=%d — fallback", (int)e3, (int)e4); slot->state.store(GpuNV12Slot::STATE_FREE, std::memory_order_release); } } if (!d2dOk && !slot) { // --- Legacy path: per-frame cudaMallocPitch (for modules without pool) --- const size_t yBytes = static_cast(w) * h; const size_t uvBytes = static_cast(w) * (h / 2); const size_t totalBytes = yBytes + uvBytes; if (ANSGpuFrameRegistry::instance().canAllocateGpuCache(totalBytes)) { int prevDev = -1; cudaGetDevice(&prevDev); if (gpuIdx >= 0) cudaSetDevice(gpuIdx); void* ownedY = nullptr; void* ownedUV = nullptr; size_t yPitch = 0; size_t uvPitch = 0; cudaError_t e1 = cudaMallocPitch(&ownedY, &yPitch, w, h); cudaError_t e2 = cudaMallocPitch(&ownedUV, &uvPitch, w, h / 2); if (e1 == cudaSuccess && e2 == cudaSuccess) { cudaError_t e3 = cudaMemcpy2D(ownedY, yPitch, cudaFrame->data[0], cudaFrame->linesize[0], w, h, cudaMemcpyDeviceToDevice); cudaError_t e4 = cudaMemcpy2D(ownedUV, uvPitch, cudaFrame->data[1], cudaFrame->linesize[1], w, h / 2, cudaMemcpyDeviceToDevice); if (e3 == cudaSuccess && e4 == cudaSuccess) { data.isCudaDevicePtr = true; data.yPlane = static_cast(ownedY); data.uvPlane = static_cast(ownedUV); data.yLinesize = static_cast(yPitch); data.uvLinesize = static_cast(uvPitch); data.gpuCacheY = ownedY; data.gpuCacheUV = ownedUV; data.gpuCacheYPitch = yPitch; data.gpuCacheUVPitch = uvPitch; data.gpuCacheDeviceIdx = gpuIdx; data.gpuCacheValid = true; data.gpuCacheBytes = yPitch * h + uvPitch * (h / 2); ANSGpuFrameRegistry::instance().onGpuCacheCreated(data.gpuCacheBytes); d2dOk = true; GPU_FRAME_DBG("attach_cuda: D2D OK ownedY=%p ownedUV=%p yPitch=%zu uvPitch=%zu bytes=%zu", ownedY, ownedUV, yPitch, uvPitch, data.gpuCacheBytes); } else { GPU_FRAME_DBG("attach_cuda: D2D COPY FAILED e3=%d e4=%d — fallback CPU", (int)e3, (int)e4); cudaFree(ownedY); cudaFree(ownedUV); } } else { GPU_FRAME_DBG("attach_cuda: cudaMallocPitch FAILED e1=%d e2=%d — fallback CPU", (int)e1, (int)e2); if (e1 == cudaSuccess) cudaFree(ownedY); if (e2 == cudaSuccess) cudaFree(ownedUV); } if (prevDev >= 0) cudaSetDevice(prevDev); } } if (!d2dOk) { // D2D failed or no slot — take CPU NV12 snapshot now (before freeing cpuNV12). // This is the ONLY path where the CPU snapshot is needed. Skipping it // on the pool-success path avoids ~12MB malloc+memcpy+free per 4K frame. if (cpuNV12) { anscv_gpu_ops::detail::snapshotNV12Planes( cpuNV12, data.cpuYPlane, data.cpuYLinesize, data.cpuUvPlane, data.cpuUvLinesize, data.width, data.height); } GPU_FRAME_DBG("attach_cuda: FALLBACK CPU-only cpuY=%p cpuUV=%p", (void*)data.cpuYPlane, (void*)data.cpuUvPlane); data.isCudaDevicePtr = false; data.yPlane = data.cpuYPlane; data.uvPlane = data.cpuUvPlane; data.yLinesize = data.cpuYLinesize; data.uvLinesize = data.cpuUvLinesize; } // AVFrame lifetime management: // - If D2D was ASYNC (d2dCopyStream != null): keep cudaFrame alive in // GpuFrameData.avframe so the NVDEC surface (copy source) remains valid // until the async copy completes. The AVFrame is freed when GpuFrameData // is released (after inference), by which time the 0.3ms copy is long done. // - If D2D was SYNC or failed: push to pending free immediately (old behavior). if (data.d2dCopyStream && cudaFrame) { // Async D2D — keep AVFrame alive, inference will outlive the copy data.avframe = cudaFrame; GPU_FRAME_DBG("attach_cuda: keeping AVFrame alive for async D2D cudaFrame=%p", (void*)cudaFrame); } else { // Sync D2D or fallback — safe to defer free now GPU_FRAME_DBG("attach_cuda: deferring AVFrame free cudaFrame=%p", (void*)cudaFrame); if (cudaFrame) { auto& reg = ANSGpuFrameRegistry::instance(); auto lk = reg.acquire_lock(); reg.pushPendingFree_locked(cudaFrame); } data.avframe = nullptr; } // cpuNV12 is always safe to defer — CPU snapshot (if taken) is already copied if (cpuNV12) { auto& reg = ANSGpuFrameRegistry::instance(); auto lk = reg.acquire_lock(); reg.pushPendingFree_locked(cpuNV12); } data.cpuAvframe = nullptr; GPU_FRAME_DBG("attach_cuda: FINAL yPlane=%p uvPlane=%p isCuda=%d poolSlot=%p", (void*)data.yPlane, (void*)data.uvPlane, (int)data.isCudaDevicePtr, (void*)data.poolSlot); void* old = ANSGpuFrameRegistry::instance().attach(mat, std::move(data)); if (old) { // Old frame's AVFrame returned — defer its free too auto& reg = ANSGpuFrameRegistry::instance(); auto lk = reg.acquire_lock(); reg.pushPendingFree_locked(old); } // NOTE: No drain_pending() here (hot path). AVFrames accumulate in // m_pendingFree and are freed by gpu_frame_evict_stale() which runs // every 500ms from anscv_mat_replace. This removes av_frame_free // (5-20ms SRW lock per call) from the camera frame-grabbing path. } // Release entry by cv::Mat* and free any returned AVFrames. // GPU device pointers are deferred to TTL eviction or explicit cleanup. // Safe if not in map (no-op). inline void gpu_frame_remove(cv::Mat* mat) { if (!mat) return; GPU_FRAME_DBG("gpu_frame_remove: mat=%p", (void*)mat); ANSGpuFrameRegistry::instance().release(mat); // NOTE: No drain_pending() here (hot path). AVFrames freed by evict_stale. } // Alias for remove — used in ANSCV mutating functions to drop stale GPU data. inline void gpu_frame_invalidate(cv::Mat* mat) { gpu_frame_remove(mat); } // Run TTL eviction + drain pending. Call periodically from camera threads. // TTL eviction is throttled to every 500ms (EVICT_CHECK_INTERVAL_MS). // GPU buffer cleanup is safe here because: // 1. Only frames >3 seconds old are evicted (kernels take <10ms) // 2. cudaDeviceSynchronize() ensures all in-flight kernels are done // 3. At 500ms interval, one sync per 500ms is ~0.1ms cost (acceptable) // vs per-frame sync which caused 900ms spikes inline void gpu_frame_evict_stale() { ANSGpuFrameRegistry::instance().evictStaleFrames(); // Drain and free AVFrames on a background thread to avoid blocking the // camera hot path. av_frame_free on CUDA-mapped frames can take 5-20ms // per call due to nvcuda64 SRW lock. The background thread frees them // periodically (every 50ms) in batches. { static std::once_flag s_initOnce; static std::mutex s_avFreeMutex; static std::vector s_avFreeQueue; // Move pending AVFrames to the background queue auto pending = ANSGpuFrameRegistry::instance().drain_pending(); if (!pending.empty()) { std::lock_guard lock(s_avFreeMutex); s_avFreeQueue.insert(s_avFreeQueue.end(), pending.begin(), pending.end()); } // Start background free thread on first call std::call_once(s_initOnce, []() { std::thread([]() { while (true) { std::vector batch; { std::lock_guard lock(s_avFreeMutex); batch.swap(s_avFreeQueue); } for (void* p : batch) { AVFrame* f = static_cast(p); av_frame_free(&f); } std::this_thread::sleep_for(std::chrono::milliseconds(50)); } }).detach(); }); } // Free GPU device pointers from evicted/released frames (legacy path). // Pool-backed frames (ANSRTSP) don't add to this list (gpuCacheY=nullptr). anscv_gpu_ops::detail::drainAndFreeGpuPending(); }