From 3a21026790dbc121bf1bf889ad4c1d7e62d24974 Mon Sep 17 00:00:00 2001 From: Tuan Nghia Nguyen Date: Sat, 4 Apr 2026 10:09:47 +1100 Subject: [PATCH] Disable NV12 path for ANSCV by default. Currently use cv::Mat** directly --- .claude/settings.local.json | 12 +- MediaClient/media/video_player.cpp | 74 ++++----- MediaClient/media/video_player.h | 7 + NV12_GLOBAL_POOL_FIX_V2.md | 133 +++++++++++++++ include/ANSGpuFrameRegistry.h | 15 ++ include/GpuNV12SlotPool.h | 4 +- modules/ANSCV/ANSFLV.cpp | 43 +++-- modules/ANSCV/ANSFLV.h | 6 + modules/ANSCV/ANSGpuFrameOps.h | 144 +++++++++++------ modules/ANSCV/ANSMJPEG.cpp | 46 ++++-- modules/ANSCV/ANSMJPEG.h | 6 + modules/ANSCV/ANSRTMP.cpp | 46 ++++-- modules/ANSCV/ANSRTMP.h | 6 + modules/ANSCV/ANSRTSP.cpp | 161 ++++++++++--------- modules/ANSCV/ANSRTSP.h | 6 + modules/ANSCV/ANSSRT.cpp | 46 ++++-- modules/ANSCV/ANSSRT.h | 6 + modules/ANSCV/GpuNV12SlotPool.cpp | 18 +-- modules/ANSODEngine/NV12PreprocessHelper.cpp | 28 +++- 19 files changed, 575 insertions(+), 232 deletions(-) create mode 100644 NV12_GLOBAL_POOL_FIX_V2.md diff --git a/.claude/settings.local.json b/.claude/settings.local.json index 13b006c..9ec122e 100644 --- a/.claude/settings.local.json +++ b/.claude/settings.local.json @@ -41,7 +41,17 @@ "mcp__desktop-commander__get_file_info", "mcp__desktop-commander__interact_with_process", "Bash(sort -t= -k2 -rn)", - "Bash(sort -t= -k3 -rn)" + "Bash(sort -t= -k3 -rn)", + "Bash(powershell -Command \"Get-Content ''C:\\\\Users\\\\nghia\\\\Downloads\\\\logdebug.txt'' | Select-Object -Last 30\")", + "Bash(powershell -Command \"\\(Select-String -Path ''C:\\\\Users\\\\nghia\\\\Downloads\\\\logdebug.txt'' -Pattern ''POOL FULL''\\).Count\")", + "Bash(powershell -Command \"\\(Select-String -Path ''C:\\\\Users\\\\nghia\\\\Downloads\\\\logdebug.txt'' -Pattern ''Cam\\(\\\\d+\\)'' -AllMatches | ForEach-Object { $_Matches } | ForEach-Object { $_Groups[1].Value } | Sort-Object -Unique\\)\")", + "Bash(powershell -Command \"Select-String 
-Path ''C:\\\\Users\\\\nghia\\\\Downloads\\\\logdebug.txt'' -Pattern ''Cam\\(\\\\d+\\)'' -AllMatches | ForEach-Object { $_Matches[0].Groups[1].Value } | Sort-Object | Get-Unique\")", + "Bash(powershell -Command \"$lines = Get-Content ''C:\\\\Users\\\\nghia\\\\Downloads\\\\logdebug.txt''; $first = \\($lines | Select-String ''07:1'' | Select-Object -First 1\\).Line; $last = \\($lines | Select-String ''07:1'' | Select-Object -Last 1\\).Line; Write-Host ''First: '' $first; Write-Host ''Last: '' $last; Write-Host ''Total lines: '' $lines.Count\")", + "Bash(powershell -Command \"$c = \\(Get-Content ''C:\\\\Users\\\\nghia\\\\Downloads\\\\logdebug.txt''\\).Count; Write-Host ''Total lines:'' $c\")", + "Bash(powershell -Command \"\\(Get-Content ''C:\\\\Users\\\\nghia\\\\Downloads\\\\logdebug1.txt''\\).Count\")", + "Bash(powershell -Command \"\\(Get-Content ''C:\\\\Users\\\\nghia\\\\Downloads\\\\ANSLEGION20.log''\\).Count\")", + "Bash(powershell -Command \"\\(Get-Content ''C:\\\\Users\\\\nghia\\\\Downloads\\\\ANSLEGION21.log''\\).Count\")", + "Bash(powershell -Command \"Select-String ''NEW slot'' ''C:\\\\Users\\\\nghia\\\\Downloads\\\\ANSLEGION22.log'' | ForEach-Object { if \\($_-match ''\\(\\\\d+x\\\\d+\\)''\\) { $matches[1] } } | Group-Object | Sort-Object Count -Descending | Format-Table Name, Count\")" ] } } diff --git a/MediaClient/media/video_player.cpp b/MediaClient/media/video_player.cpp index 09d5019..071a30a 100644 --- a/MediaClient/media/video_player.cpp +++ b/MediaClient/media/video_player.cpp @@ -1250,50 +1250,25 @@ cv::Mat CVideoPlayer::avframeNV12ToCvMat(const AVFrame* frame) m_nv12OrigWidth = width; m_nv12OrigHeight = height; - // Display optimization: resize NV12 planes to max 1080p before color conversion. 
- // For 4K (3840x2160), this reduces pixel count by 4x: - // - 4K NV12→BGR: ~13-76ms on slow CPU (Xeon 2GHz), ~2ms on fast CPU - // - 1080p NV12→BGR: ~3-5ms on slow CPU, ~0.5ms on fast CPU - // The full-res NV12 is preserved separately for inference (m_currentNV12Frame). - const int MAX_DISPLAY_HEIGHT = 1080; - bool needsResize = (height > MAX_DISPLAY_HEIGHT); + // Return full-resolution BGR image. + // No forced downscale — LabVIEW manages display resolution via SetDisplayResolution(). + // If the caller needs a specific display size, SetDisplayResolution(w, h) applies + // resizing in GetImage() at the ANSRTSP/ANS*Client level after this returns. + + // Store original NV12 dimensions for inference coordinate mapping + m_nv12OrigWidth = width; + m_nv12OrigHeight = height; cv::Mat yPlane(height, width, CV_8UC1, frame->data[0], frame->linesize[0]); cv::Mat uvPlane(height / 2, width / 2, CV_8UC2, frame->data[1], frame->linesize[1]); - if (needsResize) { - // Scale to fit within 1080p, maintaining aspect ratio - double scale = (double)MAX_DISPLAY_HEIGHT / height; - int dstW = (int)(width * scale) & ~1; // even width for NV12 - int dstH = (int)(height * scale) & ~1; // even height for NV12 + cv::Mat bgrImage; + cv::cvtColorTwoPlane(yPlane, uvPlane, bgrImage, cv::COLOR_YUV2BGR_NV12); - cv::Mat yResized, uvResized; - cv::resize(yPlane, yResized, cv::Size(dstW, dstH), 0, 0, cv::INTER_LINEAR); - cv::resize(uvPlane, uvResized, cv::Size(dstW / 2, dstH / 2), 0, 0, cv::INTER_LINEAR); - - cv::Mat bgrImage; - cv::cvtColorTwoPlane(yResized, uvResized, bgrImage, cv::COLOR_YUV2BGR_NV12); - - if (m_nImageQuality == 1) { - bgrImage.convertTo(bgrImage, -1, 255.0 / 219.0, -16.0 * 255.0 / 219.0); - } - return bgrImage; - } - - // No resize needed (already <= 1080p) - if (m_nImageQuality == 0) { - cv::Mat bgrImage; - cv::cvtColorTwoPlane(yPlane, uvPlane, bgrImage, cv::COLOR_YUV2BGR_NV12); - return bgrImage; - } - - // Quality path with range expansion - { - cv::Mat bgrImage; - 
cv::cvtColorTwoPlane(yPlane, uvPlane, bgrImage, cv::COLOR_YUV2BGR_NV12); + if (m_nImageQuality == 1) { bgrImage.convertTo(bgrImage, -1, 255.0 / 219.0, -16.0 * 255.0 / 219.0); - return bgrImage; } + return bgrImage; } catch (const std::exception& e) { std::cerr << "Exception in avframeNV12ToCvMat: " << e.what() << std::endl; @@ -1861,6 +1836,12 @@ double CVideoPlayer::getFrameRate() return 0; } +void CVideoPlayer::setTargetFPS(double intervalMs) +{ + std::lock_guard lock(_mutex); + m_targetIntervalMs = intervalMs; + m_targetFPSInitialized = false; // reset timing on change +} void CVideoPlayer::playVideo(uint8* data, int len, uint32 ts, uint16 seq) { if (m_bRecording) @@ -2080,6 +2061,25 @@ void CVideoPlayer::onVideoFrame(AVFrame* frame) } } + // --- Frame rate limiting --- + // Skip post-decode processing (clone, queue push, CUDA clone) if not enough + // time has elapsed since the last processed frame. The decode itself still + // runs for every packet to maintain the H.264/H.265 reference frame chain. 
+ if (m_targetIntervalMs > 0.0) { + auto now = std::chrono::steady_clock::now(); + if (!m_targetFPSInitialized) { + m_lastProcessedTime = now; + m_targetFPSInitialized = true; + } else { + auto elapsed = std::chrono::duration(now - m_lastProcessedTime).count(); + if (elapsed < m_targetIntervalMs) { + return; // Skip this frame — too soon + } + } + m_lastProcessedTime = now; + } + // --- End frame rate limiting --- + // Push frame to queue; during settle period getImage() will ignore the queue // and keep returning the last good cached image g_frameQueue.pushFrame(frame); // pushFrame() clones the frame internally diff --git a/MediaClient/media/video_player.h b/MediaClient/media/video_player.h index a860384..c8c5220 100644 --- a/MediaClient/media/video_player.h +++ b/MediaClient/media/video_player.h @@ -15,6 +15,7 @@ #include #include #include +#include typedef struct { @@ -146,6 +147,7 @@ public: } // Image quality mode: 0=fast (OpenCV BT.601, ~2ms), 1=quality (sws BT.709+range, ~12ms) virtual void setImageQuality(int mode) { m_nImageQuality = mode; } + void setTargetFPS(double intervalMs); // Set minimum interval between processed frames in ms (0 = no limit, 100 = ~10 FPS) virtual void setRtpMulticast(BOOL flag) {} virtual void setRtpOverUdp(BOOL flag) {} @@ -266,6 +268,11 @@ protected: int m_cleanFrameCount = 0; // Count of clean frames after keyframe static const int SETTLE_FRAME_COUNT = 5; // Number of clean frames before delivering new frames + // Frame rate limiting — skip post-decode processing for frames beyond target interval + double m_targetIntervalMs = 100.0; // default 100ms (~10 FPS), 0 = no limit (process all frames) + std::chrono::steady_clock::time_point m_lastProcessedTime; // timestamp of last processed frame + bool m_targetFPSInitialized = false; // first-frame flag + BOOL m_bPlaying; BOOL m_bPaused; diff --git a/NV12_GLOBAL_POOL_FIX_V2.md b/NV12_GLOBAL_POOL_FIX_V2.md new file mode 100644 index 0000000..fd2fb05 --- /dev/null +++ 
b/NV12_GLOBAL_POOL_FIX_V2.md @@ -0,0 +1,133 @@ +# NV12 Global Slot Pool Fix — Complete Reference (v2) + +## Problem Statement +When RTSP cameras disconnect in LabVIEW, the flow is `ReleaseANSRTSPHandle → Destroy() → delete → CreateANSRTSPHandle`. The old per-camera GPU buffer pool was destroyed during this cycle, causing: +1. **Frozen inference** — `forceReleaseByOwner` deleted GpuFrameData mid-inference +2. **Processing spikes** — `cudaDeviceSynchronize` blocked ALL GPU work (900ms+) +3. **Crashes** — inference read freed pool buffers after camera deletion + +## Architecture: Global GpuNV12SlotPool + +GPU buffer ownership is **decoupled from camera lifetime**: +- Buffers live in a **process-wide singleton** (`GpuNV12SlotPool`) +- Slots are **recycled** (never freed during camera Destroy) +- **50ms cooldown** prevents slot reuse while GPU kernels still read +- **Per-slot non-blocking CUDA stream** avoids NULL-stream implicit sync +- **Background av_frame_free thread** removes SRW lock blocking from hot path + +## Files Modified (from original codebase) + +### NEW FILES (3 + copies) + +| File | Purpose | +|------|---------| +| `include/GpuNV12SlotPool.h` | Global pool header — GpuNV12Slot struct, GpuNV12SlotPool class | +| `modules/ANSCV/GpuNV12SlotPool.cpp` | Canonical singleton + acquire() implementation (CUDA) | +| `modules/ANSODEngine/GpuNV12SlotPool.cpp` | Cross-DLL resolver via GetProcAddress | +| `modules/ANSOCR/GpuNV12SlotPool.cpp` | Same resolver (copy of ANSODEngine version) | +| `modules/ANSFR/GpuNV12SlotPool.cpp` | Same resolver (copy of ANSODEngine version) | +| `modules/ANSLPR/GpuNV12SlotPool.cpp` | Same resolver (copy of ANSODEngine version) | + +### MODIFIED FILES + +| File | Changes | +|------|---------| +| `include/ANSGpuFrameRegistry.h` | Added `#include "GpuNV12SlotPool.h"`, `GpuNV12Slot* poolSlot` field in GpuFrameData, move constructor transfers poolSlot, `freeOwnedBuffers_locked()` calls `deferRelease(poolSlot)`, added 
`pushPendingFree_locked()`, debug macros guarded by `ANSCORE_GPU_DEBUG` | +| `modules/ANSCV/ANSGpuFrameOps.h` | `gpu_frame_attach_cuda()` rewritten: sync D2D on per-slot stream, deferred av_frame_free, CPU snapshot only on fallback, background av_frame_free thread in `gpu_frame_evict_stale()`. Debug macros guarded. | +| `modules/ANSCV/ANSRTSP.h` | Removed `GpuNV12Pool` struct, `EnsureGpuPool()`, `DestroyGpuPool()`, `GetGpuPool()` | +| `modules/ANSCV/ANSRTSP.cpp` | Removed `EnsureGpuPool`/`DestroyGpuPool` implementations. `Destroy()` and `Reconnect()` simplified: no `forceReleaseByOwner`, no `cudaDeviceSynchronize`, no `DestroyGpuPool`. `GetRTSPCVImage()` uses `GpuNV12SlotPool::instance().acquire()`. Added SLOW FRAME timing log (>500ms, to both spdlog and DebugView). Debug macros guarded. | +| `modules/ANSODEngine/NV12PreprocessHelper.cpp` | Debug logging blocks guarded by `ANSCORE_GPU_DEBUG`. One-time `[NV12 ACTIVE]` log to DebugView when NV12 fast path activates. | +| `modules/ANSODEngine/CMakeLists.txt` | Added `GpuNV12SlotPool.cpp` to source list | +| `modules/ANSFR/CMakeLists.txt` | Added `GpuNV12SlotPool.cpp` to source list | +| `modules/ANSLPR/CMakeLists.txt` | Added `GpuNV12SlotPool.cpp` to source list | +| (ANSOCR uses file GLOB — auto-included) | | + +## Key Design Decisions + +| Decision | Rationale | +|----------|-----------| +| **Sync D2D on per-slot stream** | Non-blocking stream avoids NULL-stream implicit sync with inference (was causing 1-2s stalls). `cudaStreamSynchronize` waits only for the 2 copies (~1.5ms). Slot held briefly → pool stays small (64 slots for 20+ cameras). | +| **50ms cooldown on slot reuse** | GPU kernels complete in <10ms. 50ms = 5× safety margin. Prevents buffer overwrite while inference reads. Short enough to keep pool pressure low. | +| **Background av_frame_free thread** | `av_frame_free` on CUDA-mapped frames acquires nvcuda64 SRW lock (5-20ms each). 
Background thread frees in batches every 50ms, removing all SRW lock blocking from camera hot path. | +| **CPU NV12 snapshot deferred to fallback only** | 4K snapshot = ~12MB malloc+memcpy+free per frame (~276MB/s). Only needed for cross-GPU fallback (rare). Skipping on pool-success path eliminates heap churn. | +| **Debug logging guarded by ANSCORE_GPU_DEBUG** | 500-2000 OutputDebugString calls/sec caused process-wide mutex convoy stalls. Default off. Add `-DANSCORE_GPU_DEBUG=1` to CMake to re-enable. | +| **Always-on diagnostics** | NEW slot allocation, POOL FULL, SLOW FRAME (>500ms), and NV12 ACTIVE path selection always log to DebugView (low volume, ~1-10 per session). | + +## Data Flow + +``` +GetRTSPCVImage (camera thread): + 1. GetImage() → BGR frame (shallow copy) + 2. anscv_mat_replace → swap Mat pointer + 3. TryIncrementInFlight() → atomic guard + 4. GetCudaHWFrame() → NVDEC device pointers + 5. GetNV12Frame() → CPU NV12 AVFrame (cloned) + 6. slot = GpuNV12SlotPool::acquire(gpuIdx, w, h) + └─ drainCooledSlots_locked() first (COOLING→FREE if >50ms) + 7. gpu_frame_attach_cuda(*image, cudaFrame, gpuIdx, pts, cpuNV12, slot): + a. cudaMemcpy2DAsync(slot->bufY, ..., nvdecY, ..., slot->copyStream) + b. cudaMemcpy2DAsync(slot->bufUV, ..., nvdecUV, ..., slot->copyStream) + c. cudaStreamSynchronize(slot->copyStream) — waits ~1.5ms (copy only) + d. data.poolSlot = slot + e. DEFERRED: push cudaFrame+cpuNV12 to m_pendingFree (NOT av_frame_free) + f. registry.attach(mat, data) + 8. Wire onReleaseFn → DecrementInFlight + 9. return (~3-5ms total) + +Inference (engine thread): + 1. gpuFrame = lookup(*cvImage) → GpuFrameData* + 2. tl_currentGpuFrame() = gpuFrame + 3. tryNV12(): reads yPlane/uvPlane → slot buffers (data is valid, sync done) + 4. NV12→RGB kernel launch → reads from slot buffer + 5. 
Inference finishes → clone released → refcount→0 + → freeOwnedBuffers_locked → deferRelease(poolSlot) → COOLING + → onReleaseFn → DecrementInFlight + +Background av_frame_free thread (started once): + - Every 50ms: drain m_pendingFree → av_frame_free each + - Runs independently of camera/inference threads + - SRW lock blocking happens HERE, not in hot path + +Slot lifecycle: + acquire() → STATE_ACTIVE + refcount→0 → deferRelease → STATE_COOLING (cooldownStart = now) + 50ms later → drainCooledSlots_locked → STATE_FREE + next acquire() → reuse + +Destroy (camera thread) — LIGHTWEIGHT: + 1. _isPlaying = false + 2. Wait _inFlightFrames == 0 (fast — sync copy means in-flight = GetRTSPCVImage only) + 3. invalidateOwner(this) — prevent stale callbacks + 4. close() — destroys NVDEC decoder only + *** NO forceReleaseByOwner *** + *** NO cudaDeviceSynchronize *** + *** NO DestroyGpuPool *** + Pool slots survive — inference keeps reading safely. +``` + +## DebugView Diagnostics (always-on) + +``` +[NV12Pool] NEW slot #1: 1920x1080 gpu=0 Y=0000001764000000 UV=... stream=... +[NV12Pool] NEW slot #2: 3840x2160 gpu=0 Y=... UV=... stream=... +[NV12 ACTIVE] ANSRTYOLO Path: CUDA_ZERO_COPY | isCuda=1 gpuMatch=1 decodeGpu=0 infGpu=0 frame=1920x1080 +[GetRTSPCVImage] SLOW FRAME: total=523.1ms (getImage=2.1ms cuda=521.0ms) 3840x2160 +[NV12Pool] POOL FULL (64 slots) — fallback to CPU +``` + +## Build Configuration + +- **Production (default):** Debug logging OFF. Only slot allocation, POOL FULL, SLOW FRAME, and NV12 ACTIVE visible in DebugView. +- **Debug:** Add `-DANSCORE_GPU_DEBUG=1` to CMake. Enables per-frame verbose logging (WARNING: causes performance degradation from OutputDebugString lock contention at high frame rates). 
+ +## Test Checklist + +- [ ] Start multiple RTSP cameras with HW decoding + multiple AI engines +- [ ] Verify DebugView shows: NEW slot allocations, NV12 ACTIVE with CUDA_ZERO_COPY +- [ ] Verify: zero POOL FULL entries +- [ ] Verify: zero or very few SLOW FRAME entries (>500ms) +- [ ] Trigger camera reconnect (disconnect cable or ReleaseHandle+CreateHandle) +- [ ] Verify: no crash, inference continues on remaining cameras +- [ ] Verify: processing time chart stable (no multi-second spikes) +- [ ] Check nvidia-smi: VRAM stable (slots recycled, not growing) +- [ ] Long run: 1+ hours with cameras reconnecting periodically diff --git a/include/ANSGpuFrameRegistry.h b/include/ANSGpuFrameRegistry.h index f230b32..a06b0ab 100644 --- a/include/ANSGpuFrameRegistry.h +++ b/include/ANSGpuFrameRegistry.h @@ -132,6 +132,13 @@ struct GpuFrameData { // freed while any consumer is still reading it. GpuNV12Slot* poolSlot = nullptr; + // --- Async D2D copy stream --- + // The CUDA stream used for the async D2D copy from NVDEC surface to pool buffer. + // Inference MUST call cudaStreamSynchronize on this before reading yPlane/uvPlane + // to ensure the copy has completed. Stored as void* to avoid cuda_runtime.h here. + // nullptr means D2D was synchronous (legacy path) or no D2D copy was done. + void* d2dCopyStream = nullptr; + // Default constructor GpuFrameData() = default; @@ -151,6 +158,7 @@ struct GpuFrameData { , refcount(o.refcount.load()), createdAt(o.createdAt) , ownerClient(o.ownerClient), onReleaseFn(o.onReleaseFn) , poolSlot(o.poolSlot) + , d2dCopyStream(o.d2dCopyStream) { // Null out source to prevent double-free of owned pointers o.cpuYPlane = nullptr; @@ -165,6 +173,7 @@ struct GpuFrameData { o.ownerClient = nullptr; o.onReleaseFn = nullptr; o.poolSlot = nullptr; + o.d2dCopyStream = nullptr; } // No copy @@ -360,6 +369,12 @@ public: return result; } + // Push an AVFrame* (as void*) for deferred freeing. + // Caller MUST hold the lock via acquire_lock(). 
+ void pushPendingFree_locked(void* ptr) { + if (ptr) m_pendingFree.push_back(ptr); + } + // --- Drain pending GPU device pointers for caller to cudaFree --- // Each entry includes the device index for cudaSetDevice before cudaFree. // If minAgeMs > 0, only drain entries older than minAgeMs milliseconds. diff --git a/include/GpuNV12SlotPool.h b/include/GpuNV12SlotPool.h index 9c66ca8..2b8f3ba 100644 --- a/include/GpuNV12SlotPool.h +++ b/include/GpuNV12SlotPool.h @@ -97,7 +97,8 @@ struct GpuNV12Slot { // first — causing 1-2 second stalls. Using a dedicated non-blocking // stream avoids this implicit sync entirely. // Stored as void* to avoid cuda_runtime.h in the header. - void* copyStream = nullptr; // cudaStream_t + void* copyStream = nullptr; // cudaStream_t + }; class GpuNV12SlotPool { @@ -119,6 +120,7 @@ public: // Returns nullptr if pool full — caller falls back to CPU path. GpuNV12Slot* acquire(int gpuIdx, int w, int h); + // Deferred release: moves slot from ACTIVE → COOLING. // Called from freeOwnedBuffers_locked() when GpuFrameData refcount → 0. // The slot becomes FREE after SLOT_COOLDOWN_MS elapses (checked in acquire). 
diff --git a/modules/ANSCV/ANSFLV.cpp b/modules/ANSCV/ANSFLV.cpp index 37e03c3..9daa0c7 100644 --- a/modules/ANSCV/ANSFLV.cpp +++ b/modules/ANSCV/ANSFLV.cpp @@ -621,6 +621,14 @@ namespace ANSCENTER { std::lock_guard lock(_mutex); _playerClient->setImageQuality(mode); // 0=fast (AI), 1=quality (display) } + void ANSFLVClient::SetTargetFPS(double intervalMs) { + std::lock_guard lock(_mutex); + _playerClient->setTargetFPS(intervalMs); // 0=no limit, 100=~10FPS, 200=~5FPS + } + void ANSFLVClient::SetNV12FastPath(bool enable) { + std::lock_guard lock(_mutex); + _useNV12FastPath = enable; + } AVFrame* ANSFLVClient::GetNV12Frame() { std::lock_guard lock(_mutex); return _playerClient->getNV12Frame(); // Returns clone, caller must av_frame_free @@ -767,17 +775,18 @@ extern "C" __declspec(dllexport) int GetFLVCVImage(ANSCENTER::ANSFLVClient** Han // Thread-safe Mat pointer swap (anscv_mat_replace has its own internal lock) anscv_mat_replace(image, std::move(img)); - // Attach NV12 frame for GPU fast-path inference (side-table registry) - // attach() takes ownership — do NOT av_frame_free here - int gpuIdx = (*Handle)->GetHWDecodingGpuIndex(); - AVFrame* cudaHW = (*Handle)->GetCudaHWFrame(); - if (cudaHW) { - AVFrame* cpuNV12 = (*Handle)->GetNV12Frame(); - gpu_frame_attach_cuda(*image, cudaHW, gpuIdx, timeStamp, cpuNV12); - } else { - AVFrame* nv12 = (*Handle)->GetNV12Frame(); - if (nv12) { - gpu_frame_attach(*image, nv12, gpuIdx, timeStamp); + // NV12 GPU fast path (optional — disabled by default for stability) + if ((*Handle)->IsNV12FastPath()) { + int gpuIdx = (*Handle)->GetHWDecodingGpuIndex(); + AVFrame* cudaHW = (*Handle)->GetCudaHWFrame(); + if (cudaHW) { + AVFrame* cpuNV12 = (*Handle)->GetNV12Frame(); + gpu_frame_attach_cuda(*image, cudaHW, gpuIdx, timeStamp, cpuNV12); + } else { + AVFrame* nv12 = (*Handle)->GetNV12Frame(); + if (nv12) { + gpu_frame_attach(*image, nv12, gpuIdx, timeStamp); + } } } @@ -952,6 +961,18 @@ extern "C" __declspec(dllexport) void 
SetFLVDisplayResolution(ANSCENTER::ANSFLVC (*Handle)->SetDisplayResolution(width, height); } catch (...) { } } +extern "C" __declspec(dllexport) void SetFLVTargetFPS(ANSCENTER::ANSFLVClient** Handle, double intervalMs) { + if (Handle == nullptr || *Handle == nullptr) return; + try { + (*Handle)->SetTargetFPS(intervalMs); + } catch (...) { } +} +extern "C" __declspec(dllexport) void SetFLVNV12FastPath(ANSCENTER::ANSFLVClient** Handle, int enable) { + if (Handle == nullptr || *Handle == nullptr) return; + try { + (*Handle)->SetNV12FastPath(enable != 0); + } catch (...) { } +} // ============================================================================ // V2 entry points — accept handle by value (uint64_t) instead of Handle** diff --git a/modules/ANSCV/ANSFLV.h b/modules/ANSCV/ANSFLV.h index b61caf2..d48efb5 100644 --- a/modules/ANSCV/ANSFLV.h +++ b/modules/ANSCV/ANSFLV.h @@ -36,6 +36,7 @@ namespace ANSCENTER int _imageWidth, _imageHeight; int64_t _pts; bool _isPlaying; + bool _useNV12FastPath = false; // false = original stable CPU path, true = NV12 GPU fast path std::recursive_mutex _mutex; public: ANSFLVClient(); @@ -71,6 +72,9 @@ namespace ANSCENTER int GetHWDecodingGpuIndex(); void SetDisplayResolution(int width, int height); // Set display output size; 0,0 = original (no resize) void SetImageQuality(int mode); // 0=fast (AI), 1=quality (display) + void SetTargetFPS(double intervalMs); // Set min interval between processed frames in ms (0 = no limit, 100 = ~10 FPS, 200 = ~5 FPS) + void SetNV12FastPath(bool enable); // true = NV12 GPU fast path, false = original CPU path (stable) + bool IsNV12FastPath() const { return _useNV12FastPath; } AVFrame* GetNV12Frame(); // Returns cloned NV12 frame for GPU fast-path (caller must av_frame_free) AVFrame* GetCudaHWFrame(); // Returns CUDA HW frame (device ptrs) for zero-copy inference bool IsCudaHWAccel(); // true when decoder uses CUDA (NV12 stays in GPU VRAM) @@ -108,4 +112,6 @@ extern "C" __declspec(dllexport) int 
IsFLVHWDecodingActive(ANSCENTER::ANSFLVCli extern "C" __declspec(dllexport) int GetFLVHWDecodingGpuIndex(ANSCENTER::ANSFLVClient** Handle); extern "C" __declspec(dllexport) void SetFLVImageQuality(ANSCENTER::ANSFLVClient** Handle, int mode); extern "C" __declspec(dllexport) void SetFLVDisplayResolution(ANSCENTER::ANSFLVClient** Handle, int width, int height); +extern "C" __declspec(dllexport) void SetFLVTargetFPS(ANSCENTER::ANSFLVClient** Handle, double intervalMs); +extern "C" __declspec(dllexport) void SetFLVNV12FastPath(ANSCENTER::ANSFLVClient** Handle, int enable); #endif \ No newline at end of file diff --git a/modules/ANSCV/ANSGpuFrameOps.h b/modules/ANSCV/ANSGpuFrameOps.h index cd61e98..6cec22e 100644 --- a/modules/ANSCV/ANSGpuFrameOps.h +++ b/modules/ANSCV/ANSGpuFrameOps.h @@ -23,6 +23,8 @@ extern "C" { #include #include #include +#include +#include #include #ifdef _WIN32 @@ -166,16 +168,13 @@ inline void gpu_frame_attach(cv::Mat* mat, AVFrame* nv12, int gpuIdx, int64_t pt void* old = ANSGpuFrameRegistry::instance().attach(mat, std::move(data)); if (old) { - AVFrame* oldFrame = static_cast(old); - av_frame_free(&oldFrame); + // Defer old frame's AVFrame free + auto& reg = ANSGpuFrameRegistry::instance(); + auto lk = reg.acquire_lock(); + reg.pushPendingFree_locked(old); } - // Free stale entries evicted by TTL or previous attach - auto pending = ANSGpuFrameRegistry::instance().drain_pending(); - for (void* p : pending) { - AVFrame* stale = static_cast(p); - av_frame_free(&stale); - } + // NOTE: No drain_pending() here (hot path). Freed by evict_stale. } // Attach CUDA HW frame — copies NV12 from NVDEC surfaces to owned GPU memory. 
@@ -226,13 +225,10 @@ inline void gpu_frame_attach_cuda(cv::Mat* mat, AVFrame* cudaFrame, int gpuIdx, if (slot && slot->bufY && slot->bufUV && slot->pitchY > 0 && slot->pitchUV > 0) { // --- Global pool path: D2D copy on per-slot non-blocking stream --- - // CRITICAL: Using the NULL stream (cudaMemcpy2D without stream) causes - // 1-2 second stalls on WDDM because it implicitly synchronizes with - // ALL other streams before executing. By using cudaMemcpy2DAsync on - // the slot's own non-blocking stream + cudaStreamSynchronize, we: - // 1. Submit the copy immediately (no wait for inference kernels) - // 2. Wait ONLY for this copy to finish (~0.3ms 1080p, ~1.2ms 4K) - // 3. Data is valid after sync — av_frame_free is safe + // cudaMemcpy2DAsync + cudaStreamSynchronize(slotStream): + // - Non-blocking stream avoids NULL-stream implicit sync with inference + // - Sync waits ONLY for the 2 copies (~1.5ms for 4K, ~0.3ms for 1080p) + // - Data valid after sync — av_frame_free is safe int prevDev = -1; cudaGetDevice(&prevDev); if (gpuIdx >= 0) cudaSetDevice(gpuIdx); @@ -247,13 +243,13 @@ inline void gpu_frame_attach_cuda(cv::Mat* mat, AVFrame* cudaFrame, int gpuIdx, e4 = cudaMemcpy2DAsync(slot->bufUV, slot->pitchUV, cudaFrame->data[1], cudaFrame->linesize[1], w, h / 2, cudaMemcpyDeviceToDevice, copyStream); - if (e3 == cudaSuccess && e4 == cudaSuccess) { - // Wait ONLY for this stream's 2 copies (~0.3-1.2ms). - // Does NOT wait for inference kernels on other streams. - cudaStreamSynchronize(copyStream); - } + // NO cudaStreamSynchronize here — let the copy run asynchronously. + // The camera thread is NOT blocked by the WDDM SRW lock. + // Inference will call cudaStreamSynchronize(d2dCopyStream) in tryNV12() + // before reading the buffer. By that time (~50-200ms later), the copy + // (~0.3ms for 1080p, ~1.5ms for 4K) has long completed, so the sync + // returns immediately with zero blocking. 
} else { - // Fallback if stream creation failed — NULL stream (may stall) e3 = cudaMemcpy2D(slot->bufY, slot->pitchY, cudaFrame->data[0], cudaFrame->linesize[0], w, h, cudaMemcpyDeviceToDevice); @@ -270,15 +266,14 @@ inline void gpu_frame_attach_cuda(cv::Mat* mat, AVFrame* cudaFrame, int gpuIdx, data.uvPlane = static_cast(slot->bufUV); data.yLinesize = static_cast(slot->pitchY); data.uvLinesize = static_cast(slot->pitchUV); - data.poolSlot = slot; // Track for deferred release - // gpuCacheY/UV stay nullptr — global pool owns the buffers + data.poolSlot = slot; + data.d2dCopyStream = copyStream; // Inference syncs on this before reading d2dOk = true; - GPU_FRAME_DBG("attach_cuda: D2D OK (global pool) Y=%p UV=%p yPitch=%zu uvPitch=%zu", - slot->bufY, slot->bufUV, slot->pitchY, slot->pitchUV); + GPU_FRAME_DBG("attach_cuda: D2D OK (global pool, async) Y=%p UV=%p yPitch=%zu uvPitch=%zu stream=%p", + slot->bufY, slot->bufUV, slot->pitchY, slot->pitchUV, copyStream); } else { GPU_FRAME_DBG("attach_cuda: D2D COPY FAILED (pool) e3=%d e4=%d — fallback", (int)e3, (int)e4); - // Release slot back to pool on failure (immediate, no cooldown needed) slot->state.store(GpuNV12Slot::STATE_FREE, std::memory_order_release); } } @@ -364,13 +359,34 @@ inline void gpu_frame_attach_cuda(cv::Mat* mat, AVFrame* cudaFrame, int gpuIdx, data.uvLinesize = data.cpuUvLinesize; } - // Free AVFrames immediately — synchronous D2D copy has completed, - // so NVDEC surfaces can be returned to the decoder's surface pool. - GPU_FRAME_DBG("attach_cuda: freeing AVFrames cudaFrame=%p cpuNV12=%p", - (void*)cudaFrame, (void*)cpuNV12); - av_frame_free(&cudaFrame); - if (cpuNV12) av_frame_free(&cpuNV12); - data.avframe = nullptr; + // AVFrame lifetime management: + // - If D2D was ASYNC (d2dCopyStream != null): keep cudaFrame alive in + // GpuFrameData.avframe so the NVDEC surface (copy source) remains valid + // until the async copy completes. 
The AVFrame is freed when GpuFrameData + // is released (after inference), by which time the 0.3ms copy is long done. + // - If D2D was SYNC or failed: push to pending free immediately (old behavior). + if (data.d2dCopyStream && cudaFrame) { + // Async D2D — keep AVFrame alive, inference will outlive the copy + data.avframe = cudaFrame; + GPU_FRAME_DBG("attach_cuda: keeping AVFrame alive for async D2D cudaFrame=%p", + (void*)cudaFrame); + } else { + // Sync D2D or fallback — safe to defer free now + GPU_FRAME_DBG("attach_cuda: deferring AVFrame free cudaFrame=%p", + (void*)cudaFrame); + if (cudaFrame) { + auto& reg = ANSGpuFrameRegistry::instance(); + auto lk = reg.acquire_lock(); + reg.pushPendingFree_locked(cudaFrame); + } + data.avframe = nullptr; + } + // cpuNV12 is always safe to defer — CPU snapshot (if taken) is already copied + if (cpuNV12) { + auto& reg = ANSGpuFrameRegistry::instance(); + auto lk = reg.acquire_lock(); + reg.pushPendingFree_locked(cpuNV12); + } data.cpuAvframe = nullptr; GPU_FRAME_DBG("attach_cuda: FINAL yPlane=%p uvPlane=%p isCuda=%d poolSlot=%p", @@ -379,16 +395,16 @@ inline void gpu_frame_attach_cuda(cv::Mat* mat, AVFrame* cudaFrame, int gpuIdx, void* old = ANSGpuFrameRegistry::instance().attach(mat, std::move(data)); if (old) { - AVFrame* oldFrame = static_cast(old); - av_frame_free(&oldFrame); + // Old frame's AVFrame returned — defer its free too + auto& reg = ANSGpuFrameRegistry::instance(); + auto lk = reg.acquire_lock(); + reg.pushPendingFree_locked(old); } - // Free stale AVFrames evicted by TTL or previous attach - auto pending = ANSGpuFrameRegistry::instance().drain_pending(); - for (void* p : pending) { - AVFrame* stale = static_cast(p); - av_frame_free(&stale); - } + // NOTE: No drain_pending() here (hot path). AVFrames accumulate in + // m_pendingFree and are freed by gpu_frame_evict_stale() which runs + // every 500ms from anscv_mat_replace. 
This removes av_frame_free + // (5-20ms SRW lock per call) from the camera frame-grabbing path. } // Release entry by cv::Mat* and free any returned AVFrames. @@ -400,14 +416,7 @@ inline void gpu_frame_remove(cv::Mat* mat) { GPU_FRAME_DBG("gpu_frame_remove: mat=%p", (void*)mat); ANSGpuFrameRegistry::instance().release(mat); - // Free any AVFrames that became pending from this release or prior eviction - auto pending = ANSGpuFrameRegistry::instance().drain_pending(); - for (void* p : pending) { - AVFrame* stale = static_cast(p); - av_frame_free(&stale); - } - - // GPU device pointers deferred — see gpu_frame_evict_stale() / Destroy() + // NOTE: No drain_pending() here (hot path). AVFrames freed by evict_stale. } // Alias for remove — used in ANSCV mutating functions to drop stale GPU data. @@ -425,10 +434,39 @@ inline void gpu_frame_invalidate(cv::Mat* mat) { inline void gpu_frame_evict_stale() { ANSGpuFrameRegistry::instance().evictStaleFrames(); - auto pending = ANSGpuFrameRegistry::instance().drain_pending(); - for (void* p : pending) { - AVFrame* stale = static_cast(p); - av_frame_free(&stale); + // Drain and free AVFrames on a background thread to avoid blocking the + // camera hot path. av_frame_free on CUDA-mapped frames can take 5-20ms + // per call due to nvcuda64 SRW lock. The background thread frees them + // periodically (every 50ms) in batches. 
+ { + static std::once_flag s_initOnce; + static std::mutex s_avFreeMutex; + static std::vector s_avFreeQueue; + + // Move pending AVFrames to the background queue + auto pending = ANSGpuFrameRegistry::instance().drain_pending(); + if (!pending.empty()) { + std::lock_guard lock(s_avFreeMutex); + s_avFreeQueue.insert(s_avFreeQueue.end(), pending.begin(), pending.end()); + } + + // Start background free thread on first call + std::call_once(s_initOnce, []() { + std::thread([]() { + while (true) { + std::vector batch; + { + std::lock_guard lock(s_avFreeMutex); + batch.swap(s_avFreeQueue); + } + for (void* p : batch) { + AVFrame* f = static_cast(p); + av_frame_free(&f); + } + std::this_thread::sleep_for(std::chrono::milliseconds(50)); + } + }).detach(); + }); } // Free GPU device pointers from evicted/released frames (legacy path). diff --git a/modules/ANSCV/ANSMJPEG.cpp b/modules/ANSCV/ANSMJPEG.cpp index 3646c99..cab859b 100644 --- a/modules/ANSCV/ANSMJPEG.cpp +++ b/modules/ANSCV/ANSMJPEG.cpp @@ -621,6 +621,14 @@ namespace ANSCENTER { std::lock_guard lock(_mutex); _playerClient->setImageQuality(mode); // 0=fast (AI), 1=quality (display) } + void ANSMJPEGClient::SetTargetFPS(double intervalMs) { + std::lock_guard lock(_mutex); + _playerClient->setTargetFPS(intervalMs); // 0=no limit, 100=~10FPS, 200=~5FPS + } + void ANSMJPEGClient::SetNV12FastPath(bool enable) { + std::lock_guard lock(_mutex); + _useNV12FastPath = enable; + } AVFrame* ANSMJPEGClient::GetNV12Frame() { std::lock_guard lock(_mutex); return _playerClient->getNV12Frame(); // Returns clone, caller must av_frame_free @@ -768,20 +776,18 @@ extern "C" __declspec(dllexport) int GetMJPEGCVImage(ANSCENTER::ANSMJPEGClient** // Thread-safe Mat pointer swap (anscv_mat_replace has its own internal lock) anscv_mat_replace(image, std::move(img)); - // Attach NV12 frame for GPU fast-path inference (side-table registry) - // attach() takes ownership — do NOT av_frame_free here - int gpuIdx = 
(*Handle)->GetHWDecodingGpuIndex(); - AVFrame* cudaHW = (*Handle)->GetCudaHWFrame(); - if (cudaHW) { - // CUDA zero-copy: frame data[0]/data[1] are CUDA device pointers. - // Also attach CPU NV12 as fallback for cross-GPU inference - // (when decode GPU != inference GPU, CUDA ptrs aren't accessible). - AVFrame* cpuNV12 = (*Handle)->GetNV12Frame(); - gpu_frame_attach_cuda(*image, cudaHW, gpuIdx, timeStamp, cpuNV12); - } else { - AVFrame* nv12 = (*Handle)->GetNV12Frame(); - if (nv12) { - gpu_frame_attach(*image, nv12, gpuIdx, timeStamp); + // NV12 GPU fast path (optional — disabled by default for stability) + if ((*Handle)->IsNV12FastPath()) { + int gpuIdx = (*Handle)->GetHWDecodingGpuIndex(); + AVFrame* cudaHW = (*Handle)->GetCudaHWFrame(); + if (cudaHW) { + AVFrame* cpuNV12 = (*Handle)->GetNV12Frame(); + gpu_frame_attach_cuda(*image, cudaHW, gpuIdx, timeStamp, cpuNV12); + } else { + AVFrame* nv12 = (*Handle)->GetNV12Frame(); + if (nv12) { + gpu_frame_attach(*image, nv12, gpuIdx, timeStamp); + } } } @@ -956,6 +962,18 @@ extern "C" __declspec(dllexport) void SetMJPEGDisplayResolution(ANSCENTER::ANSMJ (*Handle)->SetDisplayResolution(width, height); } catch (...) { } } +extern "C" __declspec(dllexport) void SetMJPEGTargetFPS(ANSCENTER::ANSMJPEGClient** Handle, double intervalMs) { + if (Handle == nullptr || *Handle == nullptr) return; + try { + (*Handle)->SetTargetFPS(intervalMs); + } catch (...) { } +} +extern "C" __declspec(dllexport) void SetMJPEGNV12FastPath(ANSCENTER::ANSMJPEGClient** Handle, int enable) { + if (Handle == nullptr || *Handle == nullptr) return; + try { + (*Handle)->SetNV12FastPath(enable != 0); + } catch (...) 
{ } +} // ============================================================================ // V2 entry points — accept handle as uint64_t by value (LabVIEW safe) diff --git a/modules/ANSCV/ANSMJPEG.h b/modules/ANSCV/ANSMJPEG.h index 31a2da7..2958413 100644 --- a/modules/ANSCV/ANSMJPEG.h +++ b/modules/ANSCV/ANSMJPEG.h @@ -35,6 +35,7 @@ namespace ANSCENTER int _imageWidth, _imageHeight; int64_t _pts; bool _isPlaying; + bool _useNV12FastPath = false; std::recursive_mutex _mutex; public: ANSMJPEGClient(); @@ -70,6 +71,9 @@ namespace ANSCENTER int GetHWDecodingGpuIndex(); void SetDisplayResolution(int width, int height); // Set display output size; 0,0 = original (no resize) void SetImageQuality(int mode); // 0=fast (AI), 1=quality (display) + void SetTargetFPS(double intervalMs); // Set min interval between processed frames in ms (0 = no limit, 100 = ~10 FPS, 200 = ~5 FPS) + void SetNV12FastPath(bool enable); // true = NV12 GPU fast path, false = original CPU path (stable) + bool IsNV12FastPath() const { return _useNV12FastPath; } AVFrame* GetNV12Frame(); // Returns cloned NV12 frame for GPU fast-path (caller must av_frame_free) AVFrame* GetCudaHWFrame(); // Returns CUDA HW frame (device ptrs) for zero-copy inference bool IsCudaHWAccel(); // true when decoder uses CUDA (NV12 stays in GPU VRAM) @@ -108,4 +112,6 @@ extern "C" __declspec(dllexport) int IsMJPEGHWDecodingActive(ANSCENTER::ANSMJPE extern "C" __declspec(dllexport) int GetMJPEGHWDecodingGpuIndex(ANSCENTER::ANSMJPEGClient** Handle); extern "C" __declspec(dllexport) void SetMJPEGImageQuality(ANSCENTER::ANSMJPEGClient** Handle, int mode); extern "C" __declspec(dllexport) void SetMJPEGDisplayResolution(ANSCENTER::ANSMJPEGClient** Handle, int width, int height); +extern "C" __declspec(dllexport) void SetMJPEGTargetFPS(ANSCENTER::ANSMJPEGClient** Handle, double intervalMs); +extern "C" __declspec(dllexport) void SetMJPEGNV12FastPath(ANSCENTER::ANSMJPEGClient** Handle, int enable); #endif \ No newline at end of file diff 
--git a/modules/ANSCV/ANSRTMP.cpp b/modules/ANSCV/ANSRTMP.cpp index 4fadb61..a9a0e3f 100644 --- a/modules/ANSCV/ANSRTMP.cpp +++ b/modules/ANSCV/ANSRTMP.cpp @@ -635,6 +635,14 @@ namespace ANSCENTER { std::lock_guard lock(_mutex); _playerClient->setImageQuality(mode); // 0=fast (AI), 1=quality (display) } + void ANSRTMPClient::SetTargetFPS(double intervalMs) { + std::lock_guard lock(_mutex); + _playerClient->setTargetFPS(intervalMs); // 0=no limit, 100=~10FPS, 200=~5FPS + } + void ANSRTMPClient::SetNV12FastPath(bool enable) { + std::lock_guard lock(_mutex); + _useNV12FastPath = enable; + } AVFrame* ANSRTMPClient::GetNV12Frame() { std::lock_guard lock(_mutex); return _playerClient->getNV12Frame(); // Returns clone, caller must av_frame_free @@ -792,20 +800,18 @@ extern "C" __declspec(dllexport) int GetRTMPCVImage(ANSCENTER::ANSRTMPClient** H // Thread-safe Mat pointer swap (anscv_mat_replace has its own internal lock) anscv_mat_replace(image, std::move(img)); - // Attach NV12 frame for GPU fast-path inference (side-table registry) - // attach() takes ownership — do NOT av_frame_free here - int gpuIdx = (*Handle)->GetHWDecodingGpuIndex(); - AVFrame* cudaHW = (*Handle)->GetCudaHWFrame(); - if (cudaHW) { - // CUDA zero-copy: frame data[0]/data[1] are CUDA device pointers. - // Also attach CPU NV12 as fallback for cross-GPU inference - // (when decode GPU != inference GPU, CUDA ptrs aren't accessible). 
- AVFrame* cpuNV12 = (*Handle)->GetNV12Frame(); - gpu_frame_attach_cuda(*image, cudaHW, gpuIdx, timeStamp, cpuNV12); - } else { - AVFrame* nv12 = (*Handle)->GetNV12Frame(); - if (nv12) { - gpu_frame_attach(*image, nv12, gpuIdx, timeStamp); + // NV12 GPU fast path (optional — disabled by default for stability) + if ((*Handle)->IsNV12FastPath()) { + int gpuIdx = (*Handle)->GetHWDecodingGpuIndex(); + AVFrame* cudaHW = (*Handle)->GetCudaHWFrame(); + if (cudaHW) { + AVFrame* cpuNV12 = (*Handle)->GetNV12Frame(); + gpu_frame_attach_cuda(*image, cudaHW, gpuIdx, timeStamp, cpuNV12); + } else { + AVFrame* nv12 = (*Handle)->GetNV12Frame(); + if (nv12) { + gpu_frame_attach(*image, nv12, gpuIdx, timeStamp); + } } } @@ -978,6 +984,18 @@ extern "C" __declspec(dllexport) void SetRTMPDisplayResolution(ANSCENTER::ANSRTM (*Handle)->SetDisplayResolution(width, height); } catch (...) { } } +extern "C" __declspec(dllexport) void SetRTMPTargetFPS(ANSCENTER::ANSRTMPClient** Handle, double intervalMs) { + if (Handle == nullptr || *Handle == nullptr) return; + try { + (*Handle)->SetTargetFPS(intervalMs); + } catch (...) { } +} +extern "C" __declspec(dllexport) void SetRTMPNV12FastPath(ANSCENTER::ANSRTMPClient** Handle, int enable) { + if (Handle == nullptr || *Handle == nullptr) return; + try { + (*Handle)->SetNV12FastPath(enable != 0); + } catch (...) 
{ } +} // ============================================================================ // V2 entry points: accept handle by value (uint64_t) to avoid LabVIEW diff --git a/modules/ANSCV/ANSRTMP.h b/modules/ANSCV/ANSRTMP.h index a5a5f39..610bdb3 100644 --- a/modules/ANSCV/ANSRTMP.h +++ b/modules/ANSCV/ANSRTMP.h @@ -36,6 +36,7 @@ namespace ANSCENTER int _imageWidth, _imageHeight; int64_t _pts; bool _isPlaying; + bool _useNV12FastPath = false; std::recursive_mutex _mutex; public: ANSRTMPClient(); @@ -71,6 +72,9 @@ namespace ANSCENTER int GetHWDecodingGpuIndex(); void SetDisplayResolution(int width, int height); // Set display output size; 0,0 = original (no resize) void SetImageQuality(int mode); // 0=fast (AI), 1=quality (display) + void SetTargetFPS(double intervalMs); // Set min interval between processed frames in ms (0 = no limit, 100 = ~10 FPS, 200 = ~5 FPS) + void SetNV12FastPath(bool enable); // true = NV12 GPU fast path, false = original CPU path (stable) + bool IsNV12FastPath() const { return _useNV12FastPath; } AVFrame* GetNV12Frame(); // Returns cloned NV12 frame for GPU fast-path (caller must av_frame_free) AVFrame* GetCudaHWFrame(); // Returns CUDA HW frame (device ptrs) for zero-copy inference bool IsCudaHWAccel(); // true when decoder uses CUDA (NV12 stays in GPU VRAM) @@ -107,4 +111,6 @@ extern "C" __declspec(dllexport) int IsRTMPHWDecodingActive(ANSCENTER::ANSRTMPC extern "C" __declspec(dllexport) int GetRTMPHWDecodingGpuIndex(ANSCENTER::ANSRTMPClient** Handle); extern "C" __declspec(dllexport) void SetRTMPImageQuality(ANSCENTER::ANSRTMPClient** Handle, int mode); extern "C" __declspec(dllexport) void SetRTMPDisplayResolution(ANSCENTER::ANSRTMPClient** Handle, int width, int height); +extern "C" __declspec(dllexport) void SetRTMPTargetFPS(ANSCENTER::ANSRTMPClient** Handle, double intervalMs); +extern "C" __declspec(dllexport) void SetRTMPNV12FastPath(ANSCENTER::ANSRTMPClient** Handle, int enable); #endif \ No newline at end of file diff --git 
a/modules/ANSCV/ANSRTSP.cpp b/modules/ANSCV/ANSRTSP.cpp index 20f3204..90f3a1c 100644 --- a/modules/ANSCV/ANSRTSP.cpp +++ b/modules/ANSCV/ANSRTSP.cpp @@ -213,44 +213,44 @@ namespace ANSCENTER { bool ANSRTSPClient::Reconnect() { // 1. Mark as not-playing under the mutex FIRST. This makes GetImage() // return the cached _pLastFrame instead of calling into the player, - // and blocks new TryIncrementInFlight calls. + // and blocks new TryIncrementInFlight calls (no new NV12 attachments). { std::unique_lock lock(_mutex); _isPlaying = false; - // --- Inference guard: wait for in-flight D2D copies to finish --- - // With synchronous D2D copy, in-flight means "currently inside - // GetRTSPCVImage between TryIncrementInFlight and attach_cuda". - // This is typically <1ms, so the wait is very fast. + // --- Inference guard: wait for ALL in-flight inference to finish --- + // _inFlightFrames tracks frames from GetRTSPCVImage through to the + // end of inference (DecrementInFlight fires when last clone is released). + // We MUST wait for this to reach 0 before calling close(), because + // inference may still be reading NV12 pool buffer data that depends + // on the NVDEC decoder context being alive. + // + // DO NOT force-reset _inFlightFrames or invalidate onReleaseFn — + // let inference finish naturally so DecrementInFlight fires correctly. 
int inFlight = _inFlightFrames.load(std::memory_order_acquire); if (inFlight > 0) { _logger.LogInfo("ANSRTSPClient::Reconnect", - std::format("waiting for {} in-flight frame(s)...", inFlight), + std::format("waiting for {} in-flight inference(s) to complete...", inFlight), __FILE__, __LINE__); - bool done = _inFlightDone.wait_for(lock, std::chrono::seconds(5), [this] { + bool done = _inFlightDone.wait_for(lock, std::chrono::seconds(10), [this] { return _inFlightFrames.load(std::memory_order_acquire) <= 0; }); if (!done) { _logger.LogWarn("ANSRTSPClient::Reconnect", - std::format("timed out — still {} in-flight", _inFlightFrames.load()), + std::format("timed out — still {} in-flight, proceeding with close()", + _inFlightFrames.load()), __FILE__, __LINE__); + // Force-reset only on timeout as last resort + ANSGpuFrameRegistry::instance().invalidateOwner(this); + _inFlightFrames.store(0, std::memory_order_release); } } - - // Invalidate owner callbacks — prevents stale DecrementInFlight - // calls after Reconnect re-creates the decoder. - // Frames and their global pool slots remain alive for inference. - ANSGpuFrameRegistry::instance().invalidateOwner(this); - _inFlightFrames.store(0, std::memory_order_release); - - // NO forceReleaseByOwner — frames survive reconnect. - // NO cudaDeviceSynchronize — no GPU buffers to free. - // NO DestroyGpuPool — per-camera pool has been removed. } // 2. close() destroys NVDEC decoder ONLY — run outside _mutex to - // avoid deadlocking with nvcuda64 SRW lock held by inference. - // Pool slot buffers are global and untouched. + // avoid deadlocking with nvcuda64 SRW lock held by other cameras. + // At this point, all inference using this camera's NV12 data has + // completed (or timed out), so close() is safe. 
_logger.LogInfo("ANSRTSPClient::Reconnect", "calling close() — NVDEC decoder will be destroyed", __FILE__, __LINE__); RTSP_DBG("[Reconnect] BEFORE close() this=%p", (void*)this); @@ -883,6 +883,14 @@ namespace ANSCENTER { std::lock_guard lock(_mutex); _playerClient->setImageQuality(mode); // 0=fast (AI), 1=quality (display) } + void ANSRTSPClient::SetTargetFPS(double intervalMs) { + std::lock_guard lock(_mutex); + _playerClient->setTargetFPS(intervalMs); // 0=no limit, 100=~10FPS, 200=~5FPS + } + void ANSRTSPClient::SetNV12FastPath(bool enable) { + std::lock_guard lock(_mutex); + _useNV12FastPath = enable; + } AVFrame* ANSRTSPClient::GetNV12Frame() { std::lock_guard lock(_mutex); if (!_isPlaying) return nullptr; // Player may be mid-reconnect (CUDA resources freed) @@ -1045,67 +1053,60 @@ extern "C" __declspec(dllexport) int GetRTSPCVImage( auto t1 = std::chrono::steady_clock::now(); - // Attach NV12 frame for GPU fast-path inference (side-table registry) - // attach() takes ownership — do NOT av_frame_free here - // - // CRITICAL: TryIncrementInFlight() MUST be called BEFORE GetCudaHWFrame(). - // It atomically checks _isPlaying and increments _inFlightFrames under - // the same mutex, so Reconnect() cannot call close() while we're doing - // the D2D copy from NVDEC surfaces inside gpu_frame_attach_cuda(). - int gpuIdx = (*Handle)->GetHWDecodingGpuIndex(); - bool inFlightGuardHeld = (*Handle)->TryIncrementInFlight(); - RTSP_DBG("[GetRTSPCVImage] mat=%p gpuIdx=%d inFlightGuard=%d", - (void*)*image, gpuIdx, (int)inFlightGuardHeld); + // NV12 GPU fast path: attach NV12 frame data for zero-copy inference. + // When disabled (_useNV12FastPath=false), the original stable CPU path is used: + // GetImage() returns BGR cv::Mat in CPU RAM → no CUDA calls → no SRW lock contention. + // When enabled, D2D copies NV12 from NVDEC to pool buffers for GPU inference. 
+ if ((*Handle)->IsNV12FastPath()) { + int gpuIdx = (*Handle)->GetHWDecodingGpuIndex(); + bool inFlightGuardHeld = (*Handle)->TryIncrementInFlight(); + RTSP_DBG("[GetRTSPCVImage] mat=%p gpuIdx=%d inFlightGuard=%d", + (void*)*image, gpuIdx, (int)inFlightGuardHeld); - if (inFlightGuardHeld) { - AVFrame* cudaHW = (*Handle)->GetCudaHWFrame(); - if (cudaHW) { - RTSP_DBG("[GetRTSPCVImage] cudaHW: %dx%d data[0]=%p data[1]=%p", - cudaHW->width, cudaHW->height, - (void*)cudaHW->data[0], (void*)cudaHW->data[1]); + if (inFlightGuardHeld) { + AVFrame* cudaHW = (*Handle)->GetCudaHWFrame(); + if (cudaHW) { + RTSP_DBG("[GetRTSPCVImage] cudaHW: %dx%d data[0]=%p data[1]=%p", + cudaHW->width, cudaHW->height, + (void*)cudaHW->data[0], (void*)cudaHW->data[1]); - // Acquire a slot from the global pool — survives camera Destroy. - GpuNV12Slot* slot = GpuNV12SlotPool::instance().acquire( - gpuIdx, cudaHW->width, cudaHW->height); + // Acquire a slot from the global pool — survives camera Destroy. + GpuNV12Slot* slot = GpuNV12SlotPool::instance().acquire( + gpuIdx, cudaHW->width, cudaHW->height); - // Only fetch CPU NV12 if pool slot unavailable (cross-GPU fallback). - // When slot is valid, the D2D copy goes GPU→GPU and CPU NV12 is never used. - // Skipping av_frame_clone + av_frame_free saves ~0.1ms per frame. - AVFrame* cpuNV12 = slot ? nullptr : (*Handle)->GetNV12Frame(); - gpu_frame_attach_cuda(*image, cudaHW, gpuIdx, timeStamp, cpuNV12, slot); - } else { - // HW decode not active — try CPU NV12 - AVFrame* nv12 = (*Handle)->GetNV12Frame(); - if (nv12) { - gpu_frame_attach(*image, nv12, gpuIdx, timeStamp); + // Only fetch CPU NV12 if pool slot unavailable (cross-GPU fallback). + AVFrame* cpuNV12 = slot ? 
nullptr : (*Handle)->GetNV12Frame(); + gpu_frame_attach_cuda(*image, cudaHW, gpuIdx, timeStamp, cpuNV12, slot); + } else { + // HW decode not active — try CPU NV12 + AVFrame* nv12 = (*Handle)->GetNV12Frame(); + if (nv12) { + gpu_frame_attach(*image, nv12, gpuIdx, timeStamp); + } } - } - // Wire up the registry callback to release the in-flight guard. - // TryIncrementInFlight already incremented; DecrementInFlight fires - // when the last clone of this frame is released after inference. - auto* gpuData = ANSGpuFrameRegistry::instance().lookup(*image); - RTSP_DBG("[GetRTSPCVImage] after attach: gpuData=%p yPlane=%p isCuda=%d poolSlot=%p", - (void*)gpuData, - gpuData ? (void*)gpuData->yPlane : nullptr, - gpuData ? (int)gpuData->isCudaDevicePtr : -1, - gpuData ? (void*)gpuData->poolSlot : nullptr); - if (gpuData) { - gpuData->ownerClient = *Handle; - gpuData->onReleaseFn = [](void* client) { - static_cast(client)->DecrementInFlight(); - }; - // NOTE: Do NOT call IncrementInFlight() again here — - // TryIncrementInFlight() already did it above. + // Wire up the registry callback to release the in-flight guard. + auto* gpuData = ANSGpuFrameRegistry::instance().lookup(*image); + RTSP_DBG("[GetRTSPCVImage] after attach: gpuData=%p yPlane=%p isCuda=%d poolSlot=%p", + (void*)gpuData, + gpuData ? (void*)gpuData->yPlane : nullptr, + gpuData ? (int)gpuData->isCudaDevicePtr : -1, + gpuData ? (void*)gpuData->poolSlot : nullptr); + if (gpuData) { + gpuData->ownerClient = *Handle; + gpuData->onReleaseFn = [](void* client) { + static_cast(client)->DecrementInFlight(); + }; + } else { + (*Handle)->DecrementInFlight(); + } } else { - // No gpuData registered (attach failed?) — release the guard - (*Handle)->DecrementInFlight(); + RTSP_DBG("[GetRTSPCVImage] SKIP CUDA — player not playing (reconnecting?)"); } - } else { - // Player is stopping/reconnecting — skip CUDA path entirely. - // GetImage() already returned a cached BGR frame, which is safe. 
- RTSP_DBG("[GetRTSPCVImage] SKIP CUDA — player not playing (reconnecting?)"); } + // else: original CPU path — cv::Mat** contains BGR data in CPU RAM. + // No CUDA calls, no pool slots, no GPU frame registry. + // Inference uses cv::Mat directly (upload to GPU in engine). // Lightweight timing — logs only when frame grab + D2D exceeds 50ms. // Goes to both spdlog (console/file) AND OutputDebugString (DebugView) @@ -1115,7 +1116,7 @@ extern "C" __declspec(dllexport) int GetRTSPCVImage( double getImageMs = std::chrono::duration(t1 - t0).count(); double cudaMs = std::chrono::duration(t2 - t1).count(); double totalMs = getImageMs + cudaMs; - if (totalMs > 50.0) { + if (totalMs > 500.0) { auto msg = std::format("SLOW FRAME: total={:.1f}ms (getImage={:.1f}ms cuda={:.1f}ms) {}x{}", totalMs, getImageMs, cudaMs, width, height); (*Handle)->_logger.LogWarn("GetRTSPCVImage", msg, __FILE__, __LINE__); @@ -1452,6 +1453,18 @@ extern "C" __declspec(dllexport) void SetRTSPDisplayResolution(ANSCENTER::ANSRTS (*Handle)->SetDisplayResolution(width, height); } catch (...) { } } +extern "C" __declspec(dllexport) void SetRTSPTargetFPS(ANSCENTER::ANSRTSPClient** Handle, double intervalMs) { + if (Handle == nullptr || *Handle == nullptr) return; + try { + (*Handle)->SetTargetFPS(intervalMs); // 0=no limit, 100=~10FPS, 200=~5FPS + } catch (...) { } +} +extern "C" __declspec(dllexport) void SetRTSPNV12FastPath(ANSCENTER::ANSRTSPClient** Handle, int enable) { + if (Handle == nullptr || *Handle == nullptr) return; + try { + (*Handle)->SetNV12FastPath(enable != 0); // 0=original CPU path (stable), 1=NV12 GPU fast path + } catch (...) 
{ } +} extern "C" __declspec(dllexport) int SetCropFlagRTSP(ANSCENTER::ANSRTSPClient** Handle, int cropFlag) { if (Handle == nullptr || *Handle == nullptr) return -1; try { diff --git a/modules/ANSCV/ANSRTSP.h b/modules/ANSCV/ANSRTSP.h index 904060a..5bc4bb5 100644 --- a/modules/ANSCV/ANSRTSP.h +++ b/modules/ANSCV/ANSRTSP.h @@ -38,6 +38,7 @@ namespace ANSCENTER int _imageWidth,_imageHeight; int64_t _pts; bool _isPlaying; + bool _useNV12FastPath = false; // false = original stable CPU path, true = NV12 GPU fast path std::recursive_mutex _mutex; // --- Per-client inference guard --- @@ -102,6 +103,9 @@ namespace ANSCENTER int GetHWDecodingGpuIndex(); void SetDisplayResolution(int width, int height); // Set display output size; 0,0 = original (no resize) void SetImageQuality(int mode); // 0=fast (AI), 1=quality (display) + void SetTargetFPS(double intervalMs); // Set min interval between processed frames in ms (0 = no limit, 100 = ~10 FPS, 200 = ~5 FPS) + void SetNV12FastPath(bool enable); // true = NV12 GPU fast path (zero-copy inference), false = original CPU path (stable) + bool IsNV12FastPath() const { return _useNV12FastPath; } AVFrame* GetNV12Frame(); // Returns cloned NV12 frame for GPU fast-path (caller must av_frame_free) AVFrame* GetCudaHWFrame(); // Returns CUDA HW frame (device ptrs) for zero-copy inference bool IsCudaHWAccel(); // true when decoder uses CUDA (NV12 stays in GPU VRAM) @@ -139,4 +143,6 @@ extern "C" __declspec(dllexport) int IsRTSPHWDecodingActive(ANSCENTER::ANSRTSPC extern "C" __declspec(dllexport) int GetRTSPHWDecodingGpuIndex(ANSCENTER::ANSRTSPClient** Handle); extern "C" __declspec(dllexport) void SetRTSPImageQuality(ANSCENTER::ANSRTSPClient** Handle, int mode); extern "C" __declspec(dllexport) void SetRTSPDisplayResolution(ANSCENTER::ANSRTSPClient** Handle, int width, int height); +extern "C" __declspec(dllexport) void SetRTSPTargetFPS(ANSCENTER::ANSRTSPClient** Handle, double intervalMs); +extern "C" __declspec(dllexport) void 
SetRTSPNV12FastPath(ANSCENTER::ANSRTSPClient** Handle, int enable); #endif \ No newline at end of file diff --git a/modules/ANSCV/ANSSRT.cpp b/modules/ANSCV/ANSSRT.cpp index 8c0ea25..be14595 100644 --- a/modules/ANSCV/ANSSRT.cpp +++ b/modules/ANSCV/ANSSRT.cpp @@ -652,6 +652,14 @@ namespace ANSCENTER { std::lock_guard lock(_mutex); _playerClient->setImageQuality(mode); // 0=fast (AI), 1=quality (display) } + void ANSSRTClient::SetTargetFPS(double intervalMs) { + std::lock_guard lock(_mutex); + _playerClient->setTargetFPS(intervalMs); // 0=no limit, 100=~10FPS, 200=~5FPS + } + void ANSSRTClient::SetNV12FastPath(bool enable) { + std::lock_guard lock(_mutex); + _useNV12FastPath = enable; + } AVFrame* ANSSRTClient::GetNV12Frame() { std::lock_guard lock(_mutex); return _playerClient->getNV12Frame(); // Returns clone, caller must av_frame_free @@ -809,20 +817,18 @@ extern "C" __declspec(dllexport) int GetSRTCVImage(ANSCENTER::ANSSRTClient** Han // Thread-safe Mat pointer swap (anscv_mat_replace has its own internal lock) anscv_mat_replace(image, std::move(img)); - // Attach NV12 frame for GPU fast-path inference (side-table registry) - // attach() takes ownership — do NOT av_frame_free here - int gpuIdx = (*Handle)->GetHWDecodingGpuIndex(); - AVFrame* cudaHW = (*Handle)->GetCudaHWFrame(); - if (cudaHW) { - // CUDA zero-copy: frame data[0]/data[1] are CUDA device pointers. - // Also attach CPU NV12 as fallback for cross-GPU inference - // (when decode GPU != inference GPU, CUDA ptrs aren't accessible). 
- AVFrame* cpuNV12 = (*Handle)->GetNV12Frame(); - gpu_frame_attach_cuda(*image, cudaHW, gpuIdx, timeStamp, cpuNV12); - } else { - AVFrame* nv12 = (*Handle)->GetNV12Frame(); - if (nv12) { - gpu_frame_attach(*image, nv12, gpuIdx, timeStamp); + // NV12 GPU fast path (optional — disabled by default for stability) + if ((*Handle)->IsNV12FastPath()) { + int gpuIdx = (*Handle)->GetHWDecodingGpuIndex(); + AVFrame* cudaHW = (*Handle)->GetCudaHWFrame(); + if (cudaHW) { + AVFrame* cpuNV12 = (*Handle)->GetNV12Frame(); + gpu_frame_attach_cuda(*image, cudaHW, gpuIdx, timeStamp, cpuNV12); + } else { + AVFrame* nv12 = (*Handle)->GetNV12Frame(); + if (nv12) { + gpu_frame_attach(*image, nv12, gpuIdx, timeStamp); + } } } @@ -994,6 +1000,18 @@ extern "C" __declspec(dllexport) void SetSRTDisplayResolution(ANSCENTER::ANSSRTC (*Handle)->SetDisplayResolution(width, height); } catch (...) { } } +extern "C" __declspec(dllexport) void SetSRTTargetFPS(ANSCENTER::ANSSRTClient** Handle, double intervalMs) { + if (Handle == nullptr || *Handle == nullptr) return; + try { + (*Handle)->SetTargetFPS(intervalMs); + } catch (...) { } +} +extern "C" __declspec(dllexport) void SetSRTNV12FastPath(ANSCENTER::ANSSRTClient** Handle, int enable) { + if (Handle == nullptr || *Handle == nullptr) return; + try { + (*Handle)->SetNV12FastPath(enable != 0); + } catch (...) 
{ } +} // ============================================================================ // V2 entry points: accept uint64_t handleVal by value instead of Handle** diff --git a/modules/ANSCV/ANSSRT.h b/modules/ANSCV/ANSSRT.h index 21339c2..82d6aa0 100644 --- a/modules/ANSCV/ANSSRT.h +++ b/modules/ANSCV/ANSSRT.h @@ -35,6 +35,7 @@ namespace ANSCENTER int _imageWidth, _imageHeight; int64_t _pts; bool _isPlaying; + bool _useNV12FastPath = false; std::recursive_mutex _mutex; public: ANSSRTClient(); @@ -70,6 +71,9 @@ namespace ANSCENTER int GetHWDecodingGpuIndex(); void SetDisplayResolution(int width, int height); // Set display output size; 0,0 = original (no resize) void SetImageQuality(int mode); // 0=fast (AI), 1=quality (display) + void SetTargetFPS(double intervalMs); // Set min interval between processed frames in ms (0 = no limit, 100 = ~10 FPS, 200 = ~5 FPS) + void SetNV12FastPath(bool enable); // true = NV12 GPU fast path, false = original CPU path (stable) + bool IsNV12FastPath() const { return _useNV12FastPath; } AVFrame* GetNV12Frame(); // Returns cloned NV12 frame for GPU fast-path (caller must av_frame_free) AVFrame* GetCudaHWFrame(); // Returns CUDA HW frame (device ptrs) for zero-copy inference bool IsCudaHWAccel(); // true when decoder uses CUDA (NV12 stays in GPU VRAM) @@ -107,4 +111,6 @@ extern "C" __declspec(dllexport) int IsSRTHWDecodingActive(ANSCENTER::ANSSRTCli extern "C" __declspec(dllexport) int GetSRTHWDecodingGpuIndex(ANSCENTER::ANSSRTClient** Handle); extern "C" __declspec(dllexport) void SetSRTImageQuality(ANSCENTER::ANSSRTClient** Handle, int mode); extern "C" __declspec(dllexport) void SetSRTDisplayResolution(ANSCENTER::ANSSRTClient** Handle, int width, int height); +extern "C" __declspec(dllexport) void SetSRTTargetFPS(ANSCENTER::ANSSRTClient** Handle, double intervalMs); +extern "C" __declspec(dllexport) void SetSRTNV12FastPath(ANSCENTER::ANSSRTClient** Handle, int enable); #endif \ No newline at end of file diff --git 
a/modules/ANSCV/GpuNV12SlotPool.cpp b/modules/ANSCV/GpuNV12SlotPool.cpp index dadbdd6..71f5f65 100644 --- a/modules/ANSCV/GpuNV12SlotPool.cpp +++ b/modules/ANSCV/GpuNV12SlotPool.cpp @@ -23,6 +23,7 @@ GpuNV12SlotPool* GpuNV12SlotPool_GetInstance() { } // Transition all COOLING slots past the cooldown threshold to FREE. +// Collects pending AVFrames for the caller to av_frame_free. void GpuNV12SlotPool::drainCooledSlots_locked() { auto now = std::chrono::steady_clock::now(); auto threshold = std::chrono::milliseconds(SLOT_COOLDOWN_MS); @@ -67,7 +68,7 @@ GpuNV12Slot* GpuNV12SlotPool::acquire(int gpuIdx, int w, int h) { return nullptr; } - // Allocate CUDA buffers on the target GPU + // Allocate CUDA buffers + stream + event on the target GPU int prevDev = -1; cudaGetDevice(&prevDev); if (gpuIdx >= 0) cudaSetDevice(gpuIdx); @@ -76,10 +77,7 @@ GpuNV12Slot* GpuNV12SlotPool::acquire(int gpuIdx, int w, int h) { cudaError_t e1 = cudaMallocPitch(&slot->bufY, &slot->pitchY, w, h); cudaError_t e2 = cudaMallocPitch(&slot->bufUV, &slot->pitchUV, w, h / 2); - // Non-blocking stream avoids NULL-stream implicit sync with inference. - // On WDDM, the NULL stream must wait for ALL other streams to finish - // before executing — this caused 1-2 second stalls when inference - // kernels were running. A non-blocking stream runs independently. + // Non-blocking stream: avoids NULL-stream implicit sync with inference. 
cudaStream_t stream = nullptr; cudaError_t e3 = cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking); @@ -88,7 +86,6 @@ GpuNV12Slot* GpuNV12SlotPool::acquire(int gpuIdx, int w, int h) { if (e1 != cudaSuccess || e2 != cudaSuccess) { NV12POOL_DBG("acquire: cudaMallocPitch FAILED %dx%d gpu=%d e1=%d e2=%d", w, h, gpuIdx, (int)e1, (int)e2); - // Clean up partial allocation int prev2 = -1; cudaGetDevice(&prev2); if (gpuIdx >= 0) cudaSetDevice(gpuIdx); if (e1 == cudaSuccess && slot->bufY) cudaFree(slot->bufY); @@ -107,21 +104,18 @@ GpuNV12Slot* GpuNV12SlotPool::acquire(int gpuIdx, int w, int h) { GpuNV12Slot* raw = slot.get(); m_slots.push_back(std::move(slot)); - // Always log new slot allocation to DebugView (rare event — once per resolution per camera). + // Always log new slot allocation to DebugView (rare event). { char _buf[256]; snprintf(_buf, sizeof(_buf), "[NV12Pool] NEW slot #%zu: %dx%d gpu=%d Y=%p UV=%p pitchY=%zu stream=%p\n", - m_slots.size(), w, h, gpuIdx, raw->bufY, raw->bufUV, raw->pitchY, raw->copyStream); + m_slots.size(), w, h, gpuIdx, raw->bufY, raw->bufUV, raw->pitchY, + raw->copyStream); #ifdef _WIN32 OutputDebugStringA(_buf); #endif fprintf(stderr, "%s", _buf); } - // Also log POOL FULL to DebugView (important diagnostic). - NV12POOL_DBG("acquire: NEW slot Y=%p UV=%p pitchY=%zu pitchUV=%zu %dx%d gpu=%d stream=%p (total=%zu)", - raw->bufY, raw->bufUV, raw->pitchY, raw->pitchUV, - w, h, gpuIdx, raw->copyStream, m_slots.size()); return raw; } diff --git a/modules/ANSODEngine/NV12PreprocessHelper.cpp b/modules/ANSODEngine/NV12PreprocessHelper.cpp index 7724a27..13be033 100644 --- a/modules/ANSODEngine/NV12PreprocessHelper.cpp +++ b/modules/ANSODEngine/NV12PreprocessHelper.cpp @@ -269,6 +269,15 @@ namespace ANSCENTER { return result; } + // Ensure async D2D copy (NVDEC → pool buffer) has completed before + // reading yPlane/uvPlane. The copy was queued in gpu_frame_attach_cuda() + // on a non-blocking stream. 
By the time inference runs (~50-200ms later), + // the copy (~0.3ms) has long finished, so this sync returns immediately. + if (gpuData->d2dCopyStream) { + cudaStreamSynchronize(static_cast(gpuData->d2dCopyStream)); + gpuData->d2dCopyStream = nullptr; // Only sync once per frame + } + const bool isCudaDevice = gpuData->isCudaDevicePtr; const bool gpuMatch = !isCudaDevice || gpuData->gpuIndex < 0 || @@ -367,7 +376,6 @@ namespace ANSCENTER { cv::cuda::GpuMat gpuY, gpuUV; if (useZeroCopy) { - // CUDA zero-copy: wrap pool buffer device pointers directly gpuY = cv::cuda::GpuMat(frameH, frameW, CV_8UC1, effYPlane, static_cast(effYLinesize)); gpuUV = cv::cuda::GpuMat(frameH / 2, frameW, CV_8UC1, @@ -641,6 +649,12 @@ namespace ANSCENTER { return result; } + // Ensure async D2D copy has completed before reading NV12 buffers + if (gpuData->d2dCopyStream) { + cudaStreamSynchronize(static_cast(gpuData->d2dCopyStream)); + gpuData->d2dCopyStream = nullptr; + } + const bool isCudaDevice = gpuData->isCudaDevicePtr; const bool gpuMatch = !isCudaDevice || gpuData->gpuIndex < 0 || @@ -775,6 +789,12 @@ namespace ANSCENTER { if (!gpuData->isCudaDevicePtr || !gpuData->yPlane || !gpuData->uvPlane) return result; // NV12 not on GPU + // Ensure async D2D copy has completed before reading NV12 buffers + if (gpuData->d2dCopyStream) { + cudaStreamSynchronize(static_cast(gpuData->d2dCopyStream)); + gpuData->d2dCopyStream = nullptr; + } + const int frameW = gpuData->width; const int frameH = gpuData->height; @@ -890,6 +910,12 @@ namespace ANSCENTER { return result; } + // Ensure async D2D copy has completed before reading NV12 buffers + if (gpuData->d2dCopyStream) { + cudaStreamSynchronize(static_cast(gpuData->d2dCopyStream)); + gpuData->d2dCopyStream = nullptr; + } + const bool isCudaDevice = gpuData->isCudaDevicePtr; const bool gpuMatch = !isCudaDevice || gpuData->gpuIndex < 0 ||