Disable NV12 path for ANSCV by default. Currently uses cv::Mat** directly

2026-04-04 10:09:47 +11:00
parent 445abefebe
commit 3a21026790
19 changed files with 575 additions and 232 deletions
--- a/modules/ANSCV/ANSRTSP.cpp
+++ b/modules/ANSCV/ANSRTSP.cpp
@@ -213,44 +213,44 @@ namespace ANSCENTER {
    bool ANSRTSPClient::Reconnect() {
        // 1. Mark as not-playing under the mutex FIRST.  This makes GetImage()
        //    return the cached _pLastFrame instead of calling into the player,
-        //    and blocks new TryIncrementInFlight calls.
+        //    and blocks new TryIncrementInFlight calls (no new NV12 attachments).
        {
            std::unique_lock<std::recursive_mutex> lock(_mutex);
            _isPlaying = false;

-            // --- Inference guard: wait for in-flight D2D copies to finish ---
-            // With synchronous D2D copy, in-flight means "currently inside
-            // GetRTSPCVImage between TryIncrementInFlight and attach_cuda".
-            // This is typically <1ms, so the wait is very fast.
+            // --- Inference guard: wait for ALL in-flight inference to finish ---
+            // _inFlightFrames tracks frames from GetRTSPCVImage through to the
+            // end of inference (DecrementInFlight fires when last clone is released).
+            // We MUST wait for this to reach 0 before calling close(), because
+            // inference may still be reading NV12 pool buffer data that depends
+            // on the NVDEC decoder context being alive.
+            //
+            // DO NOT force-reset _inFlightFrames or invalidate onReleaseFn —
+            // let inference finish naturally so DecrementInFlight fires correctly.
            int inFlight = _inFlightFrames.load(std::memory_order_acquire);
            if (inFlight > 0) {
                _logger.LogInfo("ANSRTSPClient::Reconnect",
-                    std::format("waiting for {} in-flight frame(s)...", inFlight),
+                    std::format("waiting for {} in-flight inference(s) to complete...", inFlight),
                    __FILE__, __LINE__);
-                bool done = _inFlightDone.wait_for(lock, std::chrono::seconds(5), [this] {
+                bool done = _inFlightDone.wait_for(lock, std::chrono::seconds(10), [this] {
                    return _inFlightFrames.load(std::memory_order_acquire) <= 0;
                });
                if (!done) {
                    _logger.LogWarn("ANSRTSPClient::Reconnect",
-                        std::format("timed out — still {} in-flight", _inFlightFrames.load()),
+                        std::format("timed out — still {} in-flight, proceeding with close()",
+                                    _inFlightFrames.load()),
                        __FILE__, __LINE__);
+                    // Force-reset only on timeout as last resort
+                    ANSGpuFrameRegistry::instance().invalidateOwner(this);
+                    _inFlightFrames.store(0, std::memory_order_release);
                }
            }
-
-            // Invalidate owner callbacks — prevents stale DecrementInFlight
-            // calls after Reconnect re-creates the decoder.
-            // Frames and their global pool slots remain alive for inference.
-            ANSGpuFrameRegistry::instance().invalidateOwner(this);
-            _inFlightFrames.store(0, std::memory_order_release);
-
-            // NO forceReleaseByOwner — frames survive reconnect.
-            // NO cudaDeviceSynchronize — no GPU buffers to free.
-            // NO DestroyGpuPool — per-camera pool has been removed.
        }

        // 2. close() destroys NVDEC decoder ONLY — run outside _mutex to
-        //    avoid deadlocking with nvcuda64 SRW lock held by inference.
-        //    Pool slot buffers are global and untouched.
+        //    avoid deadlocking with nvcuda64 SRW lock held by other cameras.
+        //    At this point, all inference using this camera's NV12 data has
+        //    completed (or timed out), so close() is safe.
        _logger.LogInfo("ANSRTSPClient::Reconnect",
            "calling close() — NVDEC decoder will be destroyed", __FILE__, __LINE__);
        RTSP_DBG("[Reconnect] BEFORE close() this=%p", (void*)this);
@@ -883,6 +883,14 @@ namespace ANSCENTER {
        std::lock_guard<std::recursive_mutex> lock(_mutex);
        _playerClient->setImageQuality(mode);  // 0=fast (AI), 1=quality (display)
    }
+    void ANSRTSPClient::SetTargetFPS(double intervalMs) {
+        std::lock_guard<std::recursive_mutex> lock(_mutex);
+        _playerClient->setTargetFPS(intervalMs);  // 0=no limit, 100=~10FPS, 200=~5FPS
+    }
+    void ANSRTSPClient::SetNV12FastPath(bool enable) {
+        std::lock_guard<std::recursive_mutex> lock(_mutex);
+        _useNV12FastPath = enable;
+    }
    AVFrame* ANSRTSPClient::GetNV12Frame() {
        std::lock_guard<std::recursive_mutex> lock(_mutex);
        if (!_isPlaying) return nullptr;  // Player may be mid-reconnect (CUDA resources freed)
@@ -1045,67 +1053,60 @@ extern "C" __declspec(dllexport) int GetRTSPCVImage(

        auto t1 = std::chrono::steady_clock::now();

-        // Attach NV12 frame for GPU fast-path inference (side-table registry)
-        // attach() takes ownership — do NOT av_frame_free here
-        //
-        // CRITICAL: TryIncrementInFlight() MUST be called BEFORE GetCudaHWFrame().
-        // It atomically checks _isPlaying and increments _inFlightFrames under
-        // the same mutex, so Reconnect() cannot call close() while we're doing
-        // the D2D copy from NVDEC surfaces inside gpu_frame_attach_cuda().
-        int gpuIdx = (*Handle)->GetHWDecodingGpuIndex();
-        bool inFlightGuardHeld = (*Handle)->TryIncrementInFlight();
-        RTSP_DBG("[GetRTSPCVImage] mat=%p gpuIdx=%d inFlightGuard=%d",
-                (void*)*image, gpuIdx, (int)inFlightGuardHeld);
+        // NV12 GPU fast path: attach NV12 frame data for zero-copy inference.
+        // When disabled (_useNV12FastPath=false), the original stable CPU path is used:
+        //   GetImage() returns BGR cv::Mat in CPU RAM → no CUDA calls → no SRW lock contention.
+        // When enabled, D2D copies NV12 from NVDEC to pool buffers for GPU inference.
+        if ((*Handle)->IsNV12FastPath()) {
+            int gpuIdx = (*Handle)->GetHWDecodingGpuIndex();
+            bool inFlightGuardHeld = (*Handle)->TryIncrementInFlight();
+            RTSP_DBG("[GetRTSPCVImage] mat=%p gpuIdx=%d inFlightGuard=%d",
+                    (void*)*image, gpuIdx, (int)inFlightGuardHeld);

-        if (inFlightGuardHeld) {
-            AVFrame* cudaHW = (*Handle)->GetCudaHWFrame();
-            if (cudaHW) {
-                RTSP_DBG("[GetRTSPCVImage] cudaHW: %dx%d data[0]=%p data[1]=%p",
-                        cudaHW->width, cudaHW->height,
-                        (void*)cudaHW->data[0], (void*)cudaHW->data[1]);
+            if (inFlightGuardHeld) {
+                AVFrame* cudaHW = (*Handle)->GetCudaHWFrame();
+                if (cudaHW) {
+                    RTSP_DBG("[GetRTSPCVImage] cudaHW: %dx%d data[0]=%p data[1]=%p",
+                            cudaHW->width, cudaHW->height,
+                            (void*)cudaHW->data[0], (void*)cudaHW->data[1]);

-                // Acquire a slot from the global pool — survives camera Destroy.
-                GpuNV12Slot* slot = GpuNV12SlotPool::instance().acquire(
-                    gpuIdx, cudaHW->width, cudaHW->height);
+                    // Acquire a slot from the global pool — survives camera Destroy.
+                    GpuNV12Slot* slot = GpuNV12SlotPool::instance().acquire(
+                        gpuIdx, cudaHW->width, cudaHW->height);

-                // Only fetch CPU NV12 if pool slot unavailable (cross-GPU fallback).
-                // When slot is valid, the D2D copy goes GPU→GPU and CPU NV12 is never used.
-                // Skipping av_frame_clone + av_frame_free saves ~0.1ms per frame.
-                AVFrame* cpuNV12 = slot ? nullptr : (*Handle)->GetNV12Frame();
-                gpu_frame_attach_cuda(*image, cudaHW, gpuIdx, timeStamp, cpuNV12, slot);
-            } else {
-                // HW decode not active — try CPU NV12
-                AVFrame* nv12 = (*Handle)->GetNV12Frame();
-                if (nv12) {
-                    gpu_frame_attach(*image, nv12, gpuIdx, timeStamp);
+                    // Only fetch CPU NV12 if pool slot unavailable (cross-GPU fallback).
+                    AVFrame* cpuNV12 = slot ? nullptr : (*Handle)->GetNV12Frame();
+                    gpu_frame_attach_cuda(*image, cudaHW, gpuIdx, timeStamp, cpuNV12, slot);
+                } else {
+                    // HW decode not active — try CPU NV12
+                    AVFrame* nv12 = (*Handle)->GetNV12Frame();
+                    if (nv12) {
+                        gpu_frame_attach(*image, nv12, gpuIdx, timeStamp);
+                    }
                }
-            }

-            // Wire up the registry callback to release the in-flight guard.
-            // TryIncrementInFlight already incremented; DecrementInFlight fires
-            // when the last clone of this frame is released after inference.
-            auto* gpuData = ANSGpuFrameRegistry::instance().lookup(*image);
-            RTSP_DBG("[GetRTSPCVImage] after attach: gpuData=%p yPlane=%p isCuda=%d poolSlot=%p",
-                    (void*)gpuData,
-                    gpuData ? (void*)gpuData->yPlane : nullptr,
-                    gpuData ? (int)gpuData->isCudaDevicePtr : -1,
-                    gpuData ? (void*)gpuData->poolSlot : nullptr);
-            if (gpuData) {
-                gpuData->ownerClient = *Handle;
-                gpuData->onReleaseFn = [](void* client) {
-                    static_cast<ANSCENTER::ANSRTSPClient*>(client)->DecrementInFlight();
-                };
-                // NOTE: Do NOT call IncrementInFlight() again here —
-                // TryIncrementInFlight() already did it above.
+                // Wire up the registry callback to release the in-flight guard.
+                auto* gpuData = ANSGpuFrameRegistry::instance().lookup(*image);
+                RTSP_DBG("[GetRTSPCVImage] after attach: gpuData=%p yPlane=%p isCuda=%d poolSlot=%p",
+                        (void*)gpuData,
+                        gpuData ? (void*)gpuData->yPlane : nullptr,
+                        gpuData ? (int)gpuData->isCudaDevicePtr : -1,
+                        gpuData ? (void*)gpuData->poolSlot : nullptr);
+                if (gpuData) {
+                    gpuData->ownerClient = *Handle;
+                    gpuData->onReleaseFn = [](void* client) {
+                        static_cast<ANSCENTER::ANSRTSPClient*>(client)->DecrementInFlight();
+                    };
+                } else {
+                    (*Handle)->DecrementInFlight();
+                }
            } else {
-                // No gpuData registered (attach failed?) — release the guard
-                (*Handle)->DecrementInFlight();
+                RTSP_DBG("[GetRTSPCVImage] SKIP CUDA — player not playing (reconnecting?)");
            }
-        } else {
-            // Player is stopping/reconnecting — skip CUDA path entirely.
-            // GetImage() already returned a cached BGR frame, which is safe.
-            RTSP_DBG("[GetRTSPCVImage] SKIP CUDA — player not playing (reconnecting?)");
        }
+        // else: original CPU path — cv::Mat** contains BGR data in CPU RAM.
+        // No CUDA calls, no pool slots, no GPU frame registry.
+        // Inference uses cv::Mat directly (upload to GPU in engine).

        // Lightweight timing — logs only when frame grab + D2D exceeds 50ms.
        // Goes to both spdlog (console/file) AND OutputDebugString (DebugView)
@@ -1115,7 +1116,7 @@ extern "C" __declspec(dllexport) int GetRTSPCVImage(
        double getImageMs = std::chrono::duration<double, std::milli>(t1 - t0).count();
        double cudaMs     = std::chrono::duration<double, std::milli>(t2 - t1).count();
        double totalMs    = getImageMs + cudaMs;
-        if (totalMs > 50.0) {
+        if (totalMs > 500.0) {
            auto msg = std::format("SLOW FRAME: total={:.1f}ms (getImage={:.1f}ms cuda={:.1f}ms) {}x{}",
                    totalMs, getImageMs, cudaMs, width, height);
            (*Handle)->_logger.LogWarn("GetRTSPCVImage", msg, __FILE__, __LINE__);
@@ -1452,6 +1453,18 @@ extern "C" __declspec(dllexport) void SetRTSPDisplayResolution(ANSCENTER::ANSRTS
        (*Handle)->SetDisplayResolution(width, height);
    } catch (...) { }
 }
+extern "C" __declspec(dllexport) void SetRTSPTargetFPS(ANSCENTER::ANSRTSPClient** Handle, double intervalMs) {
+    if (Handle == nullptr || *Handle == nullptr) return;
+    try {
+        (*Handle)->SetTargetFPS(intervalMs);  // 0=no limit, 100=~10FPS, 200=~5FPS
+    } catch (...) { }
+}
+extern "C" __declspec(dllexport) void SetRTSPNV12FastPath(ANSCENTER::ANSRTSPClient** Handle, int enable) {
+    if (Handle == nullptr || *Handle == nullptr) return;
+    try {
+        (*Handle)->SetNV12FastPath(enable != 0);  // 0=original CPU path (stable), 1=NV12 GPU fast path
+    } catch (...) { }
+}
 extern "C" __declspec(dllexport) int SetCropFlagRTSP(ANSCENTER::ANSRTSPClient** Handle, int cropFlag) {
    if (Handle == nullptr || *Handle == nullptr) return -1;
    try {