Use software decoder by default

2026-04-04 20:19:54 +11:00
parent 3a21026790
commit e134ebdf15
24 changed files with 693 additions and 215 deletions
--- a/modules/ANSCV/ANSFLV.cpp
+++ b/modules/ANSCV/ANSFLV.cpp
@@ -218,44 +218,25 @@ namespace ANSCENTER {
    }

    bool ANSFLVClient::areImagesIdentical(const cv::Mat& img1, const cv::Mat& img2) {
-        // Quick size and type checks
-        if (img1.size() != img2.size() || img1.type() != img2.type()) {
-            return false;
-        }
+        // Use decoder frame age — returns "stale" only if no decoder output for 5+ seconds.
+        double ageMs = _playerClient->getLastFrameAgeMs();
+        if (ageMs > 5000.0) return true;   // Truly stale
+        if (ageMs > 0.0) return false;     // Decoder alive

-        // Handle empty images
-        if (img1.empty()) {
-            return img2.empty();
-        }
+        // Fallback for startup (no frame decoded yet)
+        if (img1.empty() && img2.empty()) return true;
+        if (img1.empty() || img2.empty()) return false;
+        if (img1.size() != img2.size() || img1.type() != img2.type()) return false;
+        if (img1.data == img2.data) return true;

        if (img1.isContinuous() && img2.isContinuous()) {
            const size_t totalBytes = img1.total() * img1.elemSize();
-
-            // Fast rejection: sample 5 positions across contiguous memory
-            const size_t quarter = totalBytes / 4;
-            const size_t half = totalBytes / 2;
-            const size_t threeQuarter = 3 * totalBytes / 4;
-
-            if (img1.data[0] != img2.data[0] ||
-                img1.data[quarter] != img2.data[quarter] ||
-                img1.data[half] != img2.data[half] ||
-                img1.data[threeQuarter] != img2.data[threeQuarter] ||
-                img1.data[totalBytes - 1] != img2.data[totalBytes - 1]) {
-                return false;
-            }
-
-            // Full comparison
            return std::memcmp(img1.data, img2.data, totalBytes) == 0;
        }
-
-        // Row-by-row comparison for non-continuous images (e.g., ROI sub-matrices)
        const size_t rowSize = img1.cols * img1.elemSize();
        for (int i = 0; i < img1.rows; i++) {
-            if (std::memcmp(img1.ptr(i), img2.ptr(i), rowSize) != 0) {
-                return false;
-            }
+            if (std::memcmp(img1.ptr(i), img2.ptr(i), rowSize) != 0) return false;
        }
-
        return true;
    }
    cv::Mat ANSFLVClient::GetImage(int& width, int& height, int64_t& pts) {
--- a/modules/ANSCV/ANSMJPEG.cpp
+++ b/modules/ANSCV/ANSMJPEG.cpp
@@ -208,44 +208,23 @@ namespace ANSCENTER {
    }

    bool ANSMJPEGClient::areImagesIdentical(const cv::Mat& img1, const cv::Mat& img2) {
-        // Quick size and type checks
-        if (img1.size() != img2.size() || img1.type() != img2.type()) {
-            return false;
-        }
+        double ageMs = _playerClient->getLastFrameAgeMs();
+        if (ageMs > 5000.0) return true;
+        if (ageMs > 0.0) return false;

-        // Handle empty images
-        if (img1.empty()) {
-            return img2.empty();
-        }
+        if (img1.empty() && img2.empty()) return true;
+        if (img1.empty() || img2.empty()) return false;
+        if (img1.size() != img2.size() || img1.type() != img2.type()) return false;
+        if (img1.data == img2.data) return true;

        if (img1.isContinuous() && img2.isContinuous()) {
            const size_t totalBytes = img1.total() * img1.elemSize();
-
-            // Fast rejection: sample 5 positions across contiguous memory
-            const size_t quarter = totalBytes / 4;
-            const size_t half = totalBytes / 2;
-            const size_t threeQuarter = 3 * totalBytes / 4;
-
-            if (img1.data[0] != img2.data[0] ||
-                img1.data[quarter] != img2.data[quarter] ||
-                img1.data[half] != img2.data[half] ||
-                img1.data[threeQuarter] != img2.data[threeQuarter] ||
-                img1.data[totalBytes - 1] != img2.data[totalBytes - 1]) {
-                return false;
-            }
-
-            // Full comparison
            return std::memcmp(img1.data, img2.data, totalBytes) == 0;
        }
-
-        // Row-by-row comparison for non-continuous images (e.g., ROI sub-matrices)
        const size_t rowSize = img1.cols * img1.elemSize();
        for (int i = 0; i < img1.rows; i++) {
-            if (std::memcmp(img1.ptr(i), img2.ptr(i), rowSize) != 0) {
-                return false;
-            }
+            if (std::memcmp(img1.ptr(i), img2.ptr(i), rowSize) != 0) return false;
        }
-
        return true;
    }
    cv::Mat ANSMJPEGClient::GetImage(int& width, int& height, int64_t& pts) {
--- a/modules/ANSCV/ANSRTMP.cpp
+++ b/modules/ANSCV/ANSRTMP.cpp
@@ -213,43 +213,22 @@ namespace ANSCENTER {
    }

    bool ANSRTMPClient::areImagesIdentical(const cv::Mat& img1, const cv::Mat& img2) {
-        // Quick size and type checks
-        if (img1.size() != img2.size() || img1.type() != img2.type()) {
-            return false;
-        }
+        double ageMs = _playerClient->getLastFrameAgeMs();
+        if (ageMs > 5000.0) return true;
+        if (ageMs > 0.0) return false;

-        // Handle empty images
-        if (img1.empty()) {
-            return img2.empty();
-        }
+        if (img1.empty() && img2.empty()) return true;
+        if (img1.empty() || img2.empty()) return false;
+        if (img1.size() != img2.size() || img1.type() != img2.type()) return false;
+        if (img1.data == img2.data) return true;

        if (img1.isContinuous() && img2.isContinuous()) {
            const size_t totalBytes = img1.total() * img1.elemSize();
-
-            // Fast rejection: sample 5 positions across contiguous memory
-            // Catches 99.99% of different frames immediately
-            const size_t quarter = totalBytes / 4;
-            const size_t half = totalBytes / 2;
-            const size_t threeQuarter = 3 * totalBytes / 4;
-
-            if (img1.data[0] != img2.data[0] ||
-                img1.data[quarter] != img2.data[quarter] ||
-                img1.data[half] != img2.data[half] ||
-                img1.data[threeQuarter] != img2.data[threeQuarter] ||
-                img1.data[totalBytes - 1] != img2.data[totalBytes - 1]) {
-                return false;
-            }
-
-            // Full comparison
            return std::memcmp(img1.data, img2.data, totalBytes) == 0;
        }
-
-        // Row-by-row comparison for non-continuous images (e.g., ROI sub-matrices)
        const size_t rowSize = img1.cols * img1.elemSize();
        for (int i = 0; i < img1.rows; i++) {
-            if (std::memcmp(img1.ptr(i), img2.ptr(i), rowSize) != 0) {
-                return false;
-            }
+            if (std::memcmp(img1.ptr(i), img2.ptr(i), rowSize) != 0) return false;
        }

        return true;
--- a/modules/ANSCV/ANSRTSP.cpp
+++ b/modules/ANSCV/ANSRTSP.cpp
@@ -2,7 +2,9 @@
 #include "ANSMatRegistry.h"
 #include "ANSGpuFrameOps.h"
 #include "GpuNV12SlotPool.h"
+#include "ANSLicense.h"       // ANS_DBG macro
 #include <memory>
+#include <chrono>
 #include <format>
 #include "media_codec.h"
 #include <cstdint>
@@ -69,6 +71,7 @@ namespace ANSCENTER {
    }

    void ANSRTSPClient::Destroy() {
+        ANS_DBG("RTSP_Lifecycle", "DESTROY called: url=%s playing=%d", _url.c_str(), (int)_isPlaying);
        // Move the player client pointer out of the lock scope, then
        // close it OUTSIDE the mutex.  close() calls cuArrayDestroy /
        // cuMemFree which acquire an EXCLUSIVE SRW lock inside nvcuda64.
@@ -126,6 +129,24 @@ namespace ANSCENTER {
        // belong to the global GpuNV12SlotPool, not the decoder.
        if (clientToClose) {
            clientToClose->close();
+
+            // Force CUDA runtime to release all cached memory from the destroyed
+            // NVDEC decoder.  Without this, cuMemFree returns memory to the CUDA
+            // driver's internal cache, and the next camera creation allocates fresh
+            // memory → VRAM grows by ~200-300MB per destroy/create cycle.
+            // cudaDeviceSynchronize ensures all pending GPU ops are done, then
+            // cudaMemPool trim releases the freed blocks back to the OS.
+            cudaDeviceSynchronize();
+            cudaMemPool_t memPool = nullptr;
+            int currentDev = 0;
+            cudaGetDevice(&currentDev);
+            if (cudaDeviceGetDefaultMemPool(&memPool, currentDev) == cudaSuccess && memPool) {
+                cudaMemPoolTrimTo(memPool, 0);  // Release all unused memory
+            }
+            size_t vramFree = 0, vramTotal = 0;
+            cudaMemGetInfo(&vramFree, &vramTotal);
+            ANS_DBG("RTSP_Destroy", "NVDEC closed + memPool trimmed GPU%d VRAM=%zuMB/%zuMB",
+                    currentDev, (vramTotal - vramFree) / (1024*1024), vramFree / (1024*1024));
        }
    }
    static void VerifyGlobalANSRTSPLicense(const std::string& licenseKey) {
@@ -211,6 +232,7 @@ namespace ANSCENTER {
        _playerClient->setCrop(crop);
    }
    bool ANSRTSPClient::Reconnect() {
+        ANS_DBG("RTSP_Lifecycle", "RECONNECT called: url=%s playing=%d", _url.c_str(), (int)_isPlaying);
        // 1. Mark as not-playing under the mutex FIRST.  This makes GetImage()
        //    return the cached _pLastFrame instead of calling into the player,
        //    and blocks new TryIncrementInFlight calls (no new NV12 attachments).
@@ -253,8 +275,30 @@ namespace ANSCENTER {
        //    completed (or timed out), so close() is safe.
        _logger.LogInfo("ANSRTSPClient::Reconnect",
            "calling close() — NVDEC decoder will be destroyed", __FILE__, __LINE__);
+        auto _rc0 = std::chrono::steady_clock::now();
        RTSP_DBG("[Reconnect] BEFORE close() this=%p", (void*)this);
        _playerClient->close();
+        auto _rc1 = std::chrono::steady_clock::now();
+
+        // Force CUDA runtime to release cached memory from the destroyed NVDEC decoder.
+        cudaDeviceSynchronize();
+        auto _rc2 = std::chrono::steady_clock::now();
+        cudaMemPool_t memPool = nullptr;
+        int currentDev = 0;
+        cudaGetDevice(&currentDev);
+        if (cudaDeviceGetDefaultMemPool(&memPool, currentDev) == cudaSuccess && memPool) {
+            cudaMemPoolTrimTo(memPool, 0);
+        }
+        auto _rc3 = std::chrono::steady_clock::now();
+        {
+            size_t vf = 0, vt = 0;
+            cudaMemGetInfo(&vf, &vt);
+            double closeMs = std::chrono::duration<double, std::milli>(_rc1 - _rc0).count();
+            double syncMs  = std::chrono::duration<double, std::milli>(_rc2 - _rc1).count();
+            double trimMs  = std::chrono::duration<double, std::milli>(_rc3 - _rc2).count();
+            ANS_DBG("RTSP_Reconnect", "close=%.1fms sync=%.1fms trim=%.1fms VRAM=%zuMB/%zuMB",
+                    closeMs, syncMs, trimMs, (vt - vf) / (1024*1024), vf / (1024*1024));
+        }
        RTSP_DBG("[Reconnect] AFTER close() this=%p", (void*)this);

        // 3. Re-setup and play under the mutex.
@@ -283,12 +327,9 @@ namespace ANSCENTER {
    }

     bool ANSRTSPClient::Stop() {
-        // Grab the player pointer and clear _isPlaying under the lock,
-        // then call stop() OUTSIDE the mutex.  stop() internally calls
-        // StopVideoDecoder -> decoder->flush() which does CUDA calls
-        // that can block on the nvcuda64 SRW lock.  Holding _mutex
-        // during that time blocks all other operations on this client
-        // and contributes to the convoy when many clients stop at once.
+        // Stop playback but keep the RTSP connection and NVDEC decoder alive.
+        // LabVIEW uses Stop/Start to pause cameras when no AI task is subscribed.
+        // The camera resumes instantly on Start() without re-connecting.
        CRtspPlayer* player = nullptr;
        {
            std::lock_guard<std::recursive_mutex> lock(_mutex);
@@ -300,6 +341,7 @@ namespace ANSCENTER {
        if (player) {
            player->stop();
        }
+        ANS_DBG("RTSP_Lifecycle", "STOP complete: handle=%p (connection kept alive)", (void*)this);
        return true;
    }
 	bool ANSRTSPClient::Pause() {
@@ -342,45 +384,44 @@ namespace ANSCENTER {
    }
    
    bool ANSRTSPClient::areImagesIdentical(const cv::Mat& img1, const cv::Mat& img2) {
-        // Quick size and type checks
-        if (img1.size() != img2.size() || img1.type() != img2.type()) {
-            return false;
+        double ageMs = _playerClient->getLastFrameAgeMs();
+
+        if (ageMs > 5000.0) {
+            ANS_DBG("RTSP_Stale", "FROZEN DETECTED: ageMs=%.1f url=%s playing=%d — camera truly stale",
+                    ageMs, _url.c_str(), (int)_isPlaying);
+            return true;   // Truly stale — no decoder output for 5+ seconds
+        }
+        if (ageMs > 0.0) {
+            return false;  // Decoder is receiving frames — camera is alive
        }

-        // Handle empty images
-        if (img1.empty()) {
-            return img2.empty();
-        }
+        // ageMs == 0 means no frame has been decoded yet (startup).
+        // Fall back to pixel comparison for backward compatibility.
+        if (img1.empty() && img2.empty()) return true;
+        if (img1.empty() || img2.empty()) return false;
+        if (img1.size() != img2.size() || img1.type() != img2.type()) return false;

+        // Same data pointer = same cv::Mat (shallow copy)
+        if (img1.data == img2.data) return true;
+
+        // Quick 4-point sampling (first byte, 1/4, 1/2, last byte) before the full memcmp
        if (img1.isContinuous() && img2.isContinuous()) {
            const size_t totalBytes = img1.total() * img1.elemSize();
-
-            // Fast rejection: sample 5 positions across contiguous memory
-            // Catches 99.99% of different frames immediately
            const size_t quarter = totalBytes / 4;
            const size_t half = totalBytes / 2;
-            const size_t threeQuarter = 3 * totalBytes / 4;
-
            if (img1.data[0] != img2.data[0] ||
                img1.data[quarter] != img2.data[quarter] ||
                img1.data[half] != img2.data[half] ||
-                img1.data[threeQuarter] != img2.data[threeQuarter] ||
                img1.data[totalBytes - 1] != img2.data[totalBytes - 1]) {
                return false;
            }
-
-            // Full comparison
            return std::memcmp(img1.data, img2.data, totalBytes) == 0;
        }

-        // Row-by-row comparison for non-continuous images (e.g., ROI sub-matrices)
        const size_t rowSize = img1.cols * img1.elemSize();
        for (int i = 0; i < img1.rows; i++) {
-            if (std::memcmp(img1.ptr(i), img2.ptr(i), rowSize) != 0) {
-                return false;
-            }
+            if (std::memcmp(img1.ptr(i), img2.ptr(i), rowSize) != 0) return false;
        }
-
        return true;
    }
    cv::Mat ANSRTSPClient::GetImage(int& width, int& height, int64_t& pts) {
@@ -414,6 +455,20 @@ namespace ANSCENTER {
        if (currentPts == _pts && !_pLastFrame.empty()) {
            width = _imageWidth;
            height = _imageHeight;
+            // Return timestamp based on decoder frame age so LabVIEW can distinguish
+            // "rate-limited duplicate" from "camera truly stale".
+            // If decoder is still receiving frames (0 < age < 5s), advance PTS so LabVIEW
+            // sees a changing timestamp and doesn't trigger false reconnect.
+            // If decoder is stale (age >= 5s), return same PTS so LabVIEW detects it.
+            double ageMs = _playerClient->getLastFrameAgeMs();
+            if (ageMs > 0.0 && ageMs < 5000.0) {
+                // Camera alive but rate-limited — advance PTS to prevent false stale detection
+                _pts++;
+            } else if (ageMs >= 5000.0) {
+                // Camera stale — keep same PTS so LabVIEW triggers reconnect
+                ANS_DBG("RTSP_GetImage", "STALE PTS: ageMs=%.1f pts=%lld url=%s — not advancing PTS",
+                        ageMs, (long long)_pts, _url.c_str());
+            }
            pts = _pts;
            return _pLastFrame;
        }
@@ -891,6 +946,10 @@ namespace ANSCENTER {
        std::lock_guard<std::recursive_mutex> lock(_mutex);
        _useNV12FastPath = enable;
    }
+    double ANSRTSPClient::GetLastFrameAgeMs() {
+        std::lock_guard<std::recursive_mutex> lock(_mutex);
+        return _playerClient->getLastFrameAgeMs();
+    }
    AVFrame* ANSRTSPClient::GetNV12Frame() {
        std::lock_guard<std::recursive_mutex> lock(_mutex);
        if (!_isPlaying) return nullptr;  // Player may be mid-reconnect (CUDA resources freed)
@@ -937,6 +996,7 @@ namespace ANSCENTER {
 }

 extern "C" __declspec(dllexport) int CreateANSRTSPHandle(ANSCENTER::ANSRTSPClient * *Handle, const char* licenseKey, const char* username, const char* password, const char* url) {
+    ANS_DBG("RTSP_Lifecycle", "CREATE: url=%s", url ? url : "null");
    if (!Handle || !licenseKey || !url) return -1;
    try {
        auto ptr = std::make_unique<ANSCENTER::ANSRTSPClient>();
@@ -946,11 +1006,10 @@ extern "C" __declspec(dllexport) int CreateANSRTSPHandle(ANSCENTER::ANSRTSPClien
        if (_username.empty() && _password.empty()) result = ptr->Init(licenseKey, url);
        else result = ptr->Init(licenseKey, username, password, url);
        if (result) {
-            // Default to CUDA/NVDEC HW decoding (mode 7) for NV12 zero-copy
-            // fast path.  LabVIEW may not call SetRTSPHWDecoding after
-            // destroy+recreate cycles, so this ensures the new handle always
-            // uses the GPU decode path instead of falling back to D3D11VA/CPU.
-            ptr->SetHWDecoding(7);  // HW_DECODING_CUDA
+            // Software decode by default — saves VRAM (no NVDEC DPB surfaces).
+            // With 100 cameras, HW decode would consume ~5-21 GB VRAM for idle decoders.
+            // User can enable HW decode per-camera via SetRTSPHWDecoding(handle, 7).
+            // ptr->SetHWDecoding(7);  // Disabled — was HW_DECODING_CUDA
            *Handle = ptr.release();
            extern void anscv_unregister_handle(void*);
            extern void anscv_register_handle(void*, void(*)(void*));
@@ -967,6 +1026,7 @@ extern "C" __declspec(dllexport) int CreateANSRTSPHandle(ANSCENTER::ANSRTSPClien
    } catch (...) { return -1; }
 }
 extern "C" __declspec(dllexport) int ReleaseANSRTSPHandle(ANSCENTER::ANSRTSPClient * *Handle) {
+    ANS_DBG("RTSP_Lifecycle", "RELEASE: handle=%p", Handle ? (void*)*Handle : nullptr);
    if (Handle == nullptr || *Handle == nullptr) return -1;
    try {
        extern void anscv_unregister_handle(void*);
@@ -982,25 +1042,27 @@ extern "C" __declspec(dllexport) int ReleaseANSRTSPHandle(ANSCENTER::ANSRTSPClie
        // on any subsequent call, and prevents NEW NV12 GPU surface
        // pointers from being handed out.
        //
-        // Do NOT call Destroy()/close() here — close() frees the
-        // NVDEC GPU surfaces (cuArrayDestroy/cuMemFree) which may
-        // still be in use by a CUDA inference kernel that received
-        // the NV12 pointer from a GetRTSPCVImage call that already
-        // completed before this Release was called.
+        // Synchronous cleanup — ensures all GPU resources (NVDEC surfaces, VRAM)
+        // are fully released BEFORE LabVIEW creates a new camera.
+        // Previously deferred to a background thread, but that caused the old
+        // camera's resources to overlap with the new camera's allocations,
+        // leading to temporary VRAM doubling (~240MB per camera) and eventual
+        // VRAM exhaustion on cameras with frequent reconnects.
        {
-            // Use the client's _mutex to safely set _isPlaying = false.
-            // This is the same lock GetImage/GetNV12Frame acquire.
-            raw->Stop();  // sets _isPlaying = false, stops playback
-        }
+            auto t0 = std::chrono::steady_clock::now();
+            raw->Stop();
+            auto t1 = std::chrono::steady_clock::now();
+            raw->Destroy();
+            auto t2 = std::chrono::steady_clock::now();
+            delete raw;
+            auto t3 = std::chrono::steady_clock::now();

-        // Defer the full cleanup (Destroy + delete) to a background thread
-        // so LabVIEW's UI thread is not blocked.  Destroy() now waits
-        // precisely for in-flight inference to finish (via _inFlightFrames
-        // counter + condition variable) instead of the old 500ms sleep hack.
-        std::thread([raw]() {
-            try { raw->Destroy(); } catch (...) {}
-            try { delete raw; } catch (...) {}
-        }).detach();
+            double stopMs = std::chrono::duration<double, std::milli>(t1 - t0).count();
+            double destroyMs = std::chrono::duration<double, std::milli>(t2 - t1).count();
+            double deleteMs = std::chrono::duration<double, std::milli>(t3 - t2).count();
+            ANS_DBG("RTSP_Lifecycle", "RELEASE complete: stop=%.1fms destroy=%.1fms delete=%.1fms total=%.1fms",
+                    stopMs, destroyMs, deleteMs, stopMs + destroyMs + deleteMs);
+        }

        return 0;
    } catch (...) {
@@ -1269,6 +1331,7 @@ extern "C" __declspec(dllexport) int GetRTSPImage(ANSCENTER::ANSRTSPClient** Han
 	}
 }
 extern "C" __declspec(dllexport) int StartRTSP(ANSCENTER::ANSRTSPClient **Handle) {
+    ANS_DBG("RTSP_Lifecycle", "START: handle=%p", Handle ? (void*)*Handle : nullptr);
    if (Handle == nullptr || *Handle == nullptr) return -1;
    try {
        bool result = (*Handle)->Start();
@@ -1301,6 +1364,7 @@ extern "C" __declspec(dllexport) int ReconnectRTSP(ANSCENTER::ANSRTSPClient * *H
 	}
 }
 extern "C" __declspec(dllexport) int StopRTSP(ANSCENTER::ANSRTSPClient * *Handle) {
+    ANS_DBG("RTSP_Lifecycle", "STOP: handle=%p", Handle ? (void*)*Handle : nullptr);
    if (Handle == nullptr || *Handle == nullptr) return -1;
    try {
        bool result = (*Handle)->Stop();
@@ -1462,9 +1526,15 @@ extern "C" __declspec(dllexport) void SetRTSPTargetFPS(ANSCENTER::ANSRTSPClient*
 extern "C" __declspec(dllexport) void SetRTSPNV12FastPath(ANSCENTER::ANSRTSPClient** Handle, int enable) {
    if (Handle == nullptr || *Handle == nullptr) return;
    try {
-        (*Handle)->SetNV12FastPath(enable != 0);  // 0=original CPU path (stable), 1=NV12 GPU fast path
+        (*Handle)->SetNV12FastPath(enable != 0);
    } catch (...) { }
 }
+extern "C" __declspec(dllexport) double GetRTSPLastFrameAgeMs(ANSCENTER::ANSRTSPClient** Handle) {
+    if (Handle == nullptr || *Handle == nullptr) return -1.0;
+    try {
+        return (*Handle)->GetLastFrameAgeMs();
+    } catch (...) { return -1.0; }
+}
 extern "C" __declspec(dllexport) int SetCropFlagRTSP(ANSCENTER::ANSRTSPClient** Handle, int cropFlag) {
    if (Handle == nullptr || *Handle == nullptr) return -1;
    try {
--- a/modules/ANSCV/ANSRTSP.h
+++ b/modules/ANSCV/ANSRTSP.h
@@ -106,6 +106,7 @@ namespace ANSCENTER
 		void SetTargetFPS(double intervalMs);  // Set min interval between processed frames in ms (0 = no limit, 100 = ~10 FPS, 200 = ~5 FPS)
 		void SetNV12FastPath(bool enable);     // true = NV12 GPU fast path (zero-copy inference), false = original CPU path (stable)
 		bool IsNV12FastPath() const { return _useNV12FastPath; }
+		double GetLastFrameAgeMs();            // Milliseconds since last frame from decoder (detects truly stale cameras, unaffected by SetTargetFPS)
 		AVFrame* GetNV12Frame();  // Returns cloned NV12 frame for GPU fast-path (caller must av_frame_free)
 		AVFrame* GetCudaHWFrame(); // Returns CUDA HW frame (device ptrs) for zero-copy inference
 		bool     IsCudaHWAccel();  // true when decoder uses CUDA (NV12 stays in GPU VRAM)
@@ -145,4 +146,5 @@ extern "C" __declspec(dllexport) void SetRTSPImageQuality(ANSCENTER::ANSRTSPClie
 extern "C" __declspec(dllexport) void SetRTSPDisplayResolution(ANSCENTER::ANSRTSPClient** Handle, int width, int height);
 extern "C" __declspec(dllexport) void SetRTSPTargetFPS(ANSCENTER::ANSRTSPClient** Handle, double intervalMs);
 extern "C" __declspec(dllexport) void SetRTSPNV12FastPath(ANSCENTER::ANSRTSPClient** Handle, int enable);
+extern "C" __declspec(dllexport) double GetRTSPLastFrameAgeMs(ANSCENTER::ANSRTSPClient** Handle);
 #endif
--- a/modules/ANSCV/ANSSRT.cpp
+++ b/modules/ANSCV/ANSSRT.cpp
@@ -221,43 +221,22 @@ namespace ANSCENTER {
    }

    bool ANSSRTClient::areImagesIdentical(const cv::Mat& img1, const cv::Mat& img2) {
-        // Quick size and type checks
-        if (img1.size() != img2.size() || img1.type() != img2.type()) {
-            return false;
-        }
+        double ageMs = _playerClient->getLastFrameAgeMs();
+        if (ageMs > 5000.0) return true;
+        if (ageMs > 0.0) return false;

-        // Handle empty images
-        if (img1.empty()) {
-            return img2.empty();
-        }
+        if (img1.empty() && img2.empty()) return true;
+        if (img1.empty() || img2.empty()) return false;
+        if (img1.size() != img2.size() || img1.type() != img2.type()) return false;
+        if (img1.data == img2.data) return true;

        if (img1.isContinuous() && img2.isContinuous()) {
            const size_t totalBytes = img1.total() * img1.elemSize();
-
-            // Fast rejection: sample 5 positions across contiguous memory
-            // Catches 99.99% of different frames immediately
-            const size_t quarter = totalBytes / 4;
-            const size_t half = totalBytes / 2;
-            const size_t threeQuarter = 3 * totalBytes / 4;
-
-            if (img1.data[0] != img2.data[0] ||
-                img1.data[quarter] != img2.data[quarter] ||
-                img1.data[half] != img2.data[half] ||
-                img1.data[threeQuarter] != img2.data[threeQuarter] ||
-                img1.data[totalBytes - 1] != img2.data[totalBytes - 1]) {
-                return false;
-            }
-
-            // Full comparison
            return std::memcmp(img1.data, img2.data, totalBytes) == 0;
        }
-
-        // Row-by-row comparison for non-continuous images (e.g., ROI sub-matrices)
        const size_t rowSize = img1.cols * img1.elemSize();
        for (int i = 0; i < img1.rows; i++) {
-            if (std::memcmp(img1.ptr(i), img2.ptr(i), rowSize) != 0) {
-                return false;
-            }
+            if (std::memcmp(img1.ptr(i), img2.ptr(i), rowSize) != 0) return false;
        }

        return true;
--- a/modules/ANSCV/ANSVideoPlayer.cpp
+++ b/modules/ANSCV/ANSVideoPlayer.cpp
@@ -136,7 +136,7 @@ namespace ANSCENTER {
 		if (!_hwDecodeActive && !_hwPlayer) {
 			try {
 				auto hwp = std::make_unique<CFilePlayer>();
-				hwp->setHWDecoding(HW_DECODING_AUTO);  // CUDA → D3D11VA → DXVA2 → software
+				hwp->setHWDecoding(HW_DECODING_DISABLE);  // Software decode by default — saves VRAM
 				if (hwp->open(_url)) {
 					_hwPlayer = std::move(hwp);
 					_hwDecodeActive = true;
--- a/modules/ANSCV/VideoPlayer.cpp
+++ b/modules/ANSCV/VideoPlayer.cpp
@@ -93,7 +93,7 @@ CVideoPlayer::CVideoPlayer():
    , m_bPaused(FALSE)
    , m_bSizeChanged(FALSE)
    //, m_nRenderMode(RENDER_MODE_KEEP)
-    , m_nHWDecoding(HW_DECODING_AUTO)
+    , m_nHWDecoding(HW_DECODING_DISABLE)  // Software decode by default — saves VRAM
    , m_nDstVideoFmt(AV_PIX_FMT_YUV420P)
    , m_bUpdown(FALSE)
    , m_bSnapshot(FALSE)
--- a/modules/ANSODEngine/ANSODEngine.cpp
+++ b/modules/ANSODEngine/ANSODEngine.cpp
@@ -3,6 +3,7 @@
 #include <cmath>
 #include <json.hpp>
 #include "ANSODEngine.h"
+#include "ANSLicense.h"   // ANS_DBG macro
 #include "ANSYOLOOD.h"
 #include "ANSTENSORRTOD.h"
 #include "ANSTENSORRTCL.h"
@@ -879,6 +880,9 @@ namespace ANSCENTER
 		std::vector<Object> allResults;
 		allResults.clear();
 		try {
+			ANS_DBG("ODEngine", "SAHI START: %dx%d tile=%dx%d overlap=%.1f cam=%s",
+					input.cols, input.rows, tiledWidth, tiledHeight, overLap, camera_id.c_str());
+			auto _sahiStart = std::chrono::steady_clock::now();
 			cv::Mat image = input.clone();
 			if (image.empty() || !image.data || !image.u) {
 				return allResults;
@@ -920,6 +924,16 @@ namespace ANSCENTER
 			//4. Apply Non-Maximum Suppression (NMS) to merge overlapping results
 			float iouThreshold = 0.1;
 			std::vector<Object> finalResults = ANSUtilityHelper::ApplyNMS(allResults, iouThreshold);
+			{
+				double _sahiMs = std::chrono::duration<double, std::milli>(
+					std::chrono::steady_clock::now() - _sahiStart).count();
+				ANS_DBG("ODEngine", "SAHI DONE: %.1fms patches=%zu results=%zu cam=%s",
+						_sahiMs, patches.size() + 1, finalResults.size(), camera_id.c_str());
+				if (_sahiMs > 2000.0) {
+					ANS_DBG("ODEngine", "SAHI SLOW: %.1fms — %zu patches held _mutex entire time!",
+							_sahiMs, patches.size() + 1);
+				}
+			}
 			image.release();
 			return finalResults;
 		}
@@ -2103,6 +2117,8 @@ namespace ANSCENTER
 		// No coarse _mutex — sub-components (engines, trackers) have their own locks.
 		// LabVIEW semaphore controls concurrency at the caller level.
 		try {
+			ANS_DBG("ODEngine", "RunInferenceWithOption: cam=%s %dx%d mode=%s",
+					camera_id.c_str(), input.cols, input.rows, activeROIMode.c_str());
 			int mode = 0;
 			double confidenceThreshold = 0.35;
 			std::vector<int> trackingObjectIds;
--- a/modules/ANSODEngine/ANSRTYOLO.cpp
+++ b/modules/ANSODEngine/ANSRTYOLO.cpp
@@ -1,5 +1,6 @@
 #include "ANSRTYOLO.h"
 #include "Utility.h"
+#include "ANSLicense.h"   // ANS_DBG macro for DebugView
 #include <future>
 #include <numeric>
 #include <cmath>
@@ -903,7 +904,6 @@ namespace ANSCENTER {
                    return {};
                }

-                // Check if model is classification (output ndims <= 2)
                const auto& outputDims = m_trtEngine->getOutputDims();
                const bool isClassification = !outputDims.empty() && outputDims[0].nbDims <= 2;

@@ -914,11 +914,8 @@ namespace ANSCENTER {
                cv::cuda::GpuMat resized;
                if (imgRGB.rows != inputH || imgRGB.cols != inputW) {
                    if (isClassification) {
-                        // Classification: direct resize (no letterbox padding)
-                        // Must use explicit stream to avoid conflict with CUDA Graph capture on null stream
                        cv::cuda::resize(imgRGB, resized, cv::Size(inputW, inputH), 0, 0, cv::INTER_LINEAR, stream);
                    } else {
-                        // Detection/Seg/Pose/OBB: letterbox resize + right-bottom pad
                        resized = Engine<float>::resizeKeepAspectRatioPadRightBottom(imgRGB, inputH, inputW);
                    }
                }
@@ -1831,8 +1828,7 @@ namespace ANSCENTER {
            }

            // --- 2. Preprocess under lock ---
-            // Try NV12 fast path first (12MB upload vs 24MB BGR for 4K)
-            // Falls back to standard GPU preprocessing if no NV12 data available.
+            ANS_DBG("YOLO", "Preprocess START %dx%d", inputImage.cols, inputImage.rows);
            ImageMetadata meta;
            std::vector<std::vector<cv::cuda::GpuMat>> input;
            bool usedNV12 = false;
@@ -1874,11 +1870,22 @@ namespace ANSCENTER {
            }

            // --- 3. TRT Inference (mutex released for concurrent GPU slots) ---
+            ANS_DBG("YOLO", "TRT inference START nv12=%d inputSize=%dx%d",
+                    (int)usedNV12,
+                    input.empty() ? 0 : (input[0].empty() ? 0 : input[0][0].cols),
+                    input.empty() ? 0 : (input[0].empty() ? 0 : input[0][0].rows));
+            auto _trtStart = std::chrono::steady_clock::now();
            std::vector<std::vector<std::vector<float>>> featureVectors;
            if (!m_trtEngine->runInference(input, featureVectors)) {
+                ANS_DBG("YOLO", "ERROR: TRT runInference FAILED");
                _logger.LogError("ANSRTYOLO::DetectObjects", "Error running inference", __FILE__, __LINE__);
                return {};
            }
+            auto _trtEnd = std::chrono::steady_clock::now();
+            double _trtMs = std::chrono::duration<double, std::milli>(_trtEnd - _trtStart).count();
+            if (_trtMs > 500.0) {
+                ANS_DBG("YOLO", "SLOW TRT inference: %.1fms", _trtMs);
+            }
            double msInference = dbg ? elapsed() : 0;

            // --- 4. Transform output ---
--- a/modules/ANSODEngine/ANSRTYOLO.h
+++ b/modules/ANSODEngine/ANSRTYOLO.h
@@ -81,6 +81,7 @@ namespace ANSCENTER {
        std::vector<std::vector<cv::cuda::GpuMat>> PreprocessBatch(
            const std::vector<cv::Mat>& inputImages, BatchMetadata& outMetadata);

+
        // ── Detection pipeline ───────────────────────────────────────────
        std::vector<Object> DetectObjects(const cv::Mat& inputImage,
                                          const std::string& camera_id);
--- a/modules/ANSODEngine/NV12PreprocessHelper.cpp
+++ b/modules/ANSODEngine/NV12PreprocessHelper.cpp
@@ -1,6 +1,7 @@
 #include "NV12PreprocessHelper.h"
 #include "ANSGpuFrameRegistry.h"
 #include "ANSEngineCommon.h"
+#include "ANSLicense.h"   // ANS_DBG macro
 #include <opencv2/cudaimgproc.hpp>
 #include <opencv2/cudawarping.hpp>
 #include <opencv2/core/cuda_stream_accessor.hpp>
--- a/modules/ANSODEngine/dllmain.cpp
+++ b/modules/ANSODEngine/dllmain.cpp
@@ -6,6 +6,7 @@
 #include "engine/TRTEngineCache.h"   // clearAll() on DLL_PROCESS_DETACH
 #include "engine/EnginePoolManager.h" // clearAll() on DLL_PROCESS_DETACH
 #include <climits>                    // INT_MIN
+#include "ANSLicense.h"              // ANS_DBG macro for DebugView

 // Process-wide flag: when true, all engines force single-GPU path (no pool, no idle timers).
 // Defined here, declared extern in EngineBuildLoadNetwork.inl.
@@ -1696,6 +1697,8 @@ static int RunInferenceComplete_LV_Impl(
 	auto* engine = guard.get();

 	try {
+		auto _t0 = std::chrono::steady_clock::now();
+
 		// Save/restore thread-local to support nested calls (custom model DLLs
 		// calling back into ANSODEngine via ANSLIB.dll).
 		GpuFrameData* savedFrame = tl_currentGpuFrame();
@@ -1708,6 +1711,10 @@ static int RunInferenceComplete_LV_Impl(
 		int originalWidth = localImage.cols;
 		int originalHeight = localImage.rows;

+		ANS_DBG("LV_Inference", "START cam=%s %dx%d gpuFrame=%p nv12=%s",
+				cameraId ? cameraId : "?", originalWidth, originalHeight,
+				(void*)gpuFrame, gpuFrame ? "YES" : "NO");
+
 		if (originalWidth == 0 || originalHeight == 0) {
 			tl_currentGpuFrame() = savedFrame;
 			return -2;
@@ -1717,8 +1724,17 @@ static int RunInferenceComplete_LV_Impl(
 		// Safe: *cvImage holds a refcount, keeping gpuFrame alive during inference.
 		// Only use OWN gpuFrame — never inherit outer caller's frame (dimension mismatch on crops).
 		tl_currentGpuFrame() = gpuFrame;
+		auto _t1 = std::chrono::steady_clock::now();
 		std::vector<ANSCENTER::Object> outputs = engine->RunInferenceWithOption(localImage, cameraId, activeROIMode);
+		auto _t2 = std::chrono::steady_clock::now();
 		tl_currentGpuFrame() = savedFrame;
+
+		double prepMs = std::chrono::duration<double, std::milli>(_t1 - _t0).count();
+		double infMs  = std::chrono::duration<double, std::milli>(_t2 - _t1).count();
+		if (infMs > 500.0) {
+			ANS_DBG("LV_Inference", "SLOW cam=%s prep=%.1fms inf=%.1fms results=%zu",
+					cameraId ? cameraId : "?", prepMs, infMs, outputs.size());
+		}
 		bool getJpeg = (getJpegString == 1);
 		std::string stImage;
 		// NOTE: odMutex was removed here. All variables in this scope are local
--- a/modules/ANSODEngine/engine.h
+++ b/modules/ANSODEngine/engine.h
@@ -402,6 +402,9 @@ private:
    cudaStream_t m_memoryStream;  // ADD THIS - separate stream for memory operations
    std::vector<cv::cuda::GpuMat> m_preprocessedInputs;  // Keep inputs alive

+    // Note: blobFromGpuMats and resizeKeepAspectRatioPadRightBottom are static,
+    // so cached buffers use thread_local inside the functions themselves.
+

    // Thermal management (ADD THESE)
    //int m_consecutiveInferences;
@@ -431,7 +434,7 @@ private:

    Logger m_logger;
    bool m_verbose{ true };   // false for non-probe pool slots
-    bool m_disableGraphs{ false }; // true for pool slots — concurrent graph captures corrupt CUDA context
+    bool m_disableGraphs{ true }; // true = CUDA graphs disabled (the default) — concurrent graph launches + uploads cause GPU deadlock on WDDM

    // -- Multi-GPU pool data ---------------------------------------------------