diff --git a/.claude/settings.local.json b/.claude/settings.local.json
index 1bef8a6..e69de29 100644
--- a/.claude/settings.local.json
+++ b/.claude/settings.local.json
@@ -1,7 +0,0 @@
-{
-  "permissions": {
-    "allow": [
-      "Bash(cmake -B cmake-build-release -S .)"
-    ]
-  }
-}
diff --git a/MediaClient/media/video_decoder.cpp b/MediaClient/media/video_decoder.cpp
index 8d8f39b..963d311 100644
--- a/MediaClient/media/video_decoder.cpp
+++ b/MediaClient/media/video_decoder.cpp
@@ -332,8 +332,28 @@ void CVideoDecoder::uninit()
 {
     std::lock_guard<std::recursive_mutex> lock(_mutex);
 
+    // [MEDIA_DecClose] heartbeat — paired with [MEDIA_DecInit] for leak diagnosis.
+    // Pair count over a long run reveals whether avcodec_open2 calls are
+    // matched by full teardowns. If close-count < init-count, the FFmpeg
+    // codec context (and its custom get_buffer2 arena) is leaking per reopen.
+    {
+        static std::atomic<uint64_t> s_closeCount{0};
+        const uint64_t n = s_closeCount.fetch_add(1) + 1;
+        ANS_DBG("MEDIA_DecClose",
+            "uninit ENTRY #%llu inited=%d codec=%s %dx%d hwEnabled=%d cudaHW=%d gpu=%d (this=%p)",
+            (unsigned long long)n,
+            (int)m_bInited,
+            (m_pCodec && m_pCodec->name) ? m_pCodec->name : "?",
+            m_pContext ? m_pContext->width  : 0,
+            m_pContext ? m_pContext->height : 0,
+            (int)m_bHardwareDecoderEnabled,
+            (int)m_bCudaHWAccel,
+            m_hwGpuIndex,
+            (void*)this);
+    }
+
     // Stop processing first
-    // Backup first 
+    // Backup first
 	BOOL wasRunning = m_bRunning;
     m_bRunning = FALSE;
 
diff --git a/engines/TensorRTAPI/include/engine/EngineRunInference.inl b/engines/TensorRTAPI/include/engine/EngineRunInference.inl
index 882895c..b0619ba 100644
--- a/engines/TensorRTAPI/include/engine/EngineRunInference.inl
+++ b/engines/TensorRTAPI/include/engine/EngineRunInference.inl
@@ -6,6 +6,19 @@
 #include "TRTCompat.h"
 #include "ANSLicense.h"   // ANS_DBG macro for DebugView logging
 
+#ifdef _WIN32
+#  ifndef WIN32_LEAN_AND_MEAN
+#    define WIN32_LEAN_AND_MEAN
+#  endif
+#  ifndef NOMINMAX
+#    define NOMINMAX
+#  endif
+#  include <windows.h>
+#  include <psapi.h>
+#  include <tlhelp32.h>
+#  pragma comment(lib, "psapi.lib")
+#endif
+
 // Per-device mutex for CUDA graph capture.
 // TRT's enqueueV3 uses shared internal resources (workspace, memory pools)
 // at the CUDA context level.  When two Engine instances on the same GPU
@@ -398,6 +411,56 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
     const int64_t myInfNum = s_globalInfCount.fetch_add(1) + 1;
     s_globalActiveInf.fetch_add(1);
 
+    // ── Process-wide host-RAM heartbeat (once per ~60s) ──────────────────────
+    // Diagnostic for long-run leak hunts: if [PROC_MEM] privateMB climbs while
+    // [TRT_SM100] VRAM stays flat, the leak is on the host side (FFmpeg
+    // contexts, RTSP threads, GDI objects). Cheap when not firing — one
+    // steady_clock read + relaxed atomic load + compare per call (throttle state is per Engine<T> instantiation).
+#ifdef _WIN32
+    {
+        using clk = std::chrono::steady_clock;
+        static std::atomic<int64_t> s_hbLastNs{0};
+        const int64_t nowNs = clk::now().time_since_epoch().count();
+        int64_t prev = s_hbLastNs.load(std::memory_order_relaxed);
+        constexpr int64_t kIntervalNs = 60LL * 1'000'000'000LL;
+        if (nowNs - prev >= kIntervalNs &&
+            s_hbLastNs.compare_exchange_strong(prev, nowNs,
+                                               std::memory_order_relaxed)) {
+            PROCESS_MEMORY_COUNTERS_EX pmc{};
+            pmc.cb = sizeof(pmc);
+            GetProcessMemoryInfo(GetCurrentProcess(),
+                                 reinterpret_cast<PROCESS_MEMORY_COUNTERS*>(&pmc),
+                                 sizeof(pmc));
+            DWORD gdi  = GetGuiResources(GetCurrentProcess(), GR_GDIOBJECTS);
+            DWORD usr  = GetGuiResources(GetCurrentProcess(), GR_USEROBJECTS);
+
+            // Thread count via Toolhelp snapshot (filter to current PID).
+            DWORD threads = 0;
+            HANDLE snap = CreateToolhelp32Snapshot(TH32CS_SNAPTHREAD, 0);
+            if (snap != INVALID_HANDLE_VALUE) {
+                THREADENTRY32 te{ sizeof(te) };
+                const DWORD pid = GetCurrentProcessId();
+                if (Thread32First(snap, &te)) {
+                    do {
+                        if (te.th32OwnerProcessID == pid) ++threads;
+                    } while (Thread32Next(snap, &te));
+                }
+                CloseHandle(snap);
+            }
+
+            ANS_DBG("PROC_MEM",
+                "privateMB=%llu workingMB=%llu peakWorkingMB=%llu "
+                "pagefileMB=%llu gdi=%lu user=%lu threads=%lu",
+                (unsigned long long)(pmc.PrivateUsage      >> 20),
+                (unsigned long long)(pmc.WorkingSetSize    >> 20),
+                (unsigned long long)(pmc.PeakWorkingSetSize >> 20),
+                (unsigned long long)(pmc.PagefileUsage     >> 20),
+                (unsigned long)gdi, (unsigned long)usr,
+                (unsigned long)threads);
+        }
+    }
+#endif
+
     // Per-thread tracking
     {
         static thread_local int64_t s_infCount = 0;
@@ -935,15 +998,29 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
     }
 
     // ============================================================================
-    // Per-inference total timing breakdown (mutex wait + preprocess + GPU)
+    // Slow-inference alarm — ONE-SIDED FILTER, NOT A DISTRIBUTION
     // ============================================================================
+    // This emits a DebugView line ONLY when a single inference's total wall
+    // time (mutex wait + all work after the lock is acquired) exceeds 100 ms. Fast calls are silent.
+    //
+    // Consequence: if you aggregate `[TRT_Slow]` lines and compute an average,
+    // you get the mean of the slow *tail*, NOT the real average inference
+    // time. Expect this avg to look dramatic (~200–400 ms) because by design
+    // every sample here is already slow. On a healthy system this line
+    // fires for ~1–3% of calls; >10% indicates a problem.
+    //
+    // For the true per-inference distribution, look at `[TRT_SM100] #N ...
+    // avgMs=... maxMs=...` (running-average, emitted every 50 inferences).
+    // The tag was previously `[TRT_Timing]` which misled readers into
+    // interpreting the avg as overall pipeline latency.
     {
         double totalMs = std::chrono::duration<double, std::milli>(
             std::chrono::steady_clock::now() - _mutexWaitStart).count();
         double gpuMs = totalMs - _mutexWaitMs;  // Everything after mutex acquired
-        // Log every inference that takes >100ms total (including mutex wait)
         if (totalMs > 100.0) {
-            ANS_DBG("TRT_Timing", "total=%.1fms (mutex=%.1fms gpu=%.1fms) batch=%d active=%d",
+            ANS_DBG("TRT_Slow",
+                    "SLOW inference total=%.1fms (mutex=%.1fms gpu=%.1fms) batch=%d active=%d "
+                    "(this filter only fires for calls >100ms)",
                     totalMs, _mutexWaitMs, gpuMs, batchSize, s_globalActiveInf.load());
         }
     }
diff --git a/modules/ANSCV/ANSFLV.cpp b/modules/ANSCV/ANSFLV.cpp
index 821a098..1079e80 100644
--- a/modules/ANSCV/ANSFLV.cpp
+++ b/modules/ANSCV/ANSFLV.cpp
@@ -2,6 +2,7 @@
 #include "ANSMatRegistry.h"
 #include "ANSGpuFrameOps.h"
 #include "ANSCVVendorGate.h"  // anscv_vendor_gate::IsNvidiaGpuAvailable()
+#include "ANSLicense.h"       // ANS_DBG macro
 #include <memory>
 #include <cstdint>
 #include "media_codec.h"
@@ -251,6 +252,23 @@ namespace ANSCENTER {
             return _pLastFrame;  // Shallow copy (fast)
         }
 
+        // Early stale-out: if the decoder hasn't produced a frame in 5s the
+        // source is dead. Skip _playerClient->getImage() entirely and return
+        // the cached frame with unchanged _pts so LabVIEW sees STALE PTS one
+        // poll earlier and triggers reconnect.
+        if (!_pLastFrame.empty()) {
+            double ageMs = _playerClient->getLastFrameAgeMs();
+            if (ageMs >= 5000.0) {
+                ANS_DBG("FLV_GetImage",
+                        "EARLY STALE: ageMs=%.1f pts=%lld url=%s — skipping getImage()",
+                        ageMs, (long long)_pts, _url.c_str());
+                width = _imageWidth;
+                height = _imageHeight;
+                pts = _pts;
+                return _pLastFrame;
+            }
+        }
+
         int imageW = 0, imageH = 0;
         int64_t currentPts = 0;
 
diff --git a/modules/ANSCV/ANSMJPEG.cpp b/modules/ANSCV/ANSMJPEG.cpp
index 56af91b..cee454e 100644
--- a/modules/ANSCV/ANSMJPEG.cpp
+++ b/modules/ANSCV/ANSMJPEG.cpp
@@ -2,6 +2,7 @@
 #include "ANSMatRegistry.h"
 #include "ANSGpuFrameOps.h"
 #include "ANSCVVendorGate.h"  // anscv_vendor_gate::IsNvidiaGpuAvailable()
+#include "ANSLicense.h"       // ANS_DBG macro
 #include <memory>
 #include <cstdint>
 #include "media_codec.h"
@@ -239,6 +240,23 @@ namespace ANSCENTER {
             return _pLastFrame;  // Shallow copy (fast)
         }
 
+        // Early stale-out: if the decoder hasn't produced a frame in 5s the
+        // source is dead. Skip _playerClient->getImage() entirely and return
+        // the cached frame with unchanged _pts so LabVIEW sees STALE PTS one
+        // poll earlier and triggers reconnect.
+        if (!_pLastFrame.empty()) {
+            double ageMs = _playerClient->getLastFrameAgeMs();
+            if (ageMs >= 5000.0) {
+                ANS_DBG("MJPEG_GetImage",
+                        "EARLY STALE: ageMs=%.1f pts=%lld url=%s — skipping getImage()",
+                        ageMs, (long long)_pts, _url.c_str());
+                width = _imageWidth;
+                height = _imageHeight;
+                pts = _pts;
+                return _pLastFrame;
+            }
+        }
+
         int imageW = 0, imageH = 0;
         int64_t currentPts = 0;
 
diff --git a/modules/ANSCV/ANSOpenCV.cpp b/modules/ANSCV/ANSOpenCV.cpp
index ffe5255..f92f0b3 100644
--- a/modules/ANSCV/ANSOpenCV.cpp
+++ b/modules/ANSCV/ANSOpenCV.cpp
@@ -473,7 +473,8 @@ namespace ANSCENTER
 	//}
 
 	std::string ANSOPENCV::EncodeJpegString(const cv::Mat& img, int quality) {
-		std::lock_guard<std::recursive_mutex> lock(_mutex);
+		// Lock-free: each call creates its own tjInitCompress handle and local
+		// buffers. No shared mutable state — safe to run concurrently.
 		tjhandle _jpegCompressor = nullptr;
 		unsigned char* jpegBuf = nullptr;
 
@@ -571,7 +572,7 @@ namespace ANSCENTER
 		return "";
 	}
 	std::string ANSOPENCV::MatToBinaryData(const cv::Mat& image) {
-		std::lock_guard<std::recursive_mutex> lock(_mutex);
+		// Lock-free: forwards to EncodeJpegString which is itself lock-free.
 		// Check if the image is empty or has invalid data
 		if (image.empty() || !image.data || !image.u) {
 			return "";
@@ -591,7 +592,8 @@ namespace ANSCENTER
 		return "";
 	}
 	void ANSOPENCV::ImageResize(const cv::Mat& inputFrame, int width, int height, cv::Mat& outputFrame) {
-		std::lock_guard<std::recursive_mutex> lock(_mutex);
+		// Lock-free: _licenseValid is std::atomic<bool> and cv::resize is reentrant.
+		// outputFrame is a caller-supplied reference, so concurrent calls must not share one outputFrame.
 
 		if (!_licenseValid) {
 			outputFrame = inputFrame;
@@ -647,9 +649,10 @@ namespace ANSCENTER
 			outputFrame = inputFrame;
 		}
 	}
-	void ANSOPENCV::ImageResizeWithRatio(const cv::Mat& inputFrame, int width, cv::Mat& outputFrame) 
+	void ANSOPENCV::ImageResizeWithRatio(const cv::Mat& inputFrame, int width, cv::Mat& outputFrame)
 	{
-		std::lock_guard<std::recursive_mutex> lock(_mutex);
+		// Lock-free: _licenseValid is std::atomic<bool> and cv::resize is reentrant.
+		// outputFrame is a caller-supplied reference, so concurrent calls must not share one outputFrame.
 		if (!_licenseValid) {
 			outputFrame = inputFrame;  // Shallow copy (fast)
 			return;
@@ -702,7 +705,7 @@ namespace ANSCENTER
 		}
 	}
 	cv::Mat ANSOPENCV::BlurObjects(const cv::Mat& image, const std::vector<cv::Rect>& objects) {
-		std::lock_guard<std::recursive_mutex> lock(_mutex);
+		// Lock-free: operates on local Mats only. Safe across threads.
 		// Check for valid license and empty input
 		if (!_licenseValid || image.empty()) return image;
 
@@ -725,7 +728,7 @@ namespace ANSCENTER
 		return outputImage;
 	}
 	cv::Mat ANSOPENCV::BlurBackground(const cv::Mat& image, const std::vector<cv::Rect>& objects) {
-		std::lock_guard<std::recursive_mutex> lock(_mutex);
+		// Lock-free: operates on local Mats only. Safe across threads.
 		// Check for valid license and empty input
 		if (!_licenseValid || image.empty()) return image;
 
@@ -749,7 +752,7 @@ namespace ANSCENTER
 		return blurredImage;
 	}
 	cv::Mat ANSOPENCV::ToGray(const cv::Mat& image) {
-		std::lock_guard<std::recursive_mutex> lock(_mutex);
+		// Lock-free: operates on local Mats only. Safe across threads.
 		// Check for valid license
 		if (!_licenseValid) return image;
 
@@ -779,7 +782,7 @@ namespace ANSCENTER
 		return grayMat;
 	}
 	cv::Mat ANSOPENCV::ImageDenoise(const cv::Mat& image) {
-		std::lock_guard<std::recursive_mutex> lock(_mutex);
+		// Lock-free: operates on local Mats only. Safe across threads.
 
 		if (!_licenseValid || image.empty()) {
 			return image;
@@ -797,7 +800,7 @@ namespace ANSCENTER
 		return denoised_image;
 	}
 	cv::Mat ANSOPENCV::ImageCrop(const cv::Mat& inputImage, const cv::Rect& resizeROI, int originalImageSize) {
-		std::lock_guard<std::recursive_mutex> lock(_mutex);
+		// Lock-free: operates on local Mats only. Safe across threads.
 
 		// License validation
 		if (!_licenseValid) {
@@ -870,7 +873,7 @@ namespace ANSCENTER
 		}
 	}
 	cv::Mat ANSOPENCV::ImageRepair(const cv::Mat& image) {
-		std::lock_guard<std::recursive_mutex> lock(_mutex);
+		// Lock-free: operates on local Mats only. Safe across threads.
 
 		if (!_licenseValid || image.empty()) {
 			return image;
@@ -904,7 +907,7 @@ namespace ANSCENTER
 		}
 	}
 	std::string ANSOPENCV::PatternMatches(cv::Mat& image, cv::Mat& templateImage, double threshold) {
-		std::lock_guard<std::recursive_mutex> lock(_mutex);
+		// Lock-free: image/templateImage are caller-owned references (not locals); callers must not modify them concurrently.
 
 		std::vector<DetectionObject> detectedObjects;
 
@@ -968,7 +971,7 @@ namespace ANSCENTER
 		}
 	}
 	std::string ANSOPENCV::QRDecoder(const cv::Mat& image) {
-		std::lock_guard<std::recursive_mutex> lock(_mutex);
+		// Lock-free: operates on local Mats only. Safe across threads.
 
 		if (!_licenseValid || image.empty()) {
 			return "";
@@ -1067,7 +1070,7 @@ namespace ANSCENTER
 		}
 	}
 	std::string ANSOPENCV::QRDecoderWithBBox(const cv::Mat& image, int maxImageSize, const std::vector<cv::Rect>& bBox) {
-		std::lock_guard<std::recursive_mutex> lock(_mutex);
+		// Lock-free: operates on local Mats only. Safe across threads.
 
 		if (!_licenseValid || image.empty()) {
 			return "";
@@ -1199,7 +1202,8 @@ namespace ANSCENTER
 		}
 	}
 	std::string ANSOPENCV::MatToBase64(const cv::Mat& image) {
-		std::lock_guard<std::recursive_mutex> lock(_mutex);
+		// Lock-free: _licenseValid is std::atomic<bool>, and CompressJpegToString
+		// uses a thread_local TurboJpegCompressor. Safe across threads.
 
 		if (!_licenseValid || image.empty()) {
 			return "";
@@ -1228,7 +1232,7 @@ namespace ANSCENTER
 		}
 	}
 	cv::Mat ANSOPENCV::ImageDarkEnhancement(const cv::Mat& img, double brightnessScaleFactor) {
-		std::lock_guard<std::recursive_mutex> lock(_mutex);
+		// Lock-free: operates on local Mats only. Safe across threads.
 
 		if (!_licenseValid || img.empty()) {
 			return img;  // Shallow copy (fast)
@@ -1259,7 +1263,7 @@ namespace ANSCENTER
 		}
 	}
 	cv::Mat ANSOPENCV::ImageContrastEnhancement(const cv::Mat& src) {
-		std::lock_guard<std::recursive_mutex> lock(_mutex);
+		// Lock-free: operates on local Mats only. Safe across threads.
 		double clipLimit = 2.0;
 		if (!_licenseValid || src.empty()) {
 			return src;
@@ -1312,7 +1316,7 @@ namespace ANSCENTER
 	}
 	
 	cv::Mat ANSOPENCV::ImageWhiteBalance(const cv::Mat& src) {
-		std::lock_guard<std::recursive_mutex> lock(_mutex);
+		// Lock-free: operates on local Mats only. Safe across threads.
 
 		if (!_licenseValid || src.empty()) {
 			return src;  // Shallow copy (fast)
@@ -1366,7 +1370,7 @@ namespace ANSCENTER
 		}
 	}
 	std::vector<cv::Rect> ANSOPENCV::GetBoundingBoxes(std::string strBBoxes) {
-		std::lock_guard<std::recursive_mutex> lock(_mutex);
+		// Lock-free: operates on local data only. Safe across threads.
 		std::vector<cv::Rect> bBoxes;
 		if (!_licenseValid) return bBoxes;
 
@@ -1410,7 +1414,7 @@ namespace ANSCENTER
 	}
 	
 	cv::Mat ANSOPENCV::RotateImage(const cv::Mat& image, double angle) {
-		std::lock_guard<std::recursive_mutex> lock(_mutex);
+		// Lock-free: operates on local Mats only. Safe across threads.
 
 		if (!_licenseValid || image.empty()) {
 			return image;  // Shallow copy (fast)
@@ -1473,7 +1477,7 @@ namespace ANSCENTER
 	}
 
 	cv::Mat ANSOPENCV::FlipImage(const cv::Mat& image, int flipCode) {
-		std::lock_guard<std::recursive_mutex> lock(_mutex);
+		// Lock-free: operates on local Mats only. Safe across threads.
 
 		if (!_licenseValid || image.empty()) {
 			return image;  // Shallow copy (fast)
@@ -1502,7 +1506,7 @@ namespace ANSCENTER
 	}
 
 	cv::Mat ANSOPENCV::ShiftImage(const cv::Mat& image, int shiftX, int shiftY) {
-		std::lock_guard<std::recursive_mutex> lock(_mutex);
+		// Lock-free: operates on local Mats only. Safe across threads.
 		if (!_licenseValid) return image;
 		if (image.empty()) return image;
 
@@ -1529,7 +1533,7 @@ namespace ANSCENTER
 	}
 
 	cv::Mat ANSOPENCV::AddGaussianNoise(const cv::Mat& image, double mean, double stddev) {
-		std::lock_guard<std::recursive_mutex> lock(_mutex);
+		// Lock-free: operates on local Mats only. Safe across threads.
 
 		if (!_licenseValid || image.empty()) {
 			return image;
@@ -1568,7 +1572,7 @@ namespace ANSCENTER
 	}
 
 	cv::Mat ANSOPENCV::AddSaltAndPepperNoise(const cv::Mat& image, double amount) {
-		std::lock_guard<std::recursive_mutex> lock(_mutex);
+		// Lock-free: operates on local Mats only. Safe across threads.
 
 		if (!_licenseValid || image.empty()) {
 			return image;
@@ -1607,7 +1611,7 @@ namespace ANSCENTER
 	}
 
 	cv::Mat ANSOPENCV::AddSpeckleNoise(const cv::Mat& image, double stddev) {
-		std::lock_guard<std::recursive_mutex> lock(_mutex);
+		// Lock-free: operates on local Mats only. Safe across threads.
 
 		if (!_licenseValid || image.empty()) {
 			return image;  // Shallow copy (fast)
@@ -1755,7 +1759,7 @@ namespace ANSCENTER
 	}
 
 	double ANSOPENCV::CalculateIoU(const cv::Rect& box1, const cv::Rect& box2) {
-		std::lock_guard<std::recursive_mutex> lock(_mutex);
+		// Lock-free: pure computation over inputs. Safe across threads.
 		int x1 = max(box1.x, box2.x);
 		int y1 = max(box1.y, box2.y);
 		int x2 = min(box1.x + box1.width, box2.x + box2.width);
@@ -1769,7 +1773,7 @@ namespace ANSCENTER
 		return iou;
 	}
 	void ANSOPENCV::NonMaximumSuppression(std::vector<DetectionObject>& detectedObjects, double iouThreshold) {
-		std::lock_guard<std::recursive_mutex> lock(_mutex);
+		// Lock-free: sorts the caller-owned vector in place; safe across threads when each call has its own vector.
 		std::sort(detectedObjects.begin(), detectedObjects.end(),
 			[](const DetectionObject& a, const DetectionObject& b) {
 				return a.confidence > b.confidence;
@@ -1794,7 +1798,7 @@ namespace ANSCENTER
 	}	
 	
 	cv::Mat ANSOPENCV::ImageResizeV2(const cv::Mat& inputImage, int resizeWidth, int originalImageSize) {
-		std::lock_guard<std::recursive_mutex> lock(_mutex);
+		// Lock-free: operates on local Mats only. Safe across threads.
 
 		if (!_licenseValid) {
 			std::cerr << "Error: License is not valid in ImageResizeV2." << std::endl;
@@ -4210,12 +4214,9 @@ extern "C" __declspec(dllexport) void ANSCV_FreeCameraResource() {
 }
 
 extern "C" __declspec(dllexport) int  ANSCV_ResizeImage_Static(unsigned char* inputImage, unsigned int bufferLength, int width, int height, int& newWidth, int& newHeight, LStrHandle outputImage) {
-	//std::lock_guard<std::mutex> lock(imageMutex);  // Automatically locks and unlocks
-	std::unique_lock<std::timed_mutex> lock(timeImageMutex, std::defer_lock);
-	if (!lock.try_lock_for(std::chrono::milliseconds(MUTEX_TIMEOUT_MS))) {
-		std::cerr << "Error: Mutex timeout in ANSCV_ResizeImage_Static!" << std::endl;
-		return -6;
-	}
+	// Lock-free: operates on caller-owned input/output buffers only. No
+	// registered cv::Mat is touched, so the global timeImageMutex would
+	// serialize calls without protecting any shared state.
 	try {
 		cv::Mat inputFrame = cv::imdecode(cv::Mat(1, bufferLength, CV_8UC1, inputImage), cv::IMREAD_COLOR);
 		cv::Mat outputFrame = ANSCENTER::ANSOPENCV::resizeImageToFit(inputFrame, width, height, newWidth, newHeight);
@@ -5019,38 +5020,48 @@ extern "C" __declspec(dllexport) int ANSCV_CreateImageFromFile_S(const char* ima
 // Image Preprocessing
 extern "C" __declspec(dllexport) int ANSCV_ImageAutoWhiteBalance_S(cv::Mat** imageIn) {
 	gpu_frame_invalidate(imageIn ? *imageIn : nullptr);
-	if (!imageIn || !(*imageIn) || (*imageIn)->empty() || !(*imageIn)->data) {
-		std::cerr << "Error: Invalid or empty input image in ANSCV_CloneImage_S!" << std::endl;
-		return -2;
-	}
 	try {
+		// Shallow-copy the Mat header under lock: the ref-count keeps the pixel
+		// buffer alive even if another thread reassigns **imageIn meanwhile.
+		cv::Mat localInput;
+		{
+			std::unique_lock<std::timed_mutex> lock(timeImageMutex, std::defer_lock);
+			if (!lock.try_lock_for(std::chrono::milliseconds(MUTEX_TIMEOUT_MS))) {
+				std::cerr << "Error: Mutex timeout in ANSCV_ImageAutoWhiteBalance_S!" << std::endl;
+				return -6;
+			}
+			if (!imageIn || !(*imageIn) || (*imageIn)->empty() || !(*imageIn)->data) {
+				std::cerr << "Error: Invalid or empty input image in ANSCV_ImageAutoWhiteBalance_S!" << std::endl;
+				return -2;
+			}
+			localInput = **imageIn;  // ref-counted shallow copy
+		}
+
 		ANSCENTER::ANSOPENCV ansCVInstance;
 		if (!ansCVInstance.Init("")) {
 			std::cerr << "Error: Failed to initialize ANSCV instance!" << std::endl;
 			return -5;
 		}
 
-		cv::Mat imOut = ansCVInstance.ImageWhiteBalance(**imageIn);
-		// Thread-safe assignment
+		cv::Mat imOut = ansCVInstance.ImageWhiteBalance(localInput);
+		if (imOut.empty()) {
+			std::cerr << "Error: White balance processing failed in ANSCV_ImageAutoWhiteBalance_S!" << std::endl;
+			return 0;
+		}
+
+		// Swap back under lock.
 		{
 			std::unique_lock<std::timed_mutex> lock(timeImageMutex, std::defer_lock);
 			if (!lock.try_lock_for(std::chrono::milliseconds(MUTEX_TIMEOUT_MS))) {
-				std::cerr << "Error: Mutex timeout in ANSCV_ReSizeImage_S!" << std::endl;
+				std::cerr << "Error: Mutex timeout in ANSCV_ImageAutoWhiteBalance_S!" << std::endl;
 				return -6;
 			}
-			//std::lock_guard<std::mutex> lock(imageMutex);
-			if (imOut.empty()) {
-				std::cerr << "Error: White balance processing failed in ANSCV_ImageAutoWhiteBalance_S!" << std::endl;
-				return 0;
-			}
-			else {
-				if (!imageIn || !(*imageIn) || (*imageIn)->empty() || !(*imageIn)->data) {
-					std::cerr << "Error: Invalid or empty input image in ANSCV_CloneImage_S!" << std::endl;
-					return -2;
-				}
-				**imageIn = std::move(imOut);
-				return 1;
+			if (!imageIn || !(*imageIn)) {
+				std::cerr << "Error: Image became invalid in ANSCV_ImageAutoWhiteBalance_S!" << std::endl;
+				return -2;
 			}
+			**imageIn = std::move(imOut);
+			return 1;
 		}
 	}
 	catch (const std::exception& e) {
@@ -5066,32 +5077,40 @@ extern "C" __declspec(dllexport) int ANSCV_ImageAutoWhiteBalance_S(cv::Mat** ima
 extern "C" __declspec(dllexport) int ANSCV_ImageBrightEnhance_S(cv::Mat** imageIn, double brightnessScaleFactor) {
 	gpu_frame_invalidate(imageIn ? *imageIn : nullptr);
 	try {
-		if (!imageIn || !(*imageIn) || (*imageIn)->empty() || !(*imageIn)->data) {
-			std::cerr << "Error: Invalid or empty input image in ANSCV_CloneImage_S!" << std::endl;
-			return -2;
-		}
-		ANSCENTER::ANSOPENCV ansCVInstance;
-		ansCVInstance.Init(""); // Initialize ANSCV instance
-		cv::Mat imOut = ansCVInstance.ImageDarkEnhancement(**imageIn, brightnessScaleFactor);
+		cv::Mat localInput;
 		{
 			std::unique_lock<std::timed_mutex> lock(timeImageMutex, std::defer_lock);
 			if (!lock.try_lock_for(std::chrono::milliseconds(MUTEX_TIMEOUT_MS))) {
-				std::cerr << "Error: Mutex timeout in ANSCV_ReSizeImage_S!" << std::endl;
+				std::cerr << "Error: Mutex timeout in ANSCV_ImageBrightEnhance_S!" << std::endl;
 				return -6;
 			}
-			//std::lock_guard<std::mutex> lock(imageMutex); // Lock only during shared resource write
-			if (imOut.empty()) {
-				std::cerr << "Error: Brightness enhancement failed in ANSCV_ImageBrightEnhance_S!" << std::endl;
-				return 0;
+			if (!imageIn || !(*imageIn) || (*imageIn)->empty() || !(*imageIn)->data) {
+				std::cerr << "Error: Invalid or empty input image in ANSCV_ImageBrightEnhance_S!" << std::endl;
+				return -2;
 			}
-			else {
-				if (!imageIn || !(*imageIn) || (*imageIn)->empty() || !(*imageIn)->data) {
-					std::cerr << "Error: Invalid or empty input image in ANSCV_CloneImage_S!" << std::endl;
-					return -2;
-				}
-				**imageIn = std::move(imOut);
-				return 1;
+			localInput = **imageIn;
+		}
+
+		ANSCENTER::ANSOPENCV ansCVInstance;
+		ansCVInstance.Init("");
+		cv::Mat imOut = ansCVInstance.ImageDarkEnhancement(localInput, brightnessScaleFactor);
+		if (imOut.empty()) {
+			std::cerr << "Error: Brightness enhancement failed in ANSCV_ImageBrightEnhance_S!" << std::endl;
+			return 0;
+		}
+
+		{
+			std::unique_lock<std::timed_mutex> lock(timeImageMutex, std::defer_lock);
+			if (!lock.try_lock_for(std::chrono::milliseconds(MUTEX_TIMEOUT_MS))) {
+				std::cerr << "Error: Mutex timeout in ANSCV_ImageBrightEnhance_S!" << std::endl;
+				return -6;
 			}
+			if (!imageIn || !(*imageIn)) {
+				std::cerr << "Error: Image became invalid in ANSCV_ImageBrightEnhance_S!" << std::endl;
+				return -2;
+			}
+			**imageIn = std::move(imOut);
+			return 1;
 		}
 	}
 	catch (const std::exception& e) {
@@ -5107,37 +5126,41 @@ extern "C" __declspec(dllexport) int ANSCV_ImageBrightEnhance_S(cv::Mat** imageI
 
 extern "C" __declspec(dllexport) int  ANSCV_ImageContrastEnhance_S(cv::Mat** imageIn) {
 	gpu_frame_invalidate(imageIn ? *imageIn : nullptr);
-	if (!imageIn || !(*imageIn) || (*imageIn)->empty() || !(*imageIn)->data) {
-		std::cerr << "Error: Invalid or empty input image in ANSCV_CloneImage_S!" << std::endl;
-		return -2;
-	}
 	try {
-		ANSCENTER::ANSOPENCV ansCVInstance;
-		ansCVInstance.Init(""); // Initialize ANSCV instance
-
-		// Perform white balance correction
-		cv::Mat imOut = ansCVInstance.ImageContrastEnhancement(**imageIn);
-
+		cv::Mat localInput;
 		{
-			// Assign processed image back to input pointer
-			//std::lock_guard<std::mutex> lock(imageMutex); // Lock only during shared resource write
 			std::unique_lock<std::timed_mutex> lock(timeImageMutex, std::defer_lock);
 			if (!lock.try_lock_for(std::chrono::milliseconds(MUTEX_TIMEOUT_MS))) {
-				std::cerr << "Error: Mutex timeout in ANSCV_ReSizeImage_S!" << std::endl;
+				std::cerr << "Error: Mutex timeout in ANSCV_ImageContrastEnhance_S!" << std::endl;
 				return -6;
 			}
-			if (imOut.empty()) {
-				std::cerr << "Error: White balance processing failed in ANSCV_ImageContrastEnhance_S!" << std::endl;
-				return 0;
+			if (!imageIn || !(*imageIn) || (*imageIn)->empty() || !(*imageIn)->data) {
+				std::cerr << "Error: Invalid or empty input image in ANSCV_ImageContrastEnhance_S!" << std::endl;
+				return -2;
 			}
-			else {
-				if (!imageIn || !(*imageIn) || (*imageIn)->empty() || !(*imageIn)->data) {
-					std::cerr << "Error: Invalid or empty input image in ANSCV_CloneImage_S!" << std::endl;
-					return -2;
-				}
-				**imageIn = std::move(imOut);
-				return 1; // Success
+			localInput = **imageIn;
+		}
+
+		ANSCENTER::ANSOPENCV ansCVInstance;
+		ansCVInstance.Init("");
+		cv::Mat imOut = ansCVInstance.ImageContrastEnhancement(localInput);
+		if (imOut.empty()) {
+			std::cerr << "Error: Contrast enhancement failed in ANSCV_ImageContrastEnhance_S!" << std::endl;
+			return 0;
+		}
+
+		{
+			std::unique_lock<std::timed_mutex> lock(timeImageMutex, std::defer_lock);
+			if (!lock.try_lock_for(std::chrono::milliseconds(MUTEX_TIMEOUT_MS))) {
+				std::cerr << "Error: Mutex timeout in ANSCV_ImageContrastEnhance_S!" << std::endl;
+				return -6;
 			}
+			if (!imageIn || !(*imageIn)) {
+				std::cerr << "Error: Image became invalid in ANSCV_ImageContrastEnhance_S!" << std::endl;
+				return -2;
+			}
+			**imageIn = std::move(imOut);
+			return 1;
 		}
 	}
 	catch (const std::exception& e) {
@@ -5153,34 +5176,40 @@ extern "C" __declspec(dllexport) int  ANSCV_ImageContrastEnhance_S(cv::Mat** ima
 extern "C" __declspec(dllexport) int ANSCV_ImageDenoise_S(cv::Mat** imageIn) {
 	gpu_frame_invalidate(imageIn ? *imageIn : nullptr);
 	try {
-		if (!imageIn || !(*imageIn) || (*imageIn)->empty() || !(*imageIn)->data) {
-			std::cerr << "Error: Invalid or empty input image in ANSCV_CloneImage_S!" << std::endl;
-			return -2;
-		}
-		ANSCENTER::ANSOPENCV ansCVInstance;
-		ansCVInstance.Init(""); // Initialize ANSCV instance
-
-		// Perform denoising
-		cv::Mat imOut = ansCVInstance.ImageDenoise(**imageIn);
+		cv::Mat localInput;
 		{
-			//std::lock_guard<std::mutex> lock(imageMutex); // Lock only during shared resource modification
 			std::unique_lock<std::timed_mutex> lock(timeImageMutex, std::defer_lock);
 			if (!lock.try_lock_for(std::chrono::milliseconds(MUTEX_TIMEOUT_MS))) {
-				std::cerr << "Error: Mutex timeout in ANSCV_ReSizeImage_S!" << std::endl;
+				std::cerr << "Error: Mutex timeout in ANSCV_ImageDenoise_S!" << std::endl;
 				return -6;
 			}
-			if (imOut.empty()) {
-				std::cerr << "Error: Denoising processing failed in ANSCV_ImageDenoise_S!" << std::endl;
-				return 0;
+			if (!imageIn || !(*imageIn) || (*imageIn)->empty() || !(*imageIn)->data) {
+				std::cerr << "Error: Invalid or empty input image in ANSCV_ImageDenoise_S!" << std::endl;
+				return -2;
 			}
-			else {
-				if (!imageIn || !(*imageIn) || (*imageIn)->empty() || !(*imageIn)->data) {
-					std::cerr << "Error: Invalid or empty input image in ANSCV_CloneImage_S!" << std::endl;
-					return -2;
-				}
-				**imageIn = std::move(imOut);
-				return 1; // Success
+			localInput = **imageIn;
+		}
+
+		ANSCENTER::ANSOPENCV ansCVInstance;
+		ansCVInstance.Init("");
+		cv::Mat imOut = ansCVInstance.ImageDenoise(localInput);
+		if (imOut.empty()) {
+			std::cerr << "Error: Denoising failed in ANSCV_ImageDenoise_S!" << std::endl;
+			return 0;
+		}
+
+		{
+			std::unique_lock<std::timed_mutex> lock(timeImageMutex, std::defer_lock);
+			if (!lock.try_lock_for(std::chrono::milliseconds(MUTEX_TIMEOUT_MS))) {
+				std::cerr << "Error: Mutex timeout in ANSCV_ImageDenoise_S!" << std::endl;
+				return -6;
 			}
+			if (!imageIn || !(*imageIn)) {
+				std::cerr << "Error: Image became invalid in ANSCV_ImageDenoise_S!" << std::endl;
+				return -2;
+			}
+			**imageIn = std::move(imOut);
+			return 1;
 		}
 	}
 	catch (const std::exception& e) {
@@ -5195,34 +5224,40 @@ extern "C" __declspec(dllexport) int ANSCV_ImageDenoise_S(cv::Mat** imageIn) {
 extern "C" __declspec(dllexport) int ANSCV_ImageRepair_S(cv::Mat** imageIn) {
 	gpu_frame_invalidate(imageIn ? *imageIn : nullptr);
 	try {
-		if (!imageIn || !(*imageIn) || (*imageIn)->empty() || !(*imageIn)->data) {
-			std::cerr << "Error: Invalid or empty input image in ANSCV_CloneImage_S!" << std::endl;
-			return -2;
-		}
-		ANSCENTER::ANSOPENCV ansCVInstance;
-		ansCVInstance.Init(""); // Initialize ANSCV instance
-
-		// Perform image repair
-		cv::Mat imOut = ansCVInstance.ImageRepair(**imageIn);
+		cv::Mat localInput;
 		{
-			//std::lock_guard<std::mutex> lock(imageMutex); // Lock only during shared resource modification
 			std::unique_lock<std::timed_mutex> lock(timeImageMutex, std::defer_lock);
 			if (!lock.try_lock_for(std::chrono::milliseconds(MUTEX_TIMEOUT_MS))) {
-				std::cerr << "Error: Mutex timeout in ANSCV_ReSizeImage_S!" << std::endl;
+				std::cerr << "Error: Mutex timeout in ANSCV_ImageRepair_S!" << std::endl;
 				return -6;
 			}
-			if (imOut.empty()) {
-				std::cerr << "Error: Image repair processing failed in ANSCV_ImageRepair_S!" << std::endl;
-				return 0;
+			if (!imageIn || !(*imageIn) || (*imageIn)->empty() || !(*imageIn)->data) {
+				std::cerr << "Error: Invalid or empty input image in ANSCV_ImageRepair_S!" << std::endl;
+				return -2;
 			}
-			else {
-				if (!imageIn || !(*imageIn) || (*imageIn)->empty() || !(*imageIn)->data) {
-					std::cerr << "Error: Invalid or empty input image in ANSCV_CloneImage_S!" << std::endl;
-					return -2;
-				}
-				**imageIn = std::move(imOut);
-				return 1; // Success
+			localInput = **imageIn;  // header-only cv::Mat copy: pixel data stays shared with **imageIn (NOTE(review): confirm nobody writes those pixels in place while unlocked)
+		}  // end of first lock scope: timeImageMutex is released before the processing below
+
+		ANSCENTER::ANSOPENCV ansCVInstance;  // a fresh instance is created on every call
+		ansCVInstance.Init("");
+		cv::Mat imOut = ansCVInstance.ImageRepair(localInput);  // runs without holding timeImageMutex
+		if (imOut.empty()) {  // an empty result is reported as 0 and **imageIn is left unchanged
+			std::cerr << "Error: Image repair failed in ANSCV_ImageRepair_S!" << std::endl;
+			return 0;
+		}
+
+		{  // second lock scope: write the result back
+			std::unique_lock<std::timed_mutex> lock(timeImageMutex, std::defer_lock);
+			if (!lock.try_lock_for(std::chrono::milliseconds(MUTEX_TIMEOUT_MS))) {
+				std::cerr << "Error: Mutex timeout in ANSCV_ImageRepair_S!" << std::endl;
+				return -6;  // lock not acquired in time; the computed result is discarded
 			}
+			if (!imageIn || !(*imageIn)) {  // re-check the pointers after re-acquiring the lock
+				std::cerr << "Error: Image became invalid in ANSCV_ImageRepair_S!" << std::endl;
+				return -2;
+			}
+			**imageIn = std::move(imOut);  // move the result into the caller's cv::Mat
+			return 1;  // success
 		}
 	}
 	catch (const std::exception& e) {
@@ -5237,34 +5272,40 @@ extern "C" __declspec(dllexport) int ANSCV_ImageRepair_S(cv::Mat** imageIn) {
 extern "C" __declspec(dllexport) int  ANSCV_ImageToGray_S(cv::Mat** imageIn) {
 	gpu_frame_invalidate(imageIn ? *imageIn : nullptr);
 	try {
-		if (!imageIn || !(*imageIn) || (*imageIn)->empty() || !(*imageIn)->data) {
-			std::cerr << "Error: Invalid or empty input image in ANSCV_CloneImage_S!" << std::endl;
-			return -2;
-		}
-		ANSCENTER::ANSOPENCV ansCVInstance;
-		ansCVInstance.Init(""); // Initialize ANSCV instance
-		// Perform white balance correction
-		cv::Mat imOut = ansCVInstance.ToGray(**imageIn);
+		cv::Mat localInput;
 		{
-
 			std::unique_lock<std::timed_mutex> lock(timeImageMutex, std::defer_lock);
 			if (!lock.try_lock_for(std::chrono::milliseconds(MUTEX_TIMEOUT_MS))) {
-				std::cerr << "Error: Mutex timeout in ANSCV_ReSizeImage_S!" << std::endl;
+				std::cerr << "Error: Mutex timeout in ANSCV_ImageToGray_S!" << std::endl;
 				return -6;
 			}
-			//std::lock_guard<std::mutex> lock(imageMutex); // Lock only during shared resource modification
-			if (imOut.empty()) {
-				std::cerr << "Error: White balance processing failed in ANSCV_ImageToGray_S!" << std::endl;
-				return 0;
+			if (!imageIn || !(*imageIn) || (*imageIn)->empty() || !(*imageIn)->data) {
+				std::cerr << "Error: Invalid or empty input image in ANSCV_ImageToGray_S!" << std::endl;
+				return -2;
 			}
-			else {
-				if (!imageIn || !(*imageIn) || (*imageIn)->empty() || !(*imageIn)->data) {
-					std::cerr << "Error: Invalid or empty input image in ANSCV_CloneImage_S!" << std::endl;
-					return -2;
-				}
-				**imageIn = std::move(imOut);
-				return 1;
+			localInput = **imageIn;  // header-only cv::Mat copy: pixel data stays shared with **imageIn (NOTE(review): confirm nobody writes those pixels in place while unlocked)
+		}  // end of first lock scope: timeImageMutex is released before the processing below
+
+		ANSCENTER::ANSOPENCV ansCVInstance;  // a fresh instance is created on every call
+		ansCVInstance.Init("");
+		cv::Mat imOut = ansCVInstance.ToGray(localInput);  // runs without holding timeImageMutex
+		if (imOut.empty()) {  // an empty result is reported as 0 and **imageIn is left unchanged
+			std::cerr << "Error: Gray conversion failed in ANSCV_ImageToGray_S!" << std::endl;
+			return 0;
+		}
+
+		{  // second lock scope: write the result back
+			std::unique_lock<std::timed_mutex> lock(timeImageMutex, std::defer_lock);
+			if (!lock.try_lock_for(std::chrono::milliseconds(MUTEX_TIMEOUT_MS))) {
+				std::cerr << "Error: Mutex timeout in ANSCV_ImageToGray_S!" << std::endl;
+				return -6;  // lock not acquired in time; the computed result is discarded
 			}
+			if (!imageIn || !(*imageIn)) {  // re-check the pointers after re-acquiring the lock
+				std::cerr << "Error: Image became invalid in ANSCV_ImageToGray_S!" << std::endl;
+				return -2;
+			}
+			**imageIn = std::move(imOut);  // move the result into the caller's cv::Mat
+			return 1;  // success
 		}
 	}
 	catch (const std::exception& e) {
@@ -5279,35 +5320,40 @@ extern "C" __declspec(dllexport) int  ANSCV_ImageToGray_S(cv::Mat** imageIn) {
 extern "C" __declspec(dllexport) int  ANSCV_ImageRotate_S(cv::Mat** imageIn, double angle) {
 	gpu_frame_invalidate(imageIn ? *imageIn : nullptr);
 	try {
-		if (!imageIn || !(*imageIn) || (*imageIn)->empty() || !(*imageIn)->data) {
-			std::cerr << "Error: Invalid or empty input image in ANSCV_CloneImage_S!" << std::endl;
-			return -2;
-		}
-		ANSCENTER::ANSOPENCV ansCVInstance;
-		ansCVInstance.Init(""); // Initialize ANSCV instance
-
-		// Perform white balance correction
-		cv::Mat imOut = ansCVInstance.RotateImage(**imageIn, angle);
-		// Assign processed image back to input pointer
+		cv::Mat localInput;
 		{
 			std::unique_lock<std::timed_mutex> lock(timeImageMutex, std::defer_lock);
 			if (!lock.try_lock_for(std::chrono::milliseconds(MUTEX_TIMEOUT_MS))) {
-				std::cerr << "Error: Mutex timeout in ANSCV_ReSizeImage_S!" << std::endl;
+				std::cerr << "Error: Mutex timeout in ANSCV_ImageRotate_S!" << std::endl;
 				return -6;
 			}
-			//std::lock_guard<std::mutex> lock(imageMutex); // Ensure thread safety
-			if (imOut.empty()) {
-				std::cerr << "Error: White balance processing failed in ANSCV_ImageRotate_S!" << std::endl;
-				return 0;
+			if (!imageIn || !(*imageIn) || (*imageIn)->empty() || !(*imageIn)->data) {
+				std::cerr << "Error: Invalid or empty input image in ANSCV_ImageRotate_S!" << std::endl;
+				return -2;
 			}
-			else {
-				if (!imageIn || !(*imageIn) || (*imageIn)->empty() || !(*imageIn)->data) {
-					std::cerr << "Error: Invalid or empty input image in ANSCV_CloneImage_S!" << std::endl;
-					return -2;
-				}
-				**imageIn = std::move(imOut);
-				return 1;
+			localInput = **imageIn;  // header-only cv::Mat copy: pixel data stays shared with **imageIn (NOTE(review): confirm nobody writes those pixels in place while unlocked)
+		}  // end of first lock scope: timeImageMutex is released before the processing below
+
+		ANSCENTER::ANSOPENCV ansCVInstance;  // a fresh instance is created on every call
+		ansCVInstance.Init("");
+		cv::Mat imOut = ansCVInstance.RotateImage(localInput, angle);  // runs without holding timeImageMutex
+		if (imOut.empty()) {  // an empty result is reported as 0 and **imageIn is left unchanged
+			std::cerr << "Error: Rotation failed in ANSCV_ImageRotate_S!" << std::endl;
+			return 0;
+		}
+
+		{  // second lock scope: write the result back
+			std::unique_lock<std::timed_mutex> lock(timeImageMutex, std::defer_lock);
+			if (!lock.try_lock_for(std::chrono::milliseconds(MUTEX_TIMEOUT_MS))) {
+				std::cerr << "Error: Mutex timeout in ANSCV_ImageRotate_S!" << std::endl;
+				return -6;  // lock not acquired in time; the computed result is discarded
 			}
+			if (!imageIn || !(*imageIn)) {  // re-check the pointers after re-acquiring the lock
+				std::cerr << "Error: Image became invalid in ANSCV_ImageRotate_S!" << std::endl;
+				return -2;
+			}
+			**imageIn = std::move(imOut);  // move the result into the caller's cv::Mat
+			return 1;  // success
 		}
 	}
 	catch (const std::exception& e) {
@@ -5323,35 +5369,40 @@ extern "C" __declspec(dllexport) int  ANSCV_ImageRotate_S(cv::Mat** imageIn, dou
 extern "C" __declspec(dllexport) int  ANSCV_ImageFlip_S(cv::Mat** imageIn, int flipCode) {
 	gpu_frame_invalidate(imageIn ? *imageIn : nullptr);
 	try {
-		if (!imageIn || !(*imageIn) || (*imageIn)->empty() || !(*imageIn)->data) {
-			std::cerr << "Error: Invalid or empty input image in ANSCV_CloneImage_S!" << std::endl;
-			return -2;
-		}
-		ANSCENTER::ANSOPENCV ansCVInstance;
-		ansCVInstance.Init(""); // Initialize ANSCV instance
-
-		// Perform white balance correction
-		cv::Mat imOut = ansCVInstance.FlipImage(**imageIn, flipCode);
-		// Assign processed image back to input pointer
+		cv::Mat localInput;
 		{
 			std::unique_lock<std::timed_mutex> lock(timeImageMutex, std::defer_lock);
 			if (!lock.try_lock_for(std::chrono::milliseconds(MUTEX_TIMEOUT_MS))) {
-				std::cerr << "Error: Mutex timeout in ANSCV_ReSizeImage_S!" << std::endl;
+				std::cerr << "Error: Mutex timeout in ANSCV_ImageFlip_S!" << std::endl;
 				return -6;
 			}
-			//std::lock_guard<std::mutex> lock(imageMutex); // Ensure thread safety
-			if (imOut.empty()) {
-				std::cerr << "Error: White balance processing failed in ANSCV_ImageFlip_S!" << std::endl;
-				return 0;
+			if (!imageIn || !(*imageIn) || (*imageIn)->empty() || !(*imageIn)->data) {
+				std::cerr << "Error: Invalid or empty input image in ANSCV_ImageFlip_S!" << std::endl;
+				return -2;
 			}
-			else {
-				if (!imageIn || !(*imageIn) || (*imageIn)->empty() || !(*imageIn)->data) {
-					std::cerr << "Error: Invalid or empty input image in ANSCV_CloneImage_S!" << std::endl;
-					return -2;
-				}
-				**imageIn = std::move(imOut);
-				return 1;
+			localInput = **imageIn;  // header-only cv::Mat copy: pixel data stays shared with **imageIn (NOTE(review): confirm nobody writes those pixels in place while unlocked)
+		}  // end of first lock scope: timeImageMutex is released before the processing below
+
+		ANSCENTER::ANSOPENCV ansCVInstance;  // a fresh instance is created on every call
+		ansCVInstance.Init("");
+		cv::Mat imOut = ansCVInstance.FlipImage(localInput, flipCode);  // runs without holding timeImageMutex
+		if (imOut.empty()) {  // an empty result is reported as 0 and **imageIn is left unchanged
+			std::cerr << "Error: Flip failed in ANSCV_ImageFlip_S!" << std::endl;
+			return 0;
+		}
+
+		{  // second lock scope: write the result back
+			std::unique_lock<std::timed_mutex> lock(timeImageMutex, std::defer_lock);
+			if (!lock.try_lock_for(std::chrono::milliseconds(MUTEX_TIMEOUT_MS))) {
+				std::cerr << "Error: Mutex timeout in ANSCV_ImageFlip_S!" << std::endl;
+				return -6;  // lock not acquired in time; the computed result is discarded
 			}
+			if (!imageIn || !(*imageIn)) {  // re-check the pointers after re-acquiring the lock
+				std::cerr << "Error: Image became invalid in ANSCV_ImageFlip_S!" << std::endl;
+				return -2;
+			}
+			**imageIn = std::move(imOut);  // move the result into the caller's cv::Mat
+			return 1;  // success
 		}
 	}
 	catch (const std::exception& e) {
@@ -5368,36 +5419,41 @@ extern "C" __declspec(dllexport) int  ANSCV_ImageFlip_S(cv::Mat** imageIn, int f
 extern "C" __declspec(dllexport) int  ANSCV_ImageBlurObjects_S(cv::Mat** imageIn, const char* strBboxes) {
 	gpu_frame_invalidate(imageIn ? *imageIn : nullptr);
 	try {
-		if (!imageIn || !(*imageIn) || (*imageIn)->empty() || !(*imageIn)->data) {
-			std::cerr << "Error: Invalid or empty input image in ANSCV_CloneImage_S!" << std::endl;
-			return -2;
-		}
-		ANSCENTER::ANSOPENCV ansCVInstance;
-		ansCVInstance.Init(""); // Initialize ANSCV instance
-		std::vector<cv::Rect> objects = ansCVInstance.GetBoundingBoxes(strBboxes);
-		// Perform white balance correction
-		cv::Mat imOut = ansCVInstance.BlurObjects(**imageIn, objects);
-
-		// Assign processed image back to input pointer
+		cv::Mat localInput;
 		{
 			std::unique_lock<std::timed_mutex> lock(timeImageMutex, std::defer_lock);
 			if (!lock.try_lock_for(std::chrono::milliseconds(MUTEX_TIMEOUT_MS))) {
-				std::cerr << "Error: Mutex timeout in ANSCV_ReSizeImage_S!" << std::endl;
+				std::cerr << "Error: Mutex timeout in ANSCV_ImageBlurObjects_S!" << std::endl;
 				return -6;
 			}
-			//std::lock_guard<std::mutex> lock(imageMutex); // Ensure thread safety
-			if (imOut.empty()) {
-				std::cerr << "Error: White balance processing failed in ANSCV_ImageBlurObjects_S!" << std::endl;
-				return 0;
+			if (!imageIn || !(*imageIn) || (*imageIn)->empty() || !(*imageIn)->data) {
+				std::cerr << "Error: Invalid or empty input image in ANSCV_ImageBlurObjects_S!" << std::endl;
+				return -2;
 			}
-			else {
-				if (!imageIn || !(*imageIn) || (*imageIn)->empty() || !(*imageIn)->data) {
-					std::cerr << "Error: Invalid or empty input image in ANSCV_CloneImage_S!" << std::endl;
-					return -2;
-				}
-				**imageIn = std::move(imOut);
-				return 1;
+			localInput = **imageIn;  // header-only cv::Mat copy: pixel data stays shared with **imageIn (NOTE(review): confirm nobody writes those pixels in place while unlocked)
+		}  // end of first lock scope: timeImageMutex is released before the processing below
+
+		ANSCENTER::ANSOPENCV ansCVInstance;  // a fresh instance is created on every call
+		ansCVInstance.Init("");
+		std::vector<cv::Rect> objects = ansCVInstance.GetBoundingBoxes(strBboxes);  // strBboxes is forwarded unchecked (NOTE(review): confirm GetBoundingBoxes tolerates a null pointer)
+		cv::Mat imOut = ansCVInstance.BlurObjects(localInput, objects);  // runs without holding timeImageMutex
+		if (imOut.empty()) {  // an empty result is reported as 0 and **imageIn is left unchanged
+			std::cerr << "Error: BlurObjects failed in ANSCV_ImageBlurObjects_S!" << std::endl;
+			return 0;
+		}
+
+		{  // second lock scope: write the result back
+			std::unique_lock<std::timed_mutex> lock(timeImageMutex, std::defer_lock);
+			if (!lock.try_lock_for(std::chrono::milliseconds(MUTEX_TIMEOUT_MS))) {
+				std::cerr << "Error: Mutex timeout in ANSCV_ImageBlurObjects_S!" << std::endl;
+				return -6;  // lock not acquired in time; the computed result is discarded
 			}
+			if (!imageIn || !(*imageIn)) {  // re-check the pointers after re-acquiring the lock
+				std::cerr << "Error: Image became invalid in ANSCV_ImageBlurObjects_S!" << std::endl;
+				return -2;
+			}
+			**imageIn = std::move(imOut);  // move the result into the caller's cv::Mat
+			return 1;  // success
 		}
 	}
 	catch (const std::exception& e) {
@@ -5413,35 +5469,41 @@ extern "C" __declspec(dllexport) int  ANSCV_ImageBlurObjects_S(cv::Mat** imageIn
 extern "C" __declspec(dllexport) int  ANSCV_ImageBlurBackground_S(cv::Mat** imageIn, const char* strBboxes) {
 	gpu_frame_invalidate(imageIn ? *imageIn : nullptr);
 	try {
-		if (!imageIn || !(*imageIn) || (*imageIn)->empty() || !(*imageIn)->data) {
-			std::cerr << "Error: Invalid or empty input image in ANSCV_CloneImage_S!" << std::endl;
-			return -2;
-		}
-		ANSCENTER::ANSOPENCV ansCVInstance;
-		ansCVInstance.Init(""); // Initialize ANSCV instance
-		std::vector<cv::Rect> objects = ansCVInstance.GetBoundingBoxes(strBboxes);
-		// Perform white balance correction
-		cv::Mat imOut = ansCVInstance.BlurBackground(**imageIn, objects);
-		// Assign processed image back to input pointer
+		cv::Mat localInput;
 		{
 			std::unique_lock<std::timed_mutex> lock(timeImageMutex, std::defer_lock);
 			if (!lock.try_lock_for(std::chrono::milliseconds(MUTEX_TIMEOUT_MS))) {
-				std::cerr << "Error: Mutex timeout in ANSCV_ReSizeImage_S!" << std::endl;
+				std::cerr << "Error: Mutex timeout in ANSCV_ImageBlurBackground_S!" << std::endl;
 				return -6;
 			}
-			//std::lock_guard<std::mutex> lock(imageMutex); // Ensure thread safety
-			if (imOut.empty()) {
-				std::cerr << "Error: White balance processing failed in ANSCV_ImageBlurBackground_S!" << std::endl;
-				return 0;
+			if (!imageIn || !(*imageIn) || (*imageIn)->empty() || !(*imageIn)->data) {
+				std::cerr << "Error: Invalid or empty input image in ANSCV_ImageBlurBackground_S!" << std::endl;
+				return -2;
 			}
-			else {
-				if (!imageIn || !(*imageIn) || (*imageIn)->empty() || !(*imageIn)->data) {
-					std::cerr << "Error: Invalid or empty input image in ANSCV_CloneImage_S!" << std::endl;
-					return -2;
-				}
-				**imageIn = std::move(imOut);
-				return 1;
+			localInput = **imageIn;  // header-only cv::Mat copy: pixel data stays shared with **imageIn (NOTE(review): confirm nobody writes those pixels in place while unlocked)
+		}  // end of first lock scope: timeImageMutex is released before the processing below
+
+		ANSCENTER::ANSOPENCV ansCVInstance;  // a fresh instance is created on every call
+		ansCVInstance.Init("");
+		std::vector<cv::Rect> objects = ansCVInstance.GetBoundingBoxes(strBboxes);  // strBboxes is forwarded unchecked (NOTE(review): confirm GetBoundingBoxes tolerates a null pointer)
+		cv::Mat imOut = ansCVInstance.BlurBackground(localInput, objects);  // runs without holding timeImageMutex
+		if (imOut.empty()) {  // an empty result is reported as 0 and **imageIn is left unchanged
+			std::cerr << "Error: BlurBackground failed in ANSCV_ImageBlurBackground_S!" << std::endl;
+			return 0;
+		}
+
+		{  // second lock scope: write the result back
+			std::unique_lock<std::timed_mutex> lock(timeImageMutex, std::defer_lock);
+			if (!lock.try_lock_for(std::chrono::milliseconds(MUTEX_TIMEOUT_MS))) {
+				std::cerr << "Error: Mutex timeout in ANSCV_ImageBlurBackground_S!" << std::endl;
+				return -6;  // lock not acquired in time; the computed result is discarded
 			}
+			if (!imageIn || !(*imageIn)) {  // re-check the pointers after re-acquiring the lock
+				std::cerr << "Error: Image became invalid in ANSCV_ImageBlurBackground_S!" << std::endl;
+				return -2;
+			}
+			**imageIn = std::move(imOut);  // move the result into the caller's cv::Mat
+			return 1;  // success
 		}
 	}
 	catch (const std::exception& e) {
@@ -5456,44 +5518,37 @@ extern "C" __declspec(dllexport) int  ANSCV_ImageBlurBackground_S(cv::Mat** imag
 
 extern "C" __declspec(dllexport) int ANSCV_ImageQRDecoder_S(cv::Mat** imageIn, int maxImageWidth, const char* strBboxes, LStrHandle detectedQRText) {
 	try {
-		if (!imageIn || !(*imageIn) || (*imageIn)->empty() || !(*imageIn)->data) {
-			std::cerr << "Error: Invalid or empty input image in ANSCV_CloneImage_S!" << std::endl;
-			return -2;
-		}
-		ANSCENTER::ANSOPENCV ansCVInstance;
-		ansCVInstance.Init(""); // Initialize ANSCV instance
-		std::vector<cv::Rect> Bboxes = ansCVInstance.GetBoundingBoxes(strBboxes);
-		// Decode the QR code
-		std::string qrText = ansCVInstance.QRDecoderWithBBox(**imageIn, maxImageWidth, Bboxes);
+		cv::Mat localInput;
 		{
-			// Assign QR decoded text to detectedQRText handle
 			std::unique_lock<std::timed_mutex> lock(timeImageMutex, std::defer_lock);
 			if (!lock.try_lock_for(std::chrono::milliseconds(MUTEX_TIMEOUT_MS))) {
-				std::cerr << "Error: Mutex timeout in ANSCV_ReSizeImage_S!" << std::endl;
+				std::cerr << "Error: Mutex timeout in ANSCV_ImageQRDecoder_S!" << std::endl;
 				return -6;
 			}
-			//std::lock_guard<std::mutex> lock(imageMutex); // Ensure thread safety when modifying the handle
-			if (qrText.empty()) {
-				std::cerr << "Error: QR decoding failed in ANSCV_ImageQRDecoder_S!" << std::endl;
-				return 0;
-			}
-			int size = qrText.length();
-			if (size > 0) {
-				MgErr error;
-				error = DSSetHandleSize(detectedQRText, sizeof(int32) + size * sizeof(uChar));
-				if (error == noErr) {
-					(*detectedQRText)->cnt = size;
-					memcpy((*detectedQRText)->str, qrText.c_str(), size);
-					return 1; // Success
-				}
-				else {
-					return 0; // Error setting handle size
-				}
-			}
-			else {
-				return 0; // No QR code found
+			if (!imageIn || !(*imageIn) || (*imageIn)->empty() || !(*imageIn)->data) {
+				std::cerr << "Error: Invalid or empty input image in ANSCV_ImageQRDecoder_S!" << std::endl;
+				return -2;
 			}
+			localInput = **imageIn;  // header-only cv::Mat copy taken under the lock; pixel data stays shared with **imageIn
 		}
+
+		ANSCENTER::ANSOPENCV ansCVInstance;  // a fresh instance is created on every call
+		ansCVInstance.Init("");
+		std::vector<cv::Rect> Bboxes = ansCVInstance.GetBoundingBoxes(strBboxes);  // strBboxes is forwarded unchecked (NOTE(review): confirm GetBoundingBoxes tolerates a null pointer)
+		std::string qrText = ansCVInstance.QRDecoderWithBBox(localInput, maxImageWidth, Bboxes);  // runs without holding timeImageMutex
+		if (qrText.empty()) {  // empty text is reported as 0; the output handle is not touched
+			std::cerr << "Error: QR decoding failed in ANSCV_ImageQRDecoder_S!" << std::endl;
+			return 0;
+		}
+
+		// detectedQRText is a caller-owned LabVIEW handle; no global lock needed.
+		const int size = static_cast<int>(qrText.length());
+		if (size <= 0) return 0;  // qrText is non-empty here, so this only trips if the cast to int produced a non-positive value
+		MgErr error = DSSetHandleSize(detectedQRText, sizeof(int32) + size * sizeof(uChar));  // room for the int32 count plus size bytes of text
+		if (error != noErr) return 0;  // resize failed; nothing is written into the handle
+		(*detectedQRText)->cnt = size;
+		memcpy((*detectedQRText)->str, qrText.c_str(), size);  // copies exactly size bytes; no terminating NUL is written
+		return 1;  // success
 	}
 	catch (const std::exception& e) {
 		std::cerr << "Error: Exception occurred in ANSCV_ImageQRDecoder_S: " << e.what() << std::endl;
@@ -5507,47 +5562,41 @@ extern "C" __declspec(dllexport) int ANSCV_ImageQRDecoder_S(cv::Mat** imageIn, i
 
 extern "C" __declspec(dllexport) int ANSCV_ImagePatternMatchs_S(cv::Mat** imageIn, const char* templateFilePath, double threshold, LStrHandle detectedMatchedLocations) {
 	try {
-		if (!imageIn || !(*imageIn) || (*imageIn)->empty() || !(*imageIn)->data) {
-			std::cerr << "Error: Invalid or empty input image in ANSCV_CloneImage_S!" << std::endl;
-			return -2;
-		}
-		ANSCENTER::ANSOPENCV ansCVInstance;
-		ansCVInstance.Init(""); // Initialize ANSCV instance
-
-		// Load template image
-		cv::Mat templateImage = cv::imread(templateFilePath, cv::IMREAD_COLOR);
-		if (templateImage.empty()) {
-			std::cerr << "Error: Failed to load template image from " << templateFilePath << std::endl;
-			return -2; // Return error if template cannot be loaded
-		}
-
-		// Perform pattern matching
-		std::string strMatchedLocations = ansCVInstance.PatternMatches(**imageIn, templateImage, threshold);
+		cv::Mat localInput;
 		{
 			std::unique_lock<std::timed_mutex> lock(timeImageMutex, std::defer_lock);
 			if (!lock.try_lock_for(std::chrono::milliseconds(MUTEX_TIMEOUT_MS))) {
-				std::cerr << "Error: Mutex timeout in ANSCV_ReSizeImage_S!" << std::endl;
+				std::cerr << "Error: Mutex timeout in ANSCV_ImagePatternMatchs_S!" << std::endl;
 				return -6;
 			}
-			//std::lock_guard<std::mutex> lock(imageMutex); // Ensure thread safety when modifying detectedMatchedLocations
-			int size = strMatchedLocations.length();
-			if (size > 0) {
-				MgErr error;
-				error = DSSetHandleSize(detectedMatchedLocations, sizeof(int32) + size * sizeof(uChar));
-				if (error == noErr) {
-					(*detectedMatchedLocations)->cnt = size;
-					memcpy((*detectedMatchedLocations)->str, strMatchedLocations.c_str(), size);
-					return 1; // Success
-				}
-				else {
-					std::cerr << "Error: Failed to set handle size for detectedMatchedLocations!" << std::endl;
-					return 0; // Error setting handle size
-				}
-			}
-			else {
-				return 0; // No matches found
+			if (!imageIn || !(*imageIn) || (*imageIn)->empty() || !(*imageIn)->data) {
+				std::cerr << "Error: Invalid or empty input image in ANSCV_ImagePatternMatchs_S!" << std::endl;
+				return -2;
 			}
+			localInput = **imageIn;  // header-only cv::Mat copy taken under the lock; pixel data stays shared with **imageIn
 		}
+
+		cv::Mat templateImage = templateFilePath ? cv::imread(templateFilePath, cv::IMREAD_COLOR) : cv::Mat();  // a null path yields an empty Mat and takes the load-failure branch below
+		if (templateImage.empty()) {
+			std::cerr << "Error: Failed to load template image from " << (templateFilePath ? templateFilePath : "(null)") << std::endl;
+			return -2;
+		}
+
+		ANSCENTER::ANSOPENCV ansCVInstance;  // a fresh instance is created on every call
+		ansCVInstance.Init("");
+		std::string strMatchedLocations = ansCVInstance.PatternMatches(localInput, templateImage, threshold);  // runs without holding timeImageMutex
+
+		// detectedMatchedLocations is a caller-owned LabVIEW handle; no global lock needed.
+		const int size = static_cast<int>(strMatchedLocations.length());
+		if (size <= 0) return 0;  // empty result string: nothing to copy out, the handle is not touched
+		MgErr error = DSSetHandleSize(detectedMatchedLocations, sizeof(int32) + size * sizeof(uChar));  // room for the int32 count plus size bytes of text
+		if (error != noErr) {
+			std::cerr << "Error: Failed to set handle size for detectedMatchedLocations!" << std::endl;
+			return 0;
+		}
+		(*detectedMatchedLocations)->cnt = size;
+		memcpy((*detectedMatchedLocations)->str, strMatchedLocations.c_str(), size);  // copies exactly size bytes; no terminating NUL is written
+		return 1;  // success
 	}
 	catch (const std::exception& e) {
 		std::cerr << "Error: Exception occurred in ANSCV_ImagePatternMatchs_S: " << e.what() << std::endl;
diff --git a/modules/ANSCV/ANSOpenCV.h b/modules/ANSCV/ANSOpenCV.h
index ab41f51..a646b0d 100644
--- a/modules/ANSCV/ANSOpenCV.h
+++ b/modules/ANSCV/ANSOpenCV.h
@@ -155,7 +155,9 @@ namespace ANSCENTER
 		std::recursive_mutex		_mutex;
 
 		//std::once_flag				licenseOnceFlag; // For one-time license check
-		bool						_licenseValid = false;
+		// Atomic so lock-free methods (ImageResize, ImageResizeWithRatio,
+		// MatToBinaryData, EncodeJpegString) can read it without _mutex.
+		std::atomic<bool>			_licenseValid{ false };  // NOTE(review): requires <atomic> to be reachable from this header - confirm the include exists
 	public:
 	};
 }
diff --git a/modules/ANSCV/ANSRTMP.cpp b/modules/ANSCV/ANSRTMP.cpp
index dfe996e..b88f655 100644
--- a/modules/ANSCV/ANSRTMP.cpp
+++ b/modules/ANSCV/ANSRTMP.cpp
@@ -2,6 +2,7 @@
 #include "ANSMatRegistry.h"
 #include "ANSGpuFrameOps.h"
 #include "ANSCVVendorGate.h"  // anscv_vendor_gate::IsNvidiaGpuAvailable()
+#include "ANSLicense.h"       // ANS_DBG macro
 #include <memory>
 #include "media_codec.h"
 #include <cstdint>
@@ -245,6 +246,23 @@ namespace ANSCENTER {
             return _pLastFrame;  // Shallow copy (fast)
         }
 
+        // Early stale-out: if the decoder hasn't produced a frame in 5s the
+        // source is dead. Skip _playerClient->getImage() entirely and return
+        // the cached frame with unchanged _pts so LabVIEW sees STALE PTS one
+        // poll earlier and triggers reconnect.
+        if (!_pLastFrame.empty()) {  // only applies when a cached frame exists to hand back
+            double ageMs = _playerClient->getLastFrameAgeMs();  // NOTE(review): assumes _playerClient is non-null at this point - confirm against the checks earlier in this function
+            if (ageMs >= 5000.0) {  // the 5 s threshold described above
+                ANS_DBG("RTMP_GetImage",
+                        "EARLY STALE: ageMs=%.1f pts=%lld url=%s - skipping getImage()",
+                        ageMs, (long long)_pts, _url.c_str());
+                width = _imageWidth;  // report the cached dimensions and timestamp unchanged
+                height = _imageHeight;
+                pts = _pts;
+                return _pLastFrame;
+            }
+        }
+
         int imageW = 0, imageH = 0;
         int64_t currentPts = 0;
 
diff --git a/modules/ANSCV/ANSSRT.cpp b/modules/ANSCV/ANSSRT.cpp
index b25d809..01621d3 100644
--- a/modules/ANSCV/ANSSRT.cpp
+++ b/modules/ANSCV/ANSSRT.cpp
@@ -2,6 +2,7 @@
 #include "ANSMatRegistry.h"
 #include "ANSGpuFrameOps.h"
 #include "ANSCVVendorGate.h"  // anscv_vendor_gate::IsNvidiaGpuAvailable()
+#include "ANSLicense.h"       // ANS_DBG macro
 #include <memory>
 #include "media_codec.h"
 #include <cstdint>
@@ -253,6 +254,23 @@ namespace ANSCENTER {
             return _pLastFrame;  // Shallow copy (fast)
         }
 
+        // Early stale-out: if the decoder hasn't produced a frame in 5s the
+        // source is dead. Skip _playerClient->getImage() entirely and return
+        // the cached frame with unchanged _pts so LabVIEW sees STALE PTS one
+        // poll earlier and triggers reconnect.
+        if (!_pLastFrame.empty()) {  // only applies when a cached frame exists to hand back
+            double ageMs = _playerClient->getLastFrameAgeMs();  // NOTE(review): assumes _playerClient is non-null at this point - confirm against the checks earlier in this function
+            if (ageMs >= 5000.0) {  // the 5 s threshold described above
+                ANS_DBG("SRT_GetImage",
+                        "EARLY STALE: ageMs=%.1f pts=%lld url=%s - skipping getImage()",
+                        ageMs, (long long)_pts, _url.c_str());
+                width = _imageWidth;  // report the cached dimensions and timestamp unchanged
+                height = _imageHeight;
+                pts = _pts;
+                return _pLastFrame;
+            }
+        }
+
         int imageW = 0, imageH = 0;
         int64_t currentPts = 0;
 
diff --git a/modules/ANSFR/ANSFaceRecognizer.cpp b/modules/ANSFR/ANSFaceRecognizer.cpp
index b80ef53..8c9e21e 100644
--- a/modules/ANSFR/ANSFaceRecognizer.cpp
+++ b/modules/ANSFR/ANSFaceRecognizer.cpp
@@ -91,9 +91,14 @@ namespace ANSCENTER {
 				}
 
 				if (!m_trtEngine) {
-					// Enable batch support
-					m_options.optBatchSize = 8;
-					m_options.maxBatchSize = 32;
+					// Enable batch support. maxBatchSize controls the TRT workspace
+					// allocation (~linear in batch); opt is the kernel-selection sweet
+					// spot. Max=4 was picked to fit 4 concurrent face crops per frame
+					// comfortably on 8 GB GPUs while freeing ~1.5 GB VRAM vs max=32
+					// — most scenes have ≤4 faces visible, so throughput cost is
+					// near-zero (amortized per-face latency drops too at lower batch).
+					m_options.optBatchSize = 4;  // opt is set equal to max
+					m_options.maxBatchSize = 4;  // NOTE(review): frames with more than 4 faces presumably need several inference calls - confirm the caller splits batches
 
 					m_options.maxInputHeight = GPU_FACE_HEIGHT;
 					m_options.minInputHeight = GPU_FACE_HEIGHT;
diff --git a/modules/ANSLPR/ANSLPR_OD.cpp b/modules/ANSLPR/ANSLPR_OD.cpp
index fa7b994..452de04 100644
--- a/modules/ANSLPR/ANSLPR_OD.cpp
+++ b/modules/ANSLPR/ANSLPR_OD.cpp
@@ -534,8 +534,12 @@ namespace ANSCENTER {
 
 			_ocrModelConfig.inpHeight = 640;
 			_ocrModelConfig.inpWidth = 640;
-			_ocrModelConfig.gpuOptBatchSize = 8;
-			_ocrModelConfig.gpuMaxBatchSize = 32;   // desired max; engine builder auto-caps by GPU VRAM
+			// Max=4 chosen to fit typical plate counts per frame on 8 GB GPUs.
+			// Was opt=8/max=32 which sized TRT workspace for 32 concurrent plates
+			// (~1 GB for this model alone). Cap of 4 is still >= the usual 1–3
+			// plates visible per camera frame, amortized throughput unchanged.
+			_ocrModelConfig.gpuOptBatchSize = 4;  // opt is set equal to max (NOTE(review): confirm callers split frames with more than 4 plates)
+			_ocrModelConfig.gpuMaxBatchSize = 4;    // desired max; engine builder auto-caps by GPU VRAM
 			_ocrModelConfig.maxInputHeight = 640;
 			_ocrModelConfig.maxInputWidth = 640;
 			_ocrModelConfig.minInputHeight = 640;
@@ -545,8 +549,9 @@ namespace ANSCENTER {
 
 			_lpColourModelConfig.inpHeight = 224;
 			_lpColourModelConfig.inpWidth = 224;
-			_lpColourModelConfig.gpuOptBatchSize = 8;
-			_lpColourModelConfig.gpuMaxBatchSize = 32;   // desired max; engine builder auto-caps by GPU VRAM
+			// See _ocrModelConfig above — matching batch cap for consistency.
+			_lpColourModelConfig.gpuOptBatchSize = 4;  // opt is set equal to max, same values as _ocrModelConfig
+			_lpColourModelConfig.gpuMaxBatchSize = 4;    // desired max; engine builder auto-caps by GPU VRAM
 			_lpColourModelConfig.maxInputHeight = 224;
 			_lpColourModelConfig.maxInputWidth = 224;
 			_lpColourModelConfig.minInputHeight = 224;
diff --git a/modules/ANSOCR/ANSRTOCR/RTOCRRecognizer.cpp b/modules/ANSOCR/ANSRTOCR/RTOCRRecognizer.cpp
index 758963d..06f8b4c 100644
--- a/modules/ANSOCR/ANSRTOCR/RTOCRRecognizer.cpp
+++ b/modules/ANSOCR/ANSRTOCR/RTOCRRecognizer.cpp
@@ -28,8 +28,11 @@ bool RTOCRRecognizer::Initialize(const std::string& onnxPath, const std::string&
         ANSCENTER::Options options;
         options.deviceIndex = gpuId;
         options.precision = ANSCENTER::Precision::FP16;
-        options.maxBatchSize = 1;
-        options.optBatchSize = 1;
+        // maxBatch=4 matches FaceRecognizer / ALPR configuration — allows the
+        // recognizer to process up to 4 detected text lines in one call,
+        // amortizing per-invocation overhead while keeping TRT workspace small.
+        options.maxBatchSize = 4;  // NOTE(review): engines previously built with maxBatchSize=1 presumably need a rebuild - confirm the engine cache accounts for batch size
+        options.optBatchSize = 4;  // opt is set equal to max
 
         // Fixed height, dynamic width for recognition
         options.minInputHeight = imgH_;
diff --git a/modules/ANSOCR/dllmain.cpp b/modules/ANSOCR/dllmain.cpp
index 6dbc522..6628f94 100644
--- a/modules/ANSOCR/dllmain.cpp
+++ b/modules/ANSOCR/dllmain.cpp
@@ -185,11 +185,22 @@ extern "C" ANSOCR_API int		CreateANSOCRHandleEx(ANSCENTER::ANSOCRBase** Handle,
 		ANSCENTER::ANSLibsLoader::Initialize();
 		ANSCENTER::EngineType engineType = ANSCENTER::ANSLicenseHelper::CheckHardwareInformation();
 		{
+			// Describe the backend the engine-selector below will actually choose
+			// for this (hardware, engineMode) combination. Previous versions of
+			// this log claimed "TensorRT OCR enabled" based on hardware alone,
+			// which was misleading because engineMode=0 (auto) unconditionally
+			// picked ONNX — users saw the log and assumed TRT was running.
+			const bool isNvidia = (engineType == ANSCENTER::EngineType::NVIDIA_GPU);  // local to this logging scope
+			const bool willUseTRT =
+				isNvidia && (engineMode == 0 /* auto → TRT on NVIDIA */ ||
+				             engineMode == 1 /* GPU  → TRT on NVIDIA */);  // false for every other engineMode (NOTE(review): confirm this matches the switch's default case, which is not visible here)
 			const char* vendorTag =
-				engineType == ANSCENTER::EngineType::NVIDIA_GPU   ? "NVIDIA_GPU (TensorRT OCR enabled)" :
-				engineType == ANSCENTER::EngineType::AMD_GPU      ? "AMD_GPU (ONNX Runtime / DirectML, TensorRT OCR DISABLED)" :
-				engineType == ANSCENTER::EngineType::OPENVINO_GPU ? "OPENVINO_GPU (ONNX Runtime / OpenVINO, TensorRT OCR DISABLED)" :
-				                                                    "CPU (ONNX Runtime, TensorRT OCR DISABLED)";
+				engineType == ANSCENTER::EngineType::NVIDIA_GPU
+					? (willUseTRT ? "NVIDIA_GPU (TensorRT OCR active)"
+					              : "NVIDIA_GPU (TensorRT available, but engineMode forces ONNX)")
+				: engineType == ANSCENTER::EngineType::AMD_GPU      ? "AMD_GPU (ONNX Runtime / DirectML, TensorRT OCR unavailable)"
+				: engineType == ANSCENTER::EngineType::OPENVINO_GPU ? "OPENVINO_GPU (ONNX Runtime / OpenVINO, TensorRT OCR unavailable)"
+				:                                                     "CPU (ONNX Runtime, TensorRT OCR unavailable)";
 			char buf[192];
 			snprintf(buf, sizeof(buf),
 				"[ANSOCR] CreateANSOCRHandleEx: detected engineType=%d [%s], engineMode=%d\n",
@@ -230,10 +241,23 @@ extern "C" ANSOCR_API int		CreateANSOCRHandleEx(ANSCENTER::ANSOCRBase** Handle,
 			// select, including DirectML for AMD).
 			const bool isNvidia = (engineType == ANSCENTER::EngineType::NVIDIA_GPU);
 			switch (engineMode) {
-			case 0:// Auto-detect, always use ONNX for better compatibility, especially on AMD GPUs and high-res images
-				(*Handle) = new ANSCENTER::ANSONNXOCR();
+			case 0: // Auto-detect — prefer TensorRT on NVIDIA, ONNX elsewhere.
+				// Previous policy was "always ONNX" for cross-platform safety,
+				// but on NVIDIA that defeated the point: each ANSONNXOCR handle
+				// allocates its own cls/dec/rec OrtSessions (no dedupe), which
+				// wasted ~300–600 MB VRAM per extra instance and ran ~2× slower
+				// than ANSRTOCR's shared-engine path via EnginePoolManager.
+				if (isNvidia) {
+					limitSideLen = 960;  // same value the explicit GPU case (case 1) sets on NVIDIA
+					(*Handle) = new ANSCENTER::ANSRTOCR();  // same backend choice as case 1 on NVIDIA
+				} else {
+					// AMD / Intel / CPU — ANSRTOCR hard-requires CUDA and would
+					// crash. ANSONNXOCR auto-picks the correct ORT EP
+					// (DirectML on AMD, OpenVINO on Intel, CPU otherwise).
+					(*Handle) = new ANSCENTER::ANSONNXOCR();  // limitSideLen is left at its prior value on this path
+				}
 				break;
-			case 1:// GPU — use TensorRT engine ONLY on NVIDIA hardware.
+			case 1: // GPU — use TensorRT engine ONLY on NVIDIA hardware.
 				if (isNvidia) {
 					limitSideLen = 960;
 					(*Handle) = new ANSCENTER::ANSRTOCR();
@@ -244,7 +268,7 @@ extern "C" ANSOCR_API int		CreateANSOCRHandleEx(ANSCENTER::ANSOCRBase** Handle,
 					(*Handle) = new ANSCENTER::ANSONNXOCR();
 				}
 				break;
-			case 2:// CPU
+			case 2: // CPU
 				(*Handle) = new ANSCENTER::ANSONNXOCR();
 				break;
 			default:
diff --git a/modules/ANSODEngine/dllmain.cpp b/modules/ANSODEngine/dllmain.cpp
index c62c34f..88fcd2d 100644
--- a/modules/ANSODEngine/dllmain.cpp
+++ b/modules/ANSODEngine/dllmain.cpp
@@ -426,27 +426,37 @@ extern "C" ANSODENGINE_API std::string  CreateANSODHandle(ANSCENTER::ANSODBase**
 	ANSCENTER::EngineType engineType = ANSCENTER::ANSLicenseHelper::CheckHardwareInformation();
 	if (autoDetectEngine==-1)engineType=ANSCENTER::EngineType::CPU;// We force to use CPU
 
-	//Force modelType to ANSONNXYOLO and ANSRTYOLO if detectionType is detection and modelType is TENSORRT or ONNX
-
-	if ((modelType == 4) || // TensorRT  
-		(modelType == 14)|| // TensorRT Yolov10	 
-		(modelType == 22)|| // TensorRT Pose
-		(modelType == 24))  // TensorRT Segmentation
-	{
-		if (engineType == ANSCENTER::EngineType::NVIDIA_GPU) modelType = 31; // RTYOLO
-		else modelType=30;// ONNXYOLO
-	}
-	else if ((modelType == 3) ||	// YoloV8/YoloV11 (Object Detection)
-		     (modelType == 17)||	// YOLO V12
-			 (modelType == 20) ||	// ONNX Classification	
-			 (modelType == 21) ||	// ONNX Pose
-			 (modelType == 23) ||	// ONNX Segmentation	
-			 (modelType == 25))	    // OBB Segmentation
-	{
-		modelType = 30; // ONNXYOLO
-	}
-	else {
-		// do nothing, use the modelType specified by user
+	// Route detection / pose / segmentation to the best available backend:
+	// prefer TensorRT on NVIDIA, otherwise the matching ONNX handler. OBB and
+	// classification are ONNX-only and pass through unchanged, as does any
+	// unlisted modelType. See CreateANSODHandleEx for the full rationale — three
+	// correctness bugs were fixed there; keep every copy of this switch in sync.
+	const bool onNvidia = (engineType == ANSCENTER::EngineType::NVIDIA_GPU);
+	switch (modelType) {
+		// ── Detection family: YOLOv8 / V11 / V12 / generic TRT / V10-RTOD ──
+		case 3:   // YOLOV8 / YOLOV11
+		case 4:   // generic TensorRT
+		case 14:  // YOLOv10RTOD (TRT end-to-end NMS)
+		case 17:  // YOLOV12
+			modelType = onNvidia ? 31 /* RTYOLO */ : 30 /* ONNXYOLO */;
+			break;
+		// ── Pose ─────────────────────────────────────────────────────────────
+		case 21:  // ONNXPOSE
+		case 22:  // RTPOSE
+			modelType = onNvidia ? 22 /* RTPOSE */ : 21 /* ONNXPOSE */;
+			break;
+		// ── Segmentation ─────────────────────────────────────────────────────
+		case 23:  // ONNXSEG
+		case 24:  // RTSEG
+			modelType = onNvidia ? 24 /* RTSEG */ : 23 /* ONNXSEG */;
+			break;
+		// ── OBB / Classification (ONNX-only today — leave as-is) ─────────────
+		case 20:  // ONNXCL
+		case 25:  // ONNXOBB
+			break;
+		default:
+			// Any other modelType is handled directly by the switch below.
+			break;
 	}
 
 	switch (detectionType) {
@@ -764,27 +774,53 @@ extern "C" ANSODENGINE_API int  CreateANSODHandleEx(ANSCENTER::ANSODBase** Handl
 	ANSCENTER::EngineType engineType = ANSCENTER::ANSLicenseHelper::CheckHardwareInformation();
 	if (autoDetectEngine==-1)engineType=ANSCENTER::EngineType::CPU;// We force to use CPU
 
-	//Force modelType to ANSONNXYOLO and ANSRTYOLO if detectionType is detection and modelType is TENSORRT or ONNX
-
-	if ((modelType == 4) || // TensorRT
-		(modelType == 14)|| // TensorRT Yolov10
-		(modelType == 22)|| // TensorRT Pose
-		(modelType == 24))  // TensorRT Segmentation
-	{
-		if (engineType == ANSCENTER::EngineType::NVIDIA_GPU) modelType = 31; // RTYOLO
-		else modelType=30;// ONNXYOLO
-	}
-	else if ((modelType == 3) ||	// YoloV8/YoloV11 (Object Detection)
-		     (modelType == 17)||	// YOLO V12
-			 (modelType == 20) ||	// ONNX Classification
-			 (modelType == 21) ||	// ONNX Pose
-			 (modelType == 23) ||	// ONNX Segmentation
-			 (modelType == 25))	    // OBB Segmentation
-	{
-		modelType = 30; // ONNXYOLO
-	}
-	else {
-		// do nothing, use the modelType specified by user
+	// Route detection / pose / segmentation to the best available backend:
+	// prefer TensorRT on NVIDIA, otherwise the matching ONNX handler. OBB,
+	// classification and unlisted modelType values pass through unchanged.
+	//
+	// Previous revisions of this block had three correctness bugs:
+	//   (1) modelType == 3 / 17 (YoloV8/V11/V12 detection) was hard-wired to
+	//       ONNXYOLO even on NVIDIA — bypassing the TensorRT path entirely and
+	//       duplicating VRAM when multiple handles loaded the same .onnx (ORT
+	//       has no EnginePoolManager dedupe).
+	//   (2) modelType == 20 / 21 / 23 / 25 (ONNX CLS / POSE / SEG / OBB) was
+	//       rewritten to 30 (ONNXYOLO = detection), making the dedicated
+	//       case 20 / 21 / 23 / 25 handlers unreachable dead code. A user
+	//       passing modelType=20 for classification ended up with a YOLO head.
+	//   (3) modelType == 22 / 24 (TRT pose / TRT seg) on a non-NVIDIA box fell
+	//       back to ONNXYOLO instead of the correct ONNXPOSE / ONNXSEG handler.
+	const bool onNvidia = (engineType == ANSCENTER::EngineType::NVIDIA_GPU);
+	switch (modelType) {
+		// ── Detection family: YOLOv8 / V11 / V12 / generic TRT / V10-RTOD ──
+		case 3:   // YOLOV8 / YOLOV11
+		case 4:   // generic TensorRT
+		case 14:  // YOLOv10RTOD (TRT end-to-end NMS)
+		case 17:  // YOLOV12
+			modelType = onNvidia ? 31 /* RTYOLO */ : 30 /* ONNXYOLO */;
+			break;
+		// ── Pose ─────────────────────────────────────────────────────────────
+		case 21:  // ONNXPOSE
+		case 22:  // RTPOSE
+			modelType = onNvidia ? 22 /* RTPOSE */ : 21 /* ONNXPOSE */;
+			break;
+		// ── Segmentation ─────────────────────────────────────────────────────
+		case 23:  // ONNXSEG
+		case 24:  // RTSEG
+			modelType = onNvidia ? 24 /* RTSEG */ : 23 /* ONNXSEG */;
+			break;
+		// ── Oriented Bounding Box (ONNX-only today) ──────────────────────────
+		case 25:  // ONNXOBB — no TRT variant; leave as-is
+			break;
+		// ── Classification (ONNX-only in this dispatcher) ────────────────────
+		case 20:  // ONNXCL — no TRT variant; leave as-is
+			break;
+		default:
+			// Any other modelType is handled directly by the switch below
+			// (TENSORFLOW, YOLOV4, YOLOV5, FACEDETECT, FACERECOGNIZE, ALPR,
+			// OCR, ANOMALIB, POSE, SAM, ODHUBMODEL, CUSTOMDETECTOR, CUSTOMPY,
+			// MOTIONDETECTOR, MOVIENET, ONNXSAM3, RTSAM3, ONNXYOLO=30,
+			// RTYOLO=31). Do nothing — keep user's value.
+			break;
 	}
 	// returnModelType will be set after the switch to reflect the actual
 	// model class that was instantiated (e.g. RTYOLO→ONNXYOLO on AMD).
@@ -1151,26 +1187,39 @@ extern "C" __declspec(dllexport) int LoadModelFromFolder(ANSCENTER::ANSODBase**
 		if (autoDetectEngine==-1)engineType=ANSCENTER::EngineType::CPU;// We force to use CPU
 
 
-		//Force modelType to ANSONNXYOLO and ANSRTYOLO if detectionType is detection and modelType is TENSORRT or ONNX
-		if ((modelType == 4) || // TensorRT  
-			(modelType == 14) || // TensorRT Yolov10	 
-			(modelType == 22) || // TensorRT Pose
-			(modelType == 24))  // TensorRT Segmentation
+		// Route detection / pose / segmentation to the best available backend:
+		// prefer TensorRT on NVIDIA, otherwise the matching ONNX handler. OBB and
+		// classification are ONNX-only and pass through unchanged, as does any
+		// unlisted modelType. See CreateANSODHandleEx for the full rationale — three
+		// correctness bugs were fixed there; keep every copy of this switch in sync.
 		{
-			if (engineType == ANSCENTER::EngineType::NVIDIA_GPU)modelType = 31; // RTYOLO
-			else modelType = 30;// ONNXYOLO
-		}
-		else if ((modelType == 3) ||	// YoloV8/YoloV11 (Object Detection)
-			(modelType == 17) ||	// YOLO V12
-			(modelType == 20) ||	// ONNX Classification	
-			(modelType == 21) ||	// ONNX Pose
-			(modelType == 23) ||	// ONNX Segmentation	
-			(modelType == 25))	    // OBB Segmentation
-		{
-			modelType = 30; // ONNXYOLO
-		}
-		else {
-			// do nothing, use the modelType specified by user
+			const bool onNvidia = (engineType == ANSCENTER::EngineType::NVIDIA_GPU);
+			switch (modelType) {
+				// ── Detection family: YOLOv8 / V11 / V12 / generic TRT / V10-RTOD ──
+				case 3:   // YOLOV8 / YOLOV11
+				case 4:   // generic TensorRT
+				case 14:  // YOLOv10RTOD (TRT end-to-end NMS)
+				case 17:  // YOLOV12
+					modelType = onNvidia ? 31 /* RTYOLO */ : 30 /* ONNXYOLO */;
+					break;
+				// ── Pose ─────────────────────────────────────────────────────────
+				case 21:  // ONNXPOSE
+				case 22:  // RTPOSE
+					modelType = onNvidia ? 22 /* RTPOSE */ : 21 /* ONNXPOSE */;
+					break;
+				// ── Segmentation ─────────────────────────────────────────────────
+				case 23:  // ONNXSEG
+				case 24:  // RTSEG
+					modelType = onNvidia ? 24 /* RTSEG */ : 23 /* ONNXSEG */;
+					break;
+				// ── OBB / Classification (ONNX-only today — leave as-is) ─────────
+				case 20:  // ONNXCL
+				case 25:  // ONNXOBB
+					break;
+				default:
+					// Any other modelType is handled directly by the switch below.
+					break;
+			}
 		}
 		// NOTE: We intentionally do NOT destroy any existing *Handle here.
 		// LabVIEW reuses DLL parameter buffer addresses, so *Handle may point
@@ -1461,26 +1510,39 @@ ANSODENGINE_API int OptimizeModelStr(const char* modelFilePath, const char* mode
 		ANSCENTER::EngineType engineType = ANSCENTER::ANSLicenseHelper::CheckHardwareInformation();
 
 
-		//Force modelType to ANSONNXYOLO and ANSRTYOLO if detectionType is detection and modelType is TENSORRT or ONNX
-		if ((modelType == 4) || // TensorRT
-			(modelType == 14) || // TensorRT Yolov10
-			(modelType == 22) || // TensorRT Pose
-			(modelType == 24))  // TensorRT Segmentation
+		// Route detection / pose / segmentation to the best available backend:
+		// prefer TensorRT on NVIDIA, otherwise the matching ONNX handler. OBB and
+		// classification are ONNX-only and pass through unchanged, as does any
+		// unlisted modelType. See CreateANSODHandleEx for the full rationale — three
+		// correctness bugs were fixed there; keep every copy of this switch in sync.
 		{
-			if (engineType == ANSCENTER::EngineType::NVIDIA_GPU)modelType = 31; // RTYOLO
-			else modelType = 30;// ONNXYOLO
-		}
-		else if ((modelType == 3) ||	// YoloV8/YoloV11 (Object Detection)
-			(modelType == 17) ||	// YOLO V12
-			(modelType == 20) ||	// ONNX Classification
-			(modelType == 21) ||	// ONNX Pose
-			(modelType == 23) ||	// ONNX Segmentation
-			(modelType == 25))	    // OBB Segmentation
-		{
-			modelType = 30; // ONNXYOLO
-		}
-		else {
-			// do nothing, use the modelType specified by user
+			const bool onNvidia = (engineType == ANSCENTER::EngineType::NVIDIA_GPU);
+			switch (modelType) {
+				// ── Detection family: YOLOv8 / V11 / V12 / generic TRT / V10-RTOD ──
+				case 3:   // YOLOV8 / YOLOV11
+				case 4:   // generic TensorRT
+				case 14:  // YOLOv10RTOD (TRT end-to-end NMS)
+				case 17:  // YOLOV12
+					modelType = onNvidia ? 31 /* RTYOLO */ : 30 /* ONNXYOLO */;
+					break;
+				// ── Pose ─────────────────────────────────────────────────────────
+				case 21:  // ONNXPOSE
+				case 22:  // RTPOSE
+					modelType = onNvidia ? 22 /* RTPOSE */ : 21 /* ONNXPOSE */;
+					break;
+				// ── Segmentation ─────────────────────────────────────────────────
+				case 23:  // ONNXSEG
+				case 24:  // RTSEG
+					modelType = onNvidia ? 24 /* RTSEG */ : 23 /* ONNXSEG */;
+					break;
+				// ── OBB / Classification (ONNX-only today — leave as-is) ─────────
+				case 20:  // ONNXCL
+				case 25:  // ONNXOBB
+					break;
+				default:
+					// Any other modelType is handled directly by the switch below.
+					break;
+			}
 		}
 
 		
diff --git a/modules/ANSODEngine/engine.h b/modules/ANSODEngine/engine.h
index 49a0b69..1492924 100644
--- a/modules/ANSODEngine/engine.h
+++ b/modules/ANSODEngine/engine.h
@@ -720,8 +720,24 @@ void Engine<T>::lockGpuClocks(int deviceIndex, int requestedMHz) {
     if (rc == nvml_types::SUCCESS) {
         m_clocksLocked  = true;
         m_nvmlDeviceIdx = static_cast<unsigned int>(deviceIndex);
+        // Always emit to DebugView so operators can confirm the lock took
+        // effect without needing to read engine-level verbose output.
+        ANS_DBG("TRT_Clock",
+            "GPU clocks LOCKED at %u MHz (device %d) — P-state will stay high, "
+            "no WDDM down-clock between inferences",
+            targetMHz, deviceIndex);
         if (m_verbose) std::cout << "Info: GPU clocks locked at " << targetMHz << " MHz (device " << deviceIndex << ")" << std::endl;
     } else {
+        // Surface the failure reason + remediation in DebugView. Most common
+        // failure is access-denied (requires Administrator) or the driver
+        // refusing the requested frequency. Users see this in the log and
+        // know to elevate, set NVCP 'Prefer maximum performance', or run
+        // `nvidia-smi -lgc <MHz>,<MHz>` before launching.
+        ANS_DBG("TRT_Clock",
+            "GPU clock lock FAILED (nvml rc=%s) — expect 2-3x inference latency from "
+            "WDDM down-clocking. Fix: run as Admin, OR set NVCP 'Prefer maximum "
+            "performance' for this app, OR: nvidia-smi -lgc %u,%u",
+            errName(rc), targetMHz, targetMHz);
         if (m_verbose) {
             std::cout << "Warning: nvmlDeviceSetGpuLockedClocks failed: " << errName(rc) << std::endl;
             std::cout << "  (Run as Administrator, or use: nvidia-smi -lgc " << targetMHz << "," << targetMHz << ")" << std::endl;