From 97d814936d01b1acd5c98e6addb2b9620740599f Mon Sep 17 00:00:00 2001
From: Tuan Nghia Nguyen <nghia.nguyen@anscenter.com>
Date: Tue, 21 Apr 2026 15:48:27 +1000
Subject: [PATCH] Enable log information. Disable NPU in U9

---
 .claude/settings.local.json                   | 13 +++-
 MediaClient/media/video_decoder.cpp           | 43 ++++++++++-
 MediaClient/media/video_player.cpp            | 75 ++++++++++++++++++-
 MediaClient/media/video_player.h              | 23 +++++-
 engines/ONNXEngine/ONNXEngine.cpp             |  7 +-
 engines/ONNXEngine/ONNXSAM3.cpp               | 13 ++--
 engines/ONNXEngine/OpenVINODeviceConfig.h     | 38 ++++++++++
 modules/ANSFR/ANSFR.cpp                       |  9 ++-
 modules/ANSLPR/ANSLPR_CPU.cpp                 |  7 +-
 modules/ANSMOT/ByteTrack/src/BYTETracker.cpp  | 19 +++++
 modules/ANSODEngine/ANSFaceDetectorEngine.cpp |  9 ++-
 modules/ANSODEngine/ANSODEngine.cpp           |  7 +-
 modules/ANSODEngine/ANSONNXCL.cpp             | 23 ++++--
 modules/ANSODEngine/ANSOPENVINOCL.cpp         |  7 +-
 modules/ANSODEngine/ANSOPENVINOOD.cpp         |  8 +-
 modules/ANSODEngine/ANSOVSEG.cpp              |  8 +-
 modules/ANSODEngine/ANSYOLO12OD.cpp           | 23 ++++--
 modules/ANSODEngine/ANSYOLOOD.cpp             | 23 ++++--
 18 files changed, 301 insertions(+), 54 deletions(-)
 create mode 100644 engines/ONNXEngine/OpenVINODeviceConfig.h
diff --git a/.claude/settings.local.json b/.claude/settings.local.json
index af377d6..51de336 100644
--- a/.claude/settings.local.json
+++ b/.claude/settings.local.json
@@ -13,7 +13,18 @@
       "Bash(powershell.exe -NoProfile -Command \"[System.Environment]::GetEnvironmentVariable\\('PATH','Machine'\\) -split ';' | Select-String -Pattern 'ANSCENTER|Shared'\")",
       "Bash(cmd.exe //c 'dir /AL \"C:\\\\Program Files\\\\ANSCENTER\\\\ANSVIS\\\\data\" 2>&1 | findstr /i \"junction symlink\"')",
       "Bash(cmd.exe //c 'dir /AL \"C:\\\\Program Files\\\\ANSCENTER\\\\ANSVIS\\\\data\"')",
-      "PowerShell(Get-ChildItem \"C:\\\\Program Files\\\\ANSCENTER\\\\ANSVIS\\\\data\" -Force | Where-Object { $_.LinkType } | Select-Object Name, LinkType, Target | Format-Table -AutoSize)"
+      "PowerShell(Get-ChildItem \"C:\\\\Program Files\\\\ANSCENTER\\\\ANSVIS\\\\data\" -Force | Where-Object { $_.LinkType } | Select-Object Name, LinkType, Target | Format-Table -AutoSize)",
+      "Bash(awk '{print \"start: \"$2\"s\"}')",
+      "Bash(awk '{print \"end: \"$2\"s\"}')",
+      "Bash(awk '{ *)",
+      "Bash(awk '{v[NR]=$1} END {asort\\(v\\); n=length\\(v\\); printf \"count=%d\\\\nmedian=%.1fms\\\\np90=%.1fms\\\\np95=%.1fms\\\\np99=%.1fms\\\\nmax=%.1fms\\\\n\", n, v[int\\(n*0.5\\)], v[int\\(n*0.9\\)], v[int\\(n*0.95\\)], v[int\\(n*0.99\\)], v[n]}')",
+      "Bash(awk '{v[NR]=$1} END {asort\\(v\\); n=length\\(v\\); printf \"slow_inf_count=%d \\(over %d total inferences = %.1f%%\\)\\\\nmedian=%.1fms max=%.1fms\\\\n\", n, 10456, 100.0*n/10456, v[int\\(n*0.5\\)], v[n]}')",
+      "Bash(awk '{v[NR]=$1} END {asort\\(v\\); n=length\\(v\\); if\\(n>0\\){printf \"slow_getImage_count=%d median=%.1fms max=%.1fms\\\\n\", n, v[int\\(n*0.5\\)], v[n]}}')",
+      "Bash(awk -F= '{print $2}')",
+      "Bash(git -C \"C:\\\\Projects\\\\CLionProjects\\\\ANSCORE\" status --short engines/ONNXEngine/ONNXEngine.cpp)",
+      "Bash(git -C \"C:\\\\Projects\\\\CLionProjects\\\\ANSCORE\" diff engines/ONNXEngine/ONNXEngine.cpp)",
+      "Bash(git -C \"C:\\\\Projects\\\\CLionProjects\\\\ANSCORE\" status --short)",
+      "Bash(grep -E \"\\\\.\\(cpp|h\\)$\")"
     ]
   }
 }
diff --git a/MediaClient/media/video_decoder.cpp b/MediaClient/media/video_decoder.cpp
index 963d311..33efd46 100644
--- a/MediaClient/media/video_decoder.cpp
+++ b/MediaClient/media/video_decoder.cpp
@@ -4,6 +4,7 @@
 #include "lock.h"
 #include "media_codec.h"
 #include "media_parse.h"
+#include <atomic>
 #include <memory>
 
 #include "ANSLicense.h"   // ANS_DBG macro (gated by ANSCORE_DEBUGVIEW)
@@ -14,6 +15,16 @@ extern "C" {
 #include "libavutil/mem.h"
 }
 
+// ---------------------------------------------------------------------------
+//  Leak diagnostics — exported counters for media allocation balance.
+//  Incremented in allocation sites, decremented in free paths. If (alloc -
+//  free) climbs monotonically over time, the allocator is leaking.
+//  Read by the MEDIA_Leak heartbeat in video_player.cpp (every 60 s).
+// ---------------------------------------------------------------------------
+std::atomic<int64_t> g_contiguousAllocs{0};
+std::atomic<int64_t> g_contiguousFrees{0};
+std::atomic<int64_t> g_contiguousBytesInFlight{0};   // sum(total) of unfreed buffers
+
 // ---------------------------------------------------------------------------
 //  Contiguous YUV420P allocator — trims per-call malloc overhead and enables
 //  the zero-copy fast path in avframeYUV420PToCvMat for resolutions where the
@@ -23,7 +34,20 @@ extern "C" {
 //  single-block layout still improves cache behaviour for the bulk memcpy.)
 // ---------------------------------------------------------------------------
 namespace {
-    void anscore_contiguous_free(void* /*opaque*/, uint8_t* data) {
+    // Opaque payload stored in AVBufferRef so the free callback can account
+    // for the exact byte count being returned (no global lookup needed).
+    struct ContiguousOpaque {
+        size_t bytes;
+    };
+
+    void anscore_contiguous_free(void* opaque, uint8_t* data) {
+        if (opaque) {
+            auto* o = static_cast<ContiguousOpaque*>(opaque);
+            g_contiguousBytesInFlight.fetch_sub(static_cast<int64_t>(o->bytes),
+                                                std::memory_order_relaxed);
+            delete o;
+        }
+        g_contiguousFrees.fetch_add(1, std::memory_order_relaxed);
         av_free(data);
     }
 }
@@ -77,13 +101,24 @@ int CVideoDecoder::contiguousGetBuffer2(AVCodecContext* s, AVFrame* frame, int f
         return AVERROR(ENOMEM);
     }
 
-    AVBufferRef* ref = av_buffer_create(buf, (int)total,
-                                        anscore_contiguous_free, nullptr, 0);
-    if (!ref) {
+    auto* opaque = new (std::nothrow) ContiguousOpaque{total};
+    if (!opaque) {
         av_free(buf);
         return AVERROR(ENOMEM);
     }
 
+    AVBufferRef* ref = av_buffer_create(buf, (int)total,
+                                        anscore_contiguous_free, opaque, 0);
+    if (!ref) {
+        delete opaque;
+        av_free(buf);
+        return AVERROR(ENOMEM);
+    }
+
+    g_contiguousAllocs.fetch_add(1, std::memory_order_relaxed);
+    g_contiguousBytesInFlight.fetch_add(static_cast<int64_t>(total),
+                                        std::memory_order_relaxed);
+
     for (int i = 0; i < AV_NUM_DATA_POINTERS; ++i) {
         frame->buf[i]      = nullptr;
         frame->data[i]     = nullptr;
diff --git a/MediaClient/media/video_player.cpp b/MediaClient/media/video_player.cpp
index 7b1ef57..0b2ccce 100644
--- a/MediaClient/media/video_player.cpp
+++ b/MediaClient/media/video_player.cpp
@@ -37,6 +37,22 @@ extern "C"
 
 #include "ANSLicense.h"   // ANS_DBG macro (gated by ANSCORE_DEBUGVIEW)
 
+// ---------------------------------------------------------------------------
+//  Leak diagnostics — definitions for counters declared extern in header.
+//  Also references counters defined in video_decoder.cpp so the heartbeat
+//  below can report media allocator balance in a single line.
+// ---------------------------------------------------------------------------
+std::atomic<int64_t> g_queueClones{0};
+std::atomic<int64_t> g_queueFrees{0};
+std::atomic<int64_t> g_nv12Clones{0};
+std::atomic<int64_t> g_nv12Frees{0};
+std::atomic<int64_t> g_cudaHWClones{0};
+std::atomic<int64_t> g_cudaHWFrees{0};
+
+extern std::atomic<int64_t> g_contiguousAllocs;
+extern std::atomic<int64_t> g_contiguousFrees;
+extern std::atomic<int64_t> g_contiguousBytesInFlight;
+
 // libyuv: SIMD-accelerated YUV↔RGB conversion with native strided-plane input.
 // Replaces the memcpy-into-staging + cv::cvtColor(COLOR_YUV2BGR_I420) chain
 // in avframeYUV420PToCvMat with a direct I420→RGB24 (== OpenCV BGR memory
@@ -1629,10 +1645,12 @@ void CVideoPlayer::close()
 	closeAudio();
 	if (m_currentNV12Frame) {
 		av_frame_free(&m_currentNV12Frame);
+		g_nv12Frees.fetch_add(1, std::memory_order_relaxed);
 		m_currentNV12Frame = nullptr;
 	}
 	if (m_currentCudaHWFrame) {
 		av_frame_free(&m_currentCudaHWFrame);
+		g_cudaHWFrees.fetch_add(1, std::memory_order_relaxed);
 		m_currentCudaHWFrame = nullptr;
 	}
 	if (m_pSnapFrame)
@@ -2329,8 +2347,12 @@ void CVideoPlayer::onVideoFrame(AVFrame* frame)
 		// and we can safely clone the CUDA frame without deadlock risk.
 		// cloneCudaHWFrame_unlocked() is safe because decoder._mutex is already held.
 		if (m_pVideoDecoder && m_pVideoDecoder->isCudaHWAccel()) {
-			if (m_currentCudaHWFrame) av_frame_free(&m_currentCudaHWFrame);
+			if (m_currentCudaHWFrame) {
+				av_frame_free(&m_currentCudaHWFrame);
+				g_cudaHWFrees.fetch_add(1, std::memory_order_relaxed);
+			}
 			m_currentCudaHWFrame = m_pVideoDecoder->cloneCudaHWFrame_unlocked();
+			if (m_currentCudaHWFrame) g_cudaHWClones.fetch_add(1, std::memory_order_relaxed);
 		}
 
 		// Track how many clean frames have arrived since keyframe
@@ -2455,8 +2477,12 @@ cv::Mat CVideoPlayer::getImage(int& width, int& height, int64_t& pts) {
 				(frameToProcess->format == AV_PIX_FMT_NV12 ||
 				 frameToProcess->format == AV_PIX_FMT_YUV420P ||
 				 frameToProcess->format == AV_PIX_FMT_YUVJ420P)) {
-				if (m_currentNV12Frame) av_frame_free(&m_currentNV12Frame);
+				if (m_currentNV12Frame) {
+					av_frame_free(&m_currentNV12Frame);
+					g_nv12Frees.fetch_add(1, std::memory_order_relaxed);
+				}
 				m_currentNV12Frame = av_frame_clone(frameToProcess);
+				if (m_currentNV12Frame) g_nv12Clones.fetch_add(1, std::memory_order_relaxed);
 			}
 
 			width = m_currentImage.cols;
@@ -2466,6 +2492,49 @@ cv::Mat CVideoPlayer::getImage(int& width, int& height, int64_t& pts) {
 		}
 
 		av_frame_free(&frameToProcess);
+		g_queueFrees.fetch_add(1, std::memory_order_relaxed);
+
+		// Leak diagnostics — one heartbeat every 60 s across the whole process.
+		// Each counter pair (allocs, frees) should stay balanced. A monotonic
+		// rise in (allocs - frees) identifies the leaking pool. Bytes field
+		// covers the ~12 MB/frame contiguous YUV420P buffers specifically —
+		// watch for steady climb while the counters look balanced (refcount
+		// leak in a held clone would show that shape).
+		{
+			using clk = std::chrono::steady_clock;
+			static std::atomic<long long> s_nextLeakLogTick{0};
+			const long long tick = clk::now().time_since_epoch().count();
+			long long expected = s_nextLeakLogTick.load(std::memory_order_relaxed);
+			if (tick >= expected) {
+				const long long deadline = tick +
+					std::chrono::duration_cast<clk::duration>(
+						std::chrono::seconds(60)).count();
+				// Claim the next window — first writer wins so only one thread logs.
+				if (s_nextLeakLogTick.compare_exchange_strong(
+						expected, deadline, std::memory_order_relaxed)) {
+					const int64_t qA = g_queueClones.load(std::memory_order_relaxed);
+					const int64_t qF = g_queueFrees.load(std::memory_order_relaxed);
+					const int64_t nvA = g_nv12Clones.load(std::memory_order_relaxed);
+					const int64_t nvF = g_nv12Frees.load(std::memory_order_relaxed);
+					const int64_t cuA = g_cudaHWClones.load(std::memory_order_relaxed);
+					const int64_t cuF = g_cudaHWFrees.load(std::memory_order_relaxed);
+					const int64_t cgA = g_contiguousAllocs.load(std::memory_order_relaxed);
+					const int64_t cgF = g_contiguousFrees.load(std::memory_order_relaxed);
+					const int64_t cgB = g_contiguousBytesInFlight.load(std::memory_order_relaxed);
+					ANS_DBG("MEDIA_Leak",
+						"queue(C=%lld F=%lld net=%lld depth=%zu) "
+						"nv12(C=%lld F=%lld net=%lld) "
+						"cudaHW(C=%lld F=%lld net=%lld) "
+						"contig(A=%lld F=%lld net=%lld bytesMB=%.1f)",
+						(long long)qA, (long long)qF, (long long)(qA - qF),
+						g_frameQueue.size(),
+						(long long)nvA, (long long)nvF, (long long)(nvA - nvF),
+						(long long)cuA, (long long)cuF, (long long)(cuA - cuF),
+						(long long)cgA, (long long)cgF, (long long)(cgA - cgF),
+						(double)cgB / (1024.0 * 1024.0));
+				}
+			}
+		}
 
 		// Emit timing breakdown. Throttled so DebugView / stderr stay readable.
 		{
@@ -2540,11 +2609,13 @@ std::string CVideoPlayer::getJpegImage(int& width, int& height, int64_t& pts) {
 		catch (const std::exception& e) {
 			std::cerr << "Exception while converting AVFrame to JPEG string: " << e.what() << std::endl;
 			av_frame_free(&frameToProcess);
+			g_queueFrees.fetch_add(1, std::memory_order_relaxed);
 			return m_lastJpegImage;
 		}
 		const auto t3 = clk::now();
 
 		av_frame_free(&frameToProcess);
+		g_queueFrees.fetch_add(1, std::memory_order_relaxed);
 
 		if (m_pts < INT64_MAX) {
 			m_pts++;
diff --git a/MediaClient/media/video_player.h b/MediaClient/media/video_player.h
index 6fc430c..7ad3b8d 100644
--- a/MediaClient/media/video_player.h
+++ b/MediaClient/media/video_player.h
@@ -15,8 +15,18 @@
 #include <opencv2/highgui.hpp>
 #include <opencv2/opencv.hpp>
 #include <turbojpeg.h>
+#include <atomic>
 #include <chrono>
 
+// Leak diagnostics — cumulative clone/free counters (MEDIA_Leak heartbeat logs their net).
+// Defined in video_player.cpp; also incremented from FrameQueue here.
+extern std::atomic<int64_t> g_queueClones;         // av_frame_clone from FrameQueue
+extern std::atomic<int64_t> g_queueFrees;          // av_frame_free from FrameQueue
+extern std::atomic<int64_t> g_nv12Clones;          // m_currentNV12Frame = av_frame_clone
+extern std::atomic<int64_t> g_nv12Frees;           // av_frame_free(&m_currentNV12Frame)
+extern std::atomic<int64_t> g_cudaHWClones;        // m_currentCudaHWFrame = clone
+extern std::atomic<int64_t> g_cudaHWFrees;         // av_frame_free(&m_currentCudaHWFrame)
+
 typedef struct
 {
     uint32          SyncTimestamp;
@@ -46,6 +56,7 @@ public:
             std::cerr << "Failed to clone AVFrame!" << std::endl;
             return;
         }
+        g_queueClones.fetch_add(1, std::memory_order_relaxed);
 
         frameQueue.push(frameCopy);
         m_frameSeq++;  // New frame arrived
@@ -55,6 +66,7 @@ public:
             AVFrame* oldFrame = frameQueue.front();
             frameQueue.pop();
             av_frame_free(&oldFrame);
+            g_queueFrees.fetch_add(1, std::memory_order_relaxed);
         }
     }
 
@@ -73,7 +85,15 @@ public:
         }
 
         // Clone the latest frame before returning it
-        return av_frame_clone(frameQueue.back());
+        AVFrame* clone = av_frame_clone(frameQueue.back());
+        if (clone) g_queueClones.fetch_add(1, std::memory_order_relaxed);
+        return clone;
+    }
+
+    // Current depth — snapshot used by the leak heartbeat.
+    size_t size() {
+        std::lock_guard<std::mutex> lock(queueMutex);
+        return frameQueue.size();
     }
 
     // Retrieve and remove the oldest frame from the queue
@@ -102,6 +122,7 @@ public:
             AVFrame* frame = frameQueue.front();
             frameQueue.pop();
             av_frame_free(&frame);
+            g_queueFrees.fetch_add(1, std::memory_order_relaxed);
         }
         m_frameSeq = 0;
     }
diff --git a/engines/ONNXEngine/ONNXEngine.cpp b/engines/ONNXEngine/ONNXEngine.cpp
index 7a2881b..0dfd32e 100644
--- a/engines/ONNXEngine/ONNXEngine.cpp
+++ b/engines/ONNXEngine/ONNXEngine.cpp
@@ -1,8 +1,10 @@
 ﻿#include "ONNXEngine.h"
 #include "EPLoader.h"
+#include "OpenVINODeviceConfig.h"
 #include "Utility.h"
 
 #include <algorithm>
+#include <cctype>
 #include <limits>
 #include <filesystem>
 #include <fstream>
@@ -318,8 +320,9 @@ namespace ANSCENTER {
 
         std::vector<std::unordered_map<std::string, std::string>> try_configs;
 
-        // Only try NPU if it hasn't been probed yet or was previously available
-        if (!s_npuProbed || s_npuAvailable) {
+        // NPU is disabled by default — see OpenVINODeviceConfig.h. Opt in via
+        // OPENVINO_ENABLE_NPU=1. Even when enabled, skip if a prior probe failed.
+        if (IsOpenVINONpuEnabled() && (!s_npuProbed || s_npuAvailable)) {
             try_configs.push_back(makeConfig("AUTO:NPU,GPU"));
         }
         try_configs.push_back(makeConfig("GPU.0"));
diff --git a/engines/ONNXEngine/ONNXSAM3.cpp b/engines/ONNXEngine/ONNXSAM3.cpp
index 74d537c..8713a2e 100644
--- a/engines/ONNXEngine/ONNXSAM3.cpp
+++ b/engines/ONNXEngine/ONNXSAM3.cpp
@@ -1,5 +1,6 @@
 #include "ONNXSAM3.h"
 #include "ONNXEngine.h"   // OrtCompatiableGetInputName/OutputName helpers
+#include "OpenVINODeviceConfig.h"
 
 #include <iostream>
 #include <fstream>
@@ -73,11 +74,13 @@ namespace ANSCENTER
 
     bool ONNXSAM3::TryAppendOpenVINO(Ort::SessionOptions& session_options)
     {
-        std::vector<std::unordered_map<std::string, std::string>> configs = {
-            {{"device_type","AUTO:NPU,GPU"},{"precision","FP16"},{"num_of_threads","4"},{"num_streams","4"}},
-            {{"device_type","GPU.0"},       {"precision","FP16"},{"num_of_threads","4"},{"num_streams","4"}},
-            {{"device_type","AUTO:GPU,CPU"},{"precision","FP16"},{"num_of_threads","4"},{"num_streams","4"}}
-        };
+        // NPU gated by OPENVINO_ENABLE_NPU — see OpenVINODeviceConfig.h
+        std::vector<std::unordered_map<std::string, std::string>> configs;
+        if (IsOpenVINONpuEnabled()) {
+            configs.push_back({{"device_type","AUTO:NPU,GPU"},{"precision","FP16"},{"num_of_threads","4"},{"num_streams","4"}});
+        }
+        configs.push_back({{"device_type","GPU.0"},       {"precision","FP16"},{"num_of_threads","4"},{"num_streams","4"}});
+        configs.push_back({{"device_type","AUTO:GPU,CPU"},{"precision","FP16"},{"num_of_threads","4"},{"num_streams","4"}});
         for (const auto& config : configs) {
             try {
                 session_options.AppendExecutionProvider_OpenVINO_V2(config);
diff --git a/engines/ONNXEngine/OpenVINODeviceConfig.h b/engines/ONNXEngine/OpenVINODeviceConfig.h
new file mode 100644
index 0000000..4e030a8
--- /dev/null
+++ b/engines/ONNXEngine/OpenVINODeviceConfig.h
@@ -0,0 +1,38 @@
+#pragma once
+
+// Shared runtime switch for enabling the Intel NPU in OpenVINO code paths.
+//
+// NPU is DISABLED BY DEFAULT because the NPU plugin on some Intel platforms
+// (observed: Core Ultra 9 285K / Arrow Lake) crashes inside
+// ov::Core::compile_model or Ort::Session construction when compiling
+// multiple ONNX models in quick succession. That failure mode cannot be
+// caught by the surrounding try/catch (it fires on a plugin worker thread)
+// and takes down the host process.
+//
+// To opt into NPU (e.g. on a machine with a known-good NPU driver), set the
+// environment variable OPENVINO_ENABLE_NPU to 1 / true / yes / on before
+// launching the host process.
+//
+// Every OpenVINO device-selection site in this codebase consults this helper
+// rather than probing NPU unconditionally.
+
+#include <algorithm>
+#include <cctype>
+#include <cstdlib>
+#include <string>
+
+namespace ANSCENTER {
+    // True when OPENVINO_ENABLE_NPU is 1/true/yes/on; case-insensitive, and
+    // surrounding whitespace is ignored. Read once, then cached for the process.
+    inline bool IsOpenVINONpuEnabled() {
+        static const bool enabled = [] {
+            const char* v = std::getenv("OPENVINO_ENABLE_NPU");
+            std::string s(v ? v : "");
+            s.erase(0, s.find_first_not_of(" \t\r\n"));  // npos => erase everything
+            s.erase(s.find_last_not_of(" \t\r\n") + 1);   // npos + 1 wraps to 0
+            for (char& c : s) c = static_cast<char>(std::tolower(static_cast<unsigned char>(c)));
+            return s == "1" || s == "true" || s == "yes" || s == "on";
+        }();
+        return enabled;
+    }
+}
diff --git a/modules/ANSFR/ANSFR.cpp b/modules/ANSFR/ANSFR.cpp
index cc53785..e603ada 100644
--- a/modules/ANSFR/ANSFR.cpp
+++ b/modules/ANSFR/ANSFR.cpp
@@ -1,4 +1,5 @@
 ﻿#include "ANSFR.h"
+#include "OpenVINODeviceConfig.h"
 #include <opencv2/imgcodecs.hpp>
 #include "ANSOVFaceDetector.h"
 #include "SCRFDFaceDetector.h"
@@ -2695,8 +2696,12 @@ namespace ANSCENTER {
         for (const auto& d : available_devices) {
             ANS_DBG("ANSFR", "  OpenVINO device: %s", d.c_str());
         }
-        // Prioritize devices: NPU > GPU > CPU
-        std::vector<std::string> priority_devices = { "NPU","GPU","CPU" };
+        // Prioritize devices: NPU > GPU > CPU. NPU gated behind runtime switch
+        // (OPENVINO_ENABLE_NPU=1) — see OpenVINODeviceConfig.h.
+        std::vector<std::string> priority_devices;
+        if (IsOpenVINONpuEnabled()) priority_devices.push_back("NPU");
+        priority_devices.push_back("GPU");
+        priority_devices.push_back("CPU");
         for (const auto& device : priority_devices) {
             if (std::find(available_devices.begin(), available_devices.end(), device) != available_devices.end()) {
                 ANS_DBG("ANSFR", "GetOpenVINODevice: selected %s", device.c_str());
diff --git a/modules/ANSLPR/ANSLPR_CPU.cpp b/modules/ANSLPR/ANSLPR_CPU.cpp
index 92135df..e7faa14 100644
--- a/modules/ANSLPR/ANSLPR_CPU.cpp
+++ b/modules/ANSLPR/ANSLPR_CPU.cpp
@@ -1,4 +1,5 @@
 #include "ANSLPR_CPU.h"
+#include "OpenVINODeviceConfig.h"
 #include "ANSYOLOV10OVOD.h"
 #include "ANSOPENVINOOD.h"
 #include "ANSTENSORRTOD.h"
@@ -119,8 +120,10 @@ namespace ANSCENTER {
         std::vector<std::string> available_devices = _core.get_available_devices();
         bool device_found = false;
         std::string deviceName = "CPU";
-        // Search for NPU
-        auto it = std::find(available_devices.begin(), available_devices.end(), "NPU");
+        // Search for NPU (gated by OPENVINO_ENABLE_NPU — see OpenVINODeviceConfig.h)
+        auto it = IsOpenVINONpuEnabled()
+            ? std::find(available_devices.begin(), available_devices.end(), "NPU")
+            : available_devices.end();
         if (it != available_devices.end()) {
             _core.set_property("NPU", ov::hint::performance_mode(ov::hint::PerformanceMode::LATENCY));
             _core.set_property("GPU", ov::hint::performance_mode(ov::hint::PerformanceMode::LATENCY));
diff --git a/modules/ANSMOT/ByteTrack/src/BYTETracker.cpp b/modules/ANSMOT/ByteTrack/src/BYTETracker.cpp
index 771eb96..792cf65 100644
--- a/modules/ANSMOT/ByteTrack/src/BYTETracker.cpp
+++ b/modules/ANSMOT/ByteTrack/src/BYTETracker.cpp
@@ -1,4 +1,5 @@
 #include "BYTETracker.h"
+#include "ANSLicense.h"   // ANS_DBG for tracker-state-size diagnostic
 #include <algorithm>
 #include <cstddef>
 #include <limits>
@@ -322,6 +323,24 @@ std::vector<ByteTrack::BYTETracker::STrackPtr> ByteTrack::BYTETracker::update(co
     tracked_stracks_ = tracked_stracks_out;
     lost_stracks_ = lost_stracks_out;
 
+    // Diagnostic: at most one state-size report every 60 s per calling thread (the throttle is
+    // thread_local, not per instance). removed_stracks_ is append-only here — watch it grow.
+    {
+        static thread_local std::chrono::steady_clock::time_point s_nextLog{};
+        auto now = std::chrono::steady_clock::now();
+        if (now >= s_nextLog) {
+            s_nextLog = now + std::chrono::seconds(60);
+            ANS_DBG("ANSMOT",
+                "BYTETracker state this=%p frame=%zu nextId=%zu tracked=%zu lost=%zu removed=%zu",
+                (void*)this,
+                frame_id_,
+                track_id_count_,
+                tracked_stracks_.size(),
+                lost_stracks_.size(),
+                removed_stracks_.size());
+        }
+    }
+
     std::vector<STrackPtr> output_stracks;
     for (const auto &track : tracked_stracks_)
     {
diff --git a/modules/ANSODEngine/ANSFaceDetectorEngine.cpp b/modules/ANSODEngine/ANSFaceDetectorEngine.cpp
index 5ab32ad..ca5d320 100644
--- a/modules/ANSODEngine/ANSFaceDetectorEngine.cpp
+++ b/modules/ANSODEngine/ANSFaceDetectorEngine.cpp
@@ -1,5 +1,6 @@
 ﻿#pragma once
 #include "ANSODEngine.h"
+#include "OpenVINODeviceConfig.h"
 #include "ANSYOLOOD.h"
 #include "ANSTENSORRTOD.h"
 #include "ANSTENSORRTCL.h"
@@ -333,8 +334,10 @@ namespace ANSCENTER
 			std::vector<std::string> available_devices = core.get_available_devices();
 			bool device_found = false;
 			std::string deviceName = "CPU";
-			// Search for NPU
-			auto it = std::find(available_devices.begin(), available_devices.end(), "NPU");
+			// Search for NPU (gated by OPENVINO_ENABLE_NPU — see OpenVINODeviceConfig.h)
+			auto it = IsOpenVINONpuEnabled()
+				? std::find(available_devices.begin(), available_devices.end(), "NPU")
+				: available_devices.end();
 			if (it != available_devices.end()) {
 				core.set_property("NPU", ov::hint::performance_mode(ov::hint::PerformanceMode::LATENCY));
 				core.set_property("GPU", ov::hint::performance_mode(ov::hint::PerformanceMode::LATENCY));
@@ -1414,7 +1417,7 @@ namespace ANSCENTER
 					};
 
 					std::vector<std::unordered_map<std::string, std::string>> try_configs;
-					if (!s_npuProbed || s_npuAvailable) {
+					if (IsOpenVINONpuEnabled() && (!s_npuProbed || s_npuAvailable)) {
 						try_configs.push_back(makeConfig("AUTO:NPU,GPU"));
 					}
 					try_configs.push_back(makeConfig("GPU.0"));
diff --git a/modules/ANSODEngine/ANSODEngine.cpp b/modules/ANSODEngine/ANSODEngine.cpp
index 173e61d..f1c480f 100644
--- a/modules/ANSODEngine/ANSODEngine.cpp
+++ b/modules/ANSODEngine/ANSODEngine.cpp
@@ -4,6 +4,7 @@
 #include <json.hpp>
 #include "ANSODEngine.h"
 #include "ANSLicense.h"   // ANS_DBG macro
+#include "OpenVINODeviceConfig.h"
 #include "ANSYOLOOD.h"
 #include "ANSTENSORRTOD.h"
 #include "ANSTENSORRTCL.h"
@@ -354,8 +355,10 @@ namespace ANSCENTER
 			std::vector<std::string> available_devices = core.get_available_devices();
 			bool device_found = false;
 			std::string deviceName = "CPU";
-			// Search for NPU
-			auto it = std::find(available_devices.begin(), available_devices.end(), "NPU");
+			// Search for NPU (gated by OPENVINO_ENABLE_NPU — see OpenVINODeviceConfig.h)
+			auto it = IsOpenVINONpuEnabled()
+				? std::find(available_devices.begin(), available_devices.end(), "NPU")
+				: available_devices.end();
 			if (it != available_devices.end()) {
 				core.set_property("NPU", ov::hint::performance_mode(ov::hint::PerformanceMode::LATENCY));
 				core.set_property("GPU", ov::hint::performance_mode(ov::hint::PerformanceMode::LATENCY));
diff --git a/modules/ANSODEngine/ANSONNXCL.cpp b/modules/ANSODEngine/ANSONNXCL.cpp
index 1fa72cf..7276dff 100644
--- a/modules/ANSODEngine/ANSONNXCL.cpp
+++ b/modules/ANSODEngine/ANSONNXCL.cpp
@@ -1,5 +1,6 @@
 ﻿#include"ANSONNXCL.h"
 #include "EPLoader.h"
+#include "OpenVINODeviceConfig.h"
 namespace ANSCENTER
 {
 
@@ -143,20 +144,26 @@ namespace ANSCENTER
                     const std::string numberOfThreads = "1";
                     const std::string numberOfStreams = "1";
 
-                    std::vector<std::unordered_map<std::string, std::string>> try_configs = {
-                        { {"device_type","AUTO:NPU,GPU"}, {"precision",precision},
-                          {"num_of_threads",numberOfThreads}, {"num_streams",numberOfStreams},
-                          {"enable_opencl_throttling","False"}, {"enable_qdq_optimizer","False"} },
+                    std::vector<std::unordered_map<std::string, std::string>> try_configs;
+                    // NPU gated by OPENVINO_ENABLE_NPU — see OpenVINODeviceConfig.h
+                    if (IsOpenVINONpuEnabled()) {
+                        try_configs.push_back(
+                            { {"device_type","AUTO:NPU,GPU"}, {"precision",precision},
+                              {"num_of_threads",numberOfThreads}, {"num_streams",numberOfStreams},
+                              {"enable_opencl_throttling","False"}, {"enable_qdq_optimizer","False"} });
+                    }
+                    try_configs.push_back(
                         { {"device_type","GPU.0"}, {"precision",precision},
                           {"num_of_threads",numberOfThreads}, {"num_streams",numberOfStreams},
-                          {"enable_opencl_throttling","False"}, {"enable_qdq_optimizer","False"} },
+                          {"enable_opencl_throttling","False"}, {"enable_qdq_optimizer","False"} });
+                    try_configs.push_back(
                         { {"device_type","GPU.1"}, {"precision",precision},
                           {"num_of_threads",numberOfThreads}, {"num_streams",numberOfStreams},
-                          {"enable_opencl_throttling","False"}, {"enable_qdq_optimizer","False"} },
+                          {"enable_opencl_throttling","False"}, {"enable_qdq_optimizer","False"} });
+                    try_configs.push_back(
                         { {"device_type","AUTO:GPU,CPU"}, {"precision",precision},
                           {"num_of_threads",numberOfThreads}, {"num_streams",numberOfStreams},
-                          {"enable_opencl_throttling","False"}, {"enable_qdq_optimizer","False"} }
-                    };
+                          {"enable_opencl_throttling","False"}, {"enable_qdq_optimizer","False"} });
 
                     for (const auto& config : try_configs) {
                         try {
diff --git a/modules/ANSODEngine/ANSOPENVINOCL.cpp b/modules/ANSODEngine/ANSOPENVINOCL.cpp
index 3fb3214..a491f9f 100644
--- a/modules/ANSODEngine/ANSOPENVINOCL.cpp
+++ b/modules/ANSODEngine/ANSOPENVINOCL.cpp
@@ -1,5 +1,6 @@
 #include "ANSOPENVINOCL.h"
 #include "Utility.h"
+#include "OpenVINODeviceConfig.h"
 namespace ANSCENTER
 {
 	bool OPENVINOCL::OptimizeModel(bool fp16, std::string& optimizedModelFolder) {
@@ -369,8 +370,10 @@ namespace ANSCENTER
 			std::vector<std::string> available_devices = core.get_available_devices();
 			bool device_found = false;
 
-			// Search for NPU
-			auto it = std::find(available_devices.begin(), available_devices.end(), "NPU");
+			// Search for NPU (gated by OPENVINO_ENABLE_NPU — see OpenVINODeviceConfig.h)
+			auto it = IsOpenVINONpuEnabled()
+				? std::find(available_devices.begin(), available_devices.end(), "NPU")
+				: available_devices.end();
 			if (it != available_devices.end()) {
 				core.set_property("NPU", ov::hint::performance_mode(ov::hint::PerformanceMode::LATENCY));
 				core.set_property("GPU", ov::hint::performance_mode(ov::hint::PerformanceMode::LATENCY));
diff --git a/modules/ANSODEngine/ANSOPENVINOOD.cpp b/modules/ANSODEngine/ANSOPENVINOOD.cpp
index 4da131a..c2b4000 100644
--- a/modules/ANSODEngine/ANSOPENVINOOD.cpp
+++ b/modules/ANSODEngine/ANSOPENVINOOD.cpp
@@ -1,5 +1,6 @@
 #include "ANSOPENVINOOD.h"
 #include "Utility.h"
+#include "OpenVINODeviceConfig.h"
 namespace ANSCENTER
 {
 	bool OPENVINOOD::OptimizeModel(bool fp16, std::string& optimizedModelFolder) {
@@ -437,8 +438,11 @@ namespace ANSCENTER
 			ov::Core core;
 			// Step 2: Get Available Devices and Log
 			std::vector<std::string> available_devices = core.get_available_devices();
-			// Define device priority: NPU > GPU > CPU
-			std::vector<std::string> priority_devices = { "NPU", "GPU" };
+			// Define device priority: NPU > GPU > CPU. NPU gated by
+			// OPENVINO_ENABLE_NPU — see OpenVINODeviceConfig.h.
+			std::vector<std::string> priority_devices;
+			if (IsOpenVINONpuEnabled()) priority_devices.push_back("NPU");
+			priority_devices.push_back("GPU");
 			bool device_found = false;
 
 			// Iterate over prioritized devices
diff --git a/modules/ANSODEngine/ANSOVSEG.cpp b/modules/ANSODEngine/ANSOVSEG.cpp
index 3b796e1..969e473 100644
--- a/modules/ANSODEngine/ANSOVSEG.cpp
+++ b/modules/ANSODEngine/ANSOVSEG.cpp
@@ -1,4 +1,5 @@
 #include "ANSOVSEG.h"
+#include "OpenVINODeviceConfig.h"
 namespace ANSCENTER {
 	bool ANSOVSEG::OptimizeModel(bool fp16, std::string& optimizedModelFolder) {
 		std::lock_guard<std::recursive_mutex> lock(_mutex);
@@ -493,8 +494,11 @@ namespace ANSCENTER {
 			ov::Core core;
 			// Step 2: Get Available Devices and Log
 			std::vector<std::string> available_devices = core.get_available_devices();
-			// Define device priority: NPU > GPU > CPU
-			std::vector<std::string> priority_devices = { "NPU", "GPU" };
+			// Preferred devices in priority order: NPU, then GPU (CPU is not listed here).
+			// NPU is gated by OPENVINO_ENABLE_NPU via IsOpenVINONpuEnabled() - see OpenVINODeviceConfig.h.
+			std::vector<std::string> priority_devices;
+			if (IsOpenVINONpuEnabled()) priority_devices.push_back("NPU");
+			priority_devices.push_back("GPU");
 			bool device_found = false;
 
 			// Iterate over prioritized devices
diff --git a/modules/ANSODEngine/ANSYOLO12OD.cpp b/modules/ANSODEngine/ANSYOLO12OD.cpp
index ab9e03b..e9e7829 100644
--- a/modules/ANSODEngine/ANSYOLO12OD.cpp
+++ b/modules/ANSODEngine/ANSYOLO12OD.cpp
@@ -1,5 +1,6 @@
 ﻿#include "ANSYOLO12OD.h"
 #include "EPLoader.h"
+#include "OpenVINODeviceConfig.h"
 #ifdef USEONNXOV
 #endif
 
@@ -365,20 +366,26 @@ namespace ANSCENTER {
                     const std::string numberOfThreads = "8";
                     const std::string numberOfStreams = "8";
 
-                    std::vector<std::unordered_map<std::string, std::string>> try_configs = {
-                        { {"device_type","AUTO:NPU,GPU"}, {"precision",precision},
-                          {"num_of_threads",numberOfThreads}, {"num_streams",numberOfStreams},
-                          {"enable_opencl_throttling","False"}, {"enable_qdq_optimizer","True"} },
+                    std::vector<std::unordered_map<std::string, std::string>> try_configs;
+                    // NPU gated by OPENVINO_ENABLE_NPU — see OpenVINODeviceConfig.h
+                    if (IsOpenVINONpuEnabled()) {
+                        try_configs.push_back(
+                            { {"device_type","AUTO:NPU,GPU"}, {"precision",precision},
+                              {"num_of_threads",numberOfThreads}, {"num_streams",numberOfStreams},
+                              {"enable_opencl_throttling","False"}, {"enable_qdq_optimizer","True"} });
+                    }
+                    try_configs.push_back(
                         { {"device_type","GPU.0"}, {"precision",precision},
                           {"num_of_threads",numberOfThreads}, {"num_streams",numberOfStreams},
-                          {"enable_opencl_throttling","False"}, {"enable_qdq_optimizer","True"} },
+                          {"enable_opencl_throttling","False"}, {"enable_qdq_optimizer","True"} });
+                    try_configs.push_back(
                         { {"device_type","GPU.1"}, {"precision",precision},
                           {"num_of_threads",numberOfThreads}, {"num_streams",numberOfStreams},
-                          {"enable_opencl_throttling","False"}, {"enable_qdq_optimizer","True"} },
+                          {"enable_opencl_throttling","False"}, {"enable_qdq_optimizer","True"} });
+                    try_configs.push_back(
                         { {"device_type","AUTO:GPU,CPU"}, {"precision",precision},
                           {"num_of_threads",numberOfThreads}, {"num_streams",numberOfStreams},
-                          {"enable_opencl_throttling","False"}, {"enable_qdq_optimizer","True"} }
-                    };
+                          {"enable_opencl_throttling","False"}, {"enable_qdq_optimizer","True"} });
 
                     for (const auto& config : try_configs) {
                         try {
diff --git a/modules/ANSODEngine/ANSYOLOOD.cpp b/modules/ANSODEngine/ANSYOLOOD.cpp
index df12098..bcb502d 100644
--- a/modules/ANSODEngine/ANSYOLOOD.cpp
+++ b/modules/ANSODEngine/ANSYOLOOD.cpp
@@ -1,6 +1,7 @@
 ﻿#include "ANSYOLOOD.h"
 #include "Utility.h"
 #include "EPLoader.h"
+#include "OpenVINODeviceConfig.h"
 #include "ANSGpuFrameRegistry.h"
 #include "NV12PreprocessHelper.h"   // tl_currentGpuFrame()
 #ifdef USEONNXOV
@@ -303,20 +304,26 @@ namespace ANSCENTER
 					const std::string numberOfThreads = "8";
 					const std::string numberOfStreams = "8";
 
-					std::vector<std::unordered_map<std::string, std::string>> try_configs = {
-						{ {"device_type","AUTO:NPU,GPU"}, {"precision",precision},
-						  {"num_of_threads",numberOfThreads}, {"num_streams",numberOfStreams},
-						  {"enable_opencl_throttling","False"}, {"enable_qdq_optimizer","True"} },
+					std::vector<std::unordered_map<std::string, std::string>> try_configs;
+					// NPU gated by OPENVINO_ENABLE_NPU — see OpenVINODeviceConfig.h
+					if (IsOpenVINONpuEnabled()) {
+						try_configs.push_back(
+							{ {"device_type","AUTO:NPU,GPU"}, {"precision",precision},
+							  {"num_of_threads",numberOfThreads}, {"num_streams",numberOfStreams},
+							  {"enable_opencl_throttling","False"}, {"enable_qdq_optimizer","True"} });
+					}
+					try_configs.push_back(
 						{ {"device_type","GPU.0"}, {"precision",precision},
 						  {"num_of_threads",numberOfThreads}, {"num_streams",numberOfStreams},
-						  {"enable_opencl_throttling","False"}, {"enable_qdq_optimizer","True"} },
+						  {"enable_opencl_throttling","False"}, {"enable_qdq_optimizer","True"} });
+					try_configs.push_back(
 						{ {"device_type","GPU.1"}, {"precision",precision},
 						  {"num_of_threads",numberOfThreads}, {"num_streams",numberOfStreams},
-						  {"enable_opencl_throttling","False"}, {"enable_qdq_optimizer","True"} },
+						  {"enable_opencl_throttling","False"}, {"enable_qdq_optimizer","True"} });
+					try_configs.push_back(
 						{ {"device_type","AUTO:GPU,CPU"}, {"precision",precision},
 						  {"num_of_threads",numberOfThreads}, {"num_streams",numberOfStreams},
-						  {"enable_opencl_throttling","False"}, {"enable_qdq_optimizer","True"} }
-					};
+						  {"enable_opencl_throttling","False"}, {"enable_qdq_optimizer","True"} });
 
 					for (const auto& config : try_configs) {
 						try {