From 97d814936d01b1acd5c98e6addb2b9620740599f Mon Sep 17 00:00:00 2001 From: Tuan Nghia Nguyen Date: Tue, 21 Apr 2026 15:48:27 +1000 Subject: [PATCH] Enable log information. Disable NPU in U9 --- .claude/settings.local.json | 13 +++- MediaClient/media/video_decoder.cpp | 43 ++++++++++- MediaClient/media/video_player.cpp | 75 ++++++++++++++++++- MediaClient/media/video_player.h | 23 +++++- engines/ONNXEngine/ONNXEngine.cpp | 7 +- engines/ONNXEngine/ONNXSAM3.cpp | 13 ++-- engines/ONNXEngine/OpenVINODeviceConfig.h | 38 ++++++++++ modules/ANSFR/ANSFR.cpp | 9 ++- modules/ANSLPR/ANSLPR_CPU.cpp | 7 +- modules/ANSMOT/ByteTrack/src/BYTETracker.cpp | 19 +++++ modules/ANSODEngine/ANSFaceDetectorEngine.cpp | 9 ++- modules/ANSODEngine/ANSODEngine.cpp | 7 +- modules/ANSODEngine/ANSONNXCL.cpp | 23 ++++-- modules/ANSODEngine/ANSOPENVINOCL.cpp | 7 +- modules/ANSODEngine/ANSOPENVINOOD.cpp | 8 +- modules/ANSODEngine/ANSOVSEG.cpp | 8 +- modules/ANSODEngine/ANSYOLO12OD.cpp | 23 ++++-- modules/ANSODEngine/ANSYOLOOD.cpp | 23 ++++-- 18 files changed, 301 insertions(+), 54 deletions(-) create mode 100644 engines/ONNXEngine/OpenVINODeviceConfig.h diff --git a/.claude/settings.local.json b/.claude/settings.local.json index af377d6..51de336 100644 --- a/.claude/settings.local.json +++ b/.claude/settings.local.json @@ -13,7 +13,18 @@ "Bash(powershell.exe -NoProfile -Command \"[System.Environment]::GetEnvironmentVariable\\('PATH','Machine'\\) -split ';' | Select-String -Pattern 'ANSCENTER|Shared'\")", "Bash(cmd.exe //c 'dir /AL \"C:\\\\Program Files\\\\ANSCENTER\\\\ANSVIS\\\\data\" 2>&1 | findstr /i \"junction symlink\"')", "Bash(cmd.exe //c 'dir /AL \"C:\\\\Program Files\\\\ANSCENTER\\\\ANSVIS\\\\data\"')", - "PowerShell(Get-ChildItem \"C:\\\\Program Files\\\\ANSCENTER\\\\ANSVIS\\\\data\" -Force | Where-Object { $_.LinkType } | Select-Object Name, LinkType, Target | Format-Table -AutoSize)" + "PowerShell(Get-ChildItem \"C:\\\\Program Files\\\\ANSCENTER\\\\ANSVIS\\\\data\" -Force | Where-Object 
{ $_.LinkType } | Select-Object Name, LinkType, Target | Format-Table -AutoSize)", + "Bash(awk '{print \"start: \"$2\"s\"}')", + "Bash(awk '{print \"end: \"$2\"s\"}')", + "Bash(awk '{ *)", + "Bash(awk '{v[NR]=$1} END {asort\\(v\\); n=length\\(v\\); printf \"count=%d\\\\nmedian=%.1fms\\\\np90=%.1fms\\\\np95=%.1fms\\\\np99=%.1fms\\\\nmax=%.1fms\\\\n\", n, v[int\\(n*0.5\\)], v[int\\(n*0.9\\)], v[int\\(n*0.95\\)], v[int\\(n*0.99\\)], v[n]}')", + "Bash(awk '{v[NR]=$1} END {asort\\(v\\); n=length\\(v\\); printf \"slow_inf_count=%d \\(over %d total inferences = %.1f%%\\)\\\\nmedian=%.1fms max=%.1fms\\\\n\", n, 10456, 100.0*n/10456, v[int\\(n*0.5\\)], v[n]}')", + "Bash(awk '{v[NR]=$1} END {asort\\(v\\); n=length\\(v\\); if\\(n>0\\){printf \"slow_getImage_count=%d median=%.1fms max=%.1fms\\\\n\", n, v[int\\(n*0.5\\)], v[n]}}')", + "Bash(awk -F= '{print $2}')", + "Bash(git -C \"C:\\\\Projects\\\\CLionProjects\\\\ANSCORE\" status --short engines/ONNXEngine/ONNXEngine.cpp)", + "Bash(git -C \"C:\\\\Projects\\\\CLionProjects\\\\ANSCORE\" diff engines/ONNXEngine/ONNXEngine.cpp)", + "Bash(git -C \"C:\\\\Projects\\\\CLionProjects\\\\ANSCORE\" status --short)", + "Bash(grep -E \"\\\\.\\(cpp|h\\)$\")" ] } } diff --git a/MediaClient/media/video_decoder.cpp b/MediaClient/media/video_decoder.cpp index 963d311..33efd46 100644 --- a/MediaClient/media/video_decoder.cpp +++ b/MediaClient/media/video_decoder.cpp @@ -4,6 +4,7 @@ #include "lock.h" #include "media_codec.h" #include "media_parse.h" +#include #include #include "ANSLicense.h" // ANS_DBG macro (gated by ANSCORE_DEBUGVIEW) @@ -14,6 +15,16 @@ extern "C" { #include "libavutil/mem.h" } +// --------------------------------------------------------------------------- +// Leak diagnostics — exported counters for media allocation balance. +// Incremented in allocation sites, decremented in free paths. If (alloc - +// free) climbs monotonically over time, the allocator is leaking. 
+// Read by the MEDIA_Leak heartbeat in video_player.cpp (every 60 s). +// --------------------------------------------------------------------------- +std::atomic g_contiguousAllocs{0}; +std::atomic g_contiguousFrees{0}; +std::atomic g_contiguousBytesInFlight{0}; // sum(total) of unfreed buffers + // --------------------------------------------------------------------------- // Contiguous YUV420P allocator — trims per-call malloc overhead and enables // the zero-copy fast path in avframeYUV420PToCvMat for resolutions where the @@ -23,7 +34,20 @@ extern "C" { // single-block layout still improves cache behaviour for the bulk memcpy.) // --------------------------------------------------------------------------- namespace { - void anscore_contiguous_free(void* /*opaque*/, uint8_t* data) { + // Opaque payload stored in AVBufferRef so the free callback can account + // for the exact byte count being returned (no global lookup needed). + struct ContiguousOpaque { + size_t bytes; + }; + + void anscore_contiguous_free(void* opaque, uint8_t* data) { + if (opaque) { + auto* o = static_cast(opaque); + g_contiguousBytesInFlight.fetch_sub(static_cast(o->bytes), + std::memory_order_relaxed); + delete o; + } + g_contiguousFrees.fetch_add(1, std::memory_order_relaxed); av_free(data); } } @@ -77,13 +101,24 @@ int CVideoDecoder::contiguousGetBuffer2(AVCodecContext* s, AVFrame* frame, int f return AVERROR(ENOMEM); } - AVBufferRef* ref = av_buffer_create(buf, (int)total, - anscore_contiguous_free, nullptr, 0); - if (!ref) { + auto* opaque = new (std::nothrow) ContiguousOpaque{total}; + if (!opaque) { av_free(buf); return AVERROR(ENOMEM); } + AVBufferRef* ref = av_buffer_create(buf, (int)total, + anscore_contiguous_free, opaque, 0); + if (!ref) { + delete opaque; + av_free(buf); + return AVERROR(ENOMEM); + } + + g_contiguousAllocs.fetch_add(1, std::memory_order_relaxed); + g_contiguousBytesInFlight.fetch_add(static_cast(total), + std::memory_order_relaxed); + for (int i = 0; i < 
AV_NUM_DATA_POINTERS; ++i) { frame->buf[i] = nullptr; frame->data[i] = nullptr; diff --git a/MediaClient/media/video_player.cpp b/MediaClient/media/video_player.cpp index 7b1ef57..0b2ccce 100644 --- a/MediaClient/media/video_player.cpp +++ b/MediaClient/media/video_player.cpp @@ -37,6 +37,22 @@ extern "C" #include "ANSLicense.h" // ANS_DBG macro (gated by ANSCORE_DEBUGVIEW) +// --------------------------------------------------------------------------- +// Leak diagnostics — definitions for counters declared extern in header. +// Also references counters defined in video_decoder.cpp so the heartbeat +// below can report media allocator balance in a single line. +// --------------------------------------------------------------------------- +std::atomic g_queueClones{0}; +std::atomic g_queueFrees{0}; +std::atomic g_nv12Clones{0}; +std::atomic g_nv12Frees{0}; +std::atomic g_cudaHWClones{0}; +std::atomic g_cudaHWFrees{0}; + +extern std::atomic g_contiguousAllocs; +extern std::atomic g_contiguousFrees; +extern std::atomic g_contiguousBytesInFlight; + // libyuv: SIMD-accelerated YUV↔RGB conversion with native strided-plane input. // Replaces the memcpy-into-staging + cv::cvtColor(COLOR_YUV2BGR_I420) chain // in avframeYUV420PToCvMat with a direct I420→RGB24 (== OpenCV BGR memory @@ -1629,10 +1645,12 @@ void CVideoPlayer::close() closeAudio(); if (m_currentNV12Frame) { av_frame_free(&m_currentNV12Frame); + g_nv12Frees.fetch_add(1, std::memory_order_relaxed); m_currentNV12Frame = nullptr; } if (m_currentCudaHWFrame) { av_frame_free(&m_currentCudaHWFrame); + g_cudaHWFrees.fetch_add(1, std::memory_order_relaxed); m_currentCudaHWFrame = nullptr; } if (m_pSnapFrame) @@ -2329,8 +2347,12 @@ void CVideoPlayer::onVideoFrame(AVFrame* frame) // and we can safely clone the CUDA frame without deadlock risk. // cloneCudaHWFrame_unlocked() is safe because decoder._mutex is already held. 
if (m_pVideoDecoder && m_pVideoDecoder->isCudaHWAccel()) { - if (m_currentCudaHWFrame) av_frame_free(&m_currentCudaHWFrame); + if (m_currentCudaHWFrame) { + av_frame_free(&m_currentCudaHWFrame); + g_cudaHWFrees.fetch_add(1, std::memory_order_relaxed); + } m_currentCudaHWFrame = m_pVideoDecoder->cloneCudaHWFrame_unlocked(); + if (m_currentCudaHWFrame) g_cudaHWClones.fetch_add(1, std::memory_order_relaxed); } // Track how many clean frames have arrived since keyframe @@ -2455,8 +2477,12 @@ cv::Mat CVideoPlayer::getImage(int& width, int& height, int64_t& pts) { (frameToProcess->format == AV_PIX_FMT_NV12 || frameToProcess->format == AV_PIX_FMT_YUV420P || frameToProcess->format == AV_PIX_FMT_YUVJ420P)) { - if (m_currentNV12Frame) av_frame_free(&m_currentNV12Frame); + if (m_currentNV12Frame) { + av_frame_free(&m_currentNV12Frame); + g_nv12Frees.fetch_add(1, std::memory_order_relaxed); + } m_currentNV12Frame = av_frame_clone(frameToProcess); + if (m_currentNV12Frame) g_nv12Clones.fetch_add(1, std::memory_order_relaxed); } width = m_currentImage.cols; @@ -2466,6 +2492,49 @@ cv::Mat CVideoPlayer::getImage(int& width, int& height, int64_t& pts) { } av_frame_free(&frameToProcess); + g_queueFrees.fetch_add(1, std::memory_order_relaxed); + + // Leak diagnostics — one heartbeat every 60 s across the whole process. + // Each counter pair (allocs, frees) should stay balanced. A monotonic + // rise in (allocs - frees) identifies the leaking pool. Bytes field + // covers the ~12 MB/frame contiguous YUV420P buffers specifically — + // watch for steady climb while the counters look balanced (refcount + // leak in a held clone would show that shape). 
+ { + using clk = std::chrono::steady_clock; + static std::atomic s_nextLeakLogTick{0}; + const long long tick = clk::now().time_since_epoch().count(); + long long expected = s_nextLeakLogTick.load(std::memory_order_relaxed); + if (tick >= expected) { + const long long deadline = tick + + std::chrono::duration_cast( + std::chrono::seconds(60)).count(); + // Claim the next window — first writer wins so only one thread logs. + if (s_nextLeakLogTick.compare_exchange_strong( + expected, deadline, std::memory_order_relaxed)) { + const int64_t qA = g_queueClones.load(std::memory_order_relaxed); + const int64_t qF = g_queueFrees.load(std::memory_order_relaxed); + const int64_t nvA = g_nv12Clones.load(std::memory_order_relaxed); + const int64_t nvF = g_nv12Frees.load(std::memory_order_relaxed); + const int64_t cuA = g_cudaHWClones.load(std::memory_order_relaxed); + const int64_t cuF = g_cudaHWFrees.load(std::memory_order_relaxed); + const int64_t cgA = g_contiguousAllocs.load(std::memory_order_relaxed); + const int64_t cgF = g_contiguousFrees.load(std::memory_order_relaxed); + const int64_t cgB = g_contiguousBytesInFlight.load(std::memory_order_relaxed); + ANS_DBG("MEDIA_Leak", + "queue(C=%lld F=%lld net=%lld depth=%zu) " + "nv12(C=%lld F=%lld net=%lld) " + "cudaHW(C=%lld F=%lld net=%lld) " + "contig(A=%lld F=%lld net=%lld bytesMB=%.1f)", + (long long)qA, (long long)qF, (long long)(qA - qF), + g_frameQueue.size(), + (long long)nvA, (long long)nvF, (long long)(nvA - nvF), + (long long)cuA, (long long)cuF, (long long)(cuA - cuF), + (long long)cgA, (long long)cgF, (long long)(cgA - cgF), + (double)cgB / (1024.0 * 1024.0)); + } + } + } // Emit timing breakdown. Throttled so DebugView / stderr stay readable. 
{ @@ -2540,11 +2609,13 @@ std::string CVideoPlayer::getJpegImage(int& width, int& height, int64_t& pts) { catch (const std::exception& e) { std::cerr << "Exception while converting AVFrame to JPEG string: " << e.what() << std::endl; av_frame_free(&frameToProcess); + g_queueFrees.fetch_add(1, std::memory_order_relaxed); return m_lastJpegImage; } const auto t3 = clk::now(); av_frame_free(&frameToProcess); + g_queueFrees.fetch_add(1, std::memory_order_relaxed); if (m_pts < INT64_MAX) { m_pts++; diff --git a/MediaClient/media/video_player.h b/MediaClient/media/video_player.h index 6fc430c..7ad3b8d 100644 --- a/MediaClient/media/video_player.h +++ b/MediaClient/media/video_player.h @@ -15,8 +15,18 @@ #include #include #include +#include #include +// Leak diagnostics — net counters surfaced in MEDIA_Leak heartbeat. +// Defined in video_player.cpp; also incremented from FrameQueue here. +extern std::atomic g_queueClones; // av_frame_clone from FrameQueue +extern std::atomic g_queueFrees; // av_frame_free from FrameQueue +extern std::atomic g_nv12Clones; // m_currentNV12Frame = av_frame_clone +extern std::atomic g_nv12Frees; // av_frame_free(&m_currentNV12Frame) +extern std::atomic g_cudaHWClones; // m_currentCudaHWFrame = clone +extern std::atomic g_cudaHWFrees; // av_frame_free(&m_currentCudaHWFrame) + typedef struct { uint32 SyncTimestamp; @@ -46,6 +56,7 @@ public: std::cerr << "Failed to clone AVFrame!" 
<< std::endl; return; } + g_queueClones.fetch_add(1, std::memory_order_relaxed); frameQueue.push(frameCopy); m_frameSeq++; // New frame arrived @@ -55,6 +66,7 @@ public: AVFrame* oldFrame = frameQueue.front(); frameQueue.pop(); av_frame_free(&oldFrame); + g_queueFrees.fetch_add(1, std::memory_order_relaxed); } } @@ -73,7 +85,15 @@ public: } // Clone the latest frame before returning it - return av_frame_clone(frameQueue.back()); + AVFrame* clone = av_frame_clone(frameQueue.back()); + if (clone) g_queueClones.fetch_add(1, std::memory_order_relaxed); + return clone; + } + + // Current depth — snapshot used by the leak heartbeat. + size_t size() { + std::lock_guard lock(queueMutex); + return frameQueue.size(); } // Retrieve and remove the oldest frame from the queue @@ -102,6 +122,7 @@ public: AVFrame* frame = frameQueue.front(); frameQueue.pop(); av_frame_free(&frame); + g_queueFrees.fetch_add(1, std::memory_order_relaxed); } m_frameSeq = 0; } diff --git a/engines/ONNXEngine/ONNXEngine.cpp b/engines/ONNXEngine/ONNXEngine.cpp index 7a2881b..0dfd32e 100644 --- a/engines/ONNXEngine/ONNXEngine.cpp +++ b/engines/ONNXEngine/ONNXEngine.cpp @@ -1,8 +1,10 @@ #include "ONNXEngine.h" #include "EPLoader.h" +#include "OpenVINODeviceConfig.h" #include "Utility.h" #include +#include #include #include #include @@ -318,8 +320,9 @@ namespace ANSCENTER { std::vector> try_configs; - // Only try NPU if it hasn't been probed yet or was previously available - if (!s_npuProbed || s_npuAvailable) { + // NPU is disabled by default — see OpenVINODeviceConfig.h. Opt in via + // OPENVINO_ENABLE_NPU=1. Even when enabled, skip if a prior probe failed. 
+ if (IsOpenVINONpuEnabled() && (!s_npuProbed || s_npuAvailable)) { try_configs.push_back(makeConfig("AUTO:NPU,GPU")); } try_configs.push_back(makeConfig("GPU.0")); diff --git a/engines/ONNXEngine/ONNXSAM3.cpp b/engines/ONNXEngine/ONNXSAM3.cpp index 74d537c..8713a2e 100644 --- a/engines/ONNXEngine/ONNXSAM3.cpp +++ b/engines/ONNXEngine/ONNXSAM3.cpp @@ -1,5 +1,6 @@ #include "ONNXSAM3.h" #include "ONNXEngine.h" // OrtCompatiableGetInputName/OutputName helpers +#include "OpenVINODeviceConfig.h" #include #include @@ -73,11 +74,13 @@ namespace ANSCENTER bool ONNXSAM3::TryAppendOpenVINO(Ort::SessionOptions& session_options) { - std::vector> configs = { - {{"device_type","AUTO:NPU,GPU"},{"precision","FP16"},{"num_of_threads","4"},{"num_streams","4"}}, - {{"device_type","GPU.0"}, {"precision","FP16"},{"num_of_threads","4"},{"num_streams","4"}}, - {{"device_type","AUTO:GPU,CPU"},{"precision","FP16"},{"num_of_threads","4"},{"num_streams","4"}} - }; + // NPU gated by OPENVINO_ENABLE_NPU — see OpenVINODeviceConfig.h + std::vector> configs; + if (IsOpenVINONpuEnabled()) { + configs.push_back({{"device_type","AUTO:NPU,GPU"},{"precision","FP16"},{"num_of_threads","4"},{"num_streams","4"}}); + } + configs.push_back({{"device_type","GPU.0"}, {"precision","FP16"},{"num_of_threads","4"},{"num_streams","4"}}); + configs.push_back({{"device_type","AUTO:GPU,CPU"},{"precision","FP16"},{"num_of_threads","4"},{"num_streams","4"}}); for (const auto& config : configs) { try { session_options.AppendExecutionProvider_OpenVINO_V2(config); diff --git a/engines/ONNXEngine/OpenVINODeviceConfig.h b/engines/ONNXEngine/OpenVINODeviceConfig.h new file mode 100644 index 0000000..4e030a8 --- /dev/null +++ b/engines/ONNXEngine/OpenVINODeviceConfig.h @@ -0,0 +1,38 @@ +#pragma once + +// Shared runtime switch for enabling the Intel NPU in OpenVINO code paths. 
+// +// NPU is DISABLED BY DEFAULT because the NPU plugin on some Intel platforms +// (observed: Core Ultra 9 285K / Arrow Lake) crashes inside +// ov::Core::compile_model or Ort::Session construction when compiling +// multiple ONNX models in quick succession. That failure mode cannot be +// caught by the surrounding try/catch (it fires on a plugin worker thread) +// and takes down the host process. +// +// To opt into NPU (e.g. on a machine with a known-good NPU driver), set the +// environment variable OPENVINO_ENABLE_NPU to 1 / true / yes / on before +// launching the host process. +// +// Every OpenVINO device-selection site in this codebase consults this helper +// rather than probing NPU unconditionally. + +#include +#include +#include +#include + +namespace ANSCENTER { + + inline bool IsOpenVINONpuEnabled() { + static const bool enabled = [] { + const char* v = std::getenv("OPENVINO_ENABLE_NPU"); + if (!v || !*v) return false; + std::string s(v); + std::transform(s.begin(), s.end(), s.begin(), + [](unsigned char c) { return static_cast(std::tolower(c)); }); + return s == "1" || s == "true" || s == "yes" || s == "on"; + }(); + return enabled; + } + +} diff --git a/modules/ANSFR/ANSFR.cpp b/modules/ANSFR/ANSFR.cpp index cc53785..e603ada 100644 --- a/modules/ANSFR/ANSFR.cpp +++ b/modules/ANSFR/ANSFR.cpp @@ -1,4 +1,5 @@ #include "ANSFR.h" +#include "OpenVINODeviceConfig.h" #include #include "ANSOVFaceDetector.h" #include "SCRFDFaceDetector.h" @@ -2695,8 +2696,12 @@ namespace ANSCENTER { for (const auto& d : available_devices) { ANS_DBG("ANSFR", " OpenVINO device: %s", d.c_str()); } - // Prioritize devices: NPU > GPU > CPU - std::vector priority_devices = { "NPU","GPU","CPU" }; + // Prioritize devices: NPU > GPU > CPU. NPU gated behind runtime switch + // (OPENVINO_ENABLE_NPU=1) — see OpenVINODeviceConfig.h. 
+ std::vector priority_devices; + if (IsOpenVINONpuEnabled()) priority_devices.push_back("NPU"); + priority_devices.push_back("GPU"); + priority_devices.push_back("CPU"); for (const auto& device : priority_devices) { if (std::find(available_devices.begin(), available_devices.end(), device) != available_devices.end()) { ANS_DBG("ANSFR", "GetOpenVINODevice: selected %s", device.c_str()); diff --git a/modules/ANSLPR/ANSLPR_CPU.cpp b/modules/ANSLPR/ANSLPR_CPU.cpp index 92135df..e7faa14 100644 --- a/modules/ANSLPR/ANSLPR_CPU.cpp +++ b/modules/ANSLPR/ANSLPR_CPU.cpp @@ -1,4 +1,5 @@ #include "ANSLPR_CPU.h" +#include "OpenVINODeviceConfig.h" #include "ANSYOLOV10OVOD.h" #include "ANSOPENVINOOD.h" #include "ANSTENSORRTOD.h" @@ -119,8 +120,10 @@ namespace ANSCENTER { std::vector available_devices = _core.get_available_devices(); bool device_found = false; std::string deviceName = "CPU"; - // Search for NPU - auto it = std::find(available_devices.begin(), available_devices.end(), "NPU"); + // Search for NPU (gated by OPENVINO_ENABLE_NPU — see OpenVINODeviceConfig.h) + auto it = IsOpenVINONpuEnabled() + ? 
std::find(available_devices.begin(), available_devices.end(), "NPU") + : available_devices.end(); if (it != available_devices.end()) { _core.set_property("NPU", ov::hint::performance_mode(ov::hint::PerformanceMode::LATENCY)); _core.set_property("GPU", ov::hint::performance_mode(ov::hint::PerformanceMode::LATENCY)); diff --git a/modules/ANSMOT/ByteTrack/src/BYTETracker.cpp b/modules/ANSMOT/ByteTrack/src/BYTETracker.cpp index 771eb96..792cf65 100644 --- a/modules/ANSMOT/ByteTrack/src/BYTETracker.cpp +++ b/modules/ANSMOT/ByteTrack/src/BYTETracker.cpp @@ -1,4 +1,5 @@ #include "BYTETracker.h" +#include "ANSLicense.h" // ANS_DBG for tracker-state-size diagnostic #include #include #include @@ -322,6 +323,24 @@ std::vector ByteTrack::BYTETracker::update(co tracked_stracks_ = tracked_stracks_out; lost_stracks_ = lost_stracks_out; + // Diagnostic: report tracker state size at most once every 60 s per thread (the thread_local throttle below is shared by every tracker instance updated on that thread). + // removed_stracks_ is append-only in this implementation — watch it grow. + { + static thread_local std::chrono::steady_clock::time_point s_nextLog{}; + auto now = std::chrono::steady_clock::now(); + if (now >= s_nextLog) { + s_nextLog = now + std::chrono::seconds(60); + ANS_DBG("ANSMOT", + "BYTETracker state this=%p frame=%zu nextId=%zu tracked=%zu lost=%zu removed=%zu", + (void*)this, + frame_id_, + track_id_count_, + tracked_stracks_.size(), + lost_stracks_.size(), + removed_stracks_.size()); + } + } + std::vector output_stracks; for (const auto &track : tracked_stracks_) { diff --git a/modules/ANSODEngine/ANSFaceDetectorEngine.cpp b/modules/ANSODEngine/ANSFaceDetectorEngine.cpp index 5ab32ad..ca5d320 100644 --- a/modules/ANSODEngine/ANSFaceDetectorEngine.cpp +++ b/modules/ANSODEngine/ANSFaceDetectorEngine.cpp @@ -1,5 +1,6 @@ #pragma once #include "ANSODEngine.h" +#include "OpenVINODeviceConfig.h" #include "ANSYOLOOD.h" #include "ANSTENSORRTOD.h" #include "ANSTENSORRTCL.h" @@ -333,8 +334,10 @@ namespace ANSCENTER std::vector available_devices = 
core.get_available_devices(); bool device_found = false; std::string deviceName = "CPU"; - // Search for NPU - auto it = std::find(available_devices.begin(), available_devices.end(), "NPU"); + // Search for NPU (gated by OPENVINO_ENABLE_NPU — see OpenVINODeviceConfig.h) + auto it = IsOpenVINONpuEnabled() + ? std::find(available_devices.begin(), available_devices.end(), "NPU") + : available_devices.end(); if (it != available_devices.end()) { core.set_property("NPU", ov::hint::performance_mode(ov::hint::PerformanceMode::LATENCY)); core.set_property("GPU", ov::hint::performance_mode(ov::hint::PerformanceMode::LATENCY)); @@ -1414,7 +1417,7 @@ namespace ANSCENTER }; std::vector> try_configs; - if (!s_npuProbed || s_npuAvailable) { + if (IsOpenVINONpuEnabled() && (!s_npuProbed || s_npuAvailable)) { try_configs.push_back(makeConfig("AUTO:NPU,GPU")); } try_configs.push_back(makeConfig("GPU.0")); diff --git a/modules/ANSODEngine/ANSODEngine.cpp b/modules/ANSODEngine/ANSODEngine.cpp index 173e61d..f1c480f 100644 --- a/modules/ANSODEngine/ANSODEngine.cpp +++ b/modules/ANSODEngine/ANSODEngine.cpp @@ -4,6 +4,7 @@ #include #include "ANSODEngine.h" #include "ANSLicense.h" // ANS_DBG macro +#include "OpenVINODeviceConfig.h" #include "ANSYOLOOD.h" #include "ANSTENSORRTOD.h" #include "ANSTENSORRTCL.h" @@ -354,8 +355,10 @@ namespace ANSCENTER std::vector available_devices = core.get_available_devices(); bool device_found = false; std::string deviceName = "CPU"; - // Search for NPU - auto it = std::find(available_devices.begin(), available_devices.end(), "NPU"); + // Search for NPU (gated by OPENVINO_ENABLE_NPU — see OpenVINODeviceConfig.h) + auto it = IsOpenVINONpuEnabled() + ? 
std::find(available_devices.begin(), available_devices.end(), "NPU") + : available_devices.end(); if (it != available_devices.end()) { core.set_property("NPU", ov::hint::performance_mode(ov::hint::PerformanceMode::LATENCY)); core.set_property("GPU", ov::hint::performance_mode(ov::hint::PerformanceMode::LATENCY)); diff --git a/modules/ANSODEngine/ANSONNXCL.cpp b/modules/ANSODEngine/ANSONNXCL.cpp index 1fa72cf..7276dff 100644 --- a/modules/ANSODEngine/ANSONNXCL.cpp +++ b/modules/ANSODEngine/ANSONNXCL.cpp @@ -1,5 +1,6 @@ #include"ANSONNXCL.h" #include "EPLoader.h" +#include "OpenVINODeviceConfig.h" namespace ANSCENTER { @@ -143,20 +144,26 @@ namespace ANSCENTER const std::string numberOfThreads = "1"; const std::string numberOfStreams = "1"; - std::vector> try_configs = { - { {"device_type","AUTO:NPU,GPU"}, {"precision",precision}, - {"num_of_threads",numberOfThreads}, {"num_streams",numberOfStreams}, - {"enable_opencl_throttling","False"}, {"enable_qdq_optimizer","False"} }, + std::vector> try_configs; + // NPU gated by OPENVINO_ENABLE_NPU — see OpenVINODeviceConfig.h + if (IsOpenVINONpuEnabled()) { + try_configs.push_back( + { {"device_type","AUTO:NPU,GPU"}, {"precision",precision}, + {"num_of_threads",numberOfThreads}, {"num_streams",numberOfStreams}, + {"enable_opencl_throttling","False"}, {"enable_qdq_optimizer","False"} }); + } + try_configs.push_back( { {"device_type","GPU.0"}, {"precision",precision}, {"num_of_threads",numberOfThreads}, {"num_streams",numberOfStreams}, - {"enable_opencl_throttling","False"}, {"enable_qdq_optimizer","False"} }, + {"enable_opencl_throttling","False"}, {"enable_qdq_optimizer","False"} }); + try_configs.push_back( { {"device_type","GPU.1"}, {"precision",precision}, {"num_of_threads",numberOfThreads}, {"num_streams",numberOfStreams}, - {"enable_opencl_throttling","False"}, {"enable_qdq_optimizer","False"} }, + {"enable_opencl_throttling","False"}, {"enable_qdq_optimizer","False"} }); + try_configs.push_back( { 
{"device_type","AUTO:GPU,CPU"}, {"precision",precision}, {"num_of_threads",numberOfThreads}, {"num_streams",numberOfStreams}, - {"enable_opencl_throttling","False"}, {"enable_qdq_optimizer","False"} } - }; + {"enable_opencl_throttling","False"}, {"enable_qdq_optimizer","False"} }); for (const auto& config : try_configs) { try { diff --git a/modules/ANSODEngine/ANSOPENVINOCL.cpp b/modules/ANSODEngine/ANSOPENVINOCL.cpp index 3fb3214..a491f9f 100644 --- a/modules/ANSODEngine/ANSOPENVINOCL.cpp +++ b/modules/ANSODEngine/ANSOPENVINOCL.cpp @@ -1,5 +1,6 @@ #include "ANSOPENVINOCL.h" #include "Utility.h" +#include "OpenVINODeviceConfig.h" namespace ANSCENTER { bool OPENVINOCL::OptimizeModel(bool fp16, std::string& optimizedModelFolder) { @@ -369,8 +370,10 @@ namespace ANSCENTER std::vector available_devices = core.get_available_devices(); bool device_found = false; - // Search for NPU - auto it = std::find(available_devices.begin(), available_devices.end(), "NPU"); + // Search for NPU (gated by OPENVINO_ENABLE_NPU — see OpenVINODeviceConfig.h) + auto it = IsOpenVINONpuEnabled() + ? 
std::find(available_devices.begin(), available_devices.end(), "NPU") + : available_devices.end(); if (it != available_devices.end()) { core.set_property("NPU", ov::hint::performance_mode(ov::hint::PerformanceMode::LATENCY)); core.set_property("GPU", ov::hint::performance_mode(ov::hint::PerformanceMode::LATENCY)); diff --git a/modules/ANSODEngine/ANSOPENVINOOD.cpp b/modules/ANSODEngine/ANSOPENVINOOD.cpp index 4da131a..c2b4000 100644 --- a/modules/ANSODEngine/ANSOPENVINOOD.cpp +++ b/modules/ANSODEngine/ANSOPENVINOOD.cpp @@ -1,5 +1,6 @@ #include "ANSOPENVINOOD.h" #include "Utility.h" +#include "OpenVINODeviceConfig.h" namespace ANSCENTER { bool OPENVINOOD::OptimizeModel(bool fp16, std::string& optimizedModelFolder) { @@ -437,8 +438,11 @@ namespace ANSCENTER ov::Core core; // Step 2: Get Available Devices and Log std::vector available_devices = core.get_available_devices(); - // Define device priority: NPU > GPU > CPU - std::vector priority_devices = { "NPU", "GPU" }; + // Define device priority: NPU > GPU > CPU. NPU gated by + // OPENVINO_ENABLE_NPU — see OpenVINODeviceConfig.h. + std::vector priority_devices; + if (IsOpenVINONpuEnabled()) priority_devices.push_back("NPU"); + priority_devices.push_back("GPU"); bool device_found = false; // Iterate over prioritized devices diff --git a/modules/ANSODEngine/ANSOVSEG.cpp b/modules/ANSODEngine/ANSOVSEG.cpp index 3b796e1..969e473 100644 --- a/modules/ANSODEngine/ANSOVSEG.cpp +++ b/modules/ANSODEngine/ANSOVSEG.cpp @@ -1,4 +1,5 @@ #include "ANSOVSEG.h" +#include "OpenVINODeviceConfig.h" namespace ANSCENTER { bool ANSOVSEG::OptimizeModel(bool fp16, std::string& optimizedModelFolder) { std::lock_guard lock(_mutex); @@ -493,8 +494,11 @@ namespace ANSCENTER { ov::Core core; // Step 2: Get Available Devices and Log std::vector available_devices = core.get_available_devices(); - // Define device priority: NPU > GPU > CPU - std::vector priority_devices = { "NPU", "GPU" }; + // Define device priority: NPU > GPU > CPU. 
NPU gated by + // OPENVINO_ENABLE_NPU — see OpenVINODeviceConfig.h. + std::vector priority_devices; + if (IsOpenVINONpuEnabled()) priority_devices.push_back("NPU"); + priority_devices.push_back("GPU"); bool device_found = false; // Iterate over prioritized devices diff --git a/modules/ANSODEngine/ANSYOLO12OD.cpp b/modules/ANSODEngine/ANSYOLO12OD.cpp index ab9e03b..e9e7829 100644 --- a/modules/ANSODEngine/ANSYOLO12OD.cpp +++ b/modules/ANSODEngine/ANSYOLO12OD.cpp @@ -1,5 +1,6 @@ #include "ANSYOLO12OD.h" #include "EPLoader.h" +#include "OpenVINODeviceConfig.h" #ifdef USEONNXOV #endif @@ -365,20 +366,26 @@ namespace ANSCENTER { const std::string numberOfThreads = "8"; const std::string numberOfStreams = "8"; - std::vector> try_configs = { - { {"device_type","AUTO:NPU,GPU"}, {"precision",precision}, - {"num_of_threads",numberOfThreads}, {"num_streams",numberOfStreams}, - {"enable_opencl_throttling","False"}, {"enable_qdq_optimizer","True"} }, + std::vector> try_configs; + // NPU gated by OPENVINO_ENABLE_NPU — see OpenVINODeviceConfig.h + if (IsOpenVINONpuEnabled()) { + try_configs.push_back( + { {"device_type","AUTO:NPU,GPU"}, {"precision",precision}, + {"num_of_threads",numberOfThreads}, {"num_streams",numberOfStreams}, + {"enable_opencl_throttling","False"}, {"enable_qdq_optimizer","True"} }); + } + try_configs.push_back( { {"device_type","GPU.0"}, {"precision",precision}, {"num_of_threads",numberOfThreads}, {"num_streams",numberOfStreams}, - {"enable_opencl_throttling","False"}, {"enable_qdq_optimizer","True"} }, + {"enable_opencl_throttling","False"}, {"enable_qdq_optimizer","True"} }); + try_configs.push_back( { {"device_type","GPU.1"}, {"precision",precision}, {"num_of_threads",numberOfThreads}, {"num_streams",numberOfStreams}, - {"enable_opencl_throttling","False"}, {"enable_qdq_optimizer","True"} }, + {"enable_opencl_throttling","False"}, {"enable_qdq_optimizer","True"} }); + try_configs.push_back( { {"device_type","AUTO:GPU,CPU"}, {"precision",precision}, 
{"num_of_threads",numberOfThreads}, {"num_streams",numberOfStreams}, - {"enable_opencl_throttling","False"}, {"enable_qdq_optimizer","True"} } - }; + {"enable_opencl_throttling","False"}, {"enable_qdq_optimizer","True"} }); for (const auto& config : try_configs) { try { diff --git a/modules/ANSODEngine/ANSYOLOOD.cpp b/modules/ANSODEngine/ANSYOLOOD.cpp index df12098..bcb502d 100644 --- a/modules/ANSODEngine/ANSYOLOOD.cpp +++ b/modules/ANSODEngine/ANSYOLOOD.cpp @@ -1,6 +1,7 @@ #include "ANSYOLOOD.h" #include "Utility.h" #include "EPLoader.h" +#include "OpenVINODeviceConfig.h" #include "ANSGpuFrameRegistry.h" #include "NV12PreprocessHelper.h" // tl_currentGpuFrame() #ifdef USEONNXOV @@ -303,20 +304,26 @@ namespace ANSCENTER const std::string numberOfThreads = "8"; const std::string numberOfStreams = "8"; - std::vector> try_configs = { - { {"device_type","AUTO:NPU,GPU"}, {"precision",precision}, - {"num_of_threads",numberOfThreads}, {"num_streams",numberOfStreams}, - {"enable_opencl_throttling","False"}, {"enable_qdq_optimizer","True"} }, + std::vector> try_configs; + // NPU gated by OPENVINO_ENABLE_NPU — see OpenVINODeviceConfig.h + if (IsOpenVINONpuEnabled()) { + try_configs.push_back( + { {"device_type","AUTO:NPU,GPU"}, {"precision",precision}, + {"num_of_threads",numberOfThreads}, {"num_streams",numberOfStreams}, + {"enable_opencl_throttling","False"}, {"enable_qdq_optimizer","True"} }); + } + try_configs.push_back( { {"device_type","GPU.0"}, {"precision",precision}, {"num_of_threads",numberOfThreads}, {"num_streams",numberOfStreams}, - {"enable_opencl_throttling","False"}, {"enable_qdq_optimizer","True"} }, + {"enable_opencl_throttling","False"}, {"enable_qdq_optimizer","True"} }); + try_configs.push_back( { {"device_type","GPU.1"}, {"precision",precision}, {"num_of_threads",numberOfThreads}, {"num_streams",numberOfStreams}, - {"enable_opencl_throttling","False"}, {"enable_qdq_optimizer","True"} }, + {"enable_opencl_throttling","False"}, 
{"enable_qdq_optimizer","True"} }); + try_configs.push_back( { {"device_type","AUTO:GPU,CPU"}, {"precision",precision}, {"num_of_threads",numberOfThreads}, {"num_streams",numberOfStreams}, - {"enable_opencl_throttling","False"}, {"enable_qdq_optimizer","True"} } - }; + {"enable_opencl_throttling","False"}, {"enable_qdq_optimizer","True"} }); for (const auto& config : try_configs) { try {