Improve ANSCV
This commit is contained in:
@@ -1,7 +0,0 @@
|
|||||||
{
|
|
||||||
"permissions": {
|
|
||||||
"allow": [
|
|
||||||
"Bash(cmake -B cmake-build-release -S .)"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|||||||
@@ -332,8 +332,28 @@ void CVideoDecoder::uninit()
|
|||||||
{
|
{
|
||||||
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
||||||
|
|
||||||
|
// [MEDIA_DecClose] heartbeat — paired with [MEDIA_DecInit] for leak diagnosis.
|
||||||
|
// Pair count over a long run reveals whether avcodec_open2 calls are
|
||||||
|
// matched by full teardowns. If close-count < init-count, the FFmpeg
|
||||||
|
// codec context (and its custom get_buffer2 arena) is leaking per reopen.
|
||||||
|
{
|
||||||
|
static std::atomic<uint64_t> s_closeCount{0};
|
||||||
|
const uint64_t n = s_closeCount.fetch_add(1) + 1;
|
||||||
|
ANS_DBG("MEDIA_DecClose",
|
||||||
|
"uninit ENTRY #%llu inited=%d codec=%s %dx%d hwEnabled=%d cudaHW=%d gpu=%d (this=%p)",
|
||||||
|
(unsigned long long)n,
|
||||||
|
(int)m_bInited,
|
||||||
|
(m_pCodec && m_pCodec->name) ? m_pCodec->name : "?",
|
||||||
|
m_pContext ? m_pContext->width : 0,
|
||||||
|
m_pContext ? m_pContext->height : 0,
|
||||||
|
(int)m_bHardwareDecoderEnabled,
|
||||||
|
(int)m_bCudaHWAccel,
|
||||||
|
m_hwGpuIndex,
|
||||||
|
(void*)this);
|
||||||
|
}
|
||||||
|
|
||||||
// Stop processing first
|
// Stop processing first
|
||||||
// Backup first
|
// Backup first
|
||||||
BOOL wasRunning = m_bRunning;
|
BOOL wasRunning = m_bRunning;
|
||||||
m_bRunning = FALSE;
|
m_bRunning = FALSE;
|
||||||
|
|
||||||
|
|||||||
@@ -6,6 +6,19 @@
|
|||||||
#include "TRTCompat.h"
|
#include "TRTCompat.h"
|
||||||
#include "ANSLicense.h" // ANS_DBG macro for DebugView logging
|
#include "ANSLicense.h" // ANS_DBG macro for DebugView logging
|
||||||
|
|
||||||
|
#ifdef _WIN32
|
||||||
|
# ifndef WIN32_LEAN_AND_MEAN
|
||||||
|
# define WIN32_LEAN_AND_MEAN
|
||||||
|
# endif
|
||||||
|
# ifndef NOMINMAX
|
||||||
|
# define NOMINMAX
|
||||||
|
# endif
|
||||||
|
# include <windows.h>
|
||||||
|
# include <psapi.h>
|
||||||
|
# include <tlhelp32.h>
|
||||||
|
# pragma comment(lib, "psapi.lib")
|
||||||
|
#endif
|
||||||
|
|
||||||
// Per-device mutex for CUDA graph capture.
|
// Per-device mutex for CUDA graph capture.
|
||||||
// TRT's enqueueV3 uses shared internal resources (workspace, memory pools)
|
// TRT's enqueueV3 uses shared internal resources (workspace, memory pools)
|
||||||
// at the CUDA context level. When two Engine instances on the same GPU
|
// at the CUDA context level. When two Engine instances on the same GPU
|
||||||
@@ -398,6 +411,56 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
|
|||||||
const int64_t myInfNum = s_globalInfCount.fetch_add(1) + 1;
|
const int64_t myInfNum = s_globalInfCount.fetch_add(1) + 1;
|
||||||
s_globalActiveInf.fetch_add(1);
|
s_globalActiveInf.fetch_add(1);
|
||||||
|
|
||||||
|
// ── Process-wide host-RAM heartbeat (once per ~60s) ──────────────────────
|
||||||
|
// Diagnostic for long-run leak hunts: if [PROC_MEM] privateMB climbs while
|
||||||
|
// [TRT_SM100] VRAM stays flat, the leak is on the host side (FFmpeg
|
||||||
|
// contexts, RTSP threads, GDI objects). Cheap when not firing — single
|
||||||
|
// atomic load, one steady_clock read and one compare in the hot path.
|
||||||
|
#ifdef _WIN32
|
||||||
|
{
|
||||||
|
using clk = std::chrono::steady_clock;
|
||||||
|
static std::atomic<int64_t> s_hbLastNs{0};
|
||||||
|
const int64_t nowNs = clk::now().time_since_epoch().count();
|
||||||
|
int64_t prev = s_hbLastNs.load(std::memory_order_relaxed);
|
||||||
|
constexpr int64_t kIntervalNs = 60LL * 1'000'000'000LL;
|
||||||
|
if (nowNs - prev >= kIntervalNs &&
|
||||||
|
s_hbLastNs.compare_exchange_strong(prev, nowNs,
|
||||||
|
std::memory_order_relaxed)) {
|
||||||
|
PROCESS_MEMORY_COUNTERS_EX pmc{};
|
||||||
|
pmc.cb = sizeof(pmc);
|
||||||
|
GetProcessMemoryInfo(GetCurrentProcess(),
|
||||||
|
reinterpret_cast<PROCESS_MEMORY_COUNTERS*>(&pmc),
|
||||||
|
sizeof(pmc));
|
||||||
|
DWORD gdi = GetGuiResources(GetCurrentProcess(), GR_GDIOBJECTS);
|
||||||
|
DWORD usr = GetGuiResources(GetCurrentProcess(), GR_USEROBJECTS);
|
||||||
|
|
||||||
|
// Thread count via Toolhelp snapshot (filter to current PID).
|
||||||
|
DWORD threads = 0;
|
||||||
|
HANDLE snap = CreateToolhelp32Snapshot(TH32CS_SNAPTHREAD, 0);
|
||||||
|
if (snap != INVALID_HANDLE_VALUE) {
|
||||||
|
THREADENTRY32 te{ sizeof(te) };
|
||||||
|
const DWORD pid = GetCurrentProcessId();
|
||||||
|
if (Thread32First(snap, &te)) {
|
||||||
|
do {
|
||||||
|
if (te.th32OwnerProcessID == pid) ++threads;
|
||||||
|
} while (Thread32Next(snap, &te));
|
||||||
|
}
|
||||||
|
CloseHandle(snap);
|
||||||
|
}
|
||||||
|
|
||||||
|
ANS_DBG("PROC_MEM",
|
||||||
|
"privateMB=%llu workingMB=%llu peakWorkingMB=%llu "
|
||||||
|
"pagefileMB=%llu gdi=%lu user=%lu threads=%lu",
|
||||||
|
(unsigned long long)(pmc.PrivateUsage >> 20),
|
||||||
|
(unsigned long long)(pmc.WorkingSetSize >> 20),
|
||||||
|
(unsigned long long)(pmc.PeakWorkingSetSize >> 20),
|
||||||
|
(unsigned long long)(pmc.PagefileUsage >> 20),
|
||||||
|
(unsigned long)gdi, (unsigned long)usr,
|
||||||
|
(unsigned long)threads);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
// Per-thread tracking
|
// Per-thread tracking
|
||||||
{
|
{
|
||||||
static thread_local int64_t s_infCount = 0;
|
static thread_local int64_t s_infCount = 0;
|
||||||
@@ -935,15 +998,29 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
|
|||||||
}
|
}
|
||||||
|
|
||||||
// ============================================================================
|
// ============================================================================
|
||||||
// Per-inference total timing breakdown (mutex wait + preprocess + GPU)
|
// Slow-inference alarm — ONE-SIDED FILTER, NOT A DISTRIBUTION
|
||||||
// ============================================================================
|
// ============================================================================
|
||||||
|
// This emits a DebugView line ONLY when a single inference's total wall
|
||||||
|
// time (mutex-wait + GPU execution) exceeds 100 ms. Fast calls are silent.
|
||||||
|
//
|
||||||
|
// Consequence: if you aggregate `[TRT_Slow]` lines and compute an average,
|
||||||
|
// you get the mean of the slow *tail*, NOT the real average inference
|
||||||
|
// time. Expect this avg to look dramatic (~200–400 ms) because by design
|
||||||
|
// every sample here is already slow. A typical inference on a healthy
|
||||||
|
// system fires this line for ~1–3% of calls; >10% indicates a problem.
|
||||||
|
//
|
||||||
|
// For the true per-inference distribution, look at `[TRT_SM100] #N ...
|
||||||
|
// avgMs=... maxMs=...` (running-average, emitted every 50 inferences).
|
||||||
|
// The tag was previously `[TRT_Timing]` which misled readers into
|
||||||
|
// interpreting the avg as overall pipeline latency.
|
||||||
{
|
{
|
||||||
double totalMs = std::chrono::duration<double, std::milli>(
|
double totalMs = std::chrono::duration<double, std::milli>(
|
||||||
std::chrono::steady_clock::now() - _mutexWaitStart).count();
|
std::chrono::steady_clock::now() - _mutexWaitStart).count();
|
||||||
double gpuMs = totalMs - _mutexWaitMs; // Everything after mutex acquired
|
double gpuMs = totalMs - _mutexWaitMs; // Everything after mutex acquired
|
||||||
// Log every inference that takes >100ms total (including mutex wait)
|
|
||||||
if (totalMs > 100.0) {
|
if (totalMs > 100.0) {
|
||||||
ANS_DBG("TRT_Timing", "total=%.1fms (mutex=%.1fms gpu=%.1fms) batch=%d active=%d",
|
ANS_DBG("TRT_Slow",
|
||||||
|
"SLOW inference total=%.1fms (mutex=%.1fms gpu=%.1fms) batch=%d active=%d "
|
||||||
|
"(this filter only fires for calls >100ms)",
|
||||||
totalMs, _mutexWaitMs, gpuMs, batchSize, s_globalActiveInf.load());
|
totalMs, _mutexWaitMs, gpuMs, batchSize, s_globalActiveInf.load());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -2,6 +2,7 @@
|
|||||||
#include "ANSMatRegistry.h"
|
#include "ANSMatRegistry.h"
|
||||||
#include "ANSGpuFrameOps.h"
|
#include "ANSGpuFrameOps.h"
|
||||||
#include "ANSCVVendorGate.h" // anscv_vendor_gate::IsNvidiaGpuAvailable()
|
#include "ANSCVVendorGate.h" // anscv_vendor_gate::IsNvidiaGpuAvailable()
|
||||||
|
#include "ANSLicense.h" // ANS_DBG macro
|
||||||
#include <memory>
|
#include <memory>
|
||||||
#include <cstdint>
|
#include <cstdint>
|
||||||
#include "media_codec.h"
|
#include "media_codec.h"
|
||||||
@@ -251,6 +252,23 @@ namespace ANSCENTER {
|
|||||||
return _pLastFrame; // Shallow copy (fast)
|
return _pLastFrame; // Shallow copy (fast)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Early stale-out: if the decoder hasn't produced a frame in 5s the
|
||||||
|
// source is dead. Skip _playerClient->getImage() entirely and return
|
||||||
|
// the cached frame with unchanged _pts so LabVIEW sees STALE PTS one
|
||||||
|
// poll earlier and triggers reconnect.
|
||||||
|
if (!_pLastFrame.empty()) {
|
||||||
|
double ageMs = _playerClient->getLastFrameAgeMs();
|
||||||
|
if (ageMs >= 5000.0) {
|
||||||
|
ANS_DBG("FLV_GetImage",
|
||||||
|
"EARLY STALE: ageMs=%.1f pts=%lld url=%s — skipping getImage()",
|
||||||
|
ageMs, (long long)_pts, _url.c_str());
|
||||||
|
width = _imageWidth;
|
||||||
|
height = _imageHeight;
|
||||||
|
pts = _pts;
|
||||||
|
return _pLastFrame;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
int imageW = 0, imageH = 0;
|
int imageW = 0, imageH = 0;
|
||||||
int64_t currentPts = 0;
|
int64_t currentPts = 0;
|
||||||
|
|
||||||
|
|||||||
@@ -2,6 +2,7 @@
|
|||||||
#include "ANSMatRegistry.h"
|
#include "ANSMatRegistry.h"
|
||||||
#include "ANSGpuFrameOps.h"
|
#include "ANSGpuFrameOps.h"
|
||||||
#include "ANSCVVendorGate.h" // anscv_vendor_gate::IsNvidiaGpuAvailable()
|
#include "ANSCVVendorGate.h" // anscv_vendor_gate::IsNvidiaGpuAvailable()
|
||||||
|
#include "ANSLicense.h" // ANS_DBG macro
|
||||||
#include <memory>
|
#include <memory>
|
||||||
#include <cstdint>
|
#include <cstdint>
|
||||||
#include "media_codec.h"
|
#include "media_codec.h"
|
||||||
@@ -239,6 +240,23 @@ namespace ANSCENTER {
|
|||||||
return _pLastFrame; // Shallow copy (fast)
|
return _pLastFrame; // Shallow copy (fast)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Early stale-out: if the decoder hasn't produced a frame in 5s the
|
||||||
|
// source is dead. Skip _playerClient->getImage() entirely and return
|
||||||
|
// the cached frame with unchanged _pts so LabVIEW sees STALE PTS one
|
||||||
|
// poll earlier and triggers reconnect.
|
||||||
|
if (!_pLastFrame.empty()) {
|
||||||
|
double ageMs = _playerClient->getLastFrameAgeMs();
|
||||||
|
if (ageMs >= 5000.0) {
|
||||||
|
ANS_DBG("MJPEG_GetImage",
|
||||||
|
"EARLY STALE: ageMs=%.1f pts=%lld url=%s — skipping getImage()",
|
||||||
|
ageMs, (long long)_pts, _url.c_str());
|
||||||
|
width = _imageWidth;
|
||||||
|
height = _imageHeight;
|
||||||
|
pts = _pts;
|
||||||
|
return _pLastFrame;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
int imageW = 0, imageH = 0;
|
int imageW = 0, imageH = 0;
|
||||||
int64_t currentPts = 0;
|
int64_t currentPts = 0;
|
||||||
|
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
@@ -155,7 +155,9 @@ namespace ANSCENTER
|
|||||||
std::recursive_mutex _mutex;
|
std::recursive_mutex _mutex;
|
||||||
|
|
||||||
//std::once_flag licenseOnceFlag; // For one-time license check
|
//std::once_flag licenseOnceFlag; // For one-time license check
|
||||||
bool _licenseValid = false;
|
// Atomic so lock-free methods (ImageResize, ImageResizeWithRatio,
|
||||||
|
// MatToBinaryData, EncodeJpegString) can read it without _mutex.
|
||||||
|
std::atomic<bool> _licenseValid{ false };
|
||||||
public:
|
public:
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -2,6 +2,7 @@
|
|||||||
#include "ANSMatRegistry.h"
|
#include "ANSMatRegistry.h"
|
||||||
#include "ANSGpuFrameOps.h"
|
#include "ANSGpuFrameOps.h"
|
||||||
#include "ANSCVVendorGate.h" // anscv_vendor_gate::IsNvidiaGpuAvailable()
|
#include "ANSCVVendorGate.h" // anscv_vendor_gate::IsNvidiaGpuAvailable()
|
||||||
|
#include "ANSLicense.h" // ANS_DBG macro
|
||||||
#include <memory>
|
#include <memory>
|
||||||
#include "media_codec.h"
|
#include "media_codec.h"
|
||||||
#include <cstdint>
|
#include <cstdint>
|
||||||
@@ -245,6 +246,23 @@ namespace ANSCENTER {
|
|||||||
return _pLastFrame; // Shallow copy (fast)
|
return _pLastFrame; // Shallow copy (fast)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Early stale-out: if the decoder hasn't produced a frame in 5s the
|
||||||
|
// source is dead. Skip _playerClient->getImage() entirely and return
|
||||||
|
// the cached frame with unchanged _pts so LabVIEW sees STALE PTS one
|
||||||
|
// poll earlier and triggers reconnect.
|
||||||
|
if (!_pLastFrame.empty()) {
|
||||||
|
double ageMs = _playerClient->getLastFrameAgeMs();
|
||||||
|
if (ageMs >= 5000.0) {
|
||||||
|
ANS_DBG("RTMP_GetImage",
|
||||||
|
"EARLY STALE: ageMs=%.1f pts=%lld url=%s — skipping getImage()",
|
||||||
|
ageMs, (long long)_pts, _url.c_str());
|
||||||
|
width = _imageWidth;
|
||||||
|
height = _imageHeight;
|
||||||
|
pts = _pts;
|
||||||
|
return _pLastFrame;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
int imageW = 0, imageH = 0;
|
int imageW = 0, imageH = 0;
|
||||||
int64_t currentPts = 0;
|
int64_t currentPts = 0;
|
||||||
|
|
||||||
|
|||||||
@@ -2,6 +2,7 @@
|
|||||||
#include "ANSMatRegistry.h"
|
#include "ANSMatRegistry.h"
|
||||||
#include "ANSGpuFrameOps.h"
|
#include "ANSGpuFrameOps.h"
|
||||||
#include "ANSCVVendorGate.h" // anscv_vendor_gate::IsNvidiaGpuAvailable()
|
#include "ANSCVVendorGate.h" // anscv_vendor_gate::IsNvidiaGpuAvailable()
|
||||||
|
#include "ANSLicense.h" // ANS_DBG macro
|
||||||
#include <memory>
|
#include <memory>
|
||||||
#include "media_codec.h"
|
#include "media_codec.h"
|
||||||
#include <cstdint>
|
#include <cstdint>
|
||||||
@@ -253,6 +254,23 @@ namespace ANSCENTER {
|
|||||||
return _pLastFrame; // Shallow copy (fast)
|
return _pLastFrame; // Shallow copy (fast)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Early stale-out: if the decoder hasn't produced a frame in 5s the
|
||||||
|
// source is dead. Skip _playerClient->getImage() entirely and return
|
||||||
|
// the cached frame with unchanged _pts so LabVIEW sees STALE PTS one
|
||||||
|
// poll earlier and triggers reconnect.
|
||||||
|
if (!_pLastFrame.empty()) {
|
||||||
|
double ageMs = _playerClient->getLastFrameAgeMs();
|
||||||
|
if (ageMs >= 5000.0) {
|
||||||
|
ANS_DBG("SRT_GetImage",
|
||||||
|
"EARLY STALE: ageMs=%.1f pts=%lld url=%s — skipping getImage()",
|
||||||
|
ageMs, (long long)_pts, _url.c_str());
|
||||||
|
width = _imageWidth;
|
||||||
|
height = _imageHeight;
|
||||||
|
pts = _pts;
|
||||||
|
return _pLastFrame;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
int imageW = 0, imageH = 0;
|
int imageW = 0, imageH = 0;
|
||||||
int64_t currentPts = 0;
|
int64_t currentPts = 0;
|
||||||
|
|
||||||
|
|||||||
@@ -91,9 +91,14 @@ namespace ANSCENTER {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (!m_trtEngine) {
|
if (!m_trtEngine) {
|
||||||
// Enable batch support
|
// Enable batch support. maxBatchSize controls the TRT workspace
|
||||||
m_options.optBatchSize = 8;
|
// allocation (~linear in batch); opt is the kernel-selection sweet
|
||||||
m_options.maxBatchSize = 32;
|
// spot. Max=4 was picked to fit 4 concurrent face crops per frame
|
||||||
|
// comfortably on 8 GB GPUs while freeing ~1.5 GB VRAM vs max=32
|
||||||
|
// — most scenes have ≤4 faces visible, so throughput cost is
|
||||||
|
// near-zero (amortized per-face latency drops too at lower batch).
|
||||||
|
m_options.optBatchSize = 4;
|
||||||
|
m_options.maxBatchSize = 4;
|
||||||
|
|
||||||
m_options.maxInputHeight = GPU_FACE_HEIGHT;
|
m_options.maxInputHeight = GPU_FACE_HEIGHT;
|
||||||
m_options.minInputHeight = GPU_FACE_HEIGHT;
|
m_options.minInputHeight = GPU_FACE_HEIGHT;
|
||||||
|
|||||||
@@ -534,8 +534,12 @@ namespace ANSCENTER {
|
|||||||
|
|
||||||
_ocrModelConfig.inpHeight = 640;
|
_ocrModelConfig.inpHeight = 640;
|
||||||
_ocrModelConfig.inpWidth = 640;
|
_ocrModelConfig.inpWidth = 640;
|
||||||
_ocrModelConfig.gpuOptBatchSize = 8;
|
// Max=4 chosen to fit typical plate counts per frame on 8 GB GPUs.
|
||||||
_ocrModelConfig.gpuMaxBatchSize = 32; // desired max; engine builder auto-caps by GPU VRAM
|
// Was opt=8/max=32 which sized TRT workspace for 32 concurrent plates
|
||||||
|
// (~1 GB for this model alone). Cap of 4 is still >= the usual 1–3
|
||||||
|
// plates visible per camera frame, amortized throughput unchanged.
|
||||||
|
_ocrModelConfig.gpuOptBatchSize = 4;
|
||||||
|
_ocrModelConfig.gpuMaxBatchSize = 4; // desired max; engine builder auto-caps by GPU VRAM
|
||||||
_ocrModelConfig.maxInputHeight = 640;
|
_ocrModelConfig.maxInputHeight = 640;
|
||||||
_ocrModelConfig.maxInputWidth = 640;
|
_ocrModelConfig.maxInputWidth = 640;
|
||||||
_ocrModelConfig.minInputHeight = 640;
|
_ocrModelConfig.minInputHeight = 640;
|
||||||
@@ -545,8 +549,9 @@ namespace ANSCENTER {
|
|||||||
|
|
||||||
_lpColourModelConfig.inpHeight = 224;
|
_lpColourModelConfig.inpHeight = 224;
|
||||||
_lpColourModelConfig.inpWidth = 224;
|
_lpColourModelConfig.inpWidth = 224;
|
||||||
_lpColourModelConfig.gpuOptBatchSize = 8;
|
// See _ocrModelConfig above — matching batch cap for consistency.
|
||||||
_lpColourModelConfig.gpuMaxBatchSize = 32; // desired max; engine builder auto-caps by GPU VRAM
|
_lpColourModelConfig.gpuOptBatchSize = 4;
|
||||||
|
_lpColourModelConfig.gpuMaxBatchSize = 4; // desired max; engine builder auto-caps by GPU VRAM
|
||||||
_lpColourModelConfig.maxInputHeight = 224;
|
_lpColourModelConfig.maxInputHeight = 224;
|
||||||
_lpColourModelConfig.maxInputWidth = 224;
|
_lpColourModelConfig.maxInputWidth = 224;
|
||||||
_lpColourModelConfig.minInputHeight = 224;
|
_lpColourModelConfig.minInputHeight = 224;
|
||||||
|
|||||||
@@ -28,8 +28,11 @@ bool RTOCRRecognizer::Initialize(const std::string& onnxPath, const std::string&
|
|||||||
ANSCENTER::Options options;
|
ANSCENTER::Options options;
|
||||||
options.deviceIndex = gpuId;
|
options.deviceIndex = gpuId;
|
||||||
options.precision = ANSCENTER::Precision::FP16;
|
options.precision = ANSCENTER::Precision::FP16;
|
||||||
options.maxBatchSize = 1;
|
// maxBatch=4 matches FaceRecognizer / ALPR configuration — allows the
|
||||||
options.optBatchSize = 1;
|
// recognizer to process up to 4 detected text lines in one call,
|
||||||
|
// amortizing per-invocation overhead while keeping TRT workspace small.
|
||||||
|
options.maxBatchSize = 4;
|
||||||
|
options.optBatchSize = 4;
|
||||||
|
|
||||||
// Fixed height, dynamic width for recognition
|
// Fixed height, dynamic width for recognition
|
||||||
options.minInputHeight = imgH_;
|
options.minInputHeight = imgH_;
|
||||||
|
|||||||
@@ -185,11 +185,22 @@ extern "C" ANSOCR_API int CreateANSOCRHandleEx(ANSCENTER::ANSOCRBase** Handle,
|
|||||||
ANSCENTER::ANSLibsLoader::Initialize();
|
ANSCENTER::ANSLibsLoader::Initialize();
|
||||||
ANSCENTER::EngineType engineType = ANSCENTER::ANSLicenseHelper::CheckHardwareInformation();
|
ANSCENTER::EngineType engineType = ANSCENTER::ANSLicenseHelper::CheckHardwareInformation();
|
||||||
{
|
{
|
||||||
|
// Describe the backend the engine-selector below will actually choose
|
||||||
|
// for this (hardware, engineMode) combination. Previous versions of
|
||||||
|
// this log claimed "TensorRT OCR enabled" based on hardware alone,
|
||||||
|
// which was misleading because engineMode=0 (auto) unconditionally
|
||||||
|
// picked ONNX — users saw the log and assumed TRT was running.
|
||||||
|
const bool isNvidia = (engineType == ANSCENTER::EngineType::NVIDIA_GPU);
|
||||||
|
const bool willUseTRT =
|
||||||
|
isNvidia && (engineMode == 0 /* auto → TRT on NVIDIA */ ||
|
||||||
|
engineMode == 1 /* GPU → TRT on NVIDIA */);
|
||||||
const char* vendorTag =
|
const char* vendorTag =
|
||||||
engineType == ANSCENTER::EngineType::NVIDIA_GPU ? "NVIDIA_GPU (TensorRT OCR enabled)" :
|
engineType == ANSCENTER::EngineType::NVIDIA_GPU
|
||||||
engineType == ANSCENTER::EngineType::AMD_GPU ? "AMD_GPU (ONNX Runtime / DirectML, TensorRT OCR DISABLED)" :
|
? (willUseTRT ? "NVIDIA_GPU (TensorRT OCR active)"
|
||||||
engineType == ANSCENTER::EngineType::OPENVINO_GPU ? "OPENVINO_GPU (ONNX Runtime / OpenVINO, TensorRT OCR DISABLED)" :
|
: "NVIDIA_GPU (TensorRT available, but engineMode forces ONNX)")
|
||||||
"CPU (ONNX Runtime, TensorRT OCR DISABLED)";
|
: engineType == ANSCENTER::EngineType::AMD_GPU ? "AMD_GPU (ONNX Runtime / DirectML, TensorRT OCR unavailable)"
|
||||||
|
: engineType == ANSCENTER::EngineType::OPENVINO_GPU ? "OPENVINO_GPU (ONNX Runtime / OpenVINO, TensorRT OCR unavailable)"
|
||||||
|
: "CPU (ONNX Runtime, TensorRT OCR unavailable)";
|
||||||
char buf[192];
|
char buf[192];
|
||||||
snprintf(buf, sizeof(buf),
|
snprintf(buf, sizeof(buf),
|
||||||
"[ANSOCR] CreateANSOCRHandleEx: detected engineType=%d [%s], engineMode=%d\n",
|
"[ANSOCR] CreateANSOCRHandleEx: detected engineType=%d [%s], engineMode=%d\n",
|
||||||
@@ -230,10 +241,23 @@ extern "C" ANSOCR_API int CreateANSOCRHandleEx(ANSCENTER::ANSOCRBase** Handle,
|
|||||||
// select, including DirectML for AMD).
|
// select, including DirectML for AMD).
|
||||||
const bool isNvidia = (engineType == ANSCENTER::EngineType::NVIDIA_GPU);
|
const bool isNvidia = (engineType == ANSCENTER::EngineType::NVIDIA_GPU);
|
||||||
switch (engineMode) {
|
switch (engineMode) {
|
||||||
case 0:// Auto-detect, always use ONNX for better compatibility, especially on AMD GPUs and high-res images
|
case 0: // Auto-detect — prefer TensorRT on NVIDIA, ONNX elsewhere.
|
||||||
(*Handle) = new ANSCENTER::ANSONNXOCR();
|
// Previous policy was "always ONNX" for cross-platform safety,
|
||||||
|
// but on NVIDIA that defeated the point: each ANSONNXOCR handle
|
||||||
|
// allocates its own cls/dec/rec OrtSessions (no dedupe), which
|
||||||
|
// wasted ~300–600 MB VRAM per extra instance and ran ~2× slower
|
||||||
|
// than ANSRTOCR's shared-engine path via EnginePoolManager.
|
||||||
|
if (isNvidia) {
|
||||||
|
limitSideLen = 960;
|
||||||
|
(*Handle) = new ANSCENTER::ANSRTOCR();
|
||||||
|
} else {
|
||||||
|
// AMD / Intel / CPU — ANSRTOCR hard-requires CUDA and would
|
||||||
|
// crash. ANSONNXOCR auto-picks the correct ORT EP
|
||||||
|
// (DirectML on AMD, OpenVINO on Intel, CPU otherwise).
|
||||||
|
(*Handle) = new ANSCENTER::ANSONNXOCR();
|
||||||
|
}
|
||||||
break;
|
break;
|
||||||
case 1:// GPU — use TensorRT engine ONLY on NVIDIA hardware.
|
case 1: // GPU — use TensorRT engine ONLY on NVIDIA hardware.
|
||||||
if (isNvidia) {
|
if (isNvidia) {
|
||||||
limitSideLen = 960;
|
limitSideLen = 960;
|
||||||
(*Handle) = new ANSCENTER::ANSRTOCR();
|
(*Handle) = new ANSCENTER::ANSRTOCR();
|
||||||
@@ -244,7 +268,7 @@ extern "C" ANSOCR_API int CreateANSOCRHandleEx(ANSCENTER::ANSOCRBase** Handle,
|
|||||||
(*Handle) = new ANSCENTER::ANSONNXOCR();
|
(*Handle) = new ANSCENTER::ANSONNXOCR();
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case 2:// CPU
|
case 2: // CPU
|
||||||
(*Handle) = new ANSCENTER::ANSONNXOCR();
|
(*Handle) = new ANSCENTER::ANSONNXOCR();
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
|
|||||||
@@ -426,27 +426,37 @@ extern "C" ANSODENGINE_API std::string CreateANSODHandle(ANSCENTER::ANSODBase**
|
|||||||
ANSCENTER::EngineType engineType = ANSCENTER::ANSLicenseHelper::CheckHardwareInformation();
|
ANSCENTER::EngineType engineType = ANSCENTER::ANSLicenseHelper::CheckHardwareInformation();
|
||||||
if (autoDetectEngine==-1)engineType=ANSCENTER::EngineType::CPU;// We force to use CPU
|
if (autoDetectEngine==-1)engineType=ANSCENTER::EngineType::CPU;// We force to use CPU
|
||||||
|
|
||||||
//Force modelType to ANSONNXYOLO and ANSRTYOLO if detectionType is detection and modelType is TENSORRT or ONNX
|
// Route detection / pose / segmentation / OBB / classification to the best
|
||||||
|
// available backend: prefer TensorRT on NVIDIA, otherwise the matching ONNX
|
||||||
if ((modelType == 4) || // TensorRT
|
// handler. Unlisted modelType values are left untouched for the switch below.
|
||||||
(modelType == 14)|| // TensorRT Yolov10
|
// See CreateANSODHandleEx for the full rationale — three correctness bugs
|
||||||
(modelType == 22)|| // TensorRT Pose
|
// were fixed in that dispatcher and must be kept in sync across copies.
|
||||||
(modelType == 24)) // TensorRT Segmentation
|
const bool onNvidia = (engineType == ANSCENTER::EngineType::NVIDIA_GPU);
|
||||||
{
|
switch (modelType) {
|
||||||
if (engineType == ANSCENTER::EngineType::NVIDIA_GPU) modelType = 31; // RTYOLO
|
// ── Detection family: YOLOv8 / V11 / V12 / generic TRT / V10-RTOD ──
|
||||||
else modelType=30;// ONNXYOLO
|
case 3: // YOLOV8 / YOLOV11
|
||||||
}
|
case 4: // generic TensorRT
|
||||||
else if ((modelType == 3) || // YoloV8/YoloV11 (Object Detection)
|
case 14: // YOLOv10RTOD (TRT end-to-end NMS)
|
||||||
(modelType == 17)|| // YOLO V12
|
case 17: // YOLOV12
|
||||||
(modelType == 20) || // ONNX Classification
|
modelType = onNvidia ? 31 /* RTYOLO */ : 30 /* ONNXYOLO */;
|
||||||
(modelType == 21) || // ONNX Pose
|
break;
|
||||||
(modelType == 23) || // ONNX Segmentation
|
// ── Pose ─────────────────────────────────────────────────────────────
|
||||||
(modelType == 25)) // OBB Segmentation
|
case 21: // ONNXPOSE
|
||||||
{
|
case 22: // RTPOSE
|
||||||
modelType = 30; // ONNXYOLO
|
modelType = onNvidia ? 22 /* RTPOSE */ : 21 /* ONNXPOSE */;
|
||||||
}
|
break;
|
||||||
else {
|
// ── Segmentation ─────────────────────────────────────────────────────
|
||||||
// do nothing, use the modelType specified by user
|
case 23: // ONNXSEG
|
||||||
|
case 24: // RTSEG
|
||||||
|
modelType = onNvidia ? 24 /* RTSEG */ : 23 /* ONNXSEG */;
|
||||||
|
break;
|
||||||
|
// ── OBB / Classification (ONNX-only today — leave as-is) ─────────────
|
||||||
|
case 20: // ONNXCL
|
||||||
|
case 25: // ONNXOBB
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
// Any other modelType is handled directly by the switch below.
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
switch (detectionType) {
|
switch (detectionType) {
|
||||||
@@ -764,27 +774,53 @@ extern "C" ANSODENGINE_API int CreateANSODHandleEx(ANSCENTER::ANSODBase** Handl
|
|||||||
ANSCENTER::EngineType engineType = ANSCENTER::ANSLicenseHelper::CheckHardwareInformation();
|
ANSCENTER::EngineType engineType = ANSCENTER::ANSLicenseHelper::CheckHardwareInformation();
|
||||||
if (autoDetectEngine==-1)engineType=ANSCENTER::EngineType::CPU;// We force to use CPU
|
if (autoDetectEngine==-1)engineType=ANSCENTER::EngineType::CPU;// We force to use CPU
|
||||||
|
|
||||||
//Force modelType to ANSONNXYOLO and ANSRTYOLO if detectionType is detection and modelType is TENSORRT or ONNX
|
// Route detection / pose / segmentation / OBB / classification to the best
|
||||||
|
// available backend: prefer TensorRT on NVIDIA, otherwise the matching ONNX
|
||||||
if ((modelType == 4) || // TensorRT
|
// handler. Unlisted modelType values are left untouched for the switch below.
|
||||||
(modelType == 14)|| // TensorRT Yolov10
|
//
|
||||||
(modelType == 22)|| // TensorRT Pose
|
// Previous revisions of this block had three correctness bugs:
|
||||||
(modelType == 24)) // TensorRT Segmentation
|
// (1) modelType == 3 / 17 (YoloV8/V11/V12 detection) was hard-wired to
|
||||||
{
|
// ONNXYOLO even on NVIDIA — bypassing the TensorRT path entirely and
|
||||||
if (engineType == ANSCENTER::EngineType::NVIDIA_GPU) modelType = 31; // RTYOLO
|
// duplicating VRAM when multiple handles loaded the same .onnx (ORT
|
||||||
else modelType=30;// ONNXYOLO
|
// has no EnginePoolManager dedupe).
|
||||||
}
|
// (2) modelType == 20 / 21 / 23 / 25 (ONNX CLS / POSE / SEG / OBB) was
|
||||||
else if ((modelType == 3) || // YoloV8/YoloV11 (Object Detection)
|
// rewritten to 30 (ONNXYOLO = detection), making the dedicated
|
||||||
(modelType == 17)|| // YOLO V12
|
// case 20 / 21 / 23 / 25 handlers unreachable dead code. A user
|
||||||
(modelType == 20) || // ONNX Classification
|
// passing modelType=20 for classification ended up with a YOLO head.
|
||||||
(modelType == 21) || // ONNX Pose
|
// (3) modelType == 22 / 24 (TRT pose / TRT seg) on a non-NVIDIA box fell
|
||||||
(modelType == 23) || // ONNX Segmentation
|
// back to ONNXYOLO instead of the correct ONNXPOSE / ONNXSEG handler.
|
||||||
(modelType == 25)) // OBB Segmentation
|
const bool onNvidia = (engineType == ANSCENTER::EngineType::NVIDIA_GPU);
|
||||||
{
|
switch (modelType) {
|
||||||
modelType = 30; // ONNXYOLO
|
// ── Detection family: YOLOv8 / V11 / V12 / generic TRT / V10-RTOD ──
|
||||||
}
|
case 3: // YOLOV8 / YOLOV11
|
||||||
else {
|
case 4: // generic TensorRT
|
||||||
// do nothing, use the modelType specified by user
|
case 14: // YOLOv10RTOD (TRT end-to-end NMS)
|
||||||
|
case 17: // YOLOV12
|
||||||
|
modelType = onNvidia ? 31 /* RTYOLO */ : 30 /* ONNXYOLO */;
|
||||||
|
break;
|
||||||
|
// ── Pose ─────────────────────────────────────────────────────────────
|
||||||
|
case 21: // ONNXPOSE
|
||||||
|
case 22: // RTPOSE
|
||||||
|
modelType = onNvidia ? 22 /* RTPOSE */ : 21 /* ONNXPOSE */;
|
||||||
|
break;
|
||||||
|
// ── Segmentation ─────────────────────────────────────────────────────
|
||||||
|
case 23: // ONNXSEG
|
||||||
|
case 24: // RTSEG
|
||||||
|
modelType = onNvidia ? 24 /* RTSEG */ : 23 /* ONNXSEG */;
|
||||||
|
break;
|
||||||
|
// ── Oriented Bounding Box (ONNX-only today) ──────────────────────────
|
||||||
|
case 25: // ONNXOBB — no TRT variant; leave as-is
|
||||||
|
break;
|
||||||
|
// ── Classification (ONNX-only in this dispatcher) ────────────────────
|
||||||
|
case 20: // ONNXCL — no TRT variant; leave as-is
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
// Any other modelType is handled directly by the switch below
|
||||||
|
// (TENSORFLOW, YOLOV4, YOLOV5, FACEDETECT, FACERECOGNIZE, ALPR,
|
||||||
|
// OCR, ANOMALIB, POSE, SAM, ODHUBMODEL, CUSTOMDETECTOR, CUSTOMPY,
|
||||||
|
// MOTIONDETECTOR, MOVIENET, ONNXSAM3, RTSAM3, ONNXYOLO=30,
|
||||||
|
// RTYOLO=31). Do nothing — keep user's value.
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
// returnModelType will be set after the switch to reflect the actual
|
// returnModelType will be set after the switch to reflect the actual
|
||||||
// model class that was instantiated (e.g. RTYOLO→ONNXYOLO on AMD).
|
// model class that was instantiated (e.g. RTYOLO→ONNXYOLO on AMD).
|
||||||
@@ -1151,26 +1187,39 @@ extern "C" __declspec(dllexport) int LoadModelFromFolder(ANSCENTER::ANSODBase**
|
|||||||
if (autoDetectEngine==-1)engineType=ANSCENTER::EngineType::CPU;// We force to use CPU
|
if (autoDetectEngine==-1)engineType=ANSCENTER::EngineType::CPU;// We force to use CPU
|
||||||
|
|
||||||
|
|
||||||
//Force modelType to ANSONNXYOLO and ANSRTYOLO if detectionType is detection and modelType is TENSORRT or ONNX
|
// Route detection / pose / segmentation / OBB / classification to the best
|
||||||
if ((modelType == 4) || // TensorRT
|
// available backend: prefer TensorRT on NVIDIA, otherwise the matching ONNX
|
||||||
(modelType == 14) || // TensorRT Yolov10
|
// handler. Unlisted modelType values are left untouched for the switch below.
|
||||||
(modelType == 22) || // TensorRT Pose
|
// See CreateANSODHandleEx for the full rationale — three correctness bugs
|
||||||
(modelType == 24)) // TensorRT Segmentation
|
// were fixed in that dispatcher and must be kept in sync across copies.
|
||||||
{
|
{
|
||||||
if (engineType == ANSCENTER::EngineType::NVIDIA_GPU)modelType = 31; // RTYOLO
|
const bool onNvidia = (engineType == ANSCENTER::EngineType::NVIDIA_GPU);
|
||||||
else modelType = 30;// ONNXYOLO
|
switch (modelType) {
|
||||||
}
|
// ── Detection family: YOLOv8 / V11 / V12 / generic TRT / V10-RTOD ──
|
||||||
else if ((modelType == 3) || // YoloV8/YoloV11 (Object Detection)
|
case 3: // YOLOV8 / YOLOV11
|
||||||
(modelType == 17) || // YOLO V12
|
case 4: // generic TensorRT
|
||||||
(modelType == 20) || // ONNX Classification
|
case 14: // YOLOv10RTOD (TRT end-to-end NMS)
|
||||||
(modelType == 21) || // ONNX Pose
|
case 17: // YOLOV12
|
||||||
(modelType == 23) || // ONNX Segmentation
|
modelType = onNvidia ? 31 /* RTYOLO */ : 30 /* ONNXYOLO */;
|
||||||
(modelType == 25)) // OBB Segmentation
|
break;
|
||||||
{
|
// ── Pose ─────────────────────────────────────────────────────────
|
||||||
modelType = 30; // ONNXYOLO
|
case 21: // ONNXPOSE
|
||||||
}
|
case 22: // RTPOSE
|
||||||
else {
|
modelType = onNvidia ? 22 /* RTPOSE */ : 21 /* ONNXPOSE */;
|
||||||
// do nothing, use the modelType specified by user
|
break;
|
||||||
|
// ── Segmentation ─────────────────────────────────────────────────
|
||||||
|
case 23: // ONNXSEG
|
||||||
|
case 24: // RTSEG
|
||||||
|
modelType = onNvidia ? 24 /* RTSEG */ : 23 /* ONNXSEG */;
|
||||||
|
break;
|
||||||
|
// ── OBB / Classification (ONNX-only today — leave as-is) ─────────
|
||||||
|
case 20: // ONNXCL
|
||||||
|
case 25: // ONNXOBB
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
// Any other modelType is handled directly by the switch below.
|
||||||
|
break;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
// NOTE: We intentionally do NOT destroy any existing *Handle here.
|
// NOTE: We intentionally do NOT destroy any existing *Handle here.
|
||||||
// LabVIEW reuses DLL parameter buffer addresses, so *Handle may point
|
// LabVIEW reuses DLL parameter buffer addresses, so *Handle may point
|
||||||
@@ -1461,26 +1510,39 @@ ANSODENGINE_API int OptimizeModelStr(const char* modelFilePath, const char* mode
|
|||||||
ANSCENTER::EngineType engineType = ANSCENTER::ANSLicenseHelper::CheckHardwareInformation();
|
ANSCENTER::EngineType engineType = ANSCENTER::ANSLicenseHelper::CheckHardwareInformation();
|
||||||
|
|
||||||
|
|
||||||
//Force modelType to ANSONNXYOLO and ANSRTYOLO if detectionType is detection and modelType is TENSORRT or ONNX
|
// Route detection / pose / segmentation / OBB / classification to the best
|
||||||
if ((modelType == 4) || // TensorRT
|
// available backend: prefer TensorRT on NVIDIA, otherwise the matching ONNX
|
||||||
(modelType == 14) || // TensorRT Yolov10
|
// handler. Unlisted modelType values are left untouched for the switch below.
|
||||||
(modelType == 22) || // TensorRT Pose
|
// See CreateANSODHandleEx for the full rationale — three correctness bugs
|
||||||
(modelType == 24)) // TensorRT Segmentation
|
// were fixed in that dispatcher and must be kept in sync across copies.
|
||||||
{
|
{
|
||||||
if (engineType == ANSCENTER::EngineType::NVIDIA_GPU)modelType = 31; // RTYOLO
|
const bool onNvidia = (engineType == ANSCENTER::EngineType::NVIDIA_GPU);
|
||||||
else modelType = 30;// ONNXYOLO
|
switch (modelType) {
|
||||||
}
|
// ── Detection family: YOLOv8 / V11 / V12 / generic TRT / V10-RTOD ──
|
||||||
else if ((modelType == 3) || // YoloV8/YoloV11 (Object Detection)
|
case 3: // YOLOV8 / YOLOV11
|
||||||
(modelType == 17) || // YOLO V12
|
case 4: // generic TensorRT
|
||||||
(modelType == 20) || // ONNX Classification
|
case 14: // YOLOv10RTOD (TRT end-to-end NMS)
|
||||||
(modelType == 21) || // ONNX Pose
|
case 17: // YOLOV12
|
||||||
(modelType == 23) || // ONNX Segmentation
|
modelType = onNvidia ? 31 /* RTYOLO */ : 30 /* ONNXYOLO */;
|
||||||
(modelType == 25)) // OBB Segmentation
|
break;
|
||||||
{
|
// ── Pose ─────────────────────────────────────────────────────────
|
||||||
modelType = 30; // ONNXYOLO
|
case 21: // ONNXPOSE
|
||||||
}
|
case 22: // RTPOSE
|
||||||
else {
|
modelType = onNvidia ? 22 /* RTPOSE */ : 21 /* ONNXPOSE */;
|
||||||
// do nothing, use the modelType specified by user
|
break;
|
||||||
|
// ── Segmentation ─────────────────────────────────────────────────
|
||||||
|
case 23: // ONNXSEG
|
||||||
|
case 24: // RTSEG
|
||||||
|
modelType = onNvidia ? 24 /* RTSEG */ : 23 /* ONNXSEG */;
|
||||||
|
break;
|
||||||
|
// ── OBB / Classification (ONNX-only today — leave as-is) ─────────
|
||||||
|
case 20: // ONNXCL
|
||||||
|
case 25: // ONNXOBB
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
// Any other modelType is handled directly by the switch below.
|
||||||
|
break;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -720,8 +720,24 @@ void Engine<T>::lockGpuClocks(int deviceIndex, int requestedMHz) {
|
|||||||
if (rc == nvml_types::SUCCESS) {
|
if (rc == nvml_types::SUCCESS) {
|
||||||
m_clocksLocked = true;
|
m_clocksLocked = true;
|
||||||
m_nvmlDeviceIdx = static_cast<unsigned int>(deviceIndex);
|
m_nvmlDeviceIdx = static_cast<unsigned int>(deviceIndex);
|
||||||
|
// Always emit to DebugView so operators can confirm the lock took
|
||||||
|
// effect without needing to read engine-level verbose output.
|
||||||
|
ANS_DBG("TRT_Clock",
|
||||||
|
"GPU clocks LOCKED at %u MHz (device %d) — P-state will stay high, "
|
||||||
|
"no WDDM down-clock between inferences",
|
||||||
|
targetMHz, deviceIndex);
|
||||||
if (m_verbose) std::cout << "Info: GPU clocks locked at " << targetMHz << " MHz (device " << deviceIndex << ")" << std::endl;
|
if (m_verbose) std::cout << "Info: GPU clocks locked at " << targetMHz << " MHz (device " << deviceIndex << ")" << std::endl;
|
||||||
} else {
|
} else {
|
||||||
|
// Surface the failure reason + remediation in DebugView. Most common
|
||||||
|
// failure is access-denied (requires Administrator) or the driver
|
||||||
|
// refusing the requested frequency. Users see this in the log and
|
||||||
|
// know to elevate, set NVCP 'Prefer maximum performance', or run
|
||||||
|
// `nvidia-smi -lgc <MHz>,<MHz>` before launching.
|
||||||
|
ANS_DBG("TRT_Clock",
|
||||||
|
"GPU clock lock FAILED (nvml rc=%s) — expect 2-3x inference latency from "
|
||||||
|
"WDDM down-clocking. Fix: run as Admin, OR set NVCP 'Prefer maximum "
|
||||||
|
"performance' for this app, OR: nvidia-smi -lgc %u,%u",
|
||||||
|
errName(rc), targetMHz, targetMHz);
|
||||||
if (m_verbose) {
|
if (m_verbose) {
|
||||||
std::cout << "Warning: nvmlDeviceSetGpuLockedClocks failed: " << errName(rc) << std::endl;
|
std::cout << "Warning: nvmlDeviceSetGpuLockedClocks failed: " << errName(rc) << std::endl;
|
||||||
std::cout << " (Run as Administrator, or use: nvidia-smi -lgc " << targetMHz << "," << targetMHz << ")" << std::endl;
|
std::cout << " (Run as Administrator, or use: nvidia-smi -lgc " << targetMHz << "," << targetMHz << ")" << std::endl;
|
||||||
|
|||||||
Reference in New Issue
Block a user