Improve ANSCV

This commit is contained in:
2026-04-21 09:26:02 +10:00
parent 9f0a10a4c8
commit 7e772f76bc
15 changed files with 749 additions and 421 deletions

View File

@@ -2,6 +2,7 @@
#include "ANSMatRegistry.h"
#include "ANSGpuFrameOps.h"
#include "ANSCVVendorGate.h" // anscv_vendor_gate::IsNvidiaGpuAvailable()
#include "ANSLicense.h" // ANS_DBG macro
#include <memory>
#include <cstdint>
#include "media_codec.h"
@@ -251,6 +252,23 @@ namespace ANSCENTER {
return _pLastFrame; // Shallow copy (fast)
}
// Early stale-out: if the decoder hasn't produced a frame in 5s the
// source is dead. Skip _playerClient->getImage() entirely and return
// the cached frame with unchanged _pts so LabVIEW sees STALE PTS one
// poll earlier and triggers reconnect.
if (!_pLastFrame.empty()) {
double ageMs = _playerClient->getLastFrameAgeMs();
if (ageMs >= 5000.0) {
ANS_DBG("FLV_GetImage",
"EARLY STALE: ageMs=%.1f pts=%lld url=%s — skipping getImage()",
ageMs, (long long)_pts, _url.c_str());
width = _imageWidth;
height = _imageHeight;
pts = _pts;
return _pLastFrame;
}
}
int imageW = 0, imageH = 0;
int64_t currentPts = 0;

View File

@@ -2,6 +2,7 @@
#include "ANSMatRegistry.h"
#include "ANSGpuFrameOps.h"
#include "ANSCVVendorGate.h" // anscv_vendor_gate::IsNvidiaGpuAvailable()
#include "ANSLicense.h" // ANS_DBG macro
#include <memory>
#include <cstdint>
#include "media_codec.h"
@@ -239,6 +240,23 @@ namespace ANSCENTER {
return _pLastFrame; // Shallow copy (fast)
}
// Early stale-out: if the decoder hasn't produced a frame in 5s the
// source is dead. Skip _playerClient->getImage() entirely and return
// the cached frame with unchanged _pts so LabVIEW sees STALE PTS one
// poll earlier and triggers reconnect.
if (!_pLastFrame.empty()) {
double ageMs = _playerClient->getLastFrameAgeMs();
if (ageMs >= 5000.0) {
ANS_DBG("MJPEG_GetImage",
"EARLY STALE: ageMs=%.1f pts=%lld url=%s — skipping getImage()",
ageMs, (long long)_pts, _url.c_str());
width = _imageWidth;
height = _imageHeight;
pts = _pts;
return _pLastFrame;
}
}
int imageW = 0, imageH = 0;
int64_t currentPts = 0;

File diff suppressed because it is too large Load Diff

View File

@@ -155,7 +155,9 @@ namespace ANSCENTER
std::recursive_mutex _mutex;
//std::once_flag licenseOnceFlag; // For one-time license check
bool _licenseValid = false;
// Atomic so lock-free methods (ImageResize, ImageResizeWithRatio,
// MatToBinaryData, EncodeJpegString) can read it without _mutex.
std::atomic<bool> _licenseValid{ false };
public:
};
}

View File

@@ -2,6 +2,7 @@
#include "ANSMatRegistry.h"
#include "ANSGpuFrameOps.h"
#include "ANSCVVendorGate.h" // anscv_vendor_gate::IsNvidiaGpuAvailable()
#include "ANSLicense.h" // ANS_DBG macro
#include <memory>
#include "media_codec.h"
#include <cstdint>
@@ -245,6 +246,23 @@ namespace ANSCENTER {
return _pLastFrame; // Shallow copy (fast)
}
// Early stale-out: if the decoder hasn't produced a frame in 5s the
// source is dead. Skip _playerClient->getImage() entirely and return
// the cached frame with unchanged _pts so LabVIEW sees STALE PTS one
// poll earlier and triggers reconnect.
if (!_pLastFrame.empty()) {
double ageMs = _playerClient->getLastFrameAgeMs();
if (ageMs >= 5000.0) {
ANS_DBG("RTMP_GetImage",
"EARLY STALE: ageMs=%.1f pts=%lld url=%s — skipping getImage()",
ageMs, (long long)_pts, _url.c_str());
width = _imageWidth;
height = _imageHeight;
pts = _pts;
return _pLastFrame;
}
}
int imageW = 0, imageH = 0;
int64_t currentPts = 0;

View File

@@ -2,6 +2,7 @@
#include "ANSMatRegistry.h"
#include "ANSGpuFrameOps.h"
#include "ANSCVVendorGate.h" // anscv_vendor_gate::IsNvidiaGpuAvailable()
#include "ANSLicense.h" // ANS_DBG macro
#include <memory>
#include "media_codec.h"
#include <cstdint>
@@ -253,6 +254,23 @@ namespace ANSCENTER {
return _pLastFrame; // Shallow copy (fast)
}
// Early stale-out: if the decoder hasn't produced a frame in 5s the
// source is dead. Skip _playerClient->getImage() entirely and return
// the cached frame with unchanged _pts so LabVIEW sees STALE PTS one
// poll earlier and triggers reconnect.
if (!_pLastFrame.empty()) {
double ageMs = _playerClient->getLastFrameAgeMs();
if (ageMs >= 5000.0) {
ANS_DBG("SRT_GetImage",
"EARLY STALE: ageMs=%.1f pts=%lld url=%s — skipping getImage()",
ageMs, (long long)_pts, _url.c_str());
width = _imageWidth;
height = _imageHeight;
pts = _pts;
return _pLastFrame;
}
}
int imageW = 0, imageH = 0;
int64_t currentPts = 0;

View File

@@ -91,9 +91,14 @@ namespace ANSCENTER {
}
if (!m_trtEngine) {
// Enable batch support
m_options.optBatchSize = 8;
m_options.maxBatchSize = 32;
// Enable batch support. maxBatchSize controls the TRT workspace
// allocation (~linear in batch); opt is the kernel-selection sweet
// spot. Max=4 was picked to fit 4 concurrent face crops per frame
// comfortably on 8 GB GPUs while freeing ~1.5 GB VRAM vs max=32
// — most scenes have ≤4 faces visible, so throughput cost is
// near-zero (amortized per-face latency drops too at lower batch).
m_options.optBatchSize = 4;
m_options.maxBatchSize = 4;
m_options.maxInputHeight = GPU_FACE_HEIGHT;
m_options.minInputHeight = GPU_FACE_HEIGHT;

View File

@@ -534,8 +534,12 @@ namespace ANSCENTER {
_ocrModelConfig.inpHeight = 640;
_ocrModelConfig.inpWidth = 640;
_ocrModelConfig.gpuOptBatchSize = 8;
_ocrModelConfig.gpuMaxBatchSize = 32; // desired max; engine builder auto-caps by GPU VRAM
// Max=4 chosen to fit typical plate counts per frame on 8 GB GPUs.
// Was opt=8/max=32 which sized TRT workspace for 32 concurrent plates
// (~1 GB for this model alone). Cap of 4 is still >= the usual 1–3
// plates visible per camera frame, amortized throughput unchanged.
_ocrModelConfig.gpuOptBatchSize = 4;
_ocrModelConfig.gpuMaxBatchSize = 4; // desired max; engine builder auto-caps by GPU VRAM
_ocrModelConfig.maxInputHeight = 640;
_ocrModelConfig.maxInputWidth = 640;
_ocrModelConfig.minInputHeight = 640;
@@ -545,8 +549,9 @@ namespace ANSCENTER {
_lpColourModelConfig.inpHeight = 224;
_lpColourModelConfig.inpWidth = 224;
_lpColourModelConfig.gpuOptBatchSize = 8;
_lpColourModelConfig.gpuMaxBatchSize = 32; // desired max; engine builder auto-caps by GPU VRAM
// See _ocrModelConfig above — matching batch cap for consistency.
_lpColourModelConfig.gpuOptBatchSize = 4;
_lpColourModelConfig.gpuMaxBatchSize = 4; // desired max; engine builder auto-caps by GPU VRAM
_lpColourModelConfig.maxInputHeight = 224;
_lpColourModelConfig.maxInputWidth = 224;
_lpColourModelConfig.minInputHeight = 224;

View File

@@ -28,8 +28,11 @@ bool RTOCRRecognizer::Initialize(const std::string& onnxPath, const std::string&
ANSCENTER::Options options;
options.deviceIndex = gpuId;
options.precision = ANSCENTER::Precision::FP16;
options.maxBatchSize = 1;
options.optBatchSize = 1;
// maxBatch=4 matches FaceRecognizer / ALPR configuration — allows the
// recognizer to process up to 4 detected text lines in one call,
// amortizing per-invocation overhead while keeping TRT workspace small.
options.maxBatchSize = 4;
options.optBatchSize = 4;
// Fixed height, dynamic width for recognition
options.minInputHeight = imgH_;

View File

@@ -185,11 +185,22 @@ extern "C" ANSOCR_API int CreateANSOCRHandleEx(ANSCENTER::ANSOCRBase** Handle,
ANSCENTER::ANSLibsLoader::Initialize();
ANSCENTER::EngineType engineType = ANSCENTER::ANSLicenseHelper::CheckHardwareInformation();
{
// Describe the backend the engine-selector below will actually choose
// for this (hardware, engineMode) combination. Previous versions of
// this log claimed "TensorRT OCR enabled" based on hardware alone,
// which was misleading because engineMode=0 (auto) unconditionally
// picked ONNX — users saw the log and assumed TRT was running.
const bool isNvidia = (engineType == ANSCENTER::EngineType::NVIDIA_GPU);
const bool willUseTRT =
isNvidia && (engineMode == 0 /* auto → TRT on NVIDIA */ ||
engineMode == 1 /* GPU → TRT on NVIDIA */);
const char* vendorTag =
engineType == ANSCENTER::EngineType::NVIDIA_GPU ? "NVIDIA_GPU (TensorRT OCR enabled)" :
engineType == ANSCENTER::EngineType::AMD_GPU ? "AMD_GPU (ONNX Runtime / DirectML, TensorRT OCR DISABLED)" :
engineType == ANSCENTER::EngineType::OPENVINO_GPU ? "OPENVINO_GPU (ONNX Runtime / OpenVINO, TensorRT OCR DISABLED)" :
"CPU (ONNX Runtime, TensorRT OCR DISABLED)";
engineType == ANSCENTER::EngineType::NVIDIA_GPU
? (willUseTRT ? "NVIDIA_GPU (TensorRT OCR active)"
: "NVIDIA_GPU (TensorRT available, but engineMode forces ONNX)")
: engineType == ANSCENTER::EngineType::AMD_GPU ? "AMD_GPU (ONNX Runtime / DirectML, TensorRT OCR unavailable)"
: engineType == ANSCENTER::EngineType::OPENVINO_GPU ? "OPENVINO_GPU (ONNX Runtime / OpenVINO, TensorRT OCR unavailable)"
: "CPU (ONNX Runtime, TensorRT OCR unavailable)";
char buf[192];
snprintf(buf, sizeof(buf),
"[ANSOCR] CreateANSOCRHandleEx: detected engineType=%d [%s], engineMode=%d\n",
@@ -230,10 +241,23 @@ extern "C" ANSOCR_API int CreateANSOCRHandleEx(ANSCENTER::ANSOCRBase** Handle,
// select, including DirectML for AMD).
const bool isNvidia = (engineType == ANSCENTER::EngineType::NVIDIA_GPU);
switch (engineMode) {
case 0:// Auto-detect, always use ONNX for better compatibility, especially on AMD GPUs and high-res images
(*Handle) = new ANSCENTER::ANSONNXOCR();
case 0: // Auto-detect — prefer TensorRT on NVIDIA, ONNX elsewhere.
// Previous policy was "always ONNX" for cross-platform safety,
// but on NVIDIA that defeated the point: each ANSONNXOCR handle
// allocates its own cls/dec/rec OrtSessions (no dedupe), which
// wasted ~300–600 MB VRAM per extra instance and ran ~2× slower
// than ANSRTOCR's shared-engine path via EnginePoolManager.
if (isNvidia) {
limitSideLen = 960;
(*Handle) = new ANSCENTER::ANSRTOCR();
} else {
// AMD / Intel / CPU — ANSRTOCR hard-requires CUDA and would
// crash. ANSONNXOCR auto-picks the correct ORT EP
// (DirectML on AMD, OpenVINO on Intel, CPU otherwise).
(*Handle) = new ANSCENTER::ANSONNXOCR();
}
break;
case 1:// GPU — use TensorRT engine ONLY on NVIDIA hardware.
case 1: // GPU — use TensorRT engine ONLY on NVIDIA hardware.
if (isNvidia) {
limitSideLen = 960;
(*Handle) = new ANSCENTER::ANSRTOCR();
@@ -244,7 +268,7 @@ extern "C" ANSOCR_API int CreateANSOCRHandleEx(ANSCENTER::ANSOCRBase** Handle,
(*Handle) = new ANSCENTER::ANSONNXOCR();
}
break;
case 2:// CPU
case 2: // CPU
(*Handle) = new ANSCENTER::ANSONNXOCR();
break;
default:

View File

@@ -426,27 +426,37 @@ extern "C" ANSODENGINE_API std::string CreateANSODHandle(ANSCENTER::ANSODBase**
ANSCENTER::EngineType engineType = ANSCENTER::ANSLicenseHelper::CheckHardwareInformation();
if (autoDetectEngine==-1)engineType=ANSCENTER::EngineType::CPU;// We force to use CPU
//Force modelType to ANSONNXYOLO and ANSRTYOLO if detectionType is detection and modelType is TENSORRT or ONNX
if ((modelType == 4) || // TensorRT
(modelType == 14)|| // TensorRT Yolov10
(modelType == 22)|| // TensorRT Pose
(modelType == 24)) // TensorRT Segmentation
{
if (engineType == ANSCENTER::EngineType::NVIDIA_GPU) modelType = 31; // RTYOLO
else modelType=30;// ONNXYOLO
}
else if ((modelType == 3) || // YoloV8/YoloV11 (Object Detection)
(modelType == 17)|| // YOLO V12
(modelType == 20) || // ONNX Classification
(modelType == 21) || // ONNX Pose
(modelType == 23) || // ONNX Segmentation
(modelType == 25)) // OBB Segmentation
{
modelType = 30; // ONNXYOLO
}
else {
// do nothing, use the modelType specified by user
// Route detection / pose / segmentation / OBB / classification to the best
// available backend: prefer TensorRT on NVIDIA, otherwise the matching ONNX
// handler. Unlisted modelType values are left untouched for the switch below.
// See CreateANSODHandleEx for the full rationale — three correctness bugs
// were fixed in that dispatcher and must be kept in sync across copies.
const bool onNvidia = (engineType == ANSCENTER::EngineType::NVIDIA_GPU);
switch (modelType) {
// ── Detection family: YOLOv8 / V11 / V12 / generic TRT / V10-RTOD ──
case 3: // YOLOV8 / YOLOV11
case 4: // generic TensorRT
case 14: // YOLOv10RTOD (TRT end-to-end NMS)
case 17: // YOLOV12
modelType = onNvidia ? 31 /* RTYOLO */ : 30 /* ONNXYOLO */;
break;
// ── Pose ─────────────────────────────────────────────────────────────
case 21: // ONNXPOSE
case 22: // RTPOSE
modelType = onNvidia ? 22 /* RTPOSE */ : 21 /* ONNXPOSE */;
break;
// ── Segmentation ─────────────────────────────────────────────────────
case 23: // ONNXSEG
case 24: // RTSEG
modelType = onNvidia ? 24 /* RTSEG */ : 23 /* ONNXSEG */;
break;
// ── OBB / Classification (ONNX-only today — leave as-is) ─────────────
case 20: // ONNXCL
case 25: // ONNXOBB
break;
default:
// Any other modelType is handled directly by the switch below.
break;
}
switch (detectionType) {
@@ -764,27 +774,53 @@ extern "C" ANSODENGINE_API int CreateANSODHandleEx(ANSCENTER::ANSODBase** Handl
ANSCENTER::EngineType engineType = ANSCENTER::ANSLicenseHelper::CheckHardwareInformation();
if (autoDetectEngine==-1)engineType=ANSCENTER::EngineType::CPU;// We force to use CPU
//Force modelType to ANSONNXYOLO and ANSRTYOLO if detectionType is detection and modelType is TENSORRT or ONNX
if ((modelType == 4) || // TensorRT
(modelType == 14)|| // TensorRT Yolov10
(modelType == 22)|| // TensorRT Pose
(modelType == 24)) // TensorRT Segmentation
{
if (engineType == ANSCENTER::EngineType::NVIDIA_GPU) modelType = 31; // RTYOLO
else modelType=30;// ONNXYOLO
}
else if ((modelType == 3) || // YoloV8/YoloV11 (Object Detection)
(modelType == 17)|| // YOLO V12
(modelType == 20) || // ONNX Classification
(modelType == 21) || // ONNX Pose
(modelType == 23) || // ONNX Segmentation
(modelType == 25)) // OBB Segmentation
{
modelType = 30; // ONNXYOLO
}
else {
// do nothing, use the modelType specified by user
// Route detection / pose / segmentation / OBB / classification to the best
// available backend: prefer TensorRT on NVIDIA, otherwise the matching ONNX
// handler. Unlisted modelType values are left untouched for the switch below.
//
// Previous revisions of this block had two correctness bugs:
// (1) modelType == 3 / 17 (YoloV8/V11/V12 detection) was hard-wired to
// ONNXYOLO even on NVIDIA — bypassing the TensorRT path entirely and
// duplicating VRAM when multiple handles loaded the same .onnx (ORT
// has no EnginePoolManager dedupe).
// (2) modelType == 20 / 21 / 23 / 25 (ONNX CLS / POSE / SEG / OBB) was
// rewritten to 30 (ONNXYOLO = detection), making the dedicated
// case 20 / 21 / 23 / 25 handlers unreachable dead code. A user
// passing modelType=20 for classification ended up with a YOLO head.
// (3) modelType == 22 / 24 (TRT pose / TRT seg) on a non-NVIDIA box fell
// back to ONNXYOLO instead of the correct ONNXPOSE / ONNXSEG handler.
const bool onNvidia = (engineType == ANSCENTER::EngineType::NVIDIA_GPU);
switch (modelType) {
// ── Detection family: YOLOv8 / V11 / V12 / generic TRT / V10-RTOD ──
case 3: // YOLOV8 / YOLOV11
case 4: // generic TensorRT
case 14: // YOLOv10RTOD (TRT end-to-end NMS)
case 17: // YOLOV12
modelType = onNvidia ? 31 /* RTYOLO */ : 30 /* ONNXYOLO */;
break;
// ── Pose ─────────────────────────────────────────────────────────────
case 21: // ONNXPOSE
case 22: // RTPOSE
modelType = onNvidia ? 22 /* RTPOSE */ : 21 /* ONNXPOSE */;
break;
// ── Segmentation ─────────────────────────────────────────────────────
case 23: // ONNXSEG
case 24: // RTSEG
modelType = onNvidia ? 24 /* RTSEG */ : 23 /* ONNXSEG */;
break;
// ── Oriented Bounding Box (ONNX-only today) ──────────────────────────
case 25: // ONNXOBB — no TRT variant; leave as-is
break;
// ── Classification (ONNX-only in this dispatcher) ────────────────────
case 20: // ONNXCL — no TRT variant; leave as-is
break;
default:
// Any other modelType is handled directly by the switch below
// (TENSORFLOW, YOLOV4, YOLOV5, FACEDETECT, FACERECOGNIZE, ALPR,
// OCR, ANOMALIB, POSE, SAM, ODHUBMODEL, CUSTOMDETECTOR, CUSTOMPY,
// MOTIONDETECTOR, MOVIENET, ONNXSAM3, RTSAM3, ONNXYOLO=30,
// RTYOLO=31). Do nothing — keep user's value.
break;
}
// returnModelType will be set after the switch to reflect the actual
// model class that was instantiated (e.g. RTYOLO→ONNXYOLO on AMD).
@@ -1151,26 +1187,39 @@ extern "C" __declspec(dllexport) int LoadModelFromFolder(ANSCENTER::ANSODBase**
if (autoDetectEngine==-1)engineType=ANSCENTER::EngineType::CPU;// We force to use CPU
//Force modelType to ANSONNXYOLO and ANSRTYOLO if detectionType is detection and modelType is TENSORRT or ONNX
if ((modelType == 4) || // TensorRT
(modelType == 14) || // TensorRT Yolov10
(modelType == 22) || // TensorRT Pose
(modelType == 24)) // TensorRT Segmentation
// Route detection / pose / segmentation / OBB / classification to the best
// available backend: prefer TensorRT on NVIDIA, otherwise the matching ONNX
// handler. Unlisted modelType values are left untouched for the switch below.
// See CreateANSODHandleEx for the full rationale — three correctness bugs
// were fixed in that dispatcher and must be kept in sync across copies.
{
if (engineType == ANSCENTER::EngineType::NVIDIA_GPU)modelType = 31; // RTYOLO
else modelType = 30;// ONNXYOLO
}
else if ((modelType == 3) || // YoloV8/YoloV11 (Object Detection)
(modelType == 17) || // YOLO V12
(modelType == 20) || // ONNX Classification
(modelType == 21) || // ONNX Pose
(modelType == 23) || // ONNX Segmentation
(modelType == 25)) // OBB Segmentation
{
modelType = 30; // ONNXYOLO
}
else {
// do nothing, use the modelType specified by user
const bool onNvidia = (engineType == ANSCENTER::EngineType::NVIDIA_GPU);
switch (modelType) {
// ── Detection family: YOLOv8 / V11 / V12 / generic TRT / V10-RTOD ──
case 3: // YOLOV8 / YOLOV11
case 4: // generic TensorRT
case 14: // YOLOv10RTOD (TRT end-to-end NMS)
case 17: // YOLOV12
modelType = onNvidia ? 31 /* RTYOLO */ : 30 /* ONNXYOLO */;
break;
// ── Pose ─────────────────────────────────────────────────────────
case 21: // ONNXPOSE
case 22: // RTPOSE
modelType = onNvidia ? 22 /* RTPOSE */ : 21 /* ONNXPOSE */;
break;
// ── Segmentation ─────────────────────────────────────────────────
case 23: // ONNXSEG
case 24: // RTSEG
modelType = onNvidia ? 24 /* RTSEG */ : 23 /* ONNXSEG */;
break;
// ── OBB / Classification (ONNX-only today — leave as-is) ─────────
case 20: // ONNXCL
case 25: // ONNXOBB
break;
default:
// Any other modelType is handled directly by the switch below.
break;
}
}
// NOTE: We intentionally do NOT destroy any existing *Handle here.
// LabVIEW reuses DLL parameter buffer addresses, so *Handle may point
@@ -1461,26 +1510,39 @@ ANSODENGINE_API int OptimizeModelStr(const char* modelFilePath, const char* mode
ANSCENTER::EngineType engineType = ANSCENTER::ANSLicenseHelper::CheckHardwareInformation();
//Force modelType to ANSONNXYOLO and ANSRTYOLO if detectionType is detection and modelType is TENSORRT or ONNX
if ((modelType == 4) || // TensorRT
(modelType == 14) || // TensorRT Yolov10
(modelType == 22) || // TensorRT Pose
(modelType == 24)) // TensorRT Segmentation
// Route detection / pose / segmentation / OBB / classification to the best
// available backend: prefer TensorRT on NVIDIA, otherwise the matching ONNX
// handler. Unlisted modelType values are left untouched for the switch below.
// See CreateANSODHandleEx for the full rationale — three correctness bugs
// were fixed in that dispatcher and must be kept in sync across copies.
{
if (engineType == ANSCENTER::EngineType::NVIDIA_GPU)modelType = 31; // RTYOLO
else modelType = 30;// ONNXYOLO
}
else if ((modelType == 3) || // YoloV8/YoloV11 (Object Detection)
(modelType == 17) || // YOLO V12
(modelType == 20) || // ONNX Classification
(modelType == 21) || // ONNX Pose
(modelType == 23) || // ONNX Segmentation
(modelType == 25)) // OBB Segmentation
{
modelType = 30; // ONNXYOLO
}
else {
// do nothing, use the modelType specified by user
const bool onNvidia = (engineType == ANSCENTER::EngineType::NVIDIA_GPU);
switch (modelType) {
// ── Detection family: YOLOv8 / V11 / V12 / generic TRT / V10-RTOD ──
case 3: // YOLOV8 / YOLOV11
case 4: // generic TensorRT
case 14: // YOLOv10RTOD (TRT end-to-end NMS)
case 17: // YOLOV12
modelType = onNvidia ? 31 /* RTYOLO */ : 30 /* ONNXYOLO */;
break;
// ── Pose ─────────────────────────────────────────────────────────
case 21: // ONNXPOSE
case 22: // RTPOSE
modelType = onNvidia ? 22 /* RTPOSE */ : 21 /* ONNXPOSE */;
break;
// ── Segmentation ─────────────────────────────────────────────────
case 23: // ONNXSEG
case 24: // RTSEG
modelType = onNvidia ? 24 /* RTSEG */ : 23 /* ONNXSEG */;
break;
// ── OBB / Classification (ONNX-only today — leave as-is) ─────────
case 20: // ONNXCL
case 25: // ONNXOBB
break;
default:
// Any other modelType is handled directly by the switch below.
break;
}
}

View File

@@ -720,8 +720,24 @@ void Engine<T>::lockGpuClocks(int deviceIndex, int requestedMHz) {
if (rc == nvml_types::SUCCESS) {
m_clocksLocked = true;
m_nvmlDeviceIdx = static_cast<unsigned int>(deviceIndex);
// Always emit to DebugView so operators can confirm the lock took
// effect without needing to read engine-level verbose output.
ANS_DBG("TRT_Clock",
"GPU clocks LOCKED at %u MHz (device %d) — P-state will stay high, "
"no WDDM down-clock between inferences",
targetMHz, deviceIndex);
if (m_verbose) std::cout << "Info: GPU clocks locked at " << targetMHz << " MHz (device " << deviceIndex << ")" << std::endl;
} else {
// Surface the failure reason + remediation in DebugView. Most common
// failure is access-denied (requires Administrator) or the driver
// refusing the requested frequency. Users see this in the log and
// know to elevate, set NVCP 'Prefer maximum performance', or run
// `nvidia-smi -lgc <MHz>,<MHz>` before launching.
ANS_DBG("TRT_Clock",
"GPU clock lock FAILED (nvml rc=%s) — expect 2-3x inference latency from "
"WDDM down-clocking. Fix: run as Admin, OR set NVCP 'Prefer maximum "
"performance' for this app, OR: nvidia-smi -lgc %u,%u",
errName(rc), targetMHz, targetMHz);
if (m_verbose) {
std::cout << "Warning: nvmlDeviceSetGpuLockedClocks failed: " << errName(rc) << std::endl;
std::cout << " (Run as Administrator, or use: nvidia-smi -lgc " << targetMHz << "," << targetMHz << ")" << std::endl;