Improve ANSCV

2026-04-21 09:26:02 +10:00
parent 9f0a10a4c8
commit 7e772f76bc
15 changed files with 749 additions and 421 deletions

View File

@@ -426,27 +426,37 @@ extern "C" ANSODENGINE_API std::string CreateANSODHandle(ANSCENTER::ANSODBase**
ANSCENTER::EngineType engineType = ANSCENTER::ANSLicenseHelper::CheckHardwareInformation();
if (autoDetectEngine==-1)engineType=ANSCENTER::EngineType::CPU;// We force to use CPU
//Force modelType to ANSONNXYOLO and ANSRTYOLO if detectionType is detection and modelType is TENSORRT or ONNX
if ((modelType == 4) || // TensorRT
(modelType == 14)|| // TensorRT Yolov10
(modelType == 22)|| // TensorRT Pose
(modelType == 24)) // TensorRT Segmentation
{
if (engineType == ANSCENTER::EngineType::NVIDIA_GPU) modelType = 31; // RTYOLO
else modelType=30;// ONNXYOLO
}
else if ((modelType == 3) || // YoloV8/YoloV11 (Object Detection)
(modelType == 17)|| // YOLO V12
(modelType == 20) || // ONNX Classification
(modelType == 21) || // ONNX Pose
(modelType == 23) || // ONNX Segmentation
(modelType == 25)) // OBB Segmentation
{
modelType = 30; // ONNXYOLO
}
else {
// do nothing, use the modelType specified by user
// Route detection / pose / segmentation / OBB / classification to the best
// available backend: prefer TensorRT on NVIDIA, otherwise the matching ONNX
// handler. Unlisted modelType values are left untouched for the switch below.
// See CreateANSODHandleEx for the full rationale — three correctness bugs
// were fixed in that dispatcher and must be kept in sync across copies.
const bool onNvidia = (engineType == ANSCENTER::EngineType::NVIDIA_GPU);
switch (modelType) {
// ── Detection family: YOLOv8 / V11 / V12 / generic TRT / V10-RTOD ──
case 3: // YOLOV8 / YOLOV11
case 4: // generic TensorRT
case 14: // YOLOv10RTOD (TRT end-to-end NMS)
case 17: // YOLOV12
modelType = onNvidia ? 31 /* RTYOLO */ : 30 /* ONNXYOLO */;
break;
// ── Pose ─────────────────────────────────────────────────────────────
case 21: // ONNXPOSE
case 22: // RTPOSE
modelType = onNvidia ? 22 /* RTPOSE */ : 21 /* ONNXPOSE */;
break;
// ── Segmentation ─────────────────────────────────────────────────────
case 23: // ONNXSEG
case 24: // RTSEG
modelType = onNvidia ? 24 /* RTSEG */ : 23 /* ONNXSEG */;
break;
// ── OBB / Classification (ONNX-only today — leave as-is) ─────────────
case 20: // ONNXCL
case 25: // ONNXOBB
break;
default:
// Any other modelType is handled directly by the switch below.
break;
}
switch (detectionType) {
@@ -764,27 +774,53 @@ extern "C" ANSODENGINE_API int CreateANSODHandleEx(ANSCENTER::ANSODBase** Handl
ANSCENTER::EngineType engineType = ANSCENTER::ANSLicenseHelper::CheckHardwareInformation();
if (autoDetectEngine==-1)engineType=ANSCENTER::EngineType::CPU;// We force to use CPU
//Force modelType to ANSONNXYOLO and ANSRTYOLO if detectionType is detection and modelType is TENSORRT or ONNX
if ((modelType == 4) || // TensorRT
(modelType == 14)|| // TensorRT Yolov10
(modelType == 22)|| // TensorRT Pose
(modelType == 24)) // TensorRT Segmentation
{
if (engineType == ANSCENTER::EngineType::NVIDIA_GPU) modelType = 31; // RTYOLO
else modelType=30;// ONNXYOLO
}
else if ((modelType == 3) || // YoloV8/YoloV11 (Object Detection)
(modelType == 17)|| // YOLO V12
(modelType == 20) || // ONNX Classification
(modelType == 21) || // ONNX Pose
(modelType == 23) || // ONNX Segmentation
(modelType == 25)) // OBB Segmentation
{
modelType = 30; // ONNXYOLO
}
else {
// do nothing, use the modelType specified by user
// Route detection / pose / segmentation / OBB / classification to the best
// available backend: prefer TensorRT on NVIDIA, otherwise the matching ONNX
// handler. Unlisted modelType values are left untouched for the switch below.
//
// Previous revisions of this block had two correctness bugs:
// (1) modelType == 3 / 17 (YoloV8/V11/V12 detection) was hard-wired to
// ONNXYOLO even on NVIDIA — bypassing the TensorRT path entirely and
// duplicating VRAM when multiple handles loaded the same .onnx (ORT
// has no EnginePoolManager dedupe).
// (2) modelType == 20 / 21 / 23 / 25 (ONNX CLS / POSE / SEG / OBB) was
// rewritten to 30 (ONNXYOLO = detection), making the dedicated
// case 20 / 21 / 23 / 25 handlers unreachable dead code. A user
// passing modelType=20 for classification ended up with a YOLO head.
// (3) modelType == 22 / 24 (TRT pose / TRT seg) on a non-NVIDIA box fell
// back to ONNXYOLO instead of the correct ONNXPOSE / ONNXSEG handler.
const bool onNvidia = (engineType == ANSCENTER::EngineType::NVIDIA_GPU);
switch (modelType) {
// ── Detection family: YOLOv8 / V11 / V12 / generic TRT / V10-RTOD ──
case 3: // YOLOV8 / YOLOV11
case 4: // generic TensorRT
case 14: // YOLOv10RTOD (TRT end-to-end NMS)
case 17: // YOLOV12
modelType = onNvidia ? 31 /* RTYOLO */ : 30 /* ONNXYOLO */;
break;
// ── Pose ─────────────────────────────────────────────────────────────
case 21: // ONNXPOSE
case 22: // RTPOSE
modelType = onNvidia ? 22 /* RTPOSE */ : 21 /* ONNXPOSE */;
break;
// ── Segmentation ─────────────────────────────────────────────────────
case 23: // ONNXSEG
case 24: // RTSEG
modelType = onNvidia ? 24 /* RTSEG */ : 23 /* ONNXSEG */;
break;
// ── Oriented Bounding Box (ONNX-only today) ──────────────────────────
case 25: // ONNXOBB — no TRT variant; leave as-is
break;
// ── Classification (ONNX-only in this dispatcher) ────────────────────
case 20: // ONNXCL — no TRT variant; leave as-is
break;
default:
// Any other modelType is handled directly by the switch below
// (TENSORFLOW, YOLOV4, YOLOV5, FACEDETECT, FACERECOGNIZE, ALPR,
// OCR, ANOMALIB, POSE, SAM, ODHUBMODEL, CUSTOMDETECTOR, CUSTOMPY,
// MOTIONDETECTOR, MOVIENET, ONNXSAM3, RTSAM3, ONNXYOLO=30,
// RTYOLO=31). Do nothing — keep user's value.
break;
}
// returnModelType will be set after the switch to reflect the actual
// model class that was instantiated (e.g. RTYOLO→ONNXYOLO on AMD).
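
For reference, the dispatch policy shared by all copies of this block can be expressed as a small standalone helper. This is only an illustrative sketch: the numeric codes (3, 4, 14, 17, 20-25, 30, 31) are taken from the comments above, while the helper name RouteModelType and its free-function form are assumptions, not part of the ANSCV/ANSODEngine API.

static int RouteModelType(int modelType, bool onNvidia)
{
    // Mirrors the routing documented in the hunk above.
    // RouteModelType is a hypothetical name, not an exported symbol.
    switch (modelType) {
    case 3:  // YOLOV8 / YOLOV11
    case 4:  // generic TensorRT
    case 14: // YOLOv10RTOD
    case 17: // YOLOV12
        return onNvidia ? 31 /* RTYOLO */ : 30 /* ONNXYOLO */;
    case 21: // ONNXPOSE
    case 22: // RTPOSE
        return onNvidia ? 22 : 21;
    case 23: // ONNXSEG
    case 24: // RTSEG
        return onNvidia ? 24 : 23;
    case 20: // ONNXCL, ONNX-only today
    case 25: // ONNXOBB, ONNX-only today
    default:
        return modelType; // leave untouched for the detectionType switch below
    }
}

Under this policy RouteModelType(3, true) yields 31 (RTYOLO), and RouteModelType(22, false) falls back to 21 (ONNXPOSE) rather than 30, which corresponds to fix (3) described above.
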
@@ -1151,26 +1187,39 @@ extern "C" __declspec(dllexport) int LoadModelFromFolder(ANSCENTER::ANSODBase**
if (autoDetectEngine==-1)engineType=ANSCENTER::EngineType::CPU;// We force to use CPU
//Force modelType to ANSONNXYOLO and ANSRTYOLO if detectionType is detection and modelType is TENSORRT or ONNX
if ((modelType == 4) || // TensorRT
(modelType == 14) || // TensorRT Yolov10
(modelType == 22) || // TensorRT Pose
(modelType == 24)) // TensorRT Segmentation
{
if (engineType == ANSCENTER::EngineType::NVIDIA_GPU)modelType = 31; // RTYOLO
else modelType = 30;// ONNXYOLO
}
else if ((modelType == 3) || // YoloV8/YoloV11 (Object Detection)
(modelType == 17) || // YOLO V12
(modelType == 20) || // ONNX Classification
(modelType == 21) || // ONNX Pose
(modelType == 23) || // ONNX Segmentation
(modelType == 25)) // OBB Segmentation
{
modelType = 30; // ONNXYOLO
}
else {
// do nothing, use the modelType specified by user
// Route detection / pose / segmentation / OBB / classification to the best
// available backend: prefer TensorRT on NVIDIA, otherwise the matching ONNX
// handler. Unlisted modelType values are left untouched for the switch below.
// See CreateANSODHandleEx for the full rationale — three correctness bugs
// were fixed in that dispatcher and must be kept in sync across copies.
const bool onNvidia = (engineType == ANSCENTER::EngineType::NVIDIA_GPU);
switch (modelType) {
// ── Detection family: YOLOv8 / V11 / V12 / generic TRT / V10-RTOD ──
case 3: // YOLOV8 / YOLOV11
case 4: // generic TensorRT
case 14: // YOLOv10RTOD (TRT end-to-end NMS)
case 17: // YOLOV12
modelType = onNvidia ? 31 /* RTYOLO */ : 30 /* ONNXYOLO */;
break;
// ── Pose ─────────────────────────────────────────────────────────
case 21: // ONNXPOSE
case 22: // RTPOSE
modelType = onNvidia ? 22 /* RTPOSE */ : 21 /* ONNXPOSE */;
break;
// ── Segmentation ─────────────────────────────────────────────────
case 23: // ONNXSEG
case 24: // RTSEG
modelType = onNvidia ? 24 /* RTSEG */ : 23 /* ONNXSEG */;
break;
// ── OBB / Classification (ONNX-only today — leave as-is) ─────────
case 20: // ONNXCL
case 25: // ONNXOBB
break;
default:
// Any other modelType is handled directly by the switch below.
break;
}
}
// NOTE: We intentionally do NOT destroy any existing *Handle here.
// LabVIEW reuses DLL parameter buffer addresses, so *Handle may point
@@ -1461,26 +1510,39 @@ ANSODENGINE_API int OptimizeModelStr(const char* modelFilePath, const char* mode
ANSCENTER::EngineType engineType = ANSCENTER::ANSLicenseHelper::CheckHardwareInformation();
//Force modelType to ANSONNXYOLO and ANSRTYOLO if detectionType is detection and modelType is TENSORRT or ONNX
if ((modelType == 4) || // TensorRT
(modelType == 14) || // TensorRT Yolov10
(modelType == 22) || // TensorRT Pose
(modelType == 24)) // TensorRT Segmentation
{
if (engineType == ANSCENTER::EngineType::NVIDIA_GPU)modelType = 31; // RTYOLO
else modelType = 30;// ONNXYOLO
}
else if ((modelType == 3) || // YoloV8/YoloV11 (Object Detection)
(modelType == 17) || // YOLO V12
(modelType == 20) || // ONNX Classification
(modelType == 21) || // ONNX Pose
(modelType == 23) || // ONNX Segmentation
(modelType == 25)) // OBB Segmentation
{
modelType = 30; // ONNXYOLO
}
else {
// do nothing, use the modelType specified by user
// Route detection / pose / segmentation / OBB / classification to the best
// available backend: prefer TensorRT on NVIDIA, otherwise the matching ONNX
// handler. Unlisted modelType values are left untouched for the switch below.
// See CreateANSODHandleEx for the full rationale — three correctness bugs
// were fixed in that dispatcher and must be kept in sync across copies.
const bool onNvidia = (engineType == ANSCENTER::EngineType::NVIDIA_GPU);
switch (modelType) {
// ── Detection family: YOLOv8 / V11 / V12 / generic TRT / V10-RTOD ──
case 3: // YOLOV8 / YOLOV11
case 4: // generic TensorRT
case 14: // YOLOv10RTOD (TRT end-to-end NMS)
case 17: // YOLOV12
modelType = onNvidia ? 31 /* RTYOLO */ : 30 /* ONNXYOLO */;
break;
// ── Pose ─────────────────────────────────────────────────────────
case 21: // ONNXPOSE
case 22: // RTPOSE
modelType = onNvidia ? 22 /* RTPOSE */ : 21 /* ONNXPOSE */;
break;
// ── Segmentation ─────────────────────────────────────────────────
case 23: // ONNXSEG
case 24: // RTSEG
modelType = onNvidia ? 24 /* RTSEG */ : 23 /* ONNXSEG */;
break;
// ── OBB / Classification (ONNX-only today — leave as-is) ─────────
case 20: // ONNXCL
case 25: // ONNXOBB
break;
default:
// Any other modelType is handled directly by the switch below.
break;
}
}

View File

@@ -720,8 +720,24 @@ void Engine<T>::lockGpuClocks(int deviceIndex, int requestedMHz) {
if (rc == nvml_types::SUCCESS) {
m_clocksLocked = true;
m_nvmlDeviceIdx = static_cast<unsigned int>(deviceIndex);
// Always emit to DebugView so operators can confirm the lock took
// effect without needing to read engine-level verbose output.
ANS_DBG("TRT_Clock",
"GPU clocks LOCKED at %u MHz (device %d) — P-state will stay high, "
"no WDDM down-clock between inferences",
targetMHz, deviceIndex);
if (m_verbose) std::cout << "Info: GPU clocks locked at " << targetMHz << " MHz (device " << deviceIndex << ")" << std::endl;
} else {
// Surface the failure reason + remediation in DebugView. Most common
// failure is access-denied (requires Administrator) or the driver
// refusing the requested frequency. Users see this in the log and
// know to elevate, set NVCP 'Prefer maximum performance', or run
// `nvidia-smi -lgc <MHz>,<MHz>` before launching.
ANS_DBG("TRT_Clock",
"GPU clock lock FAILED (nvml rc=%s) — expect 2-3x inference latency from "
"WDDM down-clocking. Fix: run as Admin, OR set NVCP 'Prefer maximum "
"performance' for this app, OR: nvidia-smi -lgc %u,%u",
errName(rc), targetMHz, targetMHz);
if (m_verbose) {
std::cout << "Warning: nvmlDeviceSetGpuLockedClocks failed: " << errName(rc) << std::endl;
std::cout << " (Run as Administrator, or use: nvidia-smi -lgc " << targetMHz << "," << targetMHz << ")" << std::endl;