Fix AMD and OpenVINO

This commit is contained in:
2026-04-08 13:45:52 +10:00
parent a4a8caaa86
commit 69787b0ff0
15 changed files with 1209 additions and 132 deletions

View File

@@ -335,7 +335,7 @@ namespace ANSCENTER {
// to distinguish OBB (angle values in [-pi, pi]) from detection
bool likelyOBB = false;
if (extra >= 2) {
const float* rawOutput = outputTensors[0].GetTensorData<float>();
const float* rawOutput = outputTensors[0].GetTensorMutableData<float>();
int numSamples = std::min(numBoxes, 100);
int angleCount = 0;
for (int s = 0; s < numSamples; ++s) {
@@ -371,13 +371,13 @@ namespace ANSCENTER {
std::vector<Object> ONNXYOLO::postprocessEndToEnd(
const cv::Size& originalImageSize,
const cv::Size& resizedImageShape,
const std::vector<Ort::Value>& outputTensors,
std::vector<Ort::Value>& outputTensors,
const std::vector<std::string>& classNames,
float confThreshold)
{
if (outputTensors.empty()) return {};
const float* rawOutput = outputTensors[0].GetTensorData<float>();
const float* rawOutput = outputTensors[0].GetTensorMutableData<float>();
const auto outputShape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape();
if (outputShape.size() < 3) return {};
@@ -427,13 +427,13 @@ namespace ANSCENTER {
std::vector<Object> ONNXYOLO::postprocessLegacy(
const cv::Size& originalImageSize,
const cv::Size& resizedImageShape,
const std::vector<Ort::Value>& outputTensors,
std::vector<Ort::Value>& outputTensors,
const std::vector<std::string>& classNames,
float confThreshold, float iouThreshold, int maxDet)
{
if (outputTensors.empty()) return {};
const float* rawOutput = outputTensors[0].GetTensorData<float>();
const float* rawOutput = outputTensors[0].GetTensorMutableData<float>();
const auto outputShape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape();
if (outputShape.size() < 3) return {};
@@ -656,12 +656,12 @@ namespace ANSCENTER {
std::vector<Object> ONNXYOLO::postprocessOBBEndToEnd(
const cv::Size& originalImageSize,
const cv::Size& resizedImageShape,
const std::vector<Ort::Value>& outputTensors,
std::vector<Ort::Value>& outputTensors,
const std::vector<std::string>& classNames,
float confThreshold)
{
if (outputTensors.empty()) return {};
const float* raw = outputTensors[0].GetTensorData<float>();
const float* raw = outputTensors[0].GetTensorMutableData<float>();
const auto shape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape();
if (shape.size() < 3) return {};
@@ -721,12 +721,12 @@ namespace ANSCENTER {
std::vector<Object> ONNXYOLO::postprocessOBBLegacy(
const cv::Size& originalImageSize,
const cv::Size& resizedImageShape,
const std::vector<Ort::Value>& outputTensors,
std::vector<Ort::Value>& outputTensors,
const std::vector<std::string>& classNames,
float confThreshold, float iouThreshold, int maxDet)
{
if (outputTensors.empty()) return {};
const float* rawOutput = outputTensors[0].GetTensorData<float>();
const float* rawOutput = outputTensors[0].GetTensorMutableData<float>();
const auto outputShape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape();
if (outputShape.size() < 3) return {};
@@ -822,13 +822,13 @@ namespace ANSCENTER {
std::vector<Object> ONNXYOLO::postprocessSegEndToEnd(
const cv::Size& originalImageSize,
const cv::Size& resizedImageShape,
const std::vector<Ort::Value>& outputTensors,
std::vector<Ort::Value>& outputTensors,
const std::vector<std::string>& classNames,
float confThreshold)
{
if (outputTensors.size() < 2) return {};
const float* raw = outputTensors[0].GetTensorData<float>();
const float* raw = outputTensors[0].GetTensorMutableData<float>();
const auto shape0 = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape();
const auto protoShape = outputTensors[1].GetTensorTypeAndShapeInfo().GetShape();
if (shape0.size() < 3 || protoShape.size() < 4) return {};
@@ -884,7 +884,7 @@ namespace ANSCENTER {
// Generate masks: coeffs @ protos → sigmoid → crop-in-proto → resize-to-box → threshold
if (!objs.empty() && !maskCoeffs.empty()) {
const float* protoData = outputTensors[1].GetTensorData<float>();
const float* protoData = outputTensors[1].GetTensorMutableData<float>();
cv::Mat protos(nm, protoH * protoW, CV_32F, const_cast<float*>(protoData));
cv::Mat matmulRes = (maskCoeffs * protos).t();
@@ -951,13 +951,13 @@ namespace ANSCENTER {
std::vector<Object> ONNXYOLO::postprocessSegLegacy(
const cv::Size& originalImageSize,
const cv::Size& resizedImageShape,
const std::vector<Ort::Value>& outputTensors,
std::vector<Ort::Value>& outputTensors,
const std::vector<std::string>& classNames,
float confThreshold, float iouThreshold, int maxDet)
{
if (outputTensors.size() < 2) return {};
const float* rawOutput = outputTensors[0].GetTensorData<float>();
const float* rawOutput = outputTensors[0].GetTensorMutableData<float>();
const auto shape0 = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape();
const auto protoShape = outputTensors[1].GetTensorTypeAndShapeInfo().GetShape();
if (shape0.size() < 3 || protoShape.size() < 4) return {};
@@ -1035,7 +1035,7 @@ namespace ANSCENTER {
// Generate masks
if (!objs.empty() && !masks.empty()) {
const float* protoData = outputTensors[1].GetTensorData<float>();
const float* protoData = outputTensors[1].GetTensorMutableData<float>();
cv::Mat protos(nm, protoH * protoW, CV_32F, const_cast<float*>(protoData));
cv::Mat matmulRes = (masks * protos).t();
@@ -1106,12 +1106,12 @@ namespace ANSCENTER {
std::vector<Object> ONNXYOLO::postprocessPoseEndToEnd(
const cv::Size& originalImageSize,
const cv::Size& resizedImageShape,
const std::vector<Ort::Value>& outputTensors,
std::vector<Ort::Value>& outputTensors,
const std::vector<std::string>& classNames,
float confThreshold, int numKPS)
{
if (outputTensors.empty()) return {};
const float* raw = outputTensors[0].GetTensorData<float>();
const float* raw = outputTensors[0].GetTensorMutableData<float>();
const auto shape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape();
if (shape.size() < 3) return {};
@@ -1172,12 +1172,12 @@ namespace ANSCENTER {
std::vector<Object> ONNXYOLO::postprocessPoseLegacy(
const cv::Size& originalImageSize,
const cv::Size& resizedImageShape,
const std::vector<Ort::Value>& outputTensors,
std::vector<Ort::Value>& outputTensors,
const std::vector<std::string>& classNames,
float confThreshold, float iouThreshold, int numKPS, int maxDet)
{
if (outputTensors.empty()) return {};
const float* rawOutput = outputTensors[0].GetTensorData<float>();
const float* rawOutput = outputTensors[0].GetTensorMutableData<float>();
const auto outputShape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape();
if (outputShape.size() < 3) return {};
@@ -1273,12 +1273,12 @@ namespace ANSCENTER {
// ====================================================================
std::vector<Object> ONNXYOLO::postprocessClassify(
const std::vector<Ort::Value>& outputTensors,
std::vector<Ort::Value>& outputTensors,
const std::vector<std::string>& classNames,
const cv::Size& imageSize)
{
if (outputTensors.empty()) return {};
const float* raw = outputTensors[0].GetTensorData<float>();
const float* raw = outputTensors[0].GetTensorMutableData<float>();
const auto shape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape();
if (shape.size() < 2) return {};
@@ -1339,7 +1339,7 @@ namespace ANSCENTER {
// ====================================================================
/*static*/ Ort::Value ONNXYOLO::sliceBatchOutput(
const Ort::Value& batchTensor,
Ort::Value& batchTensor,
int64_t batchIndex,
const std::vector<int64_t>& fullShape,
Ort::MemoryInfo& memInfo)
@@ -1349,8 +1349,8 @@ namespace ANSCENTER {
for (size_t d = 1; d < fullShape.size(); ++d)
elemsPerImage *= fullShape[d];
const float* batchData = batchTensor.GetTensorData<float>();
float* imageData = const_cast<float*>(batchData + batchIndex * elemsPerImage);
float* batchData = batchTensor.GetTensorMutableData<float>();
float* imageData = batchData + batchIndex * elemsPerImage;
// Shape for single image: [1, D1, D2, ...]
std::vector<int64_t> singleShape = fullShape;
@@ -1504,7 +1504,7 @@ namespace ANSCENTER {
// Class count mismatch — probe last channel for OBB angles
bool likelyOBB = false;
if (extra >= 2) {
const float* rawOutput = perImageOutputs[0].GetTensorData<float>();
const float* rawOutput = perImageOutputs[0].GetTensorMutableData<float>();
int numSamp = std::min(numBoxes, 100);
int angleCount = 0;
for (int s = 0; s < numSamp; ++s) {
@@ -1571,6 +1571,22 @@ namespace ANSCENTER {
}
}
// Initialise the ORT inference engine with an explicit engine-type override
// (per the header comment elsewhere in this commit, e.g. a CPU fallback for
// AMD iGPUs). Validates the model path first, then (re)constructs the engine.
// Returns true on success; logs and returns false on a missing model file or
// if ONNXYOLO construction throws (e.g. session-creation failure).
bool ANSONNXYOLO::InitOrtEngine(ANSCENTER::EngineType engineType) {
try {
// Fail fast with a descriptive error rather than letting the
// ONNXYOLO constructor throw on an unreadable path.
if (!FileExist(_modelFilePath)) {
_logger.LogError("ANSONNXYOLO::InitOrtEngine",
"Model file does not exist: " + _modelFilePath, __FILE__, __LINE__);
return false;
}
// Replaces any previously-held engine; the old instance (if any) is
// destroyed by unique_ptr assignment before the call returns.
m_ortEngine = std::make_unique<ONNXYOLO>(_modelFilePath, engineType);
return true;
}
catch (const std::exception& e) {
// Construction failures (bad model, unavailable execution provider,
// ORT session errors) are logged as fatal but reported via the
// boolean return so callers can fall back gracefully.
_logger.LogFatal("ANSONNXYOLO::InitOrtEngine", e.what(), __FILE__, __LINE__);
return false;
}
}
bool ANSONNXYOLO::Initialize(std::string licenseKey, ModelConfig modelConfig,
const std::string& modelZipFilePath,
const std::string& modelZipPassword,
@@ -1807,9 +1823,12 @@ namespace ANSCENTER {
const std::string& camera_id)
{
try {
ANS_DBG("ONNXYOLO", "DetectObjects: cam=%s acquiring mutex...", camera_id.c_str());
std::lock_guard<std::recursive_mutex> lock(_mutex);
ANS_DBG("ONNXYOLO", "DetectObjects: mutex acquired, cam=%s", camera_id.c_str());
if (!m_ortEngine) {
_logger.LogError("ANSONNXYOLO::DetectObjects", "ORT engine is null", __FILE__, __LINE__);
ANS_DBG("ONNXYOLO", "DetectObjects: ORT engine is null!");
return {};
}
@@ -1880,6 +1899,7 @@ namespace ANSCENTER {
return results;
}
catch (const std::exception& e) {
ANS_DBG("ONNXYOLO", "DetectObjects EXCEPTION: %s cam=%s", e.what(), camera_id.c_str());
_logger.LogFatal("ANSONNXYOLO::DetectObjects", e.what(), __FILE__, __LINE__);
return {};
}

View File

@@ -83,55 +83,55 @@ namespace ANSCENTER {
// ── Detection postprocess ───────────────────────────────────────
std::vector<Object> postprocessEndToEnd(
const cv::Size& originalImageSize, const cv::Size& resizedImageShape,
const std::vector<Ort::Value>& outputTensors,
std::vector<Ort::Value>& outputTensors,
const std::vector<std::string>& classNames, float confThreshold);
std::vector<Object> postprocessLegacy(
const cv::Size& originalImageSize, const cv::Size& resizedImageShape,
const std::vector<Ort::Value>& outputTensors,
std::vector<Ort::Value>& outputTensors,
const std::vector<std::string>& classNames,
float confThreshold, float iouThreshold, int maxDet = 300);
// ── OBB postprocess ─────────────────────────────────────────────
std::vector<Object> postprocessOBBEndToEnd(
const cv::Size& originalImageSize, const cv::Size& resizedImageShape,
const std::vector<Ort::Value>& outputTensors,
std::vector<Ort::Value>& outputTensors,
const std::vector<std::string>& classNames, float confThreshold);
std::vector<Object> postprocessOBBLegacy(
const cv::Size& originalImageSize, const cv::Size& resizedImageShape,
const std::vector<Ort::Value>& outputTensors,
std::vector<Ort::Value>& outputTensors,
const std::vector<std::string>& classNames,
float confThreshold, float iouThreshold, int maxDet = 300);
// ── Segmentation postprocess ────────────────────────────────────
std::vector<Object> postprocessSegEndToEnd(
const cv::Size& originalImageSize, const cv::Size& resizedImageShape,
const std::vector<Ort::Value>& outputTensors,
std::vector<Ort::Value>& outputTensors,
const std::vector<std::string>& classNames, float confThreshold);
std::vector<Object> postprocessSegLegacy(
const cv::Size& originalImageSize, const cv::Size& resizedImageShape,
const std::vector<Ort::Value>& outputTensors,
std::vector<Ort::Value>& outputTensors,
const std::vector<std::string>& classNames,
float confThreshold, float iouThreshold, int maxDet = 300);
// ── Pose postprocess ────────────────────────────────────────────
std::vector<Object> postprocessPoseEndToEnd(
const cv::Size& originalImageSize, const cv::Size& resizedImageShape,
const std::vector<Ort::Value>& outputTensors,
std::vector<Ort::Value>& outputTensors,
const std::vector<std::string>& classNames,
float confThreshold, int numKPS);
std::vector<Object> postprocessPoseLegacy(
const cv::Size& originalImageSize, const cv::Size& resizedImageShape,
const std::vector<Ort::Value>& outputTensors,
std::vector<Ort::Value>& outputTensors,
const std::vector<std::string>& classNames,
float confThreshold, float iouThreshold, int numKPS, int maxDet = 300);
// ── Classification postprocess ──────────────────────────────────
std::vector<Object> postprocessClassify(
const std::vector<Ort::Value>& outputTensors,
std::vector<Ort::Value>& outputTensors,
const std::vector<std::string>& classNames,
const cv::Size& imageSize);
@@ -154,7 +154,7 @@ namespace ANSCENTER {
// ── Batch output slicing helper ────────────────────────────────
static Ort::Value sliceBatchOutput(
const Ort::Value& batchTensor,
Ort::Value& batchTensor,
int64_t batchIndex,
const std::vector<int64_t>& fullShape,
Ort::MemoryInfo& memInfo);
@@ -224,6 +224,9 @@ namespace ANSCENTER {
// Initialise ORT engine from the resolved model path
bool InitOrtEngine();
public:
// Initialise ORT engine with explicit engine type override (e.g. CPU fallback for AMD iGPUs)
bool InitOrtEngine(ANSCENTER::EngineType engineType);
};
}
#endif

View File

@@ -218,6 +218,12 @@ namespace ANSCENTER
std::min(6, static_cast<int>(std::thread::hardware_concurrency())));
sessionOptions.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_ALL);
// DirectML REQUIRES these two settings per ORT documentation
if (ep.type == ANSCENTER::EngineType::AMD_GPU) {
sessionOptions.DisableMemPattern();
sessionOptions.SetExecutionMode(ExecutionMode::ORT_SEQUENTIAL);
}
// ── Log available providers ─────────────────────────────────────────
std::vector<std::string> availableProviders = Ort::GetAvailableProviders();
std::cout << "Available Execution Providers:" << std::endl;
@@ -519,7 +525,7 @@ namespace ANSCENTER
{
try {
// Get raw output pointer (NO COPY!)
const float* rawOutput = outputTensors[0].GetTensorData<float>();
const float* rawOutput = outputTensors[0].GetTensorMutableData<float>();
std::vector<int64_t> outputShape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape();
const int numClasses = static_cast<int>(outputShape[2]) - 5;
@@ -647,11 +653,11 @@ namespace ANSCENTER
}
return result;
}
std::vector<Object> YOLOOD::postprocessv11(const cv::Size& originalImageSize,const cv::Size& resizedImageShape,const std::vector<Ort::Value>& outputTensors,float confThreshold,float iouThreshold)
std::vector<Object> YOLOOD::postprocessv11(const cv::Size& originalImageSize,const cv::Size& resizedImageShape,std::vector<Ort::Value>& outputTensors,float confThreshold,float iouThreshold)
{
try {
// Get raw output
const float* rawOutput = outputTensors[0].GetTensorData<float>();
const float* rawOutput = outputTensors[0].GetTensorMutableData<float>();
const std::vector<int64_t> outputShape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape();
const size_t numFeatures = outputShape[1];
@@ -1448,7 +1454,7 @@ namespace ANSCENTER
);
// Parse output
const float* rawOutput = outputTensors[0].GetTensorData<float>();
const float* rawOutput = outputTensors[0].GetTensorMutableData<float>();
const std::vector<int64_t> outputShape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape();
const int dimensions = static_cast<int>(outputShape[1]); // 4 + num_classes

View File

@@ -44,7 +44,7 @@ namespace ANSCENTER {
cv::Mat preprocessv11(const cv::Mat& image, std::vector<float>& blob, std::vector<int64_t>& inputTensorShape);
std::vector<Object> postprocessing(const cv::Size& resizedImageShape,const cv::Size& originalImageShape,std::vector<Ort::Value>& outputTensors,
const float& confThreshold, const float& iouThreshold);
std::vector<Object> postprocessv11(const cv::Size& originalImageSize,const cv::Size& resizedImageShape,const std::vector<Ort::Value>& outputTensors,float confThreshold,float iouThreshold);
std::vector<Object> postprocessv11(const cv::Size& originalImageSize,const cv::Size& resizedImageShape,std::vector<Ort::Value>& outputTensors,float confThreshold,float iouThreshold);
BoundingBox scaleCoordsv11(const cv::Size& imageShape, BoundingBox coords,const cv::Size& imageOriginalShape, bool p_Clip);
std::vector<const char*> inputNodeNames;
std::vector<const char*> outputNodeNames;

View File

@@ -355,6 +355,7 @@ extern "C" ANSODENGINE_API std::string CreateANSODHandle(ANSCENTER::ANSODBase**
// TEXTSCENSE = 6
//Force modelType to ANSONNXYOLO and ANSRTYOLO if detectionType is detection and modelType is TENSORRT or ONNX
if ((modelType == 4) || // TensorRT
(modelType == 14)|| // TensorRT Yolov10
(modelType == 22)|| // TensorRT Pose
@@ -376,7 +377,6 @@ extern "C" ANSODENGINE_API std::string CreateANSODHandle(ANSCENTER::ANSODBase**
}
switch (detectionType) {
case 0:
modelConfig.detectionType = ANSCENTER::DetectionType::CLASSIFICATION;