Fix AMD DirectML hang by switching from GetTensorData<T>() to GetTensorMutableData<T>()

2026-04-28 13:25:02 +10:00
parent f4b74c837e
commit dcf974c35c
18 changed files with 359 additions and 48 deletions
--- a/modules/ANSODEngine/ANSONNXCL.cpp
+++ b/modules/ANSODEngine/ANSONNXCL.cpp
@@ -575,7 +575,7 @@ namespace ANSCENTER
            return false;
        }
    }
-    std::vector<Object> ANSONNXCL::postprocess(const std::vector<Ort::Value>& outputTensors, const std::string& camera_id) {
+    std::vector<Object> ANSONNXCL::postprocess(std::vector<Ort::Value>& outputTensors, const std::string& camera_id) {
        ANS_DBG("ANSONNXCL_pp", "ENTRY tensors=%zu cam=%s this=%p",
            outputTensors.size(), camera_id.c_str(), (void*)this);
        std::lock_guard<std::recursive_mutex> lock(_mutex);
@@ -589,8 +589,16 @@ namespace ANSCENTER
                return {};
            }

-            ANS_DBG("ANSONNXCL_pp", "GetTensorData<float>");
-            const float* rawOutput = outputTensors[0].GetTensorData<float>();
+            ANS_DBG("ANSONNXCL_pp", "GetTensorMutableData<float>");
+            // Use GetTensorMutableData (not GetTensorData) on DirectML.  The
+            // const GetTensorData triggers a per-call host-readable mapping
+            // that, on AMD DML, exhausts a small staging-buffer pool after ~8
+            // calls and then blocks indefinitely.  GetTensorMutableData returns
+            // the existing host-accessible pointer directly, with no per-call
+            // mapping cost.  The same pattern is used by every output-tensor
+            // read in ANSONNXYOLO and engines/ONNXEngine.  It is safe on all
+            // EPs (CUDA/OpenVINO/CPU): we only read the data, never mutate it.
+            const float* rawOutput = outputTensors[0].GetTensorMutableData<float>();
            if (!rawOutput) {
                ANS_DBG("ANSONNXCL_pp", "EARLY-RETURN rawOutput=null");
                this->_logger.LogError("ANSONNXCL::postprocess", "rawOutput pointer is null", __FILE__, __LINE__);
--- a/modules/ANSODEngine/ANSONNXCL.h
+++ b/modules/ANSODEngine/ANSONNXCL.h
@@ -28,7 +28,11 @@ namespace ANSCENTER {
        void warmupModel();
        bool Init(const std::string& modelPath, const cv::Size& targetInputShape, bool useGPU = true);
        bool preprocess(const cv::Mat& image, float*& blob, std::vector<int64_t>& inputTensorShape);
-        std::vector<Object> postprocess(const std::vector<Ort::Value>& outputTensors, const std::string& camera_id);
+        // outputTensors is non-const because GetTensorMutableData() (the
+        // ORT API that doesn't hang on AMD DirectML) requires a non-const
+        // Ort::Value receiver.  See comment at the GetTensorMutableData
+        // call site in postprocess() for the full rationale.
+        std::vector<Object> postprocess(std::vector<Ort::Value>& outputTensors, const std::string& camera_id);
        std::vector<Object> classify(const cv::Mat& image, const std::string& camera_id);

    private:
--- a/modules/ANSODEngine/ANSONNXOBB.cpp
+++ b/modules/ANSODEngine/ANSONNXOBB.cpp
@@ -1089,7 +1089,7 @@ namespace ANSCENTER {
 	std::vector<Object> ANSONNXOBB::postprocess(
 		const cv::Size& originalImageSize,
 		const cv::Size& resizedImageShape,
-		const std::vector<Ort::Value>& outputTensors,
+		std::vector<Ort::Value>& outputTensors,
 		int topk,
 		const std::string& camera_id)
 	{
@@ -1103,8 +1103,10 @@ namespace ANSCENTER {
 				return {};
 			}

-			// Extract output tensor data and shape [1, num_features, num_detections]
-			const float* rawOutput = outputTensors[0].GetTensorData<float>();
+			// Extract output tensor data and shape [1, num_features, num_detections].
+			// Use GetTensorMutableData (not GetTensorData): on DML the const variant
+			// hangs on AMD after ~8 calls.  The data is only read, despite the name.
+			const float* rawOutput = outputTensors[0].GetTensorMutableData<float>();
 			const std::vector<int64_t> outputShape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape();

 			if (outputShape.size() < 3) {
--- a/modules/ANSODEngine/ANSONNXOBB.h
+++ b/modules/ANSODEngine/ANSONNXOBB.h
@@ -74,10 +74,12 @@ namespace ANSCENTER {
        void warmupModel();
        bool Init(const std::string& modelPath, bool useGPU=true, int deviceId = 0);
        cv::Mat preprocess(const cv::Mat& image, float*& blob, std::vector<int64_t>& inputTensorShape);
+        // outputTensors is non-const because GetTensorMutableData() requires
+        // a non-const Ort::Value receiver — see ANSONNXCL.h for full note.
        std::vector<Object> postprocess(
            const cv::Size& originalImageSize,
            const cv::Size& resizedImageShape,
-            const std::vector<Ort::Value>& outputTensors, int topk,
+            std::vector<Ort::Value>& outputTensors, int topk,
            const std::string& camera_id);
        std::vector<Object> detect(const cv::Mat& image, const std::string& camera_id);
    private:
--- a/modules/ANSODEngine/ANSONNXPOSE.cpp
+++ b/modules/ANSODEngine/ANSONNXPOSE.cpp
@@ -759,7 +759,7 @@ namespace ANSCENTER {
 	std::vector<Object> ANSONNXPOSE::postprocess(
 		const cv::Size& originalImageSize,
 		const cv::Size& resizedImageShape,
-		const std::vector<Ort::Value>& outputTensors,
+		std::vector<Ort::Value>& outputTensors,
 		const std::string& camera_id)
 	{
 		std::lock_guard<std::recursive_mutex> lock(_mutex);
@@ -773,7 +773,9 @@ namespace ANSCENTER {
 				return {};
 			}

-			const float* rawOutput = outputTensors[0].GetTensorData<float>();
+			// Use GetTensorMutableData (not GetTensorData): on DML the const variant
+			// hangs on AMD after ~8 calls.  The data is only read, despite the name.
+			const float* rawOutput = outputTensors[0].GetTensorMutableData<float>();
 			if (!rawOutput) {
 				this->_logger.LogError("ANSONNXPOSE::postprocess", "rawOutput pointer is null", __FILE__, __LINE__);
 				return {};
--- a/modules/ANSODEngine/ANSONNXPOSE.h
+++ b/modules/ANSODEngine/ANSONNXPOSE.h
@@ -41,8 +41,10 @@ namespace ANSCENTER {
        void warmupModel();
        bool Init(const std::string& modelPath, bool useGPU=true, int deviceId = 0);
        cv::Mat preprocess(const cv::Mat& image, float*& blob, std::vector<int64_t>& inputTensorShape);
+        // outputTensors is non-const because GetTensorMutableData() requires
+        // a non-const Ort::Value receiver — see ANSONNXCL.h for full note.
        std::vector<Object> postprocess(const cv::Size& originalImageSize, const cv::Size& resizedImageShape,
-                                        const std::vector<Ort::Value>& outputTensors, const std::string& camera_id);
+                                        std::vector<Ort::Value>& outputTensors, const std::string& camera_id);
        std::vector<Object> detect(const cv::Mat& image, const std::string& camera_id);
    private:
        static std::atomic<int> instanceCounter_;  // Thread-safe counter
--- a/modules/ANSODEngine/ANSONNXSEG.cpp
+++ b/modules/ANSODEngine/ANSONNXSEG.cpp
@@ -726,7 +726,7 @@ namespace ANSCENTER {
 	std::vector<Object> ANSONNXSEG::postprocess(
 		const cv::Size& origSize,
 		const cv::Size& letterboxSize,
-		const std::vector<Ort::Value>& outputs,
+		std::vector<Ort::Value>& outputs,
 		const std::string& camera_id)
 	{
 		std::lock_guard<std::recursive_mutex> lock(_mutex);
@@ -738,9 +738,11 @@ namespace ANSCENTER {
 					std::to_string(outputs.size()));
 			}

-			// Extract output tensors
-			const float* detections = outputs[0].GetTensorData<float>();
-			const float* prototypes = outputs[1].GetTensorData<float>();
+			// Extract output tensors.  Use GetTensorMutableData (not GetTensorData):
+			// on DML the const variant hangs on AMD after ~8 calls.  The data is
+			// only read, despite the name.
+			const float* detections = outputs[0].GetTensorMutableData<float>();
+			const float* prototypes = outputs[1].GetTensorMutableData<float>();

 			// Get tensor shapes
 			auto detectionShape = outputs[0].GetTensorTypeAndShapeInfo().GetShape(); // [1, 116, N]
--- a/modules/ANSODEngine/ANSONNXSEG.h
+++ b/modules/ANSODEngine/ANSONNXSEG.h
@@ -51,8 +51,10 @@ namespace ANSCENTER {
        void warmupModel();
        bool Init(const std::string& modelPath, bool useGPU=true, int deviceId = 0);
        cv::Mat preprocess(const cv::Mat& image,float*& blobPtr,std::vector<int64_t>& inputTensorShape);
+        // outputs is non-const because GetTensorMutableData() requires a
+        // non-const Ort::Value receiver — see ANSONNXCL.h for full note.
        std::vector<Object> postprocess(const cv::Size& origSize,const cv::Size& letterboxSize,
-                                       const std::vector<Ort::Value>& outputs, const std::string& camera_id);
+                                       std::vector<Ort::Value>& outputs, const std::string& camera_id);
        std::vector<Object> segment(const cv::Mat& image, const std::string& camera_id);
        std::vector<cv::Point2f> maskToPolygon(const cv::Mat& binaryMask,
            const cv::Rect& boundingBox,
--- a/modules/ANSODEngine/ANSYOLO12OD.cpp
+++ b/modules/ANSODEngine/ANSYOLO12OD.cpp
@@ -518,14 +518,16 @@ namespace ANSCENTER {
        }
    }
    std::vector<Object> YOLO12OD::postprocess(const cv::Size& originalImageSize, const cv::Size& resizedImageShape,
-                                            const std::vector<Ort::Value>& outputTensors,
-                                            float confThreshold, float iouThreshold) 
+                                            std::vector<Ort::Value>& outputTensors,
+                                            float confThreshold, float iouThreshold)
    {
 		std::lock_guard<std::recursive_mutex> lock(_mutex);
        try {

            std::vector<Object> detections;
-            const float* rawOutput = outputTensors[0].GetTensorData<float>(); // Extract raw output data from the first output tensor
+            // Use GetTensorMutableData (not GetTensorData): on DML the const variant
+            // hangs on AMD after ~8 calls.  The data is only read, despite the name.
+            const float* rawOutput = outputTensors[0].GetTensorMutableData<float>();
            const std::vector<int64_t> outputShape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape();

            // Determine the number of features and detections
--- a/modules/ANSODEngine/ANSYOLO12OD.h
+++ b/modules/ANSODEngine/ANSYOLO12OD.h
@@ -49,8 +49,11 @@ namespace ANSCENTER {
            std::vector<Object> detect(const cv::Mat& image, float confThreshold = 0.4f, float iouThreshold = 0.45f);
            //cv::Mat preprocess(const cv::Mat& image, float*& blob, std::vector<int64_t>& inputTensorShape);
            cv::Mat preprocess(const cv::Mat& image, std::vector<float>& blob, std::vector<int64_t>& inputTensorShape);
+            // outputTensors is non-const because GetTensorMutableData()
+            // requires a non-const Ort::Value receiver — see ANSONNXCL.h
+            // for full note.
            std::vector<Object> postprocess(const cv::Size& originalImageSize, const cv::Size& resizedImageShape,
-                                            const std::vector<Ort::Value>& outputTensors,
+                                            std::vector<Ort::Value>& outputTensors,
                                            float confThreshold, float iouThreshold);

        private: