Fix AMD DirectML hang by changing output-tensor reads from GetTensorData<T>() to GetTensorMutableData<T>()

2026-04-28 13:25:02 +10:00
parent f4b74c837e
commit dcf974c35c
18 changed files with 359 additions and 48 deletions
--- a/modules/ANSODEngine/ANSONNXCL.cpp
+++ b/modules/ANSODEngine/ANSONNXCL.cpp
@@ -575,7 +575,7 @@ namespace ANSCENTER
            return false;
        }
    }
-    std::vector<Object> ANSONNXCL::postprocess(const std::vector<Ort::Value>& outputTensors, const std::string& camera_id) {
+    std::vector<Object> ANSONNXCL::postprocess(std::vector<Ort::Value>& outputTensors, const std::string& camera_id) {
        ANS_DBG("ANSONNXCL_pp", "ENTRY tensors=%zu cam=%s this=%p",
            outputTensors.size(), camera_id.c_str(), (void*)this);
        std::lock_guard<std::recursive_mutex> lock(_mutex);
@@ -589,8 +589,16 @@ namespace ANSCENTER
                return {};
            }

-            ANS_DBG("ANSONNXCL_pp", "GetTensorData<float>");
-            const float* rawOutput = outputTensors[0].GetTensorData<float>();
+            ANS_DBG("ANSONNXCL_pp", "GetTensorMutableData<float>");
+            // Use GetTensorMutableData (not GetTensorData) on DirectML.  The const
+            // GetTensorData triggers a per-call host-readable mapping that on
+            // AMD DML exhausts a small staging-buffer pool after ~8 calls and
+            // blocks indefinitely.  GetTensorMutableData returns the existing
+            // host-accessible pointer directly with no per-call mapping cost.
+            // Same pattern used by every output-tensor read in ANSONNXYOLO
+            // and engines/ONNXEngine.  Safe on all EPs (CUDA/OpenVINO/CPU);
+            // we read the data only, never mutate it.
+            const float* rawOutput = outputTensors[0].GetTensorMutableData<float>();
            if (!rawOutput) {
                ANS_DBG("ANSONNXCL_pp", "EARLY-RETURN rawOutput=null");
                this->_logger.LogError("ANSONNXCL::postprocess", "rawOutput pointer is null", __FILE__, __LINE__);