Fix hang on AMD DirectML by changing from GetTensorData<T>() to GetTensorMutableData<T>()

This commit is contained in:
2026-04-28 13:25:02 +10:00
parent f4b74c837e
commit dcf974c35c
18 changed files with 359 additions and 48 deletions

View File

@@ -575,7 +575,7 @@ namespace ANSCENTER
return false;
}
}
std::vector<Object> ANSONNXCL::postprocess(const std::vector<Ort::Value>& outputTensors, const std::string& camera_id) {
std::vector<Object> ANSONNXCL::postprocess(std::vector<Ort::Value>& outputTensors, const std::string& camera_id) {
ANS_DBG("ANSONNXCL_pp", "ENTRY tensors=%zu cam=%s this=%p",
outputTensors.size(), camera_id.c_str(), (void*)this);
std::lock_guard<std::recursive_mutex> lock(_mutex);
@@ -589,8 +589,16 @@ namespace ANSCENTER
return {};
}
ANS_DBG("ANSONNXCL_pp", "GetTensorData<float>");
const float* rawOutput = outputTensors[0].GetTensorData<float>();
ANS_DBG("ANSONNXCL_pp", "GetTensorMutableData<float>");
// GetTensorMutableData (not GetTensorData) on DirectML. The const
// GetTensorData triggers a per-call host-readable mapping that on
// AMD DML exhausts a small staging-buffer pool after ~8 calls and
// blocks indefinitely. GetTensorMutableData returns the existing
// host-accessible pointer directly with no per-call mapping cost.
// Same pattern used by every output-tensor read in ANSONNXYOLO
// and engines/ONNXEngine. Safe on all EPs (CUDA/OpenVINO/CPU);
// we read the data only, never mutate it.
const float* rawOutput = outputTensors[0].GetTensorMutableData<float>();
if (!rawOutput) {
ANS_DBG("ANSONNXCL_pp", "EARLY-RETURN rawOutput=null");
this->_logger.LogError("ANSONNXCL::postprocess", "rawOutput pointer is null", __FILE__, __LINE__);

View File

@@ -28,7 +28,11 @@ namespace ANSCENTER {
void warmupModel();
bool Init(const std::string& modelPath, const cv::Size& targetInputShape, bool useGPU = true);
bool preprocess(const cv::Mat& image, float*& blob, std::vector<int64_t>& inputTensorShape);
std::vector<Object> postprocess(const std::vector<Ort::Value>& outputTensors, const std::string& camera_id);
// outputTensors is non-const because GetTensorMutableData() (the
// ORT API that doesn't hang on AMD DirectML) requires a non-const
// Ort::Value receiver. See comment at the GetTensorMutableData
// call site in postprocess() for the full rationale.
std::vector<Object> postprocess(std::vector<Ort::Value>& outputTensors, const std::string& camera_id);
std::vector<Object> classify(const cv::Mat& image, const std::string& camera_id);
private:

View File

@@ -1089,7 +1089,7 @@ namespace ANSCENTER {
std::vector<Object> ANSONNXOBB::postprocess(
const cv::Size& originalImageSize,
const cv::Size& resizedImageShape,
const std::vector<Ort::Value>& outputTensors,
std::vector<Ort::Value>& outputTensors,
int topk,
const std::string& camera_id)
{
@@ -1103,8 +1103,10 @@ namespace ANSCENTER {
return {};
}
// Extract output tensor data and shape [1, num_features, num_detections]
const float* rawOutput = outputTensors[0].GetTensorData<float>();
// Extract output tensor data and shape [1, num_features, num_detections].
// GetTensorMutableData (not GetTensorData) on DML — const variant
// hangs on AMD after ~8 calls. Read-only despite the name.
const float* rawOutput = outputTensors[0].GetTensorMutableData<float>();
const std::vector<int64_t> outputShape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape();
if (outputShape.size() < 3) {

View File

@@ -74,10 +74,12 @@ namespace ANSCENTER {
void warmupModel();
bool Init(const std::string& modelPath, bool useGPU=true, int deviceId = 0);
cv::Mat preprocess(const cv::Mat& image, float*& blob, std::vector<int64_t>& inputTensorShape);
// outputTensors is non-const because GetTensorMutableData() requires
// a non-const Ort::Value receiver — see ANSONNXCL.h for full note.
std::vector<Object> postprocess(
const cv::Size& originalImageSize,
const cv::Size& resizedImageShape,
const std::vector<Ort::Value>& outputTensors, int topk,
std::vector<Ort::Value>& outputTensors, int topk,
const std::string& camera_id);
std::vector<Object> detect(const cv::Mat& image, const std::string& camera_id);
private:

View File

@@ -759,7 +759,7 @@ namespace ANSCENTER {
std::vector<Object> ANSONNXPOSE::postprocess(
const cv::Size& originalImageSize,
const cv::Size& resizedImageShape,
const std::vector<Ort::Value>& outputTensors,
std::vector<Ort::Value>& outputTensors,
const std::string& camera_id)
{
std::lock_guard<std::recursive_mutex> lock(_mutex);
@@ -773,7 +773,9 @@ namespace ANSCENTER {
return {};
}
const float* rawOutput = outputTensors[0].GetTensorData<float>();
// GetTensorMutableData (not GetTensorData) on DML — const variant
// hangs on AMD after ~8 calls. Read-only despite the name.
const float* rawOutput = outputTensors[0].GetTensorMutableData<float>();
if (!rawOutput) {
this->_logger.LogError("ANSONNXPOSE::postprocess", "rawOutput pointer is null", __FILE__, __LINE__);
return {};

View File

@@ -41,8 +41,10 @@ namespace ANSCENTER {
void warmupModel();
bool Init(const std::string& modelPath, bool useGPU=true, int deviceId = 0);
cv::Mat preprocess(const cv::Mat& image, float*& blob, std::vector<int64_t>& inputTensorShape);
// outputTensors is non-const because GetTensorMutableData() requires
// a non-const Ort::Value receiver — see ANSONNXCL.h for full note.
std::vector<Object> postprocess(const cv::Size& originalImageSize, const cv::Size& resizedImageShape,
const std::vector<Ort::Value>& outputTensors, const std::string& camera_id);
std::vector<Ort::Value>& outputTensors, const std::string& camera_id);
std::vector<Object> detect(const cv::Mat& image, const std::string& camera_id);
private:
static std::atomic<int> instanceCounter_; // Thread-safe counter

View File

@@ -726,7 +726,7 @@ namespace ANSCENTER {
std::vector<Object> ANSONNXSEG::postprocess(
const cv::Size& origSize,
const cv::Size& letterboxSize,
const std::vector<Ort::Value>& outputs,
std::vector<Ort::Value>& outputs,
const std::string& camera_id)
{
std::lock_guard<std::recursive_mutex> lock(_mutex);
@@ -738,9 +738,11 @@ namespace ANSCENTER {
std::to_string(outputs.size()));
}
// Extract output tensors
const float* detections = outputs[0].GetTensorData<float>();
const float* prototypes = outputs[1].GetTensorData<float>();
// Extract output tensors. GetTensorMutableData (not GetTensorData)
// on DML — const variant hangs on AMD after ~8 calls. Read-only
// despite the name.
const float* detections = outputs[0].GetTensorMutableData<float>();
const float* prototypes = outputs[1].GetTensorMutableData<float>();
// Get tensor shapes
auto detectionShape = outputs[0].GetTensorTypeAndShapeInfo().GetShape(); // [1, 116, N]

View File

@@ -51,8 +51,10 @@ namespace ANSCENTER {
void warmupModel();
bool Init(const std::string& modelPath, bool useGPU=true, int deviceId = 0);
cv::Mat preprocess(const cv::Mat& image,float*& blobPtr,std::vector<int64_t>& inputTensorShape);
// outputs is non-const because GetTensorMutableData() requires a
// non-const Ort::Value receiver — see ANSONNXCL.h for full note.
std::vector<Object> postprocess(const cv::Size& origSize,const cv::Size& letterboxSize,
const std::vector<Ort::Value>& outputs, const std::string& camera_id);
std::vector<Ort::Value>& outputs, const std::string& camera_id);
std::vector<Object> segment(const cv::Mat& image, const std::string& camera_id);
std::vector<cv::Point2f> maskToPolygon(const cv::Mat& binaryMask,
const cv::Rect& boundingBox,

View File

@@ -518,14 +518,16 @@ namespace ANSCENTER {
}
}
std::vector<Object> YOLO12OD::postprocess(const cv::Size& originalImageSize, const cv::Size& resizedImageShape,
const std::vector<Ort::Value>& outputTensors,
float confThreshold, float iouThreshold)
std::vector<Ort::Value>& outputTensors,
float confThreshold, float iouThreshold)
{
std::lock_guard<std::recursive_mutex> lock(_mutex);
try {
std::vector<Object> detections;
const float* rawOutput = outputTensors[0].GetTensorData<float>(); // Extract raw output data from the first output tensor
// GetTensorMutableData (not GetTensorData) on DML — const variant
// hangs on AMD after ~8 calls. Read-only despite the name.
const float* rawOutput = outputTensors[0].GetTensorMutableData<float>();
const std::vector<int64_t> outputShape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape();
// Determine the number of features and detections

View File

@@ -49,8 +49,11 @@ namespace ANSCENTER {
std::vector<Object> detect(const cv::Mat& image, float confThreshold = 0.4f, float iouThreshold = 0.45f);
//cv::Mat preprocess(const cv::Mat& image, float*& blob, std::vector<int64_t>& inputTensorShape);
cv::Mat preprocess(const cv::Mat& image, std::vector<float>& blob, std::vector<int64_t>& inputTensorShape);
// outputTensors is non-const because GetTensorMutableData()
// requires a non-const Ort::Value receiver — see ANSONNXCL.h
// for full note.
std::vector<Object> postprocess(const cv::Size& originalImageSize, const cv::Size& resizedImageShape,
const std::vector<Ort::Value>& outputTensors,
std::vector<Ort::Value>& outputTensors,
float confThreshold, float iouThreshold);
private: