Fix AMD DirectML hang by changing from GetTensorData<T>() to GetTensorMutableData<T>()
This commit is contained in:
@@ -575,7 +575,7 @@ namespace ANSCENTER
|
||||
return false;
|
||||
}
|
||||
}
|
||||
std::vector<Object> ANSONNXCL::postprocess(const std::vector<Ort::Value>& outputTensors, const std::string& camera_id) {
|
||||
std::vector<Object> ANSONNXCL::postprocess(std::vector<Ort::Value>& outputTensors, const std::string& camera_id) {
|
||||
ANS_DBG("ANSONNXCL_pp", "ENTRY tensors=%zu cam=%s this=%p",
|
||||
outputTensors.size(), camera_id.c_str(), (void*)this);
|
||||
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
||||
@@ -589,8 +589,16 @@ namespace ANSCENTER
|
||||
return {};
|
||||
}
|
||||
|
||||
ANS_DBG("ANSONNXCL_pp", "GetTensorData<float>");
|
||||
const float* rawOutput = outputTensors[0].GetTensorData<float>();
|
||||
ANS_DBG("ANSONNXCL_pp", "GetTensorMutableData<float>");
|
||||
// GetTensorMutableData (not GetTensorData) on DirectML. The const
|
||||
// GetTensorData triggers a per-call host-readable mapping that on
|
||||
// AMD DML exhausts a small staging-buffer pool after ~8 calls and
|
||||
// blocks indefinitely. GetTensorMutableData returns the existing
|
||||
// host-accessible pointer directly with no per-call mapping cost.
|
||||
// Same pattern used by every output-tensor read in ANSONNXYOLO
|
||||
// and engines/ONNXEngine. Safe on all EPs (CUDA/OpenVINO/CPU);
|
||||
// we read the data only, never mutate it.
|
||||
const float* rawOutput = outputTensors[0].GetTensorMutableData<float>();
|
||||
if (!rawOutput) {
|
||||
ANS_DBG("ANSONNXCL_pp", "EARLY-RETURN rawOutput=null");
|
||||
this->_logger.LogError("ANSONNXCL::postprocess", "rawOutput pointer is null", __FILE__, __LINE__);
|
||||
|
||||
@@ -28,7 +28,11 @@ namespace ANSCENTER {
|
||||
void warmupModel();
|
||||
bool Init(const std::string& modelPath, const cv::Size& targetInputShape, bool useGPU = true);
|
||||
bool preprocess(const cv::Mat& image, float*& blob, std::vector<int64_t>& inputTensorShape);
|
||||
std::vector<Object> postprocess(const std::vector<Ort::Value>& outputTensors, const std::string& camera_id);
|
||||
// outputTensors is non-const because GetTensorMutableData() (the
|
||||
// ORT API that doesn't hang on AMD DirectML) requires a non-const
|
||||
// Ort::Value receiver. See comment at the GetTensorMutableData
|
||||
// call site in postprocess() for the full rationale.
|
||||
std::vector<Object> postprocess(std::vector<Ort::Value>& outputTensors, const std::string& camera_id);
|
||||
std::vector<Object> classify(const cv::Mat& image, const std::string& camera_id);
|
||||
|
||||
private:
|
||||
|
||||
@@ -1089,7 +1089,7 @@ namespace ANSCENTER {
|
||||
std::vector<Object> ANSONNXOBB::postprocess(
|
||||
const cv::Size& originalImageSize,
|
||||
const cv::Size& resizedImageShape,
|
||||
const std::vector<Ort::Value>& outputTensors,
|
||||
std::vector<Ort::Value>& outputTensors,
|
||||
int topk,
|
||||
const std::string& camera_id)
|
||||
{
|
||||
@@ -1103,8 +1103,10 @@ namespace ANSCENTER {
|
||||
return {};
|
||||
}
|
||||
|
||||
// Extract output tensor data and shape [1, num_features, num_detections]
|
||||
const float* rawOutput = outputTensors[0].GetTensorData<float>();
|
||||
// Extract output tensor data and shape [1, num_features, num_detections].
|
||||
// GetTensorMutableData (not GetTensorData) on DML — const variant
|
||||
// hangs on AMD after ~8 calls. Read-only despite the name.
|
||||
const float* rawOutput = outputTensors[0].GetTensorMutableData<float>();
|
||||
const std::vector<int64_t> outputShape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape();
|
||||
|
||||
if (outputShape.size() < 3) {
|
||||
|
||||
@@ -74,10 +74,12 @@ namespace ANSCENTER {
|
||||
void warmupModel();
|
||||
bool Init(const std::string& modelPath, bool useGPU=true, int deviceId = 0);
|
||||
cv::Mat preprocess(const cv::Mat& image, float*& blob, std::vector<int64_t>& inputTensorShape);
|
||||
// outputTensors is non-const because GetTensorMutableData() requires
|
||||
// a non-const Ort::Value receiver — see ANSONNXCL.h for full note.
|
||||
std::vector<Object> postprocess(
|
||||
const cv::Size& originalImageSize,
|
||||
const cv::Size& resizedImageShape,
|
||||
const std::vector<Ort::Value>& outputTensors, int topk,
|
||||
std::vector<Ort::Value>& outputTensors, int topk,
|
||||
const std::string& camera_id);
|
||||
std::vector<Object> detect(const cv::Mat& image, const std::string& camera_id);
|
||||
private:
|
||||
|
||||
@@ -759,7 +759,7 @@ namespace ANSCENTER {
|
||||
std::vector<Object> ANSONNXPOSE::postprocess(
|
||||
const cv::Size& originalImageSize,
|
||||
const cv::Size& resizedImageShape,
|
||||
const std::vector<Ort::Value>& outputTensors,
|
||||
std::vector<Ort::Value>& outputTensors,
|
||||
const std::string& camera_id)
|
||||
{
|
||||
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
||||
@@ -773,7 +773,9 @@ namespace ANSCENTER {
|
||||
return {};
|
||||
}
|
||||
|
||||
const float* rawOutput = outputTensors[0].GetTensorData<float>();
|
||||
// GetTensorMutableData (not GetTensorData) on DML — const variant
|
||||
// hangs on AMD after ~8 calls. Read-only despite the name.
|
||||
const float* rawOutput = outputTensors[0].GetTensorMutableData<float>();
|
||||
if (!rawOutput) {
|
||||
this->_logger.LogError("ANSONNXPOSE::postprocess", "rawOutput pointer is null", __FILE__, __LINE__);
|
||||
return {};
|
||||
|
||||
@@ -41,8 +41,10 @@ namespace ANSCENTER {
|
||||
void warmupModel();
|
||||
bool Init(const std::string& modelPath, bool useGPU=true, int deviceId = 0);
|
||||
cv::Mat preprocess(const cv::Mat& image, float*& blob, std::vector<int64_t>& inputTensorShape);
|
||||
// outputTensors is non-const because GetTensorMutableData() requires
|
||||
// a non-const Ort::Value receiver — see ANSONNXCL.h for full note.
|
||||
std::vector<Object> postprocess(const cv::Size& originalImageSize, const cv::Size& resizedImageShape,
|
||||
const std::vector<Ort::Value>& outputTensors, const std::string& camera_id);
|
||||
std::vector<Ort::Value>& outputTensors, const std::string& camera_id);
|
||||
std::vector<Object> detect(const cv::Mat& image, const std::string& camera_id);
|
||||
private:
|
||||
static std::atomic<int> instanceCounter_; // Thread-safe counter
|
||||
|
||||
@@ -726,7 +726,7 @@ namespace ANSCENTER {
|
||||
std::vector<Object> ANSONNXSEG::postprocess(
|
||||
const cv::Size& origSize,
|
||||
const cv::Size& letterboxSize,
|
||||
const std::vector<Ort::Value>& outputs,
|
||||
std::vector<Ort::Value>& outputs,
|
||||
const std::string& camera_id)
|
||||
{
|
||||
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
||||
@@ -738,9 +738,11 @@ namespace ANSCENTER {
|
||||
std::to_string(outputs.size()));
|
||||
}
|
||||
|
||||
// Extract output tensors
|
||||
const float* detections = outputs[0].GetTensorData<float>();
|
||||
const float* prototypes = outputs[1].GetTensorData<float>();
|
||||
// Extract output tensors. GetTensorMutableData (not GetTensorData)
|
||||
// on DML — const variant hangs on AMD after ~8 calls. Read-only
|
||||
// despite the name.
|
||||
const float* detections = outputs[0].GetTensorMutableData<float>();
|
||||
const float* prototypes = outputs[1].GetTensorMutableData<float>();
|
||||
|
||||
// Get tensor shapes
|
||||
auto detectionShape = outputs[0].GetTensorTypeAndShapeInfo().GetShape(); // [1, 116, N]
|
||||
|
||||
@@ -51,8 +51,10 @@ namespace ANSCENTER {
|
||||
void warmupModel();
|
||||
bool Init(const std::string& modelPath, bool useGPU=true, int deviceId = 0);
|
||||
cv::Mat preprocess(const cv::Mat& image,float*& blobPtr,std::vector<int64_t>& inputTensorShape);
|
||||
// outputs is non-const because GetTensorMutableData() requires a
|
||||
// non-const Ort::Value receiver — see ANSONNXCL.h for full note.
|
||||
std::vector<Object> postprocess(const cv::Size& origSize,const cv::Size& letterboxSize,
|
||||
const std::vector<Ort::Value>& outputs, const std::string& camera_id);
|
||||
std::vector<Ort::Value>& outputs, const std::string& camera_id);
|
||||
std::vector<Object> segment(const cv::Mat& image, const std::string& camera_id);
|
||||
std::vector<cv::Point2f> maskToPolygon(const cv::Mat& binaryMask,
|
||||
const cv::Rect& boundingBox,
|
||||
|
||||
@@ -518,14 +518,16 @@ namespace ANSCENTER {
|
||||
}
|
||||
}
|
||||
std::vector<Object> YOLO12OD::postprocess(const cv::Size& originalImageSize, const cv::Size& resizedImageShape,
|
||||
const std::vector<Ort::Value>& outputTensors,
|
||||
float confThreshold, float iouThreshold)
|
||||
std::vector<Ort::Value>& outputTensors,
|
||||
float confThreshold, float iouThreshold)
|
||||
{
|
||||
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
||||
try {
|
||||
|
||||
std::vector<Object> detections;
|
||||
const float* rawOutput = outputTensors[0].GetTensorData<float>(); // Extract raw output data from the first output tensor
|
||||
// GetTensorMutableData (not GetTensorData) on DML — const variant
|
||||
// hangs on AMD after ~8 calls. Read-only despite the name.
|
||||
const float* rawOutput = outputTensors[0].GetTensorMutableData<float>();
|
||||
const std::vector<int64_t> outputShape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape();
|
||||
|
||||
// Determine the number of features and detections
|
||||
|
||||
@@ -49,8 +49,11 @@ namespace ANSCENTER {
|
||||
std::vector<Object> detect(const cv::Mat& image, float confThreshold = 0.4f, float iouThreshold = 0.45f);
|
||||
//cv::Mat preprocess(const cv::Mat& image, float*& blob, std::vector<int64_t>& inputTensorShape);
|
||||
cv::Mat preprocess(const cv::Mat& image, std::vector<float>& blob, std::vector<int64_t>& inputTensorShape);
|
||||
// outputTensors is non-const because GetTensorMutableData()
|
||||
// requires a non-const Ort::Value receiver — see ANSONNXCL.h
|
||||
// for full note.
|
||||
std::vector<Object> postprocess(const cv::Size& originalImageSize, const cv::Size& resizedImageShape,
|
||||
const std::vector<Ort::Value>& outputTensors,
|
||||
std::vector<Ort::Value>& outputTensors,
|
||||
float confThreshold, float iouThreshold);
|
||||
|
||||
private:
|
||||
|
||||
Reference in New Issue
Block a user