2026-03-28 16:54:11 +11:00
|
|
|
#include "ANSONNXYOLO.h"
|
|
|
|
|
#include "Utility.h"
|
|
|
|
|
#include "ANSGpuFrameRegistry.h"
|
|
|
|
|
#include "NV12PreprocessHelper.h" // tl_currentGpuFrame()
|
|
|
|
|
#include <numeric> // std::iota
|
|
|
|
|
#include <cmath>
|
2026-04-10 17:13:47 +10:00
|
|
|
#include <chrono> // WarmUpEngine() timing
|
2026-03-28 16:54:11 +11:00
|
|
|
|
|
|
|
|
namespace ANSCENTER {
|
|
|
|
|
|
|
|
|
|
// ====================================================================
|
|
|
|
|
// ONNXYOLO — BasicOrtHandler subclass for Ultralytics YOLO
|
|
|
|
|
// ====================================================================
|
|
|
|
|
|
|
|
|
|
// Construct from an ONNX model path with the default engine.
// Derives the network input resolution from the model's NCHW input
// dims; dynamic (-1) height/width falls back to 640x640.
ONNXYOLO::ONNXYOLO(const std::string& _onnx_path, unsigned int _num_threads)
    : BasicOrtHandler(_onnx_path, _num_threads)
{
    cv::Size netSize(640, 640);   // Ultralytics default fallback
    if (input_node_dims.size() >= 4) {
        const int netH = static_cast<int>(input_node_dims[2]);
        const int netW = static_cast<int>(input_node_dims[3]);
        isDynamicInputShape = (netH == -1 || netW == -1);
        if (!isDynamicInputShape)
            netSize = cv::Size(netW, netH);
    }
    inputImageShape = netSize;
}
|
|
|
|
|
|
|
|
|
|
// Construct from an ONNX model path with an explicitly selected
// execution engine. Input-resolution handling is identical to the
// default-engine constructor: NCHW dims from the model, dynamic
// (-1) height/width falls back to 640x640.
ONNXYOLO::ONNXYOLO(const std::string& _onnx_path, EngineType engineType,
    unsigned int _num_threads)
    : BasicOrtHandler(_onnx_path, engineType, _num_threads)
{
    cv::Size netSize(640, 640);   // Ultralytics default fallback
    if (input_node_dims.size() >= 4) {
        const int netH = static_cast<int>(input_node_dims[2]);
        const int netW = static_cast<int>(input_node_dims[3]);
        isDynamicInputShape = (netH == -1 || netW == -1);
        if (!isDynamicInputShape)
            netSize = cv::Size(netW, netH);
    }
    inputImageShape = netSize;
}
|
|
|
|
|
|
|
|
|
|
// ------------------------------------------------------------------
|
|
|
|
|
// letterBox — Ultralytics-compatible LetterBox transform
|
|
|
|
|
// ------------------------------------------------------------------
|
|
|
|
|
// ------------------------------------------------------------------
// letterBox — Ultralytics-compatible LetterBox transform.
// Scales `image` uniformly so it fits inside `newShape` (optionally
// never upscaling), then pads symmetrically with `color`. The
// -0.1/+0.1 rounding trick reproduces Ultralytics' deterministic
// split of an odd pad between the two sides. `stride` is accepted
// for signature compatibility but not used here.
// ------------------------------------------------------------------
void ONNXYOLO::letterBox(const cv::Mat& image, cv::Mat& outImage,
    const cv::Size& newShape,
    const cv::Scalar& color,
    bool scaleUp, int stride)
{
    // Uniform scale that fits the whole image inside newShape.
    float scale = std::min(static_cast<float>(newShape.height) / image.rows,
                           static_cast<float>(newShape.width) / image.cols);
    if (!scaleUp)
        scale = std::min(scale, 1.0f);

    const int scaledW = static_cast<int>(std::round(image.cols * scale));
    const int scaledH = static_cast<int>(std::round(image.rows * scale));

    // Padding per side (half of the total slack in each dimension).
    const float halfPadW = (newShape.width - scaledW) / 2.0f;
    const float halfPadH = (newShape.height - scaledH) / 2.0f;

    if (image.cols == scaledW && image.rows == scaledH)
        outImage = image.clone();
    else
        cv::resize(image, outImage, cv::Size(scaledW, scaledH),
                   0, 0, cv::INTER_LINEAR);

    // Ultralytics -0.1/+0.1 trick for deterministic padding split.
    const int top    = static_cast<int>(std::round(halfPadH - 0.1f));
    const int bottom = static_cast<int>(std::round(halfPadH + 0.1f));
    const int left   = static_cast<int>(std::round(halfPadW - 0.1f));
    const int right  = static_cast<int>(std::round(halfPadW + 0.1f));

    cv::copyMakeBorder(outImage, outImage, top, bottom, left, right,
                       cv::BORDER_CONSTANT, color);
}
|
|
|
|
|
|
|
|
|
|
// ------------------------------------------------------------------
|
|
|
|
|
// transform — BGR → RGB, letterbox, /255, HWC→CHW
|
|
|
|
|
// ------------------------------------------------------------------
|
|
|
|
|
// ------------------------------------------------------------------
// transform — preprocess one image into the model's input tensor:
// grayscale→BGR (if needed), letterbox (or plain resize for a
// classification head), BGR→RGB, /255 to float, HWC→CHW.
// The returned Ort::Value aliases input_values_handler, which stays
// alive on the object.
// ------------------------------------------------------------------
Ort::Value ONNXYOLO::transform(const cv::Mat& mat)
{
    // Promote single-channel input to 3-channel BGR.
    cv::Mat src;
    if (mat.channels() == 1)
        cv::cvtColor(mat, src, cv::COLOR_GRAY2BGR);
    else
        src = mat;

    // A 2-D first output ([B, nc]) marks a classification head,
    // which is preprocessed with a plain resize (matches ANSONNXCL).
    const bool classifyHead = !output_node_dims.empty()
        && output_node_dims[0].size() == 2;

    cv::Mat net;
    if (classifyHead)
        cv::resize(src, net, cv::Size(inputImageShape.width, inputImageShape.height),
                   0, 0, cv::INTER_LINEAR);
    else
        letterBox(src, net, inputImageShape);   // det/seg/pose/OBB

    cv::cvtColor(net, net, cv::COLOR_BGR2RGB);
    net.convertTo(net, CV_32FC3, 1.0 / 255.0);

    const int nCh = net.channels();
    const int rows = net.rows;
    const int cols = net.cols;
    const size_t planeSize = static_cast<size_t>(rows) * cols;

    input_node_dims = { 1, 3, rows, cols };
    input_tensor_size = 1 * 3 * planeSize;
    input_values_handler.resize(input_tensor_size);

    // Split channels directly into views over the tensor buffer —
    // this performs the HWC→CHW repack without an extra copy.
    std::vector<cv::Mat> planes(nCh);
    for (int c = 0; c < nCh; ++c) {
        planes[c] = cv::Mat(rows, cols, CV_32FC1,
                            input_values_handler.data() + c * planeSize);
    }
    cv::split(net, planes);

    return Ort::Value::CreateTensor<float>(
        *memory_info_handler,
        input_values_handler.data(),
        input_tensor_size,
        input_node_dims.data(),
        input_node_dims.size());
}
|
|
|
|
|
|
|
|
|
|
// ------------------------------------------------------------------
// transformBatch — batched version of transform(): preprocess every
// image (grayscale→BGR, letterbox or resize, BGR→RGB, /255) and pack
// all of them into one NCHW tensor. Throws std::runtime_error on an
// empty batch or an empty image inside it.
// ------------------------------------------------------------------
Ort::Value ONNXYOLO::transformBatch(const std::vector<cv::Mat>& images)
{
    if (images.empty())
        throw std::runtime_error("ONNXYOLO::transformBatch: empty input");

    const size_t batchSize = images.size();

    // A 2-D first output ([B, nc]) marks a classification head,
    // which uses a plain resize instead of the letterbox.
    const bool classifyHead = !output_node_dims.empty()
        && output_node_dims[0].size() == 2;

    // Per-image preprocessing.
    std::vector<cv::Mat> prepared;
    prepared.reserve(batchSize);
    for (const auto& src : images) {
        if (src.empty())
            throw std::runtime_error("ONNXYOLO::transformBatch: empty image in batch");

        // Promote single-channel input to 3-channel BGR.
        cv::Mat bgr;
        if (src.channels() == 1)
            cv::cvtColor(src, bgr, cv::COLOR_GRAY2BGR);
        else
            bgr = src;

        cv::Mat net;
        if (classifyHead)
            cv::resize(bgr, net, cv::Size(inputImageShape.width, inputImageShape.height),
                       0, 0, cv::INTER_LINEAR);
        else
            letterBox(bgr, net, inputImageShape);

        cv::cvtColor(net, net, cv::COLOR_BGR2RGB);
        net.convertTo(net, CV_32FC3, 1.0 / 255.0);
        prepared.push_back(net);
    }

    // All letterboxed canvases share the same size, so the first one
    // defines the tensor's spatial dims.
    const int rows = prepared[0].rows;
    const int cols = prepared[0].cols;
    const size_t planeSize = static_cast<size_t>(rows) * cols;

    input_node_dims = {
        static_cast<int64_t>(batchSize), 3,
        static_cast<int64_t>(rows),
        static_cast<int64_t>(cols)
    };
    input_tensor_size = batchSize * 3 * planeSize;
    input_values_handler.resize(input_tensor_size);

    // HWC→CHW repack, one image at a time, by splitting channels into
    // views over this image's slice of the tensor buffer.
    for (size_t b = 0; b < batchSize; ++b) {
        float* dst = input_values_handler.data() + b * 3 * planeSize;
        std::vector<cv::Mat> planes(3);
        for (int c = 0; c < 3; ++c) {
            planes[c] = cv::Mat(rows, cols, CV_32FC1, dst + c * planeSize);
        }
        cv::split(prepared[b], planes);
    }

    return Ort::Value::CreateTensor<float>(
        *memory_info_handler,
        input_values_handler.data(),
        input_tensor_size,
        input_node_dims.data(),
        input_node_dims.size());
}
|
|
|
|
|
|
|
|
|
|
// ------------------------------------------------------------------
|
|
|
|
|
// detect — full pipeline with auto task detection
|
|
|
|
|
//
|
|
|
|
|
// Decision logic:
|
|
|
|
|
// 2 outputs (second 4D) → segmentation
|
|
|
|
|
// 1 output, 2D → classification
|
|
|
|
|
// 1 output, 3D end2end dim2=6 → detection
|
|
|
|
|
// 1 output, 3D end2end dim2=7 → OBB
|
|
|
|
|
// 1 output, 3D end2end dim2>7 → pose (if (dim2-6)%3==0)
|
|
|
|
|
// 1 output, 3D legacy → detect/obb/pose by nc
|
|
|
|
|
// ------------------------------------------------------------------
|
|
|
|
|
// ------------------------------------------------------------------
// detect — full inference pipeline with automatic task detection.
//
// Runs preprocessing (transform), a single ONNX Runtime session Run,
// an output-shape sanity check, then dispatches to the matching
// postprocess routine based on the output tensor layout:
//   2 outputs (second 4D)            → segmentation (legacy/end2end)
//   1 output, 2D [B, nc]             → classification
//   1 output, 3D end2end dim2=6      → detection
//   1 output, 3D end2end dim2=7      → OBB
//   1 output, 3D end2end dim2>7      → pose (if (dim2-6)%3==0)
//   1 output, 3D legacy              → detect/obb/pose by class count
// Returns an empty vector for an empty image or implausible output
// shapes. Sets lastWasClassification when the classify path runs.
// numKPS > 0 forces the keypoint count on the pose paths.
// ------------------------------------------------------------------
std::vector<Object> ONNXYOLO::detect(const cv::Mat& image,
    const std::vector<std::string>& classNames,
    float confThreshold,
    float iouThreshold,
    int numKPS)
{
    lastWasClassification = false;

    if (image.empty())
        return {};

    // Preprocess; transform() also rewrites input_node_dims to the
    // actual {1,3,H,W} fed to the model (used below for resizedShape).
    Ort::Value inputTensor = transform(image);

    auto outputTensors = ort_session->Run(
        Ort::RunOptions{ nullptr },
        input_node_names.data(),
        &inputTensor, 1,
        output_node_names.data(),
        num_outputs);

    // ── Output shape sanity check ───────────────────────────────────
    // DirectML on some AMD configurations has been observed to return
    // output tensors whose dim[1]/dim[2] values don't match what the
    // ONNX graph actually produced, which propagates into
    // postprocessLegacy / postprocessEndToEnd as huge numBoxes /
    // numChannels values and causes multi-terabyte cv::Mat allocations
    // inside the `cv::Mat(numChannels, numBoxes, CV_32F, ...).t()`
    // call (observed as "Failed to allocate 3522082959360 bytes" on
    // Ryzen APUs). Bail out early here instead of letting the
    // postprocess layer try to materialise a 3.5 TB buffer.
    //
    // Sane upper bounds for Ultralytics YOLO outputs:
    //   • legacy [1, 84..300, 8400..25200] → max dim ≈ 30k
    //   • end2end [1, 300, 6..56] → max dim ≈ 300
    //   • segmentation proto mask [1, 32, 160, 160] → max dim ≈ 160
    //   • classification [1, 1000] → max dim ≈ 1k
    // 1,000,000 is ~30x the largest real-world dim and catches the
    // garbage values without clipping any legitimate model.
    constexpr int64_t kMaxOutputDim = 1000000;
    for (size_t t = 0; t < outputTensors.size(); ++t) {
        const auto shape = outputTensors[t].GetTensorTypeAndShapeInfo().GetShape();
        for (size_t d = 0; d < shape.size(); ++d) {
            if (shape[d] < 0 || shape[d] > kMaxOutputDim) {
                std::cerr << "[ONNXYOLO] detect: output[" << t
                    << "] dim[" << d << "]=" << shape[d]
                    << " is out of range — refusing to postprocess."
                    << std::endl;
                return {};
            }
        }
    }

    // Model input size actually used this run (W, H), needed by the
    // postprocess routines to undo the letterbox mapping.
    const cv::Size resizedShape(
        static_cast<int>(input_node_dims[3]),
        static_cast<int>(input_node_dims[2]));

    const size_t numOutputs = outputTensors.size();

    // ── Segmentation: 2 outputs (detections + proto masks) ──────────
    if (numOutputs >= 2) {
        const auto protoShape = outputTensors[1].GetTensorTypeAndShapeInfo().GetShape();
        if (protoShape.size() == 4) {
            const auto shape0 = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape();
            // Legacy: shape [B, channels, num_boxes] e.g. [1, 116, 8400] -> shape[1] < shape[2]
            // End2end: shape [B, max_det, features] e.g. [1, 300, 38] -> shape[1] > shape[2]
            if (shape0.size() >= 3 && shape0[1] < shape0[2]) {
                return postprocessSegLegacy(image.size(), resizedShape,
                    outputTensors, classNames,
                    confThreshold, iouThreshold);
            }
            else {
                return postprocessSegEndToEnd(image.size(), resizedShape,
                    outputTensors, classNames, confThreshold);
            }
        }
    }

    const auto shape0 = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape();

    // ── Classification: 2D output [B, nc] ───────────────────────────
    if (shape0.size() == 2) {
        lastWasClassification = true;
        return postprocessClassify(outputTensors, classNames, image.size());
    }

    // Anything other than a 3D tensor cannot be handled below.
    if (shape0.size() < 3)
        return {};

    // ── Determine end2end vs legacy ─────────────────────────────────
    // End2end: shape [B, max_det, features] where max_det < features is false
    //          typically [1, 300, 6/7/...] so shape[1] > shape[2]
    // Legacy:  shape [B, channels, num_boxes] where channels < num_boxes
    //          typically [1, 84, 8400] so shape[1] < shape[2]
    const bool isEndToEnd = (shape0[1] > shape0[2])
        || (shape0[2] <= 20); // very small dim2 = end2end

    if (isEndToEnd) {
        // End2end heads: feature count identifies the task.
        const int features = static_cast<int>(shape0[2]);
        if (features == 6) {
            // x1,y1,x2,y2,conf,classId → plain detection
            return postprocessEndToEnd(image.size(), resizedShape,
                outputTensors, classNames, confThreshold);
        }
        else if (features == 7) {
            // extra angle column → oriented bounding boxes
            return postprocessOBBEndToEnd(image.size(), resizedShape,
                outputTensors, classNames, confThreshold);
        }
        else if (features > 7 && (features - 6) % 3 == 0) {
            // 3 values per keypoint beyond the 6 box fields → pose
            int nk = (numKPS > 0) ? numKPS : (features - 6) / 3;
            return postprocessPoseEndToEnd(image.size(), resizedShape,
                outputTensors, classNames,
                confThreshold, nk);
        }
        // Fallback to detection
        return postprocessEndToEnd(image.size(), resizedShape,
            outputTensors, classNames, confThreshold);
    }
    else {
        // Legacy format: [B, channels, num_boxes]
        // channels = 4(bbox) + nc(scores) + extra_features
        const int nc = static_cast<int>(classNames.size());
        const int numChannels = static_cast<int>(shape0[1]);
        const int numBoxes = static_cast<int>(shape0[2]);
        const int extra = numChannels - 4;

        // Pose check: if numKPS is explicitly set, or we can detect keypoints
        if (numKPS > 0 && numChannels >= 4 + 1 + numKPS * 3) {
            return postprocessPoseLegacy(image.size(), resizedShape,
                outputTensors, classNames,
                confThreshold, iouThreshold, numKPS);
        }
        else if (nc > 0 && nc <= extra && extra > nc && (extra - nc) % 3 == 0 && (extra - nc) >= 3) {
            // Channels beyond the class scores decompose into keypoint
            // triplets → pose with an inferred keypoint count.
            int nk = (extra - nc) / 3;
            return postprocessPoseLegacy(image.size(), resizedShape,
                outputTensors, classNames,
                confThreshold, iouThreshold, nk);
        }
        else if (nc > 0 && nc <= extra && extra == nc + 1) {
            // Exactly one channel beyond the class scores → OBB angle.
            return postprocessOBBLegacy(image.size(), resizedShape,
                outputTensors, classNames,
                confThreshold, iouThreshold);
        }
        else if (nc > 0 && nc <= extra && extra == nc) {
            // Channels are exactly 4 + nc → plain detection.
            return postprocessLegacy(image.size(), resizedShape,
                outputTensors, classNames,
                confThreshold, iouThreshold);
        }
        else {
            // Class count doesn't match tensor — probe last channel
            // to distinguish OBB (angle values in [-pi, pi]) from detection
            bool likelyOBB = false;
            if (extra >= 2) {
                const float* rawOutput = outputTensors[0].GetTensorMutableData<float>();
                // Sample up to 100 values from the last channel; if
                // most lie in [-pi, pi] they are likely angles.
                int numSamples = std::min(numBoxes, 100);
                int angleCount = 0;
                for (int s = 0; s < numSamples; ++s) {
                    float v = rawOutput[(numChannels - 1) * numBoxes + s];
                    if (v >= -3.15f && v <= 3.15f) ++angleCount;
                }
                likelyOBB = (angleCount > numSamples * 8 / 10);
            }

            if (likelyOBB) {
                return postprocessOBBLegacy(image.size(), resizedShape,
                    outputTensors, classNames,
                    confThreshold, iouThreshold);
            }
            else if (numChannels == 56) {
                // 56 = 4 + 1 + 17*3: the standard COCO pose head.
                return postprocessPoseLegacy(image.size(), resizedShape,
                    outputTensors, classNames,
                    confThreshold, iouThreshold, 17);
            }
            else {
                return postprocessLegacy(image.size(), resizedShape,
                    outputTensors, classNames,
                    confThreshold, iouThreshold);
            }
        }
    }
}
|
|
|
|
|
|
|
|
|
|
// ====================================================================
|
|
|
|
|
// DETECTION — postprocess
|
|
|
|
|
// ====================================================================
|
|
|
|
|
|
|
|
|
|
std::vector<Object> ONNXYOLO::postprocessEndToEnd(
|
|
|
|
|
const cv::Size& originalImageSize,
|
|
|
|
|
const cv::Size& resizedImageShape,
|
2026-04-08 13:45:52 +10:00
|
|
|
std::vector<Ort::Value>& outputTensors,
|
2026-03-28 16:54:11 +11:00
|
|
|
const std::vector<std::string>& classNames,
|
|
|
|
|
float confThreshold)
|
|
|
|
|
{
|
|
|
|
|
if (outputTensors.empty()) return {};
|
|
|
|
|
|
2026-04-08 13:45:52 +10:00
|
|
|
const float* rawOutput = outputTensors[0].GetTensorMutableData<float>();
|
2026-03-28 16:54:11 +11:00
|
|
|
const auto outputShape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape();
|
|
|
|
|
if (outputShape.size() < 3) return {};
|
|
|
|
|
|
|
|
|
|
const int numDets = static_cast<int>(outputShape[1]);
|
|
|
|
|
const int numFeat = static_cast<int>(outputShape[2]);
|
|
|
|
|
|
|
|
|
|
const float origW = static_cast<float>(originalImageSize.width);
|
|
|
|
|
const float origH = static_cast<float>(originalImageSize.height);
|
|
|
|
|
const float modelW = static_cast<float>(resizedImageShape.width);
|
|
|
|
|
const float modelH = static_cast<float>(resizedImageShape.height);
|
|
|
|
|
const float gain = std::min(modelH / origH, modelW / origW);
|
|
|
|
|
const float padX = std::round((modelW - origW * gain) / 2.0f - 0.1f);
|
|
|
|
|
const float padY = std::round((modelH - origH * gain) / 2.0f - 0.1f);
|
|
|
|
|
const float invGain = 1.0f / gain;
|
|
|
|
|
|
|
|
|
|
std::vector<Object> results;
|
|
|
|
|
results.reserve(numDets);
|
|
|
|
|
|
|
|
|
|
for (int i = 0; i < numDets; ++i) {
|
|
|
|
|
const float* det = rawOutput + i * numFeat;
|
|
|
|
|
const float conf = det[4];
|
|
|
|
|
if (conf <= confThreshold) continue;
|
|
|
|
|
|
|
|
|
|
float x1 = (det[0] - padX) * invGain;
|
|
|
|
|
float y1 = (det[1] - padY) * invGain;
|
|
|
|
|
float x2 = (det[2] - padX) * invGain;
|
|
|
|
|
float y2 = (det[3] - padY) * invGain;
|
|
|
|
|
int classId = static_cast<int>(det[5]);
|
|
|
|
|
|
|
|
|
|
x1 = clamp(x1, 0.f, origW); y1 = clamp(y1, 0.f, origH);
|
|
|
|
|
x2 = clamp(x2, 0.f, origW); y2 = clamp(y2, 0.f, origH);
|
|
|
|
|
float w = x2 - x1, h = y2 - y1;
|
|
|
|
|
if (w < 1.f || h < 1.f) continue;
|
|
|
|
|
|
|
|
|
|
Object obj;
|
|
|
|
|
obj.classId = classId;
|
|
|
|
|
obj.confidence = conf;
|
|
|
|
|
obj.box = cv::Rect(static_cast<int>(x1), static_cast<int>(y1),
|
|
|
|
|
static_cast<int>(w), static_cast<int>(h));
|
|
|
|
|
if (classId >= 0 && classId < static_cast<int>(classNames.size()))
|
|
|
|
|
obj.className = classNames[classId];
|
|
|
|
|
results.push_back(std::move(obj));
|
|
|
|
|
}
|
|
|
|
|
return results;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
std::vector<Object> ONNXYOLO::postprocessLegacy(
|
|
|
|
|
const cv::Size& originalImageSize,
|
|
|
|
|
const cv::Size& resizedImageShape,
|
2026-04-08 13:45:52 +10:00
|
|
|
std::vector<Ort::Value>& outputTensors,
|
2026-03-28 16:54:11 +11:00
|
|
|
const std::vector<std::string>& classNames,
|
|
|
|
|
float confThreshold, float iouThreshold, int maxDet)
|
|
|
|
|
{
|
|
|
|
|
if (outputTensors.empty()) return {};
|
|
|
|
|
|
2026-04-08 13:45:52 +10:00
|
|
|
const float* rawOutput = outputTensors[0].GetTensorMutableData<float>();
|
2026-03-28 16:54:11 +11:00
|
|
|
const auto outputShape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape();
|
|
|
|
|
if (outputShape.size() < 3) return {};
|
|
|
|
|
|
|
|
|
|
const int numChannels = static_cast<int>(outputShape[1]);
|
|
|
|
|
const int numBoxes = static_cast<int>(outputShape[2]);
|
|
|
|
|
const int numClasses = numChannels - 4;
|
|
|
|
|
if (numClasses <= 0) return {};
|
|
|
|
|
|
|
|
|
|
cv::Mat output = cv::Mat(numChannels, numBoxes, CV_32F,
|
|
|
|
|
const_cast<float*>(rawOutput)).t();
|
|
|
|
|
|
|
|
|
|
const float origW = static_cast<float>(originalImageSize.width);
|
|
|
|
|
const float origH = static_cast<float>(originalImageSize.height);
|
|
|
|
|
const float modelW = static_cast<float>(resizedImageShape.width);
|
|
|
|
|
const float modelH = static_cast<float>(resizedImageShape.height);
|
|
|
|
|
const float gain = std::min(modelH / origH, modelW / origW);
|
|
|
|
|
const float padX = std::round((modelW - origW * gain) / 2.0f - 0.1f);
|
|
|
|
|
const float padY = std::round((modelH - origH * gain) / 2.0f - 0.1f);
|
|
|
|
|
const float invGain = 1.0f / gain;
|
|
|
|
|
|
|
|
|
|
struct Candidate { float x1, y1, x2, y2, conf; int classId; };
|
|
|
|
|
std::vector<Candidate> candidates;
|
|
|
|
|
candidates.reserve(numBoxes);
|
|
|
|
|
|
|
|
|
|
for (int i = 0; i < numBoxes; ++i) {
|
|
|
|
|
const float* row = output.ptr<float>(i);
|
|
|
|
|
const float* scoresPtr = row + 4;
|
|
|
|
|
float maxScore = -FLT_MAX;
|
|
|
|
|
int bestClass = -1;
|
|
|
|
|
for (int c = 0; c < numClasses; ++c) {
|
|
|
|
|
if (scoresPtr[c] > maxScore) { maxScore = scoresPtr[c]; bestClass = c; }
|
|
|
|
|
}
|
|
|
|
|
if (maxScore <= confThreshold) continue;
|
|
|
|
|
|
|
|
|
|
float cx = row[0], cy = row[1], w = row[2], h = row[3];
|
|
|
|
|
candidates.push_back({ cx - w*0.5f, cy - h*0.5f, cx + w*0.5f, cy + h*0.5f,
|
|
|
|
|
maxScore, bestClass });
|
|
|
|
|
}
|
|
|
|
|
if (candidates.empty()) return {};
|
|
|
|
|
|
|
|
|
|
// Class-aware NMS
|
|
|
|
|
constexpr float MAX_WH = 7680.0f;
|
|
|
|
|
std::vector<int> sortedIdx(candidates.size());
|
|
|
|
|
std::iota(sortedIdx.begin(), sortedIdx.end(), 0);
|
|
|
|
|
std::sort(sortedIdx.begin(), sortedIdx.end(),
|
|
|
|
|
[&](int a, int b) { return candidates[a].conf > candidates[b].conf; });
|
|
|
|
|
if (static_cast<int>(sortedIdx.size()) > 30000) sortedIdx.resize(30000);
|
|
|
|
|
|
|
|
|
|
std::vector<bool> suppressed(sortedIdx.size(), false);
|
|
|
|
|
std::vector<int> keepIndices;
|
|
|
|
|
keepIndices.reserve(maxDet);
|
|
|
|
|
|
|
|
|
|
for (size_t i = 0; i < sortedIdx.size() && static_cast<int>(keepIndices.size()) < maxDet; ++i) {
|
|
|
|
|
if (suppressed[i]) continue;
|
|
|
|
|
keepIndices.push_back(sortedIdx[i]);
|
|
|
|
|
const auto& cur = candidates[sortedIdx[i]];
|
|
|
|
|
float cx1 = cur.x1 + cur.classId*MAX_WH, cy1 = cur.y1 + cur.classId*MAX_WH;
|
|
|
|
|
float cx2 = cur.x2 + cur.classId*MAX_WH, cy2 = cur.y2 + cur.classId*MAX_WH;
|
|
|
|
|
float curArea = (cx2-cx1)*(cy2-cy1);
|
|
|
|
|
|
|
|
|
|
for (size_t j = i+1; j < sortedIdx.size(); ++j) {
|
|
|
|
|
if (suppressed[j]) continue;
|
|
|
|
|
const auto& o = candidates[sortedIdx[j]];
|
|
|
|
|
float ox1 = o.x1+o.classId*MAX_WH, oy1 = o.y1+o.classId*MAX_WH;
|
|
|
|
|
float ox2 = o.x2+o.classId*MAX_WH, oy2 = o.y2+o.classId*MAX_WH;
|
|
|
|
|
float iw = std::min(cx2,ox2)-std::max(cx1,ox1);
|
|
|
|
|
float ih = std::min(cy2,oy2)-std::max(cy1,oy1);
|
|
|
|
|
if (iw <= 0.f || ih <= 0.f) continue;
|
|
|
|
|
float inter = iw*ih;
|
|
|
|
|
float ua = curArea + (ox2-ox1)*(oy2-oy1) - inter;
|
|
|
|
|
if (ua > 0.f && inter/ua > iouThreshold) suppressed[j] = true;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
std::vector<Object> results;
|
|
|
|
|
results.reserve(keepIndices.size());
|
|
|
|
|
for (int idx : keepIndices) {
|
|
|
|
|
const auto& c = candidates[idx];
|
|
|
|
|
float x1 = clamp((c.x1-padX)*invGain, 0.f, origW);
|
|
|
|
|
float y1 = clamp((c.y1-padY)*invGain, 0.f, origH);
|
|
|
|
|
float x2 = clamp((c.x2-padX)*invGain, 0.f, origW);
|
|
|
|
|
float y2 = clamp((c.y2-padY)*invGain, 0.f, origH);
|
|
|
|
|
float w = x2-x1, h = y2-y1;
|
|
|
|
|
if (w < 1.f || h < 1.f) continue;
|
|
|
|
|
|
|
|
|
|
Object obj;
|
|
|
|
|
obj.classId = c.classId;
|
|
|
|
|
obj.confidence = c.conf;
|
|
|
|
|
obj.box = cv::Rect(static_cast<int>(x1), static_cast<int>(y1),
|
|
|
|
|
static_cast<int>(w), static_cast<int>(h));
|
|
|
|
|
if (c.classId >= 0 && c.classId < static_cast<int>(classNames.size()))
|
|
|
|
|
obj.className = classNames[c.classId];
|
|
|
|
|
results.push_back(std::move(obj));
|
|
|
|
|
}
|
|
|
|
|
return results;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// ====================================================================
|
|
|
|
|
// OBB — helpers (Prob-IoU based NMS)
|
|
|
|
|
// ====================================================================
|
|
|
|
|
|
|
|
|
|
void ONNXYOLO::getCovarianceComponents(const OrientedBox& box,
|
|
|
|
|
float& out1, float& out2, float& out3)
|
|
|
|
|
{
|
|
|
|
|
if (box.width <= 0.f || box.height <= 0.f) {
|
|
|
|
|
out1 = out2 = out3 = 0.f;
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
const float vw = (box.width * box.width) / 12.0f;
|
|
|
|
|
const float vh = (box.height * box.height) / 12.0f;
|
|
|
|
|
const float cosT = std::cos(box.angle);
|
|
|
|
|
const float sinT = std::sin(box.angle);
|
|
|
|
|
const float cos2 = cosT * cosT;
|
|
|
|
|
const float sin2 = sinT * sinT;
|
|
|
|
|
const float sc = sinT * cosT;
|
|
|
|
|
out1 = vw * cos2 + vh * sin2;
|
|
|
|
|
out2 = vw * sin2 + vh * cos2;
|
|
|
|
|
out3 = (vw - vh) * sc;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
std::vector<std::vector<float>> ONNXYOLO::batchProbiou(
|
|
|
|
|
const std::vector<OrientedBox>& obb1,
|
|
|
|
|
const std::vector<OrientedBox>& obb2, float eps)
|
|
|
|
|
{
|
|
|
|
|
if (obb1.empty() || obb2.empty()) return {};
|
|
|
|
|
const size_t n1 = obb1.size(), n2 = obb2.size();
|
|
|
|
|
std::vector<std::vector<float>> iouMat(n1, std::vector<float>(n2, 0.f));
|
|
|
|
|
|
|
|
|
|
// Pre-compute covariance for obb1
|
|
|
|
|
struct CovData { float x, y, a, b, c; };
|
|
|
|
|
std::vector<CovData> cov1(n1);
|
|
|
|
|
for (size_t i = 0; i < n1; ++i) {
|
|
|
|
|
float a, b, c;
|
|
|
|
|
getCovarianceComponents(obb1[i], a, b, c);
|
|
|
|
|
cov1[i] = { obb1[i].x, obb1[i].y, a, b, c };
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
for (size_t i = 0; i < n1; ++i) {
|
|
|
|
|
for (size_t j = 0; j < n2; ++j) {
|
|
|
|
|
float a2, b2, c2;
|
|
|
|
|
getCovarianceComponents(obb2[j], a2, b2, c2);
|
|
|
|
|
float dx = cov1[i].x - obb2[j].x;
|
|
|
|
|
float dy = cov1[i].y - obb2[j].y;
|
|
|
|
|
float sA = cov1[i].a + a2, sB = cov1[i].b + b2, sC = cov1[i].c + c2;
|
|
|
|
|
float denom = sA * sB - sC * sC + eps;
|
|
|
|
|
if (denom <= eps) continue;
|
|
|
|
|
|
|
|
|
|
float t1 = ((sA*dy*dy + sB*dx*dx) * 0.25f) / denom;
|
|
|
|
|
float t2 = ((sC*dx*dy) * -0.5f) / denom;
|
|
|
|
|
float d1 = cov1[i].a*cov1[i].b - cov1[i].c*cov1[i].c;
|
|
|
|
|
float d2 = a2*b2 - c2*c2;
|
|
|
|
|
float sqrtDet = std::sqrt(std::max(d1, 0.f) * std::max(d2, 0.f) + eps);
|
|
|
|
|
float t3 = 0.5f * std::log((sA*sB - sC*sC) / (4.f*sqrtDet) + eps);
|
|
|
|
|
float bd = std::clamp(t1 + t2 + t3, eps, 100.f);
|
|
|
|
|
float hd = std::sqrt(1.f - std::exp(-bd) + eps);
|
|
|
|
|
iouMat[i][j] = 1.f - hd;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
return iouMat;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
std::vector<int> ONNXYOLO::nmsRotatedImpl(
|
|
|
|
|
const std::vector<OrientedBox>& sortedBoxes, float iouThreshold)
|
|
|
|
|
{
|
|
|
|
|
if (sortedBoxes.empty()) return {};
|
|
|
|
|
if (sortedBoxes.size() == 1) return { 0 };
|
|
|
|
|
|
|
|
|
|
auto iouMat = batchProbiou(sortedBoxes, sortedBoxes);
|
|
|
|
|
if (iouMat.empty()) return {};
|
|
|
|
|
const int n = static_cast<int>(sortedBoxes.size());
|
|
|
|
|
|
|
|
|
|
std::vector<int> keep;
|
|
|
|
|
keep.reserve(n / 2);
|
|
|
|
|
for (int j = 0; j < n; ++j) {
|
|
|
|
|
bool shouldKeep = true;
|
|
|
|
|
for (int i = 0; i < j; ++i) {
|
|
|
|
|
if (iouMat[i][j] >= iouThreshold) { shouldKeep = false; break; }
|
|
|
|
|
}
|
|
|
|
|
if (shouldKeep) keep.push_back(j);
|
|
|
|
|
}
|
|
|
|
|
return keep;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
std::vector<int> ONNXYOLO::nmsRotated(
|
|
|
|
|
const std::vector<OrientedBox>& boxes,
|
|
|
|
|
const std::vector<float>& scores, float iouThreshold)
|
|
|
|
|
{
|
|
|
|
|
if (boxes.empty() || scores.empty() || boxes.size() != scores.size()) return {};
|
|
|
|
|
|
|
|
|
|
std::vector<int> sortedIdx(boxes.size());
|
|
|
|
|
std::iota(sortedIdx.begin(), sortedIdx.end(), 0);
|
|
|
|
|
std::sort(sortedIdx.begin(), sortedIdx.end(),
|
|
|
|
|
[&](int a, int b) { return scores[a] > scores[b]; });
|
|
|
|
|
|
|
|
|
|
std::vector<OrientedBox> sortedBoxes;
|
|
|
|
|
sortedBoxes.reserve(boxes.size());
|
|
|
|
|
for (int i : sortedIdx) sortedBoxes.push_back(boxes[i]);
|
|
|
|
|
|
|
|
|
|
auto keepSorted = nmsRotatedImpl(sortedBoxes, iouThreshold);
|
|
|
|
|
std::vector<int> keepOrig;
|
|
|
|
|
keepOrig.reserve(keepSorted.size());
|
|
|
|
|
for (int si : keepSorted) keepOrig.push_back(sortedIdx[si]);
|
|
|
|
|
return keepOrig;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
std::vector<cv::Point2f> ONNXYOLO::OBBToPoints(const OrientedBox& obb)
|
|
|
|
|
{
|
|
|
|
|
float angleDeg = obb.angle * 180.0f / static_cast<float>(CV_PI);
|
|
|
|
|
cv::RotatedRect rr(cv::Point2f(obb.x, obb.y),
|
|
|
|
|
cv::Size2f(obb.width, obb.height), angleDeg);
|
|
|
|
|
std::vector<cv::Point2f> corners(4);
|
|
|
|
|
rr.points(corners.data());
|
|
|
|
|
return corners;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// ====================================================================
|
|
|
|
|
// OBB — postprocess
|
|
|
|
|
// ====================================================================
|
|
|
|
|
|
|
|
|
|
std::vector<Object> ONNXYOLO::postprocessOBBEndToEnd(
|
|
|
|
|
const cv::Size& originalImageSize,
|
|
|
|
|
const cv::Size& resizedImageShape,
|
2026-04-08 13:45:52 +10:00
|
|
|
std::vector<Ort::Value>& outputTensors,
|
2026-03-28 16:54:11 +11:00
|
|
|
const std::vector<std::string>& classNames,
|
|
|
|
|
float confThreshold)
|
|
|
|
|
{
|
|
|
|
|
if (outputTensors.empty()) return {};
|
2026-04-08 13:45:52 +10:00
|
|
|
const float* raw = outputTensors[0].GetTensorMutableData<float>();
|
2026-03-28 16:54:11 +11:00
|
|
|
const auto shape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape();
|
|
|
|
|
if (shape.size() < 3) return {};
|
|
|
|
|
|
|
|
|
|
const int numDets = static_cast<int>(shape[1]);
|
|
|
|
|
const int numFeat = static_cast<int>(shape[2]); // 7: cx,cy,w,h,angle,conf,classId
|
|
|
|
|
|
|
|
|
|
const float origW = static_cast<float>(originalImageSize.width);
|
|
|
|
|
const float origH = static_cast<float>(originalImageSize.height);
|
|
|
|
|
const float modelW = static_cast<float>(resizedImageShape.width);
|
|
|
|
|
const float modelH = static_cast<float>(resizedImageShape.height);
|
|
|
|
|
const float gain = std::min(modelH / origH, modelW / origW);
|
|
|
|
|
const float padX = std::round((modelW - origW * gain) / 2.0f - 0.1f);
|
|
|
|
|
const float padY = std::round((modelH - origH * gain) / 2.0f - 0.1f);
|
|
|
|
|
const float invGain = 1.0f / gain;
|
|
|
|
|
|
|
|
|
|
std::vector<Object> results;
|
|
|
|
|
results.reserve(numDets);
|
|
|
|
|
|
|
|
|
|
for (int i = 0; i < numDets; ++i) {
|
|
|
|
|
const float* det = raw + i * numFeat;
|
|
|
|
|
float angle = det[4];
|
|
|
|
|
float conf = det[5];
|
|
|
|
|
if (conf <= confThreshold) continue;
|
|
|
|
|
|
|
|
|
|
float cx = (det[0] - padX) * invGain;
|
|
|
|
|
float cy = (det[1] - padY) * invGain;
|
|
|
|
|
float bw = det[2] * invGain;
|
|
|
|
|
float bh = det[3] * invGain;
|
|
|
|
|
int classId = static_cast<int>(det[6]);
|
|
|
|
|
|
|
|
|
|
cx = clamp(cx, 0.f, origW);
|
|
|
|
|
cy = clamp(cy, 0.f, origH);
|
|
|
|
|
|
|
|
|
|
OrientedBox obb{ cx, cy, bw, bh, angle };
|
|
|
|
|
|
|
|
|
|
Object obj;
|
|
|
|
|
obj.classId = classId;
|
|
|
|
|
obj.confidence = conf;
|
|
|
|
|
obj.kps = { cx, cy, bw, bh, angle };
|
|
|
|
|
auto absCorners = OBBToPoints(obb);
|
|
|
|
|
obj.box = cv::boundingRect(absCorners);
|
|
|
|
|
// Normalize OBB corners to [0,1] and close the polygon
|
|
|
|
|
obj.polygon.reserve(absCorners.size() + 1);
|
|
|
|
|
for (const auto& pt : absCorners) {
|
|
|
|
|
obj.polygon.emplace_back(
|
|
|
|
|
std::clamp(pt.x / origW, 0.f, 1.f),
|
|
|
|
|
std::clamp(pt.y / origH, 0.f, 1.f));
|
|
|
|
|
}
|
|
|
|
|
if (!obj.polygon.empty()) obj.polygon.push_back(obj.polygon.front());
|
|
|
|
|
if (classId >= 0 && classId < static_cast<int>(classNames.size()))
|
|
|
|
|
obj.className = classNames[classId];
|
|
|
|
|
results.push_back(std::move(obj));
|
|
|
|
|
}
|
|
|
|
|
return results;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// --------------------------------------------------------------------
// postprocessOBBLegacy — decode oriented-bounding-box output from a
// model WITHOUT in-graph NMS.
//
// Expected outputTensors[0] layout: [1, numChannels, numBoxes] with
// per-box channels [cx, cy, w, h, class scores (nc), angle], i.e.
// numChannels = 4 + nc + 1. Candidates above confThreshold are decoded
// back to original-image coordinates, then filtered with rotated
// (Prob-IoU) NMS via nmsRotated(), keeping at most maxDet results.
// --------------------------------------------------------------------
std::vector<Object> ONNXYOLO::postprocessOBBLegacy(
    const cv::Size& originalImageSize,
    const cv::Size& resizedImageShape,
    std::vector<Ort::Value>& outputTensors,
    const std::vector<std::string>& classNames,
    float confThreshold, float iouThreshold, int maxDet)
{
    if (outputTensors.empty()) return {};

    const float* rawOutput = outputTensors[0].GetTensorMutableData<float>();
    const auto outputShape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape();
    if (outputShape.size() < 3) return {};

    const int numChannels = static_cast<int>(outputShape[1]);
    const int numBoxes = static_cast<int>(outputShape[2]);
    const int numClasses = numChannels - 5; // 4 box + nc scores + 1 angle

    if (numClasses <= 0) return {};

    // Transpose [channels, boxes] -> [boxes, channels] so each row is
    // one candidate. .t() materializes a copy, so rawOutput's lifetime
    // does not constrain `output`.
    cv::Mat output = cv::Mat(numChannels, numBoxes, CV_32F,
        const_cast<float*>(rawOutput)).t();

    // Inverse LetterBox transform (rounding must mirror letterBox()).
    const float origW = static_cast<float>(originalImageSize.width);
    const float origH = static_cast<float>(originalImageSize.height);
    const float modelW = static_cast<float>(resizedImageShape.width);
    const float modelH = static_cast<float>(resizedImageShape.height);
    const float gain = std::min(modelH / origH, modelW / origW);
    const float padX = std::round((modelW - origW * gain) / 2.0f - 0.1f);
    const float padY = std::round((modelH - origH * gain) / 2.0f - 0.1f);
    const float invGain = 1.0f / gain;

    // One pre-NMS candidate: oriented box + best-class score/id.
    struct OBBCandidate {
        OrientedBox box;
        float conf;
        int classId;
    };
    std::vector<OBBCandidate> candidates;
    candidates.reserve(numBoxes);

    for (int i = 0; i < numBoxes; ++i) {
        const float* row = output.ptr<float>(i);
        const float* scoresPtr = row + 4;
        // Argmax over class scores.
        float maxScore = -FLT_MAX;
        int bestClass = -1;
        for (int c = 0; c < numClasses; ++c) {
            if (scoresPtr[c] > maxScore) { maxScore = scoresPtr[c]; bestClass = c; }
        }
        if (maxScore <= confThreshold) continue;

        float angle = row[4 + numClasses]; // angle after class scores
        // Map center back to original-image space; w/h only rescale.
        float cx = (row[0] - padX) * invGain;
        float cy = (row[1] - padY) * invGain;
        float bw = row[2] * invGain;
        float bh = row[3] * invGain;
        cx = clamp(cx, 0.f, origW);
        cy = clamp(cy, 0.f, origH);

        candidates.push_back({ { cx, cy, bw, bh, angle }, maxScore, bestClass });
    }
    if (candidates.empty()) return {};

    // Prob-IoU NMS for oriented boxes
    std::vector<OrientedBox> boxes;
    std::vector<float> scores;
    boxes.reserve(candidates.size());
    scores.reserve(candidates.size());
    for (const auto& c : candidates) { boxes.push_back(c.box); scores.push_back(c.conf); }

    auto keepIdx = nmsRotated(boxes, scores, iouThreshold);

    std::vector<Object> results;
    results.reserve(std::min(static_cast<int>(keepIdx.size()), maxDet));
    for (int idx : keepIdx) {
        if (static_cast<int>(results.size()) >= maxDet) break;
        const auto& c = candidates[idx];
        Object obj;
        obj.classId = c.classId;
        obj.confidence = c.conf;
        // Raw OBB parameters preserved for downstream consumers.
        obj.kps = { c.box.x, c.box.y, c.box.width, c.box.height, c.box.angle };
        auto absCorners = OBBToPoints(c.box);
        obj.box = cv::boundingRect(absCorners);
        // Normalize OBB corners to [0,1] and close the polygon
        const float origW = static_cast<float>(originalImageSize.width);
        const float origH = static_cast<float>(originalImageSize.height);
        obj.polygon.reserve(absCorners.size() + 1);
        for (const auto& pt : absCorners) {
            obj.polygon.emplace_back(
                std::clamp(pt.x / origW, 0.f, 1.f),
                std::clamp(pt.y / origH, 0.f, 1.f));
        }
        if (!obj.polygon.empty()) obj.polygon.push_back(obj.polygon.front());
        if (c.classId >= 0 && c.classId < static_cast<int>(classNames.size()))
            obj.className = classNames[c.classId];
        results.push_back(std::move(obj));
    }
    return results;
}
|
|
|
|
|
|
|
|
|
|
// ====================================================================
|
|
|
|
|
// SEGMENTATION — postprocess
|
|
|
|
|
// ====================================================================
|
|
|
|
|
|
|
|
|
|
// --------------------------------------------------------------------
// postprocessSegEndToEnd — decode instance-segmentation output from a
// model whose NMS is embedded in the graph.
//
// outputTensors[0]: detections [1, numDets, 6 + nm], each row
//   [x1, y1, x2, y2, confidence, classId, nm mask coefficients]
// outputTensors[1]: mask prototypes [1, nm, protoH, protoW].
//
// Masks are produced as: coefficients @ prototypes -> sigmoid ->
// crop to the detection box in prototype space -> resize to the box ->
// threshold at 0.5. Polygons are derived from the binary mask, or from
// the box as a fallback when mask extraction yields nothing.
// --------------------------------------------------------------------
std::vector<Object> ONNXYOLO::postprocessSegEndToEnd(
    const cv::Size& originalImageSize,
    const cv::Size& resizedImageShape,
    std::vector<Ort::Value>& outputTensors,
    const std::vector<std::string>& classNames,
    float confThreshold)
{
    if (outputTensors.size() < 2) return {};

    const float* raw = outputTensors[0].GetTensorMutableData<float>();
    const auto shape0 = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape();
    const auto protoShape = outputTensors[1].GetTensorTypeAndShapeInfo().GetShape();
    if (shape0.size() < 3 || protoShape.size() < 4) return {};

    const int numDets = static_cast<int>(shape0[1]);
    const int numFeat = static_cast<int>(shape0[2]); // 6 + nm
    const int nm = static_cast<int>(protoShape[1]);      // number of mask prototypes
    const int protoH = static_cast<int>(protoShape[2]);
    const int protoW = static_cast<int>(protoShape[3]);

    // Row must hold box+conf+class (6) plus nm coefficients — guards
    // the det[6 + k] reads below.
    if (numFeat < 6 + nm) return {};

    // Inverse LetterBox transform (rounding must mirror letterBox()).
    const float origW = static_cast<float>(originalImageSize.width);
    const float origH = static_cast<float>(originalImageSize.height);
    const float modelW = static_cast<float>(resizedImageShape.width);
    const float modelH = static_cast<float>(resizedImageShape.height);
    const float gain = std::min(modelH / origH, modelW / origW);
    const float padX = std::round((modelW - origW * gain) / 2.0f - 0.1f);
    const float padY = std::round((modelH - origH * gain) / 2.0f - 0.1f);
    const float invGain = 1.0f / gain;

    // Collect detections and mask coefficients
    std::vector<Object> objs;
    cv::Mat maskCoeffs; // [N, nm]

    for (int i = 0; i < numDets; ++i) {
        const float* det = raw + i * numFeat;
        float conf = det[4];
        if (conf <= confThreshold) continue;

        int classId = static_cast<int>(det[5]);
        // Corner box is already x1,y1,x2,y2 in model space; unpad/unscale
        // and clamp to the original image.
        float x1 = clamp((det[0] - padX) * invGain, 0.f, origW);
        float y1 = clamp((det[1] - padY) * invGain, 0.f, origH);
        float x2 = clamp((det[2] - padX) * invGain, 0.f, origW);
        float y2 = clamp((det[3] - padY) * invGain, 0.f, origH);
        float w = x2-x1, h = y2-y1;
        // Discard degenerate sub-pixel boxes.
        if (w < 1.f || h < 1.f) continue;

        Object obj;
        obj.classId = classId;
        obj.confidence = conf;
        obj.box = cv::Rect(static_cast<int>(x1), static_cast<int>(y1),
            static_cast<int>(w), static_cast<int>(h));
        if (classId >= 0 && classId < static_cast<int>(classNames.size()))
            obj.className = classNames[classId];
        objs.push_back(std::move(obj));

        // Extract mask coefficients (after the 6 detection values)
        cv::Mat mc(1, nm, CV_32F);
        std::memcpy(mc.ptr<float>(), det + 6, nm * sizeof(float));
        maskCoeffs.push_back(mc);
    }

    // Generate masks: coeffs @ protos → sigmoid → crop-in-proto → resize-to-box → threshold
    if (!objs.empty() && !maskCoeffs.empty()) {
        const float* protoData = outputTensors[1].GetTensorMutableData<float>();
        // View the prototypes as [nm, protoH*protoW] so a single GEMM
        // combines them: result is [protoH*protoW, N] after transpose.
        cv::Mat protos(nm, protoH * protoW, CV_32F, const_cast<float*>(protoData));
        cv::Mat matmulRes = (maskCoeffs * protos).t();

        // Apply sigmoid while still a single-channel 2D matrix
        cv::Mat negMat;
        cv::exp(-matmulRes, negMat);
        cv::Mat sigmoidFlat = 1.0 / (1.0 + negMat);

        // Reinterpret as an N-channel [protoH x protoW] image, then
        // split into one channel per detection.
        cv::Mat sigmoidMat = sigmoidFlat.reshape(static_cast<int>(objs.size()),
            { protoH, protoW });
        std::vector<cv::Mat> maskChannels;
        cv::split(sigmoidMat, maskChannels);

        // ROI in proto space, accounting for letterbox padding
        cv::Rect roi;
        if (origH > origW) {
            // Portrait image: letterbox pads left/right.
            int roiW = std::min(static_cast<int>(std::round(
                static_cast<float>(protoW) * origW / origH)), protoW);
            roi = cv::Rect((protoW - roiW) / 2, 0, roiW, protoH);
        }
        else {
            // Landscape (or square): letterbox pads top/bottom.
            int roiH = std::min(static_cast<int>(std::round(
                static_cast<float>(protoH) * origH / origW)), protoH);
            roi = cv::Rect(0, (protoH - roiH) / 2, protoW, roiH);
        }
        roi &= cv::Rect(0, 0, protoW, protoH);

        int imgW = static_cast<int>(origW);
        int imgH = static_cast<int>(origH);

        // Original-image pixels per prototype-space pixel.
        const float scaleX = static_cast<float>(imgW) / roi.width;
        const float scaleY = static_cast<float>(imgH) / roi.height;

        for (size_t i = 0; i < objs.size(); ++i) {
            cv::Rect safebox = objs[i].box & cv::Rect(0, 0, imgW, imgH);
            if (safebox.area() <= 0) continue;

            // Detection box mapped into prototype space (floor/ceil so
            // the crop fully covers the box).
            int px0 = std::max(static_cast<int>(std::floor(safebox.x / scaleX)), 0);
            int py0 = std::max(static_cast<int>(std::floor(safebox.y / scaleY)), 0);
            int px1 = std::min(static_cast<int>(std::ceil((safebox.x + safebox.width) / scaleX)), roi.width);
            int py1 = std::min(static_cast<int>(std::ceil((safebox.y + safebox.height) / scaleY)), roi.height);
            if (px1 <= px0 || py1 <= py0) continue;

            cv::Rect protoBox(roi.x + px0, roi.y + py0, px1 - px0, py1 - py0);
            protoBox &= cv::Rect(0, 0, protoW, protoH);
            if (protoBox.area() <= 0) continue;

            cv::Mat cropped = maskChannels[i](protoBox);
            cv::Mat resized;
            cv::resize(cropped, resized, cv::Size(safebox.width, safebox.height),
                0, 0, cv::INTER_LINEAR);
            // Threshold sigmoid probabilities into a binary mask.
            objs[i].mask = resized > 0.5f;
            objs[i].polygon = ANSUtilityHelper::MaskToNormalizedPolygon(
                objs[i].mask, safebox, origW, origH);
        }
    }
    // Fallback: detections whose mask produced no polygon still get a
    // box-shaped normalized polygon.
    for (auto& obj : objs) {
        if (obj.polygon.empty())
            obj.polygon = ANSUtilityHelper::RectToNormalizedPolygon(obj.box, origW, origH);
    }
    return objs;
}
|
|
|
|
|
|
|
|
|
|
// --------------------------------------------------------------------
// postprocessSegLegacy — decode instance-segmentation output from a
// model WITHOUT in-graph NMS.
//
// outputTensors[0]: [1, numChannels, numBoxes] with per-box channels
//   [cx, cy, w, h, class scores (nc), nm mask coefficients],
//   i.e. numChannels = 4 + nc + nm.
// outputTensors[1]: mask prototypes [1, nm, protoH, protoW].
//
// Candidates above confThreshold are decoded, filtered with class-aware
// NMS (cv::dnn::NMSBoxesBatched), capped at maxDet, then masks are
// built the same way as in postprocessSegEndToEnd.
// --------------------------------------------------------------------
std::vector<Object> ONNXYOLO::postprocessSegLegacy(
    const cv::Size& originalImageSize,
    const cv::Size& resizedImageShape,
    std::vector<Ort::Value>& outputTensors,
    const std::vector<std::string>& classNames,
    float confThreshold, float iouThreshold, int maxDet)
{
    if (outputTensors.size() < 2) return {};

    const float* rawOutput = outputTensors[0].GetTensorMutableData<float>();
    const auto shape0 = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape();
    const auto protoShape = outputTensors[1].GetTensorTypeAndShapeInfo().GetShape();
    if (shape0.size() < 3 || protoShape.size() < 4) return {};

    const int numChannels = static_cast<int>(shape0[1]);
    const int numBoxes = static_cast<int>(shape0[2]);
    const int nm = static_cast<int>(protoShape[1]);      // number of mask prototypes
    const int protoH = static_cast<int>(protoShape[2]);
    const int protoW = static_cast<int>(protoShape[3]);
    // Class count is derived from the tensor, not from classNames.
    const int numClasses = numChannels - 4 - nm;
    if (numClasses <= 0) return {};

    // Transpose [channels, boxes] -> [boxes, channels]; .t() copies.
    cv::Mat output = cv::Mat(numChannels, numBoxes, CV_32F,
        const_cast<float*>(rawOutput)).t();

    // Inverse LetterBox transform (rounding must mirror letterBox()).
    const float origW = static_cast<float>(originalImageSize.width);
    const float origH = static_cast<float>(originalImageSize.height);
    const float modelW = static_cast<float>(resizedImageShape.width);
    const float modelH = static_cast<float>(resizedImageShape.height);
    const float gain = std::min(modelH / origH, modelW / origW);
    const float padX = std::round((modelW - origW * gain) / 2.0f - 0.1f);
    const float padY = std::round((modelH - origH * gain) / 2.0f - 0.1f);
    const float invGain = 1.0f / gain;

    // Parallel arrays of pre-NMS candidates.
    std::vector<cv::Rect> bboxes;
    std::vector<float> scores;
    std::vector<int> labels;
    std::vector<cv::Mat> maskCoeffs;

    for (int i = 0; i < numBoxes; ++i) {
        const float* row = output.ptr<float>(i);
        const float* scoresPtr = row + 4;
        // Argmax over class scores.
        float maxScore = -FLT_MAX;
        int bestClass = -1;
        for (int c = 0; c < numClasses; ++c) {
            if (scoresPtr[c] > maxScore) { maxScore = scoresPtr[c]; bestClass = c; }
        }
        if (maxScore <= confThreshold) continue;

        // Center box -> corner box, unpad/unscale, clamp to image.
        float cx = row[0], cy = row[1], w = row[2], h = row[3];
        float x0 = clamp((cx - w*0.5f - padX) * invGain, 0.f, origW);
        float y0 = clamp((cy - h*0.5f - padY) * invGain, 0.f, origH);
        float x1 = clamp((cx + w*0.5f - padX) * invGain, 0.f, origW);
        float y1 = clamp((cy + h*0.5f - padY) * invGain, 0.f, origH);

        bboxes.push_back(cv::Rect(static_cast<int>(x0), static_cast<int>(y0),
            static_cast<int>(x1-x0), static_cast<int>(y1-y0)));
        scores.push_back(maxScore);
        labels.push_back(bestClass);

        // Mask coefficients follow the class scores in the row.
        cv::Mat mc(1, nm, CV_32F);
        std::memcpy(mc.ptr<float>(), row + 4 + numClasses, nm * sizeof(float));
        maskCoeffs.push_back(mc);
    }

    // NMS
    std::vector<int> indices;
    cv::dnn::NMSBoxesBatched(bboxes, scores, labels, confThreshold,
        iouThreshold, indices);

    // Collect surviving detections and their mask coefficients
    std::vector<Object> objs;
    cv::Mat masks;
    for (int idx : indices) {
        if (static_cast<int>(objs.size()) >= maxDet) break;
        Object obj;
        obj.classId = labels[idx];
        obj.confidence = scores[idx];
        obj.box = bboxes[idx];
        if (obj.classId >= 0 && obj.classId < static_cast<int>(classNames.size()))
            obj.className = classNames[obj.classId];
        objs.push_back(std::move(obj));
        masks.push_back(maskCoeffs[idx]);
    }

    // Generate masks
    if (!objs.empty() && !masks.empty()) {
        const float* protoData = outputTensors[1].GetTensorMutableData<float>();
        // Prototypes as [nm, protoH*protoW]; one GEMM combines them into
        // [protoH*protoW, N] after transpose.
        cv::Mat protos(nm, protoH * protoW, CV_32F, const_cast<float*>(protoData));
        cv::Mat matmulRes = (masks * protos).t();

        // Apply sigmoid while still a single-channel 2D matrix
        cv::Mat negMat;
        cv::exp(-matmulRes, negMat);
        cv::Mat sigmoidFlat = 1.0 / (1.0 + negMat);

        // Reinterpret as an N-channel [protoH x protoW] image, one
        // channel per surviving detection.
        cv::Mat sigmoidMat = sigmoidFlat.reshape(static_cast<int>(objs.size()),
            { protoH, protoW });
        std::vector<cv::Mat> maskChannels;
        cv::split(sigmoidMat, maskChannels);

        // ROI in proto space, accounting for letterbox padding
        cv::Rect roi;
        if (origH > origW) {
            // Portrait image: letterbox pads left/right.
            int roiW = std::min(static_cast<int>(std::round(
                static_cast<float>(protoW) * origW / origH)), protoW);
            roi = cv::Rect((protoW - roiW) / 2, 0, roiW, protoH);
        }
        else {
            // Landscape (or square): letterbox pads top/bottom.
            int roiH = std::min(static_cast<int>(std::round(
                static_cast<float>(protoH) * origH / origW)), protoH);
            roi = cv::Rect(0, (protoH - roiH) / 2, protoW, roiH);
        }
        roi &= cv::Rect(0, 0, protoW, protoH);

        int imgW = static_cast<int>(origW);
        int imgH = static_cast<int>(origH);

        // Original-image pixels per prototype-space pixel.
        const float scaleX = static_cast<float>(imgW) / roi.width;
        const float scaleY = static_cast<float>(imgH) / roi.height;

        for (size_t i = 0; i < objs.size(); ++i) {
            cv::Rect safebox = objs[i].box & cv::Rect(0, 0, imgW, imgH);
            if (safebox.area() <= 0) continue;

            // Detection box mapped into prototype space (floor/ceil so
            // the crop fully covers the box).
            int px0 = std::max(static_cast<int>(std::floor(safebox.x / scaleX)), 0);
            int py0 = std::max(static_cast<int>(std::floor(safebox.y / scaleY)), 0);
            int px1 = std::min(static_cast<int>(std::ceil((safebox.x + safebox.width) / scaleX)), roi.width);
            int py1 = std::min(static_cast<int>(std::ceil((safebox.y + safebox.height) / scaleY)), roi.height);
            if (px1 <= px0 || py1 <= py0) continue;

            cv::Rect protoBox(roi.x + px0, roi.y + py0, px1 - px0, py1 - py0);
            protoBox &= cv::Rect(0, 0, protoW, protoH);
            if (protoBox.area() <= 0) continue;

            cv::Mat cropped = maskChannels[i](protoBox);
            cv::Mat resized;
            cv::resize(cropped, resized, cv::Size(safebox.width, safebox.height),
                0, 0, cv::INTER_LINEAR);
            // Threshold sigmoid probabilities into a binary mask.
            objs[i].mask = resized > 0.5f;
            objs[i].polygon = ANSUtilityHelper::MaskToNormalizedPolygon(
                objs[i].mask, safebox, origW, origH);
        }
    }
    // Fallback: detections whose mask produced no polygon still get a
    // box-shaped normalized polygon.
    for (auto& obj : objs) {
        if (obj.polygon.empty())
            obj.polygon = ANSUtilityHelper::RectToNormalizedPolygon(obj.box, origW, origH);
    }
    return objs;
}
|
|
|
|
|
|
|
|
|
|
// ====================================================================
|
|
|
|
|
// POSE — postprocess
|
|
|
|
|
// ====================================================================
|
|
|
|
|
|
|
|
|
|
std::vector<Object> ONNXYOLO::postprocessPoseEndToEnd(
|
|
|
|
|
const cv::Size& originalImageSize,
|
|
|
|
|
const cv::Size& resizedImageShape,
|
2026-04-08 13:45:52 +10:00
|
|
|
std::vector<Ort::Value>& outputTensors,
|
2026-03-28 16:54:11 +11:00
|
|
|
const std::vector<std::string>& classNames,
|
|
|
|
|
float confThreshold, int numKPS)
|
|
|
|
|
{
|
|
|
|
|
if (outputTensors.empty()) return {};
|
2026-04-08 13:45:52 +10:00
|
|
|
const float* raw = outputTensors[0].GetTensorMutableData<float>();
|
2026-03-28 16:54:11 +11:00
|
|
|
const auto shape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape();
|
|
|
|
|
if (shape.size() < 3) return {};
|
|
|
|
|
|
|
|
|
|
const int numDets = static_cast<int>(shape[1]);
|
|
|
|
|
const int numFeat = static_cast<int>(shape[2]); // 6 + nk*3
|
|
|
|
|
|
|
|
|
|
const float origW = static_cast<float>(originalImageSize.width);
|
|
|
|
|
const float origH = static_cast<float>(originalImageSize.height);
|
|
|
|
|
const float modelW = static_cast<float>(resizedImageShape.width);
|
|
|
|
|
const float modelH = static_cast<float>(resizedImageShape.height);
|
|
|
|
|
const float gain = std::min(modelH / origH, modelW / origW);
|
|
|
|
|
const float padX = std::round((modelW - origW * gain) / 2.0f - 0.1f);
|
|
|
|
|
const float padY = std::round((modelH - origH * gain) / 2.0f - 0.1f);
|
|
|
|
|
const float invGain = 1.0f / gain;
|
|
|
|
|
|
|
|
|
|
std::vector<Object> results;
|
|
|
|
|
results.reserve(numDets);
|
|
|
|
|
|
|
|
|
|
for (int i = 0; i < numDets; ++i) {
|
|
|
|
|
const float* det = raw + i * numFeat;
|
|
|
|
|
float conf = det[4];
|
|
|
|
|
if (conf <= confThreshold) continue;
|
|
|
|
|
|
|
|
|
|
int classId = static_cast<int>(det[5]);
|
|
|
|
|
float x1 = clamp((det[0] - padX) * invGain, 0.f, origW);
|
|
|
|
|
float y1 = clamp((det[1] - padY) * invGain, 0.f, origH);
|
|
|
|
|
float x2 = clamp((det[2] - padX) * invGain, 0.f, origW);
|
|
|
|
|
float y2 = clamp((det[3] - padY) * invGain, 0.f, origH);
|
|
|
|
|
float w = x2-x1, h = y2-y1;
|
|
|
|
|
if (w < 1.f || h < 1.f) continue;
|
|
|
|
|
|
|
|
|
|
// Extract keypoints (after the 6 detection values)
|
|
|
|
|
std::vector<float> kps;
|
|
|
|
|
kps.reserve(numKPS * 3);
|
|
|
|
|
const float* kpsPtr = det + 6;
|
|
|
|
|
for (int k = 0; k < numKPS; ++k) {
|
|
|
|
|
float kx = clamp((kpsPtr[3*k] - padX) * invGain, 0.f, origW);
|
|
|
|
|
float ky = clamp((kpsPtr[3*k+1] - padY) * invGain, 0.f, origH);
|
|
|
|
|
float ks = kpsPtr[3*k+2];
|
|
|
|
|
kps.push_back(kx);
|
|
|
|
|
kps.push_back(ky);
|
|
|
|
|
kps.push_back(ks);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
Object obj;
|
|
|
|
|
obj.classId = classId;
|
|
|
|
|
obj.confidence = conf;
|
|
|
|
|
obj.box = cv::Rect(static_cast<int>(x1), static_cast<int>(y1),
|
|
|
|
|
static_cast<int>(w), static_cast<int>(h));
|
|
|
|
|
obj.kps = std::move(kps);
|
|
|
|
|
if (classId >= 0 && classId < static_cast<int>(classNames.size()))
|
|
|
|
|
obj.className = classNames[classId];
|
|
|
|
|
results.push_back(std::move(obj));
|
|
|
|
|
}
|
|
|
|
|
return results;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// --------------------------------------------------------------------
// postprocessPoseLegacy — decode pose/keypoint output from a model
// WITHOUT in-graph NMS.
//
// Expected outputTensors[0] layout: [1, numChannels, numBoxes] with
// per-box channels [cx, cy, w, h, class scores (nc), numKPS*(x,y,s)],
// i.e. numChannels = 4 + nc + numKPS*3. Candidates above confThreshold
// are decoded to original-image coordinates, filtered with class-aware
// NMS (cv::dnn::NMSBoxesBatched), and capped at maxDet.
// --------------------------------------------------------------------
std::vector<Object> ONNXYOLO::postprocessPoseLegacy(
    const cv::Size& originalImageSize,
    const cv::Size& resizedImageShape,
    std::vector<Ort::Value>& outputTensors,
    const std::vector<std::string>& classNames,
    float confThreshold, float iouThreshold, int numKPS, int maxDet)
{
    if (outputTensors.empty()) return {};
    const float* rawOutput = outputTensors[0].GetTensorMutableData<float>();
    const auto outputShape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape();
    if (outputShape.size() < 3) return {};

    const int numChannels = static_cast<int>(outputShape[1]);
    const int numBoxes = static_cast<int>(outputShape[2]);
    // Pose layout: [cx,cy,w,h, scores(nc), kp0_x,kp0_y,kp0_s, ..., kpN_x,kpN_y,kpN_s]
    // Derive actual nc from tensor shape: nc = numChannels - 4 - numKPS*3
    // This avoids mismatch when classNames has more entries than the model's actual classes
    const int nc = std::max(numChannels - 4 - numKPS * 3, 1);
    const int kpsOffset = 4 + nc;

    // Safety: verify we won't read past the row
    if (kpsOffset + numKPS * 3 > numChannels) return {};

    // Transpose [channels, boxes] -> [boxes, channels]; .t() copies.
    cv::Mat output = cv::Mat(numChannels, numBoxes, CV_32F,
        const_cast<float*>(rawOutput)).t();

    // Inverse LetterBox transform (rounding must mirror letterBox()).
    const float origW = static_cast<float>(originalImageSize.width);
    const float origH = static_cast<float>(originalImageSize.height);
    const float modelW = static_cast<float>(resizedImageShape.width);
    const float modelH = static_cast<float>(resizedImageShape.height);
    const float gain = std::min(modelH / origH, modelW / origW);
    const float padX = std::round((modelW - origW * gain) / 2.0f - 0.1f);
    const float padY = std::round((modelH - origH * gain) / 2.0f - 0.1f);
    const float invGain = 1.0f / gain;

    // Parallel arrays of pre-NMS candidates.
    std::vector<cv::Rect> bboxes;
    std::vector<float> scores;
    std::vector<int> labels;
    std::vector<std::vector<float>> allKps;

    for (int i = 0; i < numBoxes; ++i) {
        const float* row = output.ptr<float>(i);
        const float* scoresPtr = row + 4;

        // Find best class
        float maxScore = -FLT_MAX;
        int bestClass = 0;
        int numScores = std::max(nc, 1);
        for (int c = 0; c < numScores; ++c) {
            if (scoresPtr[c] > maxScore) { maxScore = scoresPtr[c]; bestClass = c; }
        }
        if (maxScore <= confThreshold) continue;

        // Center box -> corner box, unpad/unscale, clamp to image.
        float cx = row[0], cy = row[1], w = row[2], h = row[3];
        float x0 = clamp((cx - w*0.5f - padX) * invGain, 0.f, origW);
        float y0 = clamp((cy - h*0.5f - padY) * invGain, 0.f, origH);
        float x1 = clamp((cx + w*0.5f - padX) * invGain, 0.f, origW);
        float y1 = clamp((cy + h*0.5f - padY) * invGain, 0.f, origH);

        // Extract keypoints
        const float* kpsPtr = row + kpsOffset;
        std::vector<float> kps;
        kps.reserve(numKPS * 3);
        for (int k = 0; k < numKPS; ++k) {
            float kx = clamp((kpsPtr[3*k] - padX) * invGain, 0.f, origW);
            float ky = clamp((kpsPtr[3*k+1] - padY) * invGain, 0.f, origH);
            float ks = kpsPtr[3*k+2]; // visibility/confidence, no rescale
            kps.push_back(kx);
            kps.push_back(ky);
            kps.push_back(ks);
        }

        bboxes.push_back(cv::Rect(static_cast<int>(x0), static_cast<int>(y0),
            static_cast<int>(x1-x0), static_cast<int>(y1-y0)));
        scores.push_back(maxScore);
        labels.push_back(bestClass);
        allKps.push_back(std::move(kps));
    }

    // NMS
    std::vector<int> indices;
    cv::dnn::NMSBoxesBatched(bboxes, scores, labels, confThreshold,
        iouThreshold, indices);

    std::vector<Object> results;
    for (int idx : indices) {
        if (static_cast<int>(results.size()) >= maxDet) break;
        Object obj;
        obj.classId = labels[idx];
        obj.confidence = scores[idx];
        obj.box = bboxes[idx];
        obj.kps = allKps[idx];
        if (obj.classId >= 0 && obj.classId < static_cast<int>(classNames.size()))
            obj.className = classNames[obj.classId];
        results.push_back(std::move(obj));
    }
    return results;
}
|
|
|
|
|
|
|
|
|
|
// ====================================================================
|
|
|
|
|
// CLASSIFICATION — postprocess
|
|
|
|
|
// ====================================================================
|
|
|
|
|
|
|
|
|
|
std::vector<Object> ONNXYOLO::postprocessClassify(
|
2026-04-08 13:45:52 +10:00
|
|
|
std::vector<Ort::Value>& outputTensors,
|
2026-03-28 16:54:11 +11:00
|
|
|
const std::vector<std::string>& classNames,
|
|
|
|
|
const cv::Size& imageSize)
|
|
|
|
|
{
|
|
|
|
|
if (outputTensors.empty()) return {};
|
2026-04-08 13:45:52 +10:00
|
|
|
const float* raw = outputTensors[0].GetTensorMutableData<float>();
|
2026-03-28 16:54:11 +11:00
|
|
|
const auto shape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape();
|
|
|
|
|
if (shape.size() < 2) return {};
|
|
|
|
|
|
|
|
|
|
const int nc = static_cast<int>(shape[1]);
|
|
|
|
|
|
|
|
|
|
// Check if the output is already a probability distribution (sums to ~1.0).
|
|
|
|
|
// Some ONNX models (e.g. exported with opset 19) include a Softmax layer
|
|
|
|
|
// in the graph itself. Applying softmax again would flatten the distribution
|
|
|
|
|
// and produce near-uniform probabilities, causing wrong classifications.
|
|
|
|
|
float rawSum = 0.f;
|
|
|
|
|
for (int i = 0; i < nc; ++i) rawSum += raw[i];
|
|
|
|
|
const bool alreadyNormalized = (rawSum > 0.9f && rawSum < 1.1f
|
|
|
|
|
&& raw[0] >= 0.f); // probabilities are non-negative
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
std::vector<float> probs(nc);
|
|
|
|
|
if (alreadyNormalized) {
|
|
|
|
|
// Output is already softmax — use as-is (skip double softmax)
|
|
|
|
|
for (int i = 0; i < nc; ++i) probs[i] = raw[i];
|
|
|
|
|
} else {
|
|
|
|
|
// Raw logits — apply softmax
|
|
|
|
|
float maxVal = -FLT_MAX;
|
|
|
|
|
for (int i = 0; i < nc; ++i) maxVal = std::max(maxVal, raw[i]);
|
|
|
|
|
float sumExp = 0.f;
|
|
|
|
|
for (int i = 0; i < nc; ++i) {
|
|
|
|
|
probs[i] = std::exp(raw[i] - maxVal);
|
|
|
|
|
sumExp += probs[i];
|
|
|
|
|
}
|
|
|
|
|
for (int i = 0; i < nc; ++i) probs[i] /= sumExp;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
int bestClass = 0;
|
|
|
|
|
float bestProb = 0.f;
|
|
|
|
|
for (int i = 0; i < nc; ++i) {
|
|
|
|
|
if (probs[i] > bestProb) { bestProb = probs[i]; bestClass = i; }
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
const int imgW = imageSize.width;
|
|
|
|
|
const int imgH = imageSize.height;
|
|
|
|
|
|
|
|
|
|
Object obj;
|
|
|
|
|
if (imgW > 20 && imgH > 20) {
|
|
|
|
|
obj.box = cv::Rect(10, 10, imgW - 20, imgH - 20);
|
|
|
|
|
}
|
|
|
|
|
else {
|
|
|
|
|
obj.box = cv::Rect(0, 0, imgW, imgH);
|
|
|
|
|
}
|
|
|
|
|
//obj.polygon = ANSUtilityHelper::RectToNormalizedPolygon(obj.box, imgW, imgH);
|
|
|
|
|
obj.classId = bestClass;
|
|
|
|
|
obj.confidence = bestProb;
|
|
|
|
|
if (bestClass >= 0 && bestClass < static_cast<int>(classNames.size()))
|
|
|
|
|
obj.className = classNames[bestClass];
|
|
|
|
|
return { std::move(obj) };
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// ====================================================================
|
|
|
|
|
// BATCH — sliceBatchOutput + detectBatch
|
|
|
|
|
// ====================================================================
|
|
|
|
|
|
|
|
|
|
// --------------------------------------------------------------------
// sliceBatchOutput — build a non-owning single-image tensor view into a
// batched output tensor.
//
// Given a batched tensor of shape [B, D1, D2, ...], returns an
// Ort::Value of shape [1, D1, D2, ...] that aliases the slice for
// `batchIndex`. No data is copied: the returned tensor points into
// `batchTensor`'s buffer, so the batched tensor must outlive the view.
// --------------------------------------------------------------------
/*static*/ Ort::Value ONNXYOLO::sliceBatchOutput(
    Ort::Value& batchTensor,
    int64_t batchIndex,
    const std::vector<int64_t>& fullShape,
    Ort::MemoryInfo& memInfo)
{
    // Floats belonging to one image = product of all non-batch dims.
    const int64_t strideElems = std::accumulate(
        fullShape.begin() + 1, fullShape.end(),
        static_cast<int64_t>(1), std::multiplies<int64_t>());

    // Locate the requested image inside the batched buffer.
    float* const base = batchTensor.GetTensorMutableData<float>();
    float* const slice = base + batchIndex * strideElems;

    // Same shape with the batch dimension collapsed to 1.
    std::vector<int64_t> sliceShape(fullShape);
    sliceShape.front() = 1;

    return Ort::Value::CreateTensor<float>(
        memInfo, slice, static_cast<size_t>(strideElems),
        sliceShape.data(), sliceShape.size());
}
|
|
|
|
|
|
|
|
|
|
// Run one batched ONNX inference over `images` and postprocess each image
// independently. Mirrors detect(): the task type (segmentation, classify,
// end-to-end detection/OBB/pose, or legacy detection/OBB/pose) is inferred
// from the output tensor shapes, then each per-image slice of the batched
// output is routed to the matching postprocess method.
//
// Returns one Object vector per input image (results[i] corresponds to
// images[i]); an empty inner vector means no detections (or an unroutable
// output shape) for that image.
std::vector<std::vector<Object>> ONNXYOLO::detectBatch(
    const std::vector<cv::Mat>& images,
    const std::vector<std::string>& classNames,
    float confThreshold, float iouThreshold, int numKPS)
{
    // Reset per-call task flags before any early return.
    lastWasClassification = false;
    lastBatchWasClassification = false;
    if (images.empty()) return {};

    const size_t N = images.size();

    // Fallback to sequential if model has fixed batch=1
    // (input_node_dims[0] == 1 and not dynamic (-1))
    if (input_node_dims.size() >= 1 && input_node_dims[0] == 1) {
        std::vector<std::vector<Object>> results(N);
        for (size_t i = 0; i < N; ++i)
            results[i] = detect(images[i], classNames, confThreshold, iouThreshold, numKPS);
        // detect() sets lastWasClassification; propagate the final value.
        lastBatchWasClassification = lastWasClassification;
        return results;
    }

    // Store original sizes for per-image postprocessing
    std::vector<cv::Size> originalSizes;
    originalSizes.reserve(N);
    for (const auto& img : images)
        originalSizes.push_back(img.size());

    // Batch preprocess + single inference call
    Ort::Value inputTensor = transformBatch(images);

    auto outputTensors = ort_session->Run(
        Ort::RunOptions{ nullptr },
        input_node_names.data(),
        &inputTensor, 1,
        output_node_names.data(),
        num_outputs);

    // Output shape sanity check — see detect() for rationale. Prevents
    // DirectML-returned garbage dims from propagating into postprocess
    // and triggering multi-terabyte cv::Mat allocations on AMD.
    constexpr int64_t kMaxOutputDim = 1000000;
    for (size_t t = 0; t < outputTensors.size(); ++t) {
        const auto sh = outputTensors[t].GetTensorTypeAndShapeInfo().GetShape();
        for (size_t d = 0; d < sh.size(); ++d) {
            if (sh[d] < 0 || sh[d] > kMaxOutputDim) {
                std::cerr << "[ONNXYOLO] detectBatch: output[" << t
                    << "] dim[" << d << "]=" << sh[d]
                    << " is out of range — refusing to postprocess."
                    << std::endl;
                // N empty vectors — callers see "no detections".
                return std::vector<std::vector<Object>>(N);
            }
        }
    }

    // Network input size (W, H) taken from the model's NCHW input dims.
    // NOTE(review): for dynamic-shape models these dims may be -1 at
    // session-load time — assumed resolved by this point; confirm.
    const cv::Size resizedShape(
        static_cast<int>(input_node_dims[3]),
        static_cast<int>(input_node_dims[2]));

    // Determine task type from output shapes (same logic as detect())
    const size_t numOutputs = outputTensors.size();
    const auto shape0 = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape();

    // Per-image postprocessing
    std::vector<std::vector<Object>> results(N);

    for (size_t i = 0; i < N; ++i) {
        // Build per-image sliced output tensors. These slices alias the
        // batched outputTensors' storage, so they are only valid within
        // this loop body while outputTensors is alive.
        std::vector<Ort::Value> perImageOutputs;
        for (size_t t = 0; t < numOutputs; ++t) {
            auto tShape = outputTensors[t].GetTensorTypeAndShapeInfo().GetShape();
            perImageOutputs.push_back(
                sliceBatchOutput(outputTensors[t], static_cast<int64_t>(i),
                    tShape, *memory_info_handler));
        }

        // Dispatch to correct postprocess method.
        // Segmentation first: a second 4-D output is the mask prototype.
        if (numOutputs >= 2) {
            const auto protoShape = outputTensors[1].GetTensorTypeAndShapeInfo().GetShape();
            if (protoShape.size() == 4) {
                // Legacy seg layout has channels < boxes in output 0.
                if (shape0.size() >= 3 && shape0[1] < shape0[2]) {
                    results[i] = postprocessSegLegacy(originalSizes[i], resizedShape,
                        perImageOutputs, classNames,
                        confThreshold, iouThreshold);
                }
                else {
                    results[i] = postprocessSegEndToEnd(originalSizes[i], resizedShape,
                        perImageOutputs, classNames, confThreshold);
                }
                continue;
            }
        }

        // A rank-2 output means a classification head (no boxes).
        if (shape0.size() == 2) {
            lastBatchWasClassification = true;
            results[i] = postprocessClassify(perImageOutputs, classNames, originalSizes[i]);
            continue;
        }

        // Anything below rank 3 cannot be a detection output — skip image.
        if (shape0.size() < 3) continue;

        // Heuristic: end-to-end (NMS-fused) outputs are [1, numDet, feat]
        // with a small feature count; legacy outputs are [1, chan, boxes].
        // NOTE(review): the "<= 20" threshold is a heuristic — confirm it
        // matches detect()'s behaviour for pose models with few keypoints.
        const bool isEndToEnd = (shape0[1] > shape0[2]) || (shape0[2] <= 20);

        if (isEndToEnd) {
            const int features = static_cast<int>(shape0[2]);
            if (features == 6) {
                // [x1,y1,x2,y2,score,class] — plain detection
                results[i] = postprocessEndToEnd(originalSizes[i], resizedShape,
                    perImageOutputs, classNames, confThreshold);
            }
            else if (features == 7) {
                // 6 + angle — oriented bounding boxes
                results[i] = postprocessOBBEndToEnd(originalSizes[i], resizedShape,
                    perImageOutputs, classNames, confThreshold);
            }
            else if (features > 7 && (features - 6) % 3 == 0) {
                // 6 + 3 floats per keypoint — pose
                int nk = (numKPS > 0) ? numKPS : (features - 6) / 3;
                results[i] = postprocessPoseEndToEnd(originalSizes[i], resizedShape,
                    perImageOutputs, classNames,
                    confThreshold, nk);
            }
            else {
                // Unknown feature width — treat as plain detection.
                results[i] = postprocessEndToEnd(originalSizes[i], resizedShape,
                    perImageOutputs, classNames, confThreshold);
            }
        }
        else {
            // Legacy (pre-NMS) layout: [1, 4 + extra, numBoxes].
            const int nc = static_cast<int>(classNames.size());
            const int numChannels = static_cast<int>(shape0[1]);
            const int numBoxes = static_cast<int>(shape0[2]);
            const int extra = numChannels - 4;

            bool routed = false;
            // Explicit keypoint count from the caller wins.
            if (numKPS > 0 && numChannels >= 4 + 1 + numKPS * 3) {
                results[i] = postprocessPoseLegacy(originalSizes[i], resizedShape,
                    perImageOutputs, classNames,
                    confThreshold, iouThreshold, numKPS);
                routed = true;
            }
            // Channels beyond the class scores in multiples of 3 → pose.
            else if (nc > 0 && nc <= extra && extra > nc && (extra - nc) % 3 == 0 && (extra - nc) >= 3) {
                int nk = (extra - nc) / 3;
                results[i] = postprocessPoseLegacy(originalSizes[i], resizedShape,
                    perImageOutputs, classNames,
                    confThreshold, iouThreshold, nk);
                routed = true;
            }
            // Exactly one extra channel beyond the classes → OBB angle.
            else if (nc > 0 && nc <= extra && extra == nc + 1) {
                results[i] = postprocessOBBLegacy(originalSizes[i], resizedShape,
                    perImageOutputs, classNames,
                    confThreshold, iouThreshold);
                routed = true;
            }
            // Channels match the class count exactly → plain detection.
            else if (nc > 0 && nc <= extra && extra == nc) {
                results[i] = postprocessLegacy(originalSizes[i], resizedShape,
                    perImageOutputs, classNames,
                    confThreshold, iouThreshold);
                routed = true;
            }

            if (!routed) {
                // Class count mismatch — probe last channel for OBB angles
                bool likelyOBB = false;
                if (extra >= 2) {
                    const float* rawOutput = perImageOutputs[0].GetTensorMutableData<float>();
                    // Sample up to 100 boxes; values within ±pi (±3.15 rad
                    // with slack) in the last channel suggest angles.
                    int numSamp = std::min(numBoxes, 100);
                    int angleCount = 0;
                    for (int s = 0; s < numSamp; ++s) {
                        float v = rawOutput[(numChannels - 1) * numBoxes + s];
                        if (v >= -3.15f && v <= 3.15f) ++angleCount;
                    }
                    // Require >80% of sampled values to look like angles.
                    likelyOBB = (angleCount > numSamp * 8 / 10);
                }
                if (likelyOBB) {
                    results[i] = postprocessOBBLegacy(originalSizes[i], resizedShape,
                        perImageOutputs, classNames,
                        confThreshold, iouThreshold);
                }
                else {
                    results[i] = postprocessLegacy(originalSizes[i], resizedShape,
                        perImageOutputs, classNames,
                        confThreshold, iouThreshold);
                }
            }
        }
    }

    return results;
}
|
|
|
|
|
|
|
|
|
|
// ====================================================================
|
|
|
|
|
// ANSONNXYOLO — ANSODBase wrapper
|
|
|
|
|
// ====================================================================
|
|
|
|
|
|
|
|
|
|
// Destructor — releases the ORT engine via Destroy().
//
// A destructor must never let an exception escape (stack unwinding with a
// pending exception calls std::terminate), so in addition to the existing
// std::exception handler we also swallow non-std throw types with a
// catch-all. Logging failures here are best-effort.
ANSONNXYOLO::~ANSONNXYOLO() {
    try { Destroy(); }
    catch (const std::exception& e) {
        _logger.LogError("ANSONNXYOLO::~ANSONNXYOLO()", e.what(), __FILE__, __LINE__);
    }
    catch (...) {
        // Unknown exception type — must not propagate out of a destructor.
        _logger.LogError("ANSONNXYOLO::~ANSONNXYOLO()", "unknown exception", __FILE__, __LINE__);
    }
}
|
|
|
|
|
|
|
|
|
|
bool ANSONNXYOLO::Destroy() {
|
|
|
|
|
try { m_ortEngine.reset(); return true; }
|
|
|
|
|
catch (const std::exception& e) {
|
|
|
|
|
_logger.LogError("ANSONNXYOLO::Destroy", e.what(), __FILE__, __LINE__);
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
bool ANSONNXYOLO::OptimizeModel(bool fp16, std::string& optimizedModelFolder) {
|
|
|
|
|
if (!ANSODBase::OptimizeModel(fp16, optimizedModelFolder)) return false;
|
|
|
|
|
optimizedModelFolder = _modelFolder;
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
bool ANSONNXYOLO::InitOrtEngine() {
|
|
|
|
|
try {
|
|
|
|
|
if (!FileExist(_modelFilePath)) {
|
|
|
|
|
_logger.LogError("ANSONNXYOLO::InitOrtEngine",
|
|
|
|
|
"Model file does not exist: " + _modelFilePath, __FILE__, __LINE__);
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
m_ortEngine = std::make_unique<ONNXYOLO>(_modelFilePath);
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
catch (const std::exception& e) {
|
|
|
|
|
_logger.LogFatal("ANSONNXYOLO::InitOrtEngine", e.what(), __FILE__, __LINE__);
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2026-04-08 13:45:52 +10:00
|
|
|
// Create the ONNX Runtime engine from _modelFilePath on an explicitly
// requested execution provider. Returns false (with a log entry) if the
// model file is missing or session creation throws.
bool ANSONNXYOLO::InitOrtEngine(ANSCENTER::EngineType engineType) {
    try {
        const bool modelOnDisk = FileExist(_modelFilePath);
        if (!modelOnDisk) {
            _logger.LogError("ANSONNXYOLO::InitOrtEngine",
                "Model file does not exist: " + _modelFilePath, __FILE__, __LINE__);
            return false;
        }
        m_ortEngine = std::make_unique<ONNXYOLO>(_modelFilePath, engineType);
        return true;
    }
    catch (const std::exception& e) {
        _logger.LogFatal("ANSONNXYOLO::InitOrtEngine", e.what(), __FILE__, __LINE__);
        return false;
    }
}
|
|
|
|
|
|
2026-04-10 17:13:47 +10:00
|
|
|
// ========================================================================
|
2026-04-12 17:16:16 +10:00
|
|
|
// WarmUpEngine — run a dummy inference after session creation.
|
2026-04-10 17:13:47 +10:00
|
|
|
//
|
2026-04-12 17:16:16 +10:00
|
|
|
// Scope: **NVIDIA (CUDA EP) only.** On first inference, the CUDA EP
|
|
|
|
|
// allocates its memory arena (capped at 2 GB via BasicOrtHandler config),
|
|
|
|
|
// resolves cuDNN convolution algorithms, and populates the kernel launch
|
|
|
|
|
// cache. Running one dummy inference at load time amortises this cost
|
|
|
|
|
// so the first real frame doesn't see a latency spike.
|
2026-04-10 17:13:47 +10:00
|
|
|
//
|
2026-04-12 17:16:16 +10:00
|
|
|
// Explicitly disabled on AMD, Intel and CPU:
|
|
|
|
|
// • AMD (DirectML) — calling detect() at load time has been observed
|
|
|
|
|
// to hit a multi-terabyte cv::Mat allocation inside postprocessLegacy
|
|
|
|
|
// on AMD RDNA iGPUs when DirectML returns garbage output tensor
|
|
|
|
|
// dims. ONNXYOLO::detect() now has an output-shape sanity guard
|
|
|
|
|
// that catches this at runtime, so the warm-up would add risk
|
|
|
|
|
// without benefit. Earlier builds enabled warm-up specifically for
|
|
|
|
|
// Radeon 680M TDR mitigation; that workaround is obsolete with
|
|
|
|
|
// current DirectML 1.15.x drivers.
|
|
|
|
|
// • Intel (OpenVINO) — running detect() at load time has been
|
|
|
|
|
// observed to expose latent heap-corruption bugs
|
|
|
|
|
// (ntdll +0x1176e5 / STATUS_HEAP_CORRUPTION 0xc0000374).
|
|
|
|
|
// • CPU EP — no shader compile or kernel cache to warm up; the first
|
|
|
|
|
// real frame has the same latency as any subsequent frame.
|
2026-04-10 17:13:47 +10:00
|
|
|
//
|
2026-04-12 17:16:16 +10:00
|
|
|
// Non-fatal on failure: if warm-up itself throws, regular inference
|
|
|
|
|
// still works — the engine is fully loaded before WarmUpEngine runs.
|
2026-04-10 17:13:47 +10:00
|
|
|
// ========================================================================
|
|
|
|
|
void ANSONNXYOLO::WarmUpEngine() {
|
|
|
|
|
if (!m_ortEngine) return;
|
|
|
|
|
|
2026-04-12 17:16:16 +10:00
|
|
|
// Gate strictly on NVIDIA_GPU. Every other EP is a no-op.
|
|
|
|
|
if (m_ortEngine->getEngineType() != EngineType::NVIDIA_GPU) {
|
|
|
|
|
ANS_DBG("ONNXYOLO", "Warm-up skipped (non-NVIDIA EP)");
|
2026-04-10 17:13:47 +10:00
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
2026-04-12 17:16:16 +10:00
|
|
|
// ── Strict dimension validation ─────────────────────────────────
|
|
|
|
|
// Defensive: refuse to warm up with implausible model dimensions.
|
|
|
|
|
// _modelConfig values come from the caller's ModelConfig and are
|
|
|
|
|
// normally 224..640; anything outside [32, 4096] is almost certainly
|
|
|
|
|
// a bug in the caller and we skip warm-up rather than risk a huge
|
|
|
|
|
// cv::Mat allocation inside detect().
|
|
|
|
|
constexpr int kMinDim = 32;
|
|
|
|
|
constexpr int kMaxDim = 4096;
|
|
|
|
|
const int rawW = _modelConfig.inpWidth;
|
|
|
|
|
const int rawH = _modelConfig.inpHeight;
|
|
|
|
|
if (rawW <= 0 || rawH <= 0 || rawW > kMaxDim || rawH > kMaxDim) {
|
|
|
|
|
_logger.LogWarn("ANSONNXYOLO::WarmUpEngine",
|
|
|
|
|
"Warm-up skipped — suspect input dims ("
|
|
|
|
|
+ std::to_string(rawW) + "x" + std::to_string(rawH) + ")",
|
|
|
|
|
__FILE__, __LINE__);
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
const int w = std::clamp(rawW, kMinDim, kMaxDim);
|
|
|
|
|
const int h = std::clamp(rawH, kMinDim, kMaxDim);
|
2026-04-10 17:13:47 +10:00
|
|
|
|
2026-04-12 17:16:16 +10:00
|
|
|
try {
|
2026-04-10 17:13:47 +10:00
|
|
|
// Mid-gray BGR image matches the letterbox fill colour used in
|
|
|
|
|
// preprocessing (114,114,114 ~ 128) and avoids degenerate inputs.
|
|
|
|
|
cv::Mat dummy(h, w, CV_8UC3, cv::Scalar(128, 128, 128));
|
|
|
|
|
|
2026-04-12 17:16:16 +10:00
|
|
|
ANS_DBG("ONNXYOLO", "Warm-up: running 1 dummy CUDA inference (%dx%d)", w, h);
|
|
|
|
|
|
|
|
|
|
auto t0 = std::chrono::steady_clock::now();
|
|
|
|
|
(void)m_ortEngine->detect(dummy, _classes,
|
|
|
|
|
PROBABILITY_THRESHOLD,
|
|
|
|
|
NMS_THRESHOLD,
|
|
|
|
|
NUM_KPS);
|
|
|
|
|
auto t1 = std::chrono::steady_clock::now();
|
|
|
|
|
auto ms = std::chrono::duration_cast<std::chrono::milliseconds>(t1 - t0).count();
|
|
|
|
|
ANS_DBG("ONNXYOLO", "Warm-up done: %lld ms", (long long)ms);
|
|
|
|
|
}
|
|
|
|
|
catch (const cv::Exception& e) {
|
|
|
|
|
// Defensive — should not fire on NVIDIA CUDA EP, but if it does
|
|
|
|
|
// the engine itself is still loaded and real inference will work.
|
|
|
|
|
_logger.LogWarn("ANSONNXYOLO::WarmUpEngine",
|
|
|
|
|
std::string("Warm-up skipped (cv::Exception, non-fatal): ") + e.what(),
|
|
|
|
|
__FILE__, __LINE__);
|
2026-04-10 17:13:47 +10:00
|
|
|
}
|
|
|
|
|
catch (const std::exception& e) {
|
2026-04-12 17:16:16 +10:00
|
|
|
_logger.LogWarn("ANSONNXYOLO::WarmUpEngine",
|
|
|
|
|
std::string("Warm-up skipped (std::exception, non-fatal): ") + e.what(),
|
|
|
|
|
__FILE__, __LINE__);
|
|
|
|
|
}
|
|
|
|
|
catch (...) {
|
|
|
|
|
_logger.LogWarn("ANSONNXYOLO::WarmUpEngine",
|
|
|
|
|
"Warm-up skipped (unknown exception, non-fatal)",
|
2026-04-10 17:13:47 +10:00
|
|
|
__FILE__, __LINE__);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2026-03-28 16:54:11 +11:00
|
|
|
// Primary initialisation: validate the license, unpack/load the model zip
// via the base class, normalise the model configuration, load class names,
// optionally create the ORT engine, and warm it up. Returns false (with a
// log entry) on any failure. Thread-safe: holds _mutex for the duration
// and flags _modelLoading so inference entry points fail fast.
bool ANSONNXYOLO::Initialize(std::string licenseKey, ModelConfig modelConfig,
    const std::string& modelZipFilePath,
    const std::string& modelZipPassword,
    std::string& labelMap)
{
    std::lock_guard<std::recursive_mutex> lock(_mutex);
    // RAII flag: _modelLoading is true for the whole call so RunInference/
    // DetectObjects return early instead of racing a half-loaded model.
    ModelLoadingGuard mlg(_modelLoading);
    try {
        // Invalidate until the full sequence completes.
        _modelLoadValid = false;
        bool result = ANSODBase::Initialize(licenseKey, modelConfig,
            modelZipFilePath, modelZipPassword, labelMap);
        if (!result) return false;

        // Normalise config: clamp degenerate sizes/thresholds to defaults.
        _modelConfig = modelConfig;
        if (_modelConfig.inpHeight <= 0) _modelConfig.inpHeight = 640;
        if (_modelConfig.inpWidth <= 0) _modelConfig.inpWidth = 640;
        if (_modelConfig.modelMNSThreshold < 0.2f) _modelConfig.modelMNSThreshold = 0.45f;
        if (_modelConfig.modelConfThreshold < 0.2f) _modelConfig.modelConfThreshold = 0.25f;

        // Cache runtime inference parameters from the config.
        PROBABILITY_THRESHOLD = _modelConfig.detectionScoreThreshold;
        NMS_THRESHOLD = _modelConfig.modelMNSThreshold;
        TOP_K = 300;
        NUM_KPS = _modelConfig.numKPS;
        KPS_THRESHOLD = _modelConfig.kpsThreshold;

        _modelFilePath = CreateFilePath(_modelFolder, "train_last.onnx");
        if (FileExist(_modelConfigFile)) {
            // Preferred path: class names + input shape from the config file.
            ModelType modelType;
            std::vector<int> inputShape;
            _classes = ANSUtilityHelper::GetConfigFileContent(_modelConfigFile, modelType, inputShape);
            if (inputShape.size() == 2) {
                // Config file overrides caller-supplied input dims.
                if (inputShape[0] > 0) _modelConfig.inpHeight = inputShape[0];
                if (inputShape[1] > 0) _modelConfig.inpWidth = inputShape[1];
            }
        }
        else {
            // Fallback: classes.names file, else an embedded class string.
            _classFilePath = CreateFilePath(_modelFolder, "classes.names");
            std::ifstream isValid(_classFilePath);
            if (!isValid) LoadClassesFromString();
            else LoadClassesFromFile();
        }

        // Report the class list back to the caller as "a,b,c".
        labelMap.clear();
        if (!_classes.empty())
            labelMap = VectorToCommaSeparatedString(_classes);

        if (this->_loadEngineOnCreation) {
            if (!InitOrtEngine()) {
                _logger.LogError("ANSONNXYOLO::Initialize",
                    "Failed to create ONNX Runtime engine: " + _modelFilePath,
                    __FILE__, __LINE__);
                return false;
            }
        }

        // Fix input resolution for dynamic-shape models.
        // The constructor defaults to 640x640 when ONNX dims are dynamic,
        // but the ModelConfig may specify the correct size (e.g. 224x224
        // for classification models). Override here after config is loaded.
        if (m_ortEngine && m_ortEngine->hasDynamicInputShape()) {
            if (_modelConfig.inpHeight > 0 && _modelConfig.inpWidth > 0) {
                m_ortEngine->setInputShape(_modelConfig.inpWidth, _modelConfig.inpHeight);
            }
        }

        // Warm up the engine. Currently only acts on the NVIDIA CUDA EP
        // (see WarmUpEngine(), which skips all other providers). Non-fatal.
        WarmUpEngine();

        _modelLoadValid = true;
        _isInitialized = true;
        return true;
    }
    catch (const std::exception& e) {
        _logger.LogFatal("ANSONNXYOLO::Initialize", e.what(), __FILE__, __LINE__);
        return false;
    }
}
|
|
|
|
|
|
|
|
|
|
// Reload the model from a zip without re-validating the license/config the
// way Initialize() does. Follows the same sequence: base-class load,
// config normalisation, class-name loading, optional engine creation,
// dynamic-shape fix-up and warm-up.
// NOTE(review): unlike Initialize(), this does not clear _modelLoadValid
// up front — the ModelLoadingGuard is what keeps inference out during the
// reload; confirm that is intentional.
bool ANSONNXYOLO::LoadModel(const std::string& modelZipFilePath,
    const std::string& modelZipPassword)
{
    std::lock_guard<std::recursive_mutex> lock(_mutex);
    // RAII flag: inference entry points fail fast while this is set.
    ModelLoadingGuard mlg(_modelLoading);
    try {
        bool result = ANSODBase::LoadModel(modelZipFilePath, modelZipPassword);
        if (!result) return false;

        // Normalise config: clamp degenerate sizes/thresholds to defaults.
        if (_modelConfig.inpHeight <= 0) _modelConfig.inpHeight = 640;
        if (_modelConfig.inpWidth <= 0) _modelConfig.inpWidth = 640;
        if (_modelConfig.modelMNSThreshold < 0.2f) _modelConfig.modelMNSThreshold = 0.45f;
        if (_modelConfig.modelConfThreshold < 0.2f) _modelConfig.modelConfThreshold = 0.25f;

        // Cache runtime inference parameters from the config.
        PROBABILITY_THRESHOLD = _modelConfig.detectionScoreThreshold;
        NMS_THRESHOLD = _modelConfig.modelMNSThreshold;
        TOP_K = 300;
        NUM_KPS = _modelConfig.numKPS;
        KPS_THRESHOLD = _modelConfig.kpsThreshold;

        _modelFilePath = CreateFilePath(_modelFolder, "train_last.onnx");
        if (FileExist(_modelConfigFile)) {
            // Preferred path: class names + input shape from the config file.
            ModelType modelType;
            std::vector<int> inputShape;
            _classes = ANSUtilityHelper::GetConfigFileContent(_modelConfigFile, modelType, inputShape);
            if (inputShape.size() == 2) {
                // Config file overrides the current input dims.
                if (inputShape[0] > 0) _modelConfig.inpHeight = inputShape[0];
                if (inputShape[1] > 0) _modelConfig.inpWidth = inputShape[1];
            }
        }
        else {
            // Fallback: classes.names file, else an embedded class string.
            _classFilePath = CreateFilePath(_modelFolder, "classes.names");
            std::ifstream isValid(_classFilePath);
            if (!isValid) LoadClassesFromString();
            else LoadClassesFromFile();
        }

        if (this->_loadEngineOnCreation) {
            if (!InitOrtEngine()) { _modelLoadValid = false; return false; }
        }

        // Fix input resolution for dynamic-shape models (same as primary Initialize)
        if (m_ortEngine && m_ortEngine->hasDynamicInputShape()) {
            if (_modelConfig.inpHeight > 0 && _modelConfig.inpWidth > 0) {
                m_ortEngine->setInputShape(_modelConfig.inpWidth, _modelConfig.inpHeight);
            }
        }

        // Warm up the engine. Currently only acts on the NVIDIA CUDA EP
        // (see WarmUpEngine(), which skips all other providers). Non-fatal.
        WarmUpEngine();

        _modelLoadValid = true;
        _isInitialized = true;
        return true;
    }
    catch (const std::exception& e) {
        _logger.LogFatal("ANSONNXYOLO::LoadModel", e.what(), __FILE__, __LINE__);
        return false;
    }
}
|
|
|
|
|
|
|
|
|
|
// Initialise from an already-unpacked model folder instead of a zip.
// `modelName` selects the .onnx file (defaults to "train_last") and
// `className` names the class-list file used when no model config file is
// present. Otherwise follows the same sequence as Initialize().
bool ANSONNXYOLO::LoadModelFromFolder(std::string licenseKey, ModelConfig modelConfig,
    std::string modelName, std::string className,
    const std::string& modelFolder,
    std::string& labelMap)
{
    std::lock_guard<std::recursive_mutex> lock(_mutex);
    // RAII flag: inference entry points fail fast while this is set.
    ModelLoadingGuard mlg(_modelLoading);
    try {
        bool result = ANSODBase::LoadModelFromFolder(licenseKey, modelConfig,
            modelName, className,
            modelFolder, labelMap);
        if (!result) return false;

        // Normalise config: clamp degenerate sizes/thresholds to defaults.
        _modelConfig = modelConfig;
        if (_modelConfig.inpHeight <= 0) _modelConfig.inpHeight = 640;
        if (_modelConfig.inpWidth <= 0) _modelConfig.inpWidth = 640;
        if (_modelConfig.modelMNSThreshold < 0.2f) _modelConfig.modelMNSThreshold = 0.45f;
        if (_modelConfig.modelConfThreshold < 0.2f) _modelConfig.modelConfThreshold = 0.25f;

        // Cache runtime inference parameters from the config.
        PROBABILITY_THRESHOLD = _modelConfig.detectionScoreThreshold;
        NMS_THRESHOLD = _modelConfig.modelMNSThreshold;
        TOP_K = 300;
        NUM_KPS = _modelConfig.numKPS;
        KPS_THRESHOLD = _modelConfig.kpsThreshold;

        // Resolve the ONNX file name; empty means the default model.
        std::string _modelName = modelName;
        if (_modelName.empty()) _modelName = "train_last";
        std::string modelFullName = _modelName + ".onnx";

        _modelFilePath = CreateFilePath(_modelFolder, modelFullName);
        if (FileExist(_modelConfigFile)) {
            // Preferred path: class names + input shape from the config file.
            ModelType modelType;
            std::vector<int> inputShape;
            _classes = ANSUtilityHelper::GetConfigFileContent(_modelConfigFile, modelType, inputShape);
            if (inputShape.size() == 2) {
                // Config file overrides caller-supplied input dims.
                if (inputShape[0] > 0) _modelConfig.inpHeight = inputShape[0];
                if (inputShape[1] > 0) _modelConfig.inpWidth = inputShape[1];
            }
        }
        else {
            // Fallback: caller-named class file, else an embedded string.
            _classFilePath = CreateFilePath(_modelFolder, className);
            std::ifstream isValid(_classFilePath);
            if (!isValid) LoadClassesFromString();
            else LoadClassesFromFile();
        }

        // Report the class list back to the caller as "a,b,c".
        labelMap.clear();
        if (!_classes.empty())
            labelMap = VectorToCommaSeparatedString(_classes);

        if (this->_loadEngineOnCreation) {
            if (!InitOrtEngine()) { _modelLoadValid = false; return false; }
        }

        // Fix input resolution for dynamic-shape models (same as primary Initialize)
        if (m_ortEngine && m_ortEngine->hasDynamicInputShape()) {
            if (_modelConfig.inpHeight > 0 && _modelConfig.inpWidth > 0) {
                m_ortEngine->setInputShape(_modelConfig.inpWidth, _modelConfig.inpHeight);
            }
        }

        // Warm up the engine. Currently only acts on the NVIDIA CUDA EP
        // (see WarmUpEngine(), which skips all other providers). Non-fatal.
        WarmUpEngine();

        _modelLoadValid = true;
        _isInitialized = true;
        return true;
    }
    catch (const std::exception& e) {
        _logger.LogFatal("ANSONNXYOLO::LoadModelFromFolder", e.what(), __FILE__, __LINE__);
        return false;
    }
}
|
|
|
|
|
|
|
|
|
|
// Convenience overload: run inference without a camera id by forwarding
// an empty one to the full overload.
std::vector<Object> ANSONNXYOLO::RunInference(const cv::Mat& inputImgBGR) {
    const std::string noCameraId{};
    return RunInference(inputImgBGR, noCameraId);
}
|
|
|
|
|
|
|
|
|
|
// Public inference entry point. Validates engine state under the lock,
// then delegates the actual work to DetectObjects(). Returns an empty
// vector on any validation failure, lock timeout, or exception (all
// logged except the trivially-small-image case).
std::vector<Object> ANSONNXYOLO::RunInference(const cv::Mat& inputImgBGR,
    const std::string& camera_id)
{
    // Fail fast while Initialize/LoadModel is running on another thread.
    if (_modelLoading.load()) return {};
    {
        // Bounded lock acquisition — give up (empty result) rather than
        // block a frame pipeline behind a long model load.
        auto lock = TryLockWithTimeout("ANSONNXYOLO::RunInference");
        if (!lock.owns_lock()) return {};
        if (!_modelLoadValid) {
            _logger.LogError("ANSONNXYOLO::RunInference", "Model not loaded", __FILE__, __LINE__);
            return {};
        }
        if (!_licenseValid) {
            _logger.LogError("ANSONNXYOLO::RunInference", "Invalid license", __FILE__, __LINE__);
            return {};
        }
        if (!_isInitialized) {
            _logger.LogError("ANSONNXYOLO::RunInference", "Model not initialized", __FILE__, __LINE__);
            return {};
        }
        // Reject empty or implausibly small frames (< 10 px per side).
        if (inputImgBGR.empty() || inputImgBGR.cols < 10 || inputImgBGR.rows < 10)
            return {};
    }
    // Lock released — DetectObjects does its own brief locking.
    try { return DetectObjects(inputImgBGR, camera_id); }
    catch (const std::exception& e) {
        _logger.LogFatal("ANSONNXYOLO::RunInference", e.what(), __FILE__, __LINE__);
        return {};
    }
}
|
|
|
|
|
|
|
|
|
|
// Core single-image detection pipeline:
//   1. snapshot inference parameters under a brief lock,
//   2. optionally swap the input for a full-resolution BGR frame rebuilt
//      from the thread-local GPU NV12 frame (NV12 fast path),
//   3. run ORT inference lock-free,
//   4. rescale results back to the caller's resolution, tag camera id,
//      and apply tracking/stabilization.
// On a DirectML device-lost error (0x887A0005) it attempts a one-time
// recreation of the session on the CPU EP. Returns an empty vector on
// any failure.
std::vector<Object> ANSONNXYOLO::DetectObjects(const cv::Mat& inputImage,
    const std::string& camera_id)
{
    try {
        // Fail-fast if a model load/init is in progress on another thread
        if (_modelLoading.load()) {
            ANS_DBG("ONNXYOLO", "DetectObjects: skipped — model loading in progress, cam=%s", camera_id.c_str());
            return {};
        }

        // ── Snapshot config under a brief lock ──────────────────
        // Only hold _mutex long enough to validate state and copy
        // the parameters needed for inference. The actual ORT
        // detect() call runs outside the lock so that concurrent
        // Initialize/LoadModel calls are not blocked for the full
        // duration of inference.
        float probThresh, nmsThresh;
        int numKps;
        std::vector<std::string> classes;
        bool trackerEnabled, stabilizationEnabled;
        {
            ANS_DBG("ONNXYOLO", "DetectObjects: cam=%s acquiring mutex...", camera_id.c_str());
            auto lk = TryLockWithTimeout("ANSONNXYOLO::DetectObjects");
            if (!lk.owns_lock()) return {}; // timed out
            ANS_DBG("ONNXYOLO", "DetectObjects: mutex acquired, cam=%s", camera_id.c_str());

            if (!m_ortEngine) {
                _logger.LogError("ANSONNXYOLO::DetectObjects", "ORT engine is null", __FILE__, __LINE__);
                ANS_DBG("ONNXYOLO", "DetectObjects: ORT engine is null!");
                return {};
            }

            // Snapshot parameters while locked
            probThresh = PROBABILITY_THRESHOLD;
            nmsThresh = NMS_THRESHOLD;
            numKps = NUM_KPS;
            classes = _classes;
            trackerEnabled = _trackerEnabled;
            stabilizationEnabled = _stabilizationEnabled;
        }
        // ── _mutex released — heavy work below runs lock-free ───
        // NOTE(review): m_ortEngine is dereferenced below without the
        // lock; a concurrent reset (e.g. Destroy or the DML fallback on
        // another thread) would race this — _modelLoading mitigates but
        // does not fully close that window. Confirm acceptable.

        // --- NV12 fast path: try to get full-res BGR from GPU NV12 frame ---
        // When the decoder left a CPU-mapped NV12 frame for this thread,
        // rebuild a full-resolution BGR image from its Y/UV planes and run
        // inference on that instead of the (possibly downscaled) input.
        cv::Mat inferenceImage = inputImage;
        float bgrScaleX = 1.0f, bgrScaleY = 1.0f;
        {
            auto* gpuData = tl_currentGpuFrame();
            if (gpuData && gpuData->width > 0 && gpuData->height > 0) {
                // Require both planes and sane line strides.
                if (gpuData->cpuYPlane && gpuData->cpuUvPlane &&
                    gpuData->cpuYLinesize >= gpuData->width &&
                    gpuData->cpuUvLinesize >= gpuData->width) {
                    const int fw = gpuData->width;
                    const int fh = gpuData->height;
                    // NV12 requires even dimensions (UV is 2x2 subsampled).
                    if ((fw % 2) == 0 && (fh % 2) == 0) {
                        try {
                            // Wrap the planes without copying.
                            cv::Mat yPlane(fh, fw, CV_8UC1,
                                gpuData->cpuYPlane, static_cast<size_t>(gpuData->cpuYLinesize));
                            cv::Mat uvPlane(fh / 2, fw / 2, CV_8UC2,
                                gpuData->cpuUvPlane, static_cast<size_t>(gpuData->cpuUvLinesize));
                            cv::Mat fullResBGR;
                            cv::cvtColorTwoPlane(yPlane, uvPlane, fullResBGR, cv::COLOR_YUV2BGR_NV12);
                            if (!fullResBGR.empty()) {
                                // Remember how to map detections back to the
                                // caller's (display) resolution.
                                bgrScaleX = static_cast<float>(inputImage.cols) / fullResBGR.cols;
                                bgrScaleY = static_cast<float>(inputImage.rows) / fullResBGR.rows;
                                inferenceImage = fullResBGR;
                            }
                        } catch (...) { /* NV12 conversion failed — fall back to inputImage */ }
                    }
                }
            }
        }

        // Run ORT inference — no mutex held, this is the expensive call
        auto results = m_ortEngine->detect(inferenceImage, classes,
            probThresh,
            nmsThresh,
            numKps);

        // --- Rescale coordinates from full-res back to display-res ---
        if (bgrScaleX != 1.0f || bgrScaleY != 1.0f) {
            for (auto& obj : results) {
                obj.box.x = static_cast<int>(obj.box.x * bgrScaleX);
                obj.box.y = static_cast<int>(obj.box.y * bgrScaleY);
                obj.box.width = static_cast<int>(obj.box.width * bgrScaleX);
                obj.box.height = static_cast<int>(obj.box.height * bgrScaleY);
                // Keypoints are stored as a flat [x0,y0,x1,y1,...] array.
                for (size_t k = 0; k < obj.kps.size(); k += 2) {
                    obj.kps[k] *= bgrScaleX; // x
                    if (k + 1 < obj.kps.size())
                        obj.kps[k + 1] *= bgrScaleY; // y
                }
                for (auto& pt : obj.polygon) {
                    pt.x *= bgrScaleX;
                    pt.y *= bgrScaleY;
                }
            }
        }

        // Tag every detection with the originating camera.
        for (auto& obj : results)
            obj.cameraId = camera_id;

        // Tracking/stabilization (ApplyTracking has its own lock)
        if (trackerEnabled && !m_ortEngine->lastWasClassification) {
            results = ApplyTracking(results, camera_id);
            if (stabilizationEnabled) results = StabilizeDetections(results, camera_id);
        }
        return results;
    }
    catch (const std::exception& e) {
        const std::string msg = e.what();

        // DXGI_ERROR_DEVICE_REMOVED surfaces from DirectML with this HRESULT
        // embedded in the message text. Attempt a one-time CPU fallback.
        if (msg.find("887A0005") != std::string::npos) {
            // NOTE(review): _dmlDeviceLost is tested/set without the lock —
            // two threads could both enter; confirm benign.
            if (!_dmlDeviceLost) {
                _dmlDeviceLost = true;
                _logger.LogFatal("ANSONNXYOLO::DetectObjects",
                    "DirectML GPU device lost (887A0005) — attempting CPU fallback",
                    __FILE__, __LINE__);
                ANS_DBG("ONNXYOLO", "DML device lost — recreating session on CPU");
                try {
                    // Recreate the session under the lock so no other thread
                    // observes a half-torn-down engine.
                    std::lock_guard<std::recursive_mutex> lk(_mutex);
                    m_ortEngine.reset();
                    if (InitOrtEngine(ANSCENTER::EngineType::CPU)) {
                        _logger.LogInfo("ANSONNXYOLO::DetectObjects",
                            "CPU fallback session created successfully",
                            __FILE__, __LINE__);
                        ANS_DBG("ONNXYOLO", "CPU fallback OK");
                    } else {
                        _logger.LogFatal("ANSONNXYOLO::DetectObjects",
                            "CPU fallback session creation failed",
                            __FILE__, __LINE__);
                    }
                } catch (const std::exception& re) {
                    _logger.LogFatal("ANSONNXYOLO::DetectObjects",
                        std::string("CPU fallback exception: ") + re.what(),
                        __FILE__, __LINE__);
                }
            }
            return {};
        }

        ANS_DBG("ONNXYOLO", "DetectObjects EXCEPTION: %s cam=%s", e.what(), camera_id.c_str());
        _logger.LogFatal("ANSONNXYOLO::DetectObjects", e.what(), __FILE__, __LINE__);
        return {};
    }
}
|
|
|
|
|
|
|
|
|
|
// ====================================================================
|
|
|
|
|
// RunInferencesBatch / DetectObjectsBatch — true ONNX batch
|
|
|
|
|
// ====================================================================
|
|
|
|
|
|
|
|
|
|
std::vector<std::vector<Object>> ANSONNXYOLO::RunInferencesBatch(
|
|
|
|
|
const std::vector<cv::Mat>& inputs, const std::string& camera_id)
|
|
|
|
|
{
|
2026-04-13 19:48:32 +10:00
|
|
|
if (_modelLoading.load()) return {};
|
2026-03-28 16:54:11 +11:00
|
|
|
{
|
2026-04-13 19:48:32 +10:00
|
|
|
auto lock = TryLockWithTimeout("ANSONNXYOLO::RunInferencesBatch");
|
|
|
|
|
if (!lock.owns_lock()) return {};
|
2026-03-28 16:54:11 +11:00
|
|
|
if (!_modelLoadValid) {
|
|
|
|
|
_logger.LogFatal("ANSONNXYOLO::RunInferencesBatch",
|
|
|
|
|
"Cannot load ONNX model", __FILE__, __LINE__);
|
|
|
|
|
return {};
|
|
|
|
|
}
|
|
|
|
|
if (!_licenseValid) {
|
|
|
|
|
_logger.LogFatal("ANSONNXYOLO::RunInferencesBatch",
|
|
|
|
|
"Invalid license", __FILE__, __LINE__);
|
|
|
|
|
return {};
|
|
|
|
|
}
|
|
|
|
|
if (!_isInitialized) {
|
|
|
|
|
_logger.LogFatal("ANSONNXYOLO::RunInferencesBatch",
|
|
|
|
|
"Model not initialized", __FILE__, __LINE__);
|
|
|
|
|
return {};
|
|
|
|
|
}
|
|
|
|
|
if (inputs.empty()) {
|
|
|
|
|
_logger.LogWarn("ANSONNXYOLO::RunInferencesBatch",
|
|
|
|
|
"Empty input batch", __FILE__, __LINE__);
|
|
|
|
|
return {};
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
try {
|
|
|
|
|
return DetectObjectsBatch(inputs, camera_id);
|
|
|
|
|
}
|
|
|
|
|
catch (const std::exception& e) {
|
2026-04-09 08:09:02 +10:00
|
|
|
const std::string msg = e.what();
|
|
|
|
|
if (msg.find("887A0005") != std::string::npos) {
|
|
|
|
|
if (!_dmlDeviceLost) {
|
|
|
|
|
_dmlDeviceLost = true;
|
|
|
|
|
_logger.LogFatal("ANSONNXYOLO::RunInferencesBatch",
|
|
|
|
|
"DirectML GPU device lost (887A0005) — attempting CPU fallback",
|
|
|
|
|
__FILE__, __LINE__);
|
|
|
|
|
try {
|
|
|
|
|
m_ortEngine.reset();
|
|
|
|
|
if (!InitOrtEngine(ANSCENTER::EngineType::CPU))
|
|
|
|
|
_logger.LogFatal("ANSONNXYOLO::RunInferencesBatch",
|
|
|
|
|
"CPU fallback session creation failed", __FILE__, __LINE__);
|
|
|
|
|
} catch (...) {}
|
|
|
|
|
}
|
|
|
|
|
return {};
|
|
|
|
|
}
|
2026-03-28 16:54:11 +11:00
|
|
|
_logger.LogFatal("ANSONNXYOLO::RunInferencesBatch",
|
|
|
|
|
e.what(), __FILE__, __LINE__);
|
|
|
|
|
return {};
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
std::vector<std::vector<Object>> ANSONNXYOLO::DetectObjectsBatch(
|
|
|
|
|
const std::vector<cv::Mat>& inputImages, const std::string& camera_id)
|
|
|
|
|
{
|
|
|
|
|
try {
|
2026-04-13 19:48:32 +10:00
|
|
|
if (_modelLoading.load()) return {};
|
|
|
|
|
|
|
|
|
|
// Snapshot config under brief lock
|
|
|
|
|
float probThresh, nmsThresh;
|
|
|
|
|
int numKps;
|
|
|
|
|
std::vector<std::string> classes;
|
|
|
|
|
bool trackerEnabled, stabilizationEnabled;
|
|
|
|
|
{
|
|
|
|
|
auto lk = TryLockWithTimeout("ANSONNXYOLO::DetectObjectsBatch");
|
|
|
|
|
if (!lk.owns_lock()) return {};
|
|
|
|
|
|
|
|
|
|
if (!m_ortEngine) {
|
|
|
|
|
_logger.LogError("ANSONNXYOLO::DetectObjectsBatch",
|
|
|
|
|
"ORT engine is null", __FILE__, __LINE__);
|
|
|
|
|
return {};
|
|
|
|
|
}
|
|
|
|
|
probThresh = PROBABILITY_THRESHOLD;
|
|
|
|
|
nmsThresh = NMS_THRESHOLD;
|
|
|
|
|
numKps = NUM_KPS;
|
|
|
|
|
classes = _classes;
|
|
|
|
|
trackerEnabled = _trackerEnabled;
|
|
|
|
|
stabilizationEnabled = _stabilizationEnabled;
|
2026-03-28 16:54:11 +11:00
|
|
|
}
|
|
|
|
|
|
2026-04-13 19:48:32 +10:00
|
|
|
// Heavy work outside lock
|
2026-03-28 16:54:11 +11:00
|
|
|
auto batchResults = m_ortEngine->detectBatch(
|
2026-04-13 19:48:32 +10:00
|
|
|
inputImages, classes, probThresh, nmsThresh, numKps);
|
2026-03-28 16:54:11 +11:00
|
|
|
|
|
|
|
|
const bool isClassification = m_ortEngine->lastBatchWasClassification;
|
|
|
|
|
|
|
|
|
|
for (auto& results : batchResults) {
|
|
|
|
|
for (auto& obj : results)
|
|
|
|
|
obj.cameraId = camera_id;
|
|
|
|
|
|
2026-04-13 19:48:32 +10:00
|
|
|
if (trackerEnabled && !isClassification) {
|
2026-03-28 16:54:11 +11:00
|
|
|
results = ApplyTracking(results, camera_id);
|
2026-04-13 19:48:32 +10:00
|
|
|
if (stabilizationEnabled) results = StabilizeDetections(results, camera_id);
|
2026-03-28 16:54:11 +11:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return batchResults;
|
|
|
|
|
}
|
|
|
|
|
catch (const std::exception& e) {
|
2026-04-09 08:09:02 +10:00
|
|
|
const std::string msg = e.what();
|
|
|
|
|
if (msg.find("887A0005") != std::string::npos) {
|
|
|
|
|
if (!_dmlDeviceLost) {
|
|
|
|
|
_dmlDeviceLost = true;
|
|
|
|
|
_logger.LogFatal("ANSONNXYOLO::DetectObjectsBatch",
|
|
|
|
|
"DirectML GPU device lost (887A0005) — attempting CPU fallback",
|
|
|
|
|
__FILE__, __LINE__);
|
|
|
|
|
try {
|
2026-04-13 19:48:32 +10:00
|
|
|
std::lock_guard<std::recursive_mutex> lk(_mutex);
|
2026-04-09 08:09:02 +10:00
|
|
|
m_ortEngine.reset();
|
|
|
|
|
if (!InitOrtEngine(ANSCENTER::EngineType::CPU))
|
|
|
|
|
_logger.LogFatal("ANSONNXYOLO::DetectObjectsBatch",
|
|
|
|
|
"CPU fallback session creation failed", __FILE__, __LINE__);
|
|
|
|
|
} catch (...) {}
|
|
|
|
|
}
|
|
|
|
|
return {};
|
|
|
|
|
}
|
2026-03-28 16:54:11 +11:00
|
|
|
_logger.LogFatal("ANSONNXYOLO::DetectObjectsBatch",
|
|
|
|
|
e.what(), __FILE__, __LINE__);
|
|
|
|
|
return {};
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
} // namespace ANSCENTER
|