#include "ANSONNXYOLO.h" #include "Utility.h" #include "ANSGpuFrameRegistry.h" #include "NV12PreprocessHelper.h" // tl_currentGpuFrame() #include // std::iota #include namespace ANSCENTER { // ==================================================================== // ONNXYOLO — BasicOrtHandler subclass for Ultralytics YOLO // ==================================================================== ONNXYOLO::ONNXYOLO(const std::string& _onnx_path, unsigned int _num_threads) : BasicOrtHandler(_onnx_path, _num_threads) { if (input_node_dims.size() >= 4) { int h = static_cast(input_node_dims[2]); int w = static_cast(input_node_dims[3]); isDynamicInputShape = (h == -1 || w == -1); inputImageShape = isDynamicInputShape ? cv::Size(640, 640) : cv::Size(w, h); } else { inputImageShape = cv::Size(640, 640); } } ONNXYOLO::ONNXYOLO(const std::string& _onnx_path, EngineType engineType, unsigned int _num_threads) : BasicOrtHandler(_onnx_path, engineType, _num_threads) { if (input_node_dims.size() >= 4) { int h = static_cast(input_node_dims[2]); int w = static_cast(input_node_dims[3]); isDynamicInputShape = (h == -1 || w == -1); inputImageShape = isDynamicInputShape ? 
cv::Size(640, 640) : cv::Size(w, h); } else { inputImageShape = cv::Size(640, 640); } } // ------------------------------------------------------------------ // letterBox — Ultralytics-compatible LetterBox transform // ------------------------------------------------------------------ void ONNXYOLO::letterBox(const cv::Mat& image, cv::Mat& outImage, const cv::Size& newShape, const cv::Scalar& color, bool scaleUp, int stride) { float r = std::min(static_cast(newShape.height) / image.rows, static_cast(newShape.width) / image.cols); if (!scaleUp) r = std::min(r, 1.0f); int newUnpadW = static_cast(std::round(image.cols * r)); int newUnpadH = static_cast(std::round(image.rows * r)); float dw = static_cast(newShape.width - newUnpadW); float dh = static_cast(newShape.height - newUnpadH); dw /= 2.0f; dh /= 2.0f; if (image.cols != newUnpadW || image.rows != newUnpadH) { cv::resize(image, outImage, cv::Size(newUnpadW, newUnpadH), 0, 0, cv::INTER_LINEAR); } else { outImage = image.clone(); } // Ultralytics -0.1/+0.1 trick for deterministic padding split int top = static_cast(std::round(dh - 0.1f)); int bottom = static_cast(std::round(dh + 0.1f)); int left = static_cast(std::round(dw - 0.1f)); int right = static_cast(std::round(dw + 0.1f)); cv::copyMakeBorder(outImage, outImage, top, bottom, left, right, cv::BORDER_CONSTANT, color); } // ------------------------------------------------------------------ // transform — BGR → RGB, letterbox, /255, HWC→CHW // ------------------------------------------------------------------ Ort::Value ONNXYOLO::transform(const cv::Mat& mat) { // Grayscale → BGR if needed cv::Mat bgrMat; if (mat.channels() == 1) { cv::cvtColor(mat, bgrMat, cv::COLOR_GRAY2BGR); } else { bgrMat = mat; } // Check if model is classification (first output has 2 dims: [B, nc]) const bool isClassification = !output_node_dims.empty() && output_node_dims[0].size() == 2; cv::Mat canvas; if (isClassification) { // Classification: direct resize (no letterbox padding) — 
matches ANSONNXCL cv::resize(bgrMat, canvas, cv::Size(inputImageShape.width, inputImageShape.height), 0, 0, cv::INTER_LINEAR); } else { // Detection/Seg/Pose/OBB: Ultralytics letterbox letterBox(bgrMat, canvas, inputImageShape); } cv::cvtColor(canvas, canvas, cv::COLOR_BGR2RGB); canvas.convertTo(canvas, CV_32FC3, 1.0 / 255.0); const int channels = canvas.channels(); const int height = canvas.rows; const int width = canvas.cols; const size_t imageSize = static_cast(height) * width; input_node_dims = { 1, 3, height, width }; input_tensor_size = 1 * 3 * imageSize; input_values_handler.resize(input_tensor_size); std::vector channelMats(channels); for (int c = 0; c < channels; ++c) { channelMats[c] = cv::Mat(height, width, CV_32FC1, input_values_handler.data() + c * imageSize); } cv::split(canvas, channelMats); return Ort::Value::CreateTensor( *memory_info_handler, input_values_handler.data(), input_tensor_size, input_node_dims.data(), input_node_dims.size()); } Ort::Value ONNXYOLO::transformBatch(const std::vector& images) { if (images.empty()) throw std::runtime_error("ONNXYOLO::transformBatch: empty input"); const size_t N = images.size(); // Preprocess all images: letterbox → BGR→RGB → float → /255 // Check if model is classification (first output has 2 dims: [B, nc]) const bool isClassification = !output_node_dims.empty() && output_node_dims[0].size() == 2; std::vector batch; batch.reserve(N); for (const auto& img : images) { if (img.empty()) throw std::runtime_error("ONNXYOLO::transformBatch: empty image in batch"); // Grayscale → BGR if needed cv::Mat bgrImg; if (img.channels() == 1) { cv::cvtColor(img, bgrImg, cv::COLOR_GRAY2BGR); } else { bgrImg = img; } cv::Mat canvas; if (isClassification) { // Classification: direct resize (no letterbox) cv::resize(bgrImg, canvas, cv::Size(inputImageShape.width, inputImageShape.height), 0, 0, cv::INTER_LINEAR); } else { letterBox(bgrImg, canvas, inputImageShape); } cv::cvtColor(canvas, canvas, cv::COLOR_BGR2RGB); 
canvas.convertTo(canvas, CV_32FC3, 1.0 / 255.0); batch.push_back(canvas); } const int height = batch[0].rows; const int width = batch[0].cols; const size_t imageSize = static_cast(height) * width; input_node_dims = { static_cast(N), 3, static_cast(height), static_cast(width) }; input_tensor_size = N * 3 * imageSize; input_values_handler.resize(input_tensor_size); // Pack each image into CHW layout (same as transform() for single image) for (size_t b = 0; b < N; ++b) { const size_t batchOffset = b * 3 * imageSize; std::vector channelMats(3); for (int c = 0; c < 3; ++c) { channelMats[c] = cv::Mat(height, width, CV_32FC1, input_values_handler.data() + batchOffset + c * imageSize); } cv::split(batch[b], channelMats); } return Ort::Value::CreateTensor( *memory_info_handler, input_values_handler.data(), input_tensor_size, input_node_dims.data(), input_node_dims.size()); } // ------------------------------------------------------------------ // detect — full pipeline with auto task detection // // Decision logic: // 2 outputs (second 4D) → segmentation // 1 output, 2D → classification // 1 output, 3D end2end dim2=6 → detection // 1 output, 3D end2end dim2=7 → OBB // 1 output, 3D end2end dim2>7 → pose (if (dim2-6)%3==0) // 1 output, 3D legacy → detect/obb/pose by nc // ------------------------------------------------------------------ std::vector ONNXYOLO::detect(const cv::Mat& image, const std::vector& classNames, float confThreshold, float iouThreshold, int numKPS) { lastWasClassification = false; if (image.empty()) return {}; Ort::Value inputTensor = transform(image); auto outputTensors = ort_session->Run( Ort::RunOptions{ nullptr }, input_node_names.data(), &inputTensor, 1, output_node_names.data(), num_outputs); const cv::Size resizedShape( static_cast(input_node_dims[3]), static_cast(input_node_dims[2])); const size_t numOutputs = outputTensors.size(); // ── Segmentation: 2 outputs (detections + proto masks) ────────── if (numOutputs >= 2) { const auto protoShape = 
outputTensors[1].GetTensorTypeAndShapeInfo().GetShape(); if (protoShape.size() == 4) { const auto shape0 = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape(); // Legacy: shape [B, channels, num_boxes] e.g. [1, 116, 8400] -> shape[1] < shape[2] // End2end: shape [B, max_det, features] e.g. [1, 300, 38] -> shape[1] > shape[2] if (shape0.size() >= 3 && shape0[1] < shape0[2]) { return postprocessSegLegacy(image.size(), resizedShape, outputTensors, classNames, confThreshold, iouThreshold); } else { return postprocessSegEndToEnd(image.size(), resizedShape, outputTensors, classNames, confThreshold); } } } const auto shape0 = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape(); // ── Classification: 2D output [B, nc] ─────────────────────────── if (shape0.size() == 2) { lastWasClassification = true; return postprocessClassify(outputTensors, classNames, image.size()); } if (shape0.size() < 3) return {}; // ── Determine end2end vs legacy ───────────────────────────────── // End2end: shape [B, max_det, features] where max_det < features is false // typically [1, 300, 6/7/...] so shape[1] > shape[2] // Legacy: shape [B, channels, num_boxes] where channels < num_boxes // typically [1, 84, 8400] so shape[1] < shape[2] const bool isEndToEnd = (shape0[1] > shape0[2]) || (shape0[2] <= 20); // very small dim2 = end2end if (isEndToEnd) { const int features = static_cast(shape0[2]); if (features == 6) { return postprocessEndToEnd(image.size(), resizedShape, outputTensors, classNames, confThreshold); } else if (features == 7) { return postprocessOBBEndToEnd(image.size(), resizedShape, outputTensors, classNames, confThreshold); } else if (features > 7 && (features - 6) % 3 == 0) { int nk = (numKPS > 0) ? 
numKPS : (features - 6) / 3; return postprocessPoseEndToEnd(image.size(), resizedShape, outputTensors, classNames, confThreshold, nk); } // Fallback to detection return postprocessEndToEnd(image.size(), resizedShape, outputTensors, classNames, confThreshold); } else { // Legacy format: [B, channels, num_boxes] // channels = 4(bbox) + nc(scores) + extra_features const int nc = static_cast(classNames.size()); const int numChannels = static_cast(shape0[1]); const int numBoxes = static_cast(shape0[2]); const int extra = numChannels - 4; // Pose check: if numKPS is explicitly set, or we can detect keypoints if (numKPS > 0 && numChannels >= 4 + 1 + numKPS * 3) { return postprocessPoseLegacy(image.size(), resizedShape, outputTensors, classNames, confThreshold, iouThreshold, numKPS); } else if (nc > 0 && nc <= extra && extra > nc && (extra - nc) % 3 == 0 && (extra - nc) >= 3) { int nk = (extra - nc) / 3; return postprocessPoseLegacy(image.size(), resizedShape, outputTensors, classNames, confThreshold, iouThreshold, nk); } else if (nc > 0 && nc <= extra && extra == nc + 1) { return postprocessOBBLegacy(image.size(), resizedShape, outputTensors, classNames, confThreshold, iouThreshold); } else if (nc > 0 && nc <= extra && extra == nc) { return postprocessLegacy(image.size(), resizedShape, outputTensors, classNames, confThreshold, iouThreshold); } else { // Class count doesn't match tensor — probe last channel // to distinguish OBB (angle values in [-pi, pi]) from detection bool likelyOBB = false; if (extra >= 2) { const float* rawOutput = outputTensors[0].GetTensorMutableData(); int numSamples = std::min(numBoxes, 100); int angleCount = 0; for (int s = 0; s < numSamples; ++s) { float v = rawOutput[(numChannels - 1) * numBoxes + s]; if (v >= -3.15f && v <= 3.15f) ++angleCount; } likelyOBB = (angleCount > numSamples * 8 / 10); } if (likelyOBB) { return postprocessOBBLegacy(image.size(), resizedShape, outputTensors, classNames, confThreshold, iouThreshold); } else if 
(numChannels == 56) { return postprocessPoseLegacy(image.size(), resizedShape, outputTensors, classNames, confThreshold, iouThreshold, 17); } else { return postprocessLegacy(image.size(), resizedShape, outputTensors, classNames, confThreshold, iouThreshold); } } } } // ==================================================================== // DETECTION — postprocess // ==================================================================== std::vector ONNXYOLO::postprocessEndToEnd( const cv::Size& originalImageSize, const cv::Size& resizedImageShape, std::vector& outputTensors, const std::vector& classNames, float confThreshold) { if (outputTensors.empty()) return {}; const float* rawOutput = outputTensors[0].GetTensorMutableData(); const auto outputShape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape(); if (outputShape.size() < 3) return {}; const int numDets = static_cast(outputShape[1]); const int numFeat = static_cast(outputShape[2]); const float origW = static_cast(originalImageSize.width); const float origH = static_cast(originalImageSize.height); const float modelW = static_cast(resizedImageShape.width); const float modelH = static_cast(resizedImageShape.height); const float gain = std::min(modelH / origH, modelW / origW); const float padX = std::round((modelW - origW * gain) / 2.0f - 0.1f); const float padY = std::round((modelH - origH * gain) / 2.0f - 0.1f); const float invGain = 1.0f / gain; std::vector results; results.reserve(numDets); for (int i = 0; i < numDets; ++i) { const float* det = rawOutput + i * numFeat; const float conf = det[4]; if (conf <= confThreshold) continue; float x1 = (det[0] - padX) * invGain; float y1 = (det[1] - padY) * invGain; float x2 = (det[2] - padX) * invGain; float y2 = (det[3] - padY) * invGain; int classId = static_cast(det[5]); x1 = clamp(x1, 0.f, origW); y1 = clamp(y1, 0.f, origH); x2 = clamp(x2, 0.f, origW); y2 = clamp(y2, 0.f, origH); float w = x2 - x1, h = y2 - y1; if (w < 1.f || h < 1.f) continue; Object obj; 
obj.classId = classId; obj.confidence = conf; obj.box = cv::Rect(static_cast(x1), static_cast(y1), static_cast(w), static_cast(h)); if (classId >= 0 && classId < static_cast(classNames.size())) obj.className = classNames[classId]; results.push_back(std::move(obj)); } return results; } std::vector ONNXYOLO::postprocessLegacy( const cv::Size& originalImageSize, const cv::Size& resizedImageShape, std::vector& outputTensors, const std::vector& classNames, float confThreshold, float iouThreshold, int maxDet) { if (outputTensors.empty()) return {}; const float* rawOutput = outputTensors[0].GetTensorMutableData(); const auto outputShape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape(); if (outputShape.size() < 3) return {}; const int numChannels = static_cast(outputShape[1]); const int numBoxes = static_cast(outputShape[2]); const int numClasses = numChannels - 4; if (numClasses <= 0) return {}; cv::Mat output = cv::Mat(numChannels, numBoxes, CV_32F, const_cast(rawOutput)).t(); const float origW = static_cast(originalImageSize.width); const float origH = static_cast(originalImageSize.height); const float modelW = static_cast(resizedImageShape.width); const float modelH = static_cast(resizedImageShape.height); const float gain = std::min(modelH / origH, modelW / origW); const float padX = std::round((modelW - origW * gain) / 2.0f - 0.1f); const float padY = std::round((modelH - origH * gain) / 2.0f - 0.1f); const float invGain = 1.0f / gain; struct Candidate { float x1, y1, x2, y2, conf; int classId; }; std::vector candidates; candidates.reserve(numBoxes); for (int i = 0; i < numBoxes; ++i) { const float* row = output.ptr(i); const float* scoresPtr = row + 4; float maxScore = -FLT_MAX; int bestClass = -1; for (int c = 0; c < numClasses; ++c) { if (scoresPtr[c] > maxScore) { maxScore = scoresPtr[c]; bestClass = c; } } if (maxScore <= confThreshold) continue; float cx = row[0], cy = row[1], w = row[2], h = row[3]; candidates.push_back({ cx - w*0.5f, cy - h*0.5f, cx 
+ w*0.5f, cy + h*0.5f, maxScore, bestClass }); } if (candidates.empty()) return {}; // Class-aware NMS constexpr float MAX_WH = 7680.0f; std::vector sortedIdx(candidates.size()); std::iota(sortedIdx.begin(), sortedIdx.end(), 0); std::sort(sortedIdx.begin(), sortedIdx.end(), [&](int a, int b) { return candidates[a].conf > candidates[b].conf; }); if (static_cast(sortedIdx.size()) > 30000) sortedIdx.resize(30000); std::vector suppressed(sortedIdx.size(), false); std::vector keepIndices; keepIndices.reserve(maxDet); for (size_t i = 0; i < sortedIdx.size() && static_cast(keepIndices.size()) < maxDet; ++i) { if (suppressed[i]) continue; keepIndices.push_back(sortedIdx[i]); const auto& cur = candidates[sortedIdx[i]]; float cx1 = cur.x1 + cur.classId*MAX_WH, cy1 = cur.y1 + cur.classId*MAX_WH; float cx2 = cur.x2 + cur.classId*MAX_WH, cy2 = cur.y2 + cur.classId*MAX_WH; float curArea = (cx2-cx1)*(cy2-cy1); for (size_t j = i+1; j < sortedIdx.size(); ++j) { if (suppressed[j]) continue; const auto& o = candidates[sortedIdx[j]]; float ox1 = o.x1+o.classId*MAX_WH, oy1 = o.y1+o.classId*MAX_WH; float ox2 = o.x2+o.classId*MAX_WH, oy2 = o.y2+o.classId*MAX_WH; float iw = std::min(cx2,ox2)-std::max(cx1,ox1); float ih = std::min(cy2,oy2)-std::max(cy1,oy1); if (iw <= 0.f || ih <= 0.f) continue; float inter = iw*ih; float ua = curArea + (ox2-ox1)*(oy2-oy1) - inter; if (ua > 0.f && inter/ua > iouThreshold) suppressed[j] = true; } } std::vector results; results.reserve(keepIndices.size()); for (int idx : keepIndices) { const auto& c = candidates[idx]; float x1 = clamp((c.x1-padX)*invGain, 0.f, origW); float y1 = clamp((c.y1-padY)*invGain, 0.f, origH); float x2 = clamp((c.x2-padX)*invGain, 0.f, origW); float y2 = clamp((c.y2-padY)*invGain, 0.f, origH); float w = x2-x1, h = y2-y1; if (w < 1.f || h < 1.f) continue; Object obj; obj.classId = c.classId; obj.confidence = c.conf; obj.box = cv::Rect(static_cast(x1), static_cast(y1), static_cast(w), static_cast(h)); if (c.classId >= 0 && c.classId < 
static_cast(classNames.size())) obj.className = classNames[c.classId]; results.push_back(std::move(obj)); } return results; } // ==================================================================== // OBB — helpers (Prob-IoU based NMS) // ==================================================================== void ONNXYOLO::getCovarianceComponents(const OrientedBox& box, float& out1, float& out2, float& out3) { if (box.width <= 0.f || box.height <= 0.f) { out1 = out2 = out3 = 0.f; return; } const float vw = (box.width * box.width) / 12.0f; const float vh = (box.height * box.height) / 12.0f; const float cosT = std::cos(box.angle); const float sinT = std::sin(box.angle); const float cos2 = cosT * cosT; const float sin2 = sinT * sinT; const float sc = sinT * cosT; out1 = vw * cos2 + vh * sin2; out2 = vw * sin2 + vh * cos2; out3 = (vw - vh) * sc; } std::vector> ONNXYOLO::batchProbiou( const std::vector& obb1, const std::vector& obb2, float eps) { if (obb1.empty() || obb2.empty()) return {}; const size_t n1 = obb1.size(), n2 = obb2.size(); std::vector> iouMat(n1, std::vector(n2, 0.f)); // Pre-compute covariance for obb1 struct CovData { float x, y, a, b, c; }; std::vector cov1(n1); for (size_t i = 0; i < n1; ++i) { float a, b, c; getCovarianceComponents(obb1[i], a, b, c); cov1[i] = { obb1[i].x, obb1[i].y, a, b, c }; } for (size_t i = 0; i < n1; ++i) { for (size_t j = 0; j < n2; ++j) { float a2, b2, c2; getCovarianceComponents(obb2[j], a2, b2, c2); float dx = cov1[i].x - obb2[j].x; float dy = cov1[i].y - obb2[j].y; float sA = cov1[i].a + a2, sB = cov1[i].b + b2, sC = cov1[i].c + c2; float denom = sA * sB - sC * sC + eps; if (denom <= eps) continue; float t1 = ((sA*dy*dy + sB*dx*dx) * 0.25f) / denom; float t2 = ((sC*dx*dy) * -0.5f) / denom; float d1 = cov1[i].a*cov1[i].b - cov1[i].c*cov1[i].c; float d2 = a2*b2 - c2*c2; float sqrtDet = std::sqrt(std::max(d1, 0.f) * std::max(d2, 0.f) + eps); float t3 = 0.5f * std::log((sA*sB - sC*sC) / (4.f*sqrtDet) + eps); float bd = 
std::clamp(t1 + t2 + t3, eps, 100.f); float hd = std::sqrt(1.f - std::exp(-bd) + eps); iouMat[i][j] = 1.f - hd; } } return iouMat; } std::vector ONNXYOLO::nmsRotatedImpl( const std::vector& sortedBoxes, float iouThreshold) { if (sortedBoxes.empty()) return {}; if (sortedBoxes.size() == 1) return { 0 }; auto iouMat = batchProbiou(sortedBoxes, sortedBoxes); if (iouMat.empty()) return {}; const int n = static_cast(sortedBoxes.size()); std::vector keep; keep.reserve(n / 2); for (int j = 0; j < n; ++j) { bool shouldKeep = true; for (int i = 0; i < j; ++i) { if (iouMat[i][j] >= iouThreshold) { shouldKeep = false; break; } } if (shouldKeep) keep.push_back(j); } return keep; } std::vector ONNXYOLO::nmsRotated( const std::vector& boxes, const std::vector& scores, float iouThreshold) { if (boxes.empty() || scores.empty() || boxes.size() != scores.size()) return {}; std::vector sortedIdx(boxes.size()); std::iota(sortedIdx.begin(), sortedIdx.end(), 0); std::sort(sortedIdx.begin(), sortedIdx.end(), [&](int a, int b) { return scores[a] > scores[b]; }); std::vector sortedBoxes; sortedBoxes.reserve(boxes.size()); for (int i : sortedIdx) sortedBoxes.push_back(boxes[i]); auto keepSorted = nmsRotatedImpl(sortedBoxes, iouThreshold); std::vector keepOrig; keepOrig.reserve(keepSorted.size()); for (int si : keepSorted) keepOrig.push_back(sortedIdx[si]); return keepOrig; } std::vector ONNXYOLO::OBBToPoints(const OrientedBox& obb) { float angleDeg = obb.angle * 180.0f / static_cast(CV_PI); cv::RotatedRect rr(cv::Point2f(obb.x, obb.y), cv::Size2f(obb.width, obb.height), angleDeg); std::vector corners(4); rr.points(corners.data()); return corners; } // ==================================================================== // OBB — postprocess // ==================================================================== std::vector ONNXYOLO::postprocessOBBEndToEnd( const cv::Size& originalImageSize, const cv::Size& resizedImageShape, std::vector& outputTensors, const std::vector& classNames, float 
confThreshold) { if (outputTensors.empty()) return {}; const float* raw = outputTensors[0].GetTensorMutableData(); const auto shape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape(); if (shape.size() < 3) return {}; const int numDets = static_cast(shape[1]); const int numFeat = static_cast(shape[2]); // 7: cx,cy,w,h,angle,conf,classId const float origW = static_cast(originalImageSize.width); const float origH = static_cast(originalImageSize.height); const float modelW = static_cast(resizedImageShape.width); const float modelH = static_cast(resizedImageShape.height); const float gain = std::min(modelH / origH, modelW / origW); const float padX = std::round((modelW - origW * gain) / 2.0f - 0.1f); const float padY = std::round((modelH - origH * gain) / 2.0f - 0.1f); const float invGain = 1.0f / gain; std::vector results; results.reserve(numDets); for (int i = 0; i < numDets; ++i) { const float* det = raw + i * numFeat; float angle = det[4]; float conf = det[5]; if (conf <= confThreshold) continue; float cx = (det[0] - padX) * invGain; float cy = (det[1] - padY) * invGain; float bw = det[2] * invGain; float bh = det[3] * invGain; int classId = static_cast(det[6]); cx = clamp(cx, 0.f, origW); cy = clamp(cy, 0.f, origH); OrientedBox obb{ cx, cy, bw, bh, angle }; Object obj; obj.classId = classId; obj.confidence = conf; obj.kps = { cx, cy, bw, bh, angle }; auto absCorners = OBBToPoints(obb); obj.box = cv::boundingRect(absCorners); // Normalize OBB corners to [0,1] and close the polygon obj.polygon.reserve(absCorners.size() + 1); for (const auto& pt : absCorners) { obj.polygon.emplace_back( std::clamp(pt.x / origW, 0.f, 1.f), std::clamp(pt.y / origH, 0.f, 1.f)); } if (!obj.polygon.empty()) obj.polygon.push_back(obj.polygon.front()); if (classId >= 0 && classId < static_cast(classNames.size())) obj.className = classNames[classId]; results.push_back(std::move(obj)); } return results; } std::vector ONNXYOLO::postprocessOBBLegacy( const cv::Size& originalImageSize, 
const cv::Size& resizedImageShape, std::vector& outputTensors, const std::vector& classNames, float confThreshold, float iouThreshold, int maxDet) { if (outputTensors.empty()) return {}; const float* rawOutput = outputTensors[0].GetTensorMutableData(); const auto outputShape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape(); if (outputShape.size() < 3) return {}; const int numChannels = static_cast(outputShape[1]); const int numBoxes = static_cast(outputShape[2]); const int numClasses = numChannels - 5; // 4 box + nc scores + 1 angle if (numClasses <= 0) return {}; cv::Mat output = cv::Mat(numChannels, numBoxes, CV_32F, const_cast(rawOutput)).t(); const float origW = static_cast(originalImageSize.width); const float origH = static_cast(originalImageSize.height); const float modelW = static_cast(resizedImageShape.width); const float modelH = static_cast(resizedImageShape.height); const float gain = std::min(modelH / origH, modelW / origW); const float padX = std::round((modelW - origW * gain) / 2.0f - 0.1f); const float padY = std::round((modelH - origH * gain) / 2.0f - 0.1f); const float invGain = 1.0f / gain; struct OBBCandidate { OrientedBox box; float conf; int classId; }; std::vector candidates; candidates.reserve(numBoxes); for (int i = 0; i < numBoxes; ++i) { const float* row = output.ptr(i); const float* scoresPtr = row + 4; float maxScore = -FLT_MAX; int bestClass = -1; for (int c = 0; c < numClasses; ++c) { if (scoresPtr[c] > maxScore) { maxScore = scoresPtr[c]; bestClass = c; } } if (maxScore <= confThreshold) continue; float angle = row[4 + numClasses]; // angle after class scores float cx = (row[0] - padX) * invGain; float cy = (row[1] - padY) * invGain; float bw = row[2] * invGain; float bh = row[3] * invGain; cx = clamp(cx, 0.f, origW); cy = clamp(cy, 0.f, origH); candidates.push_back({ { cx, cy, bw, bh, angle }, maxScore, bestClass }); } if (candidates.empty()) return {}; // Prob-IoU NMS for oriented boxes std::vector boxes; std::vector 
scores; boxes.reserve(candidates.size()); scores.reserve(candidates.size()); for (const auto& c : candidates) { boxes.push_back(c.box); scores.push_back(c.conf); } auto keepIdx = nmsRotated(boxes, scores, iouThreshold); std::vector results; results.reserve(std::min(static_cast(keepIdx.size()), maxDet)); for (int idx : keepIdx) { if (static_cast(results.size()) >= maxDet) break; const auto& c = candidates[idx]; Object obj; obj.classId = c.classId; obj.confidence = c.conf; obj.kps = { c.box.x, c.box.y, c.box.width, c.box.height, c.box.angle }; auto absCorners = OBBToPoints(c.box); obj.box = cv::boundingRect(absCorners); // Normalize OBB corners to [0,1] and close the polygon const float origW = static_cast(originalImageSize.width); const float origH = static_cast(originalImageSize.height); obj.polygon.reserve(absCorners.size() + 1); for (const auto& pt : absCorners) { obj.polygon.emplace_back( std::clamp(pt.x / origW, 0.f, 1.f), std::clamp(pt.y / origH, 0.f, 1.f)); } if (!obj.polygon.empty()) obj.polygon.push_back(obj.polygon.front()); if (c.classId >= 0 && c.classId < static_cast(classNames.size())) obj.className = classNames[c.classId]; results.push_back(std::move(obj)); } return results; } // ==================================================================== // SEGMENTATION — postprocess // ==================================================================== std::vector ONNXYOLO::postprocessSegEndToEnd( const cv::Size& originalImageSize, const cv::Size& resizedImageShape, std::vector& outputTensors, const std::vector& classNames, float confThreshold) { if (outputTensors.size() < 2) return {}; const float* raw = outputTensors[0].GetTensorMutableData(); const auto shape0 = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape(); const auto protoShape = outputTensors[1].GetTensorTypeAndShapeInfo().GetShape(); if (shape0.size() < 3 || protoShape.size() < 4) return {}; const int numDets = static_cast(shape0[1]); const int numFeat = static_cast(shape0[2]); // 6 + nm 
const int nm = static_cast(protoShape[1]);      // number of mask prototypes
    const int protoH = static_cast(protoShape[2]);
    const int protoW = static_cast(protoShape[3]);
    if (numFeat < 6 + nm) return {}; // row too short to carry box+conf+cls+coeffs
    const float origW = static_cast(originalImageSize.width);
    const float origH = static_cast(originalImageSize.height);
    const float modelW = static_cast(resizedImageShape.width);
    const float modelH = static_cast(resizedImageShape.height);
    // Invert the letterbox transform (same convention as the other postprocessors).
    const float gain = std::min(modelH / origH, modelW / origW);
    const float padX = std::round((modelW - origW * gain) / 2.0f - 0.1f);
    const float padY = std::round((modelH - origH * gain) / 2.0f - 0.1f);
    const float invGain = 1.0f / gain;
    // Collect detections and mask coefficients
    std::vector objs;
    cv::Mat maskCoeffs; // [N, nm]
    for (int i = 0; i < numDets; ++i) {
        const float* det = raw + i * numFeat;
        float conf = det[4];
        if (conf <= confThreshold) continue;
        int classId = static_cast(det[5]);
        // det[0..3] are already x1,y1,x2,y2 in model space — un-letterbox and clamp.
        float x1 = clamp((det[0] - padX) * invGain, 0.f, origW);
        float y1 = clamp((det[1] - padY) * invGain, 0.f, origH);
        float x2 = clamp((det[2] - padX) * invGain, 0.f, origW);
        float y2 = clamp((det[3] - padY) * invGain, 0.f, origH);
        float w = x2-x1, h = y2-y1;
        if (w < 1.f || h < 1.f) continue; // discard degenerate boxes
        Object obj;
        obj.classId = classId;
        obj.confidence = conf;
        obj.box = cv::Rect(static_cast(x1), static_cast(y1), static_cast(w), static_cast(h));
        if (classId >= 0 && classId < static_cast(classNames.size()))
            obj.className = classNames[classId];
        objs.push_back(std::move(obj));
        // Extract mask coefficients (after the 6 detection values)
        cv::Mat mc(1, nm, CV_32F);
        std::memcpy(mc.ptr(), det + 6, nm * sizeof(float));
        maskCoeffs.push_back(mc);
    }
    // Generate masks: coeffs @ protos → sigmoid → crop-in-proto → resize-to-box → threshold
    if (!objs.empty() && !maskCoeffs.empty()) {
        const float* protoData = outputTensors[1].GetTensorMutableData();
        // Wrap prototypes as [nm, protoH*protoW] without copying.
        cv::Mat protos(nm, protoH * protoW, CV_32F, const_cast(protoData));
        cv::Mat matmulRes = (maskCoeffs * protos).t();
        // Apply sigmoid while still a single-channel 2D matrix
        cv::Mat negMat;
        cv::exp(-matmulRes, negMat);
        cv::Mat sigmoidFlat = 1.0 / (1.0 + negMat);
        // Reshape to one channel per detection, each protoH x protoW.
        cv::Mat sigmoidMat = sigmoidFlat.reshape(static_cast(objs.size()), { protoH, protoW });
        std::vector maskChannels;
        cv::split(sigmoidMat, maskChannels);
        // ROI in proto space, accounting for letterbox padding
        cv::Rect roi;
        if (origH > origW) {
            int roiW = std::min(static_cast(std::round(
                static_cast(protoW) * origW / origH)), protoW);
            roi = cv::Rect((protoW - roiW) / 2, 0, roiW, protoH);
        } else {
            int roiH = std::min(static_cast(std::round(
                static_cast(protoH) * origH / origW)), protoH);
            roi = cv::Rect(0, (protoH - roiH) / 2, protoW, roiH);
        }
        roi &= cv::Rect(0, 0, protoW, protoH);
        int imgW = static_cast(origW);
        int imgH = static_cast(origH);
        // Scale factors from original-image pixels to proto-ROI pixels.
        const float scaleX = static_cast(imgW) / roi.width;
        const float scaleY = static_cast(imgH) / roi.height;
        for (size_t i = 0; i < objs.size(); ++i) {
            cv::Rect safebox = objs[i].box & cv::Rect(0, 0, imgW, imgH);
            if (safebox.area() <= 0) continue;
            // Project the detection box into proto space (floor/ceil to keep full coverage).
            int px0 = std::max(static_cast(std::floor(safebox.x / scaleX)), 0);
            int py0 = std::max(static_cast(std::floor(safebox.y / scaleY)), 0);
            int px1 = std::min(static_cast(std::ceil((safebox.x + safebox.width) / scaleX)), roi.width);
            int py1 = std::min(static_cast(std::ceil((safebox.y + safebox.height) / scaleY)), roi.height);
            if (px1 <= px0 || py1 <= py0) continue;
            cv::Rect protoBox(roi.x + px0, roi.y + py0, px1 - px0, py1 - py0);
            protoBox &= cv::Rect(0, 0, protoW, protoH);
            if (protoBox.area() <= 0) continue;
            cv::Mat cropped = maskChannels[i](protoBox);
            cv::Mat resized;
            cv::resize(cropped, resized, cv::Size(safebox.width, safebox.height), 0, 0, cv::INTER_LINEAR);
            objs[i].mask = resized > 0.5f; // binarize sigmoid probabilities
            objs[i].polygon = ANSUtilityHelper::MaskToNormalizedPolygon(
                objs[i].mask, safebox, origW, origH);
        }
    }
    // Fallback: detections whose mask produced no polygon get their rect as polygon.
    for (auto& obj : objs) {
        if (obj.polygon.empty())
            obj.polygon = ANSUtilityHelper::RectToNormalizedPolygon(obj.box, origW, origH);
    }
    return objs;
}

// Legacy (non-NMS-fused) segmentation head: output[0] is [1, 4+nc+nm, numBoxes].
std::vector ONNXYOLO::postprocessSegLegacy(
    const cv::Size&
originalImageSize, const cv::Size& resizedImageShape, std::vector& outputTensors,
    const std::vector& classNames, float confThreshold, float iouThreshold, int maxDet) {
    if (outputTensors.size() < 2) return {};
    const float* rawOutput = outputTensors[0].GetTensorMutableData();
    const auto shape0 = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape();
    const auto protoShape = outputTensors[1].GetTensorTypeAndShapeInfo().GetShape();
    if (shape0.size() < 3 || protoShape.size() < 4) return {};
    const int numChannels = static_cast(shape0[1]);
    const int numBoxes = static_cast(shape0[2]);
    const int nm = static_cast(protoShape[1]);      // number of mask prototypes
    const int protoH = static_cast(protoShape[2]);
    const int protoW = static_cast(protoShape[3]);
    const int numClasses = numChannels - 4 - nm;    // channels = 4 box + nc + nm
    if (numClasses <= 0) return {};
    // Transpose so each row is one candidate: [cx,cy,w,h, scores(nc), coeffs(nm)].
    cv::Mat output = cv::Mat(numChannels, numBoxes, CV_32F, const_cast(rawOutput)).t();
    const float origW = static_cast(originalImageSize.width);
    const float origH = static_cast(originalImageSize.height);
    const float modelW = static_cast(resizedImageShape.width);
    const float modelH = static_cast(resizedImageShape.height);
    // Invert the letterbox transform.
    const float gain = std::min(modelH / origH, modelW / origW);
    const float padX = std::round((modelW - origW * gain) / 2.0f - 0.1f);
    const float padY = std::round((modelH - origH * gain) / 2.0f - 0.1f);
    const float invGain = 1.0f / gain;
    std::vector bboxes;
    std::vector scores;
    std::vector labels;
    std::vector maskCoeffs;
    for (int i = 0; i < numBoxes; ++i) {
        const float* row = output.ptr(i);
        const float* scoresPtr = row + 4;
        // Arg-max over class scores.
        float maxScore = -FLT_MAX;
        int bestClass = -1;
        for (int c = 0; c < numClasses; ++c) {
            if (scoresPtr[c] > maxScore) { maxScore = scoresPtr[c]; bestClass = c; }
        }
        if (maxScore <= confThreshold) continue;
        // Center-format box -> corners, un-letterboxed and clamped to the image.
        float cx = row[0], cy = row[1], w = row[2], h = row[3];
        float x0 = clamp((cx - w*0.5f - padX) * invGain, 0.f, origW);
        float y0 = clamp((cy - h*0.5f - padY) * invGain, 0.f, origH);
        float x1 = clamp((cx + w*0.5f - padX) * invGain, 0.f, origW);
        float y1 = clamp((cy + h*0.5f - padY) * invGain, 0.f, origH);
        bboxes.push_back(cv::Rect(static_cast(x0), static_cast(y0),
            static_cast(x1-x0), static_cast(y1-y0)));
        scores.push_back(maxScore);
        labels.push_back(bestClass);
        cv::Mat mc(1, nm, CV_32F);
        std::memcpy(mc.ptr(), row + 4 + numClasses, nm * sizeof(float));
        maskCoeffs.push_back(mc);
    }
    // NMS
    std::vector indices;
    cv::dnn::NMSBoxesBatched(bboxes, scores, labels, confThreshold, iouThreshold, indices);
    // Collect surviving detections and their mask coefficients
    std::vector objs;
    cv::Mat masks;
    for (int idx : indices) {
        if (static_cast(objs.size()) >= maxDet) break;
        Object obj;
        obj.classId = labels[idx];
        obj.confidence = scores[idx];
        obj.box = bboxes[idx];
        if (obj.classId >= 0 && obj.classId < static_cast(classNames.size()))
            obj.className = classNames[obj.classId];
        objs.push_back(std::move(obj));
        masks.push_back(maskCoeffs[idx]);
    }
    // Generate masks (same pipeline as postprocessSegEndToEnd):
    // coeffs @ protos → sigmoid → crop-in-proto → resize-to-box → threshold
    if (!objs.empty() && !masks.empty()) {
        const float* protoData = outputTensors[1].GetTensorMutableData();
        cv::Mat protos(nm, protoH * protoW, CV_32F, const_cast(protoData));
        cv::Mat matmulRes = (masks * protos).t();
        // Apply sigmoid while still a single-channel 2D matrix
        cv::Mat negMat;
        cv::exp(-matmulRes, negMat);
        cv::Mat sigmoidFlat = 1.0 / (1.0 + negMat);
        cv::Mat sigmoidMat = sigmoidFlat.reshape(static_cast(objs.size()), { protoH, protoW });
        std::vector maskChannels;
        cv::split(sigmoidMat, maskChannels);
        // ROI in proto space, accounting for letterbox padding
        cv::Rect roi;
        if (origH > origW) {
            int roiW = std::min(static_cast(std::round(
                static_cast(protoW) * origW / origH)), protoW);
            roi = cv::Rect((protoW - roiW) / 2, 0, roiW, protoH);
        } else {
            int roiH = std::min(static_cast(std::round(
                static_cast(protoH) * origH / origW)), protoH);
            roi = cv::Rect(0, (protoH - roiH) / 2, protoW, roiH);
        }
        roi &= cv::Rect(0, 0, protoW, protoH);
        int imgW = static_cast(origW);
        int imgH = static_cast(origH);
        const float scaleX = static_cast(imgW) / roi.width;
        const float scaleY = static_cast(imgH) / roi.height;
        for (size_t i = 0; i < objs.size(); ++i) {
            cv::Rect safebox = objs[i].box & cv::Rect(0, 0, imgW, imgH);
            if (safebox.area() <= 0) continue;
            int px0 = std::max(static_cast(std::floor(safebox.x / scaleX)), 0);
            int py0 = std::max(static_cast(std::floor(safebox.y / scaleY)), 0);
            int px1 = std::min(static_cast(std::ceil((safebox.x + safebox.width) / scaleX)), roi.width);
            int py1 = std::min(static_cast(std::ceil((safebox.y + safebox.height) / scaleY)), roi.height);
            if (px1 <= px0 || py1 <= py0) continue;
            cv::Rect protoBox(roi.x + px0, roi.y + py0, px1 - px0, py1 - py0);
            protoBox &= cv::Rect(0, 0, protoW, protoH);
            if (protoBox.area() <= 0) continue;
            cv::Mat cropped = maskChannels[i](protoBox);
            cv::Mat resized;
            cv::resize(cropped, resized, cv::Size(safebox.width, safebox.height), 0, 0, cv::INTER_LINEAR);
            objs[i].mask = resized > 0.5f;
            objs[i].polygon = ANSUtilityHelper::MaskToNormalizedPolygon(
                objs[i].mask, safebox, origW, origH);
        }
    }
    // Fallback: rect polygon for detections without a mask polygon.
    for (auto& obj : objs) {
        if (obj.polygon.empty())
            obj.polygon = ANSUtilityHelper::RectToNormalizedPolygon(obj.box, origW, origH);
    }
    return objs;
}

// ====================================================================
// POSE — postprocess
// ====================================================================
// End-to-end pose head: rows are [x1,y1,x2,y2,conf,cls, kp0x,kp0y,kp0s, ...].
std::vector ONNXYOLO::postprocessPoseEndToEnd(
    const cv::Size& originalImageSize, const cv::Size& resizedImageShape,
    std::vector& outputTensors, const std::vector& classNames,
    float confThreshold, int numKPS) {
    if (outputTensors.empty()) return {};
    const float* raw = outputTensors[0].GetTensorMutableData();
    const auto shape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape();
    if (shape.size() < 3) return {};
    const int numDets = static_cast(shape[1]);
    const int numFeat = static_cast(shape[2]); // 6 + nk*3
    const float origW = static_cast(originalImageSize.width);
    const float origH = static_cast(originalImageSize.height);
    const float modelW = static_cast(resizedImageShape.width);
    const float modelH =
static_cast(resizedImageShape.height);
    // Invert the letterbox transform.
    const float gain = std::min(modelH / origH, modelW / origW);
    const float padX = std::round((modelW - origW * gain) / 2.0f - 0.1f);
    const float padY = std::round((modelH - origH * gain) / 2.0f - 0.1f);
    const float invGain = 1.0f / gain;
    std::vector results;
    results.reserve(numDets);
    for (int i = 0; i < numDets; ++i) {
        const float* det = raw + i * numFeat;
        float conf = det[4];
        if (conf <= confThreshold) continue;
        int classId = static_cast(det[5]);
        float x1 = clamp((det[0] - padX) * invGain, 0.f, origW);
        float y1 = clamp((det[1] - padY) * invGain, 0.f, origH);
        float x2 = clamp((det[2] - padX) * invGain, 0.f, origW);
        float y2 = clamp((det[3] - padY) * invGain, 0.f, origH);
        float w = x2-x1, h = y2-y1;
        if (w < 1.f || h < 1.f) continue; // discard degenerate boxes
        // Extract keypoints (after the 6 detection values)
        // Stored flat as (x, y, score) triplets in original-image coordinates.
        std::vector kps;
        kps.reserve(numKPS * 3);
        const float* kpsPtr = det + 6;
        for (int k = 0; k < numKPS; ++k) {
            float kx = clamp((kpsPtr[3*k] - padX) * invGain, 0.f, origW);
            float ky = clamp((kpsPtr[3*k+1] - padY) * invGain, 0.f, origH);
            float ks = kpsPtr[3*k+2]; // keypoint visibility/confidence, not rescaled
            kps.push_back(kx); kps.push_back(ky); kps.push_back(ks);
        }
        Object obj;
        obj.classId = classId;
        obj.confidence = conf;
        obj.box = cv::Rect(static_cast(x1), static_cast(y1), static_cast(w), static_cast(h));
        obj.kps = std::move(kps);
        if (classId >= 0 && classId < static_cast(classNames.size()))
            obj.className = classNames[classId];
        results.push_back(std::move(obj));
    }
    return results;
}

// Legacy pose head: [1, 4+nc+nk*3, numBoxes]; needs arg-max + NMS here.
std::vector ONNXYOLO::postprocessPoseLegacy(
    const cv::Size& originalImageSize, const cv::Size& resizedImageShape,
    std::vector& outputTensors, const std::vector& classNames,
    float confThreshold, float iouThreshold, int numKPS, int maxDet) {
    if (outputTensors.empty()) return {};
    const float* rawOutput = outputTensors[0].GetTensorMutableData();
    const auto outputShape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape();
    if (outputShape.size() < 3) return {};
    const int numChannels = static_cast(outputShape[1]);
    const int numBoxes = static_cast(outputShape[2]);
    // Pose layout: [cx,cy,w,h, scores(nc), kp0_x,kp0_y,kp0_s, ..., kpN_x,kpN_y,kpN_s]
    // Derive actual nc from tensor shape: nc = numChannels - 4 - numKPS*3
    // This avoids mismatch when classNames has more entries than the model's actual classes
    const int nc = std::max(numChannels - 4 - numKPS * 3, 1);
    const int kpsOffset = 4 + nc;
    // Safety: verify we won't read past the row
    if (kpsOffset + numKPS * 3 > numChannels) return {};
    // Transpose so each row is one candidate.
    cv::Mat output = cv::Mat(numChannels, numBoxes, CV_32F, const_cast(rawOutput)).t();
    const float origW = static_cast(originalImageSize.width);
    const float origH = static_cast(originalImageSize.height);
    const float modelW = static_cast(resizedImageShape.width);
    const float modelH = static_cast(resizedImageShape.height);
    // Invert the letterbox transform.
    const float gain = std::min(modelH / origH, modelW / origW);
    const float padX = std::round((modelW - origW * gain) / 2.0f - 0.1f);
    const float padY = std::round((modelH - origH * gain) / 2.0f - 0.1f);
    const float invGain = 1.0f / gain;
    std::vector bboxes;
    std::vector scores;
    std::vector labels;
    std::vector> allKps;
    for (int i = 0; i < numBoxes; ++i) {
        const float* row = output.ptr(i);
        const float* scoresPtr = row + 4;
        // Find best class
        float maxScore = -FLT_MAX;
        int bestClass = 0;
        int numScores = std::max(nc, 1); // nc is already >= 1; kept for safety
        for (int c = 0; c < numScores; ++c) {
            if (scoresPtr[c] > maxScore) { maxScore = scoresPtr[c]; bestClass = c; }
        }
        if (maxScore <= confThreshold) continue;
        // Center-format box -> corners in original-image coordinates.
        float cx = row[0], cy = row[1], w = row[2], h = row[3];
        float x0 = clamp((cx - w*0.5f - padX) * invGain, 0.f, origW);
        float y0 = clamp((cy - h*0.5f - padY) * invGain, 0.f, origH);
        float x1 = clamp((cx + w*0.5f - padX) * invGain, 0.f, origW);
        float y1 = clamp((cy + h*0.5f - padY) * invGain, 0.f, origH);
        // Extract keypoints
        const float* kpsPtr = row + kpsOffset;
        std::vector kps;
        kps.reserve(numKPS * 3);
        for (int k = 0; k < numKPS; ++k) {
            float kx = clamp((kpsPtr[3*k] - padX) * invGain, 0.f, origW);
            float ky = clamp((kpsPtr[3*k+1] - padY) * invGain, 0.f, origH);
            float ks = kpsPtr[3*k+2];
            kps.push_back(kx); kps.push_back(ky); kps.push_back(ks);
        }
        bboxes.push_back(cv::Rect(static_cast(x0), static_cast(y0),
            static_cast(x1-x0), static_cast(y1-y0)));
        scores.push_back(maxScore);
        labels.push_back(bestClass);
        allKps.push_back(std::move(kps));
    }
    // NMS
    std::vector indices;
    cv::dnn::NMSBoxesBatched(bboxes, scores, labels, confThreshold, iouThreshold, indices);
    std::vector results;
    for (int idx : indices) {
        if (static_cast(results.size()) >= maxDet) break;
        Object obj;
        obj.classId = labels[idx];
        obj.confidence = scores[idx];
        obj.box = bboxes[idx];
        obj.kps = allKps[idx];
        if (obj.classId >= 0 && obj.classId < static_cast(classNames.size()))
            obj.className = classNames[obj.classId];
        results.push_back(std::move(obj));
    }
    return results;
}

// ====================================================================
// CLASSIFICATION — postprocess
// ====================================================================
// Picks the arg-max class from a [B, nc] score tensor and returns a single Object.
std::vector ONNXYOLO::postprocessClassify(
    std::vector& outputTensors, const std::vector& classNames,
    const cv::Size& imageSize) {
    if (outputTensors.empty()) return {};
    const float* raw = outputTensors[0].GetTensorMutableData();
    const auto shape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape();
    if (shape.size() < 2) return {};
    const int nc = static_cast(shape[1]);
    // Check if the output is already a probability distribution (sums to ~1.0).
    // Some ONNX models (e.g. exported with opset 19) include a Softmax layer
    // in the graph itself. Applying softmax again would flatten the distribution
    // and produce near-uniform probabilities, causing wrong classifications.
float rawSum = 0.f;
    for (int i = 0; i < nc; ++i) rawSum += raw[i];
    // Heuristic: sum ≈ 1 and first value non-negative -> treat as softmax output.
    const bool alreadyNormalized = (rawSum > 0.9f && rawSum < 1.1f && raw[0] >= 0.f); // probabilities are non-negative
    std::vector probs(nc);
    if (alreadyNormalized) {
        // Output is already softmax — use as-is (skip double softmax)
        for (int i = 0; i < nc; ++i) probs[i] = raw[i];
    } else {
        // Raw logits — apply softmax (max-subtraction for numerical stability)
        float maxVal = -FLT_MAX;
        for (int i = 0; i < nc; ++i) maxVal = std::max(maxVal, raw[i]);
        float sumExp = 0.f;
        for (int i = 0; i < nc; ++i) { probs[i] = std::exp(raw[i] - maxVal); sumExp += probs[i]; }
        for (int i = 0; i < nc; ++i) probs[i] /= sumExp;
    }
    // Arg-max class.
    int bestClass = 0;
    float bestProb = 0.f;
    for (int i = 0; i < nc; ++i) {
        if (probs[i] > bestProb) { bestProb = probs[i]; bestClass = i; }
    }
    const int imgW = imageSize.width;
    const int imgH = imageSize.height;
    Object obj;
    // Classification has no localization: use a slightly inset full-frame box
    // (10 px margin when the image is large enough).
    if (imgW > 20 && imgH > 20) { obj.box = cv::Rect(10, 10, imgW - 20, imgH - 20); }
    else { obj.box = cv::Rect(0, 0, imgW, imgH); }
    //obj.polygon = ANSUtilityHelper::RectToNormalizedPolygon(obj.box, imgW, imgH);
    obj.classId = bestClass;
    obj.confidence = bestProb;
    if (bestClass >= 0 && bestClass < static_cast(classNames.size()))
        obj.className = classNames[bestClass];
    return { std::move(obj) };
}

// ====================================================================
// BATCH — sliceBatchOutput + detectBatch
// ====================================================================
// Creates a non-owning Ort::Value that views one image's slice of a batched
// output tensor. The returned tensor aliases batchTensor's buffer — it must
// not outlive batchTensor.
/*static*/ Ort::Value ONNXYOLO::sliceBatchOutput(
    Ort::Value& batchTensor, int64_t batchIndex,
    const std::vector& fullShape, Ort::MemoryInfo& memInfo) {
    // Per-image element count = product of all dims except batch
    int64_t elemsPerImage = 1;
    for (size_t d = 1; d < fullShape.size(); ++d) elemsPerImage *= fullShape[d];
    float* batchData = batchTensor.GetTensorMutableData();
    float* imageData = batchData + batchIndex * elemsPerImage;
    // Shape for single image: [1, D1, D2, ...]
    std::vector singleShape = fullShape;
    singleShape[0] = 1;
    return Ort::Value::CreateTensor(
        memInfo, imageData, static_cast(elemsPerImage),
        singleShape.data(), singleShape.size());
}

// Runs one batched inference over several images and dispatches each image's
// sliced outputs to the matching task-specific postprocessor.
std::vector> ONNXYOLO::detectBatch(
    const std::vector& images, const std::vector& classNames,
    float confThreshold, float iouThreshold, int numKPS) {
    lastWasClassification = false;
    lastBatchWasClassification = false;
    if (images.empty()) return {};
    const size_t N = images.size();
    // Fallback to sequential if model has fixed batch=1
    // (input_node_dims[0] == 1 and not dynamic (-1))
    if (input_node_dims.size() >= 1 && input_node_dims[0] == 1) {
        std::vector> results(N);
        for (size_t i = 0; i < N; ++i)
            results[i] = detect(images[i], classNames, confThreshold, iouThreshold, numKPS);
        // NOTE(review): this reflects only the LAST image's flag — confirm all
        // images in a batch are expected to share one model/task type.
        lastBatchWasClassification = lastWasClassification;
        return results;
    }
    // Store original sizes for per-image postprocessing
    std::vector originalSizes;
    originalSizes.reserve(N);
    for (const auto& img : images) originalSizes.push_back(img.size());
    // Batch preprocess + single inference call
    Ort::Value inputTensor = transformBatch(images);
    auto outputTensors = ort_session->Run(
        Ort::RunOptions{ nullptr }, input_node_names.data(), &inputTensor, 1,
        output_node_names.data(), num_outputs);
    const cv::Size resizedShape(
        static_cast(input_node_dims[3]), static_cast(input_node_dims[2]));
    // Determine task type from output shapes (same logic as detect())
    const size_t numOutputs = outputTensors.size();
    const auto shape0 = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape();
    // Per-image postprocessing
    std::vector> results(N);
    for (size_t i = 0; i < N; ++i) {
        // Build per-image sliced output tensors
        std::vector perImageOutputs;
        for (size_t t = 0; t < numOutputs; ++t) {
            auto tShape = outputTensors[t].GetTensorTypeAndShapeInfo().GetShape();
            perImageOutputs.push_back(
                sliceBatchOutput(outputTensors[t], static_cast(i), tShape, *memory_info_handler));
        }
        // Dispatch to correct postprocess method
        if (numOutputs >= 2) { const auto
protoShape = outputTensors[1].GetTensorTypeAndShapeInfo().GetShape();
            // Second output with 4 dims -> segmentation prototypes.
            if (protoShape.size() == 4) {
                // channels < boxes -> legacy [4+nc+nm, boxes] layout; otherwise end-to-end.
                if (shape0.size() >= 3 && shape0[1] < shape0[2]) {
                    results[i] = postprocessSegLegacy(originalSizes[i], resizedShape,
                        perImageOutputs, classNames, confThreshold, iouThreshold);
                } else {
                    results[i] = postprocessSegEndToEnd(originalSizes[i], resizedShape,
                        perImageOutputs, classNames, confThreshold);
                }
                continue;
            }
        }
        // [B, nc] output -> classification.
        if (shape0.size() == 2) {
            lastBatchWasClassification = true;
            results[i] = postprocessClassify(perImageOutputs, classNames, originalSizes[i]);
            continue;
        }
        if (shape0.size() < 3) continue;
        // End-to-end heads have few features per row (boxes-major layout).
        const bool isEndToEnd = (shape0[1] > shape0[2]) || (shape0[2] <= 20);
        if (isEndToEnd) {
            const int features = static_cast(shape0[2]);
            if (features == 6) {
                // [x1,y1,x2,y2,conf,cls] -> plain detection
                results[i] = postprocessEndToEnd(originalSizes[i], resizedShape,
                    perImageOutputs, classNames, confThreshold);
            } else if (features == 7) {
                // extra angle channel -> OBB
                results[i] = postprocessOBBEndToEnd(originalSizes[i], resizedShape,
                    perImageOutputs, classNames, confThreshold);
            } else if (features > 7 && (features - 6) % 3 == 0) {
                // 6 + nk*3 -> pose; trust caller's numKPS when provided
                int nk = (numKPS > 0) ? numKPS : (features - 6) / 3;
                results[i] = postprocessPoseEndToEnd(originalSizes[i], resizedShape,
                    perImageOutputs, classNames, confThreshold, nk);
            } else {
                results[i] = postprocessEndToEnd(originalSizes[i], resizedShape,
                    perImageOutputs, classNames, confThreshold);
            }
        } else {
            // Legacy channels-major layout: infer the task from channel arithmetic.
            const int nc = static_cast(classNames.size());
            const int numChannels = static_cast(shape0[1]);
            const int numBoxes = static_cast(shape0[2]);
            const int extra = numChannels - 4; // channels beyond the 4 box values
            bool routed = false;
            if (numKPS > 0 && numChannels >= 4 + 1 + numKPS * 3) {
                results[i] = postprocessPoseLegacy(originalSizes[i], resizedShape,
                    perImageOutputs, classNames, confThreshold, iouThreshold, numKPS);
                routed = true;
            } else if (nc > 0 && nc <= extra && extra > nc && (extra - nc) % 3 == 0 && (extra - nc) >= 3) {
                // leftover channels divisible by 3 -> keypoint triplets
                int nk = (extra - nc) / 3;
                results[i] = postprocessPoseLegacy(originalSizes[i], resizedShape,
                    perImageOutputs, classNames, confThreshold, iouThreshold, nk);
                routed = true;
            } else if (nc > 0 && nc <= extra && extra == nc + 1) {
                // exactly one extra channel -> OBB angle
                results[i] = postprocessOBBLegacy(originalSizes[i], resizedShape,
                    perImageOutputs, classNames, confThreshold, iouThreshold);
                routed = true;
            } else if (nc > 0 && nc <= extra && extra == nc) {
                results[i] = postprocessLegacy(originalSizes[i], resizedShape,
                    perImageOutputs, classNames, confThreshold, iouThreshold);
                routed = true;
            }
            if (!routed) {
                // Class count mismatch — probe last channel for OBB angles
                bool likelyOBB = false;
                if (extra >= 2) {
                    const float* rawOutput = perImageOutputs[0].GetTensorMutableData();
                    // Sample up to 100 values from the last channel; values within
                    // roughly [-pi, pi] suggest it holds rotation angles.
                    int numSamp = std::min(numBoxes, 100);
                    int angleCount = 0;
                    for (int s = 0; s < numSamp; ++s) {
                        float v = rawOutput[(numChannels - 1) * numBoxes + s];
                        if (v >= -3.15f && v <= 3.15f) ++angleCount;
                    }
                    likelyOBB = (angleCount > numSamp * 8 / 10); // >80% in range
                }
                if (likelyOBB) {
                    results[i] = postprocessOBBLegacy(originalSizes[i], resizedShape,
                        perImageOutputs, classNames, confThreshold, iouThreshold);
                } else {
                    results[i] = postprocessLegacy(originalSizes[i], resizedShape,
                        perImageOutputs, classNames, confThreshold, iouThreshold);
                }
            }
        }
    }
    return results;
}

// ====================================================================
// ANSONNXYOLO — ANSODBase wrapper
// ====================================================================
// Destructor releases the ORT engine; exceptions are logged, never thrown out.
ANSONNXYOLO::~ANSONNXYOLO() {
    try { Destroy(); }
    catch (const std::exception& e) {
        _logger.LogError("ANSONNXYOLO::~ANSONNXYOLO()", e.what(), __FILE__, __LINE__);
    }
}

// Releases the ORT engine. Returns false only if teardown itself throws.
bool ANSONNXYOLO::Destroy() {
    try { m_ortEngine.reset(); return true; }
    catch (const std::exception& e) {
        _logger.LogError("ANSONNXYOLO::Destroy", e.what(), __FILE__, __LINE__);
        return false;
    }
}

// Delegates to the base, then reports the model folder as the optimized output.
bool ANSONNXYOLO::OptimizeModel(bool fp16, std::string& optimizedModelFolder) {
    if (!ANSODBase::OptimizeModel(fp16, optimizedModelFolder)) return false;
    optimizedModelFolder = _modelFolder;
    return true;
}

// Creates the ORT engine from _modelFilePath with the default engine type.
bool ANSONNXYOLO::InitOrtEngine() {
    try {
        if (!FileExist(_modelFilePath)) {
            _logger.LogError("ANSONNXYOLO::InitOrtEngine",
                "Model file does not exist: " + _modelFilePath, __FILE__, __LINE__);
            return false;
        }
        m_ortEngine = std::make_unique(_modelFilePath);
        return true;
    } catch (const std::exception& e) {
        _logger.LogFatal("ANSONNXYOLO::InitOrtEngine", e.what(), __FILE__, __LINE__);
        return false;
    }
}

// Overload: creates the ORT engine with an explicit execution-provider type.
bool ANSONNXYOLO::InitOrtEngine(ANSCENTER::EngineType engineType) {
    try {
        if (!FileExist(_modelFilePath)) {
            _logger.LogError("ANSONNXYOLO::InitOrtEngine",
                "Model file does not exist: " + _modelFilePath, __FILE__, __LINE__);
            return false;
        }
        m_ortEngine = std::make_unique(_modelFilePath, engineType);
        return true;
    } catch (const std::exception& e) {
        _logger.LogFatal("ANSONNXYOLO::InitOrtEngine", e.what(), __FILE__, __LINE__);
        return false;
    }
}

// Full initialization: base init from the model zip, config sanitation, class
// list loading, engine creation, and dynamic-shape resolution override.
bool ANSONNXYOLO::Initialize(std::string licenseKey, ModelConfig modelConfig,
    const std::string& modelZipFilePath, const std::string& modelZipPassword,
    std::string& labelMap) {
    std::lock_guard lock(_mutex);
    try {
        _modelLoadValid = false;
        bool result = ANSODBase::Initialize(licenseKey, modelConfig, modelZipFilePath, modelZipPassword,
labelMap);
        if (!result) return false;
        _modelConfig = modelConfig;
        // Sanitize config: fall back to 640x640 and standard thresholds when unset/too low.
        if (_modelConfig.inpHeight <= 0) _modelConfig.inpHeight = 640;
        if (_modelConfig.inpWidth <= 0) _modelConfig.inpWidth = 640;
        if (_modelConfig.modelMNSThreshold < 0.2f) _modelConfig.modelMNSThreshold = 0.45f;
        if (_modelConfig.modelConfThreshold < 0.2f) _modelConfig.modelConfThreshold = 0.25f;
        PROBABILITY_THRESHOLD = _modelConfig.detectionScoreThreshold;
        NMS_THRESHOLD = _modelConfig.modelMNSThreshold;
        TOP_K = 300;
        NUM_KPS = _modelConfig.numKPS;
        KPS_THRESHOLD = _modelConfig.kpsThreshold;
        _modelFilePath = CreateFilePath(_modelFolder, "train_last.onnx");
        // Prefer the model config file for classes + input shape; otherwise fall
        // back to classes.names (or the embedded string list).
        if (FileExist(_modelConfigFile)) {
            ModelType modelType;
            std::vector inputShape;
            _classes = ANSUtilityHelper::GetConfigFileContent(_modelConfigFile, modelType, inputShape);
            if (inputShape.size() == 2) {
                if (inputShape[0] > 0) _modelConfig.inpHeight = inputShape[0];
                if (inputShape[1] > 0) _modelConfig.inpWidth = inputShape[1];
            }
        } else {
            _classFilePath = CreateFilePath(_modelFolder, "classes.names");
            std::ifstream isValid(_classFilePath);
            if (!isValid) LoadClassesFromString();
            else LoadClassesFromFile();
        }
        labelMap.clear();
        if (!_classes.empty()) labelMap = VectorToCommaSeparatedString(_classes);
        if (this->_loadEngineOnCreation) {
            if (!InitOrtEngine()) {
                _logger.LogError("ANSONNXYOLO::Initialize",
                    "Failed to create ONNX Runtime engine: " + _modelFilePath, __FILE__, __LINE__);
                return false;
            }
        }
        // Fix input resolution for dynamic-shape models.
        // The constructor defaults to 640x640 when ONNX dims are dynamic,
        // but the ModelConfig may specify the correct size (e.g. 224x224
        // for classification models). Override here after config is loaded.
        if (m_ortEngine && m_ortEngine->hasDynamicInputShape()) {
            if (_modelConfig.inpHeight > 0 && _modelConfig.inpWidth > 0) {
                m_ortEngine->setInputShape(_modelConfig.inpWidth, _modelConfig.inpHeight);
            }
        }
        _modelLoadValid = true;
        _isInitialized = true;
        return true;
    } catch (const std::exception& e) {
        _logger.LogFatal("ANSONNXYOLO::Initialize", e.what(), __FILE__, __LINE__);
        return false;
    }
}

// Re-load from a model zip, mirroring Initialize() but keeping the existing
// license/config where the base class preserves them.
bool ANSONNXYOLO::LoadModel(const std::string& modelZipFilePath, const std::string& modelZipPassword) {
    std::lock_guard lock(_mutex);
    try {
        bool result = ANSODBase::LoadModel(modelZipFilePath, modelZipPassword);
        if (!result) return false;
        // Same config sanitation as Initialize().
        if (_modelConfig.inpHeight <= 0) _modelConfig.inpHeight = 640;
        if (_modelConfig.inpWidth <= 0) _modelConfig.inpWidth = 640;
        if (_modelConfig.modelMNSThreshold < 0.2f) _modelConfig.modelMNSThreshold = 0.45f;
        if (_modelConfig.modelConfThreshold < 0.2f) _modelConfig.modelConfThreshold = 0.25f;
        PROBABILITY_THRESHOLD = _modelConfig.detectionScoreThreshold;
        NMS_THRESHOLD = _modelConfig.modelMNSThreshold;
        TOP_K = 300;
        NUM_KPS = _modelConfig.numKPS;
        KPS_THRESHOLD = _modelConfig.kpsThreshold;
        _modelFilePath = CreateFilePath(_modelFolder, "train_last.onnx");
        if (FileExist(_modelConfigFile)) {
            ModelType modelType;
            std::vector inputShape;
            _classes = ANSUtilityHelper::GetConfigFileContent(_modelConfigFile, modelType, inputShape);
            if (inputShape.size() == 2) {
                if (inputShape[0] > 0) _modelConfig.inpHeight = inputShape[0];
                if (inputShape[1] > 0) _modelConfig.inpWidth = inputShape[1];
            }
        } else {
            _classFilePath = CreateFilePath(_modelFolder, "classes.names");
            std::ifstream isValid(_classFilePath);
            if (!isValid) LoadClassesFromString();
            else LoadClassesFromFile();
        }
        if (this->_loadEngineOnCreation) {
            if (!InitOrtEngine()) { _modelLoadValid = false; return false; }
        }
        // Fix input resolution for dynamic-shape models (same as primary Initialize)
        if (m_ortEngine && m_ortEngine->hasDynamicInputShape()) {
            if (_modelConfig.inpHeight > 0 && _modelConfig.inpWidth > 0) {
m_ortEngine->setInputShape(_modelConfig.inpWidth, _modelConfig.inpHeight);
            }
        }
        _modelLoadValid = true;
        _isInitialized = true;
        return true;
    } catch (const std::exception& e) {
        _logger.LogFatal("ANSONNXYOLO::LoadModel", e.what(), __FILE__, __LINE__);
        return false;
    }
}

// Load a model from an unpacked folder; modelName defaults to "train_last"
// and className names the class-list file inside the folder.
bool ANSONNXYOLO::LoadModelFromFolder(std::string licenseKey, ModelConfig modelConfig,
    std::string modelName, std::string className, const std::string& modelFolder,
    std::string& labelMap) {
    std::lock_guard lock(_mutex);
    try {
        bool result = ANSODBase::LoadModelFromFolder(licenseKey, modelConfig, modelName,
            className, modelFolder, labelMap);
        if (!result) return false;
        _modelConfig = modelConfig;
        // Same config sanitation as Initialize().
        if (_modelConfig.inpHeight <= 0) _modelConfig.inpHeight = 640;
        if (_modelConfig.inpWidth <= 0) _modelConfig.inpWidth = 640;
        if (_modelConfig.modelMNSThreshold < 0.2f) _modelConfig.modelMNSThreshold = 0.45f;
        if (_modelConfig.modelConfThreshold < 0.2f) _modelConfig.modelConfThreshold = 0.25f;
        PROBABILITY_THRESHOLD = _modelConfig.detectionScoreThreshold;
        NMS_THRESHOLD = _modelConfig.modelMNSThreshold;
        TOP_K = 300;
        NUM_KPS = _modelConfig.numKPS;
        KPS_THRESHOLD = _modelConfig.kpsThreshold;
        std::string _modelName = modelName;
        if (_modelName.empty()) _modelName = "train_last";
        std::string modelFullName = _modelName + ".onnx";
        _modelFilePath = CreateFilePath(_modelFolder, modelFullName);
        if (FileExist(_modelConfigFile)) {
            ModelType modelType;
            std::vector inputShape;
            _classes = ANSUtilityHelper::GetConfigFileContent(_modelConfigFile, modelType, inputShape);
            if (inputShape.size() == 2) {
                if (inputShape[0] > 0) _modelConfig.inpHeight = inputShape[0];
                if (inputShape[1] > 0) _modelConfig.inpWidth = inputShape[1];
            }
        } else {
            _classFilePath = CreateFilePath(_modelFolder, className);
            std::ifstream isValid(_classFilePath);
            if (!isValid) LoadClassesFromString();
            else LoadClassesFromFile();
        }
        labelMap.clear();
        if (!_classes.empty()) labelMap = VectorToCommaSeparatedString(_classes);
        if (this->_loadEngineOnCreation) {
            if (!InitOrtEngine()) { _modelLoadValid = false; return false; }
        }
        // Fix input resolution for dynamic-shape models (same as primary Initialize)
        if (m_ortEngine && m_ortEngine->hasDynamicInputShape()) {
            if (_modelConfig.inpHeight > 0 && _modelConfig.inpWidth > 0) {
                m_ortEngine->setInputShape(_modelConfig.inpWidth, _modelConfig.inpHeight);
            }
        }
        _modelLoadValid = true;
        _isInitialized = true;
        return true;
    } catch (const std::exception& e) {
        _logger.LogFatal("ANSONNXYOLO::LoadModelFromFolder", e.what(), __FILE__, __LINE__);
        return false;
    }
}

// Convenience overload: inference without a camera id.
std::vector ANSONNXYOLO::RunInference(const cv::Mat& inputImgBGR) {
    return RunInference(inputImgBGR, "");
}

// Validates state/license under the lock, then runs detection OUTSIDE the lock
// (DetectObjects re-acquires it).
std::vector ANSONNXYOLO::RunInference(const cv::Mat& inputImgBGR, const std::string& camera_id) {
    {
        std::lock_guard lock(_mutex);
        if (!_modelLoadValid) {
            _logger.LogError("ANSONNXYOLO::RunInference", "Model not loaded", __FILE__, __LINE__);
            return {};
        }
        if (!_licenseValid) {
            _logger.LogError("ANSONNXYOLO::RunInference", "Invalid license", __FILE__, __LINE__);
            return {};
        }
        if (!_isInitialized) {
            _logger.LogError("ANSONNXYOLO::RunInference", "Model not initialized", __FILE__, __LINE__);
            return {};
        }
        // Reject tiny/empty frames early.
        if (inputImgBGR.empty() || inputImgBGR.cols < 10 || inputImgBGR.rows < 10) return {};
    }
    try {
        return DetectObjects(inputImgBGR, camera_id);
    } catch (const std::exception& e) {
        _logger.LogFatal("ANSONNXYOLO::RunInference", e.what(), __FILE__, __LINE__);
        return {};
    }
}

// Core detection: optional NV12 full-res fast path, engine inference,
// coordinate rescale back to display resolution, then tracking/stabilization.
std::vector ANSONNXYOLO::DetectObjects(const cv::Mat& inputImage, const std::string& camera_id) {
    try {
        ANS_DBG("ONNXYOLO", "DetectObjects: cam=%s acquiring mutex...", camera_id.c_str());
        std::lock_guard lock(_mutex);
        ANS_DBG("ONNXYOLO", "DetectObjects: mutex acquired, cam=%s", camera_id.c_str());
        if (!m_ortEngine) {
            _logger.LogError("ANSONNXYOLO::DetectObjects", "ORT engine is null", __FILE__, __LINE__);
            ANS_DBG("ONNXYOLO", "DetectObjects: ORT engine is null!");
            return {};
        }
        // --- NV12 fast path: try to get full-res BGR from GPU NV12 frame ---
        cv::Mat inferenceImage =
inputImage; float bgrScaleX = 1.0f, bgrScaleY = 1.0f; { auto* gpuData = tl_currentGpuFrame(); if (gpuData && gpuData->width > 0 && gpuData->height > 0) { // Full-res NV12 available — convert to BGR on CPU for ORT // (ORT preprocessing is CPU-based, so we need a cv::Mat) if (gpuData->cpuYPlane && gpuData->cpuUvPlane && gpuData->cpuYLinesize >= gpuData->width && gpuData->cpuUvLinesize >= gpuData->width) { const int fw = gpuData->width; const int fh = gpuData->height; // NV12 requires even dimensions if ((fw % 2) == 0 && (fh % 2) == 0) { try { cv::Mat yPlane(fh, fw, CV_8UC1, gpuData->cpuYPlane, static_cast(gpuData->cpuYLinesize)); cv::Mat uvPlane(fh / 2, fw / 2, CV_8UC2, gpuData->cpuUvPlane, static_cast(gpuData->cpuUvLinesize)); cv::Mat fullResBGR; cv::cvtColorTwoPlane(yPlane, uvPlane, fullResBGR, cv::COLOR_YUV2BGR_NV12); if (!fullResBGR.empty()) { bgrScaleX = static_cast(inputImage.cols) / fullResBGR.cols; bgrScaleY = static_cast(inputImage.rows) / fullResBGR.rows; inferenceImage = fullResBGR; } } catch (...) 
{ /* NV12 conversion failed — fall back to inputImage */ } } } } } auto results = m_ortEngine->detect(inferenceImage, _classes, PROBABILITY_THRESHOLD, NMS_THRESHOLD, NUM_KPS); // --- Rescale coordinates from full-res back to display-res --- if (bgrScaleX != 1.0f || bgrScaleY != 1.0f) { for (auto& obj : results) { obj.box.x = static_cast(obj.box.x * bgrScaleX); obj.box.y = static_cast(obj.box.y * bgrScaleY); obj.box.width = static_cast(obj.box.width * bgrScaleX); obj.box.height = static_cast(obj.box.height * bgrScaleY); for (size_t k = 0; k < obj.kps.size(); k += 2) { obj.kps[k] *= bgrScaleX; // x if (k + 1 < obj.kps.size()) obj.kps[k + 1] *= bgrScaleY; // y } for (auto& pt : obj.polygon) { pt.x *= bgrScaleX; pt.y *= bgrScaleY; } } } for (auto& obj : results) obj.cameraId = camera_id; // Skip tracking for classification models if (_trackerEnabled && !m_ortEngine->lastWasClassification) { results = ApplyTracking(results, camera_id); if (_stabilizationEnabled) results = StabilizeDetections(results, camera_id); } return results; } catch (const std::exception& e) { const std::string msg = e.what(); // ── DML device-removal detection ────────────────────────── // HRESULT 887A0005 = DXGI_ERROR_DEVICE_REMOVED ("The GPU // device instance has been suspended"). Once the D3D12 // device is gone the ORT session is permanently broken. // Log once, attempt CPU fallback, suppress further flood. 
if (msg.find("887A0005") != std::string::npos) { if (!_dmlDeviceLost) { _dmlDeviceLost = true; _logger.LogFatal("ANSONNXYOLO::DetectObjects", "DirectML GPU device lost (887A0005) — attempting CPU fallback", __FILE__, __LINE__); ANS_DBG("ONNXYOLO", "DML device lost — recreating session on CPU"); try { m_ortEngine.reset(); if (InitOrtEngine(ANSCENTER::EngineType::CPU)) { _logger.LogInfo("ANSONNXYOLO::DetectObjects", "CPU fallback session created successfully", __FILE__, __LINE__); ANS_DBG("ONNXYOLO", "CPU fallback OK"); } else { _logger.LogFatal("ANSONNXYOLO::DetectObjects", "CPU fallback session creation failed", __FILE__, __LINE__); } } catch (const std::exception& re) { _logger.LogFatal("ANSONNXYOLO::DetectObjects", std::string("CPU fallback exception: ") + re.what(), __FILE__, __LINE__); } } // Suppress flood — already logged above return {}; } ANS_DBG("ONNXYOLO", "DetectObjects EXCEPTION: %s cam=%s", e.what(), camera_id.c_str()); _logger.LogFatal("ANSONNXYOLO::DetectObjects", e.what(), __FILE__, __LINE__); return {}; } } // ==================================================================== // RunInferencesBatch / DetectObjectsBatch — true ONNX batch // ==================================================================== std::vector> ANSONNXYOLO::RunInferencesBatch( const std::vector& inputs, const std::string& camera_id) { { std::lock_guard lock(_mutex); if (!_modelLoadValid) { _logger.LogFatal("ANSONNXYOLO::RunInferencesBatch", "Cannot load ONNX model", __FILE__, __LINE__); return {}; } if (!_licenseValid) { _logger.LogFatal("ANSONNXYOLO::RunInferencesBatch", "Invalid license", __FILE__, __LINE__); return {}; } if (!_isInitialized) { _logger.LogFatal("ANSONNXYOLO::RunInferencesBatch", "Model not initialized", __FILE__, __LINE__); return {}; } if (inputs.empty()) { _logger.LogWarn("ANSONNXYOLO::RunInferencesBatch", "Empty input batch", __FILE__, __LINE__); return {}; } } try { return DetectObjectsBatch(inputs, camera_id); } catch (const std::exception& e) { 
const std::string msg = e.what(); if (msg.find("887A0005") != std::string::npos) { if (!_dmlDeviceLost) { _dmlDeviceLost = true; _logger.LogFatal("ANSONNXYOLO::RunInferencesBatch", "DirectML GPU device lost (887A0005) — attempting CPU fallback", __FILE__, __LINE__); try { m_ortEngine.reset(); if (!InitOrtEngine(ANSCENTER::EngineType::CPU)) _logger.LogFatal("ANSONNXYOLO::RunInferencesBatch", "CPU fallback session creation failed", __FILE__, __LINE__); } catch (...) {} } return {}; } _logger.LogFatal("ANSONNXYOLO::RunInferencesBatch", e.what(), __FILE__, __LINE__); return {}; } } std::vector> ANSONNXYOLO::DetectObjectsBatch( const std::vector& inputImages, const std::string& camera_id) { try { std::lock_guard lock(_mutex); if (!m_ortEngine) { _logger.LogError("ANSONNXYOLO::DetectObjectsBatch", "ORT engine is null", __FILE__, __LINE__); return {}; } auto batchResults = m_ortEngine->detectBatch( inputImages, _classes, PROBABILITY_THRESHOLD, NMS_THRESHOLD, NUM_KPS); const bool isClassification = m_ortEngine->lastBatchWasClassification; for (auto& results : batchResults) { for (auto& obj : results) obj.cameraId = camera_id; // Skip tracking for classification models if (_trackerEnabled && !isClassification) { results = ApplyTracking(results, camera_id); if (_stabilizationEnabled) results = StabilizeDetections(results, camera_id); } } return batchResults; } catch (const std::exception& e) { const std::string msg = e.what(); if (msg.find("887A0005") != std::string::npos) { if (!_dmlDeviceLost) { _dmlDeviceLost = true; _logger.LogFatal("ANSONNXYOLO::DetectObjectsBatch", "DirectML GPU device lost (887A0005) — attempting CPU fallback", __FILE__, __LINE__); try { m_ortEngine.reset(); if (!InitOrtEngine(ANSCENTER::EngineType::CPU)) _logger.LogFatal("ANSONNXYOLO::DetectObjectsBatch", "CPU fallback session creation failed", __FILE__, __LINE__); } catch (...) 
{} } return {}; } _logger.LogFatal("ANSONNXYOLO::DetectObjectsBatch", e.what(), __FILE__, __LINE__); return {}; } } } // namespace ANSCENTER