2026-03-28 16:54:11 +11:00
|
|
|
#include "ANSONNXYOLO.h"
|
|
|
|
|
#include "Utility.h"
|
|
|
|
|
#include "ANSGpuFrameRegistry.h"
|
|
|
|
|
#include "NV12PreprocessHelper.h" // tl_currentGpuFrame()
|
|
|
|
|
#include <numeric> // std::iota
|
|
|
|
|
#include <cmath>
|
2026-04-10 17:13:47 +10:00
|
|
|
#include <chrono> // WarmUpEngine() timing
|
2026-03-28 16:54:11 +11:00
|
|
|
|
|
|
|
|
namespace ANSCENTER {
|
|
|
|
|
|
|
|
|
|
// ====================================================================
|
|
|
|
|
// ONNXYOLO — BasicOrtHandler subclass for Ultralytics YOLO
|
|
|
|
|
// ====================================================================
|
|
|
|
|
|
|
|
|
|
// Construct from an ONNX model path with the default engine.
// Derives the network input resolution from the model's NCHW input
// dims; dynamic (-1) height/width falls back to 640x640.
ONNXYOLO::ONNXYOLO(const std::string& _onnx_path, unsigned int _num_threads)
    : BasicOrtHandler(_onnx_path, _num_threads)
{
    cv::Size netSize(640, 640);   // Ultralytics default fallback
    if (input_node_dims.size() >= 4) {
        const int netH = static_cast<int>(input_node_dims[2]);
        const int netW = static_cast<int>(input_node_dims[3]);
        isDynamicInputShape = (netH == -1 || netW == -1);
        if (!isDynamicInputShape)
            netSize = cv::Size(netW, netH);
    }
    inputImageShape = netSize;
}
|
|
|
|
|
|
|
|
|
|
// Construct from an ONNX model path with an explicitly selected
// execution engine. Input-resolution handling is identical to the
// default-engine constructor: NCHW dims from the model, dynamic
// (-1) height/width falls back to 640x640.
ONNXYOLO::ONNXYOLO(const std::string& _onnx_path, EngineType engineType,
    unsigned int _num_threads)
    : BasicOrtHandler(_onnx_path, engineType, _num_threads)
{
    cv::Size netSize(640, 640);   // Ultralytics default fallback
    if (input_node_dims.size() >= 4) {
        const int netH = static_cast<int>(input_node_dims[2]);
        const int netW = static_cast<int>(input_node_dims[3]);
        isDynamicInputShape = (netH == -1 || netW == -1);
        if (!isDynamicInputShape)
            netSize = cv::Size(netW, netH);
    }
    inputImageShape = netSize;
}
|
|
|
|
|
|
|
|
|
|
// ------------------------------------------------------------------
|
|
|
|
|
// letterBox — Ultralytics-compatible LetterBox transform
|
|
|
|
|
// ------------------------------------------------------------------
|
|
|
|
|
// ------------------------------------------------------------------
// letterBox — Ultralytics-compatible LetterBox transform.
// Scales `image` uniformly so it fits inside `newShape` (optionally
// never upscaling), then pads symmetrically with `color`. The
// -0.1/+0.1 rounding trick reproduces Ultralytics' deterministic
// split of an odd pad between the two sides. `stride` is accepted
// for signature compatibility but not used here.
// ------------------------------------------------------------------
void ONNXYOLO::letterBox(const cv::Mat& image, cv::Mat& outImage,
    const cv::Size& newShape,
    const cv::Scalar& color,
    bool scaleUp, int stride)
{
    // Uniform scale that fits the whole image inside newShape.
    float scale = std::min(static_cast<float>(newShape.height) / image.rows,
                           static_cast<float>(newShape.width) / image.cols);
    if (!scaleUp)
        scale = std::min(scale, 1.0f);

    const int scaledW = static_cast<int>(std::round(image.cols * scale));
    const int scaledH = static_cast<int>(std::round(image.rows * scale));

    // Padding per side (half of the total slack in each dimension).
    const float halfPadW = (newShape.width - scaledW) / 2.0f;
    const float halfPadH = (newShape.height - scaledH) / 2.0f;

    if (image.cols == scaledW && image.rows == scaledH)
        outImage = image.clone();
    else
        cv::resize(image, outImage, cv::Size(scaledW, scaledH),
                   0, 0, cv::INTER_LINEAR);

    // Ultralytics -0.1/+0.1 trick for deterministic padding split.
    const int top    = static_cast<int>(std::round(halfPadH - 0.1f));
    const int bottom = static_cast<int>(std::round(halfPadH + 0.1f));
    const int left   = static_cast<int>(std::round(halfPadW - 0.1f));
    const int right  = static_cast<int>(std::round(halfPadW + 0.1f));

    cv::copyMakeBorder(outImage, outImage, top, bottom, left, right,
                       cv::BORDER_CONSTANT, color);
}
|
|
|
|
|
|
|
|
|
|
// ------------------------------------------------------------------
|
|
|
|
|
// transform — BGR → RGB, letterbox, /255, HWC→CHW
|
|
|
|
|
// ------------------------------------------------------------------
|
|
|
|
|
// ------------------------------------------------------------------
// transform — preprocess one image into the model's input tensor:
// grayscale→BGR (if needed), letterbox (or plain resize for a
// classification head), BGR→RGB, /255 to float, HWC→CHW.
// The returned Ort::Value aliases input_values_handler, which stays
// alive on the object.
// ------------------------------------------------------------------
Ort::Value ONNXYOLO::transform(const cv::Mat& mat)
{
    // Promote single-channel input to 3-channel BGR.
    cv::Mat src;
    if (mat.channels() == 1)
        cv::cvtColor(mat, src, cv::COLOR_GRAY2BGR);
    else
        src = mat;

    // A 2-D first output ([B, nc]) marks a classification head,
    // which is preprocessed with a plain resize (matches ANSONNXCL).
    const bool classifyHead = !output_node_dims.empty()
        && output_node_dims[0].size() == 2;

    cv::Mat net;
    if (classifyHead)
        cv::resize(src, net, cv::Size(inputImageShape.width, inputImageShape.height),
                   0, 0, cv::INTER_LINEAR);
    else
        letterBox(src, net, inputImageShape);   // det/seg/pose/OBB

    cv::cvtColor(net, net, cv::COLOR_BGR2RGB);
    net.convertTo(net, CV_32FC3, 1.0 / 255.0);

    const int nCh = net.channels();
    const int rows = net.rows;
    const int cols = net.cols;
    const size_t planeSize = static_cast<size_t>(rows) * cols;

    input_node_dims = { 1, 3, rows, cols };
    input_tensor_size = 1 * 3 * planeSize;
    input_values_handler.resize(input_tensor_size);

    // Split channels directly into views over the tensor buffer —
    // this performs the HWC→CHW repack without an extra copy.
    std::vector<cv::Mat> planes(nCh);
    for (int c = 0; c < nCh; ++c) {
        planes[c] = cv::Mat(rows, cols, CV_32FC1,
                            input_values_handler.data() + c * planeSize);
    }
    cv::split(net, planes);

    return Ort::Value::CreateTensor<float>(
        *memory_info_handler,
        input_values_handler.data(),
        input_tensor_size,
        input_node_dims.data(),
        input_node_dims.size());
}
|
|
|
|
|
|
|
|
|
|
// ------------------------------------------------------------------
// transformBatch — batched version of transform(): preprocess every
// image (grayscale→BGR, letterbox or resize, BGR→RGB, /255) and pack
// all of them into one NCHW tensor. Throws std::runtime_error on an
// empty batch or an empty image inside it.
// ------------------------------------------------------------------
Ort::Value ONNXYOLO::transformBatch(const std::vector<cv::Mat>& images)
{
    if (images.empty())
        throw std::runtime_error("ONNXYOLO::transformBatch: empty input");

    const size_t batchSize = images.size();

    // A 2-D first output ([B, nc]) marks a classification head,
    // which uses a plain resize instead of the letterbox.
    const bool classifyHead = !output_node_dims.empty()
        && output_node_dims[0].size() == 2;

    // Per-image preprocessing.
    std::vector<cv::Mat> prepared;
    prepared.reserve(batchSize);
    for (const auto& src : images) {
        if (src.empty())
            throw std::runtime_error("ONNXYOLO::transformBatch: empty image in batch");

        // Promote single-channel input to 3-channel BGR.
        cv::Mat bgr;
        if (src.channels() == 1)
            cv::cvtColor(src, bgr, cv::COLOR_GRAY2BGR);
        else
            bgr = src;

        cv::Mat net;
        if (classifyHead)
            cv::resize(bgr, net, cv::Size(inputImageShape.width, inputImageShape.height),
                       0, 0, cv::INTER_LINEAR);
        else
            letterBox(bgr, net, inputImageShape);

        cv::cvtColor(net, net, cv::COLOR_BGR2RGB);
        net.convertTo(net, CV_32FC3, 1.0 / 255.0);
        prepared.push_back(net);
    }

    // All letterboxed canvases share the same size, so the first one
    // defines the tensor's spatial dims.
    const int rows = prepared[0].rows;
    const int cols = prepared[0].cols;
    const size_t planeSize = static_cast<size_t>(rows) * cols;

    input_node_dims = {
        static_cast<int64_t>(batchSize), 3,
        static_cast<int64_t>(rows),
        static_cast<int64_t>(cols)
    };
    input_tensor_size = batchSize * 3 * planeSize;
    input_values_handler.resize(input_tensor_size);

    // HWC→CHW repack, one image at a time, by splitting channels into
    // views over this image's slice of the tensor buffer.
    for (size_t b = 0; b < batchSize; ++b) {
        float* dst = input_values_handler.data() + b * 3 * planeSize;
        std::vector<cv::Mat> planes(3);
        for (int c = 0; c < 3; ++c) {
            planes[c] = cv::Mat(rows, cols, CV_32FC1, dst + c * planeSize);
        }
        cv::split(prepared[b], planes);
    }

    return Ort::Value::CreateTensor<float>(
        *memory_info_handler,
        input_values_handler.data(),
        input_tensor_size,
        input_node_dims.data(),
        input_node_dims.size());
}
|
|
|
|
|
|
|
|
|
|
// ------------------------------------------------------------------
|
|
|
|
|
// detect — full pipeline with auto task detection
|
|
|
|
|
//
|
|
|
|
|
// Decision logic:
|
|
|
|
|
// 2 outputs (second 4D) → segmentation
|
|
|
|
|
// 1 output, 2D → classification
|
|
|
|
|
// 1 output, 3D end2end dim2=6 → detection
|
|
|
|
|
// 1 output, 3D end2end dim2=7 → OBB
|
|
|
|
|
// 1 output, 3D end2end dim2>7 → pose (if (dim2-6)%3==0)
|
|
|
|
|
// 1 output, 3D legacy → detect/obb/pose by nc
|
|
|
|
|
// ------------------------------------------------------------------
|
|
|
|
|
// ------------------------------------------------------------------
// detect — full inference pipeline with automatic task detection.
//
// Runs preprocessing (transform), a single ONNX Runtime session Run,
// an output-shape sanity check, then dispatches to the matching
// postprocess routine based on the output tensor layout:
//   2 outputs (second 4D)            → segmentation (legacy/end2end)
//   1 output, 2D [B, nc]             → classification
//   1 output, 3D end2end dim2=6      → detection
//   1 output, 3D end2end dim2=7      → OBB
//   1 output, 3D end2end dim2>7      → pose (if (dim2-6)%3==0)
//   1 output, 3D legacy              → detect/obb/pose by class count
// Returns an empty vector for an empty image or implausible output
// shapes. Sets lastWasClassification when the classify path runs.
// numKPS > 0 forces the keypoint count on the pose paths.
// ------------------------------------------------------------------
std::vector<Object> ONNXYOLO::detect(const cv::Mat& image,
    const std::vector<std::string>& classNames,
    float confThreshold,
    float iouThreshold,
    int numKPS)
{
    lastWasClassification = false;

    if (image.empty())
        return {};

    // Preprocess; transform() also rewrites input_node_dims to the
    // actual {1,3,H,W} fed to the model (used below for resizedShape).
    Ort::Value inputTensor = transform(image);

    auto outputTensors = ort_session->Run(
        Ort::RunOptions{ nullptr },
        input_node_names.data(),
        &inputTensor, 1,
        output_node_names.data(),
        num_outputs);

    // ── Output shape sanity check ───────────────────────────────────
    // DirectML on some AMD configurations has been observed to return
    // output tensors whose dim[1]/dim[2] values don't match what the
    // ONNX graph actually produced, which propagates into
    // postprocessLegacy / postprocessEndToEnd as huge numBoxes /
    // numChannels values and causes multi-terabyte cv::Mat allocations
    // inside the `cv::Mat(numChannels, numBoxes, CV_32F, ...).t()`
    // call (observed as "Failed to allocate 3522082959360 bytes" on
    // Ryzen APUs). Bail out early here instead of letting the
    // postprocess layer try to materialise a 3.5 TB buffer.
    //
    // Sane upper bounds for Ultralytics YOLO outputs:
    //   • legacy [1, 84..300, 8400..25200] → max dim ≈ 30k
    //   • end2end [1, 300, 6..56] → max dim ≈ 300
    //   • segmentation proto mask [1, 32, 160, 160] → max dim ≈ 160
    //   • classification [1, 1000] → max dim ≈ 1k
    // 1,000,000 is ~30x the largest real-world dim and catches the
    // garbage values without clipping any legitimate model.
    constexpr int64_t kMaxOutputDim = 1000000;
    for (size_t t = 0; t < outputTensors.size(); ++t) {
        const auto shape = outputTensors[t].GetTensorTypeAndShapeInfo().GetShape();
        for (size_t d = 0; d < shape.size(); ++d) {
            if (shape[d] < 0 || shape[d] > kMaxOutputDim) {
                std::cerr << "[ONNXYOLO] detect: output[" << t
                    << "] dim[" << d << "]=" << shape[d]
                    << " is out of range — refusing to postprocess."
                    << std::endl;
                return {};
            }
        }
    }

    // Model input size actually used this run (W, H), needed by the
    // postprocess routines to undo the letterbox mapping.
    const cv::Size resizedShape(
        static_cast<int>(input_node_dims[3]),
        static_cast<int>(input_node_dims[2]));

    const size_t numOutputs = outputTensors.size();

    // ── Segmentation: 2 outputs (detections + proto masks) ──────────
    if (numOutputs >= 2) {
        const auto protoShape = outputTensors[1].GetTensorTypeAndShapeInfo().GetShape();
        if (protoShape.size() == 4) {
            const auto shape0 = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape();
            // Legacy: shape [B, channels, num_boxes] e.g. [1, 116, 8400] -> shape[1] < shape[2]
            // End2end: shape [B, max_det, features] e.g. [1, 300, 38] -> shape[1] > shape[2]
            if (shape0.size() >= 3 && shape0[1] < shape0[2]) {
                return postprocessSegLegacy(image.size(), resizedShape,
                    outputTensors, classNames,
                    confThreshold, iouThreshold);
            }
            else {
                return postprocessSegEndToEnd(image.size(), resizedShape,
                    outputTensors, classNames, confThreshold);
            }
        }
    }

    const auto shape0 = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape();

    // ── Classification: 2D output [B, nc] ───────────────────────────
    if (shape0.size() == 2) {
        lastWasClassification = true;
        return postprocessClassify(outputTensors, classNames, image.size());
    }

    // Anything other than a 3D tensor cannot be handled below.
    if (shape0.size() < 3)
        return {};

    // ── Determine end2end vs legacy ─────────────────────────────────
    // End2end: shape [B, max_det, features] where max_det < features is false
    //          typically [1, 300, 6/7/...] so shape[1] > shape[2]
    // Legacy:  shape [B, channels, num_boxes] where channels < num_boxes
    //          typically [1, 84, 8400] so shape[1] < shape[2]
    const bool isEndToEnd = (shape0[1] > shape0[2])
        || (shape0[2] <= 20); // very small dim2 = end2end

    if (isEndToEnd) {
        // End2end heads: feature count identifies the task.
        const int features = static_cast<int>(shape0[2]);
        if (features == 6) {
            // x1,y1,x2,y2,conf,classId → plain detection
            return postprocessEndToEnd(image.size(), resizedShape,
                outputTensors, classNames, confThreshold);
        }
        else if (features == 7) {
            // extra angle column → oriented bounding boxes
            return postprocessOBBEndToEnd(image.size(), resizedShape,
                outputTensors, classNames, confThreshold);
        }
        else if (features > 7 && (features - 6) % 3 == 0) {
            // 3 values per keypoint beyond the 6 box fields → pose
            int nk = (numKPS > 0) ? numKPS : (features - 6) / 3;
            return postprocessPoseEndToEnd(image.size(), resizedShape,
                outputTensors, classNames,
                confThreshold, nk);
        }
        // Fallback to detection
        return postprocessEndToEnd(image.size(), resizedShape,
            outputTensors, classNames, confThreshold);
    }
    else {
        // Legacy format: [B, channels, num_boxes]
        // channels = 4(bbox) + nc(scores) + extra_features
        const int nc = static_cast<int>(classNames.size());
        const int numChannels = static_cast<int>(shape0[1]);
        const int numBoxes = static_cast<int>(shape0[2]);
        const int extra = numChannels - 4;

        // Pose check: if numKPS is explicitly set, or we can detect keypoints
        if (numKPS > 0 && numChannels >= 4 + 1 + numKPS * 3) {
            return postprocessPoseLegacy(image.size(), resizedShape,
                outputTensors, classNames,
                confThreshold, iouThreshold, numKPS);
        }
        else if (nc > 0 && nc <= extra && extra > nc && (extra - nc) % 3 == 0 && (extra - nc) >= 3) {
            // Channels beyond the class scores decompose into keypoint
            // triplets → pose with an inferred keypoint count.
            int nk = (extra - nc) / 3;
            return postprocessPoseLegacy(image.size(), resizedShape,
                outputTensors, classNames,
                confThreshold, iouThreshold, nk);
        }
        else if (nc > 0 && nc <= extra && extra == nc + 1) {
            // Exactly one channel beyond the class scores → OBB angle.
            return postprocessOBBLegacy(image.size(), resizedShape,
                outputTensors, classNames,
                confThreshold, iouThreshold);
        }
        else if (nc > 0 && nc <= extra && extra == nc) {
            // Channels are exactly 4 + nc → plain detection.
            return postprocessLegacy(image.size(), resizedShape,
                outputTensors, classNames,
                confThreshold, iouThreshold);
        }
        else {
            // Class count doesn't match tensor — probe last channel
            // to distinguish OBB (angle values in [-pi, pi]) from detection
            bool likelyOBB = false;
            if (extra >= 2) {
                const float* rawOutput = outputTensors[0].GetTensorMutableData<float>();
                // Sample up to 100 values from the last channel; if
                // most lie in [-pi, pi] they are likely angles.
                int numSamples = std::min(numBoxes, 100);
                int angleCount = 0;
                for (int s = 0; s < numSamples; ++s) {
                    float v = rawOutput[(numChannels - 1) * numBoxes + s];
                    if (v >= -3.15f && v <= 3.15f) ++angleCount;
                }
                likelyOBB = (angleCount > numSamples * 8 / 10);
            }

            if (likelyOBB) {
                return postprocessOBBLegacy(image.size(), resizedShape,
                    outputTensors, classNames,
                    confThreshold, iouThreshold);
            }
            else if (numChannels == 56) {
                // 56 = 4 + 1 + 17*3: the standard COCO pose head.
                return postprocessPoseLegacy(image.size(), resizedShape,
                    outputTensors, classNames,
                    confThreshold, iouThreshold, 17);
            }
            else {
                return postprocessLegacy(image.size(), resizedShape,
                    outputTensors, classNames,
                    confThreshold, iouThreshold);
            }
        }
    }
}
|
|
|
|
|
|
|
|
|
|
// ====================================================================
|
|
|
|
|
// DETECTION — postprocess
|
|
|
|
|
// ====================================================================
|
|
|
|
|
|
|
|
|
|
std::vector<Object> ONNXYOLO::postprocessEndToEnd(
|
|
|
|
|
const cv::Size& originalImageSize,
|
|
|
|
|
const cv::Size& resizedImageShape,
|
2026-04-08 13:45:52 +10:00
|
|
|
std::vector<Ort::Value>& outputTensors,
|
2026-03-28 16:54:11 +11:00
|
|
|
const std::vector<std::string>& classNames,
|
|
|
|
|
float confThreshold)
|
|
|
|
|
{
|
|
|
|
|
if (outputTensors.empty()) return {};
|
|
|
|
|
|
2026-04-08 13:45:52 +10:00
|
|
|
const float* rawOutput = outputTensors[0].GetTensorMutableData<float>();
|
2026-03-28 16:54:11 +11:00
|
|
|
const auto outputShape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape();
|
|
|
|
|
if (outputShape.size() < 3) return {};
|
|
|
|
|
|
|
|
|
|
const int numDets = static_cast<int>(outputShape[1]);
|
|
|
|
|
const int numFeat = static_cast<int>(outputShape[2]);
|
|
|
|
|
|
|
|
|
|
const float origW = static_cast<float>(originalImageSize.width);
|
|
|
|
|
const float origH = static_cast<float>(originalImageSize.height);
|
|
|
|
|
const float modelW = static_cast<float>(resizedImageShape.width);
|
|
|
|
|
const float modelH = static_cast<float>(resizedImageShape.height);
|
|
|
|
|
const float gain = std::min(modelH / origH, modelW / origW);
|
|
|
|
|
const float padX = std::round((modelW - origW * gain) / 2.0f - 0.1f);
|
|
|
|
|
const float padY = std::round((modelH - origH * gain) / 2.0f - 0.1f);
|
|
|
|
|
const float invGain = 1.0f / gain;
|
|
|
|
|
|
|
|
|
|
std::vector<Object> results;
|
|
|
|
|
results.reserve(numDets);
|
|
|
|
|
|
|
|
|
|
for (int i = 0; i < numDets; ++i) {
|
|
|
|
|
const float* det = rawOutput + i * numFeat;
|
|
|
|
|
const float conf = det[4];
|
|
|
|
|
if (conf <= confThreshold) continue;
|
|
|
|
|
|
|
|
|
|
float x1 = (det[0] - padX) * invGain;
|
|
|
|
|
float y1 = (det[1] - padY) * invGain;
|
|
|
|
|
float x2 = (det[2] - padX) * invGain;
|
|
|
|
|
float y2 = (det[3] - padY) * invGain;
|
|
|
|
|
int classId = static_cast<int>(det[5]);
|
|
|
|
|
|
|
|
|
|
x1 = clamp(x1, 0.f, origW); y1 = clamp(y1, 0.f, origH);
|
|
|
|
|
x2 = clamp(x2, 0.f, origW); y2 = clamp(y2, 0.f, origH);
|
|
|
|
|
float w = x2 - x1, h = y2 - y1;
|
|
|
|
|
if (w < 1.f || h < 1.f) continue;
|
|
|
|
|
|
|
|
|
|
Object obj;
|
|
|
|
|
obj.classId = classId;
|
|
|
|
|
obj.confidence = conf;
|
|
|
|
|
obj.box = cv::Rect(static_cast<int>(x1), static_cast<int>(y1),
|
|
|
|
|
static_cast<int>(w), static_cast<int>(h));
|
|
|
|
|
if (classId >= 0 && classId < static_cast<int>(classNames.size()))
|
|
|
|
|
obj.className = classNames[classId];
|
|
|
|
|
results.push_back(std::move(obj));
|
|
|
|
|
}
|
|
|
|
|
return results;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
std::vector<Object> ONNXYOLO::postprocessLegacy(
|
|
|
|
|
const cv::Size& originalImageSize,
|
|
|
|
|
const cv::Size& resizedImageShape,
|
2026-04-08 13:45:52 +10:00
|
|
|
std::vector<Ort::Value>& outputTensors,
|
2026-03-28 16:54:11 +11:00
|
|
|
const std::vector<std::string>& classNames,
|
|
|
|
|
float confThreshold, float iouThreshold, int maxDet)
|
|
|
|
|
{
|
|
|
|
|
if (outputTensors.empty()) return {};
|
|
|
|
|
|
2026-04-08 13:45:52 +10:00
|
|
|
const float* rawOutput = outputTensors[0].GetTensorMutableData<float>();
|
2026-03-28 16:54:11 +11:00
|
|
|
const auto outputShape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape();
|
|
|
|
|
if (outputShape.size() < 3) return {};
|
|
|
|
|
|
|
|
|
|
const int numChannels = static_cast<int>(outputShape[1]);
|
|
|
|
|
const int numBoxes = static_cast<int>(outputShape[2]);
|
|
|
|
|
const int numClasses = numChannels - 4;
|
|
|
|
|
if (numClasses <= 0) return {};
|
|
|
|
|
|
|
|
|
|
cv::Mat output = cv::Mat(numChannels, numBoxes, CV_32F,
|
|
|
|
|
const_cast<float*>(rawOutput)).t();
|
|
|
|
|
|
|
|
|
|
const float origW = static_cast<float>(originalImageSize.width);
|
|
|
|
|
const float origH = static_cast<float>(originalImageSize.height);
|
|
|
|
|
const float modelW = static_cast<float>(resizedImageShape.width);
|
|
|
|
|
const float modelH = static_cast<float>(resizedImageShape.height);
|
|
|
|
|
const float gain = std::min(modelH / origH, modelW / origW);
|
|
|
|
|
const float padX = std::round((modelW - origW * gain) / 2.0f - 0.1f);
|
|
|
|
|
const float padY = std::round((modelH - origH * gain) / 2.0f - 0.1f);
|
|
|
|
|
const float invGain = 1.0f / gain;
|
|
|
|
|
|
|
|
|
|
struct Candidate { float x1, y1, x2, y2, conf; int classId; };
|
|
|
|
|
std::vector<Candidate> candidates;
|
|
|
|
|
candidates.reserve(numBoxes);
|
|
|
|
|
|
|
|
|
|
for (int i = 0; i < numBoxes; ++i) {
|
|
|
|
|
const float* row = output.ptr<float>(i);
|
|
|
|
|
const float* scoresPtr = row + 4;
|
|
|
|
|
float maxScore = -FLT_MAX;
|
|
|
|
|
int bestClass = -1;
|
|
|
|
|
for (int c = 0; c < numClasses; ++c) {
|
|
|
|
|
if (scoresPtr[c] > maxScore) { maxScore = scoresPtr[c]; bestClass = c; }
|
|
|
|
|
}
|
|
|
|
|
if (maxScore <= confThreshold) continue;
|
|
|
|
|
|
|
|
|
|
float cx = row[0], cy = row[1], w = row[2], h = row[3];
|
|
|
|
|
candidates.push_back({ cx - w*0.5f, cy - h*0.5f, cx + w*0.5f, cy + h*0.5f,
|
|
|
|
|
maxScore, bestClass });
|
|
|
|
|
}
|
|
|
|
|
if (candidates.empty()) return {};
|
|
|
|
|
|
|
|
|
|
// Class-aware NMS
|
|
|
|
|
constexpr float MAX_WH = 7680.0f;
|
|
|
|
|
std::vector<int> sortedIdx(candidates.size());
|
|
|
|
|
std::iota(sortedIdx.begin(), sortedIdx.end(), 0);
|
|
|
|
|
std::sort(sortedIdx.begin(), sortedIdx.end(),
|
|
|
|
|
[&](int a, int b) { return candidates[a].conf > candidates[b].conf; });
|
|
|
|
|
if (static_cast<int>(sortedIdx.size()) > 30000) sortedIdx.resize(30000);
|
|
|
|
|
|
|
|
|
|
std::vector<bool> suppressed(sortedIdx.size(), false);
|
|
|
|
|
std::vector<int> keepIndices;
|
|
|
|
|
keepIndices.reserve(maxDet);
|
|
|
|
|
|
|
|
|
|
for (size_t i = 0; i < sortedIdx.size() && static_cast<int>(keepIndices.size()) < maxDet; ++i) {
|
|
|
|
|
if (suppressed[i]) continue;
|
|
|
|
|
keepIndices.push_back(sortedIdx[i]);
|
|
|
|
|
const auto& cur = candidates[sortedIdx[i]];
|
|
|
|
|
float cx1 = cur.x1 + cur.classId*MAX_WH, cy1 = cur.y1 + cur.classId*MAX_WH;
|
|
|
|
|
float cx2 = cur.x2 + cur.classId*MAX_WH, cy2 = cur.y2 + cur.classId*MAX_WH;
|
|
|
|
|
float curArea = (cx2-cx1)*(cy2-cy1);
|
|
|
|
|
|
|
|
|
|
for (size_t j = i+1; j < sortedIdx.size(); ++j) {
|
|
|
|
|
if (suppressed[j]) continue;
|
|
|
|
|
const auto& o = candidates[sortedIdx[j]];
|
|
|
|
|
float ox1 = o.x1+o.classId*MAX_WH, oy1 = o.y1+o.classId*MAX_WH;
|
|
|
|
|
float ox2 = o.x2+o.classId*MAX_WH, oy2 = o.y2+o.classId*MAX_WH;
|
|
|
|
|
float iw = std::min(cx2,ox2)-std::max(cx1,ox1);
|
|
|
|
|
float ih = std::min(cy2,oy2)-std::max(cy1,oy1);
|
|
|
|
|
if (iw <= 0.f || ih <= 0.f) continue;
|
|
|
|
|
float inter = iw*ih;
|
|
|
|
|
float ua = curArea + (ox2-ox1)*(oy2-oy1) - inter;
|
|
|
|
|
if (ua > 0.f && inter/ua > iouThreshold) suppressed[j] = true;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
std::vector<Object> results;
|
|
|
|
|
results.reserve(keepIndices.size());
|
|
|
|
|
for (int idx : keepIndices) {
|
|
|
|
|
const auto& c = candidates[idx];
|
|
|
|
|
float x1 = clamp((c.x1-padX)*invGain, 0.f, origW);
|
|
|
|
|
float y1 = clamp((c.y1-padY)*invGain, 0.f, origH);
|
|
|
|
|
float x2 = clamp((c.x2-padX)*invGain, 0.f, origW);
|
|
|
|
|
float y2 = clamp((c.y2-padY)*invGain, 0.f, origH);
|
|
|
|
|
float w = x2-x1, h = y2-y1;
|
|
|
|
|
if (w < 1.f || h < 1.f) continue;
|
|
|
|
|
|
|
|
|
|
Object obj;
|
|
|
|
|
obj.classId = c.classId;
|
|
|
|
|
obj.confidence = c.conf;
|
|
|
|
|
obj.box = cv::Rect(static_cast<int>(x1), static_cast<int>(y1),
|
|
|
|
|
static_cast<int>(w), static_cast<int>(h));
|
|
|
|
|
if (c.classId >= 0 && c.classId < static_cast<int>(classNames.size()))
|
|
|
|
|
obj.className = classNames[c.classId];
|
|
|
|
|
results.push_back(std::move(obj));
|
|
|
|
|
}
|
|
|
|
|
return results;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// ====================================================================
|
|
|
|
|
// OBB — helpers (Prob-IoU based NMS)
|
|
|
|
|
// ====================================================================
|
|
|
|
|
|
|
|
|
|
void ONNXYOLO::getCovarianceComponents(const OrientedBox& box,
|
|
|
|
|
float& out1, float& out2, float& out3)
|
|
|
|
|
{
|
|
|
|
|
if (box.width <= 0.f || box.height <= 0.f) {
|
|
|
|
|
out1 = out2 = out3 = 0.f;
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
const float vw = (box.width * box.width) / 12.0f;
|
|
|
|
|
const float vh = (box.height * box.height) / 12.0f;
|
|
|
|
|
const float cosT = std::cos(box.angle);
|
|
|
|
|
const float sinT = std::sin(box.angle);
|
|
|
|
|
const float cos2 = cosT * cosT;
|
|
|
|
|
const float sin2 = sinT * sinT;
|
|
|
|
|
const float sc = sinT * cosT;
|
|
|
|
|
out1 = vw * cos2 + vh * sin2;
|
|
|
|
|
out2 = vw * sin2 + vh * cos2;
|
|
|
|
|
out3 = (vw - vh) * sc;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
std::vector<std::vector<float>> ONNXYOLO::batchProbiou(
|
|
|
|
|
const std::vector<OrientedBox>& obb1,
|
|
|
|
|
const std::vector<OrientedBox>& obb2, float eps)
|
|
|
|
|
{
|
|
|
|
|
if (obb1.empty() || obb2.empty()) return {};
|
|
|
|
|
const size_t n1 = obb1.size(), n2 = obb2.size();
|
|
|
|
|
std::vector<std::vector<float>> iouMat(n1, std::vector<float>(n2, 0.f));
|
|
|
|
|
|
|
|
|
|
// Pre-compute covariance for obb1
|
|
|
|
|
struct CovData { float x, y, a, b, c; };
|
|
|
|
|
std::vector<CovData> cov1(n1);
|
|
|
|
|
for (size_t i = 0; i < n1; ++i) {
|
|
|
|
|
float a, b, c;
|
|
|
|
|
getCovarianceComponents(obb1[i], a, b, c);
|
|
|
|
|
cov1[i] = { obb1[i].x, obb1[i].y, a, b, c };
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
for (size_t i = 0; i < n1; ++i) {
|
|
|
|
|
for (size_t j = 0; j < n2; ++j) {
|
|
|
|
|
float a2, b2, c2;
|
|
|
|
|
getCovarianceComponents(obb2[j], a2, b2, c2);
|
|
|
|
|
float dx = cov1[i].x - obb2[j].x;
|
|
|
|
|
float dy = cov1[i].y - obb2[j].y;
|
|
|
|
|
float sA = cov1[i].a + a2, sB = cov1[i].b + b2, sC = cov1[i].c + c2;
|
|
|
|
|
float denom = sA * sB - sC * sC + eps;
|
|
|
|
|
if (denom <= eps) continue;
|
|
|
|
|
|
|
|
|
|
float t1 = ((sA*dy*dy + sB*dx*dx) * 0.25f) / denom;
|
|
|
|
|
float t2 = ((sC*dx*dy) * -0.5f) / denom;
|
|
|
|
|
float d1 = cov1[i].a*cov1[i].b - cov1[i].c*cov1[i].c;
|
|
|
|
|
float d2 = a2*b2 - c2*c2;
|
|
|
|
|
float sqrtDet = std::sqrt(std::max(d1, 0.f) * std::max(d2, 0.f) + eps);
|
|
|
|
|
float t3 = 0.5f * std::log((sA*sB - sC*sC) / (4.f*sqrtDet) + eps);
|
|
|
|
|
float bd = std::clamp(t1 + t2 + t3, eps, 100.f);
|
|
|
|
|
float hd = std::sqrt(1.f - std::exp(-bd) + eps);
|
|
|
|
|
iouMat[i][j] = 1.f - hd;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
return iouMat;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
std::vector<int> ONNXYOLO::nmsRotatedImpl(
|
|
|
|
|
const std::vector<OrientedBox>& sortedBoxes, float iouThreshold)
|
|
|
|
|
{
|
|
|
|
|
if (sortedBoxes.empty()) return {};
|
|
|
|
|
if (sortedBoxes.size() == 1) return { 0 };
|
|
|
|
|
|
|
|
|
|
auto iouMat = batchProbiou(sortedBoxes, sortedBoxes);
|
|
|
|
|
if (iouMat.empty()) return {};
|
|
|
|
|
const int n = static_cast<int>(sortedBoxes.size());
|
|
|
|
|
|
|
|
|
|
std::vector<int> keep;
|
|
|
|
|
keep.reserve(n / 2);
|
|
|
|
|
for (int j = 0; j < n; ++j) {
|
|
|
|
|
bool shouldKeep = true;
|
|
|
|
|
for (int i = 0; i < j; ++i) {
|
|
|
|
|
if (iouMat[i][j] >= iouThreshold) { shouldKeep = false; break; }
|
|
|
|
|
}
|
|
|
|
|
if (shouldKeep) keep.push_back(j);
|
|
|
|
|
}
|
|
|
|
|
return keep;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
std::vector<int> ONNXYOLO::nmsRotated(
|
|
|
|
|
const std::vector<OrientedBox>& boxes,
|
|
|
|
|
const std::vector<float>& scores, float iouThreshold)
|
|
|
|
|
{
|
|
|
|
|
if (boxes.empty() || scores.empty() || boxes.size() != scores.size()) return {};
|
|
|
|
|
|
|
|
|
|
std::vector<int> sortedIdx(boxes.size());
|
|
|
|
|
std::iota(sortedIdx.begin(), sortedIdx.end(), 0);
|
|
|
|
|
std::sort(sortedIdx.begin(), sortedIdx.end(),
|
|
|
|
|
[&](int a, int b) { return scores[a] > scores[b]; });
|
|
|
|
|
|
|
|
|
|
std::vector<OrientedBox> sortedBoxes;
|
|
|
|
|
sortedBoxes.reserve(boxes.size());
|
|
|
|
|
for (int i : sortedIdx) sortedBoxes.push_back(boxes[i]);
|
|
|
|
|
|
|
|
|
|
auto keepSorted = nmsRotatedImpl(sortedBoxes, iouThreshold);
|
|
|
|
|
std::vector<int> keepOrig;
|
|
|
|
|
keepOrig.reserve(keepSorted.size());
|
|
|
|
|
for (int si : keepSorted) keepOrig.push_back(sortedIdx[si]);
|
|
|
|
|
return keepOrig;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
std::vector<cv::Point2f> ONNXYOLO::OBBToPoints(const OrientedBox& obb)
|
|
|
|
|
{
|
|
|
|
|
float angleDeg = obb.angle * 180.0f / static_cast<float>(CV_PI);
|
|
|
|
|
cv::RotatedRect rr(cv::Point2f(obb.x, obb.y),
|
|
|
|
|
cv::Size2f(obb.width, obb.height), angleDeg);
|
|
|
|
|
std::vector<cv::Point2f> corners(4);
|
|
|
|
|
rr.points(corners.data());
|
|
|
|
|
return corners;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// ====================================================================
|
|
|
|
|
// OBB — postprocess
|
|
|
|
|
// ====================================================================
|
|
|
|
|
|
|
|
|
|
std::vector<Object> ONNXYOLO::postprocessOBBEndToEnd(
|
|
|
|
|
const cv::Size& originalImageSize,
|
|
|
|
|
const cv::Size& resizedImageShape,
|
2026-04-08 13:45:52 +10:00
|
|
|
std::vector<Ort::Value>& outputTensors,
|
2026-03-28 16:54:11 +11:00
|
|
|
const std::vector<std::string>& classNames,
|
|
|
|
|
float confThreshold)
|
|
|
|
|
{
|
|
|
|
|
if (outputTensors.empty()) return {};
|
2026-04-08 13:45:52 +10:00
|
|
|
const float* raw = outputTensors[0].GetTensorMutableData<float>();
|
2026-03-28 16:54:11 +11:00
|
|
|
const auto shape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape();
|
|
|
|
|
if (shape.size() < 3) return {};
|
|
|
|
|
|
|
|
|
|
const int numDets = static_cast<int>(shape[1]);
|
|
|
|
|
const int numFeat = static_cast<int>(shape[2]); // 7: cx,cy,w,h,angle,conf,classId
|
|
|
|
|
|
|
|
|
|
const float origW = static_cast<float>(originalImageSize.width);
|
|
|
|
|
const float origH = static_cast<float>(originalImageSize.height);
|
|
|
|
|
const float modelW = static_cast<float>(resizedImageShape.width);
|
|
|
|
|
const float modelH = static_cast<float>(resizedImageShape.height);
|
|
|
|
|
const float gain = std::min(modelH / origH, modelW / origW);
|
|
|
|
|
const float padX = std::round((modelW - origW * gain) / 2.0f - 0.1f);
|
|
|
|
|
const float padY = std::round((modelH - origH * gain) / 2.0f - 0.1f);
|
|
|
|
|
const float invGain = 1.0f / gain;
|
|
|
|
|
|
|
|
|
|
std::vector<Object> results;
|
|
|
|
|
results.reserve(numDets);
|
|
|
|
|
|
|
|
|
|
for (int i = 0; i < numDets; ++i) {
|
|
|
|
|
const float* det = raw + i * numFeat;
|
|
|
|
|
float angle = det[4];
|
|
|
|
|
float conf = det[5];
|
|
|
|
|
if (conf <= confThreshold) continue;
|
|
|
|
|
|
|
|
|
|
float cx = (det[0] - padX) * invGain;
|
|
|
|
|
float cy = (det[1] - padY) * invGain;
|
|
|
|
|
float bw = det[2] * invGain;
|
|
|
|
|
float bh = det[3] * invGain;
|
|
|
|
|
int classId = static_cast<int>(det[6]);
|
|
|
|
|
|
|
|
|
|
cx = clamp(cx, 0.f, origW);
|
|
|
|
|
cy = clamp(cy, 0.f, origH);
|
|
|
|
|
|
|
|
|
|
OrientedBox obb{ cx, cy, bw, bh, angle };
|
|
|
|
|
|
|
|
|
|
Object obj;
|
|
|
|
|
obj.classId = classId;
|
|
|
|
|
obj.confidence = conf;
|
|
|
|
|
obj.kps = { cx, cy, bw, bh, angle };
|
|
|
|
|
auto absCorners = OBBToPoints(obb);
|
|
|
|
|
obj.box = cv::boundingRect(absCorners);
|
|
|
|
|
// Normalize OBB corners to [0,1] and close the polygon
|
|
|
|
|
obj.polygon.reserve(absCorners.size() + 1);
|
|
|
|
|
for (const auto& pt : absCorners) {
|
|
|
|
|
obj.polygon.emplace_back(
|
|
|
|
|
std::clamp(pt.x / origW, 0.f, 1.f),
|
|
|
|
|
std::clamp(pt.y / origH, 0.f, 1.f));
|
|
|
|
|
}
|
|
|
|
|
if (!obj.polygon.empty()) obj.polygon.push_back(obj.polygon.front());
|
|
|
|
|
if (classId >= 0 && classId < static_cast<int>(classNames.size()))
|
|
|
|
|
obj.className = classNames[classId];
|
|
|
|
|
results.push_back(std::move(obj));
|
|
|
|
|
}
|
|
|
|
|
return results;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// --------------------------------------------------------------------
// postprocessOBBLegacy — decode oriented-bounding-box output from a
// model WITHOUT in-graph NMS.
//
// Expected outputTensors[0] layout: [1, numChannels, numBoxes] with
// per-box channels [cx, cy, w, h, class scores (nc), angle], i.e.
// numChannels = 4 + nc + 1. Candidates above confThreshold are decoded
// back to original-image coordinates, then filtered with rotated
// (Prob-IoU) NMS via nmsRotated(), keeping at most maxDet results.
// --------------------------------------------------------------------
std::vector<Object> ONNXYOLO::postprocessOBBLegacy(
    const cv::Size& originalImageSize,
    const cv::Size& resizedImageShape,
    std::vector<Ort::Value>& outputTensors,
    const std::vector<std::string>& classNames,
    float confThreshold, float iouThreshold, int maxDet)
{
    if (outputTensors.empty()) return {};

    const float* rawOutput = outputTensors[0].GetTensorMutableData<float>();
    const auto outputShape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape();
    if (outputShape.size() < 3) return {};

    const int numChannels = static_cast<int>(outputShape[1]);
    const int numBoxes = static_cast<int>(outputShape[2]);
    const int numClasses = numChannels - 5; // 4 box + nc scores + 1 angle

    if (numClasses <= 0) return {};

    // Transpose [channels, boxes] -> [boxes, channels] so each row is
    // one candidate. .t() materializes a copy, so rawOutput's lifetime
    // does not constrain `output`.
    cv::Mat output = cv::Mat(numChannels, numBoxes, CV_32F,
        const_cast<float*>(rawOutput)).t();

    // Inverse LetterBox transform (rounding must mirror letterBox()).
    const float origW = static_cast<float>(originalImageSize.width);
    const float origH = static_cast<float>(originalImageSize.height);
    const float modelW = static_cast<float>(resizedImageShape.width);
    const float modelH = static_cast<float>(resizedImageShape.height);
    const float gain = std::min(modelH / origH, modelW / origW);
    const float padX = std::round((modelW - origW * gain) / 2.0f - 0.1f);
    const float padY = std::round((modelH - origH * gain) / 2.0f - 0.1f);
    const float invGain = 1.0f / gain;

    // One pre-NMS candidate: oriented box + best-class score/id.
    struct OBBCandidate {
        OrientedBox box;
        float conf;
        int classId;
    };
    std::vector<OBBCandidate> candidates;
    candidates.reserve(numBoxes);

    for (int i = 0; i < numBoxes; ++i) {
        const float* row = output.ptr<float>(i);
        const float* scoresPtr = row + 4;
        // Argmax over class scores.
        float maxScore = -FLT_MAX;
        int bestClass = -1;
        for (int c = 0; c < numClasses; ++c) {
            if (scoresPtr[c] > maxScore) { maxScore = scoresPtr[c]; bestClass = c; }
        }
        if (maxScore <= confThreshold) continue;

        float angle = row[4 + numClasses]; // angle after class scores
        // Map center back to original-image space; w/h only rescale.
        float cx = (row[0] - padX) * invGain;
        float cy = (row[1] - padY) * invGain;
        float bw = row[2] * invGain;
        float bh = row[3] * invGain;
        cx = clamp(cx, 0.f, origW);
        cy = clamp(cy, 0.f, origH);

        candidates.push_back({ { cx, cy, bw, bh, angle }, maxScore, bestClass });
    }
    if (candidates.empty()) return {};

    // Prob-IoU NMS for oriented boxes
    std::vector<OrientedBox> boxes;
    std::vector<float> scores;
    boxes.reserve(candidates.size());
    scores.reserve(candidates.size());
    for (const auto& c : candidates) { boxes.push_back(c.box); scores.push_back(c.conf); }

    auto keepIdx = nmsRotated(boxes, scores, iouThreshold);

    std::vector<Object> results;
    results.reserve(std::min(static_cast<int>(keepIdx.size()), maxDet));
    for (int idx : keepIdx) {
        if (static_cast<int>(results.size()) >= maxDet) break;
        const auto& c = candidates[idx];
        Object obj;
        obj.classId = c.classId;
        obj.confidence = c.conf;
        // Raw OBB parameters preserved for downstream consumers.
        obj.kps = { c.box.x, c.box.y, c.box.width, c.box.height, c.box.angle };
        auto absCorners = OBBToPoints(c.box);
        obj.box = cv::boundingRect(absCorners);
        // Normalize OBB corners to [0,1] and close the polygon
        const float origW = static_cast<float>(originalImageSize.width);
        const float origH = static_cast<float>(originalImageSize.height);
        obj.polygon.reserve(absCorners.size() + 1);
        for (const auto& pt : absCorners) {
            obj.polygon.emplace_back(
                std::clamp(pt.x / origW, 0.f, 1.f),
                std::clamp(pt.y / origH, 0.f, 1.f));
        }
        if (!obj.polygon.empty()) obj.polygon.push_back(obj.polygon.front());
        if (c.classId >= 0 && c.classId < static_cast<int>(classNames.size()))
            obj.className = classNames[c.classId];
        results.push_back(std::move(obj));
    }
    return results;
}
|
|
|
|
|
|
|
|
|
|
// ====================================================================
|
|
|
|
|
// SEGMENTATION — postprocess
|
|
|
|
|
// ====================================================================
|
|
|
|
|
|
|
|
|
|
// --------------------------------------------------------------------
// postprocessSegEndToEnd — decode instance-segmentation output from a
// model whose NMS is embedded in the graph.
//
// outputTensors[0]: detections [1, numDets, 6 + nm], each row
//   [x1, y1, x2, y2, confidence, classId, nm mask coefficients]
// outputTensors[1]: mask prototypes [1, nm, protoH, protoW].
//
// Masks are produced as: coefficients @ prototypes -> sigmoid ->
// crop to the detection box in prototype space -> resize to the box ->
// threshold at 0.5. Polygons are derived from the binary mask, or from
// the box as a fallback when mask extraction yields nothing.
// --------------------------------------------------------------------
std::vector<Object> ONNXYOLO::postprocessSegEndToEnd(
    const cv::Size& originalImageSize,
    const cv::Size& resizedImageShape,
    std::vector<Ort::Value>& outputTensors,
    const std::vector<std::string>& classNames,
    float confThreshold)
{
    if (outputTensors.size() < 2) return {};

    const float* raw = outputTensors[0].GetTensorMutableData<float>();
    const auto shape0 = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape();
    const auto protoShape = outputTensors[1].GetTensorTypeAndShapeInfo().GetShape();
    if (shape0.size() < 3 || protoShape.size() < 4) return {};

    const int numDets = static_cast<int>(shape0[1]);
    const int numFeat = static_cast<int>(shape0[2]); // 6 + nm
    const int nm = static_cast<int>(protoShape[1]);      // number of mask prototypes
    const int protoH = static_cast<int>(protoShape[2]);
    const int protoW = static_cast<int>(protoShape[3]);

    // Row must hold box+conf+class (6) plus nm coefficients — guards
    // the det[6 + k] reads below.
    if (numFeat < 6 + nm) return {};

    // Inverse LetterBox transform (rounding must mirror letterBox()).
    const float origW = static_cast<float>(originalImageSize.width);
    const float origH = static_cast<float>(originalImageSize.height);
    const float modelW = static_cast<float>(resizedImageShape.width);
    const float modelH = static_cast<float>(resizedImageShape.height);
    const float gain = std::min(modelH / origH, modelW / origW);
    const float padX = std::round((modelW - origW * gain) / 2.0f - 0.1f);
    const float padY = std::round((modelH - origH * gain) / 2.0f - 0.1f);
    const float invGain = 1.0f / gain;

    // Collect detections and mask coefficients
    std::vector<Object> objs;
    cv::Mat maskCoeffs; // [N, nm]

    for (int i = 0; i < numDets; ++i) {
        const float* det = raw + i * numFeat;
        float conf = det[4];
        if (conf <= confThreshold) continue;

        int classId = static_cast<int>(det[5]);
        // Corner box is already x1,y1,x2,y2 in model space; unpad/unscale
        // and clamp to the original image.
        float x1 = clamp((det[0] - padX) * invGain, 0.f, origW);
        float y1 = clamp((det[1] - padY) * invGain, 0.f, origH);
        float x2 = clamp((det[2] - padX) * invGain, 0.f, origW);
        float y2 = clamp((det[3] - padY) * invGain, 0.f, origH);
        float w = x2-x1, h = y2-y1;
        // Discard degenerate sub-pixel boxes.
        if (w < 1.f || h < 1.f) continue;

        Object obj;
        obj.classId = classId;
        obj.confidence = conf;
        obj.box = cv::Rect(static_cast<int>(x1), static_cast<int>(y1),
            static_cast<int>(w), static_cast<int>(h));
        if (classId >= 0 && classId < static_cast<int>(classNames.size()))
            obj.className = classNames[classId];
        objs.push_back(std::move(obj));

        // Extract mask coefficients (after the 6 detection values)
        cv::Mat mc(1, nm, CV_32F);
        std::memcpy(mc.ptr<float>(), det + 6, nm * sizeof(float));
        maskCoeffs.push_back(mc);
    }

    // Generate masks: coeffs @ protos → sigmoid → crop-in-proto → resize-to-box → threshold
    if (!objs.empty() && !maskCoeffs.empty()) {
        const float* protoData = outputTensors[1].GetTensorMutableData<float>();
        // View the prototypes as [nm, protoH*protoW] so a single GEMM
        // combines them: result is [protoH*protoW, N] after transpose.
        cv::Mat protos(nm, protoH * protoW, CV_32F, const_cast<float*>(protoData));
        cv::Mat matmulRes = (maskCoeffs * protos).t();

        // Apply sigmoid while still a single-channel 2D matrix
        cv::Mat negMat;
        cv::exp(-matmulRes, negMat);
        cv::Mat sigmoidFlat = 1.0 / (1.0 + negMat);

        // Reinterpret as an N-channel [protoH x protoW] image, then
        // split into one channel per detection.
        cv::Mat sigmoidMat = sigmoidFlat.reshape(static_cast<int>(objs.size()),
            { protoH, protoW });
        std::vector<cv::Mat> maskChannels;
        cv::split(sigmoidMat, maskChannels);

        // ROI in proto space, accounting for letterbox padding
        cv::Rect roi;
        if (origH > origW) {
            // Portrait image: letterbox pads left/right.
            int roiW = std::min(static_cast<int>(std::round(
                static_cast<float>(protoW) * origW / origH)), protoW);
            roi = cv::Rect((protoW - roiW) / 2, 0, roiW, protoH);
        }
        else {
            // Landscape (or square): letterbox pads top/bottom.
            int roiH = std::min(static_cast<int>(std::round(
                static_cast<float>(protoH) * origH / origW)), protoH);
            roi = cv::Rect(0, (protoH - roiH) / 2, protoW, roiH);
        }
        roi &= cv::Rect(0, 0, protoW, protoH);

        int imgW = static_cast<int>(origW);
        int imgH = static_cast<int>(origH);

        // Original-image pixels per prototype-space pixel.
        const float scaleX = static_cast<float>(imgW) / roi.width;
        const float scaleY = static_cast<float>(imgH) / roi.height;

        for (size_t i = 0; i < objs.size(); ++i) {
            cv::Rect safebox = objs[i].box & cv::Rect(0, 0, imgW, imgH);
            if (safebox.area() <= 0) continue;

            // Detection box mapped into prototype space (floor/ceil so
            // the crop fully covers the box).
            int px0 = std::max(static_cast<int>(std::floor(safebox.x / scaleX)), 0);
            int py0 = std::max(static_cast<int>(std::floor(safebox.y / scaleY)), 0);
            int px1 = std::min(static_cast<int>(std::ceil((safebox.x + safebox.width) / scaleX)), roi.width);
            int py1 = std::min(static_cast<int>(std::ceil((safebox.y + safebox.height) / scaleY)), roi.height);
            if (px1 <= px0 || py1 <= py0) continue;

            cv::Rect protoBox(roi.x + px0, roi.y + py0, px1 - px0, py1 - py0);
            protoBox &= cv::Rect(0, 0, protoW, protoH);
            if (protoBox.area() <= 0) continue;

            cv::Mat cropped = maskChannels[i](protoBox);
            cv::Mat resized;
            cv::resize(cropped, resized, cv::Size(safebox.width, safebox.height),
                0, 0, cv::INTER_LINEAR);
            // Threshold sigmoid probabilities into a binary mask.
            objs[i].mask = resized > 0.5f;
            objs[i].polygon = ANSUtilityHelper::MaskToNormalizedPolygon(
                objs[i].mask, safebox, origW, origH);
        }
    }
    // Fallback: detections whose mask produced no polygon still get a
    // box-shaped normalized polygon.
    for (auto& obj : objs) {
        if (obj.polygon.empty())
            obj.polygon = ANSUtilityHelper::RectToNormalizedPolygon(obj.box, origW, origH);
    }
    return objs;
}
|
|
|
|
|
|
|
|
|
|
// --------------------------------------------------------------------
// postprocessSegLegacy — decode instance-segmentation output from a
// model WITHOUT in-graph NMS.
//
// outputTensors[0]: [1, numChannels, numBoxes] with per-box channels
//   [cx, cy, w, h, class scores (nc), nm mask coefficients],
//   i.e. numChannels = 4 + nc + nm.
// outputTensors[1]: mask prototypes [1, nm, protoH, protoW].
//
// Candidates above confThreshold are decoded, filtered with class-aware
// NMS (cv::dnn::NMSBoxesBatched), capped at maxDet, then masks are
// built the same way as in postprocessSegEndToEnd.
// --------------------------------------------------------------------
std::vector<Object> ONNXYOLO::postprocessSegLegacy(
    const cv::Size& originalImageSize,
    const cv::Size& resizedImageShape,
    std::vector<Ort::Value>& outputTensors,
    const std::vector<std::string>& classNames,
    float confThreshold, float iouThreshold, int maxDet)
{
    if (outputTensors.size() < 2) return {};

    const float* rawOutput = outputTensors[0].GetTensorMutableData<float>();
    const auto shape0 = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape();
    const auto protoShape = outputTensors[1].GetTensorTypeAndShapeInfo().GetShape();
    if (shape0.size() < 3 || protoShape.size() < 4) return {};

    const int numChannels = static_cast<int>(shape0[1]);
    const int numBoxes = static_cast<int>(shape0[2]);
    const int nm = static_cast<int>(protoShape[1]);      // number of mask prototypes
    const int protoH = static_cast<int>(protoShape[2]);
    const int protoW = static_cast<int>(protoShape[3]);
    // Class count is derived from the tensor, not from classNames.
    const int numClasses = numChannels - 4 - nm;
    if (numClasses <= 0) return {};

    // Transpose [channels, boxes] -> [boxes, channels]; .t() copies.
    cv::Mat output = cv::Mat(numChannels, numBoxes, CV_32F,
        const_cast<float*>(rawOutput)).t();

    // Inverse LetterBox transform (rounding must mirror letterBox()).
    const float origW = static_cast<float>(originalImageSize.width);
    const float origH = static_cast<float>(originalImageSize.height);
    const float modelW = static_cast<float>(resizedImageShape.width);
    const float modelH = static_cast<float>(resizedImageShape.height);
    const float gain = std::min(modelH / origH, modelW / origW);
    const float padX = std::round((modelW - origW * gain) / 2.0f - 0.1f);
    const float padY = std::round((modelH - origH * gain) / 2.0f - 0.1f);
    const float invGain = 1.0f / gain;

    // Parallel arrays of pre-NMS candidates.
    std::vector<cv::Rect> bboxes;
    std::vector<float> scores;
    std::vector<int> labels;
    std::vector<cv::Mat> maskCoeffs;

    for (int i = 0; i < numBoxes; ++i) {
        const float* row = output.ptr<float>(i);
        const float* scoresPtr = row + 4;
        // Argmax over class scores.
        float maxScore = -FLT_MAX;
        int bestClass = -1;
        for (int c = 0; c < numClasses; ++c) {
            if (scoresPtr[c] > maxScore) { maxScore = scoresPtr[c]; bestClass = c; }
        }
        if (maxScore <= confThreshold) continue;

        // Center box -> corner box, unpad/unscale, clamp to image.
        float cx = row[0], cy = row[1], w = row[2], h = row[3];
        float x0 = clamp((cx - w*0.5f - padX) * invGain, 0.f, origW);
        float y0 = clamp((cy - h*0.5f - padY) * invGain, 0.f, origH);
        float x1 = clamp((cx + w*0.5f - padX) * invGain, 0.f, origW);
        float y1 = clamp((cy + h*0.5f - padY) * invGain, 0.f, origH);

        bboxes.push_back(cv::Rect(static_cast<int>(x0), static_cast<int>(y0),
            static_cast<int>(x1-x0), static_cast<int>(y1-y0)));
        scores.push_back(maxScore);
        labels.push_back(bestClass);

        // Mask coefficients follow the class scores in the row.
        cv::Mat mc(1, nm, CV_32F);
        std::memcpy(mc.ptr<float>(), row + 4 + numClasses, nm * sizeof(float));
        maskCoeffs.push_back(mc);
    }

    // NMS
    std::vector<int> indices;
    cv::dnn::NMSBoxesBatched(bboxes, scores, labels, confThreshold,
        iouThreshold, indices);

    // Collect surviving detections and their mask coefficients
    std::vector<Object> objs;
    cv::Mat masks;
    for (int idx : indices) {
        if (static_cast<int>(objs.size()) >= maxDet) break;
        Object obj;
        obj.classId = labels[idx];
        obj.confidence = scores[idx];
        obj.box = bboxes[idx];
        if (obj.classId >= 0 && obj.classId < static_cast<int>(classNames.size()))
            obj.className = classNames[obj.classId];
        objs.push_back(std::move(obj));
        masks.push_back(maskCoeffs[idx]);
    }

    // Generate masks
    if (!objs.empty() && !masks.empty()) {
        const float* protoData = outputTensors[1].GetTensorMutableData<float>();
        // Prototypes as [nm, protoH*protoW]; one GEMM combines them into
        // [protoH*protoW, N] after transpose.
        cv::Mat protos(nm, protoH * protoW, CV_32F, const_cast<float*>(protoData));
        cv::Mat matmulRes = (masks * protos).t();

        // Apply sigmoid while still a single-channel 2D matrix
        cv::Mat negMat;
        cv::exp(-matmulRes, negMat);
        cv::Mat sigmoidFlat = 1.0 / (1.0 + negMat);

        // Reinterpret as an N-channel [protoH x protoW] image, one
        // channel per surviving detection.
        cv::Mat sigmoidMat = sigmoidFlat.reshape(static_cast<int>(objs.size()),
            { protoH, protoW });
        std::vector<cv::Mat> maskChannels;
        cv::split(sigmoidMat, maskChannels);

        // ROI in proto space, accounting for letterbox padding
        cv::Rect roi;
        if (origH > origW) {
            // Portrait image: letterbox pads left/right.
            int roiW = std::min(static_cast<int>(std::round(
                static_cast<float>(protoW) * origW / origH)), protoW);
            roi = cv::Rect((protoW - roiW) / 2, 0, roiW, protoH);
        }
        else {
            // Landscape (or square): letterbox pads top/bottom.
            int roiH = std::min(static_cast<int>(std::round(
                static_cast<float>(protoH) * origH / origW)), protoH);
            roi = cv::Rect(0, (protoH - roiH) / 2, protoW, roiH);
        }
        roi &= cv::Rect(0, 0, protoW, protoH);

        int imgW = static_cast<int>(origW);
        int imgH = static_cast<int>(origH);

        // Original-image pixels per prototype-space pixel.
        const float scaleX = static_cast<float>(imgW) / roi.width;
        const float scaleY = static_cast<float>(imgH) / roi.height;

        for (size_t i = 0; i < objs.size(); ++i) {
            cv::Rect safebox = objs[i].box & cv::Rect(0, 0, imgW, imgH);
            if (safebox.area() <= 0) continue;

            // Detection box mapped into prototype space (floor/ceil so
            // the crop fully covers the box).
            int px0 = std::max(static_cast<int>(std::floor(safebox.x / scaleX)), 0);
            int py0 = std::max(static_cast<int>(std::floor(safebox.y / scaleY)), 0);
            int px1 = std::min(static_cast<int>(std::ceil((safebox.x + safebox.width) / scaleX)), roi.width);
            int py1 = std::min(static_cast<int>(std::ceil((safebox.y + safebox.height) / scaleY)), roi.height);
            if (px1 <= px0 || py1 <= py0) continue;

            cv::Rect protoBox(roi.x + px0, roi.y + py0, px1 - px0, py1 - py0);
            protoBox &= cv::Rect(0, 0, protoW, protoH);
            if (protoBox.area() <= 0) continue;

            cv::Mat cropped = maskChannels[i](protoBox);
            cv::Mat resized;
            cv::resize(cropped, resized, cv::Size(safebox.width, safebox.height),
                0, 0, cv::INTER_LINEAR);
            // Threshold sigmoid probabilities into a binary mask.
            objs[i].mask = resized > 0.5f;
            objs[i].polygon = ANSUtilityHelper::MaskToNormalizedPolygon(
                objs[i].mask, safebox, origW, origH);
        }
    }
    // Fallback: detections whose mask produced no polygon still get a
    // box-shaped normalized polygon.
    for (auto& obj : objs) {
        if (obj.polygon.empty())
            obj.polygon = ANSUtilityHelper::RectToNormalizedPolygon(obj.box, origW, origH);
    }
    return objs;
}
|
|
|
|
|
|
|
|
|
|
// ====================================================================
|
|
|
|
|
// POSE — postprocess
|
|
|
|
|
// ====================================================================
|
|
|
|
|
|
|
|
|
|
std::vector<Object> ONNXYOLO::postprocessPoseEndToEnd(
|
|
|
|
|
const cv::Size& originalImageSize,
|
|
|
|
|
const cv::Size& resizedImageShape,
|
2026-04-08 13:45:52 +10:00
|
|
|
std::vector<Ort::Value>& outputTensors,
|
2026-03-28 16:54:11 +11:00
|
|
|
const std::vector<std::string>& classNames,
|
|
|
|
|
float confThreshold, int numKPS)
|
|
|
|
|
{
|
|
|
|
|
if (outputTensors.empty()) return {};
|
2026-04-08 13:45:52 +10:00
|
|
|
const float* raw = outputTensors[0].GetTensorMutableData<float>();
|
2026-03-28 16:54:11 +11:00
|
|
|
const auto shape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape();
|
|
|
|
|
if (shape.size() < 3) return {};
|
|
|
|
|
|
|
|
|
|
const int numDets = static_cast<int>(shape[1]);
|
|
|
|
|
const int numFeat = static_cast<int>(shape[2]); // 6 + nk*3
|
|
|
|
|
|
|
|
|
|
const float origW = static_cast<float>(originalImageSize.width);
|
|
|
|
|
const float origH = static_cast<float>(originalImageSize.height);
|
|
|
|
|
const float modelW = static_cast<float>(resizedImageShape.width);
|
|
|
|
|
const float modelH = static_cast<float>(resizedImageShape.height);
|
|
|
|
|
const float gain = std::min(modelH / origH, modelW / origW);
|
|
|
|
|
const float padX = std::round((modelW - origW * gain) / 2.0f - 0.1f);
|
|
|
|
|
const float padY = std::round((modelH - origH * gain) / 2.0f - 0.1f);
|
|
|
|
|
const float invGain = 1.0f / gain;
|
|
|
|
|
|
|
|
|
|
std::vector<Object> results;
|
|
|
|
|
results.reserve(numDets);
|
|
|
|
|
|
|
|
|
|
for (int i = 0; i < numDets; ++i) {
|
|
|
|
|
const float* det = raw + i * numFeat;
|
|
|
|
|
float conf = det[4];
|
|
|
|
|
if (conf <= confThreshold) continue;
|
|
|
|
|
|
|
|
|
|
int classId = static_cast<int>(det[5]);
|
|
|
|
|
float x1 = clamp((det[0] - padX) * invGain, 0.f, origW);
|
|
|
|
|
float y1 = clamp((det[1] - padY) * invGain, 0.f, origH);
|
|
|
|
|
float x2 = clamp((det[2] - padX) * invGain, 0.f, origW);
|
|
|
|
|
float y2 = clamp((det[3] - padY) * invGain, 0.f, origH);
|
|
|
|
|
float w = x2-x1, h = y2-y1;
|
|
|
|
|
if (w < 1.f || h < 1.f) continue;
|
|
|
|
|
|
|
|
|
|
// Extract keypoints (after the 6 detection values)
|
|
|
|
|
std::vector<float> kps;
|
|
|
|
|
kps.reserve(numKPS * 3);
|
|
|
|
|
const float* kpsPtr = det + 6;
|
|
|
|
|
for (int k = 0; k < numKPS; ++k) {
|
|
|
|
|
float kx = clamp((kpsPtr[3*k] - padX) * invGain, 0.f, origW);
|
|
|
|
|
float ky = clamp((kpsPtr[3*k+1] - padY) * invGain, 0.f, origH);
|
|
|
|
|
float ks = kpsPtr[3*k+2];
|
|
|
|
|
kps.push_back(kx);
|
|
|
|
|
kps.push_back(ky);
|
|
|
|
|
kps.push_back(ks);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
Object obj;
|
|
|
|
|
obj.classId = classId;
|
|
|
|
|
obj.confidence = conf;
|
|
|
|
|
obj.box = cv::Rect(static_cast<int>(x1), static_cast<int>(y1),
|
|
|
|
|
static_cast<int>(w), static_cast<int>(h));
|
|
|
|
|
obj.kps = std::move(kps);
|
|
|
|
|
if (classId >= 0 && classId < static_cast<int>(classNames.size()))
|
|
|
|
|
obj.className = classNames[classId];
|
|
|
|
|
results.push_back(std::move(obj));
|
|
|
|
|
}
|
|
|
|
|
return results;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// --------------------------------------------------------------------
// postprocessPoseLegacy — decode pose/keypoint output from a model
// WITHOUT in-graph NMS.
//
// Expected outputTensors[0] layout: [1, numChannels, numBoxes] with
// per-box channels [cx, cy, w, h, class scores (nc), numKPS*(x,y,s)],
// i.e. numChannels = 4 + nc + numKPS*3. Candidates above confThreshold
// are decoded to original-image coordinates, filtered with class-aware
// NMS (cv::dnn::NMSBoxesBatched), and capped at maxDet.
// --------------------------------------------------------------------
std::vector<Object> ONNXYOLO::postprocessPoseLegacy(
    const cv::Size& originalImageSize,
    const cv::Size& resizedImageShape,
    std::vector<Ort::Value>& outputTensors,
    const std::vector<std::string>& classNames,
    float confThreshold, float iouThreshold, int numKPS, int maxDet)
{
    if (outputTensors.empty()) return {};
    const float* rawOutput = outputTensors[0].GetTensorMutableData<float>();
    const auto outputShape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape();
    if (outputShape.size() < 3) return {};

    const int numChannels = static_cast<int>(outputShape[1]);
    const int numBoxes = static_cast<int>(outputShape[2]);
    // Pose layout: [cx,cy,w,h, scores(nc), kp0_x,kp0_y,kp0_s, ..., kpN_x,kpN_y,kpN_s]
    // Derive actual nc from tensor shape: nc = numChannels - 4 - numKPS*3
    // This avoids mismatch when classNames has more entries than the model's actual classes
    const int nc = std::max(numChannels - 4 - numKPS * 3, 1);
    const int kpsOffset = 4 + nc;

    // Safety: verify we won't read past the row
    if (kpsOffset + numKPS * 3 > numChannels) return {};

    // Transpose [channels, boxes] -> [boxes, channels]; .t() copies.
    cv::Mat output = cv::Mat(numChannels, numBoxes, CV_32F,
        const_cast<float*>(rawOutput)).t();

    // Inverse LetterBox transform (rounding must mirror letterBox()).
    const float origW = static_cast<float>(originalImageSize.width);
    const float origH = static_cast<float>(originalImageSize.height);
    const float modelW = static_cast<float>(resizedImageShape.width);
    const float modelH = static_cast<float>(resizedImageShape.height);
    const float gain = std::min(modelH / origH, modelW / origW);
    const float padX = std::round((modelW - origW * gain) / 2.0f - 0.1f);
    const float padY = std::round((modelH - origH * gain) / 2.0f - 0.1f);
    const float invGain = 1.0f / gain;

    // Parallel arrays of pre-NMS candidates.
    std::vector<cv::Rect> bboxes;
    std::vector<float> scores;
    std::vector<int> labels;
    std::vector<std::vector<float>> allKps;

    for (int i = 0; i < numBoxes; ++i) {
        const float* row = output.ptr<float>(i);
        const float* scoresPtr = row + 4;

        // Find best class
        float maxScore = -FLT_MAX;
        int bestClass = 0;
        int numScores = std::max(nc, 1);
        for (int c = 0; c < numScores; ++c) {
            if (scoresPtr[c] > maxScore) { maxScore = scoresPtr[c]; bestClass = c; }
        }
        if (maxScore <= confThreshold) continue;

        // Center box -> corner box, unpad/unscale, clamp to image.
        float cx = row[0], cy = row[1], w = row[2], h = row[3];
        float x0 = clamp((cx - w*0.5f - padX) * invGain, 0.f, origW);
        float y0 = clamp((cy - h*0.5f - padY) * invGain, 0.f, origH);
        float x1 = clamp((cx + w*0.5f - padX) * invGain, 0.f, origW);
        float y1 = clamp((cy + h*0.5f - padY) * invGain, 0.f, origH);

        // Extract keypoints
        const float* kpsPtr = row + kpsOffset;
        std::vector<float> kps;
        kps.reserve(numKPS * 3);
        for (int k = 0; k < numKPS; ++k) {
            float kx = clamp((kpsPtr[3*k] - padX) * invGain, 0.f, origW);
            float ky = clamp((kpsPtr[3*k+1] - padY) * invGain, 0.f, origH);
            float ks = kpsPtr[3*k+2]; // visibility/confidence, no rescale
            kps.push_back(kx);
            kps.push_back(ky);
            kps.push_back(ks);
        }

        bboxes.push_back(cv::Rect(static_cast<int>(x0), static_cast<int>(y0),
            static_cast<int>(x1-x0), static_cast<int>(y1-y0)));
        scores.push_back(maxScore);
        labels.push_back(bestClass);
        allKps.push_back(std::move(kps));
    }

    // NMS
    std::vector<int> indices;
    cv::dnn::NMSBoxesBatched(bboxes, scores, labels, confThreshold,
        iouThreshold, indices);

    std::vector<Object> results;
    for (int idx : indices) {
        if (static_cast<int>(results.size()) >= maxDet) break;
        Object obj;
        obj.classId = labels[idx];
        obj.confidence = scores[idx];
        obj.box = bboxes[idx];
        obj.kps = allKps[idx];
        if (obj.classId >= 0 && obj.classId < static_cast<int>(classNames.size()))
            obj.className = classNames[obj.classId];
        results.push_back(std::move(obj));
    }
    return results;
}
|
|
|
|
|
|
|
|
|
|
// ====================================================================
|
|
|
|
|
// CLASSIFICATION — postprocess
|
|
|
|
|
// ====================================================================
|
|
|
|
|
|
|
|
|
|
std::vector<Object> ONNXYOLO::postprocessClassify(
|
2026-04-08 13:45:52 +10:00
|
|
|
std::vector<Ort::Value>& outputTensors,
|
2026-03-28 16:54:11 +11:00
|
|
|
const std::vector<std::string>& classNames,
|
|
|
|
|
const cv::Size& imageSize)
|
|
|
|
|
{
|
|
|
|
|
if (outputTensors.empty()) return {};
|
2026-04-08 13:45:52 +10:00
|
|
|
const float* raw = outputTensors[0].GetTensorMutableData<float>();
|
2026-03-28 16:54:11 +11:00
|
|
|
const auto shape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape();
|
|
|
|
|
if (shape.size() < 2) return {};
|
|
|
|
|
|
|
|
|
|
const int nc = static_cast<int>(shape[1]);
|
|
|
|
|
|
|
|
|
|
// Check if the output is already a probability distribution (sums to ~1.0).
|
|
|
|
|
// Some ONNX models (e.g. exported with opset 19) include a Softmax layer
|
|
|
|
|
// in the graph itself. Applying softmax again would flatten the distribution
|
|
|
|
|
// and produce near-uniform probabilities, causing wrong classifications.
|
|
|
|
|
float rawSum = 0.f;
|
|
|
|
|
for (int i = 0; i < nc; ++i) rawSum += raw[i];
|
|
|
|
|
const bool alreadyNormalized = (rawSum > 0.9f && rawSum < 1.1f
|
|
|
|
|
&& raw[0] >= 0.f); // probabilities are non-negative
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
std::vector<float> probs(nc);
|
|
|
|
|
if (alreadyNormalized) {
|
|
|
|
|
// Output is already softmax — use as-is (skip double softmax)
|
|
|
|
|
for (int i = 0; i < nc; ++i) probs[i] = raw[i];
|
|
|
|
|
} else {
|
|
|
|
|
// Raw logits — apply softmax
|
|
|
|
|
float maxVal = -FLT_MAX;
|
|
|
|
|
for (int i = 0; i < nc; ++i) maxVal = std::max(maxVal, raw[i]);
|
|
|
|
|
float sumExp = 0.f;
|
|
|
|
|
for (int i = 0; i < nc; ++i) {
|
|
|
|
|
probs[i] = std::exp(raw[i] - maxVal);
|
|
|
|
|
sumExp += probs[i];
|
|
|
|
|
}
|
|
|
|
|
for (int i = 0; i < nc; ++i) probs[i] /= sumExp;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
int bestClass = 0;
|
|
|
|
|
float bestProb = 0.f;
|
|
|
|
|
for (int i = 0; i < nc; ++i) {
|
|
|
|
|
if (probs[i] > bestProb) { bestProb = probs[i]; bestClass = i; }
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
const int imgW = imageSize.width;
|
|
|
|
|
const int imgH = imageSize.height;
|
|
|
|
|
|
|
|
|
|
Object obj;
|
|
|
|
|
if (imgW > 20 && imgH > 20) {
|
|
|
|
|
obj.box = cv::Rect(10, 10, imgW - 20, imgH - 20);
|
|
|
|
|
}
|
|
|
|
|
else {
|
|
|
|
|
obj.box = cv::Rect(0, 0, imgW, imgH);
|
|
|
|
|
}
|
|
|
|
|
//obj.polygon = ANSUtilityHelper::RectToNormalizedPolygon(obj.box, imgW, imgH);
|
|
|
|
|
obj.classId = bestClass;
|
|
|
|
|
obj.confidence = bestProb;
|
|
|
|
|
if (bestClass >= 0 && bestClass < static_cast<int>(classNames.size()))
|
|
|
|
|
obj.className = classNames[bestClass];
|
|
|
|
|
return { std::move(obj) };
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// ====================================================================
|
|
|
|
|
// BATCH — sliceBatchOutput + detectBatch
|
|
|
|
|
// ====================================================================
|
|
|
|
|
|
|
|
|
|
// --------------------------------------------------------------------
// sliceBatchOutput — build a non-owning single-image tensor view into a
// batched output tensor.
//
// Given a batched tensor of shape [B, D1, D2, ...], returns an
// Ort::Value of shape [1, D1, D2, ...] that aliases the slice for
// `batchIndex`. No data is copied: the returned tensor points into
// `batchTensor`'s buffer, so the batched tensor must outlive the view.
// --------------------------------------------------------------------
/*static*/ Ort::Value ONNXYOLO::sliceBatchOutput(
    Ort::Value& batchTensor,
    int64_t batchIndex,
    const std::vector<int64_t>& fullShape,
    Ort::MemoryInfo& memInfo)
{
    // Floats belonging to one image = product of all non-batch dims.
    const int64_t strideElems = std::accumulate(
        fullShape.begin() + 1, fullShape.end(),
        static_cast<int64_t>(1), std::multiplies<int64_t>());

    // Locate the requested image inside the batched buffer.
    float* const base = batchTensor.GetTensorMutableData<float>();
    float* const slice = base + batchIndex * strideElems;

    // Same shape with the batch dimension collapsed to 1.
    std::vector<int64_t> sliceShape(fullShape);
    sliceShape.front() = 1;

    return Ort::Value::CreateTensor<float>(
        memInfo, slice, static_cast<size_t>(strideElems),
        sliceShape.data(), sliceShape.size());
}
|
|
|
|
|
|
|
|
|
|
// Run one batched ONNX inference over `images` and postprocess each image
// independently. Mirrors detect(): the task type (segmentation, classify,
// end-to-end detection/OBB/pose, or legacy detection/OBB/pose) is inferred
// from the output tensor shapes, then each per-image slice of the batched
// output is routed to the matching postprocess method.
//
// Returns one Object vector per input image (results[i] corresponds to
// images[i]); an empty inner vector means no detections (or an unroutable
// output shape) for that image.
std::vector<std::vector<Object>> ONNXYOLO::detectBatch(
    const std::vector<cv::Mat>& images,
    const std::vector<std::string>& classNames,
    float confThreshold, float iouThreshold, int numKPS)
{
    // Reset per-call task flags before any early return.
    lastWasClassification = false;
    lastBatchWasClassification = false;
    if (images.empty()) return {};

    const size_t N = images.size();

    // Fallback to sequential if model has fixed batch=1
    // (input_node_dims[0] == 1 and not dynamic (-1))
    if (input_node_dims.size() >= 1 && input_node_dims[0] == 1) {
        std::vector<std::vector<Object>> results(N);
        for (size_t i = 0; i < N; ++i)
            results[i] = detect(images[i], classNames, confThreshold, iouThreshold, numKPS);
        // detect() sets lastWasClassification; propagate the final value.
        lastBatchWasClassification = lastWasClassification;
        return results;
    }

    // Store original sizes for per-image postprocessing
    std::vector<cv::Size> originalSizes;
    originalSizes.reserve(N);
    for (const auto& img : images)
        originalSizes.push_back(img.size());

    // Batch preprocess + single inference call
    Ort::Value inputTensor = transformBatch(images);

    auto outputTensors = ort_session->Run(
        Ort::RunOptions{ nullptr },
        input_node_names.data(),
        &inputTensor, 1,
        output_node_names.data(),
        num_outputs);

    // Output shape sanity check — see detect() for rationale. Prevents
    // DirectML-returned garbage dims from propagating into postprocess
    // and triggering multi-terabyte cv::Mat allocations on AMD.
    constexpr int64_t kMaxOutputDim = 1000000;
    for (size_t t = 0; t < outputTensors.size(); ++t) {
        const auto sh = outputTensors[t].GetTensorTypeAndShapeInfo().GetShape();
        for (size_t d = 0; d < sh.size(); ++d) {
            if (sh[d] < 0 || sh[d] > kMaxOutputDim) {
                std::cerr << "[ONNXYOLO] detectBatch: output[" << t
                    << "] dim[" << d << "]=" << sh[d]
                    << " is out of range — refusing to postprocess."
                    << std::endl;
                // N empty vectors — callers see "no detections".
                return std::vector<std::vector<Object>>(N);
            }
        }
    }

    // Network input size (W, H) taken from the model's NCHW input dims.
    // NOTE(review): for dynamic-shape models these dims may be -1 at
    // session-load time — assumed resolved by this point; confirm.
    const cv::Size resizedShape(
        static_cast<int>(input_node_dims[3]),
        static_cast<int>(input_node_dims[2]));

    // Determine task type from output shapes (same logic as detect())
    const size_t numOutputs = outputTensors.size();
    const auto shape0 = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape();

    // Per-image postprocessing
    std::vector<std::vector<Object>> results(N);

    for (size_t i = 0; i < N; ++i) {
        // Build per-image sliced output tensors. These slices alias the
        // batched outputTensors' storage, so they are only valid within
        // this loop body while outputTensors is alive.
        std::vector<Ort::Value> perImageOutputs;
        for (size_t t = 0; t < numOutputs; ++t) {
            auto tShape = outputTensors[t].GetTensorTypeAndShapeInfo().GetShape();
            perImageOutputs.push_back(
                sliceBatchOutput(outputTensors[t], static_cast<int64_t>(i),
                    tShape, *memory_info_handler));
        }

        // Dispatch to correct postprocess method.
        // Segmentation first: a second 4-D output is the mask prototype.
        if (numOutputs >= 2) {
            const auto protoShape = outputTensors[1].GetTensorTypeAndShapeInfo().GetShape();
            if (protoShape.size() == 4) {
                // Legacy seg layout has channels < boxes in output 0.
                if (shape0.size() >= 3 && shape0[1] < shape0[2]) {
                    results[i] = postprocessSegLegacy(originalSizes[i], resizedShape,
                        perImageOutputs, classNames,
                        confThreshold, iouThreshold);
                }
                else {
                    results[i] = postprocessSegEndToEnd(originalSizes[i], resizedShape,
                        perImageOutputs, classNames, confThreshold);
                }
                continue;
            }
        }

        // A rank-2 output means a classification head (no boxes).
        if (shape0.size() == 2) {
            lastBatchWasClassification = true;
            results[i] = postprocessClassify(perImageOutputs, classNames, originalSizes[i]);
            continue;
        }

        // Anything below rank 3 cannot be a detection output — skip image.
        if (shape0.size() < 3) continue;

        // Heuristic: end-to-end (NMS-fused) outputs are [1, numDet, feat]
        // with a small feature count; legacy outputs are [1, chan, boxes].
        // NOTE(review): the "<= 20" threshold is a heuristic — confirm it
        // matches detect()'s behaviour for pose models with few keypoints.
        const bool isEndToEnd = (shape0[1] > shape0[2]) || (shape0[2] <= 20);

        if (isEndToEnd) {
            const int features = static_cast<int>(shape0[2]);
            if (features == 6) {
                // [x1,y1,x2,y2,score,class] — plain detection
                results[i] = postprocessEndToEnd(originalSizes[i], resizedShape,
                    perImageOutputs, classNames, confThreshold);
            }
            else if (features == 7) {
                // 6 + angle — oriented bounding boxes
                results[i] = postprocessOBBEndToEnd(originalSizes[i], resizedShape,
                    perImageOutputs, classNames, confThreshold);
            }
            else if (features > 7 && (features - 6) % 3 == 0) {
                // 6 + 3 floats per keypoint — pose
                int nk = (numKPS > 0) ? numKPS : (features - 6) / 3;
                results[i] = postprocessPoseEndToEnd(originalSizes[i], resizedShape,
                    perImageOutputs, classNames,
                    confThreshold, nk);
            }
            else {
                // Unknown feature width — treat as plain detection.
                results[i] = postprocessEndToEnd(originalSizes[i], resizedShape,
                    perImageOutputs, classNames, confThreshold);
            }
        }
        else {
            // Legacy (pre-NMS) layout: [1, 4 + extra, numBoxes].
            const int nc = static_cast<int>(classNames.size());
            const int numChannels = static_cast<int>(shape0[1]);
            const int numBoxes = static_cast<int>(shape0[2]);
            const int extra = numChannels - 4;

            bool routed = false;
            // Explicit keypoint count from the caller wins.
            if (numKPS > 0 && numChannels >= 4 + 1 + numKPS * 3) {
                results[i] = postprocessPoseLegacy(originalSizes[i], resizedShape,
                    perImageOutputs, classNames,
                    confThreshold, iouThreshold, numKPS);
                routed = true;
            }
            // Channels beyond the class scores in multiples of 3 → pose.
            else if (nc > 0 && nc <= extra && extra > nc && (extra - nc) % 3 == 0 && (extra - nc) >= 3) {
                int nk = (extra - nc) / 3;
                results[i] = postprocessPoseLegacy(originalSizes[i], resizedShape,
                    perImageOutputs, classNames,
                    confThreshold, iouThreshold, nk);
                routed = true;
            }
            // Exactly one extra channel beyond the classes → OBB angle.
            else if (nc > 0 && nc <= extra && extra == nc + 1) {
                results[i] = postprocessOBBLegacy(originalSizes[i], resizedShape,
                    perImageOutputs, classNames,
                    confThreshold, iouThreshold);
                routed = true;
            }
            // Channels match the class count exactly → plain detection.
            else if (nc > 0 && nc <= extra && extra == nc) {
                results[i] = postprocessLegacy(originalSizes[i], resizedShape,
                    perImageOutputs, classNames,
                    confThreshold, iouThreshold);
                routed = true;
            }

            if (!routed) {
                // Class count mismatch — probe last channel for OBB angles
                bool likelyOBB = false;
                if (extra >= 2) {
                    const float* rawOutput = perImageOutputs[0].GetTensorMutableData<float>();
                    // Sample up to 100 boxes; values within ±pi (±3.15 rad
                    // with slack) in the last channel suggest angles.
                    int numSamp = std::min(numBoxes, 100);
                    int angleCount = 0;
                    for (int s = 0; s < numSamp; ++s) {
                        float v = rawOutput[(numChannels - 1) * numBoxes + s];
                        if (v >= -3.15f && v <= 3.15f) ++angleCount;
                    }
                    // Require >80% of sampled values to look like angles.
                    likelyOBB = (angleCount > numSamp * 8 / 10);
                }
                if (likelyOBB) {
                    results[i] = postprocessOBBLegacy(originalSizes[i], resizedShape,
                        perImageOutputs, classNames,
                        confThreshold, iouThreshold);
                }
                else {
                    results[i] = postprocessLegacy(originalSizes[i], resizedShape,
                        perImageOutputs, classNames,
                        confThreshold, iouThreshold);
                }
            }
        }
    }

    return results;
}
|
|
|
|
|
|
|
|
|
|
// ====================================================================
|
|
|
|
|
// ANSONNXYOLO — ANSODBase wrapper
|
|
|
|
|
// ====================================================================
|
|
|
|
|
|
|
|
|
|
// Destructor — releases the ORT engine via Destroy().
//
// A destructor must never let an exception escape (stack unwinding with a
// pending exception calls std::terminate), so in addition to the existing
// std::exception handler we also swallow non-std throw types with a
// catch-all. Logging failures here are best-effort.
ANSONNXYOLO::~ANSONNXYOLO() {
    try { Destroy(); }
    catch (const std::exception& e) {
        _logger.LogError("ANSONNXYOLO::~ANSONNXYOLO()", e.what(), __FILE__, __LINE__);
    }
    catch (...) {
        // Unknown exception type — must not propagate out of a destructor.
        _logger.LogError("ANSONNXYOLO::~ANSONNXYOLO()", "unknown exception", __FILE__, __LINE__);
    }
}
|
|
|
|
|
|
|
|
|
|
bool ANSONNXYOLO::Destroy() {
|
|
|
|
|
try { m_ortEngine.reset(); return true; }
|
|
|
|
|
catch (const std::exception& e) {
|
|
|
|
|
_logger.LogError("ANSONNXYOLO::Destroy", e.what(), __FILE__, __LINE__);
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
bool ANSONNXYOLO::OptimizeModel(bool fp16, std::string& optimizedModelFolder) {
|
|
|
|
|
if (!ANSODBase::OptimizeModel(fp16, optimizedModelFolder)) return false;
|
|
|
|
|
optimizedModelFolder = _modelFolder;
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
bool ANSONNXYOLO::InitOrtEngine() {
|
|
|
|
|
try {
|
|
|
|
|
if (!FileExist(_modelFilePath)) {
|
|
|
|
|
_logger.LogError("ANSONNXYOLO::InitOrtEngine",
|
|
|
|
|
"Model file does not exist: " + _modelFilePath, __FILE__, __LINE__);
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
m_ortEngine = std::make_unique<ONNXYOLO>(_modelFilePath);
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
catch (const std::exception& e) {
|
|
|
|
|
_logger.LogFatal("ANSONNXYOLO::InitOrtEngine", e.what(), __FILE__, __LINE__);
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2026-04-08 13:45:52 +10:00
|
|
|
// Create the ONNX Runtime engine from _modelFilePath on an explicitly
// requested execution provider. Returns false (with a log entry) if the
// model file is missing or session creation throws.
bool ANSONNXYOLO::InitOrtEngine(ANSCENTER::EngineType engineType) {
    try {
        const bool modelOnDisk = FileExist(_modelFilePath);
        if (!modelOnDisk) {
            _logger.LogError("ANSONNXYOLO::InitOrtEngine",
                "Model file does not exist: " + _modelFilePath, __FILE__, __LINE__);
            return false;
        }
        m_ortEngine = std::make_unique<ONNXYOLO>(_modelFilePath, engineType);
        return true;
    }
    catch (const std::exception& e) {
        _logger.LogFatal("ANSONNXYOLO::InitOrtEngine", e.what(), __FILE__, __LINE__);
        return false;
    }
}
|
|
|
|
|
|
2026-04-10 17:13:47 +10:00
|
|
|
// ========================================================================
|
2026-04-12 17:16:16 +10:00
|
|
|
// WarmUpEngine — run a dummy inference after session creation.
|
2026-04-10 17:13:47 +10:00
|
|
|
//
|
2026-04-12 17:16:16 +10:00
|
|
|
// Scope: **NVIDIA (CUDA EP) only.** On first inference, the CUDA EP
|
|
|
|
|
// allocates its memory arena (capped at 2 GB via BasicOrtHandler config),
|
|
|
|
|
// resolves cuDNN convolution algorithms, and populates the kernel launch
|
|
|
|
|
// cache. Running one dummy inference at load time amortises this cost
|
|
|
|
|
// so the first real frame doesn't see a latency spike.
|
2026-04-10 17:13:47 +10:00
|
|
|
//
|
2026-04-12 17:16:16 +10:00
|
|
|
// Explicitly disabled on AMD, Intel and CPU:
|
|
|
|
|
// • AMD (DirectML) — calling detect() at load time has been observed
|
|
|
|
|
// to hit a multi-terabyte cv::Mat allocation inside postprocessLegacy
|
|
|
|
|
// on AMD RDNA iGPUs when DirectML returns garbage output tensor
|
|
|
|
|
// dims. ONNXYOLO::detect() now has an output-shape sanity guard
|
|
|
|
|
// that catches this at runtime, so the warm-up would add risk
|
|
|
|
|
// without benefit. Earlier builds enabled warm-up specifically for
|
|
|
|
|
// Radeon 680M TDR mitigation; that workaround is obsolete with
|
|
|
|
|
// current DirectML 1.15.x drivers.
|
|
|
|
|
// • Intel (OpenVINO) — running detect() at load time has been
|
|
|
|
|
// observed to expose latent heap-corruption bugs
|
|
|
|
|
// (ntdll +0x1176e5 / STATUS_HEAP_CORRUPTION 0xc0000374).
|
|
|
|
|
// • CPU EP — no shader compile or kernel cache to warm up; the first
|
|
|
|
|
// real frame has the same latency as any subsequent frame.
|
2026-04-10 17:13:47 +10:00
|
|
|
//
|
2026-04-12 17:16:16 +10:00
|
|
|
// Non-fatal on failure: if warm-up itself throws, regular inference
|
|
|
|
|
// still works — the engine is fully loaded before WarmUpEngine runs.
|
2026-04-10 17:13:47 +10:00
|
|
|
// ========================================================================
|
|
|
|
|
void ANSONNXYOLO::WarmUpEngine() {
|
|
|
|
|
if (!m_ortEngine) return;
|
|
|
|
|
|
2026-04-12 17:16:16 +10:00
|
|
|
// Gate strictly on NVIDIA_GPU. Every other EP is a no-op.
|
|
|
|
|
if (m_ortEngine->getEngineType() != EngineType::NVIDIA_GPU) {
|
|
|
|
|
ANS_DBG("ONNXYOLO", "Warm-up skipped (non-NVIDIA EP)");
|
2026-04-10 17:13:47 +10:00
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
2026-04-12 17:16:16 +10:00
|
|
|
// ── Strict dimension validation ─────────────────────────────────
|
|
|
|
|
// Defensive: refuse to warm up with implausible model dimensions.
|
|
|
|
|
// _modelConfig values come from the caller's ModelConfig and are
|
|
|
|
|
// normally 224..640; anything outside [32, 4096] is almost certainly
|
|
|
|
|
// a bug in the caller and we skip warm-up rather than risk a huge
|
|
|
|
|
// cv::Mat allocation inside detect().
|
|
|
|
|
constexpr int kMinDim = 32;
|
|
|
|
|
constexpr int kMaxDim = 4096;
|
|
|
|
|
const int rawW = _modelConfig.inpWidth;
|
|
|
|
|
const int rawH = _modelConfig.inpHeight;
|
|
|
|
|
if (rawW <= 0 || rawH <= 0 || rawW > kMaxDim || rawH > kMaxDim) {
|
|
|
|
|
_logger.LogWarn("ANSONNXYOLO::WarmUpEngine",
|
|
|
|
|
"Warm-up skipped — suspect input dims ("
|
|
|
|
|
+ std::to_string(rawW) + "x" + std::to_string(rawH) + ")",
|
|
|
|
|
__FILE__, __LINE__);
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
const int w = std::clamp(rawW, kMinDim, kMaxDim);
|
|
|
|
|
const int h = std::clamp(rawH, kMinDim, kMaxDim);
|
2026-04-10 17:13:47 +10:00
|
|
|
|
2026-04-12 17:16:16 +10:00
|
|
|
try {
|
2026-04-10 17:13:47 +10:00
|
|
|
// Mid-gray BGR image matches the letterbox fill colour used in
|
|
|
|
|
// preprocessing (114,114,114 ~ 128) and avoids degenerate inputs.
|
|
|
|
|
cv::Mat dummy(h, w, CV_8UC3, cv::Scalar(128, 128, 128));
|
|
|
|
|
|
2026-04-12 17:16:16 +10:00
|
|
|
ANS_DBG("ONNXYOLO", "Warm-up: running 1 dummy CUDA inference (%dx%d)", w, h);
|
|
|
|
|
|
|
|
|
|
auto t0 = std::chrono::steady_clock::now();
|
|
|
|
|
(void)m_ortEngine->detect(dummy, _classes,
|
|
|
|
|
PROBABILITY_THRESHOLD,
|
|
|
|
|
NMS_THRESHOLD,
|
|
|
|
|
NUM_KPS);
|
|
|
|
|
auto t1 = std::chrono::steady_clock::now();
|
|
|
|
|
auto ms = std::chrono::duration_cast<std::chrono::milliseconds>(t1 - t0).count();
|
|
|
|
|
ANS_DBG("ONNXYOLO", "Warm-up done: %lld ms", (long long)ms);
|
|
|
|
|
}
|
|
|
|
|
catch (const cv::Exception& e) {
|
|
|
|
|
// Defensive — should not fire on NVIDIA CUDA EP, but if it does
|
|
|
|
|
// the engine itself is still loaded and real inference will work.
|
|
|
|
|
_logger.LogWarn("ANSONNXYOLO::WarmUpEngine",
|
|
|
|
|
std::string("Warm-up skipped (cv::Exception, non-fatal): ") + e.what(),
|
|
|
|
|
__FILE__, __LINE__);
|
2026-04-10 17:13:47 +10:00
|
|
|
}
|
|
|
|
|
catch (const std::exception& e) {
|
2026-04-12 17:16:16 +10:00
|
|
|
_logger.LogWarn("ANSONNXYOLO::WarmUpEngine",
|
|
|
|
|
std::string("Warm-up skipped (std::exception, non-fatal): ") + e.what(),
|
|
|
|
|
__FILE__, __LINE__);
|
|
|
|
|
}
|
|
|
|
|
catch (...) {
|
|
|
|
|
_logger.LogWarn("ANSONNXYOLO::WarmUpEngine",
|
|
|
|
|
"Warm-up skipped (unknown exception, non-fatal)",
|
2026-04-10 17:13:47 +10:00
|
|
|
__FILE__, __LINE__);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2026-03-28 16:54:11 +11:00
|
|
|
// Primary initialisation: validate the license, unpack/load the model zip
// via the base class, normalise the model configuration, load class names,
// optionally create the ORT engine, and warm it up. Returns false (with a
// log entry) on any failure. Thread-safe: holds _mutex for the duration
// and flags _modelLoading so inference entry points fail fast.
bool ANSONNXYOLO::Initialize(std::string licenseKey, ModelConfig modelConfig,
    const std::string& modelZipFilePath,
    const std::string& modelZipPassword,
    std::string& labelMap)
{
    std::lock_guard<std::recursive_mutex> lock(_mutex);
    // RAII flag: _modelLoading is true for the whole call so RunInference/
    // DetectObjects return early instead of racing a half-loaded model.
    ModelLoadingGuard mlg(_modelLoading);
    try {
        // Invalidate until the full sequence completes.
        _modelLoadValid = false;
        bool result = ANSODBase::Initialize(licenseKey, modelConfig,
            modelZipFilePath, modelZipPassword, labelMap);
        if (!result) return false;

        // Normalise config: clamp degenerate sizes/thresholds to defaults.
        _modelConfig = modelConfig;
        if (_modelConfig.inpHeight <= 0) _modelConfig.inpHeight = 640;
        if (_modelConfig.inpWidth <= 0) _modelConfig.inpWidth = 640;
        if (_modelConfig.modelMNSThreshold < 0.2f) _modelConfig.modelMNSThreshold = 0.45f;
        if (_modelConfig.modelConfThreshold < 0.2f) _modelConfig.modelConfThreshold = 0.25f;

        // Cache runtime inference parameters from the config.
        PROBABILITY_THRESHOLD = _modelConfig.detectionScoreThreshold;
        NMS_THRESHOLD = _modelConfig.modelMNSThreshold;
        TOP_K = 300;
        NUM_KPS = _modelConfig.numKPS;
        KPS_THRESHOLD = _modelConfig.kpsThreshold;

        _modelFilePath = CreateFilePath(_modelFolder, "train_last.onnx");
        if (FileExist(_modelConfigFile)) {
            // Preferred path: class names + input shape from the config file.
            ModelType modelType;
            std::vector<int> inputShape;
            _classes = ANSUtilityHelper::GetConfigFileContent(_modelConfigFile, modelType, inputShape);
            if (inputShape.size() == 2) {
                // Config file overrides caller-supplied input dims.
                if (inputShape[0] > 0) _modelConfig.inpHeight = inputShape[0];
                if (inputShape[1] > 0) _modelConfig.inpWidth = inputShape[1];
            }
        }
        else {
            // Fallback: classes.names file, else an embedded class string.
            _classFilePath = CreateFilePath(_modelFolder, "classes.names");
            std::ifstream isValid(_classFilePath);
            if (!isValid) LoadClassesFromString();
            else LoadClassesFromFile();
        }

        // Report the class list back to the caller as "a,b,c".
        labelMap.clear();
        if (!_classes.empty())
            labelMap = VectorToCommaSeparatedString(_classes);

        if (this->_loadEngineOnCreation) {
            if (!InitOrtEngine()) {
                _logger.LogError("ANSONNXYOLO::Initialize",
                    "Failed to create ONNX Runtime engine: " + _modelFilePath,
                    __FILE__, __LINE__);
                return false;
            }
        }

        // Fix input resolution for dynamic-shape models.
        // The constructor defaults to 640x640 when ONNX dims are dynamic,
        // but the ModelConfig may specify the correct size (e.g. 224x224
        // for classification models). Override here after config is loaded.
        if (m_ortEngine && m_ortEngine->hasDynamicInputShape()) {
            if (_modelConfig.inpHeight > 0 && _modelConfig.inpWidth > 0) {
                m_ortEngine->setInputShape(_modelConfig.inpWidth, _modelConfig.inpHeight);
            }
        }

        // Warm up the engine. Currently only acts on the NVIDIA CUDA EP
        // (see WarmUpEngine(), which skips all other providers). Non-fatal.
        WarmUpEngine();

        _modelLoadValid = true;
        _isInitialized = true;
        return true;
    }
    catch (const std::exception& e) {
        _logger.LogFatal("ANSONNXYOLO::Initialize", e.what(), __FILE__, __LINE__);
        return false;
    }
}
|
|
|
|
|
|
|
|
|
|
// Reload the model from a zip without re-validating the license/config the
// way Initialize() does. Follows the same sequence: base-class load,
// config normalisation, class-name loading, optional engine creation,
// dynamic-shape fix-up and warm-up.
// NOTE(review): unlike Initialize(), this does not clear _modelLoadValid
// up front — the ModelLoadingGuard is what keeps inference out during the
// reload; confirm that is intentional.
bool ANSONNXYOLO::LoadModel(const std::string& modelZipFilePath,
    const std::string& modelZipPassword)
{
    std::lock_guard<std::recursive_mutex> lock(_mutex);
    // RAII flag: inference entry points fail fast while this is set.
    ModelLoadingGuard mlg(_modelLoading);
    try {
        bool result = ANSODBase::LoadModel(modelZipFilePath, modelZipPassword);
        if (!result) return false;

        // Normalise config: clamp degenerate sizes/thresholds to defaults.
        if (_modelConfig.inpHeight <= 0) _modelConfig.inpHeight = 640;
        if (_modelConfig.inpWidth <= 0) _modelConfig.inpWidth = 640;
        if (_modelConfig.modelMNSThreshold < 0.2f) _modelConfig.modelMNSThreshold = 0.45f;
        if (_modelConfig.modelConfThreshold < 0.2f) _modelConfig.modelConfThreshold = 0.25f;

        // Cache runtime inference parameters from the config.
        PROBABILITY_THRESHOLD = _modelConfig.detectionScoreThreshold;
        NMS_THRESHOLD = _modelConfig.modelMNSThreshold;
        TOP_K = 300;
        NUM_KPS = _modelConfig.numKPS;
        KPS_THRESHOLD = _modelConfig.kpsThreshold;

        _modelFilePath = CreateFilePath(_modelFolder, "train_last.onnx");
        if (FileExist(_modelConfigFile)) {
            // Preferred path: class names + input shape from the config file.
            ModelType modelType;
            std::vector<int> inputShape;
            _classes = ANSUtilityHelper::GetConfigFileContent(_modelConfigFile, modelType, inputShape);
            if (inputShape.size() == 2) {
                // Config file overrides the current input dims.
                if (inputShape[0] > 0) _modelConfig.inpHeight = inputShape[0];
                if (inputShape[1] > 0) _modelConfig.inpWidth = inputShape[1];
            }
        }
        else {
            // Fallback: classes.names file, else an embedded class string.
            _classFilePath = CreateFilePath(_modelFolder, "classes.names");
            std::ifstream isValid(_classFilePath);
            if (!isValid) LoadClassesFromString();
            else LoadClassesFromFile();
        }

        if (this->_loadEngineOnCreation) {
            if (!InitOrtEngine()) { _modelLoadValid = false; return false; }
        }

        // Fix input resolution for dynamic-shape models (same as primary Initialize)
        if (m_ortEngine && m_ortEngine->hasDynamicInputShape()) {
            if (_modelConfig.inpHeight > 0 && _modelConfig.inpWidth > 0) {
                m_ortEngine->setInputShape(_modelConfig.inpWidth, _modelConfig.inpHeight);
            }
        }

        // Warm up the engine. Currently only acts on the NVIDIA CUDA EP
        // (see WarmUpEngine(), which skips all other providers). Non-fatal.
        WarmUpEngine();

        _modelLoadValid = true;
        _isInitialized = true;
        return true;
    }
    catch (const std::exception& e) {
        _logger.LogFatal("ANSONNXYOLO::LoadModel", e.what(), __FILE__, __LINE__);
        return false;
    }
}
|
|
|
|
|
|
|
|
|
|
// Initialise from an already-unpacked model folder instead of a zip.
// `modelName` selects the .onnx file (defaults to "train_last") and
// `className` names the class-list file used when no model config file is
// present. Otherwise follows the same sequence as Initialize().
bool ANSONNXYOLO::LoadModelFromFolder(std::string licenseKey, ModelConfig modelConfig,
    std::string modelName, std::string className,
    const std::string& modelFolder,
    std::string& labelMap)
{
    std::lock_guard<std::recursive_mutex> lock(_mutex);
    // RAII flag: inference entry points fail fast while this is set.
    ModelLoadingGuard mlg(_modelLoading);
    try {
        bool result = ANSODBase::LoadModelFromFolder(licenseKey, modelConfig,
            modelName, className,
            modelFolder, labelMap);
        if (!result) return false;

        // Normalise config: clamp degenerate sizes/thresholds to defaults.
        _modelConfig = modelConfig;
        if (_modelConfig.inpHeight <= 0) _modelConfig.inpHeight = 640;
        if (_modelConfig.inpWidth <= 0) _modelConfig.inpWidth = 640;
        if (_modelConfig.modelMNSThreshold < 0.2f) _modelConfig.modelMNSThreshold = 0.45f;
        if (_modelConfig.modelConfThreshold < 0.2f) _modelConfig.modelConfThreshold = 0.25f;

        // Cache runtime inference parameters from the config.
        PROBABILITY_THRESHOLD = _modelConfig.detectionScoreThreshold;
        NMS_THRESHOLD = _modelConfig.modelMNSThreshold;
        TOP_K = 300;
        NUM_KPS = _modelConfig.numKPS;
        KPS_THRESHOLD = _modelConfig.kpsThreshold;

        // Resolve the ONNX file name; empty means the default model.
        std::string _modelName = modelName;
        if (_modelName.empty()) _modelName = "train_last";
        std::string modelFullName = _modelName + ".onnx";

        _modelFilePath = CreateFilePath(_modelFolder, modelFullName);
        if (FileExist(_modelConfigFile)) {
            // Preferred path: class names + input shape from the config file.
            ModelType modelType;
            std::vector<int> inputShape;
            _classes = ANSUtilityHelper::GetConfigFileContent(_modelConfigFile, modelType, inputShape);
            if (inputShape.size() == 2) {
                // Config file overrides caller-supplied input dims.
                if (inputShape[0] > 0) _modelConfig.inpHeight = inputShape[0];
                if (inputShape[1] > 0) _modelConfig.inpWidth = inputShape[1];
            }
        }
        else {
            // Fallback: caller-named class file, else an embedded string.
            _classFilePath = CreateFilePath(_modelFolder, className);
            std::ifstream isValid(_classFilePath);
            if (!isValid) LoadClassesFromString();
            else LoadClassesFromFile();
        }

        // Report the class list back to the caller as "a,b,c".
        labelMap.clear();
        if (!_classes.empty())
            labelMap = VectorToCommaSeparatedString(_classes);

        if (this->_loadEngineOnCreation) {
            if (!InitOrtEngine()) { _modelLoadValid = false; return false; }
        }

        // Fix input resolution for dynamic-shape models (same as primary Initialize)
        if (m_ortEngine && m_ortEngine->hasDynamicInputShape()) {
            if (_modelConfig.inpHeight > 0 && _modelConfig.inpWidth > 0) {
                m_ortEngine->setInputShape(_modelConfig.inpWidth, _modelConfig.inpHeight);
            }
        }

        // Warm up the engine. Currently only acts on the NVIDIA CUDA EP
        // (see WarmUpEngine(), which skips all other providers). Non-fatal.
        WarmUpEngine();

        _modelLoadValid = true;
        _isInitialized = true;
        return true;
    }
    catch (const std::exception& e) {
        _logger.LogFatal("ANSONNXYOLO::LoadModelFromFolder", e.what(), __FILE__, __LINE__);
        return false;
    }
}
|
|
|
|
|
|
|
|
|
|
// Convenience overload: run inference without a camera id by forwarding
// an empty one to the full overload.
std::vector<Object> ANSONNXYOLO::RunInference(const cv::Mat& inputImgBGR) {
    const std::string noCameraId{};
    return RunInference(inputImgBGR, noCameraId);
}
|
|
|
|
|
|
|
|
|
|
// Public inference entry point. Validates engine state under the lock,
// then delegates the actual work to DetectObjects(). Returns an empty
// vector on any validation failure, lock timeout, or exception (all
// logged except the trivially-small-image case).
std::vector<Object> ANSONNXYOLO::RunInference(const cv::Mat& inputImgBGR,
    const std::string& camera_id)
{
    // Fail fast while Initialize/LoadModel is running on another thread.
    if (_modelLoading.load()) return {};
    {
        // Bounded lock acquisition — give up (empty result) rather than
        // block a frame pipeline behind a long model load.
        auto lock = TryLockWithTimeout("ANSONNXYOLO::RunInference");
        if (!lock.owns_lock()) return {};
        if (!_modelLoadValid) {
            _logger.LogError("ANSONNXYOLO::RunInference", "Model not loaded", __FILE__, __LINE__);
            return {};
        }
        if (!_licenseValid) {
            _logger.LogError("ANSONNXYOLO::RunInference", "Invalid license", __FILE__, __LINE__);
            return {};
        }
        if (!_isInitialized) {
            _logger.LogError("ANSONNXYOLO::RunInference", "Model not initialized", __FILE__, __LINE__);
            return {};
        }
        // Reject empty or implausibly small frames (< 10 px per side).
        if (inputImgBGR.empty() || inputImgBGR.cols < 10 || inputImgBGR.rows < 10)
            return {};
    }
    // Lock released — DetectObjects does its own brief locking.
    try { return DetectObjects(inputImgBGR, camera_id); }
    catch (const std::exception& e) {
        _logger.LogFatal("ANSONNXYOLO::RunInference", e.what(), __FILE__, __LINE__);
        return {};
    }
}
|
|
|
|
|
|
|
|
|
|
// Core single-image detection pipeline:
//   1. snapshot inference parameters under a brief lock,
//   2. optionally swap the input for a full-resolution BGR frame rebuilt
//      from the thread-local GPU NV12 frame (NV12 fast path),
//   3. run ORT inference lock-free,
//   4. rescale results back to the caller's resolution, tag camera id,
//      and apply tracking/stabilization.
// On a DirectML device-lost error (0x887A0005) it attempts a one-time
// recreation of the session on the CPU EP. Returns an empty vector on
// any failure.
std::vector<Object> ANSONNXYOLO::DetectObjects(const cv::Mat& inputImage,
    const std::string& camera_id)
{
    try {
        // Fail-fast if a model load/init is in progress on another thread
        if (_modelLoading.load()) {
            ANS_DBG("ONNXYOLO", "DetectObjects: skipped — model loading in progress, cam=%s", camera_id.c_str());
            return {};
        }

        // ── Snapshot config under a brief lock ──────────────────
        // Only hold _mutex long enough to validate state and copy
        // the parameters needed for inference. The actual ORT
        // detect() call runs outside the lock so that concurrent
        // Initialize/LoadModel calls are not blocked for the full
        // duration of inference.
        float probThresh, nmsThresh;
        int numKps;
        std::vector<std::string> classes;
        bool trackerEnabled, stabilizationEnabled;
        {
            ANS_DBG("ONNXYOLO", "DetectObjects: cam=%s acquiring mutex...", camera_id.c_str());
            auto lk = TryLockWithTimeout("ANSONNXYOLO::DetectObjects");
            if (!lk.owns_lock()) return {}; // timed out
            ANS_DBG("ONNXYOLO", "DetectObjects: mutex acquired, cam=%s", camera_id.c_str());

            if (!m_ortEngine) {
                _logger.LogError("ANSONNXYOLO::DetectObjects", "ORT engine is null", __FILE__, __LINE__);
                ANS_DBG("ONNXYOLO", "DetectObjects: ORT engine is null!");
                return {};
            }

            // Snapshot parameters while locked
            probThresh = PROBABILITY_THRESHOLD;
            nmsThresh = NMS_THRESHOLD;
            numKps = NUM_KPS;
            classes = _classes;
            trackerEnabled = _trackerEnabled;
            stabilizationEnabled = _stabilizationEnabled;
        }
        // ── _mutex released — heavy work below runs lock-free ───
        // NOTE(review): m_ortEngine is dereferenced below without the
        // lock; a concurrent reset (e.g. Destroy or the DML fallback on
        // another thread) would race this — _modelLoading mitigates but
        // does not fully close that window. Confirm acceptable.

        // --- NV12 fast path: try to get full-res BGR from GPU NV12 frame ---
        // When the decoder left a CPU-mapped NV12 frame for this thread,
        // rebuild a full-resolution BGR image from its Y/UV planes and run
        // inference on that instead of the (possibly downscaled) input.
        cv::Mat inferenceImage = inputImage;
        float bgrScaleX = 1.0f, bgrScaleY = 1.0f;
        {
            auto* gpuData = tl_currentGpuFrame();
            if (gpuData && gpuData->width > 0 && gpuData->height > 0) {
                // Require both planes and sane line strides.
                if (gpuData->cpuYPlane && gpuData->cpuUvPlane &&
                    gpuData->cpuYLinesize >= gpuData->width &&
                    gpuData->cpuUvLinesize >= gpuData->width) {
                    const int fw = gpuData->width;
                    const int fh = gpuData->height;
                    // NV12 requires even dimensions (UV is 2x2 subsampled).
                    if ((fw % 2) == 0 && (fh % 2) == 0) {
                        try {
                            // Wrap the planes without copying.
                            cv::Mat yPlane(fh, fw, CV_8UC1,
                                gpuData->cpuYPlane, static_cast<size_t>(gpuData->cpuYLinesize));
                            cv::Mat uvPlane(fh / 2, fw / 2, CV_8UC2,
                                gpuData->cpuUvPlane, static_cast<size_t>(gpuData->cpuUvLinesize));
                            cv::Mat fullResBGR;
                            cv::cvtColorTwoPlane(yPlane, uvPlane, fullResBGR, cv::COLOR_YUV2BGR_NV12);
                            if (!fullResBGR.empty()) {
                                // Remember how to map detections back to the
                                // caller's (display) resolution.
                                bgrScaleX = static_cast<float>(inputImage.cols) / fullResBGR.cols;
                                bgrScaleY = static_cast<float>(inputImage.rows) / fullResBGR.rows;
                                inferenceImage = fullResBGR;
                            }
                        } catch (...) { /* NV12 conversion failed — fall back to inputImage */ }
                    }
                }
            }
        }

        // Run ORT inference — no mutex held, this is the expensive call
        auto results = m_ortEngine->detect(inferenceImage, classes,
            probThresh,
            nmsThresh,
            numKps);

        // --- Rescale coordinates from full-res back to display-res ---
        if (bgrScaleX != 1.0f || bgrScaleY != 1.0f) {
            for (auto& obj : results) {
                obj.box.x = static_cast<int>(obj.box.x * bgrScaleX);
                obj.box.y = static_cast<int>(obj.box.y * bgrScaleY);
                obj.box.width = static_cast<int>(obj.box.width * bgrScaleX);
                obj.box.height = static_cast<int>(obj.box.height * bgrScaleY);
                // Keypoints are stored as a flat [x0,y0,x1,y1,...] array.
                for (size_t k = 0; k < obj.kps.size(); k += 2) {
                    obj.kps[k] *= bgrScaleX; // x
                    if (k + 1 < obj.kps.size())
                        obj.kps[k + 1] *= bgrScaleY; // y
                }
                for (auto& pt : obj.polygon) {
                    pt.x *= bgrScaleX;
                    pt.y *= bgrScaleY;
                }
            }
        }

        // Tag every detection with the originating camera.
        for (auto& obj : results)
            obj.cameraId = camera_id;

        // Tracking/stabilization (ApplyTracking has its own lock)
        if (trackerEnabled && !m_ortEngine->lastWasClassification) {
            results = ApplyTracking(results, camera_id);
            if (stabilizationEnabled) results = StabilizeDetections(results, camera_id);
        }
        return results;
    }
    catch (const std::exception& e) {
        const std::string msg = e.what();

        // DXGI_ERROR_DEVICE_REMOVED surfaces from DirectML with this HRESULT
        // embedded in the message text. Attempt a one-time CPU fallback.
        if (msg.find("887A0005") != std::string::npos) {
            // NOTE(review): _dmlDeviceLost is tested/set without the lock —
            // two threads could both enter; confirm benign.
            if (!_dmlDeviceLost) {
                _dmlDeviceLost = true;
                _logger.LogFatal("ANSONNXYOLO::DetectObjects",
                    "DirectML GPU device lost (887A0005) — attempting CPU fallback",
                    __FILE__, __LINE__);
                ANS_DBG("ONNXYOLO", "DML device lost — recreating session on CPU");
                try {
                    // Recreate the session under the lock so no other thread
                    // observes a half-torn-down engine.
                    std::lock_guard<std::recursive_mutex> lk(_mutex);
                    m_ortEngine.reset();
                    if (InitOrtEngine(ANSCENTER::EngineType::CPU)) {
                        _logger.LogInfo("ANSONNXYOLO::DetectObjects",
                            "CPU fallback session created successfully",
                            __FILE__, __LINE__);
                        ANS_DBG("ONNXYOLO", "CPU fallback OK");
                    } else {
                        _logger.LogFatal("ANSONNXYOLO::DetectObjects",
                            "CPU fallback session creation failed",
                            __FILE__, __LINE__);
                    }
                } catch (const std::exception& re) {
                    _logger.LogFatal("ANSONNXYOLO::DetectObjects",
                        std::string("CPU fallback exception: ") + re.what(),
                        __FILE__, __LINE__);
                }
            }
            return {};
        }

        ANS_DBG("ONNXYOLO", "DetectObjects EXCEPTION: %s cam=%s", e.what(), camera_id.c_str());
        _logger.LogFatal("ANSONNXYOLO::DetectObjects", e.what(), __FILE__, __LINE__);
        return {};
    }
}
|
|
|
|
|
|
|
|
|
|
// ====================================================================
|
|
|
|
|
// RunInferencesBatch / DetectObjectsBatch — true ONNX batch
|
|
|
|
|
// ====================================================================
|
|
|
|
|
|
|
|
|
|
std::vector<std::vector<Object>> ANSONNXYOLO::RunInferencesBatch(
|
|
|
|
|
const std::vector<cv::Mat>& inputs, const std::string& camera_id)
|
|
|
|
|
{
|
2026-04-13 19:48:32 +10:00
|
|
|
if (_modelLoading.load()) return {};
|
2026-03-28 16:54:11 +11:00
|
|
|
{
|
2026-04-13 19:48:32 +10:00
|
|
|
auto lock = TryLockWithTimeout("ANSONNXYOLO::RunInferencesBatch");
|
|
|
|
|
if (!lock.owns_lock()) return {};
|
2026-03-28 16:54:11 +11:00
|
|
|
if (!_modelLoadValid) {
|
|
|
|
|
_logger.LogFatal("ANSONNXYOLO::RunInferencesBatch",
|
|
|
|
|
"Cannot load ONNX model", __FILE__, __LINE__);
|
|
|
|
|
return {};
|
|
|
|
|
}
|
|
|
|
|
if (!_licenseValid) {
|
|
|
|
|
_logger.LogFatal("ANSONNXYOLO::RunInferencesBatch",
|
|
|
|
|
"Invalid license", __FILE__, __LINE__);
|
|
|
|
|
return {};
|
|
|
|
|
}
|
|
|
|
|
if (!_isInitialized) {
|
|
|
|
|
_logger.LogFatal("ANSONNXYOLO::RunInferencesBatch",
|
|
|
|
|
"Model not initialized", __FILE__, __LINE__);
|
|
|
|
|
return {};
|
|
|
|
|
}
|
|
|
|
|
if (inputs.empty()) {
|
|
|
|
|
_logger.LogWarn("ANSONNXYOLO::RunInferencesBatch",
|
|
|
|
|
"Empty input batch", __FILE__, __LINE__);
|
|
|
|
|
return {};
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
try {
|
|
|
|
|
return DetectObjectsBatch(inputs, camera_id);
|
|
|
|
|
}
|
|
|
|
|
catch (const std::exception& e) {
|
2026-04-09 08:09:02 +10:00
|
|
|
const std::string msg = e.what();
|
|
|
|
|
if (msg.find("887A0005") != std::string::npos) {
|
|
|
|
|
if (!_dmlDeviceLost) {
|
|
|
|
|
_dmlDeviceLost = true;
|
|
|
|
|
_logger.LogFatal("ANSONNXYOLO::RunInferencesBatch",
|
|
|
|
|
"DirectML GPU device lost (887A0005) — attempting CPU fallback",
|
|
|
|
|
__FILE__, __LINE__);
|
|
|
|
|
try {
|
|
|
|
|
m_ortEngine.reset();
|
|
|
|
|
if (!InitOrtEngine(ANSCENTER::EngineType::CPU))
|
|
|
|
|
_logger.LogFatal("ANSONNXYOLO::RunInferencesBatch",
|
|
|
|
|
"CPU fallback session creation failed", __FILE__, __LINE__);
|
|
|
|
|
} catch (...) {}
|
|
|
|
|
}
|
|
|
|
|
return {};
|
|
|
|
|
}
|
2026-03-28 16:54:11 +11:00
|
|
|
_logger.LogFatal("ANSONNXYOLO::RunInferencesBatch",
|
|
|
|
|
e.what(), __FILE__, __LINE__);
|
|
|
|
|
return {};
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
std::vector<std::vector<Object>> ANSONNXYOLO::DetectObjectsBatch(
|
|
|
|
|
const std::vector<cv::Mat>& inputImages, const std::string& camera_id)
|
|
|
|
|
{
|
|
|
|
|
try {
|
2026-04-13 19:48:32 +10:00
|
|
|
if (_modelLoading.load()) return {};
|
|
|
|
|
|
|
|
|
|
// Snapshot config under brief lock
|
|
|
|
|
float probThresh, nmsThresh;
|
|
|
|
|
int numKps;
|
|
|
|
|
std::vector<std::string> classes;
|
|
|
|
|
bool trackerEnabled, stabilizationEnabled;
|
|
|
|
|
{
|
|
|
|
|
auto lk = TryLockWithTimeout("ANSONNXYOLO::DetectObjectsBatch");
|
|
|
|
|
if (!lk.owns_lock()) return {};
|
|
|
|
|
|
|
|
|
|
if (!m_ortEngine) {
|
|
|
|
|
_logger.LogError("ANSONNXYOLO::DetectObjectsBatch",
|
|
|
|
|
"ORT engine is null", __FILE__, __LINE__);
|
|
|
|
|
return {};
|
|
|
|
|
}
|
|
|
|
|
probThresh = PROBABILITY_THRESHOLD;
|
|
|
|
|
nmsThresh = NMS_THRESHOLD;
|
|
|
|
|
numKps = NUM_KPS;
|
|
|
|
|
classes = _classes;
|
|
|
|
|
trackerEnabled = _trackerEnabled;
|
|
|
|
|
stabilizationEnabled = _stabilizationEnabled;
|
2026-03-28 16:54:11 +11:00
|
|
|
}
|
|
|
|
|
|
2026-04-13 19:48:32 +10:00
|
|
|
// Heavy work outside lock
|
2026-03-28 16:54:11 +11:00
|
|
|
auto batchResults = m_ortEngine->detectBatch(
|
2026-04-13 19:48:32 +10:00
|
|
|
inputImages, classes, probThresh, nmsThresh, numKps);
|
2026-03-28 16:54:11 +11:00
|
|
|
|
|
|
|
|
const bool isClassification = m_ortEngine->lastBatchWasClassification;
|
|
|
|
|
|
|
|
|
|
for (auto& results : batchResults) {
|
|
|
|
|
for (auto& obj : results)
|
|
|
|
|
obj.cameraId = camera_id;
|
|
|
|
|
|
2026-04-13 19:48:32 +10:00
|
|
|
if (trackerEnabled && !isClassification) {
|
2026-03-28 16:54:11 +11:00
|
|
|
results = ApplyTracking(results, camera_id);
|
2026-04-13 19:48:32 +10:00
|
|
|
if (stabilizationEnabled) results = StabilizeDetections(results, camera_id);
|
2026-03-28 16:54:11 +11:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return batchResults;
|
|
|
|
|
}
|
|
|
|
|
catch (const std::exception& e) {
|
2026-04-09 08:09:02 +10:00
|
|
|
const std::string msg = e.what();
|
|
|
|
|
if (msg.find("887A0005") != std::string::npos) {
|
|
|
|
|
if (!_dmlDeviceLost) {
|
|
|
|
|
_dmlDeviceLost = true;
|
|
|
|
|
_logger.LogFatal("ANSONNXYOLO::DetectObjectsBatch",
|
|
|
|
|
"DirectML GPU device lost (887A0005) — attempting CPU fallback",
|
|
|
|
|
__FILE__, __LINE__);
|
|
|
|
|
try {
|
2026-04-13 19:48:32 +10:00
|
|
|
std::lock_guard<std::recursive_mutex> lk(_mutex);
|
2026-04-09 08:09:02 +10:00
|
|
|
m_ortEngine.reset();
|
|
|
|
|
if (!InitOrtEngine(ANSCENTER::EngineType::CPU))
|
|
|
|
|
_logger.LogFatal("ANSONNXYOLO::DetectObjectsBatch",
|
|
|
|
|
"CPU fallback session creation failed", __FILE__, __LINE__);
|
|
|
|
|
} catch (...) {}
|
|
|
|
|
}
|
|
|
|
|
return {};
|
|
|
|
|
}
|
2026-03-28 16:54:11 +11:00
|
|
|
_logger.LogFatal("ANSONNXYOLO::DetectObjectsBatch",
|
|
|
|
|
e.what(), __FILE__, __LINE__);
|
|
|
|
|
return {};
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
} // namespace ANSCENTER
|