// Source: ANSCORE/modules/ANSODEngine/ANSONNXYOLO.h (239 lines, 11 KiB, C++)

#ifndef ANSONNXYOLO_H
#define ANSONNXYOLO_H
#pragma once
#include "ANSEngineCommon.h"
#include "ONNXEngine.h"
namespace ANSCENTER {
// ====================================================================
// ONNXYOLO — Ultralytics YOLO inference via ONNX Runtime
//
// Compatible with ALL Ultralytics YOLO tasks and versions
// (v8, v9, v10, v11, v26+) by auto-detecting the task type
// from output tensor shapes at inference time.
//
// Supported tasks:
// - Detection : [B,300,6] end2end or [B, nc+4, N] legacy
// - OBB : [B,300,7] end2end or [B, nc+5, N] legacy
// - Segmentation: [B,300,38]+protos or [B, nc+36, N]+protos
// - Pose : [B,300,6+nk*3] or [B, nc+4+nk*3, N] legacy
// - Classification: [B, nc]
//
// Preprocessing follows the exact Ultralytics LetterBox transform:
// - center=True, stride=32, pad_value=114
// - Ultralytics-compatible -0.1/+0.1 rounding for deterministic padding
// - BGR→RGB, /255.0 normalisation, HWC→CHW
// ====================================================================
/// ONNX Runtime inference wrapper for Ultralytics YOLO models.
/// Owns preprocessing (letterbox + normalisation), the session call
/// (via BasicOrtHandler), and task-specific postprocessing. The task
/// (detect / OBB / segment / pose / classify) is inferred from the
/// output tensor shapes at inference time, not configured up front.
class ONNXENGINE_API ONNXYOLO : public BasicOrtHandler
{
public:
/// @param _onnx_path   path to the .onnx model file
/// @param _num_threads intra-op thread count forwarded to the session
explicit ONNXYOLO(const std::string& _onnx_path,
unsigned int _num_threads = 1);
/// Overload selecting the execution backend explicitly
/// (EngineType is declared in ANSEngineCommon.h — e.g. CPU/GPU/DML).
explicit ONNXYOLO(const std::string& _onnx_path,
EngineType engineType,
unsigned int _num_threads = 1);
~ONNXYOLO() override = default;
/// Run inference on a single image.
/// Auto-detects the task type (detect/segment/obb/pose/classify)
/// from the ONNX model's output tensor shapes.
/// @param image         input image (BGR, per the class-header notes)
/// @param classNames    label strings indexed by class id
/// @param confThreshold minimum confidence to keep a candidate
/// @param iouThreshold  NMS IoU threshold (legacy heads only)
/// @param numKPS        keypoint count for pose models; 0 otherwise
/// @return one Object per surviving detection (empty on no hits)
std::vector<Object> detect(const cv::Mat& image,
const std::vector<std::string>& classNames,
float confThreshold = 0.25f,
float iouThreshold = 0.45f,
int numKPS = 0);
/// True after detect() if the last inference was classification.
/// NOTE(review): public mutable flag — callers must read it
/// immediately after detect(); not safe across concurrent calls.
bool lastWasClassification = false;
/// Run batched inference on multiple images in a single ONNX session call.
/// Falls back to sequential detect() if the model has fixed batch=1.
/// @return one result vector per input image, in input order
std::vector<std::vector<Object>> detectBatch(
const std::vector<cv::Mat>& images,
const std::vector<std::string>& classNames,
float confThreshold = 0.25f,
float iouThreshold = 0.45f,
int numKPS = 0);
/// True after detectBatch() if the batch was classification.
/// Same single-reader caveat as lastWasClassification above.
bool lastBatchWasClassification = false;
/// Override the input image shape for dynamic-input models.
/// Call after construction when the model config specifies a
/// different resolution than the default 640x640.
void setInputShape(int width, int height) {
inputImageShape = cv::Size(width, height);
}
/// True if the ONNX model has dynamic spatial dimensions.
bool hasDynamicInputShape() const { return isDynamicInputShape; }
private:
/// BasicOrtHandler hook: convert one preprocessed image into the
/// session's input Ort::Value (letterbox + BGR→RGB + /255 + CHW,
/// per the class-header notes; exact steps live in the .cpp).
Ort::Value transform(const cv::Mat& mat) override;
/// Batched counterpart of transform(): packs N images into one
/// [N,C,H,W] input tensor.
Ort::Value transformBatch(const std::vector<cv::Mat>& images) override;
// ── Ultralytics-compatible letterbox ────────────────────────────
/// Resize `image` into `outImage` at `newShape` preserving aspect
/// ratio, padding with `color` (114 grey) to a multiple of `stride`.
/// scaleUp=false forbids enlarging small images (Ultralytics default
/// behaviour when scaleup is disabled).
void letterBox(const cv::Mat& image, cv::Mat& outImage,
const cv::Size& newShape,
const cv::Scalar& color = cv::Scalar(114, 114, 114),
bool scaleUp = true,
int stride = 32);
// ── Detection postprocess ───────────────────────────────────────
// End-to-end heads ([B,300,6]) are already NMS-ed in-graph, so only
// a confidence filter + coordinate un-letterboxing is needed.
std::vector<Object> postprocessEndToEnd(
const cv::Size& originalImageSize, const cv::Size& resizedImageShape,
std::vector<Ort::Value>& outputTensors,
const std::vector<std::string>& classNames, float confThreshold);
// Legacy heads ([B, nc+4, N]) need decoding + host-side NMS; maxDet
// caps the survivors (300 matches Ultralytics' default max_det).
std::vector<Object> postprocessLegacy(
const cv::Size& originalImageSize, const cv::Size& resizedImageShape,
std::vector<Ort::Value>& outputTensors,
const std::vector<std::string>& classNames,
float confThreshold, float iouThreshold, int maxDet = 300);
// ── OBB postprocess ─────────────────────────────────────────────
// Same end2end/legacy split as detection, with rotated boxes and
// Prob-IoU NMS (helpers below) for the legacy path.
std::vector<Object> postprocessOBBEndToEnd(
const cv::Size& originalImageSize, const cv::Size& resizedImageShape,
std::vector<Ort::Value>& outputTensors,
const std::vector<std::string>& classNames, float confThreshold);
std::vector<Object> postprocessOBBLegacy(
const cv::Size& originalImageSize, const cv::Size& resizedImageShape,
std::vector<Ort::Value>& outputTensors,
const std::vector<std::string>& classNames,
float confThreshold, float iouThreshold, int maxDet = 300);
// ── Segmentation postprocess ────────────────────────────────────
// Expects a second "protos" output tensor alongside the detections
// (see the shape table in the class-header comment).
std::vector<Object> postprocessSegEndToEnd(
const cv::Size& originalImageSize, const cv::Size& resizedImageShape,
std::vector<Ort::Value>& outputTensors,
const std::vector<std::string>& classNames, float confThreshold);
std::vector<Object> postprocessSegLegacy(
const cv::Size& originalImageSize, const cv::Size& resizedImageShape,
std::vector<Ort::Value>& outputTensors,
const std::vector<std::string>& classNames,
float confThreshold, float iouThreshold, int maxDet = 300);
// ── Pose postprocess ────────────────────────────────────────────
// numKPS keypoints, 3 channels each (x, y, score) per the header
// shape notes ([B,300,6+nk*3]).
std::vector<Object> postprocessPoseEndToEnd(
const cv::Size& originalImageSize, const cv::Size& resizedImageShape,
std::vector<Ort::Value>& outputTensors,
const std::vector<std::string>& classNames,
float confThreshold, int numKPS);
std::vector<Object> postprocessPoseLegacy(
const cv::Size& originalImageSize, const cv::Size& resizedImageShape,
std::vector<Ort::Value>& outputTensors,
const std::vector<std::string>& classNames,
float confThreshold, float iouThreshold, int numKPS, int maxDet = 300);
// ── Classification postprocess ──────────────────────────────────
// [B, nc] logits/probabilities → single best-class Object; no
// geometry, so only the original imageSize is needed.
std::vector<Object> postprocessClassify(
std::vector<Ort::Value>& outputTensors,
const std::vector<std::string>& classNames,
const cv::Size& imageSize);
// ── OBB NMS helpers (Prob-IoU based) ────────────────────────────
// Rotated box in center/size/angle form. NOTE(review): angle units
// (radians vs degrees) are not visible from this header — confirm
// against the .cpp before reusing OrientedBox elsewhere.
struct OrientedBox {
float x, y, width, height, angle;
};
// Covariance-matrix components (a, b, c) of a rotated box, the
// intermediate quantities Prob-IoU is computed from.
static void getCovarianceComponents(const OrientedBox& box,
float& out1, float& out2, float& out3);
// Pairwise probabilistic IoU matrix: result[i][j] = ProbIoU of
// obb1[i] vs obb2[j]; eps guards divisions/logs near zero.
static std::vector<std::vector<float>> batchProbiou(
const std::vector<OrientedBox>& obb1,
const std::vector<OrientedBox>& obb2, float eps = 1e-7f);
// Core greedy suppression over boxes already sorted by score;
// returns indices (into sortedBoxes) of the kept boxes.
static std::vector<int> nmsRotatedImpl(
const std::vector<OrientedBox>& sortedBoxes, float iouThreshold);
// Convenience wrapper: sorts by `scores` then delegates to
// nmsRotatedImpl; returned indices refer to the original `boxes`.
static std::vector<int> nmsRotated(
const std::vector<OrientedBox>& boxes,
const std::vector<float>& scores, float iouThreshold);
// Four corner points of an oriented box, for drawing/contours.
static std::vector<cv::Point2f> OBBToPoints(const OrientedBox& obb);
// ── Batch output slicing helper ────────────────────────────────
// Extract batch element `batchIndex` from a batched output tensor
// so the single-image postprocessors can be reused per image.
// NOTE(review): whether this copies or aliases the batch buffer is
// not visible here — check the .cpp before holding the result past
// the lifetime of `batchTensor`.
static Ort::Value sliceBatchOutput(
Ort::Value& batchTensor,
int64_t batchIndex,
const std::vector<int64_t>& fullShape,
Ort::MemoryInfo& memInfo);
// Cached model input resolution (default 640x640 per setInputShape's
// contract); read by the preprocessing path.
cv::Size inputImageShape;
// Set when the model declares dynamic spatial dims; exposed via
// hasDynamicInputShape().
bool isDynamicInputShape{ false };
};
// ====================================================================
// ANSONNXYOLO — ANSODBase wrapper for Ultralytics YOLO ONNX
//
// Compatible with all Ultralytics YOLO tasks and versions.
// Implements all required ANSODBase interfaces.
// ====================================================================
/// ANSODBase-conforming facade over ONNXYOLO: handles model loading
/// (from zip or folder), threshold configuration, and DirectML
/// device-lost recovery, delegating actual inference to m_ortEngine.
class ANSENGINE_API ANSONNXYOLO : public ANSODBase {
public:
/// Full initialisation: license check, config, encrypted model zip,
/// and label-map output (semantics defined by ANSODBase — this
/// header only declares the override).
bool Initialize(std::string licenseKey, ModelConfig modelConfig,
const std::string& modelZipFilePath,
const std::string& modelZipPassword,
std::string& labelMap) override;
/// (Re)load the model from a password-protected zip archive.
bool LoadModel(const std::string& modelZipFilePath,
const std::string& modelZipPassword) override;
/// Load an unpacked model directly from a folder on disk.
bool LoadModelFromFolder(std::string licenseKey, ModelConfig modelConfig,
std::string modelName, std::string className,
const std::string& modelFolder,
std::string& labelMap) override;
/// Produce an optimized model (fp16 optionally); the output folder
/// path is returned through optimizedModelFolder.
bool OptimizeModel(bool fp16, std::string& optimizedModelFolder) override;
/// Single-image inference; overload without a camera id.
/// NOTE(review): these two overloads are not marked override —
/// presumably non-virtual convenience entry points; confirm the
/// base-class signatures if that is unintended.
std::vector<Object> RunInference(const cv::Mat& input);
std::vector<Object> RunInference(const cv::Mat& input, const std::string& camera_id);
/// Batched inference over several frames from one camera.
std::vector<std::vector<Object>> RunInferencesBatch(
const std::vector<cv::Mat>& inputs,
const std::string& camera_id) override;
/// Tear down the engine/session. NOTE(review): not marked override
/// and the destructor lacks override too — verify ANSODBase has a
/// virtual destructor so delete-through-base is safe.
bool Destroy();
~ANSONNXYOLO();
private:
// Resolved path of the .onnx file after extraction/loading.
std::string _modelFilePath;
// Guards inference: stays false until a model loads successfully.
bool _modelLoadValid{ false };
// Filter thresholds (defaults mirror ONNXYOLO::detect's defaults)
float PROBABILITY_THRESHOLD{ 0.25f };   // confidence cutoff
float NMS_THRESHOLD{ 0.45f };           // IoU threshold for NMS
int TOP_K{ 300 };                       // max detections kept
// Pose estimation
int NUM_KPS{ 0 };                       // keypoints per object; 0 = not pose
float KPS_THRESHOLD{ 0.5f };            // per-keypoint score cutoff
// ONNX Runtime inference engine
std::unique_ptr<ONNXYOLO> m_ortEngine;
// DML device-lost recovery: when DirectML's GPU device is removed
// (HRESULT 887A0005), the session is permanently broken. We detect
// this once, attempt a CPU-fallback recreation, and suppress further
// error-log flooding.
bool _dmlDeviceLost{ false };
// Internal detection pipeline
std::vector<Object> DetectObjects(const cv::Mat& inputImage,
const std::string& camera_id);
// Internal batch detection pipeline
std::vector<std::vector<Object>> DetectObjectsBatch(
const std::vector<cv::Mat>& inputImages,
const std::string& camera_id);
// Initialise ORT engine from the resolved model path
bool InitOrtEngine();
public:
// Initialise ORT engine with explicit engine type override (e.g. CPU fallback for AMD iGPUs)
bool InitOrtEngine(ANSCENTER::EngineType engineType);
};
}
#endif