// engines/ONNXEngine/ONNXEngine.h

#pragma once
#ifndef ONNXEngine_H
#define ONNXEngine_H

#include <deque>
#include <iostream>
#include <string>
#include <type_traits>
#include <typeinfo>
#include <unordered_map>
#include <vector>

#include "onnxruntime_cxx_api.h"
#include "opencv2/opencv.hpp"
#include "EPLoader.h"           // brings in EngineType via ANSLicenseHelper

// Character type of the model path pointer kept by BasicOrtHandler
// (`const LITEORT_CHAR* onnx_path`).
// NOTE(review): wchar_t presumably matches ONNX Runtime's path character type
// on Windows only - confirm before building for any other platform.
#define LITEORT_CHAR wchar_t

// Import/export decoration for classes exposed from the engine library:
// dllexport when ENGINE_EXPORTS is defined (building the library), dllimport
// otherwise (consuming it). __declspec is an MSVC-style extension.
#ifdef ENGINE_EXPORTS
#define ONNXENGINE_API __declspec(dllexport)
#else
#define ONNXENGINE_API __declspec(dllimport)
#endif

namespace ANSCENTER {

    // ====================================================================
    // types
    // ====================================================================
    namespace types {

        // Compile-time guard for the coordinate/score types used by
        // BoundingBoxType. Instantiating it fails to compile unless:
        //   - ValueT is an arithmetic type (integral or floating point),
        //   - ScoreT is a floating-point type,
        //   - both are standard-layout and trivially copyable.
        // The function body is empty at run time; calling it only forces the
        // static_assert to be evaluated for the given types.
        //
        // The template parameters were renamed from _T1/_T2 because identifiers
        // starting with an underscore followed by an uppercase letter are
        // reserved for the implementation.
        template<typename ValueT = float, typename ScoreT = float>
        static inline void __assert_type()
        {
            static_assert(
                std::is_standard_layout_v<ValueT> && std::is_trivially_copyable_v<ValueT>
                && std::is_standard_layout_v<ScoreT> && std::is_trivially_copyable_v<ScoreT>
                && std::is_floating_point_v<ScoreT>
                // is_arithmetic == is_integral || is_floating_point
                && std::is_arithmetic_v<ValueT>,
                "not support type.");
        }

        // Box described by two corner coordinates plus a score and a label.
        //
        // T1 is the coordinate type and T2 the score type. Constructing a box
        // instantiates types::__assert_type<T1, T2>(), so unsupported type
        // combinations are rejected at compile time.
        // NOTE(review): (x1, y1) / (x2, y2) are presumably the top-left and
        // bottom-right corners (cf. tl() / rb()) - confirm in the definitions of
        // those members, which are not part of this header view.
        template<typename T1 = float, typename T2 = float>
        struct BoundingBoxType
        {
            using value_type = T1;
            using score_type = T2;

            // Every field starts zeroed / null / false.
            value_type   x1 = 0;
            value_type   y1 = 0;
            value_type   x2 = 0;
            value_type   y2 = 0;
            score_type   score = 0;
            const char*  label_text = nullptr;  // raw pointer; no ownership is expressed here
            unsigned int label = 0;
            bool         flag = false;

            // User-provided (not `= default`) so the type stays a non-aggregate
            // and the type check below is instantiated with every box type.
            BoundingBoxType()
            {
                types::__assert_type<value_type, score_type>();
            }

            // Returns a copy of this box with coordinates cast to O1 and the
            // score cast to O2 (O2 defaults to the current score type).
            template<typename O1, typename O2 = score_type>
            BoundingBoxType<O1, O2> convert_type() const;

            // Intersection-over-union against another box, expressed in this
            // box's coordinate type. Declared here, defined elsewhere.
            template<typename O1, typename O2 = score_type>
            value_type iou_of(const BoundingBoxType<O1, O2>& other) const;

            // Geometry accessors; declared here, defined elsewhere.
            value_type    width()  const;
            value_type    height() const;
            value_type    area()   const;
            ::cv::Rect    rect()   const;
            ::cv::Point2i tl()     const;
            ::cv::Point2i rb()     const;
        };

        // Explicit instantiations for the three coordinate/score combinations
        // that receive aliases below.
        // NOTE(review): these are explicit instantiation *definitions* placed in
        // a header, so every translation unit that includes it repeats them.
        // Confirm this is intended; `extern template` here plus one definition
        // in a .cpp file is the usual arrangement.
        template class BoundingBoxType<int, float>;
        template class BoundingBoxType<float, float>;
        template class BoundingBoxType<double, double>;

        // Short names: i = int coordinates, f = float, d = double.
        using Boxi = BoundingBoxType<int, float>;
        using Boxf = BoundingBoxType<float, float>;
        using Boxd = BoundingBoxType<double, double>;

        // A list of 2D points together with a marker flag.
        struct LandmarksType {
            std::vector<cv::Point2f> points;
            bool flag = false;  // starts cleared; presumably marks a filled-in result - confirm in callers
            // User-provided on purpose: keeps the type a non-aggregate, as before.
            LandmarksType() {}
        };
        using Landmarks = LandmarksType;

        // Explicitly-2D name for the same type.
        using Landmarks2D = Landmarks;

        // A list of 3D points together with a marker flag.
        struct Landmarks3DType {
            std::vector<cv::Point3f> points;
            bool flag = false;  // starts cleared
            // User-provided on purpose: keeps the type a non-aggregate, as before.
            Landmarks3DType() {}
        };
        using Landmarks3D = Landmarks3DType;

        // A float bounding box bundled with its 2D landmarks.
        struct BoxfWithLandmarksType {
            Boxf      box;
            Landmarks landmarks;
            bool      flag = false;  // starts cleared; `box` and `landmarks` carry their own flags
            // User-provided on purpose: keeps the type a non-aggregate, as before.
            BoxfWithLandmarksType() {}
        };
        using BoxfWithLandmarks = BoxfWithLandmarksType;

        // Three rotation angles plus a marker flag.
        // NOTE(review): the angle unit (degrees vs radians) is not visible in
        // this header - confirm with the code that fills this struct.
        typedef struct EulerAnglesType {
            float yaw, pitch, roll;
            bool  flag;
            // Zero every field. Previously only `flag` was initialized, which
            // left the three angles indeterminate until a caller assigned them.
            EulerAnglesType() : yaw(0.f), pitch(0.f), roll(0.f), flag(false) {}
        } EulerAngles;

        // Emotion classification result: score, numeric label and label text.
        typedef struct EmotionsType {
            float        score;
            unsigned int label;
            const char* text;   // raw pointer; no ownership is expressed here
            bool         flag;
            // Initialize every field. Previously only `flag` was set, so
            // reading `score`, `label` or `text` from a default-constructed
            // object was undefined behavior.
            EmotionsType() : score(0.f), label(0u), text(nullptr), flag(false) {}
        } Emotions;

        // Age estimate: a scalar age, a two-element age interval and the
        // probability associated with that interval.
        typedef struct AgeType {
            float        age;
            unsigned int age_interval[2];
            float        interval_prob;
            bool         flag;
            // Initialize every field. Previously only `flag` was set, leaving
            // the numeric members indeterminate on default construction.
            AgeType() : age(0.f), age_interval{ 0u, 0u }, interval_prob(0.f), flag(false) {}
        } Age;

        // Gender classification result: score, numeric label and label text.
        typedef struct GenderType {
            float        score;
            unsigned int label;
            const char* text;   // raw pointer; no ownership is expressed here
            bool         flag;
            // Initialize every field. Previously only `flag` was set, so
            // reading `score`, `label` or `text` from a default-constructed
            // object was undefined behavior.
            GenderType() : score(0.f), label(0u), text(nullptr), flag(false) {}
        } Gender;

        // Face embedding vector plus its dimension.
        // NOTE(review): `dim` presumably mirrors embedding.size() - confirm with
        // the code that fills this struct.
        typedef struct FaceContentType {
            std::vector<float> embedding;
            unsigned int       dim;
            bool               flag;
            // `dim` is now zeroed as well; it used to be left indeterminate
            // while `embedding` started empty.
            FaceContentType() : dim(0u), flag(false) {}
        } FaceContent;

        // Segmentation result: a class matrix, a color matrix and a map from
        // integer id to name.
        struct SegmentContentType {
            cv::Mat class_mat;
            cv::Mat color_mat;
            std::unordered_map<int, std::string> names_map;
            bool flag = false;  // starts cleared
            // User-provided on purpose: keeps the type a non-aggregate, as before.
            SegmentContentType() {}
        };
        using SegmentContent = SegmentContentType;

        // Matting result made of three matrices.
        // NOTE(review): fgr/pha presumably stand for foreground and alpha -
        // confirm with the producing code.
        struct MattingContentType {
            cv::Mat fgr_mat;
            cv::Mat pha_mat;
            cv::Mat merge_mat;
            bool flag = false;  // starts cleared
            // User-provided on purpose: keeps the type a non-aggregate, as before.
            MattingContentType() {}
        };
        using MattingContent = MattingContentType;

        // A single mask matrix plus a marker flag. Also reused under several
        // task-specific alias names further down in this namespace.
        struct SegmentationMaskContentType {
            cv::Mat mask;
            bool flag = false;  // starts cleared
            // User-provided on purpose: keeps the type a non-aggregate, as before.
            SegmentationMaskContentType() {}
        };
        using SegmentationMaskContent = SegmentationMaskContentType;

        // Classification result held as three parallel lists: scores, label
        // texts and numeric labels.
        struct ImageNetContentType {
            std::vector<float>        scores;
            std::vector<const char*>  texts;   // raw pointers; no ownership is expressed here
            std::vector<unsigned int> labels;
            bool flag = false;  // starts cleared
            // User-provided on purpose: keeps the type a non-aggregate, as before.
            ImageNetContentType() {}
        };
        using ImageNetContent = ImageNetContentType;

        // Task-neutral name for the same type.
        using ClassificationContent = ImageNetContent;

        // Style-transfer result: one output matrix plus a marker flag.
        struct StyleContentType {
            cv::Mat mat;
            bool flag = false;  // starts cleared
            // User-provided on purpose: keeps the type a non-aggregate, as before.
            StyleContentType() {}
        };
        using StyleContent = StyleContentType;

        // Super-resolution result: one output matrix plus a marker flag.
        struct SuperResolutionContentType {
            cv::Mat mat;
            bool flag = false;  // starts cleared
            // User-provided on purpose: keeps the type a non-aggregate, as before.
            SuperResolutionContentType() {}
        };
        using SuperResolutionContent = SuperResolutionContentType;

        // Face-parsing result: a label matrix and a merged matrix.
        struct FaceParsingContentType {
            cv::Mat label;
            cv::Mat merge;
            bool flag = false;  // starts cleared
            // User-provided on purpose: keeps the type a non-aggregate, as before.
            FaceParsingContentType() {}
        };
        using FaceParsingContent = FaceParsingContentType;

        // Task-specific names that all refer to the same single-mask result type.
        using HairSegContent     = SegmentationMaskContent;
        using HeadSegContent     = SegmentationMaskContent;
        using FaceHairSegContent = SegmentationMaskContent;
        using PortraitSegContent = SegmentationMaskContent;

    } // namespace types

    // ====================================================================
    // utils
    // ====================================================================
    // Image-to-tensor helpers. Only declarations live in this header; the
    // definitions are elsewhere, so the notes below describe the signatures and
    // mark anything inferred from names as an assumption.
    namespace utils {
        namespace transform {

            // Values accepted by the `data_format` parameters below.
            // NOTE(review): presumably channel-first (CHW) versus channel-last
            // (HWC) layout - confirm against the implementation.
            enum { CHW = 0, HWC = 1 };

            // Returns an Ort::Value for a single image.
            //   mat                  - source image
            //   tensor_dims          - dimensions of the tensor to create
            //   memory_info_handler  - ORT memory info used for the tensor
            //   tensor_value_handler - caller-provided float buffer
            //   data_format          - CHW (default) or HWC
            // NOTE(review): the returned value presumably refers to
            // tensor_value_handler's storage, in which case that vector must
            // outlive the tensor - confirm in the implementation.
            Ort::Value create_tensor(
                const cv::Mat& mat,
                const std::vector<int64_t>& tensor_dims,
                const Ort::MemoryInfo& memory_info_handler,
                std::vector<float>& tensor_value_handler,
                unsigned int data_format = CHW);

            // Same parameter set as create_tensor, but takes a vector of images.
            Ort::Value create_tensor_batch(
                const std::vector<cv::Mat>& batch_mats,
                const std::vector<int64_t>& tensor_dims,
                const Ort::MemoryInfo& memory_info_handler,
                std::vector<float>& tensor_value_handler,
                unsigned int data_format = CHW);

            // Returns an Ort::Value built from a deque of frames. There is no
            // data_format parameter on this overload.
            // NOTE(review): the name suggests a 5-dimensional tensor; the
            // expected order of tensor_dims is not visible here - confirm.
            Ort::Value create_video_tensor_5d(
                const std::deque<cv::Mat>& frames,
                const std::vector<int64_t>& tensor_dims,
                const Ort::MemoryInfo& memory_info_handler,
                std::vector<float>& tensor_value_handler);

            // Normalization helpers. Overloads take either one mean/scale pair
            // or three-element mean/scale arrays, and either return a new
            // matrix, write to `outmat`, or modify the argument in place.
            // NOTE(review): the exact formula (e.g. (x - mean) * scale) is not
            // visible in this header - confirm in the implementation.
            cv::Mat normalize(const cv::Mat& mat, float mean, float scale);
            cv::Mat normalize(const cv::Mat& mat, const float mean[3], const float scale[3]);
            void    normalize(const cv::Mat& inmat, cv::Mat& outmat, float mean, float scale);
            void    normalize_inplace(cv::Mat& mat_inplace, float mean, float scale);
            void    normalize_inplace(cv::Mat& mat_inplace, const float mean[3], const float scale[3]);

        } // namespace transform
    } // namespace utils

    // ====================================================================
    // Helpers
    // ====================================================================
    // Returns the name of the session's input at `index` as an owning
    // std::string. `allocator` is forwarded to GetInputNameAllocated.
    // `ort_session` is dereferenced unconditionally and must not be null.
    inline static std::string OrtCompatiableGetInputName(
        size_t index, OrtAllocator* allocator, Ort::Session* ort_session)
    {
        // Hold the allocated name until the copy into std::string is made.
        const auto allocated_name = ort_session->GetInputNameAllocated(index, allocator);
        return std::string(allocated_name.get());
    }

    // Returns the name of the session's output at `index` as an owning
    // std::string. `allocator` is forwarded to GetOutputNameAllocated.
    // `ort_session` is dereferenced unconditionally and must not be null.
    inline static std::string OrtCompatiableGetOutputName(
        size_t index, OrtAllocator* allocator, Ort::Session* ort_session)
    {
        // Hold the allocated name until the copy into std::string is made.
        const auto allocated_name = ort_session->GetOutputNameAllocated(index, allocator);
        return std::string(allocated_name.get());
    }

    // ====================================================================
    // High-perf options for OCR sub-models that need TRT EP and full
    // cuDNN workspace.  Default-constructed = identical to the legacy
    // behavior (CUDA EP only, minimal cuDNN workspace).
    // ====================================================================
    struct OrtHandlerOptions {
        // Request the TensorRT EP ahead of the CUDA EP (NVIDIA only). When
        // TRT EP creation or session creation fails, the handler falls back
        // to the CUDA EP automatically. Built engines are cached on disk so
        // later loads are fast.
        bool preferTensorRT{ false };

        // Let cuDNN use its largest conv workspace so it can choose the fast
        // algorithms (Winograd, implicit-precomp-GEMM with big workspaces).
        // Off by default: some deployments share VRAM with TRT engines and
        // rely on the minimal-workspace mode to avoid running out of memory.
        bool useMaxCudnnWorkspace{ false };

        // Directory for cached TRT engines. Leave empty to use the default,
        // %TEMP%/ANSCENTER/TRTEngineCache. Consulted only when
        // preferTensorRT is set.
        std::string trtEngineCacheDir{};

        // Build TRT EP engines in FP16. Recommended for inference; has no
        // effect when preferTensorRT is false.
        bool trtFP16{ true };

        // Dynamic-shape profile for the TRT EP. With a profile, TRT builds a
        // single engine covering every input shape in the [min..max] range
        // rather than rebuilding for each new shape, which matters for
        // models that see many (batch_size, spatial) combinations at runtime.
        //
        // Format: "input_name:d0xd1xd2xd3[,input2:...]"
        //   e.g. "x:1x3x48x320"  for batch=1, C=3, H=48, W=320
        //
        // Set all three strings together. An empty min string means "no
        // profile" (static-shape-per-unique-input mode is used instead).
        std::string trtProfileMinShapes{};
        std::string trtProfileOptShapes{};
        std::string trtProfileMaxShapes{};
    };

    // ====================================================================
    // BasicOrtHandler
    // ====================================================================
    // Common base of the ONNX Runtime model wrappers declared in this header
    // (SCRFD, GlintArcFace, GlintCosFace and MOVINET derive from it).
    //
    // Constructors and the destructor are protected and copying is deleted, so
    // the class is only usable through a subclass, which must implement
    // transform() and transformBatch().
    //
    // NOTE(review): member order fixes both the object layout of this exported
    // class and the constructor initialization order - do not reorder members
    // without checking the definitions in the .cpp file.
    class ONNXENGINE_API BasicOrtHandler
    {
    protected:

        // --- Model input description ------------------------------------
        // NOTE(review): the `const char*` name vector presumably points into
        // the std::string vector with the trailing underscore - confirm in
        // initialize_handler().
        const char* input_name = nullptr;
        std::vector<const char*> input_node_names;
        std::vector<std::string> input_node_names_;
        std::vector<int64_t>     input_node_dims;
        std::size_t              input_tensor_size = 1;
        std::vector<float>       input_values_handler;

        // --- Model output description -----------------------------------
        std::vector<const char*>              output_node_names;
        std::vector<std::string>              output_node_names_;
        std::vector<std::vector<int64_t>>     output_node_dims;
        int                 num_outputs = 1;

        // --- ONNX Runtime objects ---------------------------------------
        // Raw pointers, null until set up.
        // NOTE(review): presumably created during construction and released
        // in the destructor - confirm in the .cpp file.
        Ort::Env* ort_env = nullptr;
        Ort::Session* ort_session = nullptr;
        Ort::MemoryInfo* memory_info_handler = nullptr;

        // --- Model path and logging -------------------------------------
        std::wstring        onnx_path_w;          // owns the wide-string path storage
        const LITEORT_CHAR* onnx_path = nullptr;  // points into onnx_path_w
        const char* log_id = nullptr;


    protected:
        // Thread count passed at construction. It is const, so every
        // constructor has to initialize it.
        const unsigned int num_threads;
        // Has no in-class initializer; must be set by every constructor.
        EngineType m_engineType;

        // Per-session high-perf options. Default = legacy behavior.
        OrtHandlerOptions m_handlerOptions;

    protected:
        // Default: hardware auto-detection via ANSLicenseHelper through EPLoader
        explicit BasicOrtHandler(const std::string& _onnx_path,
            unsigned int _num_threads = 1);

        // Explicit engine override per-session
        explicit BasicOrtHandler(const std::string& _onnx_path,
            EngineType engineType,
            unsigned int _num_threads = 1);

        // Engine override + per-session high-perf options (TRT EP, max
        // cuDNN workspace, etc.).  Used by OCR sub-models that need
        // shape-stable, high-throughput inference.
        explicit BasicOrtHandler(const std::string& _onnx_path,
            EngineType engineType,
            const OrtHandlerOptions& options,
            unsigned int _num_threads = 1);

        // Auto-detect engine via EPLoader, but with high-perf options.
        explicit BasicOrtHandler(const std::string& _onnx_path,
            const OrtHandlerOptions& options,
            unsigned int _num_threads = 1);

        // Virtual and protected: instances cannot be destroyed through a
        // BasicOrtHandler pointer from outside the hierarchy.
        virtual ~BasicOrtHandler();

        // Non-copyable.
        BasicOrtHandler(const BasicOrtHandler&) = delete;
        BasicOrtHandler& operator=(const BasicOrtHandler&) = delete;
    public:
        // Resolved EP type (after EPLoader fallback). Subclasses use this
        // to branch on actual EP at inference time.
        EngineType getEngineType() const { return m_engineType; }

        // Spin up a tiny CPU-only ORT session just long enough to read
        // the name of the model's first input, then tear it down. Used
        // by callers that need to build TRT profile-shape strings
        // (which require the input name) BEFORE the real session is
        // created. Returns an empty string on failure.
        static std::string QueryModelInputName(const std::string& onnxPath);
    private:
        // Shared setup routine; defined in the .cpp file.
        // NOTE(review): presumably invoked by each constructor - confirm.
        void initialize_handler();
    protected:
        // Preprocessing hooks every subclass must provide: turn one image,
        // or a batch of images, into an Ort::Value.
        virtual Ort::Value transform(const cv::Mat& mat) = 0;
        virtual Ort::Value transformBatch(const std::vector<cv::Mat>& images) = 0;

        // EP-specific session option builders
        // NOTE(review): the bool result presumably reports whether the EP was
        // appended to `opts` - confirm in the definitions.
        bool TryAppendCUDA(Ort::SessionOptions& opts);
        bool TryAppendTensorRT(Ort::SessionOptions& opts);
        bool TryAppendDirectML(Ort::SessionOptions& opts);
        bool TryAppendOpenVINO(Ort::SessionOptions& opts);
    };

    // ====================================================================
    // SCRFD — face detection
    // ====================================================================
    // SCRFD face detector built on BasicOrtHandler. Only declarations are
    // visible here; the method bodies live in the .cpp file.
    class SCRFD : public BasicOrtHandler
    {
    public:
        // Two construction paths mirroring BasicOrtHandler: engine chosen by
        // the base class, or engine given explicitly by the caller.
        explicit SCRFD(const std::string& _onnx_path,unsigned int _num_threads = 1);
        explicit SCRFD(const std::string& _onnx_path,EngineType engineType,unsigned int _num_threads = 1);
        ~SCRFD() override = default;

        // Runs detection on `mat`; results are delivered through
        // `detected_boxes_kps` (non-const reference).
        //   score_threshold - defaults to 0.3
        //   iou_threshold   - defaults to 0.45
        //   topk            - defaults to 400
        // NOTE(review): whether the output vector is cleared or appended to is
        // not visible in this header - confirm in the implementation.
        void detect(const cv::Mat& mat,
            std::vector<types::BoxfWithLandmarks>& detected_boxes_kps,
            float score_threshold = 0.3f,
            float iou_threshold = 0.45f,
            unsigned int topk = 400);

    private:
        // One anchor center with the stride it belongs to.
        typedef struct { float cx, cy, stride; } SCRFDPoint;
        // Resize bookkeeping filled by resize_unscale().
        // NOTE(review): dw/dh are presumably padding offsets - confirm.
        typedef struct { float ratio; int dw, dh; bool flag; } SCRFDScaleParams;

        // Per-channel normalization constants (mean 127.5, scale 1/128).
        const float mean_vals[3] = { 127.5f, 127.5f, 127.5f };
        const float scale_vals[3] = { 1.f / 128.f, 1.f / 128.f, 1.f / 128.f };

        // Model-structure settings with their defaults.
        // NOTE(review): presumably refined from the loaded model in
        // initial_context() - confirm.
        unsigned int fmc = 3;
        bool         use_kps = false;
        unsigned int num_anchors = 2;
        std::vector<int> feat_stride_fpn = { 8, 16, 32 };
        // Anchor centers keyed by an int (presumably the stride), with a flag
        // recording whether they have been generated.
        std::unordered_map<int, std::vector<SCRFDPoint>> center_points;
        bool center_points_is_update = false;

        // Fixed limits used by the NMS stage.
        static constexpr unsigned int nms_pre = 1000;
        static constexpr unsigned int max_nms = 30000;

        // BasicOrtHandler preprocessing hooks.
        Ort::Value transform(const cv::Mat& mat_rs) override;
        Ort::Value transformBatch(const std::vector<cv::Mat>& images) override;

        void initial_context();
        // Resizes `mat` into `mat_rs` for the target size and records the
        // applied scaling in `scale_params`.
        void resize_unscale(const cv::Mat& mat, cv::Mat& mat_rs,
            int target_height, int target_width,
            SCRFDScaleParams& scale_params);
        void generate_points(int target_height, int target_width);

        // Decodes the model outputs into `bbox_kps_collection`.
        void generate_bboxes_kps(const SCRFDScaleParams& scale_params,
            std::vector<types::BoxfWithLandmarks>& bbox_kps_collection,
            std::vector<Ort::Value>& output_tensors,
            float score_threshold,
            float img_height, float img_width);

        // Per-stride decoding without keypoints.
        void generate_bboxes_single_stride(
            const SCRFDScaleParams& scale_params,
            Ort::Value& score_pred, Ort::Value& bbox_pred,
            unsigned int stride, float score_threshold,
            float img_height, float img_width,
            std::vector<types::BoxfWithLandmarks>& bbox_kps_collection);

        // Per-stride decoding with keypoints (takes the extra kps_pred tensor).
        void generate_bboxes_kps_single_stride(
            const SCRFDScaleParams& scale_params,
            Ort::Value& score_pred, Ort::Value& bbox_pred, Ort::Value& kps_pred,
            unsigned int stride, float score_threshold,
            float img_height, float img_width,
            std::vector<types::BoxfWithLandmarks>& bbox_kps_collection);

        // Non-maximum suppression from `input` into `output`. `input` is a
        // non-const reference, so it may be modified (e.g. sorted) - confirm.
        void nms_bboxes_kps(std::vector<types::BoxfWithLandmarks>& input,
            std::vector<types::BoxfWithLandmarks>& output,
            float iou_threshold, unsigned int topk);
    };

    // ====================================================================
    // GlintArcFace — face recognition
    // ====================================================================
    // Face-recognition (embedding) wrapper on top of BasicOrtHandler.
    // The constructors only forward to the base class; detect()/detectBatch()
    // and the preprocessing overrides are defined in the .cpp file.
    class GlintArcFace : public BasicOrtHandler
    {
    public:
        // Engine type is resolved by the base-class auto-detecting constructor.
        explicit GlintArcFace(const std::string& _onnx_path, unsigned int _num_threads = 1)
            : BasicOrtHandler(_onnx_path, _num_threads) {}

        // Engine type is supplied by the caller.
        explicit GlintArcFace(const std::string& _onnx_path,
                              EngineType engineType,
                              unsigned int _num_threads = 1)
            : BasicOrtHandler(_onnx_path, engineType, _num_threads) {}

        ~GlintArcFace() override = default;

        // Single-image entry point; the result is delivered through
        // `face_content`.
        void detect(const cv::Mat& mat, types::FaceContent& face_content);

        // Batch entry point; results are delivered through `face_contents`.
        void detectBatch(const std::vector<cv::Mat>& images,
                         std::vector<types::FaceContent>& face_contents);

    private:
        // Normalization constants: mean 127.5 and its reciprocal as scale.
        static constexpr float mean_val = 127.5f;
        static constexpr float scale_val = 1.f / mean_val;

        // BasicOrtHandler preprocessing hooks.
        Ort::Value transform(const cv::Mat& mat) override;
        Ort::Value transformBatch(const std::vector<cv::Mat>& images) override;
    };

    // ====================================================================
    // GlintCosFace — face recognition
    // ====================================================================
    // Face-recognition (embedding) wrapper on top of BasicOrtHandler.
    // The constructors only forward to the base class; detect()/detectBatch()
    // and the preprocessing overrides are defined in the .cpp file.
    class GlintCosFace : public BasicOrtHandler
    {
    public:
        // Engine type is resolved by the base-class auto-detecting constructor.
        explicit GlintCosFace(const std::string& _onnx_path, unsigned int _num_threads = 1)
            : BasicOrtHandler(_onnx_path, _num_threads) {}

        // Engine type is supplied by the caller.
        explicit GlintCosFace(const std::string& _onnx_path,
                              EngineType engineType,
                              unsigned int _num_threads = 1)
            : BasicOrtHandler(_onnx_path, engineType, _num_threads) {}

        ~GlintCosFace() override = default;

        // Single-image entry point; the result is delivered through
        // `face_content`.
        void detect(const cv::Mat& mat, types::FaceContent& face_content);

        // Batch entry point; results are delivered through `face_contents`.
        void detectBatch(const std::vector<cv::Mat>& images,
                         std::vector<types::FaceContent>& face_contents);

    private:
        // Normalization constants: mean 127.5 and its reciprocal as scale.
        static constexpr float mean_val = 127.5f;
        static constexpr float scale_val = 1.f / mean_val;

        // BasicOrtHandler preprocessing hooks.
        Ort::Value transform(const cv::Mat& mat) override;
        Ort::Value transformBatch(const std::vector<cv::Mat>& images) override;
    };

    // ====================================================================
    // MOVINET — action recognition
    // ====================================================================
    // MoViNet action-recognition wrapper on top of BasicOrtHandler. It works
    // on a deque of frames rather than on single images. Only declarations
    // are visible here; the bodies live in the .cpp file.
    class MOVINET : public BasicOrtHandler
    {
    public:
        // Default input geometry (see InputConfig), engine chosen by the base.
        explicit MOVINET(const std::string& _onnx_path,
            unsigned int _num_threads = 1);

        // Caller-supplied input geometry, engine chosen by the base.
        explicit MOVINET(const std::string& _onnx_path,
            int _temporal, int _width, int _height, int _channels = 3,
            unsigned int _num_threads = 1);

        // Default input geometry, engine given explicitly.
        explicit MOVINET(const std::string& _onnx_path,
            EngineType engineType,
            unsigned int _num_threads = 1);

        // Caller-supplied input geometry, engine given explicitly.
        explicit MOVINET(const std::string& _onnx_path,
            EngineType engineType,
            int _temporal, int _width, int _height, int _channels = 3,
            unsigned int _num_threads = 1);

        ~MOVINET() override = default;

        // Runs the model on `frames`; the result is delivered through
        // `out_result` as an (int, float) pair.
        // NOTE(review): presumably (class index, score), matching
        // post_processing() - confirm in the implementation.
        void inference(const std::deque<cv::Mat>& frames,
            std::pair<int, float>& out_result);

    private:
        // Input geometry defaults: 16 frames of 172x172 with 3 channels.
        struct InputConfig {
            int temporal = 16;
            int width = 172;
            int height = 172;
            int channels = 3;
        } input_params;

        // Output description: number of classes, 2 by default.
        struct OutputConfig {
            int num_classes = 2;
        } output_params;

        // NOTE(review): these two names start with an underscore followed by
        // an uppercase letter, which is a reserved identifier form; renaming
        // them requires touching the .cpp file as well.
        std::string          _MoviNetInputName;
        std::string          _MoviNetOutputName;
        std::vector<float>   input_tensor_values;

        void init_io_names();

        // Frame-sequence preprocessing. This is an additional overload, not an
        // override of the base-class hooks.
        Ort::Value transform(const std::deque<cv::Mat>& frames);
        std::pair<int, float> post_processing(const float* pOutput);

        // Required by BasicOrtHandler pure virtuals
        Ort::Value transform(const cv::Mat& mat) override;
        Ort::Value transformBatch(const std::vector<cv::Mat>& images) override;
    };

    // ====================================================================
    // BoundingBoxType template implementations
    // ====================================================================
    // Returns a copy of this box in another coordinate/score type.
    // Coordinates are static_cast to O1 and the score to O2; the label text
    // pointer, numeric label and flag are carried over unchanged. Both the
    // source and the destination type pairs are checked at compile time.
    template<typename T1, typename T2>
    template<typename O1, typename O2>
    inline ANSCENTER::types::BoundingBoxType<O1, O2>
        ANSCENTER::types::BoundingBoxType<T1, T2>::convert_type() const
    {
        types::__assert_type<value_type, score_type>();
        types::__assert_type<O1, O2>();

        BoundingBoxType<O1, O2> converted;

        // Geometry and score: plain static_cast, so a floating-point to
        // integer conversion truncates toward zero.
        converted.x1    = static_cast<O1>(x1);
        converted.y1    = static_cast<O1>(y1);
        converted.x2    = static_cast<O1>(x2);
        converted.y2    = static_cast<O1>(y2);
        converted.score = static_cast<O2>(score);

        // Metadata is type-independent and copied as is.
        converted.label_text = label_text;
        converted.label      = label;
        converted.flag       = flag;

        return converted;
    }

} // namespace ANSCENTER

#endif // ONNXEngine_H