#ifndef ANSRTYOLO_H
#define ANSRTYOLO_H
#pragma once
#include "ANSEngineCommon.h"
#include "engine.h"
#include "ANSGpuFrameRegistry.h"
#include "NV12PreprocessHelper.h"
#include "engine/EnginePoolManager.h"

namespace ANSCENTER {

    // ====================================================================
    // ANSRTYOLO — TensorRT-based Ultralytics YOLO inference
    //
    // Compatible with ALL Ultralytics YOLO tasks and versions
    // (v8, v9, v10, v11, v26+) by auto-detecting the task type
    // from output tensor shapes at inference time.
    //
    // Supported tasks:
    //   - Detection   : legacy [B, nc+4, N]  or  end2end [B, 300, 6]
    //   - OBB         : legacy [B, nc+5, N]  or  end2end [B, 300, 7]
    //   - Segmentation: legacy [B, nc+36, N]+protos  or  end2end [B, 300, 38]+protos
    //   - Pose        : legacy [B, nc+4+nk*3, N]  or  end2end [B, 300, 6+nk*3]
    //   - Classification: [B, nc]
    //
    // Uses Engine<float> with GPU preprocessing (cv::cuda::GpuMat),
    // supports multi-GPU pool via SetMaxSlotsPerGpu.
    // ====================================================================
    class ANSENGINE_API ANSRTYOLO : public ANSODBase {
    public:
        // ── Model lifecycle (ANSODBase overrides) ────────────────────────

        // Overrides ANSODBase::Initialize. The license key and model config
        // are taken by value, the archive path and password by const
        // reference, and labelMap by non-const reference.
        // NOTE(review): labelMap looks like an output parameter — confirm
        // against the implementation.
        bool Initialize(
            std::string licenseKey,
            ModelConfig modelConfig,
            const std::string& modelZipFilePath,
            const std::string& modelZipPassword,
            std::string& labelMap) override;

        // Overrides ANSODBase::LoadModel; takes only the archive path and
        // its password.
        bool LoadModel(
            const std::string& modelZipFilePath,
            const std::string& modelZipPassword) override;

        // Overrides ANSODBase::LoadModelFromFolder; same idea as Initialize
        // but addressed by folder plus model/class names instead of an
        // archive. labelMap is again a non-const reference.
        bool LoadModelFromFolder(
            std::string licenseKey,
            ModelConfig modelConfig,
            std::string modelName,
            std::string className,
            const std::string& modelFolder,
            std::string& labelMap) override;

        // Overrides ANSODBase::OptimizeModel. optimizedModelFolder is a
        // non-const reference.
        // NOTE(review): presumably receives the output folder — confirm.
        bool OptimizeModel(bool fp16, std::string& optimizedModelFolder) override;

        // ── Inference entry points ───────────────────────────────────────
        // The two single-image overloads are declared without `override`
        // in this header; whether they match base-class virtuals cannot be
        // determined from here. The batch variant is an explicit override.
        std::vector<Object> RunInference(const cv::Mat& input);
        std::vector<Object> RunInference(
            const cv::Mat& input,
            const std::string& camera_id);
        std::vector<std::vector<Object>> RunInferencesBatch(
            const std::vector<cv::Mat>& inputs,
            const std::string& camera_id) override;

        // ── Teardown ─────────────────────────────────────────────────────
        // Both are defined outside this header.
        bool Destroy();
        ~ANSRTYOLO();

    private:
        // Shorthand for the container types repeated throughout the
        // private interface below. These are plain aliases, so out-of-class
        // definitions spelled with the full type still match.
        using GpuInputBatch  = std::vector<std::vector<cv::cuda::GpuMat>>;
        using FeatureBuffer  = std::vector<float>;
        using FeatureBuffers = std::vector<FeatureBuffer>;

        // ── Model state ──────────────────────────────────────────────────
        std::string _modelFilePath;
        bool        _modelLoadValid{ false };
        bool        _fp16{ false };
        bool        _isFixedBatch{ false };

        // Slots per GPU for the engine pool. A value of 1 means one slot
        // per GPU with multi-GPU round-robin (no elastic growth).
        int         m_maxSlotsPerGpu{ 1 };

        // Stores n as-is; no validation happens at this level. Declared in
        // the private section while overriding a base-class virtual.
        void SetMaxSlotsPerGpu(int n) override
        {
            m_maxSlotsPerGpu = n;
        }

        // Helper for the NV12 fast path; the same helper type is shared
        // with ANSYOLOV12RTOD and ANSYOLOV10RTOD.
        NV12PreprocessHelper m_nv12Helper;

        // ── Per-call geometry ────────────────────────────────────────────
        // Image geometry is carried per call (rather than stored on the
        // object) so that concurrent inference calls do not share it.
        struct ImageMetadata {
            float ratio{ 1.f };
            float imgWidth{ 0.f };
            float imgHeight{ 0.f };
        };

        // Batch counterpart of ImageMetadata.
        // NOTE(review): presumably one entry per input image in each
        // vector — confirm in PreprocessBatch.
        struct BatchMetadata {
            std::vector<int>   imgHeights;
            std::vector<int>   imgWidths;
            std::vector<float> ratios;
        };

        // ── GPU preprocessing ────────────────────────────────────────────
        // Both return nested vectors of cv::cuda::GpuMat and report the
        // geometry they computed through the non-const out-parameter.
        GpuInputBatch Preprocess(
            const cv::Mat& inputImage,
            ImageMetadata& outMeta);
        GpuInputBatch PreprocessBatch(
            const std::vector<cv::Mat>& inputImages,
            BatchMetadata& outMetadata);

        // ── Detection pipeline ───────────────────────────────────────────
        std::vector<Object> DetectObjects(
            const cv::Mat& inputImage,
            const std::string& camera_id);
        std::vector<std::vector<Object>> DetectObjectsBatch(
            const std::vector<cv::Mat>& inputImages,
            const std::string& camera_id);

        // ── Postprocessors, legacy output layout ─────────────────────────
        // One per task. The feature buffers are non-const references;
        // segmentation takes several buffers instead of a single one.
        std::vector<Object> PostprocessDetection(
            FeatureBuffer& featureVector,
            const std::string& camera_id,
            const ImageMetadata& meta);
        std::vector<Object> PostprocessOBB(
            FeatureBuffer& featureVector,
            const std::string& camera_id,
            const ImageMetadata& meta);
        std::vector<Object> PostprocessSegmentation(
            FeatureBuffers& featureVectors,
            const std::string& camera_id,
            const ImageMetadata& meta);
        std::vector<Object> PostprocessPose(
            FeatureBuffer& featureVector,
            const std::string& camera_id,
            const ImageMetadata& meta);
        std::vector<Object> PostprocessClassify(
            FeatureBuffer& featureVector,
            const std::string& camera_id,
            const ImageMetadata& meta);

        // ── Postprocessors, end2end output layout ────────────────────────
        // Same parameter shapes as the legacy set; there is no end2end
        // variant for classification.
        std::vector<Object> PostprocessDetectionE2E(
            FeatureBuffer& featureVector,
            const std::string& camera_id,
            const ImageMetadata& meta);
        std::vector<Object> PostprocessOBBE2E(
            FeatureBuffer& featureVector,
            const std::string& camera_id,
            const ImageMetadata& meta);
        std::vector<Object> PostprocessSegE2E(
            FeatureBuffers& featureVectors,
            const std::string& camera_id,
            const ImageMetadata& meta);
        std::vector<Object> PostprocessPoseE2E(
            FeatureBuffer& featureVector,
            const std::string& camera_id,
            const ImageMetadata& meta);

        // ── Oriented-box NMS helpers (Prob-IoU based) ────────────────────
        // All helpers are static and therefore independent of instance
        // state.
        // NOTE(review): x/y presumably denote the box centre and the unit
        // of `angle` is not stated here — confirm in the implementation.
        // Members carry no default initializers.
        struct OrientedBox {
            float x;
            float y;
            float width;
            float height;
            float angle;
        };

        // Writes three covariance components for `box` into the three
        // output references.
        static void getCovarianceComponents(
            const OrientedBox& box,
            float& out1,
            float& out2,
            float& out3);

        // Returns a nested vector of floats for the two box sets; eps
        // defaults to 1e-7f.
        // NOTE(review): presumably a pairwise obb1 x obb2 matrix — confirm.
        static std::vector<std::vector<float>> batchProbiou(
            const std::vector<OrientedBox>& obb1,
            const std::vector<OrientedBox>& obb2,
            float eps = 1e-7f);

        // NMS over boxes the caller has already sorted (per the parameter
        // name); returns a vector of ints.
        static std::vector<int> nmsRotatedImpl(
            const std::vector<OrientedBox>& sortedBoxes,
            float iouThreshold);

        // NMS over boxes with accompanying scores; returns a vector of
        // ints.
        static std::vector<int> nmsRotated(
            const std::vector<OrientedBox>& boxes,
            const std::vector<float>& scores,
            float iouThreshold);

        // Converts an oriented box into a list of 2-D points.
        static std::vector<cv::Point2f> OBBToPoints(const OrientedBox& obb);

        // ── TensorRT engine ──────────────────────────────────────────────
        // Held through shared_ptr so that tasks using the same model can
        // share one engine. m_poolKey is the key used when releasing it.
        std::shared_ptr<Engine<float>> m_trtEngine{ nullptr };
        EnginePoolManager<float>::PoolKey m_poolKey;
        bool m_usingSharedPool{ false };

        // ── Preprocessing constants ──────────────────────────────────────
        // YOLO expects input normalized to [0,1]: subtract 0, divide by 1,
        // with normalization switched on. These are const per-instance
        // members (not static).
        const std::array<float, 3> SUB_VALS{ 0.f, 0.f, 0.f };
        const std::array<float, 3> DIV_VALS{ 1.f, 1.f, 1.f };
        const bool NORMALIZE{ true };

        ANSCENTER::Options m_options;

        // ── Filter thresholds (defaults; members are not const) ──────────
        float PROBABILITY_THRESHOLD{ 0.25f };
        float NMS_THRESHOLD{ 0.45f };
        int   TOP_K{ 300 };

        // ── Segmentation parameters (defaults; members are not const) ────
        int   SEG_CHANNELS{ 32 };
        int   SEG_H{ 160 };
        int   SEG_W{ 160 };
        float SEGMENTATION_THRESHOLD{ 0.5f };

        // ── Pose-estimation parameters (defaults; members are not const) ─
        int   NUM_KPS{ 0 };
        float KPS_THRESHOLD{ 0.5f };
    };

} // namespace ANSCENTER
#endif