modules/ANSODEngine/ANSYOLO12OD.cpp

#include "ANSYOLO12OD.h"
#include "EPLoader.h"
#ifdef USEONNXOV
#endif


namespace ANSCENTER {
    bool YOLO12OD::OptimizeModel(bool fp16, std::string& optimizedModelFolder) {
        std::lock_guard<std::recursive_mutex> lock(_mutex);
        if (!ANSODBase::OptimizeModel(fp16, optimizedModelFolder)) {
            return false;
        }
        if (FileExist(_modelFilePath)) {
            optimizedModelFolder = GetParentFolder(_modelFilePath);
            this->_logger.LogDebug("YOLO12OD::OptimizeModel", "This model is optimized. No need other optimization.", __FILE__, __LINE__);
            return true;
        }
        else {
            optimizedModelFolder = "";
            this->_logger.LogFatal("YOLO12OD::OptimizeModel", "This model is not exist. Please check the model path again.", __FILE__, __LINE__);
            return false;
        }
    }
    /// Loads a YOLO12 detection model from a model zip file.
    ///
    /// ANSODBase::LoadModel runs first (presumably unpacking the archive into
    /// _modelFolder — confirm against the base class). This method then applies
    /// detector defaults (640x640 input, thresholds below 0.2 raised to 0.5),
    /// resolves "train_last.onnx" plus the class list, and creates the session.
    ///
    /// Two package layouts are handled:
    ///  - with a configuration file (_modelConfigFile): classes, model type and an
    ///    optional 2-element input shape are read from it;
    ///  - without one (legacy): classes come from "classes.names" if present,
    ///    otherwise from LoadClassesFromString().
    ///
    /// @param modelZipFilePath  Forwarded to ANSODBase::LoadModel.
    /// @param modelZipPassword  Forwarded to ANSODBase::LoadModel.
    /// @return true when the inference session was created; false otherwise
    ///         (every failure is logged, exceptions are caught here).
    bool YOLO12OD::LoadModel(const std::string& modelZipFilePath, const std::string& modelZipPassword) {
        std::lock_guard<std::recursive_mutex> lock(_mutex);
        try {
            bool result = ANSODBase::LoadModel(modelZipFilePath, modelZipPassword);
            if (!result) return false;
            _modelConfig.detectionType = ANSCENTER::DetectionType::DETECTION;
            _modelConfig.inpHeight = 640;
            _modelConfig.inpWidth = 640;
            if (_modelConfig.modelMNSThreshold < 0.2)
                _modelConfig.modelMNSThreshold = 0.5;
            if (_modelConfig.modelConfThreshold < 0.2)
                _modelConfig.modelConfThreshold = 0.5;
            // 0. Check if the configuration file exist
            if (FileExist(_modelConfigFile)) {
                ModelType modelType;
                std::vector<int> inputShape;
                _classes = ANSUtilityHelper::GetConfigFileContent(_modelConfigFile, modelType, inputShape);
                // Only a {height, width} pair with positive entries overrides the 640x640 default.
                if (inputShape.size() == 2) {
                    if (inputShape[0] > 0)_modelConfig.inpHeight = inputShape[0];
                    if (inputShape[1] > 0)_modelConfig.inpWidth = inputShape[1];
                }
                _modelConfig.modelType = modelType;
                _modelFilePath = CreateFilePath(_modelFolder, "train_last.onnx");
                this->_logger.LogDebug("YOLO12OD::Initialize.  Loading YoloV12 weight", _modelFilePath, __FILE__, __LINE__);
                if (!FileExist(_modelFilePath)) {
                    this->_logger.LogError("YOLO12OD::Initialize.  Model file is not exist", _modelFilePath, __FILE__, __LINE__);
                    return false;
                }
            }
            else {// This is old version of model zip file
                const std::string onnxfile = CreateFilePath(_modelFolder, "train_last.onnx");
                if (!std::filesystem::exists(onnxfile)) {
                    // Log the path that was actually probed: _modelFilePath has not
                    // been assigned on this path and would show a stale/empty value.
                    this->_logger.LogError("YOLO12OD::Initialize.  Model file is not exist", onnxfile, __FILE__, __LINE__);
                    return false;
                }
                _modelFilePath = onnxfile;
                _classFilePath = CreateFilePath(_modelFolder, "classes.names");
                this->_logger.LogDebug("YOLO12OD::Initialize.  Loading YoloV12 weight", _modelFilePath, __FILE__, __LINE__);

                if (FileExist(_classFilePath))
                {
                    this->_logger.LogDebug("YOLO12OD::Initialize.  Load classes from file", _classFilePath, __FILE__, __LINE__);
                    LoadClassesFromFile();
                }
                else {
                    this->_logger.LogDebug("YOLO12OD::Initialize.  Load classes from string", _classFilePath, __FILE__, __LINE__);
                    LoadClassesFromString();
                }
            }
            _isInitialized = loadModel(_modelFilePath, true);//Assume that GPU is available;
            return _isInitialized;
        }
        catch (const std::exception& e) {
            this->_logger.LogFatal("YOLO12OD::LoadModel", e.what(), __FILE__, __LINE__);
            return false;
        }
    }
    /// Loads a YOLO12 detection model from an already-unpacked model folder.
    ///
    /// ANSODBase::LoadModelFromFolder runs first. This method then copies
    /// `modelConfig` into _modelConfig, applies detector defaults (640x640 input,
    /// thresholds below 0.2 raised to 0.5), resolves "<modelName>.onnx" in
    /// _modelFolder together with the class list, and creates the session.
    ///
    /// @param licenseKey   Forwarded to the base class.
    /// @param modelConfig  Caller configuration; copied into _modelConfig.
    /// @param modelName    Weight file name without extension; "train_last" when empty.
    /// @param className    Class-list file name, used only by the legacy layout
    ///                     (no configuration file present).
    /// @param modelFolder  Forwarded to the base class.
    /// @param labelMap     Out: comma-separated class names, empty when no classes
    ///                     were loaded.
    /// @return true when the inference session was created; false otherwise
    ///         (every failure is logged, exceptions are caught here).
    bool YOLO12OD::LoadModelFromFolder(std::string licenseKey, ModelConfig modelConfig, std::string modelName,std::string className, const std::string& modelFolder, std::string& labelMap) {
        std::lock_guard<std::recursive_mutex> lock(_mutex);
        try {
            bool result = ANSODBase::LoadModelFromFolder(licenseKey, modelConfig, modelName, className,modelFolder, labelMap);
            if (!result) return false;

            // Both package layouts load "<name>.onnx", so the file name is built once.
            const std::string effectiveModelName = modelName.empty() ? std::string("train_last") : modelName;
            const std::string weightFileName = effectiveModelName + ".onnx";

            _modelConfig = modelConfig;
            _modelConfig.detectionType = ANSCENTER::DetectionType::DETECTION;
            _modelConfig.inpHeight = 640;
            _modelConfig.inpWidth = 640;
            if (_modelConfig.modelMNSThreshold < 0.2)
                _modelConfig.modelMNSThreshold = 0.5;
            if (_modelConfig.modelConfThreshold < 0.2)
                _modelConfig.modelConfThreshold = 0.5;
            // 0. Check if the configuration file exist
            if (FileExist(_modelConfigFile)) {
                ModelType modelType;
                std::vector<int> inputShape;
                _classes = ANSUtilityHelper::GetConfigFileContent(_modelConfigFile, modelType, inputShape);
                // Only a {height, width} pair with positive entries overrides the 640x640 default.
                if (inputShape.size() == 2) {
                    if (inputShape[0] > 0)_modelConfig.inpHeight = inputShape[0];
                    if (inputShape[1] > 0)_modelConfig.inpWidth = inputShape[1];
                }
                _modelConfig.modelType = modelType;
                _modelFilePath = CreateFilePath(_modelFolder, weightFileName);
                this->_logger.LogDebug("YOLO12OD::Initialize.  Loading YoloV12 weight", _modelFilePath, __FILE__, __LINE__);
                if (!FileExist(_modelFilePath)) {
                    this->_logger.LogError("YOLO12OD::Initialize.  Model file is not exist", _modelFilePath, __FILE__, __LINE__);
                    return false;
                }
            }
            else {// This is old version of model zip file
                const std::string onnxfile = CreateFilePath(_modelFolder, weightFileName);
                if (!std::filesystem::exists(onnxfile)) {
                    // Log the path that was actually probed: _modelFilePath has not
                    // been assigned on this path and would show a stale/empty value.
                    this->_logger.LogError("YOLO12OD::Initialize.  Model file is not exist", onnxfile, __FILE__, __LINE__);
                    return false;
                }
                _modelFilePath = onnxfile;
                _classFilePath = CreateFilePath(_modelFolder, className);
                this->_logger.LogDebug("YOLO12OD::Initialize.  Loading YoloV8 weight", _modelFilePath, __FILE__, __LINE__);

                // The class file counts as usable only if it can actually be opened.
                std::ifstream isValidFileName(_classFilePath);
                if (!isValidFileName)
                {
                    this->_logger.LogDebug("YOLO12OD::Initialize.  Load classes from string", _classFilePath, __FILE__, __LINE__);
                    LoadClassesFromString();
                }
                else {
                    this->_logger.LogDebug("YOLO12OD::Initialize.  Load classes from file", _classFilePath, __FILE__, __LINE__);
                    LoadClassesFromFile();
                }
            }

            // 1. Load labelMap and engine
            labelMap.clear();
            if (!_classes.empty())
                labelMap = VectorToCommaSeparatedString(_classes);
            _isInitialized = loadModel(_modelFilePath, true);//Assume that GPU is available;
            return _isInitialized;
        }
        catch (const std::exception& e) {
            // Tag with this function's own name so the log points at the right entry point.
            this->_logger.LogFatal("YOLO12OD::LoadModelFromFolder", e.what(), __FILE__, __LINE__);
            return false;
        }
    }
    /// Initializes the detector from a model zip file and a caller configuration.
    ///
    /// ANSODBase::Initialize runs first. This method then copies `modelConfig`
    /// into _modelConfig, applies detector defaults (640x640 input, thresholds
    /// below 0.2 raised to 0.5), resolves "train_last.onnx" in _modelFolder
    /// together with the class list, and creates the inference session.
    ///
    /// @param licenseKey        Forwarded to the base class.
    /// @param modelConfig       Caller configuration; copied into _modelConfig.
    /// @param modelZipFilePath  Forwarded to the base class.
    /// @param modelZipPassword  Forwarded to the base class.
    /// @param labelMap          Out: comma-separated class names, empty when no
    ///                          classes were loaded.
    /// @return true when the inference session was created; false otherwise
    ///         (every failure is logged, exceptions are caught here).
    bool YOLO12OD::Initialize(std::string licenseKey, ModelConfig modelConfig, const std::string& modelZipFilePath, const std::string& modelZipPassword, std::string& labelMap) {
        std::lock_guard<std::recursive_mutex> lock(_mutex);
        try {
            bool result = ANSODBase::Initialize(licenseKey, modelConfig, modelZipFilePath, modelZipPassword, labelMap);
            if (!result) return false;
            // Parsing for YOLO only here
            _modelConfig = modelConfig;
            _modelConfig.detectionType = ANSCENTER::DetectionType::DETECTION;
            _modelConfig.inpHeight = 640;
            _modelConfig.inpWidth = 640;
            if (_modelConfig.modelMNSThreshold < 0.2)
                _modelConfig.modelMNSThreshold = 0.5;
            if (_modelConfig.modelConfThreshold < 0.2)
                _modelConfig.modelConfThreshold = 0.5;
            // 0. Check if the configuration file exist
            if (FileExist(_modelConfigFile)) {
                ModelType modelType;
                std::vector<int> inputShape;
                _classes = ANSUtilityHelper::GetConfigFileContent(_modelConfigFile, modelType, inputShape);
                // Only a {height, width} pair with positive entries overrides the 640x640 default.
                if (inputShape.size() == 2) {
                    if (inputShape[0] > 0)_modelConfig.inpHeight = inputShape[0];
                    if (inputShape[1] > 0)_modelConfig.inpWidth = inputShape[1];
                }
                _modelConfig.modelType = modelType;
                _modelFilePath = CreateFilePath(_modelFolder, "train_last.onnx");
                this->_logger.LogDebug("YOLO12OD::Initialize.  Loading YoloV8 weight", _modelFilePath, __FILE__, __LINE__);
                if (!FileExist(_modelFilePath)) {
                    this->_logger.LogError("YOLO12OD::Initialize.  Model file is not exist", _modelFilePath, __FILE__, __LINE__);
                    return false;
                }
            }
            else {// This is old version of model zip file
                const std::string onnxfile = CreateFilePath(_modelFolder, "train_last.onnx");
                if (!std::filesystem::exists(onnxfile)) {
                    // Log the path that was actually probed: _modelFilePath has not
                    // been assigned on this path and would show a stale/empty value.
                    this->_logger.LogError("YOLO12OD::Initialize.  Model file is not exist", onnxfile, __FILE__, __LINE__);
                    return false;
                }
                // This is the yovoV5 or yolov8 format
                _modelFilePath = onnxfile;
                _classFilePath = CreateFilePath(_modelFolder, "classes.names");
                this->_logger.LogDebug("YOLO12OD::Initialize.  Loading YoloV8/Yolov5 weight", _modelFilePath, __FILE__, __LINE__);

                if (FileExist(_classFilePath))
                {
                    this->_logger.LogDebug("YOLO12OD::Initialize.  Load classes from file", _classFilePath, __FILE__, __LINE__);
                    LoadClassesFromFile();
                }
                else {
                    this->_logger.LogDebug("YOLO12OD::Initialize.  Load classes from string", _classFilePath, __FILE__, __LINE__);
                    LoadClassesFromString();
                }
            }

            // 1. Load labelMap and engine
            labelMap.clear();
            if (!_classes.empty())
                labelMap = VectorToCommaSeparatedString(_classes);
            _isInitialized = loadModel(_modelFilePath, true);//Assume that GPU is available;
            return _isInitialized;

        }
        catch (const std::exception& e) {
            this->_logger.LogFatal("YOLO12OD::Initialize", e.what(), __FILE__, __LINE__);
            return false;
        }
    }
    /// Convenience overload: runs inference under the fixed camera id "CustomCam".
    std::vector<Object> YOLO12OD::RunInference(const cv::Mat& input) {
        const std::string defaultCameraId("CustomCam");
        return RunInference(input, defaultCameraId);
    }  
    /// Runs detection on one frame and optionally post-processes the result with
    /// tracking and stabilization.
    ///
    /// @param input      Frame to analyse; empty frames or frames smaller than
    ///                   10 pixels in either dimension yield no detections.
    /// @param camera_id  Passed to ApplyTracking / StabilizeDetections.
    /// @return Detected objects; an empty vector when the license is invalid, the
    ///         model is not initialized, the frame is rejected, or an exception
    ///         was caught (the exception is logged).
    std::vector<Object> YOLO12OD::RunInference(const cv::Mat& input,const std::string& camera_id)
    {
        std::lock_guard<std::recursive_mutex> guard(_mutex);
        try {
            // Preconditions, checked in order: license, initialization, frame size.
            if (!_licenseValid) {
                _logger.LogError("YOLO12OD::RunInference", "Invalid License", __FILE__, __LINE__);
                return {};
            }
            if (!_isInitialized) {
                _logger.LogError("YOLO12OD::RunInference", "Model is not initialized", __FILE__, __LINE__);
                return {};
            }
            const bool frameUnusable = input.empty() || input.cols < 10 || input.rows < 10;
            if (frameUnusable) {
                return {};
            }

            auto results = detect(input,
                                  _modelConfig.detectionScoreThreshold,
                                  _modelConfig.modelMNSThreshold);

            // Stabilization is only applied on top of tracking.
            if (!_trackerEnabled) {
                return results;
            }
            results = ApplyTracking(results, camera_id);
            if (_stabilizationEnabled) {
                results = StabilizeDetections(results, camera_id);
            }
            return results;
        }
        catch (const std::exception& inferenceError) {
            _logger.LogFatal("YOLO12OD::RunInference", inferenceError.what(), __FILE__, __LINE__);
            return {};
        }
    }

    /// Destructor. Only emits a debug trace; any std::exception raised while
    /// logging is caught so it cannot escape the destructor.
    YOLO12OD::~YOLO12OD() {
        try {
            _logger.LogDebug("YOLO12OD::~YOLO12OD()", "Release YOLO12OD ", __FILE__, __LINE__);
        }
        catch (const std::exception& releaseError) {
            _logger.LogFatal("YOLO12OD::~YOLO12OD()", releaseError.what(), __FILE__, __LINE__);
        }
    }
    /// Explicit teardown hook. This detector currently performs no work here, so
    /// the call always succeeds.
    ///
    /// The former try/catch wrapper was removed: its body could not throw, which
    /// made the handler unreachable.
    ///
    /// @return Always true.
    bool YOLO12OD::Destroy() {
        return true;
    }
    bool YOLO12OD::loadModel(const std::string& modelPath, bool useGPU)
    {
        try {
            const auto& ep = ANSCENTER::EPLoader::Current();
            if (Ort::Global<void>::api_ == nullptr)
                Ort::InitApi(static_cast<const OrtApi*>(EPLoader::GetOrtApiRaw()));
            std::cout << "[YOLO12OD] EP ready: "
                << ANSCENTER::EPLoader::EngineTypeName(ep.type) << std::endl;

            env = Ort::Env(ORT_LOGGING_LEVEL_WARNING, "ONNX_DETECTION");
            sessionOptions = Ort::SessionOptions();
            sessionOptions.SetIntraOpNumThreads(
                std::min(6, static_cast<int>(std::thread::hardware_concurrency())));
            sessionOptions.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_ALL);

            // ── Log available providers ─────────────────────────────────────────
            std::vector<std::string> availableProviders = Ort::GetAvailableProviders();
            std::cout << "Available Execution Providers:" << std::endl;
            for (const auto& p : availableProviders)
                std::cout << " - " << p << std::endl;

            // ── Attach EP based on runtime-detected hardware ────────────────────
            if (useGPU) {
                bool attached = false;

                switch (ep.type) {

                case ANSCENTER::EngineType::NVIDIA_GPU: {
                    auto it = std::find(availableProviders.begin(),
                        availableProviders.end(), "CUDAExecutionProvider");
                    if (it == availableProviders.end()) {
                        this->_logger.LogError("YOLO12OD::loadModel", "CUDAExecutionProvider not in DLL — "
                            "check ep/cuda/ has the CUDA ORT build.", __FILE__, __LINE__);
                        break;
                    }
                    try {
                        OrtCUDAProviderOptionsV2* cuda_options = nullptr;
                        Ort::GetApi().CreateCUDAProviderOptions(&cuda_options);

                        const char* keys[] = { "device_id" };
                        const char* values[] = { "0" };
                        Ort::GetApi().UpdateCUDAProviderOptions(cuda_options, keys, values, 1);

                        sessionOptions.AppendExecutionProvider_CUDA_V2(*cuda_options);
                        Ort::GetApi().ReleaseCUDAProviderOptions(cuda_options);

                        std::cout << "[YOLO12OD] CUDA EP attached." << std::endl;
                        attached = true;
                    }
                    catch (const Ort::Exception& e) {
                        this->_logger.LogError("YOLO12OD::loadModel", e.what(), __FILE__, __LINE__);
                    }
                    break;
                }

                case ANSCENTER::EngineType::AMD_GPU: {
                    auto it = std::find(availableProviders.begin(),
                        availableProviders.end(), "DmlExecutionProvider");
                    if (it == availableProviders.end()) {
                        this->_logger.LogError("YOLO12OD::loadModel", "DmlExecutionProvider not in DLL — "
                            "check ep/directml/ has the DirectML ORT build.", __FILE__, __LINE__);
                        break;
                    }
                    try {
                        std::unordered_map<std::string, std::string> opts = { { "device_id", "0" } };
                        sessionOptions.AppendExecutionProvider("DML", opts);
                        std::cout << "[YOLO12OD] DirectML EP attached." << std::endl;
                        attached = true;
                    }
                    catch (const Ort::Exception& e) {
                        this->_logger.LogError("YOLO12OD::loadModel", e.what(), __FILE__, __LINE__);
                    }
                    break;
                }

                case ANSCENTER::EngineType::OPENVINO_GPU: {
                    auto it = std::find(availableProviders.begin(),
                        availableProviders.end(), "OpenVINOExecutionProvider");
                    if (it == availableProviders.end()) {
                        this->_logger.LogError("YOLO12OD::loadModel", "OpenVINOExecutionProvider not in DLL — "
                            "check ep/openvino/ has the OpenVINO ORT build.", __FILE__, __LINE__);
                        break;
                    }

                    const std::string precision = "FP16";
                    const std::string numberOfThreads = "8";
                    const std::string numberOfStreams = "8";

                    std::vector<std::unordered_map<std::string, std::string>> try_configs = {
                        { {"device_type","AUTO:NPU,GPU"}, {"precision",precision},
                          {"num_of_threads",numberOfThreads}, {"num_streams",numberOfStreams},
                          {"enable_opencl_throttling","False"}, {"enable_qdq_optimizer","True"} },
                        { {"device_type","GPU.0"}, {"precision",precision},
                          {"num_of_threads",numberOfThreads}, {"num_streams",numberOfStreams},
                          {"enable_opencl_throttling","False"}, {"enable_qdq_optimizer","True"} },
                        { {"device_type","GPU.1"}, {"precision",precision},
                          {"num_of_threads",numberOfThreads}, {"num_streams",numberOfStreams},
                          {"enable_opencl_throttling","False"}, {"enable_qdq_optimizer","True"} },
                        { {"device_type","AUTO:GPU,CPU"}, {"precision",precision},
                          {"num_of_threads",numberOfThreads}, {"num_streams",numberOfStreams},
                          {"enable_opencl_throttling","False"}, {"enable_qdq_optimizer","True"} }
                    };

                    for (const auto& config : try_configs) {
                        try {
                            sessionOptions.AppendExecutionProvider_OpenVINO_V2(config);
                            std::cout << "[YOLO12OD] OpenVINO EP attached ("
                                << config.at("device_type") << ")." << std::endl;
                            attached = true;
                            break;
                        }
                        catch (const Ort::Exception& e) {
                            this->_logger.LogError("YOLO12OD::loadModel", e.what(), __FILE__, __LINE__);
                        }
                    }

                    if (!attached)
                        std::cerr << "[YOLO12OD] OpenVINO EP: all device configs failed." << std::endl;
                    break;
                }

                default:
                    break;
                }

                if (!attached) {
                    std::cerr << "[YOLO12OD] No GPU EP attached — running on CPU." << std::endl;
                    this->_logger.LogFatal("YOLO12OD::loadModel", "GPU EP not attached. Running on CPU.", __FILE__, __LINE__);
                }
            }
            else {
                std::cout << "[YOLO12OD] Inference device: CPU (useGPU=false)" << std::endl;
            }

            // ── Load model ──────────────────────────────────────────────────────
#ifdef _WIN32
            std::wstring w_modelPath(modelPath.begin(), modelPath.end());
            session = Ort::Session(env, w_modelPath.c_str(), sessionOptions);
#else
            session = Ort::Session(env, modelPath.c_str(), sessionOptions);
#endif

            Ort::AllocatorWithDefaultOptions allocator;

            // ── Input shape ─────────────────────────────────────────────────────
            Ort::TypeInfo inputTypeInfo = session.GetInputTypeInfo(0);
            std::vector<int64_t> inputTensorShapeVec =
                inputTypeInfo.GetTensorTypeAndShapeInfo().GetShape();
            isDynamicInputShape = (inputTensorShapeVec.size() >= 4) &&
                (inputTensorShapeVec[2] == -1 && inputTensorShapeVec[3] == -1);

            if (isDynamicInputShape)
                std::cout << "Dynamic input shape detected." << std::endl;
            for (auto shape : inputTensorShapeVec)
                std::cout << "Input shape: " << shape << std::endl;

            // ── Node names ──────────────────────────────────────────────────────
            auto input_name = session.GetInputNameAllocated(0, allocator);
            inputNodeNameAllocatedStrings.push_back(std::move(input_name));
            inputNames.push_back(inputNodeNameAllocatedStrings.back().get());

            auto output_name = session.GetOutputNameAllocated(0, allocator);
            outputNodeNameAllocatedStrings.push_back(std::move(output_name));
            outputNames.push_back(outputNodeNameAllocatedStrings.back().get());

            // ── Input image size ────────────────────────────────────────────────
            if (inputTensorShapeVec.size() >= 4) {
                inputImageShape = cv::Size(
                    static_cast<int>(inputTensorShapeVec[3]),
                    static_cast<int>(inputTensorShapeVec[2]));
            }
            else {
                this->_logger.LogFatal("YOLO12OD::loadModel", "Invalid input tensor shape.", __FILE__, __LINE__);
                return false;
            }

            numInputNodes = session.GetInputCount();
            numOutputNodes = session.GetOutputCount();

            std::cout << "Model loaded successfully with "
                << numInputNodes << " input nodes and "
                << numOutputNodes << " output nodes." << std::endl;
            return true;
        }
        catch (const std::exception& e) {
            this->_logger.LogFatal("YOLO12OD::loadModel", e.what(), __FILE__, __LINE__);
            return false;
        }
    }


    /// Converts a frame into the planar float blob fed to the network.
    ///
    /// Steps: record the source size in m_imgWidth / m_imgHeight, expand
    /// single-channel input to BGR, run letterBox() towards inputImageShape, scale
    /// pixel values by 1/255 into float, and write the channels one after another
    /// into `blob`.
    ///
    /// @param image             Source frame.
    /// @param blob              Out: channel-planar float data of the processed image.
    /// @param inputTensorShape  In/out: entries [2] and [3] are overwritten with the
    ///                          processed image's rows and cols.
    /// @return The processed float image, or an empty cv::Mat if an exception was
    ///         caught (the exception is logged).
    cv::Mat YOLO12OD::preprocess(const cv::Mat& image, std::vector<float>& blob, std::vector<int64_t>& inputTensorShape) {
        std::lock_guard<std::recursive_mutex> guard(_mutex);
        m_imgHeight = image.rows;
        m_imgWidth = image.cols;
        try {
            // Work on a private 3-channel copy: grayscale is expanded, anything
            // else is cloned as-is.
            cv::Mat source;
            if (image.channels() != 1) {
                source = image.clone();
            }
            else {
                cv::cvtColor(image, source, cv::COLOR_GRAY2BGR);
            }

            // Resize and pad through the letterBox helper.
            cv::Mat letterboxed;
            const cv::Scalar padColor(114, 114, 114);
            letterBox(source, letterboxed, inputImageShape, padColor, isDynamicInputShape, false, true, 32);

            // The tensor's spatial dimensions follow the letterboxed image.
            inputTensorShape[2] = letterboxed.rows;
            inputTensorShape[3] = letterboxed.cols;

            // Float conversion with values scaled by 1/255.
            letterboxed.convertTo(letterboxed, CV_32FC3, 1 / 255.0f);

            const int channelCount = letterboxed.channels();
            const int planeSize = letterboxed.cols * letterboxed.rows;
            const size_t blobSize = planeSize * channelCount;
            blob.resize(blobSize);

            // Each plane is a Mat header over its slice of the blob, so cv::split
            // writes the channels straight into the blob.
            std::vector<cv::Mat> planes;
            planes.reserve(channelCount);
            for (int channel = 0; channel < channelCount; ++channel) {
                float* planeStart = blob.data() + channel * planeSize;
                planes.emplace_back(letterboxed.rows, letterboxed.cols, CV_32FC1, planeStart);
            }
            cv::split(letterboxed, planes);

            return letterboxed;
        }
        catch (const std::exception& preprocessError) {
            _logger.LogFatal("YOLO12OD::preprocess", preprocessError.what(), __FILE__, __LINE__);
            return cv::Mat();
        }
    }
    /// Turns the raw network output into a list of detections.
    ///
    /// The first output tensor is read as float data indexed as
    /// [feature][detection]: features 0..3 are centre-x, centre-y, width and
    /// height; every remaining feature is one class score. For each candidate
    /// the best-scoring class is taken; candidates above `confThreshold` are
    /// mapped back to the original image with scaleCoords() and then filtered
    /// with NMSBoxes().
    ///
    /// @param originalImageSize  Size of the frame given to detect().
    /// @param resizedImageShape  Size of the preprocessed network input.
    /// @param outputTensors      Session outputs; only element 0 is used.
    /// @param confThreshold      Minimum class score for a candidate to be kept.
    /// @param iouThreshold       Passed to NMSBoxes().
    /// @return Surviving detections; empty when the output is missing, has fewer
    ///         than three dimensions, has no candidates or classes, or an
    ///         exception was caught (the exception is logged).
    std::vector<Object> YOLO12OD::postprocess(const cv::Size& originalImageSize, const cv::Size& resizedImageShape,
                                            const std::vector<Ort::Value>& outputTensors,
                                            float confThreshold, float iouThreshold) 
    {
        std::lock_guard<std::recursive_mutex> lock(_mutex);
        try {

            std::vector<Object> detections;

            // Nothing to decode without an output tensor.
            if (outputTensors.empty()) {
                this->_logger.LogError("YOLO12OD::postprocess", "No output tensor returned by the session.", __FILE__, __LINE__);
                return detections;
            }

            const float* rawOutput = outputTensors[0].GetTensorData<float>(); // Extract raw output data from the first output tensor
            const std::vector<int64_t> outputShape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape();

            // Shape entries [1] and [2] are read below, so the rank must be checked first.
            if (outputShape.size() < 3) {
                this->_logger.LogError("YOLO12OD::postprocess", "Unexpected output tensor rank.", __FILE__, __LINE__);
                return detections;
            }
            // Non-positive dimensions carry no usable data.
            if (outputShape[1] <= 0 || outputShape[2] <= 0) {
                return detections;
            }

            // Determine the number of features and detections
            const size_t num_features = static_cast<size_t>(outputShape[1]);
            const size_t num_detections = static_cast<size_t>(outputShape[2]);

            // Calculate number of classes based on output shape
            const int numClasses = static_cast<int>(num_features) - 4;
            if (numClasses <= 0) {
                // Invalid number of classes
                return detections;
            }

            // Reserve memory for efficient appending
            std::vector<BoundingBox> boxes;
            boxes.reserve(num_detections);
            std::vector<float> confs;
            confs.reserve(num_detections);
            std::vector<int> classIds;
            classIds.reserve(num_detections);
            std::vector<BoundingBox> nms_boxes;
            nms_boxes.reserve(num_detections);

            const float* ptr = rawOutput;

            for (size_t d = 0; d < num_detections; ++d) {
                // Find class with the highest confidence score
                int classId = -1;
                float maxScore = -FLT_MAX;
                for (int c = 0; c < numClasses; ++c) {
                    const float score = ptr[d + (4 + c) * num_detections];
                    if (score > maxScore) {
                        maxScore = score;
                        classId = c;
                    }
                }

                // Proceed only if confidence exceeds threshold
                if (!(maxScore > confThreshold)) {
                    continue;
                }

                // Extract bounding box coordinates (center x, center y, width, height)
                const float centerX = ptr[0 * num_detections + d];
                const float centerY = ptr[1 * num_detections + d];
                const float width = ptr[2 * num_detections + d];
                const float height = ptr[3 * num_detections + d];

                // Convert center coordinates to top-left (x1, y1)
                const float left = centerX - width / 2.0f;
                const float top = centerY - height / 2.0f;

                // Scale to original image size
                BoundingBox scaledBox = scaleCoords(
                    resizedImageShape,
                    BoundingBox(left, top, width, height),
                    originalImageSize,
                    true
                );

                // Round coordinates for integer pixel positions
                BoundingBox roundedBox;
                roundedBox.x = std::round(scaledBox.x);
                roundedBox.y = std::round(scaledBox.y);
                roundedBox.width = std::round(scaledBox.width);
                roundedBox.height = std::round(scaledBox.height);

                // Shift each class into its own coordinate region so NMS never
                // suppresses a box because of a box from a different class.
                BoundingBox nmsBox = roundedBox;
                nmsBox.x += classId * 7680; // Arbitrary offset to differentiate classes
                nmsBox.y += classId * 7680;

                // Add to respective containers
                nms_boxes.emplace_back(nmsBox);
                boxes.emplace_back(roundedBox);
                confs.emplace_back(maxScore);
                classIds.emplace_back(classId);
            }

            // Apply Non-Maximum Suppression (NMS) to eliminate redundant detections
            std::vector<int> indices;
            NMSBoxes(nms_boxes, confs, confThreshold, iouThreshold, indices);

            // Collect filtered detections into the result vector
            const int classNameSize = static_cast<int>(_classes.size());
            detections.reserve(indices.size());
            for (const int idx : indices) {
                const float conf = confs[idx];
                if (conf < confThreshold) {
                    continue;
                }

                Object detection;
                detection.confidence = conf;
                detection.box.x = boxes[idx].x;
                detection.box.y = boxes[idx].y;
                detection.box.width = boxes[idx].width;
                detection.box.height = boxes[idx].height;
                detection.classId = classIds[idx];
                if (!_classes.empty()) {
                    if (detection.classId < classNameSize) {
                        detection.className = _classes[detection.classId];
                    }
                    else {
                        detection.className = _classes[classNameSize - 1]; // Use last valid class name if out of range
                    }
                }
                else {
                    detection.className = "Unknown"; // Fallback if _classes is empty
                }

                // The polygon is normalised with the source size recorded by preprocess().
                detection.polygon = ANSUtilityHelper::RectToNormalizedPolygon(detection.box, m_imgWidth, m_imgHeight);
                detections.push_back(detection);
            }
            return detections;
        }
        catch (const std::exception& e) {
            this->_logger.LogFatal("YOLO12OD::postprocess", e.what(), __FILE__, __LINE__);
            return std::vector<Object>();
        }
    }

    /// Runs one forward pass on `image` and returns the decoded detections.
    ///
    /// Pipeline: preprocess() builds the float blob, the blob is wrapped in an
    /// input tensor without copying, the session is run, and postprocess()
    /// decodes the outputs.
    ///
    /// @param image          Frame to analyse.
    /// @param confThreshold  Forwarded to postprocess().
    /// @param iouThreshold   Forwarded to postprocess().
    /// @return Detections; empty when no node names are cached, preprocessing
    ///         failed, the blob size does not match the tensor shape, or an
    ///         exception was caught (failures are logged).
    std::vector<Object> YOLO12OD::detect(const cv::Mat& image, float confThreshold, float iouThreshold) {
        std::lock_guard<std::recursive_mutex> lock(_mutex);
        try {
            // The name arrays are handed to the session as raw pointers, so they
            // must hold at least one entry each.
            if (inputNames.empty() || outputNames.empty()) {
                this->_logger.LogFatal("YOLO12OD::detect", "Model node names are not available", __FILE__, __LINE__);
                return {};
            }

            // Define the shape of the input tensor (batch size, channels, height, width)
            std::vector<int64_t> inputTensorShape = { 1, 3, inputImageShape.height, inputImageShape.width };

            // Preprocess the image and obtain the blob as a vector<float>
            std::vector<float> blob;
            cv::Mat preprocessedImage = preprocess(image, blob, inputTensorShape);
            if (preprocessedImage.empty()) {
                // preprocess() has already logged the underlying error.
                this->_logger.LogFatal("YOLO12OD::detect", "Preprocessing failed", __FILE__, __LINE__);
                return {};
            }

            // Compute the total number of elements in the input tensor
            size_t inputTensorSize = vectorProduct(inputTensorShape);

            if (blob.size() != inputTensorSize) {
                this->_logger.LogFatal("YOLO12OD::detect", "Mismatch between blob size and expected tensor size", __FILE__, __LINE__);
                return {};
            }

            // Create an Ort memory info object (cached for efficiency)
            static const Ort::MemoryInfo memoryInfo = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);

            // Create input tensor object using the preprocessed data
            Ort::Value inputTensor = Ort::Value::CreateTensor<float>(
                memoryInfo,
                blob.data(),  // Use the vector's data directly
                inputTensorSize,
                inputTensorShape.data(),
                inputTensorShape.size()
            );

            // The counts passed to Run() are array lengths. Exactly one input tensor
            // is supplied, and no more output names may be requested than are stored.
            const size_t inputCount = 1;
            const size_t outputCount =
                std::min<size_t>(static_cast<size_t>(numOutputNodes), outputNames.size());

            // Run the inference session with the input tensor and retrieve output tensors
            std::vector<Ort::Value> outputTensors = session.Run(
                Ort::RunOptions{ nullptr },
                inputNames.data(),
                &inputTensor,
                inputCount,
                outputNames.data(),
                outputCount
            );

            // Determine the resized image shape based on input tensor shape
            cv::Size resizedImageShape(static_cast<int>(inputTensorShape[3]), static_cast<int>(inputTensorShape[2]));

            // Postprocess the output tensors to obtain detections
            return postprocess(image.size(), resizedImageShape, outputTensors, confThreshold, iouThreshold);
        }
        catch (const std::exception& e) {
            this->_logger.LogFatal("YOLO12OD::detect", e.what(), __FILE__, __LINE__);
            return {};
        }
    }
   
	// Utility function to compute the product of all elements of a vector (e.g. the element count of a tensor shape)
    size_t YOLO12OD::vectorProduct(const std::vector<int64_t>& vector) {
        // Multiplies all elements together and returns the result as size_t.
        // The fold starts from 1, so an empty vector yields 1. Every element is converted
        // to size_t before it is multiplied, i.e. the arithmetic is unsigned.
        const auto multiplyInto = [](size_t runningProduct, int64_t element) -> size_t {
            return runningProduct * static_cast<size_t>(element);
        };
        return std::accumulate(vector.begin(), vector.end(), size_t{ 1 }, multiplyInto);
    }

    // Resizes `image` so that it fits inside `newShape` and pads the remainder with `color`
    // ("letterbox"), writing the result to `outImage`.
    //
    // image      Source image. An empty image is reported on std::cerr and `outImage` is left untouched.
    // outImage   Receives the resized and padded image.
    // newShape   Target size (width x height).
    // color      Fill value used for the constant border.
    // auto_      Minimal-padding mode: only pad by the remainder modulo `stride`, so the output can be
    //            smaller than `newShape`. Takes precedence over `scaleFill`.
    // scaleFill  Stretch the image to exactly `newShape` (aspect ratio not preserved, no padding).
    // scaleUp    When false, the image is never enlarged (scale ratio capped at 1).
    // stride     Modulus used in `auto_` mode. A non-positive stride disables the modulo step, in
    //            which case the image is padded to the full `newShape`.
    //
    // Padding is split between the two sides; when it is odd the extra pixel goes to the
    // right/bottom. Exceptions derived from std::exception are caught and reported on std::cerr.
    void YOLO12OD::letterBox(const cv::Mat& image, cv::Mat& outImage,
                            const cv::Size& newShape,
                            const cv::Scalar& color,
                            bool auto_,
                            bool scaleFill,
                            bool scaleUp,
                            int stride) 
    {
        try {
            // With zero rows/cols the ratio below would divide by zero and the later
            // float-to-int conversion would be undefined, so bail out early.
            if (image.empty()) {
                std::cerr << "Error in letterBox: input image is empty" << std::endl;
                return;
            }

            // Largest uniform scale at which the image still fits inside newShape.
            float ratio = std::min(static_cast<float>(newShape.height) / image.rows,
                static_cast<float>(newShape.width) / image.cols);

            // Prevent scaling up if not allowed
            if (!scaleUp) {
                ratio = std::min(ratio, 1.0f);
            }

            // Image size after scaling, before any padding.
            int newUnpadW = static_cast<int>(std::round(image.cols * ratio));
            int newUnpadH = static_cast<int>(std::round(image.rows * ratio));

            // Total padding (both sides together) needed to reach newShape.
            int dw = newShape.width - newUnpadW;
            int dh = newShape.height - newUnpadH;

            if (auto_) {
                // Keep only the remainder modulo stride as TOTAL padding. It is split between the
                // two sides further down, so it must not be halved here as well (doing both would
                // leave only half of the remainder as padding and break the stride alignment).
                // The stride check avoids an integer modulo by zero.
                if (stride > 0) {
                    dw %= stride;
                    dh %= stride;
                }
            }
            else if (scaleFill) {
                // Stretch to the exact target size: no padding at all.
                newUnpadW = newShape.width;
                newUnpadH = newShape.height;
                dw = 0;
                dh = 0;
            }

            // Resize only when the size actually changes.
            if (image.cols != newUnpadW || image.rows != newUnpadH) {
                cv::resize(image, outImage, cv::Size(newUnpadW, newUnpadH), 0, 0, cv::INTER_LINEAR);
            }
            else {
                // cv::Mat assignment is a shallow copy: no pixel data is duplicated here.
                // NOTE(review): if no padding is applied either, outImage presumably keeps sharing
                // its buffer with `image` - confirm callers do not modify outImage in place.
                outImage = image;
            }

            // Split the padding; the right/bottom side absorbs the extra pixel when it is odd.
            const int padLeft = dw / 2;
            const int padRight = dw - padLeft;
            const int padTop = dh / 2;
            const int padBottom = dh - padTop;

            // Apply padding to reach the desired shape
            cv::copyMakeBorder(outImage, outImage, padTop, padBottom, padLeft, padRight, cv::BORDER_CONSTANT, color);
        }
        catch (const std::exception& e) {
            std::cerr << "Error in letterBox: " << e.what() << std::endl;
        }
    }

    // Maps a box expressed in the coordinates of an image of size `imageShape` back into the
    // coordinates of an image of size `imageOriginalShape`.
    //
    // imageShape          Size of the image the box coordinates currently refer to.
    // coords              Box to convert (taken by value, not modified for the caller).
    // imageOriginalShape  Size of the image the result should refer to.
    // p_Clip              When true, the result is clamped so that x/y lie in [0, width]/[0, height]
    //                     and width/height do not extend past the right/bottom edge.
    //
    // Returns the converted box. If a std::exception is raised it is reported on std::cerr and
    // `result` is returned in whatever state it had reached.
    BoundingBox YOLO12OD::scaleCoords(const cv::Size& imageShape, BoundingBox coords,
                                      const cv::Size& imageOriginalShape, bool p_Clip) 
    {
        BoundingBox result;
        try {
            // Uniform scale between the two images: the smaller of the per-axis ratios.
            const float heightRatio = static_cast<float>(imageShape.height) / static_cast<float>(imageOriginalShape.height);
            const float widthRatio = static_cast<float>(imageShape.width) / static_cast<float>(imageOriginalShape.width);
            const float gain = std::min(heightRatio, widthRatio);

            // Space left over on each axis once the original is scaled by `gain`, halved per side.
            const float leftoverX = (imageShape.width - imageOriginalShape.width * gain) / 2.0f;
            const float leftoverY = (imageShape.height - imageOriginalShape.height * gain) / 2.0f;
            const int padX = static_cast<int>(std::round(leftoverX));
            const int padY = static_cast<int>(std::round(leftoverY));

            // Remove the offset from the position, then undo the scale on position and size.
            result.x = static_cast<int>(std::round((coords.x - padX) / gain));
            result.y = static_cast<int>(std::round((coords.y - padY) / gain));
            result.width = static_cast<int>(std::round(coords.width / gain));
            result.height = static_cast<int>(std::round(coords.height / gain));

            if (p_Clip) {
                // Position first: the size limits below depend on the clamped x / y.
                result.x = clamp(result.x, 0, imageOriginalShape.width);
                result.y = clamp(result.y, 0, imageOriginalShape.height);
                result.width = clamp(result.width, 0, imageOriginalShape.width - result.x);
                result.height = clamp(result.height, 0, imageOriginalShape.height - result.y);
            }
        }
        catch (const std::exception& e) {
            std::cerr << "Error in scaleCoords: " << e.what() << std::endl;
        }
        return result;
    }

    // Optimized Non-Maximum Suppression Function
    void YOLO12OD::NMSBoxes(const std::vector<BoundingBox>& boundingBoxes,
                            const std::vector<float>& scores,
                            float scoreThreshold,
                            float nmsThreshold,
                            std::vector<int>& indices)
    {
        indices.clear();

        const size_t numBoxes = boundingBoxes.size();
        if (numBoxes == 0) {
            return;
        }

        // Step 1: Filter out boxes with scores below the threshold
        // and create a list of indices sorted by descending scores
        std::vector<int> sortedIndices;
        sortedIndices.reserve(numBoxes);
        for (size_t i = 0; i < numBoxes; ++i) {
            if (scores[i] >= scoreThreshold) {
                sortedIndices.push_back(static_cast<int>(i));
            }
        }

        // If no boxes remain after thresholding
        if (sortedIndices.empty()) {
            return;
        }

        // Sort the indices based on scores in descending order
        std::sort(sortedIndices.begin(), sortedIndices.end(),
            [&scores](int idx1, int idx2) {
                return scores[idx1] > scores[idx2];
            });

        // Step 2: Precompute the areas of all boxes
        std::vector<float> areas(numBoxes, 0.0f);
        for (size_t i = 0; i < numBoxes; ++i) {
            areas[i] = boundingBoxes[i].width * boundingBoxes[i].height;
        }

        // Step 3: Suppression mask to mark boxes that are suppressed
        std::vector<bool> suppressed(numBoxes, false);

        // Step 4: Iterate through the sorted list and suppress boxes with high IoU
        for (size_t i = 0; i < sortedIndices.size(); ++i) {
            int currentIdx = sortedIndices[i];
            if (suppressed[currentIdx]) {
                continue;
            }

            // Select the current box as a valid detection
            indices.push_back(currentIdx);

            const BoundingBox& currentBox = boundingBoxes[currentIdx];
            const float x1_max = currentBox.x;
            const float y1_max = currentBox.y;
            const float x2_max = currentBox.x + currentBox.width;
            const float y2_max = currentBox.y + currentBox.height;
            const float area_current = areas[currentIdx];

            // Compare IoU of the current box with the rest
            for (size_t j = i + 1; j < sortedIndices.size(); ++j) {
                int compareIdx = sortedIndices[j];
                if (suppressed[compareIdx]) {
                    continue;
                }

                const BoundingBox& compareBox = boundingBoxes[compareIdx];
                const float x1 = std::max(x1_max, static_cast<float>(compareBox.x));
                const float y1 = std::max(y1_max, static_cast<float>(compareBox.y));
                const float x2 = std::min(x2_max, static_cast<float>(compareBox.x + compareBox.width));
                const float y2 = std::min(y2_max, static_cast<float>(compareBox.y + compareBox.height));

                const float interWidth = x2 - x1;
                const float interHeight = y2 - y1;

                if (interWidth <= 0 || interHeight <= 0) {
                    continue;
                }

                const float intersection = interWidth * interHeight;
                const float unionArea = area_current + areas[compareIdx] - intersection;
                const float iou = (unionArea > 0.0f) ? (intersection / unionArea) : 0.0f;

                if (iou > nmsThreshold) {
                    suppressed[compareIdx] = true;
                }
            }
        }
    }

}