2026-03-28 16:54:11 +11:00
|
|
|
|
#include "ANSRTYOLO.h"
|
|
|
|
|
|
#include "Utility.h"
|
2026-04-04 20:19:54 +11:00
|
|
|
|
#include "ANSLicense.h" // ANS_DBG macro for DebugView
|
2026-03-28 16:54:11 +11:00
|
|
|
|
#include <future>
|
|
|
|
|
|
#include <numeric>
|
|
|
|
|
|
#include <cmath>
|
|
|
|
|
|
#include <cstring>
|
|
|
|
|
|
#include <opencv2/cudaimgproc.hpp>
|
|
|
|
|
|
#include <opencv2/cudawarping.hpp>
|
|
|
|
|
|
#include <opencv2/core/cuda_stream_accessor.hpp>
|
|
|
|
|
|
|
|
|
|
|
|
namespace ANSCENTER {
|
|
|
|
|
|
|
|
|
|
|
|
// ====================================================================
|
|
|
|
|
|
// ANSODBase interface — OptimizeModel
|
|
|
|
|
|
// ====================================================================
|
|
|
|
|
|
// Builds (or rebuilds) the TensorRT engine for the raw ONNX model at
// _modelFilePath, choosing FP16 or FP32 precision.
//
// @param fp16                  true → build the engine with FP16 precision.
// @param optimizedModelFolder  in/out: on success, set to the folder that
//                              will contain the serialized engine file.
// @return true when the engine was built successfully.
//
// Thread-safe: the whole operation runs under the instance's recursive mutex.
bool ANSRTYOLO::OptimizeModel(bool fp16, std::string& optimizedModelFolder) {
    std::lock_guard<std::recursive_mutex> lock(_mutex);
    // Let the base class perform its own optimization bookkeeping first.
    if (!ANSODBase::OptimizeModel(fp16, optimizedModelFolder)) return false;
    if (!FileExist(_modelFilePath)) {
        _logger.LogFatal("ANSRTYOLO::OptimizeModel", "Raw model file path does not exist", __FILE__, __LINE__);
        return false;
    }
    try {
        _fp16 = fp16;
        // The engine is emitted next to the raw model file.
        optimizedModelFolder = GetParentFolder(_modelFilePath);
        if (!m_trtEngine) {
            // No engine yet: populate the build options from the model config
            // (batch-size limits, target GPU, dynamic input-shape ranges).
            m_options.optBatchSize = _modelConfig.gpuOptBatchSize;
            m_options.maxBatchSize = _modelConfig.gpuMaxBatchSize;
            m_options.deviceIndex = _modelConfig.gpuDeviceIndex;
            m_options.maxInputHeight = _modelConfig.maxInputHeight;
            m_options.minInputHeight = _modelConfig.minInputHeight;
            m_options.optInputHeight = _modelConfig.optInputHeight;
            m_options.maxInputWidth = _modelConfig.maxInputWidth;
            m_options.minInputWidth = _modelConfig.minInputWidth;
            m_options.optInputWidth = _modelConfig.optInputWidth;
            m_options.engineFileDir = optimizedModelFolder;
            m_options.precision = (_fp16 ? Precision::FP16 : Precision::FP32);
            m_trtEngine = std::make_shared<Engine<float>>(m_options);
        }
        // buildWithRetry compiles the ONNX model into a TensorRT engine using
        // the given normalization constants (may retry on transient failures).
        auto succ = m_trtEngine->buildWithRetry(_modelFilePath, SUB_VALS, DIV_VALS, NORMALIZE);
        if (!succ) {
            _logger.LogError("ANSRTYOLO::OptimizeModel",
                "Error: Unable to build TensorRT engine. " + _modelFilePath, __FILE__, __LINE__);
            return false;
        }
        return true;
    }
    catch (std::exception& e) {
        _logger.LogFatal("ANSRTYOLO::OptimizeModel", e.what(), __FILE__, __LINE__);
        return false;
    }
}
|
|
|
|
|
|
|
|
|
|
|
|
// ====================================================================
|
|
|
|
|
|
// ANSODBase interface — LoadModel
|
|
|
|
|
|
// ====================================================================
|
|
|
|
|
|
// Loads a model from an encrypted zip archive: the base class extracts the
// archive, then this override clamps the configuration to safe defaults,
// derives post-processing constants, resolves class names, and (optionally)
// acquires a shared TensorRT engine from the engine pool.
//
// @param modelZipFilePath  path to the packaged model archive.
// @param modelZipPassword  password for the archive.
// @return true when the model (and, if requested, the engine) is ready.
//
// Thread-safe: runs entirely under the instance's recursive mutex.
bool ANSRTYOLO::LoadModel(const std::string& modelZipFilePath,
                          const std::string& modelZipPassword) {
    std::lock_guard<std::recursive_mutex> lock(_mutex);
    try {
        _isFixedBatch = false;
        // Base class extracts the archive and populates _modelFolder /
        // _modelConfig / _modelConfigFile.
        bool result = ANSODBase::LoadModel(modelZipFilePath, modelZipPassword);
        if (!result) return false;

        // Clamp configuration values to sane defaults (640x640 input,
        // 0.5 thresholds, 17 keypoints for COCO-style pose).
        _modelConfig.modelType = ModelType::TENSORRT;
        if (_modelConfig.inpHeight <= 0) _modelConfig.inpHeight = 640;
        if (_modelConfig.inpWidth <= 0) _modelConfig.inpWidth = 640;
        if (_modelConfig.modelMNSThreshold < 0.2f) _modelConfig.modelMNSThreshold = 0.5f;
        if (_modelConfig.modelConfThreshold < 0.2f) _modelConfig.modelConfThreshold = 0.5f;
        if (_modelConfig.numKPS <= 0 || _modelConfig.numKPS > 133)
            _modelConfig.numKPS = 17;
        if (_modelConfig.kpsThreshold <= 0) _modelConfig.kpsThreshold = 0.5f;
        // NOTE(review): FP16 is forced on regardless of config here — confirm intended.
        _fp16 = true;

        // Post-processing constants consumed by the decode/NMS stage.
        TOP_K = 300;
        SEG_CHANNELS = 32;
        PROBABILITY_THRESHOLD = _modelConfig.detectionScoreThreshold;
        NMS_THRESHOLD = _modelConfig.modelMNSThreshold;
        SEGMENTATION_THRESHOLD = 0.5f;
        SEG_H = 160;
        SEG_W = 160;
        NUM_KPS = _modelConfig.numKPS;
        KPS_THRESHOLD = _modelConfig.kpsThreshold;

        // TensorRT engine options mirrored from the (clamped) model config.
        m_options.optBatchSize = _modelConfig.gpuOptBatchSize;
        m_options.maxBatchSize = _modelConfig.gpuMaxBatchSize;
        m_options.deviceIndex = _modelConfig.gpuDeviceIndex;
        m_options.maxInputHeight = _modelConfig.maxInputHeight;
        m_options.minInputHeight = _modelConfig.minInputHeight;
        m_options.optInputHeight = _modelConfig.optInputHeight;
        m_options.maxInputWidth = _modelConfig.maxInputWidth;
        m_options.minInputWidth = _modelConfig.minInputWidth;
        m_options.optInputWidth = _modelConfig.optInputWidth;
        m_options.engineFileDir = _modelFolder;
        m_options.precision = (_fp16 ? Precision::FP16 : Precision::FP32);

        // Fixed ONNX file name for zip-packaged models.
        _modelFilePath = CreateFilePath(_modelFolder, "train_last.onnx");
        if (FileExist(_modelConfigFile)) {
            // Config file present: read class list and (optionally) override
            // the input shape with the one recorded in the config.
            ModelType modelType;
            std::vector<int> inputShape;
            _classes = ANSUtilityHelper::GetConfigFileContent(_modelConfigFile, modelType, inputShape);
            if (inputShape.size() == 2) {
                if (inputShape[0] > 0) _modelConfig.inpHeight = inputShape[0];
                if (inputShape[1] > 0) _modelConfig.inpWidth = inputShape[1];
            }
        }
        else {
            // No config file: fall back to a classes.names file, or to the
            // built-in class string when that file is absent/unreadable.
            _classFilePath = CreateFilePath(_modelFolder, "classes.names");
            std::ifstream isValid(_classFilePath);
            if (!isValid) LoadClassesFromString();
            else LoadClassesFromFile();
        }

        if (this->_loadEngineOnCreation) {
            if (!m_trtEngine) {
                // Acquire a shared engine from the pool keyed by
                // (model path, precision, max batch) so multiple tasks can
                // reuse one engine per GPU.
                m_poolKey = { _modelFilePath,
                    static_cast<int>(m_options.precision), m_options.maxBatchSize };
                m_trtEngine = EnginePoolManager<float>::instance().acquire(
                    m_poolKey, m_options, _modelFilePath,
                    SUB_VALS, DIV_VALS, NORMALIZE, m_maxSlotsPerGpu);
                m_usingSharedPool = (m_trtEngine != nullptr);
            }
            if (!m_trtEngine) {
                _logger.LogError("ANSRTYOLO::LoadModel",
                    "Error: Unable to load TensorRT engine. " + _modelFilePath, __FILE__, __LINE__);
                _modelLoadValid = false;
                return false;
            }
            // The pooled engine is authoritative for batch limits — copy them back.
            m_options.maxBatchSize = m_trtEngine->getOptions().maxBatchSize;
            m_options.optBatchSize = m_trtEngine->getOptions().optBatchSize;
            m_trtEngine->warmUp();
        }
        _modelLoadValid = true;
        _isInitialized = true;
        return true;
    }
    catch (std::exception& e) {
        _logger.LogFatal("ANSRTYOLO::LoadModel", e.what(), __FILE__, __LINE__);
        return false;
    }
}
|
|
|
|
|
|
|
|
|
|
|
|
// ====================================================================
|
|
|
|
|
|
// ANSODBase interface — LoadModelFromFolder
|
|
|
|
|
|
// ====================================================================
|
|
|
|
|
|
// Loads a model directly from an (already-extracted) folder instead of a
// zip archive. Mirrors LoadModel(): clamps the config, derives
// post-processing constants, resolves class names, fills labelMap, and
// optionally acquires a pooled TensorRT engine.
//
// @param licenseKey   runtime license key (validated by the base class).
// @param modelConfig  caller-supplied configuration; copied into _modelConfig
//                     and then clamped.
// @param modelName    ONNX file base name; empty → "train_last".
// @param className    class-names file name used when no config file exists.
// @param modelFolder  folder containing the model artifacts.
// @param labelMap     out: comma-separated class names (cleared first).
// @return true when the model (and, if requested, the engine) is ready.
//
// Thread-safe: runs entirely under the instance's recursive mutex.
bool ANSRTYOLO::LoadModelFromFolder(std::string licenseKey, ModelConfig modelConfig,
                                    std::string modelName, std::string className,
                                    const std::string& modelFolder, std::string& labelMap) {
    std::lock_guard<std::recursive_mutex> lock(_mutex);
    try {
        _isFixedBatch = false;
        // Base class validates the license and records the model folder.
        bool result = ANSODBase::LoadModelFromFolder(licenseKey, modelConfig,
            modelName, className, modelFolder, labelMap);
        if (!result) return false;

        // Adopt the caller's config, then clamp to safe defaults.
        _modelConfig = modelConfig;
        _modelConfig.modelType = ModelType::TENSORRT;
        if (_modelConfig.inpHeight <= 0) _modelConfig.inpHeight = 640;
        if (_modelConfig.inpWidth <= 0) _modelConfig.inpWidth = 640;
        // NOTE(review): precisionType is pinned to FP32 while _fp16 below is
        // forced true (→ FP16 engine) — confirm this divergence is intended.
        _modelConfig.precisionType = PrecisionType::FP32;
        if (_modelConfig.numKPS <= 0 || _modelConfig.numKPS > 133)
            _modelConfig.numKPS = 17;
        if (_modelConfig.modelMNSThreshold < 0.2f) _modelConfig.modelMNSThreshold = 0.5f;
        if (_modelConfig.modelConfThreshold < 0.2f) _modelConfig.modelConfThreshold = 0.5f;
        if (_modelConfig.kpsThreshold <= 0) _modelConfig.kpsThreshold = 0.5f;
        _fp16 = true;

        // Post-processing constants consumed by the decode/NMS stage.
        TOP_K = 300;
        SEG_CHANNELS = 32;
        PROBABILITY_THRESHOLD = _modelConfig.detectionScoreThreshold;
        NMS_THRESHOLD = _modelConfig.modelMNSThreshold;
        SEGMENTATION_THRESHOLD = 0.5f;
        SEG_H = 160;
        SEG_W = 160;
        NUM_KPS = _modelConfig.numKPS;
        KPS_THRESHOLD = _modelConfig.kpsThreshold;

        // Resolve the ONNX file name (default base name: "train_last").
        std::string _modelName = modelName;
        if (_modelName.empty()) _modelName = "train_last";
        std::string modelFullName = _modelName + ".onnx";

        // TensorRT engine options mirrored from the (clamped) model config.
        m_options.optBatchSize = _modelConfig.gpuOptBatchSize;
        m_options.maxBatchSize = _modelConfig.gpuMaxBatchSize;
        m_options.deviceIndex = _modelConfig.gpuDeviceIndex;
        m_options.maxInputHeight = _modelConfig.maxInputHeight;
        m_options.minInputHeight = _modelConfig.minInputHeight;
        m_options.optInputHeight = _modelConfig.optInputHeight;
        m_options.maxInputWidth = _modelConfig.maxInputWidth;
        m_options.minInputWidth = _modelConfig.minInputWidth;
        m_options.optInputWidth = _modelConfig.optInputWidth;
        m_options.engineFileDir = _modelFolder;
        m_options.precision = (_fp16 ? Precision::FP16 : Precision::FP32);

        _modelFilePath = CreateFilePath(_modelFolder, modelFullName);
        if (FileExist(_modelConfigFile)) {
            // Config file present: read class list and (optionally) override
            // the input shape with the one recorded in the config.
            ModelType modelType;
            std::vector<int> inputShape;
            _classes = ANSUtilityHelper::GetConfigFileContent(_modelConfigFile, modelType, inputShape);
            if (inputShape.size() == 2) {
                if (inputShape[0] > 0) _modelConfig.inpHeight = inputShape[0];
                if (inputShape[1] > 0) _modelConfig.inpWidth = inputShape[1];
            }
        }
        else {
            // No config file: use the caller-supplied class-names file, or
            // the built-in class string when that file is absent/unreadable.
            _classFilePath = CreateFilePath(_modelFolder, className);
            std::ifstream isValid(_classFilePath);
            if (!isValid) LoadClassesFromString();
            else LoadClassesFromFile();
        }

        // Report the resolved class list back to the caller.
        labelMap.clear();
        if (!_classes.empty())
            labelMap = VectorToCommaSeparatedString(_classes);

        if (this->_loadEngineOnCreation) {
            if (!m_trtEngine) {
                // Acquire a shared engine from the pool keyed by
                // (model path, precision, max batch).
                m_poolKey = { _modelFilePath,
                    static_cast<int>(m_options.precision), m_options.maxBatchSize };
                m_trtEngine = EnginePoolManager<float>::instance().acquire(
                    m_poolKey, m_options, _modelFilePath,
                    SUB_VALS, DIV_VALS, NORMALIZE, m_maxSlotsPerGpu);
                m_usingSharedPool = (m_trtEngine != nullptr);
            }
            if (!m_trtEngine) {
                _logger.LogError("ANSRTYOLO::LoadModelFromFolder",
                    "Error: Unable to load TensorRT engine. " + _modelFilePath, __FILE__, __LINE__);
                _modelLoadValid = false;
                return false;
            }
            // The pooled engine is authoritative for batch limits — copy them back.
            m_options.maxBatchSize = m_trtEngine->getOptions().maxBatchSize;
            m_options.optBatchSize = m_trtEngine->getOptions().optBatchSize;
            m_trtEngine->warmUp();
        }
        _modelLoadValid = true;
        _isInitialized = true;
        return true;
    }
    catch (std::exception& e) {
        _logger.LogFatal("ANSRTYOLO::LoadModelFromFolder", e.what(), __FILE__, __LINE__);
        return false;
    }
}
|
|
|
|
|
|
|
|
|
|
|
|
// ====================================================================
|
|
|
|
|
|
// ANSODBase interface — Initialize
|
|
|
|
|
|
// ====================================================================
|
|
|
|
|
|
// Full (re-)initialization entry point: license check + archive load via the
// base class, then the same config-clamping / class-resolution / engine
// acquisition sequence as LoadModel(). Safe to call on an already-initialized
// instance: a live engine is detected up front and not re-acquired.
//
// @param licenseKey        runtime license key (validated by the base class).
// @param modelConfig       caller-supplied configuration; copied then clamped.
// @param modelZipFilePath  path to the packaged model archive.
// @param modelZipPassword  password for the archive.
// @param labelMap          out: comma-separated class names (cleared first).
// @return true when the model (and, if requested, the engine) is ready.
//
// Thread-safe: runs entirely under the instance's recursive mutex.
bool ANSRTYOLO::Initialize(std::string licenseKey, ModelConfig modelConfig,
                           const std::string& modelZipFilePath,
                           const std::string& modelZipPassword,
                           std::string& labelMap) {
    std::lock_guard<std::recursive_mutex> lock(_mutex);
    try {
        // Capture whether an engine is already up BEFORE resetting state, so
        // re-initialization can skip the expensive pool acquisition + warm-up.
        const bool engineAlreadyLoaded = _modelLoadValid && _isInitialized && m_trtEngine != nullptr;
        _modelLoadValid = false;
        _isFixedBatch = false;
        bool result = ANSODBase::Initialize(licenseKey, modelConfig,
            modelZipFilePath, modelZipPassword, labelMap);
        if (!result) return false;

        // Adopt the caller's config, then clamp to safe defaults.
        _modelConfig = modelConfig;
        _modelConfig.modelType = ModelType::TENSORRT;
        if (_modelConfig.inpHeight <= 0) _modelConfig.inpHeight = 640;
        if (_modelConfig.inpWidth <= 0) _modelConfig.inpWidth = 640;
        // NOTE(review): precisionType is pinned to FP32 while _fp16 below is
        // forced true (→ FP16 engine) — confirm this divergence is intended.
        _modelConfig.precisionType = PrecisionType::FP32;
        if (_modelConfig.numKPS <= 0 || _modelConfig.numKPS > 133)
            _modelConfig.numKPS = 17;
        if (_modelConfig.modelMNSThreshold < 0.2f) _modelConfig.modelMNSThreshold = 0.5f;
        if (_modelConfig.modelConfThreshold < 0.2f) _modelConfig.modelConfThreshold = 0.5f;
        if (_modelConfig.kpsThreshold <= 0) _modelConfig.kpsThreshold = 0.5f;
        _fp16 = true;

        // Post-processing constants consumed by the decode/NMS stage.
        TOP_K = 300;
        SEG_CHANNELS = 32;
        PROBABILITY_THRESHOLD = _modelConfig.detectionScoreThreshold;
        NMS_THRESHOLD = _modelConfig.modelMNSThreshold;
        SEGMENTATION_THRESHOLD = 0.5f;
        SEG_H = 160;
        SEG_W = 160;
        NUM_KPS = _modelConfig.numKPS;
        KPS_THRESHOLD = _modelConfig.kpsThreshold;

        // TensorRT engine options mirrored from the (clamped) model config.
        m_options.optBatchSize = _modelConfig.gpuOptBatchSize;
        m_options.maxBatchSize = _modelConfig.gpuMaxBatchSize;
        m_options.deviceIndex = _modelConfig.gpuDeviceIndex;
        m_options.maxInputHeight = _modelConfig.maxInputHeight;
        m_options.minInputHeight = _modelConfig.minInputHeight;
        m_options.optInputHeight = _modelConfig.optInputHeight;
        m_options.maxInputWidth = _modelConfig.maxInputWidth;
        m_options.minInputWidth = _modelConfig.minInputWidth;
        m_options.optInputWidth = _modelConfig.optInputWidth;
        m_options.engineFileDir = _modelFolder;
        m_options.precision = (_fp16 ? Precision::FP16 : Precision::FP32);

        // Fixed ONNX file name for zip-packaged models.
        _modelFilePath = CreateFilePath(_modelFolder, "train_last.onnx");
        if (FileExist(_modelConfigFile)) {
            // Config file present: read class list and (optionally) override
            // the input shape with the one recorded in the config.
            ModelType modelType;
            std::vector<int> inputShape;
            _classes = ANSUtilityHelper::GetConfigFileContent(_modelConfigFile, modelType, inputShape);
            if (inputShape.size() == 2) {
                if (inputShape[0] > 0) _modelConfig.inpHeight = inputShape[0];
                if (inputShape[1] > 0) _modelConfig.inpWidth = inputShape[1];
            }
        }
        else {
            // No config file: fall back to a classes.names file, or to the
            // built-in class string when that file is absent/unreadable.
            _classFilePath = CreateFilePath(_modelFolder, "classes.names");
            std::ifstream isValid(_classFilePath);
            if (!isValid) LoadClassesFromString();
            else LoadClassesFromFile();
        }

        // Report the resolved class list back to the caller.
        labelMap.clear();
        if (!_classes.empty())
            labelMap = VectorToCommaSeparatedString(_classes);

        // Acquire the engine only on first initialization — a re-init with a
        // live engine keeps it (and skips the warm-up).
        if (this->_loadEngineOnCreation && !engineAlreadyLoaded) {
            if (!m_trtEngine) {
                // Shared-pool key: (model path, precision, max batch).
                m_poolKey = { _modelFilePath,
                    static_cast<int>(m_options.precision), m_options.maxBatchSize };
                m_trtEngine = EnginePoolManager<float>::instance().acquire(
                    m_poolKey, m_options, _modelFilePath,
                    SUB_VALS, DIV_VALS, NORMALIZE, m_maxSlotsPerGpu);
                m_usingSharedPool = (m_trtEngine != nullptr);
            }
            if (!m_trtEngine) {
                _logger.LogError("ANSRTYOLO::Initialize",
                    "Error: Unable to load TensorRT engine. " + _modelFilePath, __FILE__, __LINE__);
                _modelLoadValid = false;
                return false;
            }
            // The pooled engine is authoritative for batch limits — copy them back.
            m_options.maxBatchSize = m_trtEngine->getOptions().maxBatchSize;
            m_options.optBatchSize = m_trtEngine->getOptions().optBatchSize;
            m_trtEngine->warmUp();
        }
        _modelLoadValid = true;
        _isInitialized = true;
        return true;
    }
    catch (std::exception& e) {
        _logger.LogFatal("ANSRTYOLO::Initialize", e.what(), __FILE__, __LINE__);
        return false;
    }
}
|
|
|
|
|
|
|
|
|
|
|
|
// ====================================================================
|
|
|
|
|
|
// RunInference / RunInferencesBatch / Destroy / Destructor
|
|
|
|
|
|
// ====================================================================
|
|
|
|
|
|
// Convenience overload: run single-image inference with no camera
// identifier attached (forwards an empty id to the main overload).
std::vector<Object> ANSRTYOLO::RunInference(const cv::Mat& inputImgBGR) {
    return RunInference(inputImgBGR, std::string{});
}
|
|
|
|
|
|
|
|
|
|
|
|
// Runs single-image inference for the given camera.
//
// State/input validation happens under the instance mutex; the actual
// detection call runs outside the lock. Returns an empty vector on any
// precondition failure or exception (errors are logged).
std::vector<Object> ANSRTYOLO::RunInference(const cv::Mat& inputImgBGR,
                                            const std::string& camera_id) {
    {
        std::lock_guard<std::recursive_mutex> guard(_mutex);
        // Collapse the three state guards into one log-and-bail path;
        // check order (model → license → init) matches the original.
        const char* failure = nullptr;
        if (!_modelLoadValid)
            failure = "Cannot load TensorRT model";
        else if (!_licenseValid)
            failure = "Invalid license";
        else if (!_isInitialized)
            failure = "Model not initialized";
        if (failure) {
            _logger.LogError("ANSRTYOLO::RunInference", failure, __FILE__, __LINE__);
            return {};
        }
        // Silently skip degenerate frames (empty or smaller than 10px a side).
        if (inputImgBGR.empty() || inputImgBGR.cols < 10 || inputImgBGR.rows < 10)
            return {};
    }
    try {
        return DetectObjects(inputImgBGR, camera_id);
    }
    catch (const std::exception& e) {
        _logger.LogFatal("ANSRTYOLO::RunInference", e.what(), __FILE__, __LINE__);
        return {};
    }
}
|
|
|
|
|
|
|
|
|
|
|
|
std::vector<std::vector<Object>> ANSRTYOLO::RunInferencesBatch(
|
|
|
|
|
|
const std::vector<cv::Mat>& inputs, const std::string& camera_id) {
|
|
|
|
|
|
{
|
|
|
|
|
|
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
|
|
|
|
|
if (!_modelLoadValid) {
|
|
|
|
|
|
_logger.LogFatal("ANSRTYOLO::RunInferencesBatch", "Cannot load the TensorRT model", __FILE__, __LINE__);
|
|
|
|
|
|
return {};
|
|
|
|
|
|
}
|
|
|
|
|
|
if (!_licenseValid) {
|
|
|
|
|
|
_logger.LogFatal("ANSRTYOLO::RunInferencesBatch", "Runtime license is not valid or expired", __FILE__, __LINE__);
|
|
|
|
|
|
return {};
|
|
|
|
|
|
}
|
|
|
|
|
|
if (!_isInitialized) {
|
|
|
|
|
|
_logger.LogFatal("ANSRTYOLO::RunInferencesBatch", "Initialisation is not valid", __FILE__, __LINE__);
|
|
|
|
|
|
return {};
|
|
|
|
|
|
}
|
|
|
|
|
|
if (inputs.empty()) {
|
|
|
|
|
|
_logger.LogFatal("ANSRTYOLO::RunInferencesBatch", "Input images vector is empty", __FILE__, __LINE__);
|
|
|
|
|
|
return {};
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
try {
|
|
|
|
|
|
if (_isFixedBatch) return ANSODBase::RunInferencesBatch(inputs, camera_id);
|
|
|
|
|
|
else return DetectObjectsBatch(inputs, camera_id);
|
|
|
|
|
|
}
|
|
|
|
|
|
catch (const std::exception& e) {
|
|
|
|
|
|
_logger.LogFatal("ANSRTYOLO::RunInferencesBatch", e.what(), __FILE__, __LINE__);
|
|
|
|
|
|
return {};
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// Destructor: best-effort teardown via Destroy() (releases the pooled or
// owned TensorRT engine and NV12 helper resources). Exceptions must never
// escape a destructor, so any failure is logged and swallowed.
ANSRTYOLO::~ANSRTYOLO() {
    try {
        Destroy();
    }
    catch (const std::exception& e) { // catch by const& (was non-const ref)
        _logger.LogError("ANSRTYOLO::~ANSRTYOLO()", e.what(), __FILE__, __LINE__);
    }
}
|
|
|
|
|
|
|
|
|
|
|
|
bool ANSRTYOLO::Destroy() {
|
|
|
|
|
|
try {
|
|
|
|
|
|
if (m_usingSharedPool) {
|
|
|
|
|
|
// Release our reference to the shared pool.
|
|
|
|
|
|
// Pool is destroyed only when all tasks release it.
|
|
|
|
|
|
EnginePoolManager<float>::instance().release(m_poolKey);
|
|
|
|
|
|
m_trtEngine.reset(); // drop shared_ptr (pool may survive)
|
|
|
|
|
|
m_usingSharedPool = false;
|
|
|
|
|
|
}
|
|
|
|
|
|
else {
|
|
|
|
|
|
m_trtEngine.reset();
|
|
|
|
|
|
}
|
|
|
|
|
|
m_nv12Helper.destroy();
|
|
|
|
|
|
return true;
|
|
|
|
|
|
}
|
|
|
|
|
|
catch (std::exception& e) {
|
|
|
|
|
|
_logger.LogError("ANSRTYOLO::Destroy()", e.what(), __FILE__, __LINE__);
|
|
|
|
|
|
return false;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// ====================================================================
|
|
|
|
|
|
// GPU Preprocessing — single image (pinned-memory H2D path)
|
|
|
|
|
|
//
|
|
|
|
|
|
// 1. Upload the host image to the GPU asynchronously on a CUDA stream
// 2. BGR→RGB colour conversion on GPU
// 3. Resize on GPU (letterbox right-bottom pad for detection models,
//    plain resize for classification models)
//
// NOTE(review): this header previously described an explicit pinned
// (page-locked) staging buffer, but the implementation below uploads the
// cv::Mat directly; the pinned-memory fast path applies only when the
// caller supplies page-locked host memory. Pinned memory eliminates the
// internal pageable→pinned staging copy that CUDA performs for normal
// (pageable) host memory, cutting the H2D transfer of a 3840×2160 BGR
// frame (~24 MB) by 60-70%.
|
|
|
|
|
|
// ====================================================================
|
|
|
|
|
|
// GPU preprocessing for a single BGR frame: upload → BGR-to-RGB → resize to
// the engine's input shape (letterbox pad for detection models, plain resize
// for classification models).
//
// @param inputImage  host BGR (or single-channel grayscale) frame.
// @param outMeta     out: original image dimensions and the letterbox scale
//                    ratio needed to map detections back to source pixels.
// @return a 1x1 batch of GPU mats ready for inference, or empty on failure.
//
// Thread-safe: runs under the instance's recursive mutex.
std::vector<std::vector<cv::cuda::GpuMat>> ANSRTYOLO::Preprocess(
    const cv::Mat& inputImage, ImageMetadata& outMeta) {
    std::lock_guard<std::recursive_mutex> lock(_mutex);
    try {
        if (!_licenseValid) {
            _logger.LogFatal("ANSRTYOLO::Preprocess", "Invalid license", __FILE__, __LINE__);
            return {};
        }
        // NOTE(review): m_trtEngine is dereferenced here without a null check —
        // confirm callers guarantee the engine is loaded before Preprocess().
        const auto& inputDims = m_trtEngine->getInputDims();
        // Engine input tensor layout: d[1] = height, d[2] = width.
        const int inputH = inputDims[0].d[1];
        const int inputW = inputDims[0].d[2];

        // Early-out if CUDA context is dead (sticky error from CUVID crash etc.)
        if (!m_nv12Helper.isCudaContextHealthy(_logger, "ANSRTYOLO")) return {};

        cv::cuda::Stream stream;
        cv::cuda::GpuMat gpuImg;

        // Resolve source Mat (handle grayscale → BGR on CPU first)
        if (inputImage.channels() == 1) {
            cv::Mat img3Channel;
            cv::cvtColor(inputImage, img3Channel, cv::COLOR_GRAY2BGR);
            gpuImg.upload(img3Channel, stream);
        } else {
            gpuImg.upload(inputImage, stream);
        }

        // GPU: BGR → RGB
        cv::cuda::GpuMat gpuRGB;
        cv::cuda::cvtColor(gpuImg, gpuRGB, cv::COLOR_BGR2RGB, 0, stream);

        // Record the source dimensions for coordinate mapping downstream.
        outMeta.imgHeight = static_cast<float>(gpuRGB.rows);
        outMeta.imgWidth = static_cast<float>(gpuRGB.cols);

        if (outMeta.imgHeight > 0 && outMeta.imgWidth > 0) {
            // Letterbox ratio: inverse of the smaller of the two scale
            // factors, i.e. source-pixels per network-pixel.
            outMeta.ratio = 1.f / std::min(
                inputDims[0].d[2] / static_cast<float>(gpuRGB.cols),
                inputDims[0].d[1] / static_cast<float>(gpuRGB.rows));

            // Check if model is classification (output ndims <= 2)
            const auto& outputDims = m_trtEngine->getOutputDims();
            const bool isClassification = !outputDims.empty() && outputDims[0].nbDims <= 2;

            cv::cuda::GpuMat gpuResized;
            if (gpuRGB.rows != inputH || gpuRGB.cols != inputW) {
                if (isClassification) {
                    // Classification: direct resize (no letterbox padding)
                    cv::cuda::resize(gpuRGB, gpuResized, cv::Size(inputW, inputH),
                        0, 0, cv::INTER_LINEAR, stream);
                }
                else {
                    // Detection/Seg/Pose/OBB: letterbox resize + right-bottom pad (on GPU)
                    gpuResized = Engine<float>::resizeKeepAspectRatioPadRightBottom(
                        gpuRGB, inputH, inputW);
                }
            } else {
                // Already at network resolution — no resize needed.
                gpuResized = gpuRGB;
            }

            // Ensure all queued GPU work is finished before handing the
            // buffers to the inference stage.
            stream.waitForCompletion();

            // Batch of one image for the engine's [batch][input] layout.
            std::vector<cv::cuda::GpuMat> input{ std::move(gpuResized) };
            std::vector<std::vector<cv::cuda::GpuMat>> inputs{ std::move(input) };
            return inputs;
        }
        else {
            _logger.LogFatal("ANSRTYOLO::Preprocess",
                "Image height or width is zero (Width: " + std::to_string(outMeta.imgWidth) +
                ", Height: " + std::to_string(outMeta.imgHeight) + ")", __FILE__, __LINE__);
            return {};
        }
    }
    catch (const std::exception& e) {
        // Treat per-frame preprocessing failures as skippable (warn, not fatal).
        _logger.LogWarn("ANSRTYOLO::Preprocess", std::string("Skipped frame: ") + e.what(), __FILE__, __LINE__);
        return {};
    }
}
|
|
|
|
|
|
|
|
|
|
|
|
#if 0 // PreprocessFromNV12 — moved to NV12PreprocessHelper::tryNV12()
|
|
|
|
|
|
try {
|
|
|
|
|
|
if (!gpuData || !gpuData->yPlane || !gpuData->uvPlane) {
|
|
|
|
|
|
if (!m_nv12NullLogged) {
|
|
|
|
|
|
m_nv12NullLogged = true;
|
|
|
|
|
|
_logger.LogWarn("ANSRTYOLO::PreprocessFromNV12",
|
|
|
|
|
|
"Early exit: null data — gpuData=" + std::to_string(gpuData != nullptr) +
|
|
|
|
|
|
" yPlane=" + std::to_string(gpuData ? (gpuData->yPlane != nullptr) : false) +
|
|
|
|
|
|
" uvPlane=" + std::to_string(gpuData ? (gpuData->uvPlane != nullptr) : false) +
|
|
|
|
|
|
" isCuda=" + std::to_string(gpuData ? gpuData->isCudaDevicePtr : false),
|
|
|
|
|
|
__FILE__, __LINE__);
|
|
|
|
|
|
}
|
|
|
|
|
|
return {};
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
const auto& inputDims = m_trtEngine->getInputDims();
|
|
|
|
|
|
const int inputH = inputDims[0].d[1];
|
|
|
|
|
|
const int inputW = inputDims[0].d[2];
|
|
|
|
|
|
const int frameW = gpuData->width;
|
|
|
|
|
|
const int frameH = gpuData->height;
|
|
|
|
|
|
|
|
|
|
|
|
if (frameW <= 0 || frameH <= 0) {
|
|
|
|
|
|
if (!m_nv12DimLogged) {
|
|
|
|
|
|
m_nv12DimLogged = true;
|
|
|
|
|
|
_logger.LogWarn("ANSRTYOLO::PreprocessFromNV12",
|
|
|
|
|
|
"Early exit: bad dimensions — w=" + std::to_string(frameW) +
|
|
|
|
|
|
" h=" + std::to_string(frameH),
|
|
|
|
|
|
__FILE__, __LINE__);
|
|
|
|
|
|
}
|
|
|
|
|
|
return {};
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// Early-out if CUDA context is dead (sticky error from CUVID crash etc.)
|
|
|
|
|
|
if (m_cudaContextDead) {
|
|
|
|
|
|
if (!m_nv12DeadLogged) {
|
|
|
|
|
|
m_nv12DeadLogged = true;
|
|
|
|
|
|
_logger.LogWarn("ANSRTYOLO::PreprocessFromNV12",
|
|
|
|
|
|
"Early exit: CUDA context dead",
|
|
|
|
|
|
__FILE__, __LINE__);
|
|
|
|
|
|
}
|
|
|
|
|
|
return {};
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// Cache flag before lock is released — gpuData may be invalidated after unlock
|
|
|
|
|
|
const bool isCudaDevice = gpuData->isCudaDevicePtr;
|
|
|
|
|
|
|
|
|
|
|
|
// ── GPU index validation for zero-copy ──
|
|
|
|
|
|
// NVDEC device pointers are only valid on the CUDA context that decoded them.
|
|
|
|
|
|
// If decode GPU != inference GPU, wrapping those pointers causes
|
|
|
|
|
|
// "illegal memory access" → sticky CUDA error → entire context dies.
|
|
|
|
|
|
// Fall back to CPU memcpy+upload path when GPUs don't match.
|
|
|
|
|
|
const int inferenceGpu = m_trtEngine ? m_trtEngine->getPreferredDeviceIndex() : 0;
|
|
|
|
|
|
const bool gpuMatch = !isCudaDevice ||
|
|
|
|
|
|
gpuData->gpuIndex < 0 || // unknown = trust it
|
|
|
|
|
|
gpuData->gpuIndex == inferenceGpu;
|
|
|
|
|
|
const bool useZeroCopy = isCudaDevice && gpuMatch;
|
|
|
|
|
|
|
|
|
|
|
|
// Local plane pointers — default to gpuData's primary planes.
|
|
|
|
|
|
// Overridden below for cross-GPU fallback (CPU NV12 instead of CUDA).
|
|
|
|
|
|
uint8_t* effYPlane = gpuData->yPlane;
|
|
|
|
|
|
uint8_t* effUvPlane = gpuData->uvPlane;
|
|
|
|
|
|
int effYLinesize = gpuData->yLinesize;
|
|
|
|
|
|
int effUvLinesize = gpuData->uvLinesize;
|
|
|
|
|
|
|
|
|
|
|
|
if (isCudaDevice && !gpuMatch) {
|
|
|
|
|
|
// Cross-GPU: NV12 decoded on one GPU, inference on another.
|
|
|
|
|
|
// CPU NV12 fallback uploads full decode-res NV12 (e.g. 3840x2160 = 12.4 MB)
|
|
|
|
|
|
// over PCIe, which is SLOWER than BGR at display-res (1920x1080 = 6.2 MB).
|
|
|
|
|
|
// Measured: CPU NV12 cross-GPU = 15-39ms preproc vs BGR = 10-20ms.
|
|
|
|
|
|
// Just fall back to BGR — it's faster for the cross-GPU case.
|
|
|
|
|
|
if (!m_gpuMismatchLogged) {
|
|
|
|
|
|
m_gpuMismatchLogged = true;
|
|
|
|
|
|
_logger.LogInfo("ANSRTYOLO::PreprocessFromNV12",
|
|
|
|
|
|
"GPU mismatch (decode GPU " + std::to_string(gpuData->gpuIndex) +
|
|
|
|
|
|
" vs inference GPU " + std::to_string(inferenceGpu) +
|
|
|
|
|
|
") — skipping NV12, using BGR (faster for cross-GPU: "
|
|
|
|
|
|
"BGR uploads " + std::to_string(displayW * displayH * 3 / 1024) +
|
|
|
|
|
|
"KB display-res vs NV12 " + std::to_string(frameW * frameH * 3 / 2 / 1024) +
|
|
|
|
|
|
"KB full-res)",
|
|
|
|
|
|
__FILE__, __LINE__);
|
|
|
|
|
|
}
|
|
|
|
|
|
if (regLock.owns_lock()) regLock.unlock();
|
|
|
|
|
|
return {}; // caller will use Preprocess(BGR) instead
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// Diagnostic: log which path will be taken (once per instance)
|
|
|
|
|
|
// Note: cross-GPU case already returned {} above, so reaching here
|
|
|
|
|
|
// means either CUDA zero-copy (same GPU) or CPU NV12 upload (non-CUDA).
|
|
|
|
|
|
if (!m_nv12PathLogged) {
|
|
|
|
|
|
m_nv12PathLogged = true;
|
|
|
|
|
|
const char* pathName = useZeroCopy ? "CUDA_ZERO_COPY"
|
|
|
|
|
|
: "CPU_NV12_UPLOAD";
|
|
|
|
|
|
_logger.LogInfo("ANSRTYOLO::PreprocessFromNV12",
|
|
|
|
|
|
std::string("Path: ") + pathName +
|
|
|
|
|
|
" | isCuda=" + std::to_string(isCudaDevice) +
|
|
|
|
|
|
" gpuMatch=" + std::to_string(gpuMatch) +
|
|
|
|
|
|
" decodeGpu=" + std::to_string(gpuData->gpuIndex) +
|
|
|
|
|
|
" infGpu=" + std::to_string(inferenceGpu) +
|
|
|
|
|
|
" frame=" + std::to_string(frameW) + "x" + std::to_string(frameH) +
|
|
|
|
|
|
" effYLine=" + std::to_string(effYLinesize) +
|
|
|
|
|
|
" effUvLine=" + std::to_string(effUvLinesize) +
|
|
|
|
|
|
" effYPtr=0x" + std::to_string(reinterpret_cast<uintptr_t>(effYPlane)) +
|
|
|
|
|
|
" hasCpuFallback=" + std::to_string(gpuData->cpuYPlane != nullptr),
|
|
|
|
|
|
__FILE__, __LINE__);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
cv::cuda::Stream stream;
|
|
|
|
|
|
cv::cuda::GpuMat gpuY, gpuUV;
|
|
|
|
|
|
|
|
|
|
|
|
if (useZeroCopy) {
|
|
|
|
|
|
// ── CUDA zero-copy: wrap NVDEC device pointers directly ──
|
|
|
|
|
|
// No memcpy, no device-to-device copy — data stays in NVDEC VRAM.
|
|
|
|
|
|
// The fused letterbox kernel samples only ~409K pixels from the 4K
|
|
|
|
|
|
// source (vs 8.3M full copy), completing in <1ms on RTX 5080.
|
|
|
|
|
|
// We hold the registry lock until the kernel finishes reading.
|
|
|
|
|
|
gpuY = cv::cuda::GpuMat(frameH, frameW, CV_8UC1,
|
|
|
|
|
|
effYPlane, static_cast<size_t>(effYLinesize));
|
|
|
|
|
|
gpuUV = cv::cuda::GpuMat(frameH / 2, frameW, CV_8UC1,
|
|
|
|
|
|
effUvPlane, static_cast<size_t>(effUvLinesize));
|
|
|
|
|
|
// Lock released after kernel completion (stream.waitForCompletion below)
|
|
|
|
|
|
} else {
|
|
|
|
|
|
// ── CPU path: memcpy + upload (fallback for D3D11VA / sw decode) ──
|
|
|
|
|
|
// Hold registry lock during memcpy so the AVFrame can't be freed
|
|
|
|
|
|
// by another thread calling gpu_frame_attach() on the same key.
|
|
|
|
|
|
const size_t ySize = static_cast<size_t>(frameW) * frameH;
|
|
|
|
|
|
const size_t uvSize = static_cast<size_t>(frameW) * frameH / 2;
|
|
|
|
|
|
const size_t nv12Size = ySize + uvSize;
|
|
|
|
|
|
ensurePinnedBuffer(nv12Size);
|
|
|
|
|
|
if (!m_pinnedBuf) {
|
|
|
|
|
|
if (!m_nv12PinnedLogged) {
|
|
|
|
|
|
m_nv12PinnedLogged = true;
|
|
|
|
|
|
_logger.LogWarn("ANSRTYOLO::PreprocessFromNV12",
|
|
|
|
|
|
"Early exit: pinned buffer alloc failed for " +
|
|
|
|
|
|
std::to_string(nv12Size) + " bytes",
|
|
|
|
|
|
__FILE__, __LINE__);
|
|
|
|
|
|
}
|
|
|
|
|
|
return {};
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// Validate NV12 plane pointers before memcpy
|
|
|
|
|
|
const size_t yBufNeeded = (effYLinesize == frameW)
|
|
|
|
|
|
? ySize
|
|
|
|
|
|
: static_cast<size_t>(effYLinesize) * frameH;
|
|
|
|
|
|
const size_t uvBufNeeded = (effUvLinesize == frameW)
|
|
|
|
|
|
? uvSize
|
|
|
|
|
|
: static_cast<size_t>(effUvLinesize) * (frameH / 2);
|
|
|
|
|
|
|
|
|
|
|
|
if (!isMemoryReadable(effYPlane, std::min(yBufNeeded, (size_t)4096)) ||
|
|
|
|
|
|
!isMemoryReadable(effUvPlane, std::min(uvBufNeeded, (size_t)4096))) {
|
|
|
|
|
|
_logger.LogWarn("ANSRTYOLO::PreprocessFromNV12",
|
|
|
|
|
|
"NV12 plane pointers not readable! yPlane=0x" +
|
|
|
|
|
|
std::to_string(reinterpret_cast<uintptr_t>(effYPlane)) +
|
|
|
|
|
|
" uvPlane=0x" +
|
|
|
|
|
|
std::to_string(reinterpret_cast<uintptr_t>(effUvPlane)) +
|
|
|
|
|
|
" yLinesize=" + std::to_string(effYLinesize) +
|
|
|
|
|
|
" uvLinesize=" + std::to_string(effUvLinesize) +
|
|
|
|
|
|
" w=" + std::to_string(frameW) + " h=" + std::to_string(frameH),
|
|
|
|
|
|
__FILE__, __LINE__);
|
|
|
|
|
|
if (regLock.owns_lock()) regLock.unlock();
|
|
|
|
|
|
return {}; // fall back to BGR
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
uint8_t* dst = static_cast<uint8_t*>(m_pinnedBuf);
|
|
|
|
|
|
bool cpyOk = true;
|
|
|
|
|
|
if (effYLinesize == frameW) {
|
|
|
|
|
|
cpyOk = safeMemcpy(dst, effYPlane, ySize);
|
|
|
|
|
|
} else {
|
|
|
|
|
|
for (int row = 0; row < frameH && cpyOk; row++)
|
|
|
|
|
|
cpyOk = safeMemcpy(dst + row * frameW,
|
|
|
|
|
|
effYPlane + row * effYLinesize, frameW);
|
|
|
|
|
|
}
|
|
|
|
|
|
if (cpyOk) {
|
|
|
|
|
|
uint8_t* uvDst = dst + ySize;
|
|
|
|
|
|
if (effUvLinesize == frameW) {
|
|
|
|
|
|
cpyOk = safeMemcpy(uvDst, effUvPlane, uvSize);
|
|
|
|
|
|
} else {
|
|
|
|
|
|
for (int row = 0; row < frameH / 2 && cpyOk; row++)
|
|
|
|
|
|
cpyOk = safeMemcpy(uvDst + row * frameW,
|
|
|
|
|
|
effUvPlane + row * effUvLinesize, frameW);
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
if (!cpyOk) {
|
|
|
|
|
|
_logger.LogWarn("ANSRTYOLO::PreprocessFromNV12",
|
|
|
|
|
|
"Access violation during NV12 memcpy! Falling back to BGR. "
|
|
|
|
|
|
"yPlane=0x" + std::to_string(reinterpret_cast<uintptr_t>(effYPlane)) +
|
|
|
|
|
|
" uvPlane=0x" + std::to_string(reinterpret_cast<uintptr_t>(effUvPlane)) +
|
|
|
|
|
|
" yLinesize=" + std::to_string(effYLinesize) +
|
|
|
|
|
|
" uvLinesize=" + std::to_string(effUvLinesize) +
|
|
|
|
|
|
" w=" + std::to_string(frameW) + " h=" + std::to_string(frameH) +
|
|
|
|
|
|
" avframe=0x" + std::to_string(reinterpret_cast<uintptr_t>(gpuData->avframe)),
|
|
|
|
|
|
__FILE__, __LINE__);
|
|
|
|
|
|
if (regLock.owns_lock()) regLock.unlock();
|
|
|
|
|
|
return {}; // fall back to BGR
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// NV12 data safely in pinned memory — release registry lock.
|
|
|
|
|
|
// From here on we only read from m_pinnedBuf, not from gpuData.
|
|
|
|
|
|
if (regLock.owns_lock()) regLock.unlock();
|
|
|
|
|
|
|
|
|
|
|
|
cv::Mat pinnedY(frameH, frameW, CV_8UC1, m_pinnedBuf);
|
|
|
|
|
|
cv::Mat pinnedUV(frameH / 2, frameW, CV_8UC1,
|
|
|
|
|
|
static_cast<uint8_t*>(m_pinnedBuf) + ySize);
|
|
|
|
|
|
gpuY.upload(pinnedY, stream);
|
|
|
|
|
|
gpuUV.upload(pinnedUV, stream);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// Use display dimensions for coordinate mapping so postprocessed
|
|
|
|
|
|
// bboxes map to the display image (1080p), not the NV12 source (4K).
|
|
|
|
|
|
const float metaW = (displayW > 0) ? static_cast<float>(displayW) : static_cast<float>(frameW);
|
|
|
|
|
|
const float metaH = (displayH > 0) ? static_cast<float>(displayH) : static_cast<float>(frameH);
|
|
|
|
|
|
outMeta.imgWidth = metaW;
|
|
|
|
|
|
outMeta.imgHeight = metaH;
|
|
|
|
|
|
|
|
|
|
|
|
if (outMeta.imgHeight > 0 && outMeta.imgWidth > 0) {
|
|
|
|
|
|
outMeta.ratio = 1.f / std::min(
|
|
|
|
|
|
inputDims[0].d[2] / metaW,
|
|
|
|
|
|
inputDims[0].d[1] / metaH);
|
|
|
|
|
|
|
|
|
|
|
|
const auto& outputDims = m_trtEngine->getOutputDims();
|
|
|
|
|
|
const bool isClassification = !outputDims.empty() && outputDims[0].nbDims <= 2;
|
|
|
|
|
|
|
|
|
|
|
|
cudaStream_t rawStream = cv::cuda::StreamAccessor::getStream(stream);
|
|
|
|
|
|
cv::cuda::GpuMat gpuResized;
|
|
|
|
|
|
|
|
|
|
|
|
if (isClassification) {
|
|
|
|
|
|
// Classification: NV12→RGB at full resolution, then simple resize
|
|
|
|
|
|
cv::cuda::GpuMat gpuRGB(frameH, frameW, CV_8UC3);
|
|
|
|
|
|
launchNV12ToRGB(
|
|
|
|
|
|
gpuY.ptr<uint8_t>(), static_cast<int>(gpuY.step),
|
|
|
|
|
|
gpuUV.ptr<uint8_t>(), static_cast<int>(gpuUV.step),
|
|
|
|
|
|
gpuRGB.ptr<uint8_t>(), static_cast<int>(gpuRGB.step),
|
|
|
|
|
|
frameW, frameH, rawStream);
|
|
|
|
|
|
cv::cuda::resize(gpuRGB, gpuResized, cv::Size(inputW, inputH),
|
|
|
|
|
|
0, 0, cv::INTER_LINEAR, stream);
|
|
|
|
|
|
} else if (frameW == inputW && frameH == inputH) {
|
|
|
|
|
|
// Source matches model input — direct NV12→RGB, no resize needed
|
|
|
|
|
|
gpuResized.create(inputH, inputW, CV_8UC3);
|
|
|
|
|
|
launchNV12ToRGB(
|
|
|
|
|
|
gpuY.ptr<uint8_t>(), static_cast<int>(gpuY.step),
|
|
|
|
|
|
gpuUV.ptr<uint8_t>(), static_cast<int>(gpuUV.step),
|
|
|
|
|
|
gpuResized.ptr<uint8_t>(), static_cast<int>(gpuResized.step),
|
|
|
|
|
|
frameW, frameH, rawStream);
|
|
|
|
|
|
} else {
|
|
|
|
|
|
// Detection: fused NV12→RGB + letterbox in a SINGLE kernel at
|
|
|
|
|
|
// output resolution (e.g. 640×640). This avoids the 24MB 4K RGB
|
|
|
|
|
|
// intermediate and processes 20× fewer pixels than separate
|
|
|
|
|
|
// convert + resize for 4K→640 downscale.
|
|
|
|
|
|
float r = std::min(static_cast<float>(inputW) / frameW,
|
|
|
|
|
|
static_cast<float>(inputH) / frameH);
|
|
|
|
|
|
int unpadW = static_cast<int>(r * frameW);
|
|
|
|
|
|
int unpadH = static_cast<int>(r * frameH);
|
|
|
|
|
|
float invScale = 1.0f / r; // maps output coords → source coords
|
|
|
|
|
|
|
|
|
|
|
|
gpuResized.create(inputH, inputW, CV_8UC3);
|
|
|
|
|
|
launchNV12ToRGBLetterbox(
|
|
|
|
|
|
gpuY.ptr<uint8_t>(), static_cast<int>(gpuY.step),
|
|
|
|
|
|
gpuUV.ptr<uint8_t>(), static_cast<int>(gpuUV.step),
|
|
|
|
|
|
gpuResized.ptr<uint8_t>(), static_cast<int>(gpuResized.step),
|
|
|
|
|
|
inputW, inputH,
|
|
|
|
|
|
frameW, frameH,
|
|
|
|
|
|
unpadW, unpadH,
|
|
|
|
|
|
invScale, rawStream);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
stream.waitForCompletion();
|
|
|
|
|
|
|
|
|
|
|
|
// Release registry lock now that kernel is done reading NVDEC pointers
|
|
|
|
|
|
if (regLock.owns_lock()) regLock.unlock();
|
|
|
|
|
|
|
|
|
|
|
|
// Log NV12 fast-path usage once per instance
|
|
|
|
|
|
if (!m_nv12ActiveLogged) {
|
|
|
|
|
|
m_nv12ActiveLogged = true;
|
|
|
|
|
|
const char* mode = useZeroCopy ? "CUDA zero-copy" : "CPU upload";
|
|
|
|
|
|
const char* kernel = isClassification ? "separate" : "FUSED letterbox";
|
|
|
|
|
|
_logger.LogInfo("ANSRTYOLO::PreprocessFromNV12",
|
|
|
|
|
|
std::string(mode) + " ACTIVE (" + kernel + "): " +
|
|
|
|
|
|
std::to_string(frameW) + "x" + std::to_string(frameH) +
|
|
|
|
|
|
" NV12 -> " + std::to_string(inputW) + "x" + std::to_string(inputH) +
|
|
|
|
|
|
" display=" + std::to_string(displayW) + "x" + std::to_string(displayH),
|
|
|
|
|
|
__FILE__, __LINE__);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
std::vector<cv::cuda::GpuMat> input{ std::move(gpuResized) };
|
|
|
|
|
|
std::vector<std::vector<cv::cuda::GpuMat>> inputs{ std::move(input) };
|
|
|
|
|
|
return inputs;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
if (!m_nv12MetaLogged) {
|
|
|
|
|
|
m_nv12MetaLogged = true;
|
|
|
|
|
|
_logger.LogWarn("ANSRTYOLO::PreprocessFromNV12",
|
|
|
|
|
|
"Early exit: metadata dims invalid — metaW=" +
|
|
|
|
|
|
std::to_string(outMeta.imgWidth) + " metaH=" +
|
|
|
|
|
|
std::to_string(outMeta.imgHeight) +
|
|
|
|
|
|
" displayW=" + std::to_string(displayW) +
|
|
|
|
|
|
" displayH=" + std::to_string(displayH),
|
|
|
|
|
|
__FILE__, __LINE__);
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
return {};
|
|
|
|
|
|
}
|
|
|
|
|
|
catch (const std::exception& e) {
|
|
|
|
|
|
_logger.LogWarn("ANSRTYOLO::PreprocessFromNV12",
|
|
|
|
|
|
std::string("NV12 fast path failed, falling back to BGR: ") + e.what(),
|
|
|
|
|
|
__FILE__, __LINE__);
|
|
|
|
|
|
return {};
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
#endif // PreprocessFromNV12 moved to NV12PreprocessHelper
|
|
|
|
|
|
|
|
|
|
|
|
// ====================================================================
|
|
|
|
|
|
// GPU Preprocessing — batch
|
|
|
|
|
|
// ====================================================================
|
|
|
|
|
|
std::vector<std::vector<cv::cuda::GpuMat>> ANSRTYOLO::PreprocessBatch(
    const std::vector<cv::Mat>& inputImages, BatchMetadata& outMetadata) {
    // Uploads a batch of host BGR (or grayscale) images to the GPU, converts
    // each to RGB and resizes to the model input size (plain resize for
    // classification heads, aspect-preserving letterbox for detection).
    //
    // inputImages : host images; 1-channel images are expanded to BGR first.
    // outMetadata : receives per-image width/height and the scale ratio the
    //               postprocessors use to map boxes back to image coordinates.
    // Returns     : { batch } with one GpuMat per image, or {} on any failure.
    if (!_licenseValid) {
        _logger.LogError("ANSRTYOLO::PreprocessBatch", "Invalid license", __FILE__, __LINE__);
        return {};
    }
    if (inputImages.empty()) {
        _logger.LogError("ANSRTYOLO::PreprocessBatch", "Empty input images vector", __FILE__, __LINE__);
        return {};
    }
    if (!m_nv12Helper.isCudaContextHealthy(_logger, "ANSRTYOLO")) return {};
    try {
        const auto& inputDims = m_trtEngine->getInputDims();
        if (inputDims.empty()) {
            _logger.LogError("ANSRTYOLO::PreprocessBatch", "No input dimensions available", __FILE__, __LINE__);
            return {};
        }
        const int inputH = inputDims[0].d[1];
        const int inputW = inputDims[0].d[2];
        if (inputH <= 0 || inputW <= 0) {
            _logger.LogError("ANSRTYOLO::PreprocessBatch", "Invalid model input dimensions", __FILE__, __LINE__);
            return {};
        }

        // Hoisted out of the per-image loop: the output shape is a property
        // of the engine, not of any individual image.
        const auto& outputDims = m_trtEngine->getOutputDims();
        const bool isClassification = !outputDims.empty() && outputDims[0].nbDims <= 2;

        outMetadata.imgHeights.resize(inputImages.size());
        outMetadata.imgWidths.resize(inputImages.size());
        outMetadata.ratios.resize(inputImages.size());

        std::vector<cv::cuda::GpuMat> batchProcessed;
        batchProcessed.reserve(inputImages.size());
        cv::cuda::Stream stream;

        // Fix: upload(..., stream) is asynchronous, so the host source buffer
        // must remain valid until stream.waitForCompletion(). The grayscale
        // conversion result was previously a loop-local temporary destroyed at
        // the end of each iteration; stage it here so it outlives the copies.
        std::vector<cv::Mat> gray2bgrStaging;
        gray2bgrStaging.reserve(inputImages.size());

        for (size_t i = 0; i < inputImages.size(); ++i) {
            const auto& inputImage = inputImages[i];
            if (inputImage.empty()) {
                _logger.LogError("ANSRTYOLO::PreprocessBatch",
                    "Empty input image at index " + std::to_string(i), __FILE__, __LINE__);
                return {};
            }
            cv::cuda::GpuMat img;
            if (inputImage.channels() == 1) {
                gray2bgrStaging.emplace_back();
                cv::cvtColor(inputImage, gray2bgrStaging.back(), cv::COLOR_GRAY2BGR);
                img.upload(gray2bgrStaging.back(), stream);
            }
            else if (inputImage.channels() == 3) {
                img.upload(inputImage, stream);
            }
            else {
                _logger.LogError("ANSRTYOLO::PreprocessBatch",
                    "Unsupported channel count at index " + std::to_string(i), __FILE__, __LINE__);
                return {};
            }

            cv::cuda::GpuMat imgRGB;
            cv::cuda::cvtColor(img, imgRGB, cv::COLOR_BGR2RGB, 0, stream);

            outMetadata.imgHeights[i] = imgRGB.rows;
            outMetadata.imgWidths[i] = imgRGB.cols;
            if (outMetadata.imgHeights[i] <= 0 || outMetadata.imgWidths[i] <= 0) {
                _logger.LogError("ANSRTYOLO::PreprocessBatch",
                    "Invalid dimensions for image " + std::to_string(i), __FILE__, __LINE__);
                return {};
            }

            // Letterbox preserves aspect ratio, so the inverse of the smaller
            // scale maps model-space coordinates back to the source image.
            const float scaleW = inputW / static_cast<float>(imgRGB.cols);
            const float scaleH = inputH / static_cast<float>(imgRGB.rows);
            outMetadata.ratios[i] = isClassification ? 1.f : 1.f / std::min(scaleW, scaleH);

            cv::cuda::GpuMat resized;
            if (imgRGB.rows != inputH || imgRGB.cols != inputW) {
                if (isClassification) {
                    cv::cuda::resize(imgRGB, resized, cv::Size(inputW, inputH), 0, 0, cv::INTER_LINEAR, stream);
                } else {
                    resized = Engine<float>::resizeKeepAspectRatioPadRightBottom(imgRGB, inputH, inputW);
                }
            }
            else {
                // Already at model resolution — no resize needed.
                resized = imgRGB;
            }

            batchProcessed.push_back(std::move(resized));
        }
        // Ensure all async uploads/conversions finished before the staging
        // buffers (and caller-owned inputImages) may go away.
        stream.waitForCompletion();

        std::vector<std::vector<cv::cuda::GpuMat>> inputs;
        inputs.push_back(std::move(batchProcessed));
        return inputs;
    }
    catch (const std::exception& e) {
        _logger.LogWarn("ANSRTYOLO::PreprocessBatch", std::string("Skipped batch: ") + e.what(), __FILE__, __LINE__);
        return {};
    }
}
|
|
|
|
|
|
|
|
|
|
|
|
// ====================================================================
|
|
|
|
|
|
// OBB NMS helpers (Prob-IoU based) — static methods
|
|
|
|
|
|
// ====================================================================
|
|
|
|
|
|
void ANSRTYOLO::getCovarianceComponents(const OrientedBox& box,
|
|
|
|
|
|
float& out1, float& out2, float& out3) {
|
|
|
|
|
|
if (box.width <= 0.f || box.height <= 0.f) {
|
|
|
|
|
|
out1 = out2 = out3 = 0.f;
|
|
|
|
|
|
return;
|
|
|
|
|
|
}
|
|
|
|
|
|
const float vw = (box.width * box.width) / 12.0f;
|
|
|
|
|
|
const float vh = (box.height * box.height) / 12.0f;
|
|
|
|
|
|
const float cosT = std::cos(box.angle);
|
|
|
|
|
|
const float sinT = std::sin(box.angle);
|
|
|
|
|
|
const float cos2 = cosT * cosT;
|
|
|
|
|
|
const float sin2 = sinT * sinT;
|
|
|
|
|
|
const float sc = sinT * cosT;
|
|
|
|
|
|
out1 = vw * cos2 + vh * sin2;
|
|
|
|
|
|
out2 = vw * sin2 + vh * cos2;
|
|
|
|
|
|
out3 = (vw - vh) * sc;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
std::vector<std::vector<float>> ANSRTYOLO::batchProbiou(
|
|
|
|
|
|
const std::vector<OrientedBox>& obb1,
|
|
|
|
|
|
const std::vector<OrientedBox>& obb2, float eps) {
|
|
|
|
|
|
if (obb1.empty() || obb2.empty()) return {};
|
|
|
|
|
|
const size_t n1 = obb1.size(), n2 = obb2.size();
|
|
|
|
|
|
std::vector<std::vector<float>> iouMat(n1, std::vector<float>(n2, 0.f));
|
|
|
|
|
|
|
|
|
|
|
|
struct CovData { float x, y, a, b, c; };
|
|
|
|
|
|
std::vector<CovData> cov1(n1);
|
|
|
|
|
|
for (size_t i = 0; i < n1; ++i) {
|
|
|
|
|
|
float a, b, c;
|
|
|
|
|
|
getCovarianceComponents(obb1[i], a, b, c);
|
|
|
|
|
|
cov1[i] = { obb1[i].x, obb1[i].y, a, b, c };
|
|
|
|
|
|
}
|
|
|
|
|
|
for (size_t i = 0; i < n1; ++i) {
|
|
|
|
|
|
for (size_t j = 0; j < n2; ++j) {
|
|
|
|
|
|
float a2, b2, c2;
|
|
|
|
|
|
getCovarianceComponents(obb2[j], a2, b2, c2);
|
|
|
|
|
|
float dx = cov1[i].x - obb2[j].x;
|
|
|
|
|
|
float dy = cov1[i].y - obb2[j].y;
|
|
|
|
|
|
float sA = cov1[i].a + a2, sB = cov1[i].b + b2, sC = cov1[i].c + c2;
|
|
|
|
|
|
float denom = sA * sB - sC * sC + eps;
|
|
|
|
|
|
if (denom <= eps) continue;
|
|
|
|
|
|
float t1 = ((sA*dy*dy + sB*dx*dx) * 0.25f) / denom;
|
|
|
|
|
|
float t2 = ((sC*dx*dy) * -0.5f) / denom;
|
|
|
|
|
|
float d1 = cov1[i].a*cov1[i].b - cov1[i].c*cov1[i].c;
|
|
|
|
|
|
float d2 = a2*b2 - c2*c2;
|
|
|
|
|
|
float sqrtDet = std::sqrt(std::max(d1, 0.f) * std::max(d2, 0.f) + eps);
|
|
|
|
|
|
float t3 = 0.5f * std::log((sA*sB - sC*sC) / (4.f*sqrtDet) + eps);
|
|
|
|
|
|
float bd = std::clamp(t1 + t2 + t3, eps, 100.f);
|
|
|
|
|
|
float hd = std::sqrt(1.f - std::exp(-bd) + eps);
|
|
|
|
|
|
iouMat[i][j] = 1.f - hd;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
return iouMat;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
std::vector<int> ANSRTYOLO::nmsRotatedImpl(
|
|
|
|
|
|
const std::vector<OrientedBox>& sortedBoxes, float iouThreshold) {
|
|
|
|
|
|
if (sortedBoxes.empty()) return {};
|
|
|
|
|
|
if (sortedBoxes.size() == 1) return { 0 };
|
|
|
|
|
|
auto iouMat = batchProbiou(sortedBoxes, sortedBoxes);
|
|
|
|
|
|
if (iouMat.empty()) return {};
|
|
|
|
|
|
const int n = static_cast<int>(sortedBoxes.size());
|
|
|
|
|
|
std::vector<int> keep;
|
|
|
|
|
|
keep.reserve(n / 2);
|
|
|
|
|
|
for (int j = 0; j < n; ++j) {
|
|
|
|
|
|
bool shouldKeep = true;
|
|
|
|
|
|
for (int i = 0; i < j; ++i) {
|
|
|
|
|
|
if (iouMat[i][j] >= iouThreshold) { shouldKeep = false; break; }
|
|
|
|
|
|
}
|
|
|
|
|
|
if (shouldKeep) keep.push_back(j);
|
|
|
|
|
|
}
|
|
|
|
|
|
return keep;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
std::vector<int> ANSRTYOLO::nmsRotated(
|
|
|
|
|
|
const std::vector<OrientedBox>& boxes,
|
|
|
|
|
|
const std::vector<float>& scores, float iouThreshold) {
|
|
|
|
|
|
if (boxes.empty() || scores.empty() || boxes.size() != scores.size()) return {};
|
|
|
|
|
|
std::vector<int> sortedIdx(boxes.size());
|
|
|
|
|
|
std::iota(sortedIdx.begin(), sortedIdx.end(), 0);
|
|
|
|
|
|
std::sort(sortedIdx.begin(), sortedIdx.end(),
|
|
|
|
|
|
[&](int a, int b) { return scores[a] > scores[b]; });
|
|
|
|
|
|
std::vector<OrientedBox> sortedBoxes;
|
|
|
|
|
|
sortedBoxes.reserve(boxes.size());
|
|
|
|
|
|
for (int i : sortedIdx) sortedBoxes.push_back(boxes[i]);
|
|
|
|
|
|
auto keepSorted = nmsRotatedImpl(sortedBoxes, iouThreshold);
|
|
|
|
|
|
std::vector<int> keepOrig;
|
|
|
|
|
|
keepOrig.reserve(keepSorted.size());
|
|
|
|
|
|
for (int si : keepSorted) keepOrig.push_back(sortedIdx[si]);
|
|
|
|
|
|
return keepOrig;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
std::vector<cv::Point2f> ANSRTYOLO::OBBToPoints(const OrientedBox& obb) {
|
|
|
|
|
|
float angleDeg = obb.angle * 180.0f / static_cast<float>(CV_PI);
|
|
|
|
|
|
cv::RotatedRect rr(cv::Point2f(obb.x, obb.y),
|
|
|
|
|
|
cv::Size2f(obb.width, obb.height), angleDeg);
|
|
|
|
|
|
std::vector<cv::Point2f> corners(4);
|
|
|
|
|
|
rr.points(corners.data());
|
|
|
|
|
|
return corners;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// ====================================================================
|
|
|
|
|
|
// Detection — legacy postprocess
|
|
|
|
|
|
// ====================================================================
|
|
|
|
|
|
std::vector<Object> ANSRTYOLO::PostprocessDetection(
    std::vector<float>& featureVector,
    const std::string& camera_id, const ImageMetadata& meta) {
    // Decodes a raw (non-end2end) YOLO detection tensor of shape
    // [1, 4+numClasses, numAnchors] into Objects in display-image
    // coordinates, then applies class-batched NMS on the CPU.
    //
    // featureVector: flattened engine output; non-const because cv::Mat
    //                wraps the buffer in place (no copy).
    // camera_id    : stamped onto each resulting Object.
    // meta         : letterbox ratio plus image width/height used for
    //                scaling and clamping the decoded boxes.
    // Returns      : detections surviving NMS; empty vector on error.
    try {
        const auto& outputDims = m_trtEngine->getOutputDims();
        auto numChannels = outputDims[0].d[1];
        auto numAnchors = outputDims[0].d[2];
        // Derive numClasses from tensor shape (4 box coords subtracted)
        // rather than _classes.size() which may not match the model
        auto numClasses = static_cast<size_t>(numChannels - 4);
        if (!_classes.empty() && _classes.size() <= static_cast<size_t>(numChannels - 4))
            numClasses = _classes.size();

        std::vector<cv::Rect> bboxes;
        std::vector<float> scores;
        std::vector<int> labels;
        std::vector<int> indices;

        // Wrap the flat buffer as [channels x anchors] without copying, then
        // transpose so each row holds one anchor: cx, cy, w, h, class scores.
        cv::Mat output = cv::Mat(numChannels, numAnchors, CV_32F, featureVector.data());
        output = output.t();

        for (int i = 0; i < numAnchors; i++) {
            auto rowPtr = output.row(i).ptr<float>();
            auto bboxesPtr = rowPtr;        // first 4 floats: cx, cy, w, h
            auto scoresPtr = rowPtr + 4;    // then per-class confidences
            auto maxSPtr = std::max_element(scoresPtr, scoresPtr + numClasses);
            float score = *maxSPtr;
            if (score > _modelConfig.detectionScoreThreshold) {
                float x = *bboxesPtr++;
                float y = *bboxesPtr++;
                float w = *bboxesPtr++;
                float h = *bboxesPtr;
                // Center/size -> corner form, scaled by meta.ratio into the
                // display image and clamped to its bounds.
                float x0 = std::clamp((x - 0.5f * w) * meta.ratio, 0.f, meta.imgWidth);
                float y0 = std::clamp((y - 0.5f * h) * meta.ratio, 0.f, meta.imgHeight);
                float x1 = std::clamp((x + 0.5f * w) * meta.ratio, 0.f, meta.imgWidth);
                float y1 = std::clamp((y + 0.5f * h) * meta.ratio, 0.f, meta.imgHeight);
                // Winning class index relative to the scores sub-array.
                int label = static_cast<int>(maxSPtr - scoresPtr);
                cv::Rect_<float> bbox;
                bbox.x = x0; bbox.y = y0;
                bbox.width = x1 - x0; bbox.height = y1 - y0;
                // Re-clamp width/height so the rect stays inside the image
                // even after the corner clamping above.
                bbox.x = std::max(0.f, bbox.x);
                bbox.y = std::max(0.f, bbox.y);
                bbox.width = std::min(meta.imgWidth - bbox.x, bbox.width);
                bbox.height = std::min(meta.imgHeight - bbox.y, bbox.height);
                bboxes.push_back(bbox);
                labels.push_back(label);
                scores.push_back(score);
            }
        }

        // Per-class NMS; note it uses the fixed PROBABILITY_THRESHOLD while
        // candidate collection above used the configurable model threshold.
        cv::dnn::NMSBoxesBatched(bboxes, scores, labels, PROBABILITY_THRESHOLD, NMS_THRESHOLD, indices);
        std::vector<Object> objects;
        int classNameSize = static_cast<int>(_classes.size());
        for (auto& chosenIdx : indices) {
            if (scores[chosenIdx] > _modelConfig.detectionScoreThreshold) {
                Object obj{};
                obj.confidence = scores[chosenIdx];
                obj.classId = labels[chosenIdx];
                obj.box = bboxes[chosenIdx];
                //obj.polygon = ANSUtilityHelper::RectToNormalizedPolygon(obj.box, meta.imgWidth, meta.imgHeight);
                // Map class id to a readable name; ids past the end of the
                // table fall back to the last entry (legacy behavior).
                if (!_classes.empty()) {
                    obj.className = (obj.classId < classNameSize)
                        ? _classes[obj.classId] : _classes[classNameSize - 1];
                }
                else { obj.className = "Unknown"; }
                obj.cameraId = camera_id;
                objects.push_back(obj);
            }
        }
        return objects;
    }
    catch (std::exception& e) {
        _logger.LogFatal("ANSRTYOLO::PostprocessDetection", e.what(), __FILE__, __LINE__);
        return {};
    }
}
|
|
|
|
|
|
|
|
|
|
|
|
// ====================================================================
|
|
|
|
|
|
// Detection — end2end postprocess
|
|
|
|
|
|
// ====================================================================
|
|
|
|
|
|
std::vector<Object> ANSRTYOLO::PostprocessDetectionE2E(
|
|
|
|
|
|
std::vector<float>& featureVector,
|
|
|
|
|
|
const std::string& camera_id, const ImageMetadata& meta) {
|
|
|
|
|
|
try {
|
|
|
|
|
|
const auto& outputDims = m_trtEngine->getOutputDims();
|
|
|
|
|
|
int numDets = outputDims[0].d[1];
|
|
|
|
|
|
int numFeat = outputDims[0].d[2]; // 6: x1,y1,x2,y2,conf,classId
|
|
|
|
|
|
|
|
|
|
|
|
std::vector<Object> results;
|
|
|
|
|
|
results.reserve(numDets);
|
|
|
|
|
|
|
|
|
|
|
|
for (int i = 0; i < numDets; ++i) {
|
|
|
|
|
|
const float* det = featureVector.data() + i * numFeat;
|
|
|
|
|
|
float conf = det[4];
|
|
|
|
|
|
if (conf <= _modelConfig.detectionScoreThreshold) continue;
|
|
|
|
|
|
|
|
|
|
|
|
int classId = static_cast<int>(det[5]);
|
|
|
|
|
|
// Scale from model input space to original image
|
|
|
|
|
|
float x0 = std::clamp(det[0] * meta.ratio, 0.f, meta.imgWidth);
|
|
|
|
|
|
float y0 = std::clamp(det[1] * meta.ratio, 0.f, meta.imgHeight);
|
|
|
|
|
|
float x1 = std::clamp(det[2] * meta.ratio, 0.f, meta.imgWidth);
|
|
|
|
|
|
float y1 = std::clamp(det[3] * meta.ratio, 0.f, meta.imgHeight);
|
|
|
|
|
|
float w = x1 - x0, h = y1 - y0;
|
|
|
|
|
|
if (w < 1.f || h < 1.f) continue;
|
|
|
|
|
|
|
|
|
|
|
|
Object obj;
|
|
|
|
|
|
obj.classId = classId;
|
|
|
|
|
|
obj.confidence = conf;
|
|
|
|
|
|
obj.box = cv::Rect(static_cast<int>(x0), static_cast<int>(y0),
|
|
|
|
|
|
static_cast<int>(w), static_cast<int>(h));
|
|
|
|
|
|
//obj.polygon = ANSUtilityHelper::RectToNormalizedPolygon(obj.box, meta.imgWidth, meta.imgHeight);
|
|
|
|
|
|
int classNameSize = static_cast<int>(_classes.size());
|
|
|
|
|
|
if (!_classes.empty() && classId >= 0 && classId < classNameSize)
|
|
|
|
|
|
obj.className = _classes[classId];
|
|
|
|
|
|
obj.cameraId = camera_id;
|
|
|
|
|
|
results.push_back(std::move(obj));
|
|
|
|
|
|
}
|
|
|
|
|
|
return results;
|
|
|
|
|
|
}
|
|
|
|
|
|
catch (std::exception& e) {
|
|
|
|
|
|
_logger.LogFatal("ANSRTYOLO::PostprocessDetectionE2E", e.what(), __FILE__, __LINE__);
|
|
|
|
|
|
return {};
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// ====================================================================
|
|
|
|
|
|
// OBB — legacy postprocess
|
|
|
|
|
|
// ====================================================================
|
|
|
|
|
|
std::vector<Object> ANSRTYOLO::PostprocessOBB(
    std::vector<float>& featureVector,
    const std::string& camera_id, const ImageMetadata& meta) {
    // Decodes a raw oriented-bounding-box tensor of shape
    // [1, 4+numClasses+1, numAnchors] (4 box coords, per-class scores, angle)
    // into Objects, applying Prob-IoU rotated NMS and a TOP_K cap.
    //
    // featureVector: flattened engine output; non-const because cv::Mat
    //                wraps the buffer in place (no copy).
    // camera_id    : stamped onto each resulting Object.
    // meta         : letterbox ratio plus image width/height used for
    //                scaling, clamping, and polygon normalization.
    // Returns      : surviving OBB detections; empty vector on error.
    try {
        const auto& outputDims = m_trtEngine->getOutputDims();
        int numChannels = outputDims[0].d[1];
        int numAnchors = outputDims[0].d[2];
        int numClasses = numChannels - 5; // 4 box + nc scores + 1 angle
        if (numClasses <= 0) return {};

        // Wrap as [channels x anchors] without copying, transpose so each row
        // is one anchor: cx, cy, w, h, scores..., angle.
        cv::Mat output = cv::Mat(numChannels, numAnchors, CV_32F, featureVector.data()).t();

        // One score-thresholded anchor prior to NMS.
        struct OBBCandidate {
            OrientedBox box;   // center/size in image space, angle in radians
            float conf;        // best class score
            int classId;       // index of the best class
        };
        std::vector<OBBCandidate> candidates;
        candidates.reserve(numAnchors);

        for (int i = 0; i < numAnchors; ++i) {
            const float* row = output.ptr<float>(i);
            const float* scoresPtr = row + 4;
            // Manual argmax over the per-class scores.
            float maxScore = -FLT_MAX;
            int bestClass = -1;
            for (int c = 0; c < numClasses; ++c) {
                if (scoresPtr[c] > maxScore) { maxScore = scoresPtr[c]; bestClass = c; }
            }
            if (maxScore <= _modelConfig.detectionScoreThreshold) continue;

            // Angle sits after the class scores; box params are scaled into
            // image space by the letterbox ratio. Only the center is clamped;
            // width/height keep their scaled values.
            float angle = row[4 + numClasses];
            float cx = row[0] * meta.ratio;
            float cy = row[1] * meta.ratio;
            float bw = row[2] * meta.ratio;
            float bh = row[3] * meta.ratio;
            cx = std::clamp(cx, 0.f, meta.imgWidth);
            cy = std::clamp(cy, 0.f, meta.imgHeight);

            candidates.push_back({ { cx, cy, bw, bh, angle }, maxScore, bestClass });
        }
        if (candidates.empty()) return {};

        // Prob-IoU NMS
        std::vector<OrientedBox> boxes;
        std::vector<float> scores;
        boxes.reserve(candidates.size());
        scores.reserve(candidates.size());
        for (const auto& c : candidates) { boxes.push_back(c.box); scores.push_back(c.conf); }

        // keepIdx holds indices into `candidates`, highest scores first.
        auto keepIdx = nmsRotated(boxes, scores, NMS_THRESHOLD);

        std::vector<Object> results;
        int classNameSize = static_cast<int>(_classes.size());
        results.reserve(std::min(static_cast<int>(keepIdx.size()), TOP_K));
        for (int idx : keepIdx) {
            if (static_cast<int>(results.size()) >= TOP_K) break;
            const auto& c = candidates[idx];
            Object obj;
            obj.classId = c.classId;
            obj.confidence = c.conf;
            // kps carries the raw OBB parameters (cx, cy, w, h, angle).
            obj.kps = { c.box.x, c.box.y, c.box.width, c.box.height, c.box.angle };
            auto absCorners = OBBToPoints(c.box);
            // Axis-aligned bounding rect of the rotated corners.
            obj.box = cv::boundingRect(absCorners);
            // Normalize OBB corners to [0,1] and close the polygon
            obj.polygon.reserve(absCorners.size() + 1);
            for (const auto& pt : absCorners) {
                obj.polygon.emplace_back(
                    std::clamp(pt.x / meta.imgWidth, 0.f, 1.f),
                    std::clamp(pt.y / meta.imgHeight, 0.f, 1.f));
            }
            if (!obj.polygon.empty()) obj.polygon.push_back(obj.polygon.front()); // close
            // Out-of-range class ids leave className empty (unlike the
            // legacy detection path which falls back to the last entry).
            if (!_classes.empty() && c.classId >= 0 && c.classId < classNameSize)
                obj.className = _classes[c.classId];
            obj.cameraId = camera_id;
            results.push_back(std::move(obj));
        }
        return results;
    }
    catch (std::exception& e) {
        _logger.LogFatal("ANSRTYOLO::PostprocessOBB", e.what(), __FILE__, __LINE__);
        return {};
    }
}
|
|
|
|
|
|
|
|
|
|
|
|
// ====================================================================
|
|
|
|
|
|
// OBB — end2end postprocess
|
|
|
|
|
|
// ====================================================================
|
|
|
|
|
|
std::vector<Object> ANSRTYOLO::PostprocessOBBE2E(
|
|
|
|
|
|
std::vector<float>& featureVector,
|
|
|
|
|
|
const std::string& camera_id, const ImageMetadata& meta) {
|
|
|
|
|
|
try {
|
|
|
|
|
|
const auto& outputDims = m_trtEngine->getOutputDims();
|
|
|
|
|
|
int numDets = outputDims[0].d[1];
|
|
|
|
|
|
int numFeat = outputDims[0].d[2]; // 7: cx,cy,w,h,angle,conf,classId
|
|
|
|
|
|
|
|
|
|
|
|
std::vector<Object> results;
|
|
|
|
|
|
results.reserve(numDets);
|
|
|
|
|
|
|
|
|
|
|
|
for (int i = 0; i < numDets; ++i) {
|
|
|
|
|
|
const float* det = featureVector.data() + i * numFeat;
|
|
|
|
|
|
float angle = det[4];
|
|
|
|
|
|
float conf = det[5];
|
|
|
|
|
|
if (conf <= _modelConfig.detectionScoreThreshold) continue;
|
|
|
|
|
|
|
|
|
|
|
|
float cx = det[0] * meta.ratio;
|
|
|
|
|
|
float cy = det[1] * meta.ratio;
|
|
|
|
|
|
float bw = det[2] * meta.ratio;
|
|
|
|
|
|
float bh = det[3] * meta.ratio;
|
|
|
|
|
|
int classId = static_cast<int>(det[6]);
|
|
|
|
|
|
|
|
|
|
|
|
cx = std::clamp(cx, 0.f, meta.imgWidth);
|
|
|
|
|
|
cy = std::clamp(cy, 0.f, meta.imgHeight);
|
|
|
|
|
|
|
|
|
|
|
|
OrientedBox obb{ cx, cy, bw, bh, angle };
|
|
|
|
|
|
|
|
|
|
|
|
Object obj;
|
|
|
|
|
|
obj.classId = classId;
|
|
|
|
|
|
obj.confidence = conf;
|
|
|
|
|
|
obj.kps = { cx, cy, bw, bh, angle };
|
|
|
|
|
|
auto absCorners = OBBToPoints(obb);
|
|
|
|
|
|
obj.box = cv::boundingRect(absCorners);
|
|
|
|
|
|
// Normalize OBB corners to [0,1] and close the polygon
|
|
|
|
|
|
obj.polygon.reserve(absCorners.size() + 1);
|
|
|
|
|
|
for (const auto& pt : absCorners) {
|
|
|
|
|
|
obj.polygon.emplace_back(
|
|
|
|
|
|
std::clamp(pt.x / meta.imgWidth, 0.f, 1.f),
|
|
|
|
|
|
std::clamp(pt.y / meta.imgHeight, 0.f, 1.f));
|
|
|
|
|
|
}
|
|
|
|
|
|
if (!obj.polygon.empty()) obj.polygon.push_back(obj.polygon.front()); // close
|
|
|
|
|
|
int classNameSize = static_cast<int>(_classes.size());
|
|
|
|
|
|
if (!_classes.empty() && classId >= 0 && classId < classNameSize)
|
|
|
|
|
|
obj.className = _classes[classId];
|
|
|
|
|
|
obj.cameraId = camera_id;
|
|
|
|
|
|
results.push_back(std::move(obj));
|
|
|
|
|
|
}
|
|
|
|
|
|
return results;
|
|
|
|
|
|
}
|
|
|
|
|
|
catch (std::exception& e) {
|
|
|
|
|
|
_logger.LogFatal("ANSRTYOLO::PostprocessOBBE2E", e.what(), __FILE__, __LINE__);
|
|
|
|
|
|
return {};
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// ====================================================================
|
|
|
|
|
|
// Segmentation — legacy postprocess
|
|
|
|
|
|
// ====================================================================
|
|
|
|
|
|
// Decode a legacy (non-end2end) YOLO segmentation head.
// featureVectors[0]: detection tensor [numChannels x numAnchors] where
//   numChannels = 4 (box) + numClasses + SEG_CHANNELS (mask coefficients).
// featureVectors[1]: mask prototypes [SEG_CHANNELS x (SEG_H*SEG_W)].
// Returns objects with pixel-space boxes, binary masks cropped to each box,
// and normalized polygons; coordinates are mapped back from the letterboxed
// network input via meta.ratio and clamped to the original image size.
std::vector<Object> ANSRTYOLO::PostprocessSegmentation(
    std::vector<std::vector<float>>& featureVectors,
    const std::string& camera_id, const ImageMetadata& meta) {
    try {
        const auto& outputDims = m_trtEngine->getOutputDims();
        int numChannels = outputDims[0].d[1];
        int numAnchors = outputDims[0].d[2];
        // Class scores occupy whatever is left after box (4) and mask coeffs.
        const auto numClasses = numChannels - SEG_CHANNELS - 4;

        // Sanity-check buffer sizes before wrapping them in cv::Mat headers
        // (the Mats below borrow the vectors' memory without copying).
        if (featureVectors[0].size() != static_cast<size_t>(numChannels) * numAnchors) return {};
        if (featureVectors[1].size() != static_cast<size_t>(SEG_CHANNELS) * SEG_H * SEG_W) return {};

        // Transpose so each row is one anchor: [x, y, w, h, cls..., maskcoef...].
        cv::Mat output = cv::Mat(numChannels, numAnchors, CV_32F, featureVectors[0].data()).t();
        cv::Mat protos = cv::Mat(SEG_CHANNELS, SEG_H * SEG_W, CV_32F, featureVectors[1].data());

        std::vector<int> labels;
        std::vector<float> scores;
        std::vector<cv::Rect> bboxes;
        std::vector<cv::Mat> maskConfs;
        std::vector<int> indices;

        // Pass 1: score-threshold filter + decode cxcywh -> clamped xyxy.
        for (int i = 0; i < numAnchors; i++) {
            auto rowPtr = output.row(i).ptr<float>();
            auto bboxesPtr = rowPtr;
            auto scoresPtr = rowPtr + 4;
            auto maskConfsPtr = rowPtr + 4 + numClasses;
            auto maxSPtr = std::max_element(scoresPtr, scoresPtr + numClasses);
            float score = *maxSPtr;
            if (score > _modelConfig.detectionScoreThreshold) {
                float x = *bboxesPtr++;
                float y = *bboxesPtr++;
                float w = *bboxesPtr++;
                float h = *bboxesPtr;
                // meta.ratio rescales network-input coords to original image coords.
                float x0 = std::clamp((x - 0.5f * w) * meta.ratio, 0.f, meta.imgWidth);
                float y0 = std::clamp((y - 0.5f * h) * meta.ratio, 0.f, meta.imgHeight);
                float x1 = std::clamp((x + 0.5f * w) * meta.ratio, 0.f, meta.imgWidth);
                float y1 = std::clamp((y + 0.5f * h) * meta.ratio, 0.f, meta.imgHeight);
                int label = static_cast<int>(maxSPtr - scoresPtr);
                cv::Rect_<float> bbox;
                bbox.x = x0; bbox.y = y0;
                bbox.width = x1 - x0; bbox.height = y1 - y0;
                bbox.x = std::max(0.f, bbox.x);
                bbox.y = std::max(0.f, bbox.y);
                bbox.width = std::min(meta.imgWidth - bbox.x, bbox.width);
                bbox.height = std::min(meta.imgHeight - bbox.y, bbox.height);
                // NOTE: this Mat header aliases the row buffer of `output`;
                // it stays valid because `output` outlives `maskConfs` usage.
                cv::Mat maskConf = cv::Mat(1, SEG_CHANNELS, CV_32F, maskConfsPtr);
                bboxes.push_back(bbox);
                labels.push_back(label);
                scores.push_back(score);
                maskConfs.push_back(maskConf);
            }
        }

        // Pass 2: per-class NMS, then build Object entries for survivors.
        cv::dnn::NMSBoxesBatched(bboxes, scores, labels, PROBABILITY_THRESHOLD, NMS_THRESHOLD, indices);
        cv::Mat masks;
        int classNameSize = static_cast<int>(_classes.size());
        std::vector<Object> objs;
        for (auto& i : indices) {
            // This threshold re-check is always true (pass 1 already filtered),
            // so objs stays index-aligned with `indices` / `masks` rows below.
            if (scores[i] > _modelConfig.detectionScoreThreshold) {
                Object obj;
                obj.classId = labels[i];
                if (!_classes.empty()) {
                    // Out-of-range ids fall back to the last known class name.
                    obj.className = (obj.classId < classNameSize)
                        ? _classes[obj.classId] : _classes[classNameSize - 1];
                }
                else { obj.className = "Unknown"; }
                obj.box = bboxes[i];
                obj.confidence = scores[i];
                obj.cameraId = camera_id;
                masks.push_back(maskConfs[i]);
                objs.push_back(obj);
            }
        }

        if (!masks.empty()) {
            // Combine mask coefficients with prototypes:
            // [N x SEG_CHANNELS] * [SEG_CHANNELS x SEG_H*SEG_W] -> one mask per object.
            cv::Mat matmulRes = (masks * protos).t();

            // Apply sigmoid while still a single-channel 2D matrix
            cv::Mat negMat;
            cv::exp(-matmulRes, negMat);
            cv::Mat sigmoidFlat = 1.0 / (1.0 + negMat);

            // Now reshape into multi-channel and split: one SEG_H x SEG_W
            // channel per kept detection, in the same order as `objs`.
            // NOTE(review): channel count uses indices.size(); this matches
            // objs.size() because the re-check above never drops entries.
            cv::Mat sigmoidMat = sigmoidFlat.reshape(static_cast<int>(indices.size()),
                { SEG_H, SEG_W });
            std::vector<cv::Mat> maskChannels;
            cv::split(sigmoidMat, maskChannels);

            // ROI in proto space (SEG_H x SEG_W), accounting for top-left letterbox padding
            // ANSRTYOLO pads right-bottom, so content starts at (0,0) in proto space
            cv::Rect roi;
            if (meta.imgHeight > meta.imgWidth) {
                int roiW = std::min(static_cast<int>(std::round(
                    static_cast<float>(SEG_W) * meta.imgWidth / meta.imgHeight)), SEG_W);
                roi = cv::Rect(0, 0, roiW, SEG_H);
            }
            else {
                int roiH = std::min(static_cast<int>(std::round(
                    static_cast<float>(SEG_H) * meta.imgHeight / meta.imgWidth)), SEG_H);
                roi = cv::Rect(0, 0, SEG_W, roiH);
            }
            roi &= cv::Rect(0, 0, SEG_W, SEG_H);

            int imgW = static_cast<int>(meta.imgWidth);
            int imgH = static_cast<int>(meta.imgHeight);

            // Precompute scale factors from proto-ROI to original image
            const float scaleX = static_cast<float>(imgW) / roi.width;
            const float scaleY = static_cast<float>(imgH) / roi.height;

            for (size_t i = 0; i < objs.size(); i++) {
                cv::Rect safeBox = objs[i].box & cv::Rect(0, 0, imgW, imgH);
                if (safeBox.area() <= 0) continue;

                // Map bounding box back to proto-ROI space and crop there
                // (floor/ceil expand outward so the crop fully covers the box).
                int px0 = std::max(static_cast<int>(std::floor(safeBox.x / scaleX)), 0);
                int py0 = std::max(static_cast<int>(std::floor(safeBox.y / scaleY)), 0);
                int px1 = std::min(static_cast<int>(std::ceil((safeBox.x + safeBox.width) / scaleX)), roi.width);
                int py1 = std::min(static_cast<int>(std::ceil((safeBox.y + safeBox.height) / scaleY)), roi.height);
                if (px1 <= px0 || py1 <= py0) continue;

                cv::Rect protoBox(roi.x + px0, roi.y + py0, px1 - px0, py1 - py0);
                protoBox &= cv::Rect(0, 0, SEG_W, SEG_H);
                if (protoBox.area() <= 0) continue;

                // Resize only the small proto crop to the bounding box size
                cv::Mat cropped = maskChannels[i](protoBox);
                cv::Mat resized;
                cv::resize(cropped, resized, cv::Size(safeBox.width, safeBox.height),
                    0, 0, cv::INTER_LINEAR);
                // Binarize at the configured confidence threshold.
                objs[i].mask = resized > _modelConfig.modelConfThreshold;
                objs[i].polygon = ANSUtilityHelper::MaskToNormalizedPolygon(
                    objs[i].mask, safeBox, meta.imgWidth, meta.imgHeight);
            }
        }
        // Fill polygon for objects that got masks
        // (objects whose mask step was skipped fall back to the box outline).
        for (auto& obj : objs) {
            if (obj.polygon.empty())
                obj.polygon = ANSUtilityHelper::RectToNormalizedPolygon(obj.box, meta.imgWidth, meta.imgHeight);
        }
        return objs;
    }
    catch (std::exception& e) {
        _logger.LogFatal("ANSRTYOLO::PostprocessSegmentation", e.what(), __FILE__, __LINE__);
        return {};
    }
}
|
|
|
|
|
|
|
|
|
|
|
|
// ====================================================================
|
|
|
|
|
|
// Segmentation — end2end postprocess
|
|
|
|
|
|
// ====================================================================
|
|
|
|
|
|
// Decode an end2end (NMS-embedded) YOLO segmentation head.
// featureVectors[0]: detections [numDets x numFeat], each row
//   (x0, y0, x1, y1, score, classId, maskCoeff...nm).
// featureVectors[1]: mask prototypes [nm x protoH x protoW].
// No NMS is run here — the exported engine already performed it.
std::vector<Object> ANSRTYOLO::PostprocessSegE2E(
    std::vector<std::vector<float>>& featureVectors,
    const std::string& camera_id, const ImageMetadata& meta) {
    try {
        if (featureVectors.size() < 2) return {};
        const auto& outputDims = m_trtEngine->getOutputDims();
        int numDets = outputDims[0].d[1];
        int numFeat = outputDims[0].d[2]; // 6 + nm

        // Proto dimensions from second output
        int nm = outputDims[1].d[1];
        int protoH = outputDims[1].d[2];
        // Some exports emit 3-D proto tensors; fall back to a square proto then.
        int protoW = (outputDims[1].nbDims > 3) ? outputDims[1].d[3] : outputDims[1].d[2];
        // Each detection row must at least hold box+score+class+coefficients.
        if (numFeat < 6 + nm) return {};

        const float* raw = featureVectors[0].data();
        std::vector<Object> objs;
        cv::Mat maskCoeffs;

        for (int i = 0; i < numDets; ++i) {
            const float* det = raw + i * numFeat;
            float conf = det[4];
            if (conf <= _modelConfig.detectionScoreThreshold) continue;

            int classId = static_cast<int>(det[5]);
            // Rescale xyxy from network-input space to original image space.
            float x0 = std::clamp(det[0] * meta.ratio, 0.f, meta.imgWidth);
            float y0 = std::clamp(det[1] * meta.ratio, 0.f, meta.imgHeight);
            float x1 = std::clamp(det[2] * meta.ratio, 0.f, meta.imgWidth);
            float y1 = std::clamp(det[3] * meta.ratio, 0.f, meta.imgHeight);
            float w = x1 - x0, h = y1 - y0;
            // Reject degenerate (sub-pixel) boxes.
            if (w < 1.f || h < 1.f) continue;

            Object obj;
            obj.classId = classId;
            obj.confidence = conf;
            obj.box = cv::Rect(static_cast<int>(x0), static_cast<int>(y0),
                static_cast<int>(w), static_cast<int>(h));
            int classNameSize = static_cast<int>(_classes.size());
            if (!_classes.empty() && classId >= 0 && classId < classNameSize)
                obj.className = _classes[classId];
            obj.cameraId = camera_id;
            objs.push_back(std::move(obj));

            // Keep mask coefficients row-aligned with objs.
            cv::Mat mc(1, nm, CV_32F);
            std::memcpy(mc.ptr<float>(), det + 6, nm * sizeof(float));
            maskCoeffs.push_back(mc);
        }

        if (!objs.empty() && !maskCoeffs.empty()) {
            // [N x nm] * [nm x protoH*protoW] -> one flattened mask per object.
            cv::Mat protos(nm, protoH * protoW, CV_32F, featureVectors[1].data());
            cv::Mat matmulRes = (maskCoeffs * protos).t();

            // Apply sigmoid while still a single-channel 2D matrix
            cv::Mat negMat;
            cv::exp(-matmulRes, negMat);
            cv::Mat sigmoidFlat = 1.0 / (1.0 + negMat);

            // Now reshape into multi-channel and split (one channel per object,
            // same order as objs).
            cv::Mat sigmoidMat = sigmoidFlat.reshape(static_cast<int>(objs.size()),
                { protoH, protoW });
            std::vector<cv::Mat> maskChannels;
            cv::split(sigmoidMat, maskChannels);

            // ROI in proto space, accounting for top-left letterbox padding
            // ANSRTYOLO pads right-bottom, so content starts at (0,0) in proto space
            cv::Rect roi;
            if (meta.imgHeight > meta.imgWidth) {
                int roiW = std::min(static_cast<int>(std::round(
                    static_cast<float>(protoW) * meta.imgWidth / meta.imgHeight)), protoW);
                roi = cv::Rect(0, 0, roiW, protoH);
            }
            else {
                int roiH = std::min(static_cast<int>(std::round(
                    static_cast<float>(protoH) * meta.imgHeight / meta.imgWidth)), protoH);
                roi = cv::Rect(0, 0, protoW, roiH);
            }
            roi &= cv::Rect(0, 0, protoW, protoH);

            int imgW = static_cast<int>(meta.imgWidth);
            int imgH = static_cast<int>(meta.imgHeight);

            // Scale factors from proto-ROI space to original image space.
            const float scaleX = static_cast<float>(imgW) / roi.width;
            const float scaleY = static_cast<float>(imgH) / roi.height;

            for (size_t i = 0; i < objs.size(); ++i) {
                cv::Rect safebox = objs[i].box & cv::Rect(0, 0, imgW, imgH);
                if (safebox.area() <= 0) continue;

                // Map the box into proto-ROI space (floor/ceil expand outward).
                int px0 = std::max(static_cast<int>(std::floor(safebox.x / scaleX)), 0);
                int py0 = std::max(static_cast<int>(std::floor(safebox.y / scaleY)), 0);
                int px1 = std::min(static_cast<int>(std::ceil((safebox.x + safebox.width) / scaleX)), roi.width);
                int py1 = std::min(static_cast<int>(std::ceil((safebox.y + safebox.height) / scaleY)), roi.height);
                if (px1 <= px0 || py1 <= py0) continue;

                cv::Rect protoBox(roi.x + px0, roi.y + py0, px1 - px0, py1 - py0);
                protoBox &= cv::Rect(0, 0, protoW, protoH);
                if (protoBox.area() <= 0) continue;

                // Resize only the small proto crop to the bounding-box size.
                cv::Mat cropped = maskChannels[i](protoBox);
                cv::Mat resized;
                cv::resize(cropped, resized, cv::Size(safebox.width, safebox.height),
                    0, 0, cv::INTER_LINEAR);
                // NOTE(review): binarizes at SEGMENTATION_THRESHOLD while the
                // legacy path uses _modelConfig.modelConfThreshold — confirm
                // the two paths are meant to use different thresholds.
                objs[i].mask = resized > SEGMENTATION_THRESHOLD;
                objs[i].polygon = ANSUtilityHelper::MaskToNormalizedPolygon(
                    objs[i].mask, safebox, meta.imgWidth, meta.imgHeight);
            }
        }
        // Objects without a mask-derived polygon fall back to the box outline.
        for (auto& obj : objs) {
            if (obj.polygon.empty())
                obj.polygon = ANSUtilityHelper::RectToNormalizedPolygon(obj.box, meta.imgWidth, meta.imgHeight);
        }
        return objs;
    }
    catch (std::exception& e) {
        _logger.LogFatal("ANSRTYOLO::PostprocessSegE2E", e.what(), __FILE__, __LINE__);
        return {};
    }
}
|
|
|
|
|
|
|
|
|
|
|
|
// ====================================================================
|
|
|
|
|
|
// Pose — legacy postprocess
|
|
|
|
|
|
// ====================================================================
|
|
|
|
|
|
// Decode a legacy (non-end2end) YOLO pose head.
// featureVector is a [numChannels x numAnchors] tensor where each anchor
// column is (x, y, w, h, score, kp0x, kp0y, kp0s, ...), i.e. a single-class
// pose model: one objectness/person score at offset 4, keypoints from offset 5.
// Runs class-batched NMS, then emits Objects with pixel-space boxes and
// keypoint triplets (x, y, score) in obj.kps.
std::vector<Object> ANSRTYOLO::PostprocessPose(
    std::vector<float>& featureVector,
    const std::string& camera_id, const ImageMetadata& meta) {
    try {
        const auto& outputDims = m_trtEngine->getOutputDims();
        auto numChannels = outputDims[0].d[1];
        auto numAnchors = outputDims[0].d[2];

        std::vector<cv::Rect> bboxes;
        std::vector<float> scores;
        std::vector<int> labels;
        std::vector<int> indices;
        std::vector<std::vector<float>> kpss;

        // Transpose so each row is one anchor (channel-major -> anchor-major).
        cv::Mat output = cv::Mat(numChannels, numAnchors, CV_32F, featureVector.data()).t();

        for (int i = 0; i < numAnchors; i++) {
            auto rowPtr = output.row(i).ptr<float>();
            auto bboxesPtr = rowPtr;
            auto scoresPtr = rowPtr + 4;
            // NOTE(review): keypoints assumed to start right after the single
            // score (offset 5) — only valid for one-class pose heads; confirm
            // before using with multi-class pose models.
            auto kps_ptr = rowPtr + 5;
            float score = *scoresPtr;
            if (score > _modelConfig.detectionScoreThreshold) {
                float x = *bboxesPtr++;
                float y = *bboxesPtr++;
                float w = *bboxesPtr++;
                float h = *bboxesPtr;
                // Decode cxcywh -> clamped xyxy in original-image coordinates.
                float x0 = std::clamp((x - 0.5f * w) * meta.ratio, 0.f, meta.imgWidth);
                float y0 = std::clamp((y - 0.5f * h) * meta.ratio, 0.f, meta.imgHeight);
                float x1 = std::clamp((x + 0.5f * w) * meta.ratio, 0.f, meta.imgWidth);
                float y1 = std::clamp((y + 0.5f * h) * meta.ratio, 0.f, meta.imgHeight);
                cv::Rect_<float> bbox;
                bbox.x = x0; bbox.y = y0;
                bbox.width = x1 - x0; bbox.height = y1 - y0;
                bbox.x = std::max(0.f, bbox.x);
                bbox.y = std::max(0.f, bbox.y);
                bbox.width = std::min(meta.imgWidth - bbox.x, bbox.width);
                bbox.height = std::min(meta.imgHeight - bbox.y, bbox.height);
                // Collect NUM_KPS keypoints as (x, y, score) triplets; x/y are
                // rescaled and clamped, the score is passed through unchanged.
                std::vector<float> kps;
                for (int k = 0; k < NUM_KPS; k++) {
                    float kpsX = std::clamp(*(kps_ptr + 3 * k) * meta.ratio, 0.f, meta.imgWidth);
                    float kpsY = std::clamp(*(kps_ptr + 3 * k + 1) * meta.ratio, 0.f, meta.imgHeight);
                    float kpsS = *(kps_ptr + 3 * k + 2);
                    kps.push_back(kpsX);
                    kps.push_back(kpsY);
                    kps.push_back(kpsS);
                }
                bboxes.push_back(bbox);
                // Single-class head: every candidate gets label 0.
                labels.push_back(0);
                scores.push_back(score);
                kpss.push_back(kps);
            }
        }

        cv::dnn::NMSBoxesBatched(bboxes, scores, labels, PROBABILITY_THRESHOLD, NMS_THRESHOLD, indices);
        std::vector<Object> objects;
        int classNameSize = static_cast<int>(_classes.size());
        for (auto& chosenIdx : indices) {
            // Always true after the pre-NMS filter; kept as a guard.
            if (scores[chosenIdx] > _modelConfig.detectionScoreThreshold) {
                Object obj{};
                obj.confidence = scores[chosenIdx];
                obj.classId = labels[chosenIdx];
                if (!_classes.empty()) {
                    // Out-of-range ids fall back to the last known class name.
                    obj.className = (obj.classId < classNameSize)
                        ? _classes[obj.classId] : _classes[classNameSize - 1];
                }
                else { obj.className = "Unknown"; }
                obj.box = bboxes[chosenIdx];
                //obj.polygon = ANSUtilityHelper::RectToNormalizedPolygon(obj.box, meta.imgWidth, meta.imgHeight);
                obj.kps = kpss[chosenIdx];
                obj.cameraId = camera_id;
                objects.push_back(obj);
            }
        }
        return objects;
    }
    catch (std::exception& e) {
        _logger.LogFatal("ANSRTYOLO::PostprocessPose", e.what(), __FILE__, __LINE__);
        return {};
    }
}
|
|
|
|
|
|
|
|
|
|
|
|
// ====================================================================
|
|
|
|
|
|
// Pose — end2end postprocess
|
|
|
|
|
|
// ====================================================================
|
|
|
|
|
|
std::vector<Object> ANSRTYOLO::PostprocessPoseE2E(
|
|
|
|
|
|
std::vector<float>& featureVector,
|
|
|
|
|
|
const std::string& camera_id, const ImageMetadata& meta) {
|
|
|
|
|
|
try {
|
|
|
|
|
|
const auto& outputDims = m_trtEngine->getOutputDims();
|
|
|
|
|
|
int numDets = outputDims[0].d[1];
|
|
|
|
|
|
int numFeat = outputDims[0].d[2]; // 6 + nk*3
|
|
|
|
|
|
int nk = (numFeat - 6) / 3;
|
|
|
|
|
|
|
|
|
|
|
|
std::vector<Object> results;
|
|
|
|
|
|
results.reserve(numDets);
|
|
|
|
|
|
|
|
|
|
|
|
for (int i = 0; i < numDets; ++i) {
|
|
|
|
|
|
const float* det = featureVector.data() + i * numFeat;
|
|
|
|
|
|
float conf = det[4];
|
|
|
|
|
|
if (conf <= _modelConfig.detectionScoreThreshold) continue;
|
|
|
|
|
|
|
|
|
|
|
|
int classId = static_cast<int>(det[5]);
|
|
|
|
|
|
float x0 = std::clamp(det[0] * meta.ratio, 0.f, meta.imgWidth);
|
|
|
|
|
|
float y0 = std::clamp(det[1] * meta.ratio, 0.f, meta.imgHeight);
|
|
|
|
|
|
float x1 = std::clamp(det[2] * meta.ratio, 0.f, meta.imgWidth);
|
|
|
|
|
|
float y1 = std::clamp(det[3] * meta.ratio, 0.f, meta.imgHeight);
|
|
|
|
|
|
float w = x1 - x0, h = y1 - y0;
|
|
|
|
|
|
if (w < 1.f || h < 1.f) continue;
|
|
|
|
|
|
|
|
|
|
|
|
const float* kpsPtr = det + 6;
|
|
|
|
|
|
std::vector<float> kps;
|
|
|
|
|
|
kps.reserve(nk * 3);
|
|
|
|
|
|
for (int k = 0; k < nk; ++k) {
|
|
|
|
|
|
float kx = std::clamp(kpsPtr[3*k] * meta.ratio, 0.f, meta.imgWidth);
|
|
|
|
|
|
float ky = std::clamp(kpsPtr[3*k+1] * meta.ratio, 0.f, meta.imgHeight);
|
|
|
|
|
|
float ks = kpsPtr[3*k+2];
|
|
|
|
|
|
kps.push_back(kx);
|
|
|
|
|
|
kps.push_back(ky);
|
|
|
|
|
|
kps.push_back(ks);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
Object obj;
|
|
|
|
|
|
obj.classId = classId;
|
|
|
|
|
|
obj.confidence = conf;
|
|
|
|
|
|
obj.box = cv::Rect(static_cast<int>(x0), static_cast<int>(y0),
|
|
|
|
|
|
static_cast<int>(w), static_cast<int>(h));
|
|
|
|
|
|
obj.kps = std::move(kps);
|
|
|
|
|
|
int classNameSize = static_cast<int>(_classes.size());
|
|
|
|
|
|
if (!_classes.empty() && classId >= 0 && classId < classNameSize)
|
|
|
|
|
|
obj.className = _classes[classId];
|
|
|
|
|
|
obj.cameraId = camera_id;
|
|
|
|
|
|
results.push_back(std::move(obj));
|
|
|
|
|
|
}
|
|
|
|
|
|
return results;
|
|
|
|
|
|
}
|
|
|
|
|
|
catch (std::exception& e) {
|
|
|
|
|
|
_logger.LogFatal("ANSRTYOLO::PostprocessPoseE2E", e.what(), __FILE__, __LINE__);
|
|
|
|
|
|
return {};
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// ====================================================================
|
|
|
|
|
|
// Classification postprocess
|
|
|
|
|
|
// ====================================================================
|
|
|
|
|
|
std::vector<Object> ANSRTYOLO::PostprocessClassify(
|
|
|
|
|
|
std::vector<float>& featureVector,
|
|
|
|
|
|
const std::string& camera_id, const ImageMetadata& meta) {
|
|
|
|
|
|
try {
|
|
|
|
|
|
const int nc = static_cast<int>(featureVector.size());
|
|
|
|
|
|
if (nc == 0) return {};
|
|
|
|
|
|
|
|
|
|
|
|
// Check if output is already a probability distribution (sums to ~1.0).
|
|
|
|
|
|
// Some models include a Softmax layer; applying softmax again would
|
|
|
|
|
|
// flatten the distribution and cause wrong classifications.
|
|
|
|
|
|
float rawSum = 0.f;
|
|
|
|
|
|
bool allNonNeg = true;
|
|
|
|
|
|
for (int i = 0; i < nc; ++i) {
|
|
|
|
|
|
rawSum += featureVector[i];
|
|
|
|
|
|
if (featureVector[i] < 0.f) allNonNeg = false;
|
|
|
|
|
|
}
|
|
|
|
|
|
const bool alreadyNormalized = (allNonNeg && rawSum > 0.9f && rawSum < 1.1f);
|
|
|
|
|
|
|
|
|
|
|
|
std::vector<float> probs(nc);
|
|
|
|
|
|
if (alreadyNormalized) {
|
|
|
|
|
|
for (int i = 0; i < nc; ++i) probs[i] = featureVector[i];
|
|
|
|
|
|
} else {
|
|
|
|
|
|
float maxVal = *std::max_element(featureVector.begin(), featureVector.end());
|
|
|
|
|
|
float sumExp = 0.f;
|
|
|
|
|
|
for (int i = 0; i < nc; ++i) {
|
|
|
|
|
|
probs[i] = std::exp(featureVector[i] - maxVal);
|
|
|
|
|
|
sumExp += probs[i];
|
|
|
|
|
|
}
|
|
|
|
|
|
for (int i = 0; i < nc; ++i) probs[i] /= sumExp;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
int bestClass = 0;
|
|
|
|
|
|
float bestProb = 0.f;
|
|
|
|
|
|
for (int i = 0; i < nc; ++i) {
|
|
|
|
|
|
if (probs[i] > bestProb) { bestProb = probs[i]; bestClass = i; }
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
const int imgW = static_cast<int>(meta.imgWidth);
|
|
|
|
|
|
const int imgH = static_cast<int>(meta.imgHeight);
|
|
|
|
|
|
|
|
|
|
|
|
Object obj;
|
|
|
|
|
|
if (imgW > 20 && imgH > 20) {
|
|
|
|
|
|
obj.box = cv::Rect(10, 10, imgW - 20, imgH - 20);
|
|
|
|
|
|
}
|
|
|
|
|
|
else {
|
|
|
|
|
|
obj.box = cv::Rect(0, 0, imgW, imgH);
|
|
|
|
|
|
}
|
|
|
|
|
|
//obj.polygon = ANSUtilityHelper::RectToNormalizedPolygon(obj.box, meta.imgWidth, meta.imgHeight);
|
|
|
|
|
|
obj.classId = bestClass;
|
|
|
|
|
|
obj.confidence = bestProb;
|
|
|
|
|
|
obj.cameraId = camera_id;
|
|
|
|
|
|
int classNameSize = static_cast<int>(_classes.size());
|
|
|
|
|
|
if (!_classes.empty() && bestClass >= 0 && bestClass < classNameSize)
|
|
|
|
|
|
obj.className = _classes[bestClass];
|
|
|
|
|
|
return { std::move(obj) };
|
|
|
|
|
|
}
|
|
|
|
|
|
catch (std::exception& e) {
|
|
|
|
|
|
_logger.LogFatal("ANSRTYOLO::PostprocessClassify", e.what(), __FILE__, __LINE__);
|
|
|
|
|
|
return {};
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// ====================================================================
|
|
|
|
|
|
// DetectObjects — single image with auto-detection of task type
|
|
|
|
|
|
// ====================================================================
|
|
|
|
|
|
// DetectObjects: full single-image pipeline — GPU context setup, preprocess
// (NV12 fast path, full-res BGR registry path, or plain BGR fallback),
// TensorRT inference, task-type routing by output tensor shape, coordinate
// remapping for the full-res path, then optional tracking + stabilization.
// Thread-safety: preprocess and output-transform phases hold _mutex; the
// inference call itself runs outside the lock for concurrent GPU slots.
std::vector<Object> ANSRTYOLO::DetectObjects(const cv::Mat& inputImage,
    const std::string& camera_id) {
    try {
        // --- Debug timer helper (zero-cost when _debugFlag == false) ---
        using Clock = std::chrono::steady_clock;
        const bool dbg = _debugFlag;
        auto t0 = dbg ? Clock::now() : Clock::time_point{};
        auto tPrev = t0;
        // elapsed(): ms since the previous call; mutates tPrev (stopwatch-lap style).
        auto elapsed = [&]() -> double {
            auto now = Clock::now();
            double ms = std::chrono::duration<double, std::milli>(now - tPrev).count();
            tPrev = now;
            return ms;
        };

        // --- 1. Set GPU device context ---
        if (m_trtEngine) {
            m_trtEngine->setDeviceContext();
        }
        double msSetDevice = dbg ? elapsed() : 0;

        // --- 1b. CUDA context health check ---
        if (!m_nv12Helper.isCudaContextHealthy(_logger, "ANSRTYOLO")) {
            return {};
        }

        // --- 2. Preprocess under lock ---
        ANS_DBG("YOLO", "Preprocess START %dx%d", inputImage.cols, inputImage.rows);
        ImageMetadata meta;
        std::vector<std::vector<cv::cuda::GpuMat>> input;
        bool usedNV12 = false;
        // Scale factors for remapping full-res coords back to display-res (step 4b).
        float bgrFullResScaleX = 1.0f, bgrFullResScaleY = 1.0f;
        {
            std::lock_guard<std::recursive_mutex> lock(_mutex);
            const int inferenceGpu = m_trtEngine ? m_trtEngine->getPreferredDeviceIndex() : 0;
            const auto& inputDims = m_trtEngine->getInputDims();
            const int inputW = inputDims[0].d[2];
            const int inputH = inputDims[0].d[1];

            // Preferred path: consume the decoder's NV12 frame directly on GPU.
            auto nv12 = m_nv12Helper.tryNV12(inputImage, inferenceGpu, inputW, inputH,
                NV12PreprocessHelper::defaultYOLOLauncher(),
                _logger, "ANSRTYOLO");
            if (nv12.succeeded) {
                meta.imgWidth = nv12.metaWidth;
                meta.imgHeight = nv12.metaHeight;
                meta.ratio = nv12.ratio;
                input = {{ std::move(nv12.gpuRGB) }};
                usedNV12 = true;
            }
            else if (nv12.useBgrFullRes) {
                // Full-res BGR frame from the registry: preprocess it instead
                // of the caller's (display-res) image; remember the remap scale.
                input = Preprocess(nv12.bgrFullResImg, meta);
                usedNV12 = !input.empty();
                bgrFullResScaleX = nv12.bgrFullResScaleX;
                bgrFullResScaleY = nv12.bgrFullResScaleY;
            }

            // Last resort: preprocess the caller-supplied image.
            if (input.empty()) {
                input = Preprocess(inputImage, meta);
            }
            m_nv12Helper.tickInference();
        }
        double msPreprocess = dbg ? elapsed() : 0;

        if (input.empty()) {
            _logger.LogWarn("ANSRTYOLO::DetectObjects", "Skipped: preprocessing returned empty input", __FILE__, __LINE__);
            return {};
        }

        // --- 3. TRT Inference (mutex released for concurrent GPU slots) ---
        ANS_DBG("YOLO", "TRT inference START nv12=%d inputSize=%dx%d",
            (int)usedNV12,
            input.empty() ? 0 : (input[0].empty() ? 0 : input[0][0].cols),
            input.empty() ? 0 : (input[0].empty() ? 0 : input[0][0].rows));
        auto _trtStart = std::chrono::steady_clock::now();
        std::vector<std::vector<std::vector<float>>> featureVectors;
        if (!m_trtEngine->runInference(input, featureVectors)) {
            ANS_DBG("YOLO", "ERROR: TRT runInference FAILED");
            _logger.LogError("ANSRTYOLO::DetectObjects", "Error running inference", __FILE__, __LINE__);
            return {};
        }
        auto _trtEnd = std::chrono::steady_clock::now();
        double _trtMs = std::chrono::duration<double, std::milli>(_trtEnd - _trtStart).count();
        // Flag pathologically slow inferences unconditionally (not gated on dbg).
        if (_trtMs > 500.0) {
            ANS_DBG("YOLO", "SLOW TRT inference: %.1fms", _trtMs);
        }
        double msInference = dbg ? elapsed() : 0;

        // --- 4. Transform output ---
        // Route to the right postprocessor purely from the output tensor shapes.
        std::vector<Object> results;
        bool isClassification = false;
        {
            std::lock_guard<std::recursive_mutex> lock(_mutex);
            const auto& outputDims = m_trtEngine->getOutputDims();
            const size_t numOutputs = outputDims.size();

            // Two or more outputs => segmentation (detections + mask protos).
            if (numOutputs >= 2) {
                std::vector<std::vector<float>> featureVector2d;
                Engine<float>::transformOutput(featureVectors, featureVector2d);
                double msTransform = dbg ? elapsed() : 0;

                // Heuristic: end2end seg heads are [numDets x smallFeat]
                // (rows >= cols or few columns); legacy heads are
                // [channels x manyAnchors]. TODO confirm for exotic exports.
                int dim1 = outputDims[0].d[1];
                int dim2 = outputDims[0].d[2];
                if (dim1 > dim2 || dim2 <= 20)
                    results = PostprocessSegE2E(featureVector2d, camera_id, meta);
                else
                    results = PostprocessSegmentation(featureVector2d, camera_id, meta);

                if (dbg) {
                    double msPostprocess = elapsed();
                    _logger.LogInfo("ANSRTYOLO::DetectObjects",
                        "[DEBUG] Seg | " + std::string(usedNV12 ? "NV12" : "BGR") +
                        " | SetDev=" + std::to_string(msSetDevice) +
                        "ms Preproc=" + std::to_string(msPreprocess) +
                        "ms Inf=" + std::to_string(msInference) +
                        "ms Transform=" + std::to_string(msTransform) +
                        "ms Postproc=" + std::to_string(msPostprocess) +
                        "ms Det=" + std::to_string(results.size()),
                        __FILE__, __LINE__);
                }
            }
            else {
                // Single output: classify / detect / OBB / pose.
                std::vector<float> featureVector;
                Engine<float>::transformOutput(featureVectors, featureVector);
                double msTransform = dbg ? elapsed() : 0;

                // 1-D/2-D output => classification logits/probabilities.
                if (outputDims[0].nbDims <= 2) {
                    results = PostprocessClassify(featureVector, camera_id, meta);
                    isClassification = true;
                }
                else {
                    int dim1 = outputDims[0].d[1];
                    int dim2 = outputDims[0].d[2];
                    int nc = static_cast<int>(_classes.size());

                    // Same E2E-vs-legacy shape heuristic as the seg branch.
                    const bool isEndToEnd = (dim1 > dim2) || (dim2 <= 20);
                    if (isEndToEnd) {
                        // Per-row width identifies the head: 6 = plain detect,
                        // 7 = OBB (extra angle), 6 + 3k = pose keypoints.
                        if (dim2 == 6)
                            results = PostprocessDetectionE2E(featureVector, camera_id, meta);
                        else if (dim2 == 7)
                            results = PostprocessOBBE2E(featureVector, camera_id, meta);
                        else if (dim2 > 7 && (dim2 - 6) % 3 == 0)
                            results = PostprocessPoseE2E(featureVector, camera_id, meta);
                        else
                            results = PostprocessDetectionE2E(featureVector, camera_id, meta);
                    }
                    else {
                        // Legacy head: channels beyond the 4 box values carry
                        // class scores (+ optional angle / keypoints).
                        int extra = dim1 - 4;
                        bool routed = false;
                        if (nc > 0 && nc <= extra) {
                            if (extra == nc) {
                                results = PostprocessDetection(featureVector, camera_id, meta);
                                routed = true;
                            }
                            else if (extra == nc + 1) {
                                // One extra channel beyond the classes => OBB angle.
                                results = PostprocessOBB(featureVector, camera_id, meta);
                                routed = true;
                            }
                            else if ((extra - nc) % 3 == 0 && (extra - nc) >= 3) {
                                // Leftover channels in (x, y, score) triplets => pose.
                                results = PostprocessPose(featureVector, camera_id, meta);
                                routed = true;
                            }
                        }

                        if (!routed) {
                            // Class list unavailable or inconsistent: probe the
                            // data itself. If the last channel looks like radians
                            // (within ~[-pi, pi]) for most sampled anchors,
                            // presume an OBB head. NOTE(review): heuristic —
                            // could misfire on heads whose last class score is
                            // always small; confirm with the deployed models.
                            if (extra >= 2) {
                                cv::Mat probe = cv::Mat(dim1, dim2, CV_32F, featureVector.data()).t();
                                int lastCol = dim1 - 1;
                                int numSamples = std::min(dim2, 100);
                                int angleCount = 0;
                                for (int s = 0; s < numSamples; ++s) {
                                    float v = probe.at<float>(s, lastCol);
                                    if (v >= -3.15f && v <= 3.15f) ++angleCount;
                                }
                                if (angleCount > numSamples * 8 / 10) {
                                    results = PostprocessOBB(featureVector, camera_id, meta);
                                    routed = true;
                                }
                            }
                            // 56 channels = 4 box + 1 score + 17 kps * 3 (COCO pose).
                            if (!routed && dim1 == 56)
                                results = PostprocessPose(featureVector, camera_id, meta);
                            else if (!routed)
                                results = PostprocessDetection(featureVector, camera_id, meta);
                        }
                    }
                }

                if (dbg) {
                    double msPostprocess = elapsed();
                    _logger.LogInfo("ANSRTYOLO::DetectObjects",
                        "[DEBUG] " + camera_id +
                        " | " + std::string(usedNV12 ? "NV12" : "BGR") +
                        " | SetDev=" + std::to_string(msSetDevice) +
                        "ms Preproc=" + std::to_string(msPreprocess) +
                        "ms Inf=" + std::to_string(msInference) +
                        "ms Transform=" + std::to_string(msTransform) +
                        "ms Postproc=" + std::to_string(msPostprocess) +
                        "ms Det=" + std::to_string(results.size()) +
                        (isClassification ? " [classify]" : " [detect]"),
                        __FILE__, __LINE__);
                }
            }
        }

        // --- 4b. Rescale coords from full-res to display-res (BGR full-res path) ---
        // When ANSVideoPlayer provides full-res BGR via the registry, Preprocess
        // and Postprocess operate in full-res coordinates. But the caller passed
        // a display-res inputImage and expects coords in that space. Remap here.
        if (bgrFullResScaleX != 1.0f || bgrFullResScaleY != 1.0f) {
            for (auto& obj : results) {
                obj.box.x = static_cast<int>(obj.box.x * bgrFullResScaleX);
                obj.box.y = static_cast<int>(obj.box.y * bgrFullResScaleY);
                obj.box.width = static_cast<int>(obj.box.width * bgrFullResScaleX);
                obj.box.height = static_cast<int>(obj.box.height * bgrFullResScaleY);
                // Rescale polygon points if present (segmentation / OBB)
                for (auto& pt : obj.polygon) {
                    pt.x *= bgrFullResScaleX;
                    pt.y *= bgrFullResScaleY;
                }
                // Rescale keypoints if present (pose: x,y,conf triplets)
                for (size_t k = 0; k + 2 < obj.kps.size(); k += 3) {
                    obj.kps[k] *= bgrFullResScaleX;
                    obj.kps[k + 1] *= bgrFullResScaleY;
                }
            }
        }

        // --- 5. Tracking + Stabilization ---
        // Skipped for classification results (no boxes to track).
        if (_trackerEnabled && !isClassification) {
            results = ApplyTracking(results, camera_id);
            double msTracking = dbg ? elapsed() : 0;
            if (_stabilizationEnabled) {
                results = StabilizeDetections(results, camera_id);
            }
            double msStabilize = dbg ? elapsed() : 0;
            if (dbg) {
                _logger.LogInfo("ANSRTYOLO::DetectObjects",
                    "[DEBUG] " + camera_id +
                    " | Tracking=" + std::to_string(msTracking) +
                    "ms Stabilize=" + std::to_string(msStabilize) + "ms",
                    __FILE__, __LINE__);
            }
        }

        // --- 6. Total pipeline time ---
        if (dbg) {
            double msTotal = std::chrono::duration<double, std::milli>(Clock::now() - t0).count();
            _logger.LogInfo("ANSRTYOLO::DetectObjects",
                "[DEBUG] " + camera_id + " | TOTAL=" + std::to_string(msTotal) +
                "ms (" + std::to_string(inputImage.cols) + "x" + std::to_string(inputImage.rows) +
                ") Results=" + std::to_string(results.size()),
                __FILE__, __LINE__);
        }

        return results;
    }
    catch (std::exception& e) {
        _logger.LogFatal("ANSRTYOLO::DetectObjects", e.what(), __FILE__, __LINE__);
        return {};
    }
}
|
|
|
|
|
|
|
|
|
|
|
|
// ====================================================================
|
|
|
|
|
|
// DetectObjectsBatch — batch inference with auto-detection
|
|
|
|
|
|
// ====================================================================
|
|
|
|
|
|
std::vector<std::vector<Object>> ANSRTYOLO::DetectObjectsBatch(
|
|
|
|
|
|
const std::vector<cv::Mat>& inputImages, const std::string& camera_id) {
|
|
|
|
|
|
if (inputImages.empty()) {
|
|
|
|
|
|
_logger.LogError("ANSRTYOLO::DetectObjectsBatch", "Empty input images vector", __FILE__, __LINE__);
|
|
|
|
|
|
return {};
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// Auto-split if batch exceeds engine capacity
|
|
|
|
|
|
const int maxBatch = m_options.maxBatchSize > 0 ? m_options.maxBatchSize : 1;
|
|
|
|
|
|
if (static_cast<int>(inputImages.size()) > maxBatch && maxBatch > 0) {
|
|
|
|
|
|
const size_t numImages = inputImages.size();
|
|
|
|
|
|
std::vector<std::vector<Object>> allResults;
|
|
|
|
|
|
allResults.reserve(numImages);
|
|
|
|
|
|
for (size_t start = 0; start < numImages; start += static_cast<size_t>(maxBatch)) {
|
|
|
|
|
|
const size_t end = std::min(start + static_cast<size_t>(maxBatch), numImages);
|
|
|
|
|
|
std::vector<cv::Mat> chunk(inputImages.begin() + start, inputImages.begin() + end);
|
|
|
|
|
|
auto chunkResults = DetectObjectsBatch(chunk, camera_id);
|
|
|
|
|
|
if (chunkResults.size() == chunk.size()) {
|
|
|
|
|
|
for (auto& r : chunkResults) allResults.push_back(std::move(r));
|
|
|
|
|
|
}
|
|
|
|
|
|
else {
|
|
|
|
|
|
_logger.LogError("ANSRTYOLO::DetectObjectsBatch",
|
|
|
|
|
|
"Chunk returned " + std::to_string(chunkResults.size()) +
|
|
|
|
|
|
" results, expected " + std::to_string(chunk.size()), __FILE__, __LINE__);
|
|
|
|
|
|
for (auto& r : chunkResults) allResults.push_back(std::move(r));
|
|
|
|
|
|
for (size_t pad = chunkResults.size(); pad < chunk.size(); ++pad)
|
|
|
|
|
|
allResults.push_back({});
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
return allResults;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
try {
|
|
|
|
|
|
// --- Debug timer helper ---
|
|
|
|
|
|
using Clock = std::chrono::steady_clock;
|
|
|
|
|
|
const bool dbg = _debugFlag;
|
|
|
|
|
|
auto t0 = dbg ? Clock::now() : Clock::time_point{};
|
|
|
|
|
|
auto tPrev = t0;
|
|
|
|
|
|
auto elapsed = [&]() -> double {
|
|
|
|
|
|
auto now = Clock::now();
|
|
|
|
|
|
double ms = std::chrono::duration<double, std::milli>(now - tPrev).count();
|
|
|
|
|
|
tPrev = now;
|
|
|
|
|
|
return ms;
|
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
// Ensure correct GPU context for preprocessing (multi-GPU safety)
|
|
|
|
|
|
if (m_trtEngine) {
|
|
|
|
|
|
m_trtEngine->setDeviceContext();
|
|
|
|
|
|
}
|
|
|
|
|
|
double msSetDevice = dbg ? elapsed() : 0;
|
|
|
|
|
|
|
|
|
|
|
|
// CUDA context health check (same as DetectObjects)
|
|
|
|
|
|
if (!m_nv12Helper.isCudaContextHealthy(_logger, "ANSRTYOLO")) {
|
|
|
|
|
|
return {};
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
const size_t realCount = inputImages.size();
|
|
|
|
|
|
|
|
|
|
|
|
// Pad batch to next power-of-2
|
|
|
|
|
|
size_t paddedCount = 1;
|
|
|
|
|
|
while (paddedCount < realCount) paddedCount *= 2;
|
|
|
|
|
|
paddedCount = std::min(paddedCount, static_cast<size_t>(maxBatch));
|
|
|
|
|
|
|
|
|
|
|
|
const std::vector<cv::Mat>* batchPtr = &inputImages;
|
|
|
|
|
|
std::vector<cv::Mat> paddedImages;
|
|
|
|
|
|
if (paddedCount > realCount) {
|
|
|
|
|
|
paddedImages.reserve(paddedCount);
|
|
|
|
|
|
paddedImages.insert(paddedImages.end(), inputImages.begin(), inputImages.end());
|
|
|
|
|
|
for (size_t p = realCount; p < paddedCount; ++p)
|
|
|
|
|
|
paddedImages.push_back(inputImages.back());
|
|
|
|
|
|
batchPtr = &paddedImages;
|
|
|
|
|
|
}
|
|
|
|
|
|
double msPad = dbg ? elapsed() : 0;
|
|
|
|
|
|
|
|
|
|
|
|
BatchMetadata metadata;
|
|
|
|
|
|
const auto inputs = PreprocessBatch(*batchPtr, metadata);
|
|
|
|
|
|
double msPreprocess = dbg ? elapsed() : 0;
|
|
|
|
|
|
if (inputs.empty() || inputs[0].empty()) {
|
|
|
|
|
|
_logger.LogWarn("ANSRTYOLO::DetectObjectsBatch", "Skipped: preprocessing failed", __FILE__, __LINE__);
|
|
|
|
|
|
return {};
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// Check for prior CUDA errors before inference.
|
|
|
|
|
|
cudaError_t priorErr = cudaGetLastError();
|
|
|
|
|
|
if (priorErr != cudaSuccess) {
|
|
|
|
|
|
_logger.LogWarn("ANSRTYOLO::DetectObjectsBatch",
|
|
|
|
|
|
std::string("Cleared prior CUDA error before inference: ")
|
|
|
|
|
|
+ cudaGetErrorString(priorErr),
|
|
|
|
|
|
__FILE__, __LINE__);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
std::vector<std::vector<std::vector<float>>> featureVectors;
|
|
|
|
|
|
auto succ = m_trtEngine->runInference(inputs, featureVectors);
|
|
|
|
|
|
if (!succ) {
|
|
|
|
|
|
cudaError_t postErr = cudaPeekAtLastError();
|
|
|
|
|
|
std::string detail = "runInference returned false, batchSize="
|
|
|
|
|
|
+ std::to_string(inputs[0].size());
|
|
|
|
|
|
if (postErr != cudaSuccess) {
|
|
|
|
|
|
detail += ", CUDA error: ";
|
|
|
|
|
|
detail += cudaGetErrorString(postErr);
|
|
|
|
|
|
}
|
|
|
|
|
|
_logger.LogError("ANSRTYOLO::DetectObjectsBatch", detail, __FILE__, __LINE__);
|
|
|
|
|
|
return {};
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
double msInference = dbg ? elapsed() : 0;
|
|
|
|
|
|
|
|
|
|
|
|
if (featureVectors.size() != paddedCount) {
|
|
|
|
|
|
_logger.LogError("ANSRTYOLO::DetectObjectsBatch", "Output batch size mismatch", __FILE__, __LINE__);
|
|
|
|
|
|
return {};
|
|
|
|
|
|
}
|
|
|
|
|
|
featureVectors.resize(realCount);
|
|
|
|
|
|
|
|
|
|
|
|
const auto& outputDims = m_trtEngine->getOutputDims();
|
|
|
|
|
|
const size_t numOutputs = outputDims.size();
|
|
|
|
|
|
const size_t numBatch = featureVectors.size();
|
|
|
|
|
|
|
|
|
|
|
|
// Determine task type once (same model for all images in batch)
|
|
|
|
|
|
int dim1 = outputDims[0].d[1];
|
|
|
|
|
|
int dim2 = outputDims[0].d[2];
|
|
|
|
|
|
int nc = static_cast<int>(_classes.size());
|
|
|
|
|
|
|
|
|
|
|
|
enum class TaskType { DetLegacy, DetE2E, OBBLegacy, OBBE2E,
|
|
|
|
|
|
SegLegacy, SegE2E, PoseLegacy, PoseE2E, Classify };
|
|
|
|
|
|
TaskType taskType = TaskType::DetLegacy; // default
|
|
|
|
|
|
|
|
|
|
|
|
// E2E: dim1 > dim2 (e.g. [B,300,6]); Legacy: dim1 < dim2 (e.g. [B,84,8400])
|
|
|
|
|
|
const bool isEndToEnd = (dim1 > dim2) || (dim2 <= 20);
|
|
|
|
|
|
|
|
|
|
|
|
if (numOutputs >= 2) {
|
|
|
|
|
|
taskType = isEndToEnd ? TaskType::SegE2E : TaskType::SegLegacy;
|
|
|
|
|
|
}
|
|
|
|
|
|
else if (outputDims[0].nbDims <= 2) {
|
|
|
|
|
|
taskType = TaskType::Classify;
|
|
|
|
|
|
}
|
|
|
|
|
|
else if (isEndToEnd) {
|
|
|
|
|
|
if (dim2 == 6) taskType = TaskType::DetE2E;
|
|
|
|
|
|
else if (dim2 == 7) taskType = TaskType::OBBE2E;
|
|
|
|
|
|
else if (dim2 > 7 && (dim2-6) % 3 == 0) taskType = TaskType::PoseE2E;
|
|
|
|
|
|
else taskType = TaskType::DetE2E;
|
|
|
|
|
|
}
|
|
|
|
|
|
else {
|
|
|
|
|
|
int extra = dim1 - 4;
|
|
|
|
|
|
bool routed = false;
|
|
|
|
|
|
|
|
|
|
|
|
// Try class-list-based routing first (only if class count fits within tensor)
|
|
|
|
|
|
if (nc > 0 && nc <= extra) {
|
|
|
|
|
|
if (extra == nc) { taskType = TaskType::DetLegacy; routed = true; }
|
|
|
|
|
|
else if (extra == nc + 1) { taskType = TaskType::OBBLegacy; routed = true; }
|
|
|
|
|
|
else if ((extra-nc) % 3 == 0 && (extra-nc) >= 3) { taskType = TaskType::PoseLegacy; routed = true; }
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// Fallback: probe last channel for angle values to detect OBB
|
|
|
|
|
|
if (!routed && extra >= 2 && !featureVectors.empty() && !featureVectors[0].empty() && !featureVectors[0][0].empty()) {
|
|
|
|
|
|
// Transpose first image's feature vector and probe last column
|
|
|
|
|
|
cv::Mat raw(dim1, dim2, CV_32F, const_cast<float*>(featureVectors[0][0].data()));
|
|
|
|
|
|
cv::Mat probe;
|
|
|
|
|
|
cv::transpose(raw, probe); // [dim2, dim1]
|
|
|
|
|
|
int lastCol = dim1 - 1;
|
|
|
|
|
|
int numSamples = std::min(dim2, 100);
|
|
|
|
|
|
int angleCount = 0;
|
|
|
|
|
|
for (int s = 0; s < numSamples; ++s) {
|
|
|
|
|
|
float v = probe.at<float>(s, lastCol);
|
|
|
|
|
|
if (v >= -3.15f && v <= 3.15f) ++angleCount;
|
|
|
|
|
|
}
|
|
|
|
|
|
if (angleCount > numSamples * 8 / 10) {
|
|
|
|
|
|
taskType = TaskType::OBBLegacy;
|
|
|
|
|
|
routed = true;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
if (!routed) {
|
|
|
|
|
|
if (dim1 == 56) taskType = TaskType::PoseLegacy;
|
|
|
|
|
|
else taskType = TaskType::DetLegacy;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// Process each image in parallel
|
|
|
|
|
|
std::vector<std::vector<Object>> batchDetections(numBatch);
|
|
|
|
|
|
std::vector<std::future<std::vector<Object>>> postFutures;
|
|
|
|
|
|
postFutures.reserve(numBatch);
|
|
|
|
|
|
|
|
|
|
|
|
for (size_t batchIdx = 0; batchIdx < numBatch; ++batchIdx) {
|
|
|
|
|
|
const auto& batchOutput = featureVectors[batchIdx];
|
|
|
|
|
|
|
|
|
|
|
|
ImageMetadata imgMeta;
|
|
|
|
|
|
imgMeta.ratio = metadata.ratios[batchIdx];
|
|
|
|
|
|
imgMeta.imgWidth = static_cast<float>(metadata.imgWidths[batchIdx]);
|
|
|
|
|
|
imgMeta.imgHeight = static_cast<float>(metadata.imgHeights[batchIdx]);
|
|
|
|
|
|
|
|
|
|
|
|
switch (taskType) {
|
|
|
|
|
|
case TaskType::SegLegacy:
|
|
|
|
|
|
case TaskType::SegE2E: {
|
|
|
|
|
|
std::vector<std::vector<float>> fv2d;
|
|
|
|
|
|
fv2d.reserve(batchOutput.size());
|
|
|
|
|
|
for (const auto& out : batchOutput) fv2d.push_back(out);
|
|
|
|
|
|
if (taskType == TaskType::SegE2E) {
|
|
|
|
|
|
postFutures.push_back(std::async(std::launch::async,
|
|
|
|
|
|
[this, fv = std::move(fv2d), cid = camera_id, m = imgMeta]() mutable {
|
|
|
|
|
|
return PostprocessSegE2E(fv, cid, m);
|
|
|
|
|
|
}));
|
|
|
|
|
|
}
|
|
|
|
|
|
else {
|
|
|
|
|
|
postFutures.push_back(std::async(std::launch::async,
|
|
|
|
|
|
[this, fv = std::move(fv2d), cid = camera_id, m = imgMeta]() mutable {
|
|
|
|
|
|
return PostprocessSegmentation(fv, cid, m);
|
|
|
|
|
|
}));
|
|
|
|
|
|
}
|
|
|
|
|
|
break;
|
|
|
|
|
|
}
|
|
|
|
|
|
case TaskType::Classify: {
|
|
|
|
|
|
std::vector<float> fv = batchOutput.empty() ? std::vector<float>{} : batchOutput[0];
|
|
|
|
|
|
postFutures.push_back(std::async(std::launch::async,
|
|
|
|
|
|
[this, fv = std::move(fv), cid = camera_id, m = imgMeta]() mutable {
|
|
|
|
|
|
return PostprocessClassify(fv, cid, m);
|
|
|
|
|
|
}));
|
|
|
|
|
|
break;
|
|
|
|
|
|
}
|
|
|
|
|
|
default: {
|
|
|
|
|
|
std::vector<float> fv = batchOutput.empty() ? std::vector<float>{} : batchOutput[0];
|
|
|
|
|
|
switch (taskType) {
|
|
|
|
|
|
case TaskType::DetLegacy:
|
|
|
|
|
|
postFutures.push_back(std::async(std::launch::async,
|
|
|
|
|
|
[this, fv = std::move(fv), cid = camera_id, m = imgMeta]() mutable {
|
|
|
|
|
|
return PostprocessDetection(fv, cid, m);
|
|
|
|
|
|
}));
|
|
|
|
|
|
break;
|
|
|
|
|
|
case TaskType::DetE2E:
|
|
|
|
|
|
postFutures.push_back(std::async(std::launch::async,
|
|
|
|
|
|
[this, fv = std::move(fv), cid = camera_id, m = imgMeta]() mutable {
|
|
|
|
|
|
return PostprocessDetectionE2E(fv, cid, m);
|
|
|
|
|
|
}));
|
|
|
|
|
|
break;
|
|
|
|
|
|
case TaskType::OBBLegacy:
|
|
|
|
|
|
postFutures.push_back(std::async(std::launch::async,
|
|
|
|
|
|
[this, fv = std::move(fv), cid = camera_id, m = imgMeta]() mutable {
|
|
|
|
|
|
return PostprocessOBB(fv, cid, m);
|
|
|
|
|
|
}));
|
|
|
|
|
|
break;
|
|
|
|
|
|
case TaskType::OBBE2E:
|
|
|
|
|
|
postFutures.push_back(std::async(std::launch::async,
|
|
|
|
|
|
[this, fv = std::move(fv), cid = camera_id, m = imgMeta]() mutable {
|
|
|
|
|
|
return PostprocessOBBE2E(fv, cid, m);
|
|
|
|
|
|
}));
|
|
|
|
|
|
break;
|
|
|
|
|
|
case TaskType::PoseLegacy:
|
|
|
|
|
|
postFutures.push_back(std::async(std::launch::async,
|
|
|
|
|
|
[this, fv = std::move(fv), cid = camera_id, m = imgMeta]() mutable {
|
|
|
|
|
|
return PostprocessPose(fv, cid, m);
|
|
|
|
|
|
}));
|
|
|
|
|
|
break;
|
|
|
|
|
|
case TaskType::PoseE2E:
|
|
|
|
|
|
postFutures.push_back(std::async(std::launch::async,
|
|
|
|
|
|
[this, fv = std::move(fv), cid = camera_id, m = imgMeta]() mutable {
|
|
|
|
|
|
return PostprocessPoseE2E(fv, cid, m);
|
|
|
|
|
|
}));
|
|
|
|
|
|
break;
|
|
|
|
|
|
default:
|
|
|
|
|
|
postFutures.push_back(std::async(std::launch::async,
|
|
|
|
|
|
[this, fv = std::move(fv), cid = camera_id, m = imgMeta]() mutable {
|
|
|
|
|
|
return PostprocessDetection(fv, cid, m);
|
|
|
|
|
|
}));
|
|
|
|
|
|
break;
|
|
|
|
|
|
}
|
|
|
|
|
|
break;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// Gather results
|
|
|
|
|
|
for (size_t i = 0; i < numBatch; ++i)
|
|
|
|
|
|
batchDetections[i] = postFutures[i].get();
|
|
|
|
|
|
|
|
|
|
|
|
// Apply tracker per frame (skip for classification models)
|
|
|
|
|
|
if (_trackerEnabled && taskType != TaskType::Classify) {
|
|
|
|
|
|
for (auto& results : batchDetections) {
|
|
|
|
|
|
if (!results.empty()) {
|
|
|
|
|
|
results = ApplyTracking(results, camera_id);
|
|
|
|
|
|
if (_stabilizationEnabled) {
|
|
|
|
|
|
results = StabilizeDetections(results, camera_id);
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
if (dbg) {
|
|
|
|
|
|
double msPostprocess = elapsed();
|
|
|
|
|
|
double msTotal = std::chrono::duration<double, std::milli>(Clock::now() - t0).count();
|
|
|
|
|
|
_logger.LogInfo("ANSRTYOLO::DetectObjectsBatch",
|
|
|
|
|
|
"[DEBUG] " + camera_id +
|
|
|
|
|
|
" batch=" + std::to_string(realCount) +
|
|
|
|
|
|
" | SetDev=" + std::to_string(msSetDevice) +
|
|
|
|
|
|
"ms Pad=" + std::to_string(msPad) +
|
|
|
|
|
|
"ms Preproc=" + std::to_string(msPreprocess) +
|
|
|
|
|
|
"ms Inf=" + std::to_string(msInference) +
|
|
|
|
|
|
"ms Postproc=" + std::to_string(msPostprocess) +
|
|
|
|
|
|
"ms TOTAL=" + std::to_string(msTotal) + "ms",
|
|
|
|
|
|
__FILE__, __LINE__);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
return batchDetections;
|
|
|
|
|
|
}
|
|
|
|
|
|
catch (const std::exception& e) {
|
|
|
|
|
|
_logger.LogFatal("ANSRTYOLO::DetectObjectsBatch", e.what(), __FILE__, __LINE__);
|
|
|
|
|
|
return {};
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
} // namespace ANSCENTER
|