// Files
// ANSCORE/modules/ANSODEngine/ANSTENSORRTOD.cpp
//
// 1304 lines
// 48 KiB
// C++

#include "ANSTENSORRTOD.h"
#include "Utility.h"
#include <future>
#include <opencv2/cudaimgproc.hpp>
namespace ANSCENTER
{
bool TENSORRTOD::OptimizeModel(bool fp16, std::string& optimizedModelFolder) {
std::lock_guard<std::recursive_mutex> lock(_mutex);
// Base-class optimization must succeed before any TensorRT-specific work.
if (!ANSODBase::OptimizeModel(fp16, optimizedModelFolder)) {
return false;
}
// A raw (ONNX) model file is required on disk to build an engine from.
if (!FileExist(_modelFilePath)) {
this->_logger.LogFatal("TENSORRTOD::OptimizeModel", "Raw model file path does not exist", __FILE__, __LINE__);
return false;
}
try {
_fp16 = fp16;
optimizedModelFolder = GetParentFolder(_modelFilePath);
// Construct the engine only once; later calls reuse the existing instance.
if (!m_trtEngine) {
m_options.deviceIndex = _modelConfig.gpuDeviceIndex;
m_options.optBatchSize = _modelConfig.gpuOptBatchSize;
m_options.maxBatchSize = _modelConfig.gpuMaxBatchSize;
m_options.minInputHeight = _modelConfig.minInputHeight;
m_options.optInputHeight = _modelConfig.optInputHeight;
m_options.maxInputHeight = _modelConfig.maxInputHeight;
m_options.minInputWidth = _modelConfig.minInputWidth;
m_options.optInputWidth = _modelConfig.optInputWidth;
m_options.maxInputWidth = _modelConfig.maxInputWidth;
m_options.engineFileDir = optimizedModelFolder;
m_options.precision = _fp16 ? Precision::FP16 : Precision::FP32;
m_trtEngine = std::make_unique<Engine<float>>(m_options);
}
// Build the serialized TensorRT engine from the raw model file.
const bool built = m_trtEngine->buildWithRetry(_modelFilePath, SUB_VALS, DIV_VALS, NORMALIZE);
if (!built) {
const std::string errMsg =
"Error: Unable to build the TensorRT engine. "
"Try increasing TensorRT log severity to kVERBOSE.";
this->_logger.LogError("TENSORRTOD::OptimizeModel", errMsg, __FILE__, __LINE__);
_modelLoadValid = false;
return false;
}
// The build step may shrink batch sizes to fit the GPU's VRAM tier; mirror that here.
m_options.maxBatchSize = m_trtEngine->getOptions().maxBatchSize;
m_options.optBatchSize = m_trtEngine->getOptions().optBatchSize;
_modelLoadValid = true;
return true;
}
catch (const std::exception& e) {
this->_logger.LogFatal("TENSORRTOD::OptimizeModel", e.what(), __FILE__, __LINE__);
optimizedModelFolder.clear();
return false;
}
}
bool TENSORRTOD::LoadModel(const std::string& modelZipFilePath, const std::string& modelZipPassword) {
std::lock_guard<std::recursive_mutex> lock(_mutex);
ModelLoadingGuard mlg(_modelLoading);
try {
_isFixedBatch = false;
bool result = ANSODBase::LoadModel(modelZipFilePath, modelZipPassword);
if (!result) return false;
// Force TensorRT object-detection semantics regardless of what the zip declared.
_modelConfig.detectionType = ANSCENTER::DetectionType::DETECTION;
_modelConfig.modelType = ModelType::TENSORRT;
_modelConfig.inpHeight = 640;
_modelConfig.inpWidth = 640;
// Clamp obviously-invalid thresholds to sane defaults.
if (_modelConfig.modelMNSThreshold < 0.2)
_modelConfig.modelMNSThreshold = 0.5;
if (_modelConfig.modelConfThreshold < 0.2)
_modelConfig.modelConfThreshold = 0.5;
if (_modelConfig.numKPS <= 0 || _modelConfig.numKPS > 133) // 133 = COCO wholebody max
_modelConfig.numKPS = 17;
// Fixed: use <= 0 (was == 0) so negative sentinel values are normalized too,
// matching LoadModelFromFolder.
if (_modelConfig.kpsThreshold <= 0)_modelConfig.kpsThreshold = 0.5; // If not defined
_fp16 = true;
// Cache post-processing constants derived from the model configuration.
TOP_K = 100;
SEG_CHANNELS = 32; // For segmentation
PROBABILITY_THRESHOLD = _modelConfig.detectionScoreThreshold;
NMS_THRESHOLD = _modelConfig.modelMNSThreshold;
SEGMENTATION_THRESHOLD = 0.5f;
SEG_H = 160;
SEG_W = 160;
NUM_KPS = _modelConfig.numKPS;
KPS_THRESHOLD = _modelConfig.kpsThreshold;
if (!m_trtEngine) {
// Engine options come from the GPU-specific model configuration
// (batch sizes and input ranges are NOT a fixed batch of 1).
m_options.optBatchSize = _modelConfig.gpuOptBatchSize;
m_options.maxBatchSize = _modelConfig.gpuMaxBatchSize;
m_options.deviceIndex = _modelConfig.gpuDeviceIndex;
m_options.maxInputHeight = _modelConfig.maxInputHeight;
m_options.minInputHeight = _modelConfig.minInputHeight;
m_options.optInputHeight = _modelConfig.optInputHeight;
m_options.maxInputWidth = _modelConfig.maxInputWidth;
m_options.minInputWidth = _modelConfig.minInputWidth;
m_options.optInputWidth = _modelConfig.optInputWidth;
m_options.engineFileDir = _modelFolder;
// Use FP16 or FP32 precision based on the input flag
m_options.precision = (_fp16 ? Precision::FP16 : Precision::FP32);
// Create the TensorRT inference engine
m_trtEngine = std::make_unique<Engine<float>>(m_options);
}
// 0. Check if the configuration file exists; it may override classes and input shape.
if (FileExist(_modelConfigFile)) {
ModelType modelType;
std::vector<int> inputShape;
_classes = ANSUtilityHelper::GetConfigFileContent(_modelConfigFile, modelType, inputShape);
if (inputShape.size() == 2) {
if (inputShape[0] > 0)_modelConfig.inpHeight = inputShape[0];
if (inputShape[1] > 0)_modelConfig.inpWidth = inputShape[1];
}
}
else {// This is an old version of the model zip file: fall back to fixed file names.
_modelFilePath = CreateFilePath(_modelFolder, "train_last.onnx");
_classFilePath = CreateFilePath(_modelFolder, "classes.names");
std::ifstream isValidFileName(_classFilePath);
if (!isValidFileName)
{
// Fixed: log tags previously claimed "Initialize" as the source function.
this->_logger.LogDebug("TENSORRTOD::LoadModel. Load classes from string", _classFilePath, __FILE__, __LINE__);
LoadClassesFromString();
}
else {
this->_logger.LogDebug("TENSORRTOD::LoadModel. Load classes from file", _classFilePath, __FILE__, __LINE__);
LoadClassesFromFile();
}
}
// Load the TensorRT engine file
if (this->_loadEngineOnCreation) {
auto succ = m_trtEngine->buildLoadNetwork(_modelFilePath, SUB_VALS, DIV_VALS, NORMALIZE, m_maxSlotsPerGpu);
if (!succ) {
const std::string errMsg = "Error: Unable to load TensorRT engine weights into memory. " + _modelFilePath;
this->_logger.LogError("TENSORRTOD::LoadModel", errMsg, __FILE__, __LINE__);
_modelLoadValid = false;
return false;
}
// Sync GPU-capped batch sizes from engine (build may reduce based on VRAM tier)
m_options.maxBatchSize = m_trtEngine->getOptions().maxBatchSize;
m_options.optBatchSize = m_trtEngine->getOptions().optBatchSize;
}
_modelLoadValid = true;
_isInitialized = true;
return true;
}
catch (const std::exception& e) {
this->_logger.LogFatal("TENSORRTOD::LoadModel", e.what(), __FILE__, __LINE__);
return false;
}
}
bool TENSORRTOD::LoadModelFromFolder(std::string licenseKey, ModelConfig modelConfig, std::string modelName, std::string className, const std::string& modelFolder, std::string& labelMap) {
std::lock_guard<std::recursive_mutex> lock(_mutex);
ModelLoadingGuard mlg(_modelLoading);
try
{
_isFixedBatch = false;
bool result = ANSODBase::LoadModelFromFolder(licenseKey, modelConfig, modelName, className, modelFolder, labelMap);
if (!result) return false;
// Force TensorRT object-detection semantics regardless of the incoming config.
_modelConfig = modelConfig;
_modelConfig.detectionType = ANSCENTER::DetectionType::DETECTION;
_modelConfig.modelType = ModelType::TENSORRT;
_modelConfig.inpHeight = 640;
_modelConfig.inpWidth = 640;
// Reported config precision; the engine build precision is driven by _fp16 below.
_modelConfig.precisionType = PrecisionType::FP32;
if (_modelConfig.numKPS <= 0 || _modelConfig.numKPS > 133) // 133 = COCO wholebody max
_modelConfig.numKPS = 17;
// Clamp obviously-invalid thresholds to sane defaults.
if (_modelConfig.modelMNSThreshold < 0.2)
_modelConfig.modelMNSThreshold = 0.5;
if (_modelConfig.modelConfThreshold < 0.2)
_modelConfig.modelConfThreshold = 0.5;
if (_modelConfig.kpsThreshold <= 0)_modelConfig.kpsThreshold = 0.5; // If not defined
_fp16 = true;
// Cache post-processing constants derived from the model configuration.
TOP_K = 100;
SEG_CHANNELS = 32; // For segmentation
PROBABILITY_THRESHOLD = _modelConfig.detectionScoreThreshold;
NMS_THRESHOLD = _modelConfig.modelMNSThreshold;
SEGMENTATION_THRESHOLD = 0.5f;
SEG_H = 160;
SEG_W = 160;
NUM_KPS = _modelConfig.numKPS;
KPS_THRESHOLD = _modelConfig.kpsThreshold;
// Default model file name when none is supplied.
std::string _modelName = modelName;
if (_modelName.empty()) {
_modelName = "train_last";
}
std::string modelFullName = _modelName + ".onnx";
if (!m_trtEngine) {
// Engine options come from the GPU-specific model configuration.
m_options.optBatchSize = _modelConfig.gpuOptBatchSize;
m_options.maxBatchSize = _modelConfig.gpuMaxBatchSize;
m_options.deviceIndex = _modelConfig.gpuDeviceIndex;
m_options.maxInputHeight = _modelConfig.maxInputHeight;
m_options.minInputHeight = _modelConfig.minInputHeight;
m_options.optInputHeight = _modelConfig.optInputHeight;
m_options.maxInputWidth = _modelConfig.maxInputWidth;
m_options.minInputWidth = _modelConfig.minInputWidth;
m_options.optInputWidth = _modelConfig.optInputWidth;
m_options.engineFileDir = _modelFolder;
// Use FP16 or FP32 precision based on the input flag
m_options.precision = (_fp16 ? Precision::FP16 : Precision::FP32);
// Create the TensorRT inference engine
m_trtEngine = std::make_unique<Engine<float>>(m_options);
}
// 0. Check if the configuration file exists; it may override classes and input shape.
if (FileExist(_modelConfigFile)) {
ModelType modelType;
std::vector<int> inputShape;
_classes = ANSUtilityHelper::GetConfigFileContent(_modelConfigFile, modelType, inputShape);
if (inputShape.size() == 2) {
if (inputShape[0] > 0)_modelConfig.inpHeight = inputShape[0];
if (inputShape[1] > 0)_modelConfig.inpWidth = inputShape[1];
}
}
else {// This is an old version of the model zip file: use the explicit file names.
_modelFilePath = CreateFilePath(_modelFolder, modelFullName);
_classFilePath = CreateFilePath(_modelFolder, className);
std::ifstream isValidFileName(_classFilePath);
if (!isValidFileName)
{
// Fixed: log tags previously claimed "Initialize" as the source function.
this->_logger.LogDebug("TENSORRTOD::LoadModelFromFolder. Load classes from string", _classFilePath, __FILE__, __LINE__);
LoadClassesFromString();
}
else {
this->_logger.LogDebug("TENSORRTOD::LoadModelFromFolder. Load classes from file", _classFilePath, __FILE__, __LINE__);
LoadClassesFromFile();
}
}
// 1. Load labelMap and engine
labelMap.clear();
if (!_classes.empty())
labelMap = VectorToCommaSeparatedString(_classes);
// Load the TensorRT engine file
if (this->_loadEngineOnCreation) {
auto succ = m_trtEngine->buildLoadNetwork(_modelFilePath, SUB_VALS, DIV_VALS, NORMALIZE, m_maxSlotsPerGpu);
if (!succ) {
const std::string errMsg = "Error: Unable to load TensorRT engine weights into memory. " + _modelFilePath;
this->_logger.LogError("TENSORRTOD::LoadModelFromFolder", errMsg, __FILE__, __LINE__);
_modelLoadValid = false;
return false;
}
// Sync GPU-capped batch sizes from engine (build may reduce based on VRAM tier)
m_options.maxBatchSize = m_trtEngine->getOptions().maxBatchSize;
m_options.optBatchSize = m_trtEngine->getOptions().optBatchSize;
}
_modelLoadValid = true;
_isInitialized = true;
return true;
}
catch (const std::exception& e) {
this->_logger.LogFatal("TENSORRTOD::LoadModelFromFolder", e.what(), __FILE__, __LINE__);
return false;
}
}
bool TENSORRTOD::Initialize(std::string licenseKey, ModelConfig modelConfig, const std::string& modelZipFilePath, const std::string& modelZipPassword, std::string& labelMap) {
std::lock_guard<std::recursive_mutex> lock(_mutex);
ModelLoadingGuard mlg(_modelLoading);
try {
const bool engineAlreadyLoaded = _modelLoadValid && _isInitialized && m_trtEngine != nullptr;
_modelLoadValid = false;
_isFixedBatch = false;
bool result = ANSODBase::Initialize(licenseKey, modelConfig, modelZipFilePath, modelZipPassword, labelMap);
if (!result) return false;
// Parsing for YOLO only here
_modelConfig = modelConfig;
_modelConfig.detectionType = ANSCENTER::DetectionType::DETECTION;
_modelConfig.modelType = ModelType::TENSORRT;
_modelConfig.inpHeight = 640;
_modelConfig.inpWidth = 640;
if (_modelConfig.numKPS <= 0 || _modelConfig.numKPS > 133) // 133 = COCO wholebody max
_modelConfig.numKPS = 17;
_modelConfig.precisionType = PrecisionType::FP32; // Default to FP16 for TensorRT models
if (_modelConfig.modelMNSThreshold < 0.2)
_modelConfig.modelMNSThreshold = 0.5;
if (_modelConfig.modelConfThreshold < 0.2)
_modelConfig.modelConfThreshold = 0.5;
if (_modelConfig.kpsThreshold == 0)_modelConfig.kpsThreshold = 0.5; // If not define
_fp16 = true; // Load Model from Here
// Load Model from Here
TOP_K = 100;
SEG_CHANNELS = 32;
PROBABILITY_THRESHOLD = _modelConfig.detectionScoreThreshold;
NMS_THRESHOLD = _modelConfig.modelMNSThreshold;
SEGMENTATION_THRESHOLD = 0.5f;
SEG_H = 160;
SEG_W = 160;
NUM_KPS = _modelConfig.numKPS;
KPS_THRESHOLD = _modelConfig.kpsThreshold;
SEG_CHANNELS = 32; // For segmentation
if (!m_trtEngine) {
// Fixed batch size of 1 for this model
m_options.optBatchSize = _modelConfig.gpuOptBatchSize;
m_options.maxBatchSize = _modelConfig.gpuMaxBatchSize;
m_options.deviceIndex = _modelConfig.gpuDeviceIndex;
m_options.maxInputHeight = _modelConfig.maxInputHeight;
m_options.minInputHeight = _modelConfig.minInputHeight;
m_options.optInputHeight = _modelConfig.optInputHeight;
m_options.maxInputWidth = _modelConfig.maxInputWidth;
m_options.minInputWidth = _modelConfig.minInputWidth;
m_options.optInputWidth = _modelConfig.optInputWidth;
m_options.engineFileDir = _modelFolder;
// Use FP16 or FP32 precision based on the input flag
m_options.precision = (_fp16 ? Precision::FP16 : Precision::FP32);
// Create the TensorRT inference engine
m_trtEngine = std::make_unique<Engine<float>>(m_options);
}
// 0. Check if the configuration file exist
if (FileExist(_modelConfigFile)) {
ModelType modelType;
std::vector<int> inputShape;
_classes = ANSUtilityHelper::GetConfigFileContent(_modelConfigFile, modelType, inputShape);
if (inputShape.size() == 2) {
if (inputShape[0] > 0)_modelConfig.inpHeight = inputShape[0];
if (inputShape[1] > 0)_modelConfig.inpWidth = inputShape[1];
}
}
else {// This is old version of model zip file
_modelFilePath = CreateFilePath(_modelFolder, "train_last.onnx");
_classFilePath = CreateFilePath(_modelFolder, "classes.names");
std::ifstream isValidFileName(_classFilePath);
if (!isValidFileName)
{
this->_logger.LogDebug("TENSORRTOD::Initialize. Load classes from string", _classFilePath, __FILE__, __LINE__);
LoadClassesFromString();
}
else {
this->_logger.LogDebug("TENSORRTOD::Initialize. Load classes from file", _classFilePath, __FILE__, __LINE__);
LoadClassesFromFile();
}
}
// 1. Load labelMap and engine
labelMap.clear();
if (!_classes.empty())
labelMap = VectorToCommaSeparatedString(_classes);
// Load the TensorRT engine file
if (this->_loadEngineOnCreation && !engineAlreadyLoaded) {
auto succ = m_trtEngine->buildLoadNetwork(_modelFilePath, SUB_VALS, DIV_VALS, NORMALIZE, m_maxSlotsPerGpu);
if (!succ) {
const std::string errMsg = "Error: Unable to load TensorRT engine weights into memory. " + _modelFilePath;
this->_logger.LogError("TENSORRTOD::Initialize", errMsg, __FILE__, __LINE__);
_modelLoadValid = false;
return false;
}
// Sync GPU-capped batch sizes from engine (build may reduce based on VRAM tier)
m_options.maxBatchSize = m_trtEngine->getOptions().maxBatchSize;
m_options.optBatchSize = m_trtEngine->getOptions().optBatchSize;
}
_modelLoadValid = true;
_isInitialized = true;
return true;
}
catch (std::exception& e) {
this->_logger.LogFatal("TENSORRTOD::Initialize", e.what(), __FILE__, __LINE__);
return false;
}
}
std::vector<Object> TENSORRTOD::RunInference(const cv::Mat& inputImgBGR) {
// Convenience overload: delegate to the camera-aware variant with an
// empty camera identifier.
const std::string noCamera{};
return RunInference(inputImgBGR, noCamera);
}
std::vector<Object> TENSORRTOD::RunInference(const cv::Mat& inputImgBGR,const std::string& camera_id)
{
// State validation takes the lock only briefly — the lock is NOT held across
// DetectObjects, so the Engine pool can run concurrent inferences on
// different GPU slots.
if (!PreInferenceCheck("TENSORRTOD::RunInference")) {
return {};
}
try {
return DetectObjects(inputImgBGR, camera_id);
}
catch (const std::exception& e) {
// Swallow and log: a bad frame must not take down the caller.
_logger.LogFatal("TENSORRTOD::RunInference", e.what(), __FILE__, __LINE__);
return {};
}
}
std::vector<std::vector<Object>> TENSORRTOD::RunInferencesBatch(const std::vector<cv::Mat>& inputs, const std::string& camera_id) {
// Validate state under a brief lock — do NOT hold across DetectObjectsBatch so that
// the Engine pool can serve concurrent batch requests on different GPU slots.
if (!PreInferenceCheck("TENSORRTOD::RunInferencesBatch")) return {};
try {
// Fixed-batch engines route through the base-class implementation;
// dynamic-batch engines use the native batch path.
if (_isFixedBatch) return ANSODBase::RunInferencesBatch(inputs, camera_id);
else return DetectObjectsBatch(inputs, camera_id);
}
catch (const std::exception& e) {
// Fixed: log tag previously read "RunInferenceBatch" (missing 's').
this->_logger.LogFatal("TENSORRTOD::RunInferencesBatch", e.what(), __FILE__, __LINE__);
return {};
}
}
TENSORRTOD::~TENSORRTOD() {
// Destructors must never throw: swallow and log any teardown failure.
try {
Destroy();
}
catch (const std::exception& e) {
this->_logger.LogError("TENSORRTOD::~TENSORRTOD()", e.what(), __FILE__, __LINE__);
}
}
// Releases the TensorRT engine. Returns true on success; logs and returns
// false if teardown throws.
bool TENSORRTOD::Destroy() {
try {
m_trtEngine.reset(); // Releases the current engine and sets m_trtEngine to nullptr.
return true;
}
catch (const std::exception& e) {
// Fixed: log tag previously pointed at the destructor instead of Destroy().
this->_logger.LogError("TENSORRTOD::Destroy", e.what(), __FILE__, __LINE__);
return false;
}
}
// private
std::vector<Object> TENSORRTOD::DetectObjects(const cv::Mat& inputImage, const std::string& camera_id) {
try {
// --- 0. Engine guard ---
// Fixed: the engine was null-checked for setDeviceContext but then
// dereferenced unconditionally below (getInputDims/runInference).
// Fail fast instead of risking a null dereference.
if (!m_trtEngine) {
this->_logger.LogError("TENSORRTOD::DetectObjects", "TensorRT engine is not initialized", __FILE__, __LINE__);
return {};
}
// --- 1. Set GPU device context ---
m_trtEngine->setDeviceContext();
// --- 1b. CUDA context health check ---
if (!m_nv12Helper.isCudaContextHealthy(_logger, "TENSORRTOD")) {
return {};
}
// --- 2. Preprocess under lock ---
// Try NV12 fast path first, falls back to standard GPU preprocessing.
ImageMetadata meta;
std::vector<std::vector<cv::cuda::GpuMat>> input;
float bgrFullResScaleX = 1.0f, bgrFullResScaleY = 1.0f;
{
std::lock_guard<std::recursive_mutex> lock(_mutex);
const int inferenceGpu = m_trtEngine->getPreferredDeviceIndex();
const auto& inputDims = m_trtEngine->getInputDims();
const int inputW = inputDims[0].d[2];
const int inputH = inputDims[0].d[1];
auto nv12 = m_nv12Helper.tryNV12(inputImage, inferenceGpu, inputW, inputH,
NV12PreprocessHelper::defaultYOLOLauncher(),
_logger, "TENSORRTOD");
if (nv12.succeeded) {
meta.imgWidth = nv12.metaWidth;
meta.imgHeight = nv12.metaHeight;
meta.ratio = nv12.ratio;
input = {{ std::move(nv12.gpuRGB) }};
}
else if (nv12.useBgrFullRes) {
// NV12 path supplied a full-resolution BGR frame; preprocess it and
// remember the scale back to display resolution for step 4b.
input = Preprocess(nv12.bgrFullResImg, meta);
bgrFullResScaleX = nv12.bgrFullResScaleX;
bgrFullResScaleY = nv12.bgrFullResScaleY;
}
if (input.empty()) {
input = Preprocess(inputImage, meta);
}
m_nv12Helper.tickInference();
}
if (input.empty()) {
this->_logger.LogWarn("TENSORRTOD::DetectObjects", "Skipped: preprocessing returned empty input", __FILE__, __LINE__);
return {};
}
// Phase 2: Inference — mutex released so the Engine pool can serve concurrent callers
// on different GPU slots simultaneously.
std::vector<std::vector<std::vector<float>>> featureVectors;
if (!m_trtEngine->runInference(input, featureVectors)) {
this->_logger.LogError("TENSORRTOD::DetectObjects", "Error running inference", __FILE__, __LINE__);
return {};
}
// Phase 3: Postprocess under lock (reads _classes and _modelConfig).
std::lock_guard<std::recursive_mutex> lock(_mutex);
std::vector<Object> ret;
const auto& numOutputs = m_trtEngine->getOutputDims().size();
if (numOutputs == 1) {
// Object detection or pose estimation
std::vector<float> featureVector;
Engine<float>::transformOutput(featureVectors, featureVector);
const auto& outputDims = m_trtEngine->getOutputDims();
int numChannels = outputDims[outputDims.size() - 1].d[1];
// 56 output channels are treated as the pose head (see PostProcessPose).
if (numChannels == 56) {
ret = PostProcessPose(featureVector, camera_id, meta);
}
else {
ret = Postprocess(featureVector, camera_id, meta);
}
}
else {
// Segmentation
std::vector<std::vector<float>> featureVector;
Engine<float>::transformOutput(featureVectors, featureVector);
ret = PostProcessSegmentation(featureVector, camera_id, meta);
}
// --- 4b. Rescale coords from full-res to display-res (BGR full-res path) ---
if (bgrFullResScaleX != 1.0f || bgrFullResScaleY != 1.0f) {
for (auto& obj : ret) {
obj.box.x = static_cast<int>(obj.box.x * bgrFullResScaleX);
obj.box.y = static_cast<int>(obj.box.y * bgrFullResScaleY);
obj.box.width = static_cast<int>(obj.box.width * bgrFullResScaleX);
obj.box.height = static_cast<int>(obj.box.height * bgrFullResScaleY);
for (auto& pt : obj.polygon) {
pt.x *= bgrFullResScaleX;
pt.y *= bgrFullResScaleY;
}
// kps are stored as (x, y, score) triples; scale x/y only.
for (size_t k = 0; k + 2 < obj.kps.size(); k += 3) {
obj.kps[k] *= bgrFullResScaleX;
obj.kps[k + 1] *= bgrFullResScaleY;
}
}
}
// Apply tracker and stabilization if enabled
if (_trackerEnabled) {
ret = ApplyTracking(ret, camera_id);
if (_stabilizationEnabled) ret = StabilizeDetections(ret, camera_id);
}
return ret;
}
catch (const std::exception& e) {
this->_logger.LogFatal("TENSORRTOD::DetectObjects", e.what(), __FILE__, __LINE__);
return {};
}
}
// Resizes/letterboxes the input on the CPU, converts BGR->RGB, uploads to the
// GPU, and fills outMeta with the ORIGINAL image size and the back-mapping
// ratio used by the post-processing stages. Returns {} on failure.
std::vector<std::vector<cv::cuda::GpuMat>> TENSORRTOD::Preprocess(const cv::Mat& inputImage, ImageMetadata& outMeta) {
std::lock_guard<std::recursive_mutex> lock(_mutex);
try {
if (!_licenseValid) {
this->_logger.LogFatal("TENSORRTOD::Preprocess", "Invalid license", __FILE__, __LINE__);
return {};
}
// Get model input dimensions
const auto& inputDims = m_trtEngine->getInputDims();
const int inputH = inputDims[0].d[1];
const int inputW = inputDims[0].d[2];
// --- CPU preprocessing: resize + BGR->RGB before GPU upload ---
cv::Mat srcImg = inputImage;
if (srcImg.channels() == 1) {
cv::cvtColor(srcImg, srcImg, cv::COLOR_GRAY2BGR);
}
// Set image size parameters from ORIGINAL image (before resize)
outMeta.imgHeight = static_cast<float>(srcImg.rows);
outMeta.imgWidth = static_cast<float>(srcImg.cols);
if (outMeta.imgHeight > 0 && outMeta.imgWidth > 0) {
// Scale factor used later to map detections back to original coordinates.
outMeta.ratio = 1.f / std::min(inputW / static_cast<float>(srcImg.cols),
inputH / static_cast<float>(srcImg.rows));
const auto& outputDims = m_trtEngine->getOutputDims();
// A rank-<=2 output indicates a classification head: plain resize is fine.
const bool isClassification = !outputDims.empty() && outputDims[0].nbDims <= 2;
// CPU resize to model input size
cv::Mat cpuResized;
if (srcImg.rows != inputH || srcImg.cols != inputW) {
if (isClassification) {
cv::resize(srcImg, cpuResized, cv::Size(inputW, inputH), 0, 0, cv::INTER_LINEAR);
} else {
cpuResized = Engine<float>::cpuResizeKeepAspectRatioPadRightBottom(srcImg, inputH, inputW);
}
} else {
cpuResized = srcImg;
}
// CPU BGR -> RGB
cv::Mat cpuRGB;
cv::cvtColor(cpuResized, cpuRGB, cv::COLOR_BGR2RGB);
// Upload small image to GPU
cv::cuda::Stream stream;
cv::cuda::GpuMat gpuResized;
gpuResized.upload(cpuRGB, stream);
stream.waitForCompletion();
// Convert to format expected by our inference engine
std::vector<cv::cuda::GpuMat> input{ std::move(gpuResized) };
std::vector<std::vector<cv::cuda::GpuMat>> inputs{ std::move(input) };
return inputs;
}
else {
// Fixed: log tag previously read "TENSORRTCL::Preprocess" (wrong class).
this->_logger.LogFatal("TENSORRTOD::Preprocess",
"Image height or width is zero after processing (Width: " + std::to_string(outMeta.imgWidth) +
", Height: " + std::to_string(outMeta.imgHeight) + ")",
__FILE__, __LINE__);
return {};
}
}
catch (const std::exception& e) {
this->_logger.LogWarn("TENSORRTOD::Preprocess", std::string("Skipped frame: ") + e.what(), __FILE__, __LINE__);
return {};
}
}
std::vector<Object> TENSORRTOD::PostProcessSegmentation(std::vector<std::vector<float>>& featureVectors, const std::string& camera_id, const ImageMetadata& meta) {
std::lock_guard<std::recursive_mutex> lock(_mutex);
try {
if (!_licenseValid) {
this->_logger.LogFatal("TENSORRTOD::PostProcessSegmentation", "Invalid license", __FILE__, __LINE__);
std::vector<Object> result;
result.clear();
return result;
}
const auto& outputDims = m_trtEngine->getOutputDims();
int numChannels = outputDims[0].d[1];
int numAnchors = outputDims[0].d[2];
const auto numClasses = numChannels - SEG_CHANNELS - 4;
// Ensure the output lengths are correct
if (featureVectors[0].size() != static_cast<size_t>(numChannels) * numAnchors) {
return {};
}
if (featureVectors[1].size() != static_cast<size_t>(SEG_CHANNELS) * SEG_H * SEG_W) {
return {};
}
cv::Mat output = cv::Mat(numChannels, numAnchors, CV_32F, featureVectors[0].data());
output = output.t();
cv::Mat protos = cv::Mat(SEG_CHANNELS, SEG_H * SEG_W, CV_32F, featureVectors[1].data());
std::vector<int> labels;
std::vector<float> scores;
std::vector<cv::Rect> bboxes;
std::vector<cv::Mat> maskConfs;
std::vector<int> indices;
// Object the bounding boxes and class labels
for (int i = 0; i < numAnchors; i++) {
auto rowPtr = output.row(i).ptr<float>();
auto bboxesPtr = rowPtr;
auto scoresPtr = rowPtr + 4;
auto maskConfsPtr = rowPtr + 4 + numClasses;
auto maxSPtr = std::max_element(scoresPtr, scoresPtr + numClasses);
float score = *maxSPtr;
if (score > this->_modelConfig.detectionScoreThreshold)
{
float x = *bboxesPtr++;
float y = *bboxesPtr++;
float w = *bboxesPtr++;
float h = *bboxesPtr;
float x0 = std::clamp((x - 0.5f * w) * meta.ratio, 0.f, meta.imgWidth);
float y0 = std::clamp((y - 0.5f * h) * meta.ratio, 0.f, meta.imgHeight);
float x1 = std::clamp((x + 0.5f * w) * meta.ratio, 0.f, meta.imgWidth);
float y1 = std::clamp((y + 0.5f * h) * meta.ratio, 0.f, meta.imgHeight);
int label = maxSPtr - scoresPtr;
cv::Rect_<float> bbox;
bbox.x = x0;
bbox.y = y0;
bbox.width = x1 - x0;
bbox.height = y1 - y0;
bbox.x = std::max(0.f, bbox.x);
bbox.y = std::max(0.f, bbox.y);
bbox.width = std::min(meta.imgWidth - bbox.x, bbox.width);
bbox.height = std::min(meta.imgHeight - bbox.y, bbox.height);
cv::Mat maskConf = cv::Mat(1, SEG_CHANNELS, CV_32F, maskConfsPtr);
bboxes.push_back(bbox);
labels.push_back(label);
scores.push_back(score);
maskConfs.push_back(maskConf);
}
}
cv::dnn::NMSBoxesBatched(bboxes, scores, labels, PROBABILITY_THRESHOLD, NMS_THRESHOLD, indices);
cv::Mat masks;
int classNameSize = static_cast<int>(_classes.size());
std::vector<Object> objs;
for (auto& i : indices) {
if (scores[i] > _modelConfig.detectionScoreThreshold) {
cv::Rect tmp = bboxes[i];
Object obj;
obj.classId = labels[i];
if (!_classes.empty()) {
if (obj.classId < classNameSize) {
obj.className = _classes[obj.classId];
}
else {
obj.className = _classes[classNameSize - 1]; // Use last valid class name if out of range
}
}
else {
obj.className = "Unknown"; // Fallback if _classes is empty
}
obj.box = tmp;
obj.confidence = scores[i];
obj.className = _classes[labels[i]];
masks.push_back(maskConfs[i]);
objs.push_back(obj);
}
}
if (!masks.empty()) {
cv::Mat matmulRes = (masks * protos).t();
cv::Mat maskMat = matmulRes.reshape(indices.size(), { _modelConfig.inpWidth, _modelConfig.inpHeight });
std::vector<cv::Mat> maskChannels;
cv::split(maskMat, maskChannels);
const auto inputDims = m_trtEngine->getInputDims();
cv::Rect roi;
if (meta.imgHeight > meta.imgWidth) {
roi = cv::Rect(0, 0, _modelConfig.inpWidth * meta.imgWidth / meta.imgHeight, _modelConfig.inpHeight);
}
else {
roi = cv::Rect(0, 0, _modelConfig.inpWidth, _modelConfig.inpHeight * meta.imgHeight / meta.imgWidth);
}
for (size_t i = 0; i < indices.size(); i++)
{
cv::Mat dest, mask;
cv::exp(-maskChannels[i], dest);
dest = 1.0 / (1.0 + dest);
dest = dest(roi);
objs[i].cameraId = camera_id;
cv::resize(
dest,
mask,
cv::Size(static_cast<int>(meta.imgWidth), static_cast<int>(meta.imgHeight)),
cv::INTER_LINEAR
);
objs[i].mask = mask(objs[i].box) > _modelConfig.modelConfThreshold;// Need to check segmentation
}
}
//EnqueueDetection(objs, camera_id);
return objs;
}
catch (std::exception& e) {
this->_logger.LogFatal("TENSORRTOD::PostProcessSegmentation", e.what(), __FILE__, __LINE__);
std::vector<Object>result;
result.clear();
return result;
}
}
// Decodes a single-class pose head into Objects with keypoints.
// featureVector: flat [numChannels x numAnchors] engine output.
// meta: original image size plus the back-mapping ratio set during preprocessing.
// Returns NMS-filtered detections; every detection is labeled class 0 (person).
std::vector<Object> TENSORRTOD::PostProcessPose(std::vector<float>& featureVector, const std::string& camera_id, const ImageMetadata& meta) {
std::lock_guard<std::recursive_mutex> lock(_mutex);
try {
const auto& outputDims = m_trtEngine->getOutputDims();
auto numChannels = outputDims[0].d[1];
auto numAnchors = outputDims[0].d[2];
std::vector<cv::Rect> bboxes;
std::vector<float> scores;
std::vector<int> labels;
std::vector<int> indices;
std::vector<std::vector<float>> kpss;
// Wrap the flat buffer (no copy) and transpose so each row is one anchor:
// [cx, cy, w, h, score, kp0x, kp0y, kp0s, ...] — keypoint data starts at
// offset 5 (kps_ptr below).
cv::Mat output = cv::Mat(numChannels, numAnchors, CV_32F, featureVector.data());
output = output.t();
// Get all the YOLO proposals
for (int i = 0; i < numAnchors; i++) {
auto rowPtr = output.row(i).ptr<float>();
auto bboxesPtr = rowPtr;
auto scoresPtr = rowPtr + 4;
auto kps_ptr = rowPtr + 5;
float score = *scoresPtr;
if (score > this->_modelConfig.detectionScoreThreshold)
{
// Center/size box -> corner coordinates, rescaled by meta.ratio and
// clamped to the original image bounds.
float x = *bboxesPtr++;
float y = *bboxesPtr++;
float w = *bboxesPtr++;
float h = *bboxesPtr;
float x0 = std::clamp((x - 0.5f * w) * meta.ratio, 0.f, meta.imgWidth);
float y0 = std::clamp((y - 0.5f * h) * meta.ratio, 0.f, meta.imgHeight);
float x1 = std::clamp((x + 0.5f * w) * meta.ratio, 0.f, meta.imgWidth);
float y1 = std::clamp((y + 0.5f * h) * meta.ratio, 0.f, meta.imgHeight);
cv::Rect_<float> bbox;
bbox.x = x0;
bbox.y = y0;
bbox.width = x1 - x0;
bbox.height = y1 - y0;
// Re-clamp so the box stays fully inside the image.
bbox.x = std::max(0.f, bbox.x);
bbox.y = std::max(0.f, bbox.y);
bbox.width = std::min(meta.imgWidth - bbox.x, bbox.width);
bbox.height = std::min(meta.imgHeight - bbox.y, bbox.height);
// Keypoints are (x, y, score) triples; x/y are rescaled and clamped,
// the per-keypoint score is passed through unchanged.
std::vector<float> kps;
for (int k = 0; k < NUM_KPS; k++) {
float kpsX = *(kps_ptr + 3 * k) * meta.ratio;
float kpsY = *(kps_ptr + 3 * k + 1) * meta.ratio;
float kpsS = *(kps_ptr + 3 * k + 2);
kpsX = std::clamp(kpsX, 0.f, meta.imgWidth);
kpsY = std::clamp(kpsY, 0.f, meta.imgHeight);
kps.push_back(kpsX);
kps.push_back(kpsY);
kps.push_back(kpsS);
}
bboxes.push_back(bbox);
labels.push_back(0); // All detected objects are people
scores.push_back(score);
kpss.push_back(kps);
}
}
// Run NMS
cv::dnn::NMSBoxesBatched(bboxes, scores, labels, PROBABILITY_THRESHOLD, NMS_THRESHOLD, indices);
std::vector<Object> objects;
int classNameSize = static_cast<int>(_classes.size());
// Choose the top k detections
for (auto& chosenIdx : indices) {
if (scores[chosenIdx] > _modelConfig.detectionScoreThreshold) {
Object obj{};
obj.confidence = scores[chosenIdx];
obj.classId = labels[chosenIdx];
// Map the class id to a human-readable name with defensive fallbacks.
if (!_classes.empty()) {
if (obj.classId < classNameSize) {
obj.className = _classes[obj.classId];
}
else {
obj.className = _classes[classNameSize - 1]; // Use last valid class name if out of range
}
}
else {
obj.className = "Unknown"; // Fallback if _classes is empty
}
obj.box = bboxes[chosenIdx];
obj.polygon = ANSUtilityHelper::RectToNormalizedPolygon(obj.box, meta.imgWidth, meta.imgHeight);
obj.kps = kpss[chosenIdx];
obj.cameraId = camera_id;
objects.push_back(obj);
}
}
//EnqueueDetection(objects, camera_id);
return objects;
}
catch (std::exception& e) {
this->_logger.LogFatal("TENSORRTOD::PostProcessPose", e.what(), __FILE__, __LINE__);
std::vector<Object> result;
result.clear();
return result;
}
}
// Decodes a multi-class YOLO detection head: after transpose each anchor row
// holds [cx, cy, w, h, class scores...]. Returns NMS-filtered Objects with
// class names resolved from _classes.
std::vector<Object> TENSORRTOD::Postprocess(std::vector<float>& featureVector, const std::string& camera_id, const ImageMetadata& meta) {
std::lock_guard<std::recursive_mutex> lock(_mutex);
try {
const auto& outputDims = m_trtEngine->getOutputDims();
auto numChannels = outputDims[0].d[1];
auto numAnchors = outputDims[0].d[2];
auto numClasses = _classes.size();
std::vector<cv::Rect> bboxes;
std::vector<float> scores;
std::vector<int> labels;
std::vector<int> indices;
// Wrap the flat buffer (no copy) and transpose so each row is one anchor.
cv::Mat output = cv::Mat(numChannels, numAnchors, CV_32F, featureVector.data());
output = output.t();
// Get all the YOLO proposals
for (int i = 0; i < numAnchors; i++) {
auto rowPtr = output.row(i).ptr<float>();
auto bboxesPtr = rowPtr;
auto scoresPtr = rowPtr + 4;
auto maxSPtr = std::max_element(scoresPtr, scoresPtr + numClasses);
float score = *maxSPtr;
if (score > this->_modelConfig.detectionScoreThreshold) {
// Center/size box -> corner coordinates, rescaled and clamped to the image.
float x = *bboxesPtr++;
float y = *bboxesPtr++;
float w = *bboxesPtr++;
float h = *bboxesPtr;
float x0 = std::clamp((x - 0.5f * w) * meta.ratio, 0.f, meta.imgWidth);
float y0 = std::clamp((y - 0.5f * h) * meta.ratio, 0.f, meta.imgHeight);
float x1 = std::clamp((x + 0.5f * w) * meta.ratio, 0.f, meta.imgWidth);
float y1 = std::clamp((y + 0.5f * h) * meta.ratio, 0.f, meta.imgHeight);
int label = maxSPtr - scoresPtr;
cv::Rect_<float> bbox;
bbox.x = x0;
bbox.y = y0;
bbox.width = x1 - x0;
bbox.height = y1 - y0;
bbox.x = std::max(0.f, bbox.x);
bbox.y = std::max(0.f, bbox.y);
bbox.width = std::min(meta.imgWidth - bbox.x, bbox.width);
bbox.height = std::min(meta.imgHeight - bbox.y, bbox.height);
bboxes.push_back(bbox);
labels.push_back(label);
scores.push_back(score);
}
}
// Run NMS
cv::dnn::NMSBoxesBatched(bboxes, scores, labels, PROBABILITY_THRESHOLD, NMS_THRESHOLD, indices);
std::vector<Object> objects;
int classNameSize = static_cast<int>(_classes.size());
// Choose the top k detections
for (auto& chosenIdx : indices) {
if (scores[chosenIdx] > _modelConfig.detectionScoreThreshold) {
Object obj{};
obj.confidence = scores[chosenIdx];
obj.classId = labels[chosenIdx];
obj.box = bboxes[chosenIdx];
obj.polygon = ANSUtilityHelper::RectToNormalizedPolygon(obj.box, meta.imgWidth, meta.imgHeight);
// Map the class id to a human-readable name with defensive fallbacks.
if (!_classes.empty()) {
if (obj.classId < classNameSize) {
obj.className = _classes[obj.classId];
}
else {
obj.className = _classes[classNameSize - 1]; // Use last valid class name if out of range
}
}
else {
obj.className = "Unknown"; // Fallback if _classes is empty
}
obj.cameraId = camera_id;
objects.push_back(obj);
}
}
//EnqueueDetection(objects, camera_id);
return objects;
}
catch (const std::exception& e) {
// Fixed: log tag previously read "Postproces" (typo).
this->_logger.LogFatal("TENSORRTOD::Postprocess", e.what(), __FILE__, __LINE__);
return {};
}
}
std::vector<std::vector<Object>> TENSORRTOD::DetectObjectsBatch(const std::vector<cv::Mat>& inputImages, const std::string& camera_id) {
    // Batched detection entry point. Returns one vector<Object> per input image,
    // in input order. An empty OUTER vector signals failure; an empty INNER
    // vector means "no detections for that image".
    //
    // Strategy:
    //  1. Batches larger than the engine's max batch are split into chunks,
    //     each processed recursively and sequentially (to avoid GPU contention
    //     on the same engine).
    //  2. Batches are padded up to the next power of two (capped at the engine
    //     max) by duplicating the last image, so every inference hits a
    //     pre-warmed CUDA graph shape; padded results are discarded.
    //  3. Per-image postprocessing runs in parallel via std::async; futures are
    //     gathered before returning, so `metadata` outlives every lambda.
    if (inputImages.empty()) {
        _logger.LogError("TENSORRTOD::DetectObjectsBatch",
            "Empty input images vector", __FILE__, __LINE__);
        return {};
    }
    // Auto-split if batch exceeds engine capacity (maxBatch is clamped to >= 1).
    const int maxBatch = m_options.maxBatchSize > 0 ? m_options.maxBatchSize : 1;
    if (static_cast<int>(inputImages.size()) > maxBatch) {
        const size_t numImages = inputImages.size();
        std::vector<std::vector<Object>> allResults;
        allResults.reserve(numImages);
        // Process chunks sequentially to avoid GPU contention on the same engine
        for (size_t start = 0; start < numImages; start += static_cast<size_t>(maxBatch)) {
            const size_t end = std::min(start + static_cast<size_t>(maxBatch), numImages);
            std::vector<cv::Mat> chunk(inputImages.begin() + start, inputImages.begin() + end);
            auto chunkResults = DetectObjectsBatch(chunk, camera_id);
            if (chunkResults.size() != chunk.size()) {
                // Log tag fixed: was misspelled "TENSORTRTOD".
                _logger.LogError("TENSORRTOD::DetectObjectsBatch",
                    "Chunk returned " + std::to_string(chunkResults.size()) +
                    " results, expected " + std::to_string(chunk.size()) +
                    ". Padding with empty results.", __FILE__, __LINE__);
            }
            // Append whatever the chunk produced, then pad with empty result
            // lists so output stays aligned with the input image order.
            for (auto& r : chunkResults) allResults.push_back(std::move(r));
            for (size_t pad = chunkResults.size(); pad < chunk.size(); ++pad) {
                allResults.push_back({});
            }
        }
        return allResults;
    }
    try {
        const size_t realCount = inputImages.size();
        // ── Pad batch to next power-of-2 ─────────────────────────────
        // Eliminates batch-size thrashing (e.g. 3→4→3→4) and ensures
        // every inference hits a pre-warmed CUDA graph. The padding
        // images are duplicates of the last real image (cheapest option
        // — avoids allocating new cv::Mat memory).
        size_t paddedCount = 1;
        while (paddedCount < realCount) paddedCount *= 2;
        // Clamp to engine max batch
        paddedCount = std::min(paddedCount, static_cast<size_t>(maxBatch));
        const std::vector<cv::Mat>* batchPtr = &inputImages;
        std::vector<cv::Mat> paddedImages;
        if (paddedCount > realCount) {
            paddedImages.reserve(paddedCount);
            paddedImages.insert(paddedImages.end(), inputImages.begin(), inputImages.end());
            // Duplicate last image for padding slots (shallow cv::Mat copies)
            for (size_t p = realCount; p < paddedCount; ++p)
                paddedImages.push_back(inputImages.back());
            batchPtr = &paddedImages;
        }
        // Create local metadata for this batch
        BatchMetadata metadata;
        // Preprocess all images in batch (including padding)
        const auto inputs = PreprocessBatch(*batchPtr, metadata);
        if (inputs.empty() || inputs[0].empty()) {
            _logger.LogError("TENSORRTOD::DetectObjectsBatch",
                "Preprocessing failed", __FILE__, __LINE__);
            return {};
        }
        // Run batch inference
        std::vector<std::vector<std::vector<float>>> featureVectors;
        auto succ = m_trtEngine->runInference(inputs, featureVectors);
        if (!succ) {
            _logger.LogError("TENSORRTOD::DetectObjectsBatch",
                "Error running inference", __FILE__, __LINE__);
            // Do NOT set _isFixedBatch = true here. A transient failure (CUDA OOM,
            // stream error, etc.) should not permanently fall back to single-image mode.
            return {};
        }
        // Validate output size (against padded count)
        if (featureVectors.size() != paddedCount) {
            _logger.LogError("TENSORRTOD::DetectObjectsBatch",
                "Output batch size mismatch", __FILE__, __LINE__);
            return {};
        }
        // Trim to real count — discard padding results
        featureVectors.resize(realCount);
        // Process results in parallel -- each image's postprocess is fully
        // independent; no shared mutable state exists between per-image calls.
        const auto& outputDims = m_trtEngine->getOutputDims();
        const size_t numOutputs = outputDims.size();
        const size_t numBatch = featureVectors.size();
        std::vector<std::vector<Object>> batchDetections(numBatch);
        std::vector<std::future<std::vector<Object>>> postFutures;
        postFutures.reserve(numBatch);
        for (size_t batchIdx = 0; batchIdx < numBatch; ++batchIdx) {
            const auto& batchOutput = featureVectors[batchIdx];
            // Snapshot per-image metadata for the pose/segmentation paths.
            ImageMetadata imgMeta;
            imgMeta.ratio = metadata.ratios[batchIdx];
            imgMeta.imgWidth = static_cast<float>(metadata.imgWidths[batchIdx]);
            imgMeta.imgHeight = static_cast<float>(metadata.imgHeights[batchIdx]);
            if (numOutputs == 1) {
                std::vector<float> featureVector =
                    batchOutput.empty() ? std::vector<float>{} : batchOutput[0];
                if (batchOutput.empty()) {
                    _logger.LogWarn("TENSORRTOD::DetectObjectsBatch",
                        "Empty output for image " + std::to_string(batchIdx),
                        __FILE__, __LINE__);
                }
                const int numChannels = outputDims[0].d[1];
                if (numChannels == 56) {
                    // 56 channels selects the pose path — presumably the
                    // YOLO-pose layout (4 box + 1 conf + 17 keypoints x 3);
                    // confirm against the exported model.
                    postFutures.push_back(std::async(std::launch::async,
                        [this, fv = std::move(featureVector), cid = camera_id, meta = imgMeta]() mutable {
                            return PostProcessPose(fv, cid, meta);
                        }));
                } else {
                    // metadata is captured by reference: safe because all
                    // futures are gathered below before metadata is destroyed.
                    postFutures.push_back(std::async(std::launch::async,
                        [this, fv = std::move(featureVector), cid = camera_id, idx = batchIdx, &metadata]() mutable {
                            return PostprocessBatch(fv, cid, idx, metadata);
                        }));
                }
            } else {
                if (batchOutput.empty()) {
                    _logger.LogWarn("TENSORRTOD::DetectObjectsBatch",
                        "Empty output for image " + std::to_string(batchIdx),
                        __FILE__, __LINE__);
                }
                // Multi-output model: hand all output tensors to segmentation.
                std::vector<std::vector<float>> featureVector2d;
                featureVector2d.reserve(batchOutput.size());
                for (const auto& out : batchOutput) featureVector2d.push_back(out);
                postFutures.push_back(std::async(std::launch::async,
                    [this, fv2d = std::move(featureVector2d), cid = camera_id, meta = imgMeta]() mutable {
                        return PostProcessSegmentation(fv2d, cid, meta);
                    }));
            }
        }
        // Gather results in original order; metadata stays alive until here.
        for (size_t i = 0; i < numBatch; ++i)
            batchDetections[i] = postFutures[i].get();
        return batchDetections;
    }
    catch (const std::exception& e) {
        _logger.LogFatal("TENSORRTOD::DetectObjectsBatch", e.what(), __FILE__, __LINE__);
        return {};
    }
}
std::vector<std::vector<cv::cuda::GpuMat>> TENSORRTOD::PreprocessBatch(const std::vector<cv::Mat>& inputImages, BatchMetadata& outMetadata) {
    // Prepares a batch of images for inference:
    //  - validates license, inputs, and the model's input dimensions
    //  - records each image's original size and resize ratio into outMetadata
    //  - resizes on CPU (plain resize for classification models, aspect-ratio
    //    letterbox with right/bottom padding for detection models), converts
    //    BGR->RGB, and uploads each image to the GPU on a single CUDA stream.
    // Returns a one-element outer vector holding one GpuMat per image (the
    // engine's expected [input][batch] layout), or an empty vector on failure.
    if (!_licenseValid) {
        _logger.LogError("TENSORRTOD::PreprocessBatch",
            "Invalid license", __FILE__, __LINE__);
        return {};
    }
    if (inputImages.empty()) {
        _logger.LogError("TENSORRTOD::PreprocessBatch",
            "Empty input images vector", __FILE__, __LINE__);
        return {};
    }
    try {
        const auto& inputDims = m_trtEngine->getInputDims();
        if (inputDims.empty()) {
            _logger.LogError("TENSORRTOD::PreprocessBatch",
                "No input dimensions available", __FILE__, __LINE__);
            return {};
        }
        const int inputH = inputDims[0].d[1];
        const int inputW = inputDims[0].d[2];
        if (inputH <= 0 || inputW <= 0) {
            _logger.LogError("TENSORRTOD::PreprocessBatch",
                "Invalid model input dimensions", __FILE__, __LINE__);
            return {};
        }
        // Hoisted out of the per-image loop: the output dims (and therefore
        // the classification-vs-detection decision) are engine properties and
        // identical for every image in the batch.
        const auto& outputDims = m_trtEngine->getOutputDims();
        const bool isClassification = !outputDims.empty() && outputDims[0].nbDims <= 2;
        // Initialize output metadata
        const size_t imageCount = inputImages.size();
        outMetadata.imgHeights.resize(imageCount);
        outMetadata.imgWidths.resize(imageCount);
        outMetadata.ratios.resize(imageCount);
        std::vector<cv::cuda::GpuMat> batchProcessed;
        batchProcessed.reserve(imageCount);
        cv::cuda::Stream stream;
        for (size_t i = 0; i < imageCount; ++i) {
            const auto& inputImage = inputImages[i];
            if (inputImage.empty()) {
                _logger.LogError("TENSORRTOD::PreprocessBatch",
                    "Empty input image at index " + std::to_string(i),
                    __FILE__, __LINE__);
                return {};
            }
            // CPU preprocessing: resize + BGR->RGB before GPU upload.
            // srcImg is a shallow header over the input; cvtColor allocates a
            // fresh destination, so the caller's image is never modified.
            cv::Mat srcImg = inputImage;
            if (srcImg.channels() == 1) {
                cv::cvtColor(srcImg, srcImg, cv::COLOR_GRAY2BGR);
            } else if (srcImg.channels() != 3) {
                _logger.LogError("TENSORRTOD::PreprocessBatch",
                    "Unsupported channel count at index " + std::to_string(i),
                    __FILE__, __LINE__);
                return {};
            }
            // Store in output metadata from ORIGINAL image
            outMetadata.imgHeights[i] = srcImg.rows;
            outMetadata.imgWidths[i] = srcImg.cols;
            if (outMetadata.imgHeights[i] <= 0 || outMetadata.imgWidths[i] <= 0) {
                _logger.LogError("TENSORRTOD::PreprocessBatch",
                    "Invalid dimensions for image " + std::to_string(i),
                    __FILE__, __LINE__);
                return {};
            }
            // Ratio maps network coordinates back to original-image pixels;
            // classification has no box decode, so it stays 1.
            const float scaleW = inputW / static_cast<float>(srcImg.cols);
            const float scaleH = inputH / static_cast<float>(srcImg.rows);
            outMetadata.ratios[i] = isClassification ? 1.f : 1.f / std::min(scaleW, scaleH);
            cv::Mat cpuResized;
            if (srcImg.rows != inputH || srcImg.cols != inputW) {
                if (isClassification) {
                    cv::resize(srcImg, cpuResized, cv::Size(inputW, inputH), 0, 0, cv::INTER_LINEAR);
                } else {
                    cpuResized = Engine<float>::cpuResizeKeepAspectRatioPadRightBottom(srcImg, inputH, inputW);
                }
            } else {
                cpuResized = srcImg;
            }
            cv::Mat cpuRGB;
            cv::cvtColor(cpuResized, cpuRGB, cv::COLOR_BGR2RGB);
            cv::cuda::GpuMat gpuResized;
            gpuResized.upload(cpuRGB, stream);
            batchProcessed.push_back(std::move(gpuResized));
        }
        // All uploads were queued asynchronously on one stream; block until
        // every transfer has landed before handing the GpuMats to the engine.
        stream.waitForCompletion();
        std::vector<std::vector<cv::cuda::GpuMat>> inputs;
        inputs.push_back(std::move(batchProcessed));
        return inputs;
    }
    catch (const std::exception& e) {
        _logger.LogFatal("TENSORRTOD::PreprocessBatch", e.what(), __FILE__, __LINE__);
        return {};
    }
}
std::vector<Object> TENSORRTOD::PostprocessBatch(std::vector<float>& featureVector,const std::string& camera_id,size_t batchIdx,const BatchMetadata& metadata) {
try {
// Bounds checking
if (batchIdx >= metadata.ratios.size() ||
batchIdx >= metadata.imgWidths.size() ||
batchIdx >= metadata.imgHeights.size()) {
_logger.LogError("TENSORRTOD::PostprocessBatch",
"Batch index out of range", __FILE__, __LINE__);
return {};
}
const auto& outputDims = m_trtEngine->getOutputDims();
auto numChannels = outputDims[0].d[1];
auto numAnchors = outputDims[0].d[2];
auto numClasses = _classes.size();
// Get batch-specific metadata - NO LOCK NEEDED!
const float ratio = metadata.ratios[batchIdx];
const int imgWidth = metadata.imgWidths[batchIdx];
const int imgHeight = metadata.imgHeights[batchIdx];
std::vector<cv::Rect> bboxes;
std::vector<float> scores;
std::vector<int> labels;
std::vector<int> indices;
bboxes.reserve(numAnchors / 10);
scores.reserve(numAnchors / 10);
labels.reserve(numAnchors / 10);
cv::Mat output = cv::Mat(numChannels, numAnchors, CV_32F, featureVector.data());
output = output.t();
for (int i = 0; i < numAnchors; i++) {
auto rowPtr = output.row(i).ptr<float>();
auto bboxesPtr = rowPtr;
auto scoresPtr = rowPtr + 4;
auto maxSPtr = std::max_element(scoresPtr, scoresPtr + numClasses);
float score = *maxSPtr;
if (score > _modelConfig.detectionScoreThreshold) {
float x = *bboxesPtr++;
float y = *bboxesPtr++;
float w = *bboxesPtr++;
float h = *bboxesPtr;
float x0 = std::clamp((x - 0.5f * w) * ratio, 0.f, static_cast<float>(imgWidth));
float y0 = std::clamp((y - 0.5f * h) * ratio, 0.f, static_cast<float>(imgHeight));
float x1 = std::clamp((x + 0.5f * w) * ratio, 0.f, static_cast<float>(imgWidth));
float y1 = std::clamp((y + 0.5f * h) * ratio, 0.f, static_cast<float>(imgHeight));
int label = static_cast<int>(maxSPtr - scoresPtr);
cv::Rect_<float> bbox;
bbox.x = x0;
bbox.y = y0;
bbox.width = x1 - x0;
bbox.height = y1 - y0;
if (bbox.width > 0.f && bbox.height > 0.f) {
bboxes.push_back(bbox);
labels.push_back(label);
scores.push_back(score);
}
}
}
cv::dnn::NMSBoxesBatched(bboxes, scores, labels,
PROBABILITY_THRESHOLD, NMS_THRESHOLD, indices);
int classNameSize = static_cast<int>(_classes.size());
std::vector<Object> objects;
objects.reserve(indices.size());
for (auto& chosenIdx : indices) {
if (scores[chosenIdx] > _modelConfig.detectionScoreThreshold) {
Object obj{};
obj.confidence = scores[chosenIdx];
obj.classId = labels[chosenIdx];
obj.box = bboxes[chosenIdx];
obj.polygon = ANSUtilityHelper::RectToNormalizedPolygon(
obj.box, imgWidth, imgHeight);
if (!_classes.empty()) {
if (obj.classId < classNameSize) {
obj.className = _classes[obj.classId];
}
else {
obj.className = _classes[classNameSize - 1];
}
}
else {
obj.className = "Unknown";
}
obj.cameraId = camera_id;
objects.push_back(std::move(obj));
}
}
return objects;
}
catch (const std::exception& e) {
_logger.LogFatal("TENSORRTOD::PostprocessBatch", e.what(), __FILE__, __LINE__);
return {};
}
}
}