Refactor project structure
This commit is contained in:
988
modules/ANSODEngine/ANSTENSORRTCL.cpp
Normal file
988
modules/ANSODEngine/ANSTENSORRTCL.cpp
Normal file
@@ -0,0 +1,988 @@
|
||||
#include "ANSTENSORRTCL.h"
|
||||
#include "Utility.h"
|
||||
#include <opencv2/cudaimgproc.hpp>
|
||||
#include <future>
|
||||
namespace ANSCENTER
|
||||
{
|
||||
// Builds (or rebuilds) the TensorRT engine from the raw model file referenced by
// _modelFilePath. On success, optimizedModelFolder is set to the folder that
// contains the raw model (the engine cache directory). Thread-safe via _mutex.
// Returns false if the base-class step fails, the raw model file is missing,
// or the engine build fails.
bool TENSORRTCL::OptimizeModel(bool fp16, std::string& optimizedModelFolder) {
    std::lock_guard<std::recursive_mutex> lock(_mutex);
    // Base-class bookkeeping first; abort early on failure.
    if (!ANSODBase::OptimizeModel(fp16, optimizedModelFolder)) {
        return false;
    }
    if (!FileExist(_modelFilePath)) {
        this->_logger.LogFatal("TENSORRTCL::OptimizeModel", "Raw model file path does not exist", __FILE__, __LINE__);
        return false;
    }
    try {
        _fp16 = fp16;
        // The engine cache lives next to the raw model file.
        optimizedModelFolder = GetParentFolder(_modelFilePath);
        // Check if the engine already exists to avoid reinitializing
        if (!m_trtEngine) {
            // Batch and dynamic-shape limits come from the model configuration.
            // NOTE(review): the original comment claimed a fixed batch size of 1,
            // but the options are taken from _modelConfig — confirm intent.
            m_options.optBatchSize = _modelConfig.gpuOptBatchSize;
            m_options.maxBatchSize = _modelConfig.gpuMaxBatchSize;
            m_options.deviceIndex = _modelConfig.gpuDeviceIndex;

            m_options.maxInputHeight = _modelConfig.maxInputHeight;
            m_options.minInputHeight = _modelConfig.minInputHeight;
            m_options.optInputHeight = _modelConfig.optInputHeight;
            m_options.maxInputWidth = _modelConfig.maxInputWidth;
            m_options.minInputWidth = _modelConfig.minInputWidth;
            m_options.optInputWidth = _modelConfig.optInputWidth;

            m_options.engineFileDir = optimizedModelFolder;
            // Use FP16 or FP32 precision based on the input flag
            m_options.precision = (_fp16 ? Precision::FP16 : Precision::FP32);
            // Create the TensorRT inference engine
            m_trtEngine = std::make_unique<Engine<float>>(m_options);
        }

        // Build the TensorRT engine (with retry) from the raw model file.
        auto succ = m_trtEngine->buildWithRetry(_modelFilePath, SUB_VALS, DIV_VALS, NORMALIZE);
        if (!succ) {
            const std::string errMsg =
                "Error: Unable to build the TensorRT engine. "
                "Try increasing TensorRT log severity to kVERBOSE.";
            this->_logger.LogError("TENSORRTCL::OptimizeModel", errMsg, __FILE__, __LINE__);
            _modelLoadValid = false;
            return false;
        }
        _modelLoadValid = true;
        return true;
    }
    catch (const std::exception& e) {
        this->_logger.LogFatal("TENSORRTCL::OptimizeModel", e.what(), __FILE__, __LINE__);
        // Clear the output folder so callers do not use a half-initialized path.
        optimizedModelFolder.clear();
        return false;
    }
}
|
||||
// Loads a classification model from a (possibly password-protected) zip file.
// Forces the configuration into TensorRT classification mode (224x224 input),
// clamps thresholds to sane defaults, creates the TensorRT engine wrapper if
// needed, resolves class names from either the config file or a legacy layout,
// and optionally loads the engine weights. Thread-safe via _mutex.
bool TENSORRTCL::LoadModel(const std::string& modelZipFilePath, const std::string& modelZipPassword) {
    std::lock_guard<std::recursive_mutex> lock(_mutex);
    try {
        bool result = ANSODBase::LoadModel(modelZipFilePath, modelZipPassword);
        if (!result) return false;
        // This class always runs TensorRT classification at 224x224.
        _modelConfig.detectionType = ANSCENTER::DetectionType::CLASSIFICATION;
        _modelConfig.modelType = ModelType::TENSORRT;
        _modelConfig.inpHeight = 224;
        _modelConfig.inpWidth = 224;
        // Clamp implausibly low thresholds to 0.5 defaults.
        if (_modelConfig.modelMNSThreshold < 0.2)
            _modelConfig.modelMNSThreshold = 0.5;
        if (_modelConfig.modelConfThreshold < 0.2)
            _modelConfig.modelConfThreshold = 0.5;
        if (_modelConfig.numKPS <= 0 || _modelConfig.numKPS > 133) // 133 = COCO wholebody max
            _modelConfig.numKPS = 17;
        // NOTE(review): exact floating-point == 0 comparison; values like 1e-7
        // would slip through — confirm whether a tolerance is intended.
        if (_modelConfig.kpsThreshold == 0)_modelConfig.kpsThreshold = 0.5; // If not define
        // if (_modelConfig.precisionType == PrecisionType::FP16)_fp16 = true;
        _fp16 = true; // Load Model from Here
        // Load Model from Here
        // Fixed post-processing constants shared with the other loaders.
        TOP_K = 100;
        SEG_CHANNELS = 32;
        PROBABILITY_THRESHOLD = 0.3;
        NMS_THRESHOLD = 0.65f;
        SEGMENTATION_THRESHOLD = 0.5f;
        SEG_H = 160;
        SEG_W = 160;
        NUM_KPS = _modelConfig.numKPS;
        KPS_THRESHOLD = _modelConfig.kpsThreshold;
        SEG_CHANNELS = 32; // For segmentation

        // Lazily create the engine wrapper with options from the model config.
        if (!m_trtEngine) {
            // Fixed batch size of 1 for this model
            m_options.optBatchSize = _modelConfig.gpuOptBatchSize;
            m_options.maxBatchSize = _modelConfig.gpuMaxBatchSize;
            m_options.deviceIndex = _modelConfig.gpuDeviceIndex;

            m_options.maxInputHeight = _modelConfig.maxInputHeight;
            m_options.minInputHeight = _modelConfig.minInputHeight;
            m_options.optInputHeight = _modelConfig.optInputHeight;
            m_options.maxInputWidth = _modelConfig.maxInputWidth;
            m_options.minInputWidth = _modelConfig.minInputWidth;
            m_options.optInputWidth = _modelConfig.optInputWidth;


            m_options.engineFileDir = _modelFolder;
            // Use FP16 or FP32 precision based on the input flag
            m_options.precision = (_fp16 ? Precision::FP16 : Precision::FP32);
            // Create the TensorRT inference engine
            m_trtEngine = std::make_unique<Engine<float>>(m_options);
        }
        // 0. Check if the configuration file exist
        if (FileExist(_modelConfigFile)) {
            ModelType modelType;
            std::vector<int> inputShape;
            // Config file supplies class names and (optionally) the input shape.
            _classes = ANSUtilityHelper::GetConfigFileContent(_modelConfigFile, modelType, inputShape);
            if (inputShape.size() == 2) {
                if (inputShape[0] > 0)_modelConfig.inpHeight = inputShape[0];
                if (inputShape[1] > 0)_modelConfig.inpWidth = inputShape[1];
            }
        }
        else {// This is old version of model zip file
            // Legacy layout: fixed file names inside the model folder.
            _modelFilePath = CreateFilePath(_modelFolder, "train_last.onnx");
            _classFilePath = CreateFilePath(_modelFolder, "classes.names");
            std::ifstream isValidFileName(_classFilePath);
            if (!isValidFileName)
            {
                // NOTE(review): log context tag says "TENSORRTCL::Initialize"
                // inside LoadModel — likely copy-paste; confirm before changing.
                this->_logger.LogDebug("TENSORRTCL::Initialize. Load classes from string", _classFilePath, __FILE__, __LINE__);
                LoadClassesFromString();
            }
            else {
                this->_logger.LogDebug("TENSORRTCL::Initialize. Load classes from file", _classFilePath, __FILE__, __LINE__);
                LoadClassesFromFile();
            }
        }
        // Load the TensorRT engine file
        if (this->_loadEngineOnCreation) {
            auto succ = m_trtEngine->buildLoadNetwork(_modelFilePath, SUB_VALS, DIV_VALS, NORMALIZE, m_maxSlotsPerGpu);
            if (!succ) {
                const std::string errMsg = "Error: Unable to load TensorRT engine weights into memory. " + _modelFilePath;
                this->_logger.LogError("TENSORRTCL::Initialize", errMsg, __FILE__, __LINE__);
                _modelLoadValid = false;
                return false;
            }
        }
        _modelLoadValid = true;
        _isInitialized = true;
        return true;
    }
    catch (std::exception& e) {
        this->_logger.LogFatal("TENSORRTCL::LoadModel", e.what(), __FILE__, __LINE__);
        return false;
    }

}
|
||||
// Loads a classification model directly from a folder (no zip extraction).
// Mirrors LoadModel(): forces TensorRT classification config, creates the
// engine wrapper, resolves classes (config file or legacy file names built
// from modelName/className), fills labelMap with a comma-separated class list,
// and optionally loads the engine weights. Thread-safe via _mutex.
bool TENSORRTCL::LoadModelFromFolder(std::string licenseKey, ModelConfig modelConfig, std::string modelName, std::string className, const std::string& modelFolder, std::string& labelMap) {
    std::lock_guard<std::recursive_mutex> lock(_mutex);
    try {
        bool result = ANSODBase::LoadModelFromFolder(licenseKey, modelConfig, modelName, className, modelFolder, labelMap);
        if (!result) return false;
        // Default model file stem when no model name is given.
        std::string _modelName = modelName;
        if (_modelName.empty()) {
            _modelName = "train_last";
        }
        std::string modelFullName = _modelName + ".onnx";
        // Parsing for YOLO only here
        _modelConfig = modelConfig;
        // This class always runs TensorRT classification at 224x224.
        _modelConfig.detectionType = ANSCENTER::DetectionType::CLASSIFICATION;
        _modelConfig.modelType = ModelType::TENSORRT;
        _modelConfig.inpHeight = 224;
        _modelConfig.inpWidth = 224;
        // Clamp implausibly low thresholds to 0.5 defaults.
        if (_modelConfig.modelMNSThreshold < 0.2)
            _modelConfig.modelMNSThreshold = 0.5;
        if (_modelConfig.modelConfThreshold < 0.2)
            _modelConfig.modelConfThreshold = 0.5;
        if (_modelConfig.numKPS <= 0 || _modelConfig.numKPS > 133) // 133 = COCO wholebody max
            _modelConfig.numKPS = 17;
        if (_modelConfig.kpsThreshold == 0)_modelConfig.kpsThreshold = 0.5; // If not define
        _fp16 = true; // Load Model from Here
        // Load Model from Here
        // Fixed post-processing constants shared with the other loaders.
        TOP_K = 100;
        SEG_CHANNELS = 32;
        PROBABILITY_THRESHOLD = 0.3;
        NMS_THRESHOLD = 0.65f;
        SEGMENTATION_THRESHOLD = 0.5f;
        SEG_H = 160;
        SEG_W = 160;
        NUM_KPS = _modelConfig.numKPS;
        KPS_THRESHOLD = _modelConfig.kpsThreshold;
        SEG_CHANNELS = 32; // For segmentation
        // Lazily create the engine wrapper with options from the model config.
        if (!m_trtEngine) {
            // Fixed batch size of 1 for this model
            m_options.optBatchSize = _modelConfig.gpuOptBatchSize;
            m_options.maxBatchSize = _modelConfig.gpuMaxBatchSize;
            m_options.deviceIndex = _modelConfig.gpuDeviceIndex;

            m_options.maxInputHeight = _modelConfig.maxInputHeight;
            m_options.minInputHeight = _modelConfig.minInputHeight;
            m_options.optInputHeight = _modelConfig.optInputHeight;
            m_options.maxInputWidth = _modelConfig.maxInputWidth;
            m_options.minInputWidth = _modelConfig.minInputWidth;
            m_options.optInputWidth = _modelConfig.optInputWidth;

            m_options.engineFileDir = _modelFolder;
            // Use FP16 or FP32 precision based on the input flag
            m_options.precision = (_fp16 ? Precision::FP16 : Precision::FP32);
            // Create the TensorRT inference engine
            m_trtEngine = std::make_unique<Engine<float>>(m_options);
        }
        // 0. Check if the configuration file exist
        if (FileExist(_modelConfigFile)) {
            ModelType modelType;
            std::vector<int> inputShape;
            // Config file supplies class names and (optionally) the input shape.
            _classes = ANSUtilityHelper::GetConfigFileContent(_modelConfigFile, modelType, inputShape);
            if (inputShape.size() == 2) {
                if (inputShape[0] > 0)_modelConfig.inpHeight = inputShape[0];
                if (inputShape[1] > 0)_modelConfig.inpWidth = inputShape[1];
            }
        }
        else {// This is old version of model zip file
            // Legacy layout: file names derived from the provided arguments.
            _modelFilePath = CreateFilePath(_modelFolder, modelFullName);
            _classFilePath = CreateFilePath(_modelFolder, className);
            std::ifstream isValidFileName(_classFilePath);
            if (!isValidFileName)
            {
                // NOTE(review): log context tag says "TENSORRTCL::Initialize"
                // inside LoadModelFromFolder — likely copy-paste; confirm.
                this->_logger.LogDebug("TENSORRTCL::Initialize. Load classes from string", _classFilePath, __FILE__, __LINE__);
                LoadClassesFromString();
            }
            else {
                this->_logger.LogDebug("TENSORRTCL::Initialize. Load classes from file", _classFilePath, __FILE__, __LINE__);
                LoadClassesFromFile();
            }
        }
        // 1. Load labelMap and engine
        labelMap.clear();
        if (!_classes.empty())
            labelMap = VectorToCommaSeparatedString(_classes);

        // Load the TensorRT engine file
        if (this->_loadEngineOnCreation) {
            auto succ = m_trtEngine->buildLoadNetwork(_modelFilePath, SUB_VALS, DIV_VALS, NORMALIZE, m_maxSlotsPerGpu);
            if (!succ) {
                const std::string errMsg = "Error: Unable to load TensorRT engine weights into memory. " + _modelFilePath;
                this->_logger.LogError("TENSORRTCL::Initialize", errMsg, __FILE__, __LINE__);
                _modelLoadValid = false;
                return false;
            }

        }
        _modelLoadValid = true;
        _isInitialized = true;
        return true;
    }
    catch (std::exception& e) {
        // NOTE(review): tag says "TENSORRTCL::LoadModel" but this is
        // LoadModelFromFolder — likely copy-paste; confirm before changing.
        this->_logger.LogFatal("TENSORRTCL::LoadModel", e.what(), __FILE__, __LINE__);
        return false;
    }
}
|
||||
// Fully initializes the classifier from a model zip: base-class init, forced
// TensorRT classification config, engine creation, class-name resolution, and
// (unless an engine was already loaded) engine weight loading. labelMap is
// filled with a comma-separated class list. Returns false on any failure.
bool TENSORRTCL::Initialize(std::string licenseKey, ModelConfig modelConfig, const std::string& modelZipFilePath, const std::string& modelZipPassword, std::string& labelMap) {
    // Skip re-loading engine weights if a valid engine already exists.
    // NOTE(review): these members are read BEFORE the lock below is taken;
    // if Initialize can race with other methods this read is unsynchronized.
    const bool engineAlreadyLoaded = _modelLoadValid && _isInitialized && m_trtEngine != nullptr;
    _modelLoadValid = false;
    std::lock_guard<std::recursive_mutex> lock(_mutex);
    try {
        bool result = ANSODBase::Initialize(licenseKey, modelConfig, modelZipFilePath, modelZipPassword, labelMap);
        if (!result) return false;
        // Parsing for YOLO only here
        _modelConfig = modelConfig;
        // This class always runs TensorRT classification at 224x224.
        _modelConfig.detectionType = ANSCENTER::DetectionType::CLASSIFICATION;
        _modelConfig.modelType = ModelType::TENSORRT;
        _modelConfig.inpHeight = 224;
        _modelConfig.inpWidth = 224;
        // Clamp implausibly low thresholds to 0.5 defaults.
        if (_modelConfig.modelMNSThreshold < 0.2)
            _modelConfig.modelMNSThreshold = 0.5;
        if (_modelConfig.modelConfThreshold < 0.2)
            _modelConfig.modelConfThreshold = 0.5;
        if (_modelConfig.numKPS <= 0 || _modelConfig.numKPS > 133) // 133 = COCO wholebody max
            _modelConfig.numKPS = 17;
        if (_modelConfig.kpsThreshold == 0)_modelConfig.kpsThreshold = 0.5; // If not define
        // if (_modelConfig.precisionType == PrecisionType::FP16)_fp16 = true;
        _fp16 = true; // Load Model from Here
        // Load Model from Here
        // Fixed post-processing constants shared with the other loaders.
        TOP_K = 100;
        SEG_CHANNELS = 32;
        PROBABILITY_THRESHOLD = 0.3;
        NMS_THRESHOLD = 0.65f;
        SEGMENTATION_THRESHOLD = 0.5f;
        SEG_H = 160;
        SEG_W = 160;
        NUM_KPS = _modelConfig.numKPS;
        KPS_THRESHOLD = _modelConfig.kpsThreshold;
        SEG_CHANNELS = 32; // For segmentation

        // Lazily create the engine wrapper with options from the model config.
        if (!m_trtEngine) {
            // Fixed batch size of 1 for this model
            m_options.optBatchSize = _modelConfig.gpuOptBatchSize;
            m_options.maxBatchSize = _modelConfig.gpuMaxBatchSize;
            m_options.deviceIndex = _modelConfig.gpuDeviceIndex;

            m_options.maxInputHeight = _modelConfig.maxInputHeight;
            m_options.minInputHeight = _modelConfig.minInputHeight;
            m_options.optInputHeight = _modelConfig.optInputHeight;
            m_options.maxInputWidth = _modelConfig.maxInputWidth;
            m_options.minInputWidth = _modelConfig.minInputWidth;
            m_options.optInputWidth = _modelConfig.optInputWidth;

            m_options.engineFileDir = _modelFolder;
            // Use FP16 or FP32 precision based on the input flag
            m_options.precision = (_fp16 ? Precision::FP16 : Precision::FP32);
            // Create the TensorRT inference engine
            m_trtEngine = std::make_unique<Engine<float>>(m_options);
        }
        // 0. Check if the configuration file exist
        if (FileExist(_modelConfigFile)) {
            ModelType modelType;
            std::vector<int> inputShape;
            // Config file supplies class names and (optionally) the input shape.
            _classes = ANSUtilityHelper::GetConfigFileContent(_modelConfigFile, modelType, inputShape);
            if (inputShape.size() == 2) {
                if (inputShape[0] > 0)_modelConfig.inpHeight = inputShape[0];
                if (inputShape[1] > 0)_modelConfig.inpWidth = inputShape[1];
            }
        }
        else {// This is old version of model zip file
            // Legacy layout: fixed file names inside the model folder.
            _modelFilePath = CreateFilePath(_modelFolder, "train_last.onnx");
            _classFilePath = CreateFilePath(_modelFolder, "classes.names");
            std::ifstream isValidFileName(_classFilePath);
            if (!isValidFileName)
            {
                this->_logger.LogDebug("TENSORRTCL::Initialize. Load classes from string", _classFilePath, __FILE__, __LINE__);
                LoadClassesFromString();
            }
            else {
                this->_logger.LogDebug("TENSORRTCL::Initialize. Load classes from file", _classFilePath, __FILE__, __LINE__);
                LoadClassesFromFile();
            }
        }
        // 1. Load labelMap and engine
        labelMap.clear();
        if (!_classes.empty())
            labelMap = VectorToCommaSeparatedString(_classes);

        // Load the TensorRT engine file (skipped if already loaded above).
        if (this->_loadEngineOnCreation && !engineAlreadyLoaded) {
            auto succ = m_trtEngine->buildLoadNetwork(_modelFilePath, SUB_VALS, DIV_VALS, NORMALIZE, m_maxSlotsPerGpu);
            if (!succ) {
                const std::string errMsg = "Error: Unable to load TensorRT engine weights into memory. " + _modelFilePath;
                this->_logger.LogError("TENSORRTCL::Initialize", errMsg, __FILE__, __LINE__);
                _modelLoadValid = false;
                return false;
            }
        }
        _modelLoadValid = true;
        _isInitialized = true;
        return true;
    }
    catch (std::exception& e) {
        this->_logger.LogFatal("TENSORRTCL::Initialize", e.what(), __FILE__, __LINE__);
        return false;
    }
}
|
||||
// Convenience overload: runs inference tagging results with a default
// camera identifier ("CustomCam").
std::vector<Object> TENSORRTCL::RunInference(const cv::Mat& inputImgBGR) {
    const std::string defaultCameraId = "CustomCam";
    return RunInference(inputImgBGR, defaultCameraId);
}
|
||||
// Runs single-image inference for the given camera. State checks (model
// loaded, license, initialization, minimum image size) are performed under a
// short-lived lock; the actual detection runs outside it. Returns an empty
// vector on any validation failure or exception.
std::vector<Object> TENSORRTCL::RunInference(const cv::Mat& inputImgBGR, const std::string& camera_id)
{
    // Phase 1: validate state while briefly holding the mutex.
    {
        std::lock_guard<std::recursive_mutex> stateLock(_mutex);
        // Small helper so each failed precondition logs identically.
        auto logFailure = [this](const char* reason) {
            _logger.LogError("TENSORRTCL::RunInference", reason, __FILE__, __LINE__);
        };

        if (!_modelLoadValid) {
            logFailure("Cannot load the TensorRT model. Please check if it exists");
            return {};
        }
        if (!_licenseValid) {
            logFailure("Runtime license is not valid or expired. Please contact ANSCENTER");
            return {};
        }
        if (!_isInitialized) {
            logFailure("Model is not initialized");
            return {};
        }

        // Silently reject empty or tiny frames (no log, matching prior behavior).
        const bool frameUnusable =
            inputImgBGR.empty() || inputImgBGR.cols < 5 || inputImgBGR.rows < 5;
        if (frameUnusable) {
            return {};
        }
    }

    // Phase 2: detection runs lock-free; shield callers from exceptions.
    try {
        return DetectObjects(inputImgBGR, camera_id);
    }
    catch (const std::exception& e) {
        _logger.LogFatal("TENSORRTCL::RunInference", e.what(), __FILE__, __LINE__);
        return {};
    }
}
|
||||
// Runs batched inference for the given camera. Preconditions (model loaded,
// license valid, engine initialized, non-empty input) are checked under a
// brief lock; DetectObjectsBatch runs outside it. Returns one result vector
// per input image, or an empty outer vector on any validation failure or
// exception.
// Fix: removed the stray semicolon that followed the closing brace
// (function definitions take no trailing ';'; it triggers -Wextra-semi).
std::vector<std::vector<Object>> TENSORRTCL::RunInferencesBatch(const std::vector<cv::Mat>& inputs, const std::string& camera_id) {
    // Validate state under brief lock
    {
        std::lock_guard<std::recursive_mutex> lock(_mutex);
        if (!_modelLoadValid) {
            this->_logger.LogFatal("TENSORRTCL::RunInferencesBatch",
                "Cannot load the TensorRT model. Please check if it exists", __FILE__, __LINE__);
            return {};
        }

        if (!_licenseValid) {
            this->_logger.LogFatal("TENSORRTCL::RunInferencesBatch",
                "Runtime license is not valid or expired. Please contact ANSCENTER", __FILE__, __LINE__);
            return {};
        }

        if (!_isInitialized) {
            this->_logger.LogFatal("TENSORRTCL::RunInferencesBatch",
                "Engine not initialized", __FILE__, __LINE__);
            return {};
        }

        if (inputs.empty()) return {};
    }
    // Detection runs lock-free; shield callers from exceptions.
    try {
        return DetectObjectsBatch(inputs, camera_id);
    }
    catch (const std::exception& e) {
        this->_logger.LogFatal("TENSORRTCL::RunInferencesBatch", e.what(), __FILE__, __LINE__);
        return {};
    }
}
|
||||
// Destructor: releases the TensorRT engine via Destroy(). Exceptions are
// logged and swallowed so the destructor never throws.
TENSORRTCL::~TENSORRTCL() {
    try {
        Destroy();
    }
    catch (std::exception& ex) {
        // Destructors must not propagate exceptions; record and continue.
        this->_logger.LogError("TENSORRTCL::~TENSORRTCL()", ex.what(), __FILE__, __LINE__);
    }
}
|
||||
bool TENSORRTCL::Destroy() {
|
||||
try {
|
||||
m_trtEngine.reset(); // Releases the current engine and sets m_trtEngine to nullptr.
|
||||
return true;
|
||||
}
|
||||
catch (std::exception& e) {
|
||||
this->_logger.LogError("TENSORRTCL::~TENSORRTCL()", e.what(), __FILE__, __LINE__);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// private
|
||||
// private
// Single-image classification pipeline: (1) bind GPU context and verify CUDA
// health, (2) preprocess under the lock — trying an NV12 fast path before the
// generic BGR path, (3) run inference with the mutex released, (4) postprocess
// under a brief lock. Returns an empty vector on any failure.
std::vector<Object> TENSORRTCL::DetectObjects(const cv::Mat& inputImage, const std::string& camera_id) {
    try {
        // --- 1. Set GPU device context ---
        if (m_trtEngine) {
            m_trtEngine->setDeviceContext();
        }

        // --- 1b. CUDA context health check ---
        if (!m_nv12Helper.isCudaContextHealthy(_logger, "TENSORRTCL")) {
            return {};
        }

        // --- 2. Preprocess under lock ---
        // Try NV12 fast path first (classification: direct resize, no letterbox).
        ImageMetadata meta;
        std::vector<std::vector<cv::cuda::GpuMat>> input;
        {
            std::lock_guard<std::recursive_mutex> lock(_mutex);
            // Target the GPU the engine prefers (0 if the engine is absent).
            const int inferenceGpu = m_trtEngine ? m_trtEngine->getPreferredDeviceIndex() : 0;
            const auto& inputDims = m_trtEngine->getInputDims();
            // Engine input layout here: d[1] = height, d[2] = width.
            const int inputW = inputDims[0].d[2];
            const int inputH = inputDims[0].d[1];

            auto nv12 = m_nv12Helper.tryNV12(inputImage, inferenceGpu, inputW, inputH,
                NV12PreprocessHelper::classificationLauncher(),
                _logger, "TENSORRTCL");
            if (nv12.succeeded) {
                // Fast path produced a ready GPU RGB tensor.
                meta.imgWidth = nv12.metaWidth;
                meta.imgHeight = nv12.metaHeight;
                meta.ratio = 1.f; // classification: no letterbox
                input = {{ std::move(nv12.gpuRGB) }};
            }
            else if (nv12.useBgrFullRes) {
                // Helper decoded a full-resolution BGR frame; preprocess that.
                input = Preprocess(nv12.bgrFullResImg, meta);
            }

            // Fallback: preprocess the caller's image directly.
            if (input.empty()) {
                input = Preprocess(inputImage, meta);
            }
            m_nv12Helper.tickInference();
        }
        if (input.empty()) return {};

        // Phase 2: Inference — mutex released; pool dispatches to idle GPU slot
        std::vector<std::vector<std::vector<float>>> featureVectors;
        auto succ = m_trtEngine->runInference(input, featureVectors);
        if (!succ) {
            this->_logger.LogFatal("TENSORRTCL::DetectObjects", "Error running inference", __FILE__, __LINE__);
            return {};
        }

        // Phase 3: Postprocess under brief lock
        std::lock_guard<std::recursive_mutex> lock(_mutex);
        // Flatten the per-batch/per-output structure into one feature vector.
        std::vector<float> featureVector;
        Engine<float>::transformOutput(featureVectors, featureVector);
        return Postprocess(featureVector, camera_id, meta);
    }
    catch (std::exception& e) {
        this->_logger.LogFatal("TENSORRTCL::DetectObjects", e.what(), __FILE__, __LINE__);
        return {};
    }
}
|
||||
// Prepares one image for classification inference on the GPU: validates the
// license and image, uploads to GPU (expanding grayscale to 3-channel BGR
// first), converts BGR->RGB, records the original dimensions in outMeta, and
// resizes directly to the engine's input size (no letterbox padding — matches
// ANSONNXCL). Returns the nested GpuMat structure the engine expects, or an
// empty vector on any failure.
// Fix: removed the unreachable else-branch — the earlier
// `imgHeight <= 0 || imgWidth <= 0` guard already returns, so the subsequent
// `> 0 && > 0` test could never be false. Behavior is unchanged.
std::vector<std::vector<cv::cuda::GpuMat>> TENSORRTCL::Preprocess(const cv::Mat& inputImage, ImageMetadata& outMeta) {
    try {
        if (!_licenseValid) {
            this->_logger.LogFatal("TENSORRTCL::Preprocess", "Invalid license", __FILE__, __LINE__);
            return {};
        }

        if (inputImage.empty()) {
            this->_logger.LogFatal("TENSORRTCL::Preprocess", "Input image is empty", __FILE__, __LINE__);
            return {};
        }

        if ((inputImage.cols < 5) || (inputImage.rows < 5)) {
            this->_logger.LogFatal("TENSORRTCL::Preprocess",
                "Input image is too small (Width: " + std::to_string(inputImage.cols) +
                ", Height: " + std::to_string(inputImage.rows) + ")",
                __FILE__, __LINE__);
            return {};
        }

        // Engine input layout here: d[1] = height, d[2] = width.
        const auto& inputDims = m_trtEngine->getInputDims();
        const int inputH = inputDims[0].d[1];
        const int inputW = inputDims[0].d[2];

        // Upload the image to GPU memory on a dedicated stream.
        cv::cuda::Stream stream;
        cv::cuda::GpuMat img;

        if (inputImage.channels() == 1) {
            // Convert grayscale to 3-channel BGR before uploading
            cv::Mat img3Channel;
            cv::cvtColor(inputImage, img3Channel, cv::COLOR_GRAY2BGR);
            img.upload(img3Channel, stream);
        }
        else {
            img.upload(inputImage, stream);
        }

        // Convert BGR to RGB
        cv::cuda::GpuMat imgRGB;
        cv::cuda::cvtColor(img, imgRGB, cv::COLOR_BGR2RGB, 0, stream);

        // These parameters will be used in the post-processing stage
        outMeta.imgHeight = imgRGB.rows;
        outMeta.imgWidth = imgRGB.cols;

        if (outMeta.imgHeight <= 0 || outMeta.imgWidth <= 0) {
            _logger.LogFatal("TENSORRTCL::Preprocess", "Image height or width is zero", __FILE__, __LINE__);
            return {};
        }

        // Classification uses no letterbox, so the ratio is always 1.
        outMeta.ratio = 1.f;

        cv::cuda::GpuMat resized = imgRGB;

        // Classification: direct resize (no letterbox padding) — matches ANSONNXCL
        // Must use explicit stream to avoid conflict with CUDA Graph capture on null stream
        if (resized.rows != inputDims[0].d[1] || resized.cols != inputDims[0].d[2]) {
            cv::cuda::resize(imgRGB, resized, cv::Size(inputW, inputH), 0, 0, cv::INTER_LINEAR, stream);
        }

        // Wait for all GPU ops to complete before returning GpuMats
        stream.waitForCompletion();

        // Convert to format expected by our inference engine
        std::vector<cv::cuda::GpuMat> input{ std::move(resized) };
        std::vector<std::vector<cv::cuda::GpuMat>> inputs{ std::move(input) };
        return inputs;
    }
    catch (const std::exception& e) {
        this->_logger.LogFatal("TENSORRTCL::Preprocess", e.what(), __FILE__, __LINE__);
        return {};
    }
}
|
||||
std::vector<Object> TENSORRTCL::Postprocess(std::vector<float>& featureVector, const std::string& camera_id, const ImageMetadata& meta) {
|
||||
std::vector<Object> outputs;
|
||||
try {
|
||||
// Check if output is already a probability distribution (sums to ~1.0).
|
||||
// Some models include a Softmax layer; applying softmax again would
|
||||
// flatten the distribution and cause wrong classifications.
|
||||
float rawSum = 0.f;
|
||||
bool allNonNeg = true;
|
||||
for (const auto& v : featureVector) {
|
||||
rawSum += v;
|
||||
if (v < 0.f) allNonNeg = false;
|
||||
}
|
||||
const bool alreadyNormalized = (allNonNeg && rawSum > 0.9f && rawSum < 1.1f);
|
||||
|
||||
if (!alreadyNormalized) {
|
||||
// Raw logits — apply softmax
|
||||
float maxLogit = *std::max_element(featureVector.begin(), featureVector.end());
|
||||
float sumExp = 0.f;
|
||||
for (auto& v : featureVector) {
|
||||
v = std::exp(v - maxLogit);
|
||||
sumExp += v;
|
||||
}
|
||||
for (auto& v : featureVector)
|
||||
v /= sumExp;
|
||||
}
|
||||
|
||||
auto max_idx = std::max_element(featureVector.begin(), featureVector.end());
|
||||
int class_id = static_cast<int>(std::distance(featureVector.begin(), max_idx));
|
||||
float score = *max_idx;
|
||||
int classNameSize = _classes.size();
|
||||
Object clsResult;
|
||||
clsResult.classId = class_id;
|
||||
if (!_classes.empty()) {
|
||||
if (clsResult.classId < classNameSize) {
|
||||
clsResult.className = _classes[clsResult.classId];
|
||||
}
|
||||
else {
|
||||
clsResult.className = _classes[classNameSize - 1]; // Use last valid class name if out of range
|
||||
}
|
||||
}
|
||||
else {
|
||||
clsResult.className = "Unknown"; // Fallback if _classes is empty
|
||||
}
|
||||
|
||||
|
||||
clsResult.confidence = score;
|
||||
if (meta.imgWidth > 20 && meta.imgHeight > 20) {
|
||||
clsResult.box = cv::Rect(10, 10, meta.imgWidth - 20, meta.imgHeight - 20);
|
||||
}
|
||||
else {
|
||||
clsResult.box = cv::Rect(0, 0, meta.imgWidth, meta.imgHeight);
|
||||
}
|
||||
clsResult.polygon = ANSUtilityHelper::RectToNormalizedPolygon(clsResult.box, meta.imgWidth, meta.imgHeight);
|
||||
clsResult.cameraId = camera_id;
|
||||
outputs.push_back(clsResult);
|
||||
return outputs;
|
||||
//EnqueueDetection(objects, camera_id);
|
||||
}
|
||||
catch (std::exception& e) {
|
||||
this->_logger.LogFatal("TENSORRTCL::Postproces", e.what(), __FILE__, __LINE__);
|
||||
return outputs;
|
||||
}
|
||||
|
||||
}
|
||||
// Batched classification pipeline. If the batch exceeds the engine's
// maxBatchSize, it recursively processes maxBatch-sized chunks sequentially
// (padding with empty results if a chunk fails). Otherwise: preprocess under a
// brief lock, run inference lock-free, then postprocess each image in parallel
// via std::async. Returns one result vector per input image.
std::vector<std::vector<Object>> TENSORRTCL::DetectObjectsBatch(const std::vector<cv::Mat>& inputImages, const std::string& camera_id)
{
    // Validate under brief lock
    {
        std::lock_guard<std::recursive_mutex> lock(_mutex);
        if (inputImages.empty()) {
            _logger.LogFatal("TENSORRTCL::DetectObjectsBatch",
                "Empty input images vector", __FILE__, __LINE__);
            return {};
        }
    }

    // Auto-split if batch exceeds engine capacity
    const int maxBatch = m_options.maxBatchSize > 0 ? m_options.maxBatchSize : 1;
    if (static_cast<int>(inputImages.size()) > maxBatch) {
        const size_t numImages = inputImages.size();
        std::vector<std::vector<Object>> allResults;
        allResults.reserve(numImages);
        // Process chunks sequentially to avoid GPU contention on the same engine
        for (size_t start = 0; start < numImages; start += static_cast<size_t>(maxBatch)) {
            const size_t end = std::min(start + static_cast<size_t>(maxBatch), numImages);
            std::vector<cv::Mat> chunk(inputImages.begin() + start, inputImages.begin() + end);
            // Recursive call: each chunk is now <= maxBatch, so it takes the
            // non-splitting path below.
            auto chunkResults = DetectObjectsBatch(chunk, camera_id);
            if (chunkResults.size() == chunk.size()) {
                for (auto& r : chunkResults) allResults.push_back(std::move(r));
            }
            else {
                // Chunk failed or returned wrong size — pad with empty results
                // so output stays aligned 1:1 with inputImages.
                _logger.LogError("TENSORRTCL::DetectObjectsBatch",
                    "Chunk returned " + std::to_string(chunkResults.size()) +
                    " results, expected " + std::to_string(chunk.size()) +
                    ". Padding with empty results.", __FILE__, __LINE__);
                for (auto& r : chunkResults) allResults.push_back(std::move(r));
                for (size_t pad = chunkResults.size(); pad < chunk.size(); ++pad) {
                    allResults.push_back({});
                }
            }
        }
        return allResults;
    }

    _logger.LogDebug("TENSORRTCL::DetectObjectsBatch",
        "Processing batch of " + std::to_string(inputImages.size()) + " images",
        __FILE__, __LINE__);

    // Phase 1: Preprocess under brief lock
    BatchMetadata metadata;
    std::vector<std::vector<cv::cuda::GpuMat>> inputs;
    {
        std::lock_guard<std::recursive_mutex> lock(_mutex);
        inputs = PreprocessBatch(inputImages, metadata);
    }
    if (inputs.empty() || inputs[0].empty()) {
        _logger.LogFatal("TENSORRTCL::DetectObjectsBatch",
            "Preprocessing failed", __FILE__, __LINE__);
        return {};
    }

    // Phase 2: Inference — mutex released; pool dispatches to idle GPU slot
    std::vector<std::vector<std::vector<float>>> featureVectors;
    bool succ = m_trtEngine->runInference(inputs, featureVectors);
    if (!succ) {
        _logger.LogFatal("TENSORRTCL::DetectObjectsBatch",
            "Error running batch inference", __FILE__, __LINE__);
        return {};
    }

    // Phase 3: Parallel postprocessing — one async task per image. Each task
    // owns a moved copy of its feature vector; metadata is captured by
    // reference, which is safe because all futures are joined before return.
    const size_t numBatch = featureVectors.size();
    std::vector<std::vector<Object>> batchDetections(numBatch);
    std::vector<std::future<std::vector<Object>>> postFutures;
    postFutures.reserve(numBatch);

    for (size_t batchIdx = 0; batchIdx < numBatch; ++batchIdx) {
        const auto& batchOutput = featureVectors[batchIdx];
        std::vector<float> fv = batchOutput.empty() ? std::vector<float>{} : batchOutput[0];
        postFutures.push_back(std::async(std::launch::async,
            [this, fv = std::move(fv), cid = camera_id, idx = batchIdx, &metadata]() mutable {
                return PostprocessBatch(fv, cid, idx, metadata);
            }));
    }
    // Join all futures in order so results align with the input images.
    for (size_t i = 0; i < numBatch; ++i)
        batchDetections[i] = postFutures[i].get();

    _logger.LogDebug("TENSORRTCL::DetectObjectsBatch",
        "Batch processing complete. Images: " + std::to_string(numBatch),
        __FILE__, __LINE__);
    return batchDetections;
}
|
||||
std::vector<std::vector<cv::cuda::GpuMat>> TENSORRTCL::PreprocessBatch(const std::vector<cv::Mat>& inputImages, BatchMetadata& outMetadata)
{
    // Prepares a batch of CPU images for TensorRT inference:
    // upload to GPU, BGR->RGB, aspect-ratio-preserving resize with
    // right/bottom padding to the model's input size.
    //
    // Parameters:
    //   inputImages - one cv::Mat per image; 1-channel images are promoted to BGR.
    //   outMetadata - receives per-image original dimensions and resize ratios,
    //                 consumed later by PostprocessBatch.
    // Returns: a single-element outer vector holding all preprocessed GPU images
    //          (the engine expects one batched input), or {} on any failure.
    try {
        // Validate license before doing any GPU work.
        if (!_licenseValid) {
            _logger.LogFatal("TENSORRTCL::PreprocessBatch",
                "Invalid license", __FILE__, __LINE__);
            return {};
        }

        // Validate input vector.
        if (inputImages.empty()) {
            _logger.LogFatal("TENSORRTCL::PreprocessBatch",
                "Input images vector is empty", __FILE__, __LINE__);
            return {};
        }

        // Guard against use before the TensorRT engine has been built
        // (OptimizeModel creates m_trtEngine); previously this dereferenced
        // a null pointer and crashed.
        if (!m_trtEngine) {
            _logger.LogFatal("TENSORRTCL::PreprocessBatch",
                "TensorRT engine is not initialized", __FILE__, __LINE__);
            return {};
        }

        const size_t batchSize = inputImages.size();

        // Model input dimensions. As in the single-image path, d[1] is the
        // input height and d[2] the input width; cache them once instead of
        // re-reading inputDims inside the loop.
        const auto& inputDims = m_trtEngine->getInputDims();
        const int inputH = static_cast<int>(inputDims[0].d[1]);
        const int inputW = static_cast<int>(inputDims[0].d[2]);

        _logger.LogDebug("TENSORRTCL::PreprocessBatch",
            "Preprocessing " + std::to_string(batchSize) + " images to " +
            std::to_string(inputW) + "x" + std::to_string(inputH),
            __FILE__, __LINE__);

        // Single CUDA stream so uploads/conversions overlap and are
        // synchronized once at the end.
        cv::cuda::Stream stream;

        // All images go into ONE batch vector (the engine takes one batched input).
        std::vector<cv::cuda::GpuMat> batchedImages;
        batchedImages.reserve(batchSize);

        // Reset and pre-size the metadata used by postprocessing.
        outMetadata.imgHeights.clear();
        outMetadata.imgWidths.clear();
        outMetadata.ratios.clear();
        outMetadata.imgHeights.reserve(batchSize);
        outMetadata.imgWidths.reserve(batchSize);
        outMetadata.ratios.reserve(batchSize);

        for (size_t i = 0; i < batchSize; ++i) {
            const cv::Mat& inputImage = inputImages[i];

            // Reject unusable images early; any bad image aborts the batch.
            if (inputImage.empty()) {
                _logger.LogFatal("TENSORRTCL::PreprocessBatch",
                    "Input image at index " + std::to_string(i) + " is empty",
                    __FILE__, __LINE__);
                return {};
            }

            if (inputImage.cols < 5 || inputImage.rows < 5) {
                _logger.LogFatal("TENSORRTCL::PreprocessBatch",
                    "Image at index " + std::to_string(i) +
                    " is too small (Width: " + std::to_string(inputImage.cols) +
                    ", Height: " + std::to_string(inputImage.rows) + ")",
                    __FILE__, __LINE__);
                return {};
            }

            // Upload to GPU, promoting grayscale to 3-channel BGR first so
            // the BGR->RGB conversion below is uniform.
            cv::cuda::GpuMat img;
            if (inputImage.channels() == 1) {
                cv::Mat img3Channel;
                cv::cvtColor(inputImage, img3Channel, cv::COLOR_GRAY2BGR);
                img.upload(img3Channel, stream);
            }
            else {
                img.upload(inputImage, stream);
            }

            // The network expects RGB channel order.
            cv::cuda::GpuMat imgRGB;
            cv::cuda::cvtColor(img, imgRGB, cv::COLOR_BGR2RGB, 0, stream);

            // Record original dimensions for coordinate mapping in postprocess.
            const int imgHeight = imgRGB.rows;
            const int imgWidth = imgRGB.cols;

            if (imgHeight <= 0 || imgWidth <= 0) {
                _logger.LogFatal("TENSORRTCL::PreprocessBatch",
                    "Image at index " + std::to_string(i) + " has zero height or width",
                    __FILE__, __LINE__);
                return {};
            }

            outMetadata.imgHeights.push_back(imgHeight);
            outMetadata.imgWidths.push_back(imgWidth);

            // Inverse of the scale factor applied by the letterbox resize;
            // postprocessing multiplies by this to map back to source pixels.
            const float ratio = 1.f / std::min(
                inputW / static_cast<float>(imgWidth),
                inputH / static_cast<float>(imgHeight)
            );
            outMetadata.ratios.push_back(ratio);

            // Resize only when needed, keeping aspect ratio and padding
            // right/bottom to reach exactly inputH x inputW.
            cv::cuda::GpuMat resized;
            if (imgRGB.rows != inputH || imgRGB.cols != inputW) {
                resized = Engine<float>::resizeKeepAspectRatioPadRightBottom(
                    imgRGB, inputH, inputW
                );
            }
            else {
                resized = imgRGB;
            }

            batchedImages.push_back(std::move(resized));
        }

        // Ensure all async uploads/conversions have finished before the
        // GpuMats are handed to the engine.
        stream.waitForCompletion();

        // The engine consumes a vector-of-batches; we produce one batch.
        std::vector<std::vector<cv::cuda::GpuMat>> result;
        result.push_back(std::move(batchedImages));

        return result;
    }
    catch (const std::exception& e) {
        _logger.LogFatal("TENSORRTCL::PreprocessBatch",
            e.what(), __FILE__, __LINE__);
        return {};
    }
}
|
||||
std::vector<Object> TENSORRTCL::PostprocessBatch(std::vector<float>& featureVector, const std::string& camera_id, size_t batchIdx, const BatchMetadata& metadata)
|
||||
{
|
||||
std::vector<Object> outputs;
|
||||
|
||||
try {
|
||||
// Validate batch index
|
||||
if (batchIdx >= metadata.imgHeights.size() ||
|
||||
batchIdx >= metadata.imgWidths.size()) {
|
||||
_logger.LogFatal("TENSORRTCL::PostprocessBatch",
|
||||
"Batch index " + std::to_string(batchIdx) +
|
||||
" out of range (stored " + std::to_string(metadata.imgHeights.size()) + " images)",
|
||||
__FILE__, __LINE__);
|
||||
return outputs;
|
||||
}
|
||||
|
||||
// Validate feature vector
|
||||
if (featureVector.empty()) {
|
||||
_logger.LogFatal("TENSORRTCL::PostprocessBatch",
|
||||
"Feature vector is empty for batch index " + std::to_string(batchIdx),
|
||||
__FILE__, __LINE__);
|
||||
return outputs;
|
||||
}
|
||||
|
||||
// Get image dimensions for this batch index
|
||||
int imgHeight = metadata.imgHeights[batchIdx];
|
||||
int imgWidth = metadata.imgWidths[batchIdx];
|
||||
|
||||
// Normalize if raw logits (same logic as single-image Postprocess)
|
||||
float rawSum = 0.f;
|
||||
bool allNonNeg = true;
|
||||
for (const auto& v : featureVector) {
|
||||
rawSum += v;
|
||||
if (v < 0.f) allNonNeg = false;
|
||||
}
|
||||
const bool alreadyNorm = (allNonNeg && rawSum > 0.9f && rawSum < 1.1f);
|
||||
if (!alreadyNorm) {
|
||||
float maxLogit = *std::max_element(featureVector.begin(), featureVector.end());
|
||||
float sumExp = 0.f;
|
||||
for (auto& v : featureVector) {
|
||||
v = std::exp(v - maxLogit);
|
||||
sumExp += v;
|
||||
}
|
||||
for (auto& v : featureVector) v /= sumExp;
|
||||
}
|
||||
|
||||
// Find max element (classification result)
|
||||
auto max_idx = std::max_element(featureVector.begin(), featureVector.end());
|
||||
if (max_idx == featureVector.end()) {
|
||||
_logger.LogFatal("TENSORRTCL::PostprocessBatch",
|
||||
"Failed to find max element in feature vector for batch index " +
|
||||
std::to_string(batchIdx),
|
||||
__FILE__, __LINE__);
|
||||
return outputs;
|
||||
}
|
||||
|
||||
int class_id = static_cast<int>(std::distance(featureVector.begin(), max_idx));
|
||||
float score = *max_idx;
|
||||
|
||||
// Create object result
|
||||
Object clsResult;
|
||||
clsResult.classId = class_id;
|
||||
|
||||
// Get class name
|
||||
int classNameSize = static_cast<int>(_classes.size());
|
||||
if (!_classes.empty()) {
|
||||
if (class_id >= 0 && class_id < classNameSize) {
|
||||
clsResult.className = _classes[class_id];
|
||||
}
|
||||
else {
|
||||
clsResult.className = _classes[classNameSize - 1];
|
||||
|
||||
}
|
||||
}
|
||||
else {
|
||||
clsResult.className = "Unknown";
|
||||
|
||||
}
|
||||
|
||||
clsResult.confidence = score;
|
||||
|
||||
// Create bounding box with margins
|
||||
if (imgWidth > 20 && imgHeight > 20) {
|
||||
clsResult.box = cv::Rect(10, 10, imgWidth - 20, imgHeight - 20);
|
||||
}
|
||||
else {
|
||||
clsResult.box = cv::Rect(0, 0, imgWidth, imgHeight);
|
||||
}
|
||||
|
||||
// Convert to normalized polygon
|
||||
clsResult.polygon = ANSUtilityHelper::RectToNormalizedPolygon(
|
||||
clsResult.box, imgWidth, imgHeight
|
||||
);
|
||||
|
||||
clsResult.cameraId = camera_id;
|
||||
|
||||
outputs.push_back(std::move(clsResult));
|
||||
|
||||
return outputs;
|
||||
}
|
||||
catch (const std::exception& e) {
|
||||
_logger.LogFatal("TENSORRTCL::PostprocessBatch",
|
||||
"Error for batch index " + std::to_string(batchIdx) + ": " + e.what(),
|
||||
__FILE__, __LINE__);
|
||||
return outputs;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
Reference in New Issue
Block a user