#include "ANSTENSORRTCL.h"
#include "Utility.h"
// NOTE(review): the next includes had their targets stripped in the corrupted
// source ("#include #include"); restored to the standard headers this file
// demonstrably uses (std::ifstream, std::async/future, max_element, exp,
// lock_guard, unique_ptr) — confirm against the original file.
#include <algorithm>
#include <cmath>
#include <fstream>
#include <future>
#include <memory>
#include <mutex>
#include <string>
#include <vector>

namespace ANSCENTER {

namespace {
/// Populates the TensorRT engine options from the model configuration.
/// Factored out of OptimizeModel/LoadModel/LoadModelFromFolder/Initialize,
/// which previously duplicated this assignment sequence verbatim.
/// Templated on the options type because its declaration lives in a header
/// outside this translation unit's view.
template <typename OptionsT>
void ConfigureEngineOptions(OptionsT& options, const ModelConfig& cfg,
                            const std::string& engineDir, bool fp16) {
    options.optBatchSize   = cfg.gpuOptBatchSize;
    options.maxBatchSize   = cfg.gpuMaxBatchSize;
    options.deviceIndex    = cfg.gpuDeviceIndex;
    options.maxInputHeight = cfg.maxInputHeight;
    options.minInputHeight = cfg.minInputHeight;
    options.optInputHeight = cfg.optInputHeight;
    options.maxInputWidth  = cfg.maxInputWidth;
    options.minInputWidth  = cfg.minInputWidth;
    options.optInputWidth  = cfg.optInputWidth;
    options.engineFileDir  = engineDir;
    // Use FP16 or FP32 precision based on the input flag.
    options.precision = fp16 ? Precision::FP16 : Precision::FP32;
}
} // namespace

/// Builds (optimizes) the TensorRT engine from the raw ONNX model.
/// @param fp16                 Build the engine with FP16 precision when true.
/// @param optimizedModelFolder In/out: on success, set to the folder that
///                             holds the raw model (and the built engine).
/// @return true on success; false on any validation or build failure.
bool TENSORRTCL::OptimizeModel(bool fp16, std::string& optimizedModelFolder) {
    std::lock_guard<std::mutex> lock(_mutex);
    if (!ANSODBase::OptimizeModel(fp16, optimizedModelFolder)) {
        return false;
    }
    if (!FileExist(_modelFilePath)) {
        this->_logger.LogFatal("TENSORRTCL::OptimizeModel", "Raw model file path does not exist", __FILE__, __LINE__);
        return false;
    }
    try {
        _fp16 = fp16;
        optimizedModelFolder = GetParentFolder(_modelFilePath);
        // Check if the engine already exists to avoid reinitializing.
        if (!m_trtEngine) {
            ConfigureEngineOptions(m_options, _modelConfig, optimizedModelFolder, _fp16);
            // NOTE(review): template argument was stripped in the corrupted
            // source; Engine<float> matches the tensorrt-cpp-api usage
            // elsewhere in this file (transformOutput, buildLoadNetwork).
            m_trtEngine = std::make_unique<Engine<float>>(m_options);
        }
        // Build the TensorRT engine (with retry on transient failures).
        const bool succ = m_trtEngine->buildWithRetry(_modelFilePath, SUB_VALS, DIV_VALS, NORMALIZE);
        if (!succ) {
            const std::string errMsg = "Error: Unable to build the TensorRT engine. "
                                       "Try increasing TensorRT log severity to kVERBOSE.";
            this->_logger.LogError("TENSORRTCL::OptimizeModel", errMsg, __FILE__, __LINE__);
            _modelLoadValid = false;
            return false;
        }
        _modelLoadValid = true;
        return true;
    } catch (const std::exception& e) {
        this->_logger.LogFatal("TENSORRTCL::OptimizeModel", e.what(), __FILE__, __LINE__);
        optimizedModelFolder.clear();
        return false;
    }
}

/// Loads a classification model from a password-protected model zip file.
/// Applies classification defaults (224x224 input, threshold clamps), sets up
/// the TensorRT engine options, resolves class names, and optionally loads
/// the engine weights immediately.
bool TENSORRTCL::LoadModel(const std::string& modelZipFilePath, const std::string& modelZipPassword) {
    std::lock_guard<std::mutex> lock(_mutex);
    ModelLoadingGuard mlg(_modelLoading);
    try {
        if (!ANSODBase::LoadModel(modelZipFilePath, modelZipPassword)) return false;

        // Classification defaults; clamp unreasonable thresholds from config.
        _modelConfig.detectionType = ANSCENTER::DetectionType::CLASSIFICATION;
        _modelConfig.modelType = ModelType::TENSORRT;
        _modelConfig.inpHeight = 224;
        _modelConfig.inpWidth = 224;
        if (_modelConfig.modelMNSThreshold < 0.2) _modelConfig.modelMNSThreshold = 0.5;
        if (_modelConfig.modelConfThreshold < 0.2) _modelConfig.modelConfThreshold = 0.5;
        if (_modelConfig.numKPS <= 0 || _modelConfig.numKPS > 133) // 133 = COCO wholebody max
            _modelConfig.numKPS = 17;
        // NOTE(review): exact float == 0 comparison kept from original; only
        // an unset (default-zero) threshold is replaced.
        if (_modelConfig.kpsThreshold == 0) _modelConfig.kpsThreshold = 0.5; // If not defined
        // if (_modelConfig.precisionType == PrecisionType::FP16) _fp16 = true;
        _fp16 = true;

        // Post-processing constants (TOP_K etc. are unused by classification
        // post-processing below but kept for parity with sibling detectors).
        TOP_K = 100;
        SEG_CHANNELS = 32; // For segmentation
        PROBABILITY_THRESHOLD = 0.3;
        NMS_THRESHOLD = 0.65f;
        SEGMENTATION_THRESHOLD = 0.5f;
        SEG_H = 160;
        SEG_W = 160;
        NUM_KPS = _modelConfig.numKPS;
        KPS_THRESHOLD = _modelConfig.kpsThreshold;

        if (!m_trtEngine) {
            ConfigureEngineOptions(m_options, _modelConfig, _modelFolder, _fp16);
            m_trtEngine = std::make_unique<Engine<float>>(m_options);
        }

        // 0. Check if the configuration file exists.
        if (FileExist(_modelConfigFile)) {
            ModelType modelType;
            std::vector<int> inputShape;
            _classes = ANSUtilityHelper::GetConfigFileContent(_modelConfigFile, modelType, inputShape);
            if (inputShape.size() == 2) {
                if (inputShape[0] > 0) _modelConfig.inpHeight = inputShape[0];
                if (inputShape[1] > 0) _modelConfig.inpWidth = inputShape[1];
            }
        } else { // This is an old version of the model zip file
            _modelFilePath = CreateFilePath(_modelFolder, "train_last.onnx");
            _classFilePath = CreateFilePath(_modelFolder, "classes.names");
            std::ifstream isValidFileName(_classFilePath);
            if (!isValidFileName) {
                this->_logger.LogDebug("TENSORRTCL::LoadModel. Load classes from string", _classFilePath, __FILE__, __LINE__);
                LoadClassesFromString();
            } else {
                this->_logger.LogDebug("TENSORRTCL::LoadModel. Load classes from file", _classFilePath, __FILE__, __LINE__);
                LoadClassesFromFile();
            }
        }

        // Load the TensorRT engine file.
        if (this->_loadEngineOnCreation) {
            const bool succ = m_trtEngine->buildLoadNetwork(_modelFilePath, SUB_VALS, DIV_VALS, NORMALIZE, m_maxSlotsPerGpu);
            if (!succ) {
                const std::string errMsg = "Error: Unable to load TensorRT engine weights into memory. " + _modelFilePath;
                this->_logger.LogError("TENSORRTCL::LoadModel", errMsg, __FILE__, __LINE__);
                _modelLoadValid = false;
                return false;
            }
        }
        _modelLoadValid = true;
        _isInitialized = true;
        return true;
    } catch (const std::exception& e) {
        this->_logger.LogFatal("TENSORRTCL::LoadModel", e.what(), __FILE__, __LINE__);
        return false;
    }
}

/// Loads a classification model directly from an (already extracted) folder.
/// @param modelName  Base name of the ONNX file ("train_last" when empty).
/// @param className  Name of the class-list file inside the folder.
/// @param labelMap   Out: comma-separated class names on success.
bool TENSORRTCL::LoadModelFromFolder(std::string licenseKey, ModelConfig modelConfig, std::string modelName,
                                     std::string className, const std::string& modelFolder, std::string& labelMap) {
    std::lock_guard<std::mutex> lock(_mutex);
    ModelLoadingGuard mlg(_modelLoading);
    try {
        if (!ANSODBase::LoadModelFromFolder(licenseKey, modelConfig, modelName, className, modelFolder, labelMap))
            return false;

        std::string effectiveName = modelName;
        if (effectiveName.empty()) effectiveName = "train_last";
        const std::string modelFullName = effectiveName + ".onnx";

        // Classification defaults; clamp unreasonable thresholds from config.
        _modelConfig = modelConfig;
        _modelConfig.detectionType = ANSCENTER::DetectionType::CLASSIFICATION;
        _modelConfig.modelType = ModelType::TENSORRT;
        _modelConfig.inpHeight = 224;
        _modelConfig.inpWidth = 224;
        if (_modelConfig.modelMNSThreshold < 0.2) _modelConfig.modelMNSThreshold = 0.5;
        if (_modelConfig.modelConfThreshold < 0.2) _modelConfig.modelConfThreshold = 0.5;
        if (_modelConfig.numKPS <= 0 || _modelConfig.numKPS > 133) // 133 = COCO wholebody max
            _modelConfig.numKPS = 17;
        if (_modelConfig.kpsThreshold == 0) _modelConfig.kpsThreshold = 0.5; // If not defined
        _fp16 = true;

        TOP_K = 100;
        SEG_CHANNELS = 32; // For segmentation
        PROBABILITY_THRESHOLD = 0.3;
        NMS_THRESHOLD = 0.65f;
        SEGMENTATION_THRESHOLD = 0.5f;
        SEG_H = 160;
        SEG_W = 160;
        NUM_KPS = _modelConfig.numKPS;
        KPS_THRESHOLD = _modelConfig.kpsThreshold;

        if (!m_trtEngine) {
            ConfigureEngineOptions(m_options, _modelConfig, _modelFolder, _fp16);
            m_trtEngine = std::make_unique<Engine<float>>(m_options);
        }

        // 0. Check if the configuration file exists.
        if (FileExist(_modelConfigFile)) {
            ModelType modelType;
            std::vector<int> inputShape;
            _classes = ANSUtilityHelper::GetConfigFileContent(_modelConfigFile, modelType, inputShape);
            if (inputShape.size() == 2) {
                if (inputShape[0] > 0) _modelConfig.inpHeight = inputShape[0];
                if (inputShape[1] > 0) _modelConfig.inpWidth = inputShape[1];
            }
        } else { // This is an old version of the model zip file
            _modelFilePath = CreateFilePath(_modelFolder, modelFullName);
            _classFilePath = CreateFilePath(_modelFolder, className);
            std::ifstream isValidFileName(_classFilePath);
            if (!isValidFileName) {
                this->_logger.LogDebug("TENSORRTCL::LoadModelFromFolder. Load classes from string", _classFilePath, __FILE__, __LINE__);
                LoadClassesFromString();
            } else {
                this->_logger.LogDebug("TENSORRTCL::LoadModelFromFolder. Load classes from file", _classFilePath, __FILE__, __LINE__);
                LoadClassesFromFile();
            }
        }

        // 1. Load labelMap and engine.
        labelMap.clear();
        if (!_classes.empty()) labelMap = VectorToCommaSeparatedString(_classes);

        // Load the TensorRT engine file.
        if (this->_loadEngineOnCreation) {
            const bool succ = m_trtEngine->buildLoadNetwork(_modelFilePath, SUB_VALS, DIV_VALS, NORMALIZE, m_maxSlotsPerGpu);
            if (!succ) {
                const std::string errMsg = "Error: Unable to load TensorRT engine weights into memory. " + _modelFilePath;
                this->_logger.LogError("TENSORRTCL::LoadModelFromFolder", errMsg, __FILE__, __LINE__);
                _modelLoadValid = false;
                return false;
            }
        }
        _modelLoadValid = true;
        _isInitialized = true;
        return true;
    } catch (const std::exception& e) {
        this->_logger.LogFatal("TENSORRTCL::LoadModelFromFolder", e.what(), __FILE__, __LINE__);
        return false;
    }
}

/// Initializes the model from a zip file: validates license/config via the
/// base class, applies classification defaults, and (re)loads the TensorRT
/// engine unless one is already loaded from a previous initialization.
bool TENSORRTCL::Initialize(std::string licenseKey, ModelConfig modelConfig, const std::string& modelZipFilePath,
                            const std::string& modelZipPassword, std::string& labelMap) {
    // Snapshot taken before acquiring the lock, as in the original code, to
    // decide whether the engine weights can be reused on re-initialization.
    // NOTE(review): this read is unsynchronized — benign if Initialize is not
    // raced with itself; confirm the caller's threading contract.
    const bool engineAlreadyLoaded = _modelLoadValid && _isInitialized && m_trtEngine != nullptr;
    _modelLoadValid = false;
    std::lock_guard<std::mutex> lock(_mutex);
    ModelLoadingGuard mlg(_modelLoading);
    try {
        if (!ANSODBase::Initialize(licenseKey, modelConfig, modelZipFilePath, modelZipPassword, labelMap))
            return false;

        // Classification defaults; clamp unreasonable thresholds from config.
        _modelConfig = modelConfig;
        _modelConfig.detectionType = ANSCENTER::DetectionType::CLASSIFICATION;
        _modelConfig.modelType = ModelType::TENSORRT;
        _modelConfig.inpHeight = 224;
        _modelConfig.inpWidth = 224;
        if (_modelConfig.modelMNSThreshold < 0.2) _modelConfig.modelMNSThreshold = 0.5;
        if (_modelConfig.modelConfThreshold < 0.2) _modelConfig.modelConfThreshold = 0.5;
        if (_modelConfig.numKPS <= 0 || _modelConfig.numKPS > 133) // 133 = COCO wholebody max
            _modelConfig.numKPS = 17;
        if (_modelConfig.kpsThreshold == 0) _modelConfig.kpsThreshold = 0.5; // If not defined
        // if (_modelConfig.precisionType == PrecisionType::FP16) _fp16 = true;
        _fp16 = true;

        TOP_K = 100;
        SEG_CHANNELS = 32; // For segmentation
        PROBABILITY_THRESHOLD = 0.3;
        NMS_THRESHOLD = 0.65f;
        SEGMENTATION_THRESHOLD = 0.5f;
        SEG_H = 160;
        SEG_W = 160;
        NUM_KPS = _modelConfig.numKPS;
        KPS_THRESHOLD = _modelConfig.kpsThreshold;

        if (!m_trtEngine) {
            ConfigureEngineOptions(m_options, _modelConfig, _modelFolder, _fp16);
            m_trtEngine = std::make_unique<Engine<float>>(m_options);
        }

        // 0. Check if the configuration file exists.
        if (FileExist(_modelConfigFile)) {
            ModelType modelType;
            std::vector<int> inputShape;
            _classes = ANSUtilityHelper::GetConfigFileContent(_modelConfigFile, modelType, inputShape);
            if (inputShape.size() == 2) {
                if (inputShape[0] > 0) _modelConfig.inpHeight = inputShape[0];
                if (inputShape[1] > 0) _modelConfig.inpWidth = inputShape[1];
            }
        } else { // This is an old version of the model zip file
            _modelFilePath = CreateFilePath(_modelFolder, "train_last.onnx");
            _classFilePath = CreateFilePath(_modelFolder, "classes.names");
            std::ifstream isValidFileName(_classFilePath);
            if (!isValidFileName) {
                this->_logger.LogDebug("TENSORRTCL::Initialize. Load classes from string", _classFilePath, __FILE__, __LINE__);
                LoadClassesFromString();
            } else {
                this->_logger.LogDebug("TENSORRTCL::Initialize. Load classes from file", _classFilePath, __FILE__, __LINE__);
                LoadClassesFromFile();
            }
        }

        // 1. Load labelMap and engine.
        labelMap.clear();
        if (!_classes.empty()) labelMap = VectorToCommaSeparatedString(_classes);

        // Load the TensorRT engine file — skipped when a previous successful
        // initialization already loaded the weights.
        if (this->_loadEngineOnCreation && !engineAlreadyLoaded) {
            const bool succ = m_trtEngine->buildLoadNetwork(_modelFilePath, SUB_VALS, DIV_VALS, NORMALIZE, m_maxSlotsPerGpu);
            if (!succ) {
                const std::string errMsg = "Error: Unable to load TensorRT engine weights into memory. " + _modelFilePath;
                this->_logger.LogError("TENSORRTCL::Initialize", errMsg, __FILE__, __LINE__);
                _modelLoadValid = false;
                return false;
            }
        }
        _modelLoadValid = true;
        _isInitialized = true;
        return true;
    } catch (const std::exception& e) {
        this->_logger.LogFatal("TENSORRTCL::Initialize", e.what(), __FILE__, __LINE__);
        return false;
    }
}

/// Convenience overload: runs inference with the default camera id.
std::vector<Object> TENSORRTCL::RunInference(const cv::Mat& inputImgBGR) {
    return RunInference(inputImgBGR, "CustomCam");
}

/// Runs single-image classification inference.
/// @return Detected objects (one classification result), or empty on failure.
std::vector<Object> TENSORRTCL::RunInference(const cv::Mat& inputImgBGR, const std::string& camera_id) {
    // Validate state under brief lock.
    if (!PreInferenceCheck("TENSORRTCL::RunInference")) return {};
    try {
        return DetectObjects(inputImgBGR, camera_id);
    } catch (const std::exception& e) {
        _logger.LogFatal("TENSORRTCL::RunInference", e.what(), __FILE__, __LINE__);
        return {};
    }
}

/// Runs batched classification inference; one result vector per input image.
std::vector<std::vector<Object>> TENSORRTCL::RunInferencesBatch(const std::vector<cv::Mat>& inputs,
                                                                const std::string& camera_id) {
    // Validate state under brief lock.
    if (!PreInferenceCheck("TENSORRTCL::RunInferencesBatch")) return {};
    try {
        return DetectObjectsBatch(inputs, camera_id);
    } catch (const std::exception& e) {
        this->_logger.LogFatal("TENSORRTCL::RunInferencesBatch", e.what(), __FILE__, __LINE__);
        return {};
    }
}

TENSORRTCL::~TENSORRTCL() {
    try {
        Destroy();
    } catch (const std::exception& e) {
        this->_logger.LogError("TENSORRTCL::~TENSORRTCL()", e.what(), __FILE__, __LINE__);
    }
}

/// Releases the TensorRT engine (and its GPU resources). Safe to call twice.
bool TENSORRTCL::Destroy() {
    try {
        m_trtEngine.reset(); // Releases the current engine; m_trtEngine becomes nullptr.
        return true;
    } catch (const std::exception& e) {
        this->_logger.LogError("TENSORRTCL::Destroy", e.what(), __FILE__, __LINE__);
        return false;
    }
}

// private
/// Single-image pipeline: preprocess (NV12 fast path or CPU fallback) under
/// lock, run inference lock-free, postprocess under lock.
std::vector<Object> TENSORRTCL::DetectObjects(const cv::Mat& inputImage, const std::string& camera_id) {
    try {
        // --- 1. Set GPU device context ---
        // Fixed: the original dereferenced m_trtEngine unconditionally a few
        // lines below despite guarding it here; bail out early instead.
        if (!m_trtEngine) {
            this->_logger.LogFatal("TENSORRTCL::DetectObjects", "TensorRT engine is not initialized", __FILE__, __LINE__);
            return {};
        }
        m_trtEngine->setDeviceContext();

        // --- 1b. CUDA context health check ---
        if (!m_nv12Helper.isCudaContextHealthy(_logger, "TENSORRTCL")) {
            return {};
        }

        // --- 2. Preprocess under lock ---
        // Try the NV12 fast path first (classification: direct resize, no letterbox).
        ImageMetadata meta;
        std::vector<std::vector<cv::cuda::GpuMat>> input;
        {
            std::lock_guard<std::mutex> lock(_mutex);
            const int inferenceGpu = m_trtEngine->getPreferredDeviceIndex();
            const auto& inputDims = m_trtEngine->getInputDims();
            const int inputW = inputDims[0].d[2];
            const int inputH = inputDims[0].d[1];
            auto nv12 = m_nv12Helper.tryNV12(inputImage, inferenceGpu, inputW, inputH,
                                             NV12PreprocessHelper::classificationLauncher(), _logger, "TENSORRTCL");
            if (nv12.succeeded) {
                meta.imgWidth = nv12.metaWidth;
                meta.imgHeight = nv12.metaHeight;
                meta.ratio = 1.f; // classification: no letterbox
                input = {{ std::move(nv12.gpuRGB) }};
            } else if (nv12.useBgrFullRes) {
                input = Preprocess(nv12.bgrFullResImg, meta);
            }
            if (input.empty()) {
                input = Preprocess(inputImage, meta);
            }
            m_nv12Helper.tickInference();
        }
        if (input.empty()) return {};

        // Phase 2: Inference — mutex released; pool dispatches to idle GPU slot.
        std::vector<std::vector<std::vector<float>>> featureVectors;
        const bool succ = m_trtEngine->runInference(input, featureVectors);
        if (!succ) {
            this->_logger.LogFatal("TENSORRTCL::DetectObjects", "Error running inference", __FILE__, __LINE__);
            return {};
        }

        // Phase 3: Postprocess under brief lock.
        std::lock_guard<std::mutex> lock(_mutex);
        std::vector<float> featureVector;
        Engine<float>::transformOutput(featureVectors, featureVector);
        return Postprocess(featureVector, camera_id, meta);
    } catch (const std::exception& e) {
        this->_logger.LogFatal("TENSORRTCL::DetectObjects", e.what(), __FILE__, __LINE__);
        return {};
    }
}

/// CPU preprocessing for one image: validate, grayscale→BGR if needed, resize
/// to the network input size (no letterbox for classification), BGR→RGB, and
/// upload to the GPU. @param outMeta receives original dimensions (ratio=1).
std::vector<std::vector<cv::cuda::GpuMat>> TENSORRTCL::Preprocess(const cv::Mat& inputImage, ImageMetadata& outMeta) {
    try {
        if (!_licenseValid) {
            this->_logger.LogFatal("TENSORRTCL::Preprocess", "Invalid license", __FILE__, __LINE__);
            return {};
        }
        if (inputImage.empty()) {
            this->_logger.LogFatal("TENSORRTCL::Preprocess", "Input image is empty", __FILE__, __LINE__);
            return {};
        }
        if ((inputImage.cols < 5) || (inputImage.rows < 5)) {
            this->_logger.LogFatal("TENSORRTCL::Preprocess",
                                   "Input image is too small (Width: " + std::to_string(inputImage.cols) +
                                   ", Height: " + std::to_string(inputImage.rows) + ")",
                                   __FILE__, __LINE__);
            return {};
        }

        // Populate the input vectors from the engine's expected dimensions.
        const auto& inputDims = m_trtEngine->getInputDims();
        const int inputH = inputDims[0].d[1];
        const int inputW = inputDims[0].d[2];

        // --- CPU preprocessing: resize + BGR->RGB before GPU upload ---
        cv::Mat srcImg = inputImage;
        if (srcImg.channels() == 1) {
            cv::cvtColor(srcImg, srcImg, cv::COLOR_GRAY2BGR);
        }

        // These parameters will be used in the post-processing stage.
        outMeta.imgHeight = srcImg.rows;
        outMeta.imgWidth = srcImg.cols;
        if (outMeta.imgHeight <= 0 || outMeta.imgWidth <= 0) {
            _logger.LogFatal("TENSORRTCL::Preprocess", "Image height or width is zero", __FILE__, __LINE__);
            return {};
        }
        outMeta.ratio = 1.f; // Classification: direct resize, no letterbox padding

        cv::Mat cpuResized;
        if (srcImg.rows != inputH || srcImg.cols != inputW) {
            cv::resize(srcImg, cpuResized, cv::Size(inputW, inputH), 0, 0, cv::INTER_LINEAR);
        } else {
            cpuResized = srcImg;
        }

        // CPU BGR -> RGB.
        cv::Mat cpuRGB;
        cv::cvtColor(cpuResized, cpuRGB, cv::COLOR_BGR2RGB);

        // Upload the (small) network-sized image to the GPU.
        cv::cuda::Stream stream;
        cv::cuda::GpuMat gpuResized;
        gpuResized.upload(cpuRGB, stream);
        stream.waitForCompletion();

        // Convert to the format expected by our inference engine.
        std::vector<cv::cuda::GpuMat> input{ std::move(gpuResized) };
        std::vector<std::vector<cv::cuda::GpuMat>> inputs{ std::move(input) };
        return inputs;
    } catch (const std::exception& e) {
        this->_logger.LogFatal("TENSORRTCL::Preprocess", e.what(), __FILE__, __LINE__);
        return {};
    }
}

/// Converts one raw output vector into a single classification Object:
/// softmax (only if the output is not already a probability distribution),
/// argmax, class-name lookup, and a near-full-frame bounding box.
std::vector<Object> TENSORRTCL::Postprocess(std::vector<float>& featureVector, const std::string& camera_id,
                                            const ImageMetadata& meta) {
    std::vector<Object> outputs;
    try {
        // Fixed: guard against an empty feature vector — max_element on an
        // empty range would dereference end().
        if (featureVector.empty()) {
            this->_logger.LogFatal("TENSORRTCL::Postprocess", "Feature vector is empty", __FILE__, __LINE__);
            return outputs;
        }

        // Check if output is already a probability distribution (sums to ~1.0).
        // Some models include a Softmax layer; applying softmax again would
        // flatten the distribution and cause wrong classifications.
        float rawSum = 0.f;
        bool allNonNeg = true;
        for (const float v : featureVector) {
            rawSum += v;
            if (v < 0.f) allNonNeg = false;
        }
        const bool alreadyNormalized = (allNonNeg && rawSum > 0.9f && rawSum < 1.1f);
        if (!alreadyNormalized) {
            // Raw logits — apply softmax (max-subtracted for numerical stability).
            const float maxLogit = *std::max_element(featureVector.begin(), featureVector.end());
            float sumExp = 0.f;
            for (auto& v : featureVector) {
                v = std::exp(v - maxLogit);
                sumExp += v;
            }
            for (auto& v : featureVector) v /= sumExp;
        }

        const auto max_idx = std::max_element(featureVector.begin(), featureVector.end());
        const int class_id = static_cast<int>(std::distance(featureVector.begin(), max_idx));
        const float score = *max_idx;

        Object clsResult;
        clsResult.classId = class_id;
        const int classNameSize = static_cast<int>(_classes.size());
        if (!_classes.empty()) {
            if (clsResult.classId < classNameSize) {
                clsResult.className = _classes[clsResult.classId];
            } else {
                clsResult.className = _classes[classNameSize - 1]; // Use last valid class name if out of range
            }
        } else {
            clsResult.className = "Unknown"; // Fallback if _classes is empty
        }
        clsResult.confidence = score;

        // Classification has no localization: report a near-full-frame box
        // (10 px margin when the image is large enough).
        if (meta.imgWidth > 20 && meta.imgHeight > 20) {
            clsResult.box = cv::Rect(10, 10, meta.imgWidth - 20, meta.imgHeight - 20);
        } else {
            clsResult.box = cv::Rect(0, 0, meta.imgWidth, meta.imgHeight);
        }
        clsResult.polygon = ANSUtilityHelper::RectToNormalizedPolygon(clsResult.box, meta.imgWidth, meta.imgHeight);
        clsResult.cameraId = camera_id;
        outputs.push_back(clsResult);
        return outputs;
        // EnqueueDetection(objects, camera_id);
    } catch (const std::exception& e) {
        this->_logger.LogFatal("TENSORRTCL::Postprocess", e.what(), __FILE__, __LINE__);
        return outputs;
    }
}

/// Batched pipeline. Oversized batches are split into engine-capacity chunks
/// processed sequentially (recursively); failed chunks are padded with empty
/// results so the output size always matches the input size.
std::vector<std::vector<Object>> TENSORRTCL::DetectObjectsBatch(const std::vector<cv::Mat>& inputImages,
                                                                const std::string& camera_id) {
    // Validate under brief lock.
    {
        std::lock_guard<std::mutex> lock(_mutex);
        if (inputImages.empty()) {
            _logger.LogFatal("TENSORRTCL::DetectObjectsBatch", "Empty input images vector", __FILE__, __LINE__);
            return {};
        }
    }

    // Auto-split if batch exceeds engine capacity.
    const int maxBatch = m_options.maxBatchSize > 0 ? m_options.maxBatchSize : 1;
    if (static_cast<int>(inputImages.size()) > maxBatch) {
        const size_t numImages = inputImages.size();
        std::vector<std::vector<Object>> allResults;
        allResults.reserve(numImages);
        // Process chunks sequentially to avoid GPU contention on the same engine.
        for (size_t start = 0; start < numImages; start += static_cast<size_t>(maxBatch)) {
            const size_t end = std::min(start + static_cast<size_t>(maxBatch), numImages);
            std::vector<cv::Mat> chunk(inputImages.begin() + start, inputImages.begin() + end);
            auto chunkResults = DetectObjectsBatch(chunk, camera_id);
            if (chunkResults.size() != chunk.size()) {
                // Chunk failed or returned wrong size — pad with empty results.
                _logger.LogError("TENSORRTCL::DetectObjectsBatch",
                                 "Chunk returned " + std::to_string(chunkResults.size()) +
                                 " results, expected " + std::to_string(chunk.size()) +
                                 ". Padding with empty results.",
                                 __FILE__, __LINE__);
            }
            for (auto& r : chunkResults) allResults.push_back(std::move(r));
            for (size_t pad = chunkResults.size(); pad < chunk.size(); ++pad) {
                allResults.push_back({});
            }
        }
        return allResults;
    }

    _logger.LogDebug("TENSORRTCL::DetectObjectsBatch",
                     "Processing batch of " + std::to_string(inputImages.size()) + " images",
                     __FILE__, __LINE__);

    // Phase 1: Preprocess under brief lock.
    BatchMetadata metadata;
    std::vector<std::vector<cv::cuda::GpuMat>> inputs;
    {
        std::lock_guard<std::mutex> lock(_mutex);
        inputs = PreprocessBatch(inputImages, metadata);
    }
    if (inputs.empty() || inputs[0].empty()) {
        _logger.LogFatal("TENSORRTCL::DetectObjectsBatch", "Preprocessing failed", __FILE__, __LINE__);
        return {};
    }

    // Phase 2: Inference — mutex released; pool dispatches to idle GPU slot.
    std::vector<std::vector<std::vector<float>>> featureVectors;
    const bool succ = m_trtEngine->runInference(inputs, featureVectors);
    if (!succ) {
        _logger.LogFatal("TENSORRTCL::DetectObjectsBatch", "Error running batch inference", __FILE__, __LINE__);
        return {};
    }

    // Phase 3: Parallel postprocessing — one async task per image; metadata
    // is captured by reference and outlives all futures (get() below).
    const size_t numBatch = featureVectors.size();
    std::vector<std::vector<Object>> batchDetections(numBatch);
    std::vector<std::future<std::vector<Object>>> postFutures;
    postFutures.reserve(numBatch);
    for (size_t batchIdx = 0; batchIdx < numBatch; ++batchIdx) {
        const auto& batchOutput = featureVectors[batchIdx];
        std::vector<float> fv = batchOutput.empty() ? std::vector<float>{} : batchOutput[0];
        postFutures.push_back(std::async(std::launch::async,
            [this, fv = std::move(fv), cid = camera_id, idx = batchIdx, &metadata]() mutable {
                return PostprocessBatch(fv, cid, idx, metadata);
            }));
    }
    for (size_t i = 0; i < numBatch; ++i) batchDetections[i] = postFutures[i].get();

    _logger.LogDebug("TENSORRTCL::DetectObjectsBatch",
                     "Batch processing complete. Images: " + std::to_string(numBatch),
                     __FILE__, __LINE__);
    return batchDetections;
}

/// CPU preprocessing for a batch: per-image validation, grayscale→BGR,
/// resize to network input, BGR→RGB, async GPU upload on a shared stream.
/// Original dimensions are recorded in @p outMetadata for postprocessing.
/// Returns a single batched input vector, or empty on any per-image failure.
std::vector<std::vector<cv::cuda::GpuMat>> TENSORRTCL::PreprocessBatch(const std::vector<cv::Mat>& inputImages,
                                                                       BatchMetadata& outMetadata) {
    try {
        // Validate license.
        if (!_licenseValid) {
            _logger.LogFatal("TENSORRTCL::PreprocessBatch", "Invalid license", __FILE__, __LINE__);
            return {};
        }
        // Validate input.
        if (inputImages.empty()) {
            _logger.LogFatal("TENSORRTCL::PreprocessBatch", "Input images vector is empty", __FILE__, __LINE__);
            return {};
        }
        const size_t batchSize = inputImages.size();

        // Get model input dimensions.
        const auto& inputDims = m_trtEngine->getInputDims();
        const int inputH = inputDims[0].d[1];
        const int inputW = inputDims[0].d[2];
        _logger.LogDebug("TENSORRTCL::PreprocessBatch",
                         "Preprocessing " + std::to_string(batchSize) + " images to " +
                         std::to_string(inputW) + "x" + std::to_string(inputH),
                         __FILE__, __LINE__);

        // Create CUDA stream for async operations.
        cv::cuda::Stream stream;

        // Store ALL images in a SINGLE batch vector.
        std::vector<cv::cuda::GpuMat> batchedImages;
        batchedImages.reserve(batchSize);

        // Store image dimensions for postprocessing.
        outMetadata.imgHeights.clear();
        outMetadata.imgWidths.clear();
        outMetadata.ratios.clear();
        outMetadata.imgHeights.reserve(batchSize);
        outMetadata.imgWidths.reserve(batchSize);
        outMetadata.ratios.reserve(batchSize);

        // Process each image.
        for (size_t i = 0; i < batchSize; ++i) {
            const cv::Mat& inputImage = inputImages[i];

            // Validate individual image.
            if (inputImage.empty()) {
                _logger.LogFatal("TENSORRTCL::PreprocessBatch",
                                 "Input image at index " + std::to_string(i) + " is empty",
                                 __FILE__, __LINE__);
                return {};
            }
            if (inputImage.cols < 5 || inputImage.rows < 5) {
                _logger.LogFatal("TENSORRTCL::PreprocessBatch",
                                 "Image at index " + std::to_string(i) + " is too small (Width: " +
                                 std::to_string(inputImage.cols) + ", Height: " +
                                 std::to_string(inputImage.rows) + ")",
                                 __FILE__, __LINE__);
                return {};
            }

            // CPU preprocessing: resize + BGR->RGB before GPU upload.
            cv::Mat srcImg = inputImage;
            if (srcImg.channels() == 1) {
                cv::Mat img3Channel;
                cv::cvtColor(srcImg, img3Channel, cv::COLOR_GRAY2BGR);
                srcImg = img3Channel;
            }

            // Store original dimensions.
            const int imgHeight = srcImg.rows;
            const int imgWidth = srcImg.cols;
            if (imgHeight <= 0 || imgWidth <= 0) {
                _logger.LogFatal("TENSORRTCL::PreprocessBatch",
                                 "Image at index " + std::to_string(i) + " has zero height or width",
                                 __FILE__, __LINE__);
                return {};
            }
            outMetadata.imgHeights.push_back(imgHeight);
            outMetadata.imgWidths.push_back(imgWidth);
            // Classification: ratio is always 1.0 (no letterbox).
            outMetadata.ratios.push_back(1.f);

            // Classification: direct CPU resize (no letterbox padding).
            cv::Mat cpuResized;
            if (srcImg.rows != inputH || srcImg.cols != inputW) {
                cv::resize(srcImg, cpuResized, cv::Size(inputW, inputH), 0, 0, cv::INTER_LINEAR);
            } else {
                cpuResized = srcImg;
            }
            cv::Mat cpuRGB;
            cv::cvtColor(cpuResized, cpuRGB, cv::COLOR_BGR2RGB);

            cv::cuda::GpuMat gpuResized;
            gpuResized.upload(cpuRGB, stream);

            // Add to batch.
            batchedImages.push_back(std::move(gpuResized));
        }

        // Wait for all GPU operations to complete.
        stream.waitForCompletion();

        // Return as single batched input.
        std::vector<std::vector<cv::cuda::GpuMat>> result;
        result.push_back(std::move(batchedImages));
        return result;
    } catch (const std::exception& e) {
        _logger.LogFatal("TENSORRTCL::PreprocessBatch", e.what(), __FILE__, __LINE__);
        return {};
    }
}

/// Postprocesses one image's output from a batch: same softmax/argmax logic
/// as the single-image Postprocess, using per-image dimensions recorded by
/// PreprocessBatch at @p batchIdx.
std::vector<Object> TENSORRTCL::PostprocessBatch(std::vector<float>& featureVector, const std::string& camera_id,
                                                 size_t batchIdx, const BatchMetadata& metadata) {
    std::vector<Object> outputs;
    try {
        // Validate batch index.
        if (batchIdx >= metadata.imgHeights.size() || batchIdx >= metadata.imgWidths.size()) {
            _logger.LogFatal("TENSORRTCL::PostprocessBatch",
                             "Batch index " + std::to_string(batchIdx) + " out of range (stored " +
                             std::to_string(metadata.imgHeights.size()) + " images)",
                             __FILE__, __LINE__);
            return outputs;
        }
        // Validate feature vector.
        if (featureVector.empty()) {
            _logger.LogFatal("TENSORRTCL::PostprocessBatch",
                             "Feature vector is empty for batch index " + std::to_string(batchIdx),
                             __FILE__, __LINE__);
            return outputs;
        }

        // Get image dimensions for this batch index.
        const int imgHeight = metadata.imgHeights[batchIdx];
        const int imgWidth = metadata.imgWidths[batchIdx];

        // Normalize if raw logits (same logic as single-image Postprocess).
        float rawSum = 0.f;
        bool allNonNeg = true;
        for (const float v : featureVector) {
            rawSum += v;
            if (v < 0.f) allNonNeg = false;
        }
        const bool alreadyNorm = (allNonNeg && rawSum > 0.9f && rawSum < 1.1f);
        if (!alreadyNorm) {
            const float maxLogit = *std::max_element(featureVector.begin(), featureVector.end());
            float sumExp = 0.f;
            for (auto& v : featureVector) {
                v = std::exp(v - maxLogit);
                sumExp += v;
            }
            for (auto& v : featureVector) v /= sumExp;
        }

        // Find max element (classification result).
        const auto max_idx = std::max_element(featureVector.begin(), featureVector.end());
        if (max_idx == featureVector.end()) {
            _logger.LogFatal("TENSORRTCL::PostprocessBatch",
                             "Failed to find max element in feature vector for batch index " +
                             std::to_string(batchIdx),
                             __FILE__, __LINE__);
            return outputs;
        }
        const int class_id = static_cast<int>(std::distance(featureVector.begin(), max_idx));
        const float score = *max_idx;

        // Create object result.
        Object clsResult;
        clsResult.classId = class_id;

        // Get class name (clamped to the last entry when out of range).
        const int classNameSize = static_cast<int>(_classes.size());
        if (!_classes.empty()) {
            if (class_id >= 0 && class_id < classNameSize) {
                clsResult.className = _classes[class_id];
            } else {
                clsResult.className = _classes[classNameSize - 1];
            }
        } else {
            clsResult.className = "Unknown";
        }
        clsResult.confidence = score;

        // Create bounding box with margins (classification has no localization).
        if (imgWidth > 20 && imgHeight > 20) {
            clsResult.box = cv::Rect(10, 10, imgWidth - 20, imgHeight - 20);
        } else {
            clsResult.box = cv::Rect(0, 0, imgWidth, imgHeight);
        }
        // Convert to normalized polygon.
        clsResult.polygon = ANSUtilityHelper::RectToNormalizedPolygon(clsResult.box, imgWidth, imgHeight);
        clsResult.cameraId = camera_id;
        outputs.push_back(std::move(clsResult));
        return outputs;
    } catch (const std::exception& e) {
        _logger.LogFatal("TENSORRTCL::PostprocessBatch",
                         "Error for batch index " + std::to_string(batchIdx) + ": " + e.what(),
                         __FILE__, __LINE__);
        return outputs;
    }
}

} // namespace ANSCENTER