// ANSCORE/modules/ANSODEngine/ANSTENSORRTCL.cpp
#include "ANSTENSORRTCL.h"
#include "Utility.h"
#include <opencv2/cudaimgproc.hpp>
#include <future>
namespace ANSCENTER
{
bool TENSORRTCL::OptimizeModel(bool fp16, std::string& optimizedModelFolder) {
std::lock_guard<std::recursive_mutex> lock(_mutex);
if (!ANSODBase::OptimizeModel(fp16, optimizedModelFolder)) {
return false;
}
if (!FileExist(_modelFilePath)) {
this->_logger.LogFatal("TENSORRTCL::OptimizeModel", "Raw model file path does not exist", __FILE__, __LINE__);
return false;
}
try {
_fp16 = fp16;
optimizedModelFolder = GetParentFolder(_modelFilePath);
// Check if the engine already exists to avoid reinitializing
if (!m_trtEngine) {
// Batch size and input shape limits come from the model configuration
m_options.optBatchSize = _modelConfig.gpuOptBatchSize;
m_options.maxBatchSize = _modelConfig.gpuMaxBatchSize;
m_options.deviceIndex = _modelConfig.gpuDeviceIndex;
m_options.maxInputHeight = _modelConfig.maxInputHeight;
m_options.minInputHeight = _modelConfig.minInputHeight;
m_options.optInputHeight = _modelConfig.optInputHeight;
m_options.maxInputWidth = _modelConfig.maxInputWidth;
m_options.minInputWidth = _modelConfig.minInputWidth;
m_options.optInputWidth = _modelConfig.optInputWidth;
m_options.engineFileDir = optimizedModelFolder;
// Use FP16 or FP32 precision based on the input flag
m_options.precision = (_fp16 ? Precision::FP16 : Precision::FP32);
// Create the TensorRT inference engine
m_trtEngine = std::make_unique<Engine<float>>(m_options);
}
// Build the TensorRT engine
auto succ = m_trtEngine->buildWithRetry(_modelFilePath, SUB_VALS, DIV_VALS, NORMALIZE);
if (!succ) {
const std::string errMsg =
"Error: Unable to build the TensorRT engine. "
"Try increasing TensorRT log severity to kVERBOSE.";
this->_logger.LogError("TENSORRTCL::OptimizeModel", errMsg, __FILE__, __LINE__);
_modelLoadValid = false;
return false;
}
_modelLoadValid = true;
return true;
}
catch (const std::exception& e) {
this->_logger.LogFatal("TENSORRTCL::OptimizeModel", e.what(), __FILE__, __LINE__);
optimizedModelFolder.clear();
return false;
}
}
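// Minimal caller sketch for OptimizeModel (illustrative only; object
// construction and prior population of _modelFilePath are assumed here,
// not guaranteed by this file):
//
//   TENSORRTCL classifier;                      // hypothetical setup
//   // ... LoadModel()/Initialize() has populated _modelFilePath ...
//   std::string engineDir;                      // receives the engine folder
//   if (classifier.OptimizeModel(/*fp16=*/true, engineDir)) {
//       // engineDir now points at the folder holding the built engine
//   }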
bool TENSORRTCL::LoadModel(const std::string& modelZipFilePath, const std::string& modelZipPassword) {
std::lock_guard<std::recursive_mutex> lock(_mutex);
try {
bool result = ANSODBase::LoadModel(modelZipFilePath, modelZipPassword);
if (!result) return false;
_modelConfig.detectionType = ANSCENTER::DetectionType::CLASSIFICATION;
_modelConfig.modelType = ModelType::TENSORRT;
_modelConfig.inpHeight = 224;
_modelConfig.inpWidth = 224;
if (_modelConfig.modelMNSThreshold < 0.2)
_modelConfig.modelMNSThreshold = 0.5;
if (_modelConfig.modelConfThreshold < 0.2)
_modelConfig.modelConfThreshold = 0.5;
if (_modelConfig.numKPS <= 0 || _modelConfig.numKPS > 133) // 133 = COCO wholebody max
_modelConfig.numKPS = 17;
if (_modelConfig.kpsThreshold == 0) _modelConfig.kpsThreshold = 0.5; // Default if not defined
// if (_modelConfig.precisionType == PrecisionType::FP16)_fp16 = true;
_fp16 = true; // FP16 forced on for this model (precisionType check above is disabled)
// Default post-processing parameters
TOP_K = 100;
PROBABILITY_THRESHOLD = 0.3f;
NMS_THRESHOLD = 0.65f;
SEGMENTATION_THRESHOLD = 0.5f;
SEG_CHANNELS = 32; // For segmentation
SEG_H = 160;
SEG_W = 160;
NUM_KPS = _modelConfig.numKPS;
KPS_THRESHOLD = _modelConfig.kpsThreshold;
if (!m_trtEngine) {
// Batch size and input shape limits come from the model configuration
m_options.optBatchSize = _modelConfig.gpuOptBatchSize;
m_options.maxBatchSize = _modelConfig.gpuMaxBatchSize;
m_options.deviceIndex = _modelConfig.gpuDeviceIndex;
m_options.maxInputHeight = _modelConfig.maxInputHeight;
m_options.minInputHeight = _modelConfig.minInputHeight;
m_options.optInputHeight = _modelConfig.optInputHeight;
m_options.maxInputWidth = _modelConfig.maxInputWidth;
m_options.minInputWidth = _modelConfig.minInputWidth;
m_options.optInputWidth = _modelConfig.optInputWidth;
m_options.engineFileDir = _modelFolder;
// Use FP16 or FP32 precision based on the input flag
m_options.precision = (_fp16 ? Precision::FP16 : Precision::FP32);
// Create the TensorRT inference engine
m_trtEngine = std::make_unique<Engine<float>>(m_options);
}
// 0. Check if the configuration file exists
if (FileExist(_modelConfigFile)) {
ModelType modelType;
std::vector<int> inputShape;
_classes = ANSUtilityHelper::GetConfigFileContent(_modelConfigFile, modelType, inputShape);
if (inputShape.size() == 2) {
if (inputShape[0] > 0) _modelConfig.inpHeight = inputShape[0];
if (inputShape[1] > 0) _modelConfig.inpWidth = inputShape[1];
}
}
else { // Old-format model zip: fall back to the default file names
_modelFilePath = CreateFilePath(_modelFolder, "train_last.onnx");
_classFilePath = CreateFilePath(_modelFolder, "classes.names");
std::ifstream isValidFileName(_classFilePath);
if (!isValidFileName)
{
this->_logger.LogDebug("TENSORRTCL::Initialize. Load classes from string", _classFilePath, __FILE__, __LINE__);
LoadClassesFromString();
}
else {
this->_logger.LogDebug("TENSORRTCL::Initialize. Load classes from file", _classFilePath, __FILE__, __LINE__);
LoadClassesFromFile();
}
}
// Load the TensorRT engine file
if (this->_loadEngineOnCreation) {
auto succ = m_trtEngine->buildLoadNetwork(_modelFilePath, SUB_VALS, DIV_VALS, NORMALIZE, m_maxSlotsPerGpu);
if (!succ) {
const std::string errMsg = "Error: Unable to load TensorRT engine weights into memory. " + _modelFilePath;
this->_logger.LogError("TENSORRTCL::Initialize", errMsg, __FILE__, __LINE__);
_modelLoadValid = false;
return false;
}
}
_modelLoadValid = true;
_isInitialized = true;
return true;
}
catch (std::exception& e) {
this->_logger.LogFatal("TENSORRTCL::LoadModel", e.what(), __FILE__, __LINE__);
return false;
}
}
bool TENSORRTCL::LoadModelFromFolder(std::string licenseKey, ModelConfig modelConfig, std::string modelName, std::string className, const std::string& modelFolder, std::string& labelMap) {
std::lock_guard<std::recursive_mutex> lock(_mutex);
try {
bool result = ANSODBase::LoadModelFromFolder(licenseKey, modelConfig, modelName, className, modelFolder, labelMap);
if (!result) return false;
std::string _modelName = modelName;
if (_modelName.empty()) {
_modelName = "train_last";
}
std::string modelFullName = _modelName + ".onnx";
// Parsing for YOLO only here
_modelConfig = modelConfig;
_modelConfig.detectionType = ANSCENTER::DetectionType::CLASSIFICATION;
_modelConfig.modelType = ModelType::TENSORRT;
_modelConfig.inpHeight = 224;
_modelConfig.inpWidth = 224;
if (_modelConfig.modelMNSThreshold < 0.2)
_modelConfig.modelMNSThreshold = 0.5;
if (_modelConfig.modelConfThreshold < 0.2)
_modelConfig.modelConfThreshold = 0.5;
if (_modelConfig.numKPS <= 0 || _modelConfig.numKPS > 133) // 133 = COCO wholebody max
_modelConfig.numKPS = 17;
if (_modelConfig.kpsThreshold == 0) _modelConfig.kpsThreshold = 0.5; // Default if not defined
_fp16 = true; // FP16 forced on for this model
// Default post-processing parameters
TOP_K = 100;
PROBABILITY_THRESHOLD = 0.3f;
NMS_THRESHOLD = 0.65f;
SEGMENTATION_THRESHOLD = 0.5f;
SEG_CHANNELS = 32; // For segmentation
SEG_H = 160;
SEG_W = 160;
NUM_KPS = _modelConfig.numKPS;
KPS_THRESHOLD = _modelConfig.kpsThreshold;
if (!m_trtEngine) {
// Batch size and input shape limits come from the model configuration
m_options.optBatchSize = _modelConfig.gpuOptBatchSize;
m_options.maxBatchSize = _modelConfig.gpuMaxBatchSize;
m_options.deviceIndex = _modelConfig.gpuDeviceIndex;
m_options.maxInputHeight = _modelConfig.maxInputHeight;
m_options.minInputHeight = _modelConfig.minInputHeight;
m_options.optInputHeight = _modelConfig.optInputHeight;
m_options.maxInputWidth = _modelConfig.maxInputWidth;
m_options.minInputWidth = _modelConfig.minInputWidth;
m_options.optInputWidth = _modelConfig.optInputWidth;
m_options.engineFileDir = _modelFolder;
// Use FP16 or FP32 precision based on the input flag
m_options.precision = (_fp16 ? Precision::FP16 : Precision::FP32);
// Create the TensorRT inference engine
m_trtEngine = std::make_unique<Engine<float>>(m_options);
}
// 0. Check if the configuration file exists
if (FileExist(_modelConfigFile)) {
ModelType modelType;
std::vector<int> inputShape;
_classes = ANSUtilityHelper::GetConfigFileContent(_modelConfigFile, modelType, inputShape);
if (inputShape.size() == 2) {
if (inputShape[0] > 0) _modelConfig.inpHeight = inputShape[0];
if (inputShape[1] > 0) _modelConfig.inpWidth = inputShape[1];
}
}
else { // Old-format model zip: fall back to the caller-supplied file names
_modelFilePath = CreateFilePath(_modelFolder, modelFullName);
_classFilePath = CreateFilePath(_modelFolder, className);
std::ifstream isValidFileName(_classFilePath);
if (!isValidFileName)
{
this->_logger.LogDebug("TENSORRTCL::Initialize. Load classes from string", _classFilePath, __FILE__, __LINE__);
LoadClassesFromString();
}
else {
this->_logger.LogDebug("TENSORRTCL::Initialize. Load classes from file", _classFilePath, __FILE__, __LINE__);
LoadClassesFromFile();
}
}
// 1. Load labelMap and engine
labelMap.clear();
if (!_classes.empty())
labelMap = VectorToCommaSeparatedString(_classes);
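// e.g. _classes = {"cat", "dog", "bird"} yields labelMap = "cat,dog,bird"
// (assuming simple comma joining with no extra separators)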
// Load the TensorRT engine file
if (this->_loadEngineOnCreation) {
auto succ = m_trtEngine->buildLoadNetwork(_modelFilePath, SUB_VALS, DIV_VALS, NORMALIZE, m_maxSlotsPerGpu);
if (!succ) {
const std::string errMsg = "Error: Unable to load TensorRT engine weights into memory. " + _modelFilePath;
this->_logger.LogError("TENSORRTCL::Initialize", errMsg, __FILE__, __LINE__);
_modelLoadValid = false;
return false;
}
}
_modelLoadValid = true;
_isInitialized = true;
return true;
}
catch (std::exception& e) {
this->_logger.LogFatal("TENSORRTCL::LoadModel", e.what(), __FILE__, __LINE__);
return false;
}
}
bool TENSORRTCL::Initialize(std::string licenseKey, ModelConfig modelConfig, const std::string& modelZipFilePath, const std::string& modelZipPassword, std::string& labelMap) {
std::lock_guard<std::recursive_mutex> lock(_mutex);
// Capture prior state under the lock so a concurrent load cannot race this check
const bool engineAlreadyLoaded = _modelLoadValid && _isInitialized && m_trtEngine != nullptr;
_modelLoadValid = false;
try {
bool result = ANSODBase::Initialize(licenseKey, modelConfig, modelZipFilePath, modelZipPassword, labelMap);
if (!result) return false;
// Parsing for YOLO only here
_modelConfig = modelConfig;
_modelConfig.detectionType = ANSCENTER::DetectionType::CLASSIFICATION;
_modelConfig.modelType = ModelType::TENSORRT;
_modelConfig.inpHeight = 224;
_modelConfig.inpWidth = 224;
if (_modelConfig.modelMNSThreshold < 0.2)
_modelConfig.modelMNSThreshold = 0.5;
if (_modelConfig.modelConfThreshold < 0.2)
_modelConfig.modelConfThreshold = 0.5;
if (_modelConfig.numKPS <= 0 || _modelConfig.numKPS > 133) // 133 = COCO wholebody max
_modelConfig.numKPS = 17;
if (_modelConfig.kpsThreshold == 0) _modelConfig.kpsThreshold = 0.5; // Default if not defined
// if (_modelConfig.precisionType == PrecisionType::FP16)_fp16 = true;
_fp16 = true; // FP16 forced on for this model (precisionType check above is disabled)
// Default post-processing parameters
TOP_K = 100;
PROBABILITY_THRESHOLD = 0.3f;
NMS_THRESHOLD = 0.65f;
SEGMENTATION_THRESHOLD = 0.5f;
SEG_CHANNELS = 32; // For segmentation
SEG_H = 160;
SEG_W = 160;
NUM_KPS = _modelConfig.numKPS;
KPS_THRESHOLD = _modelConfig.kpsThreshold;
if (!m_trtEngine) {
// Batch size and input shape limits come from the model configuration
m_options.optBatchSize = _modelConfig.gpuOptBatchSize;
m_options.maxBatchSize = _modelConfig.gpuMaxBatchSize;
m_options.deviceIndex = _modelConfig.gpuDeviceIndex;
m_options.maxInputHeight = _modelConfig.maxInputHeight;
m_options.minInputHeight = _modelConfig.minInputHeight;
m_options.optInputHeight = _modelConfig.optInputHeight;
m_options.maxInputWidth = _modelConfig.maxInputWidth;
m_options.minInputWidth = _modelConfig.minInputWidth;
m_options.optInputWidth = _modelConfig.optInputWidth;
m_options.engineFileDir = _modelFolder;
// Use FP16 or FP32 precision based on the input flag
m_options.precision = (_fp16 ? Precision::FP16 : Precision::FP32);
// Create the TensorRT inference engine
m_trtEngine = std::make_unique<Engine<float>>(m_options);
}
// 0. Check if the configuration file exists
if (FileExist(_modelConfigFile)) {
ModelType modelType;
std::vector<int> inputShape;
_classes = ANSUtilityHelper::GetConfigFileContent(_modelConfigFile, modelType, inputShape);
if (inputShape.size() == 2) {
if (inputShape[0] > 0) _modelConfig.inpHeight = inputShape[0];
if (inputShape[1] > 0) _modelConfig.inpWidth = inputShape[1];
}
}
else { // Old-format model zip: fall back to the default file names
_modelFilePath = CreateFilePath(_modelFolder, "train_last.onnx");
_classFilePath = CreateFilePath(_modelFolder, "classes.names");
std::ifstream isValidFileName(_classFilePath);
if (!isValidFileName)
{
this->_logger.LogDebug("TENSORRTCL::Initialize. Load classes from string", _classFilePath, __FILE__, __LINE__);
LoadClassesFromString();
}
else {
this->_logger.LogDebug("TENSORRTCL::Initialize. Load classes from file", _classFilePath, __FILE__, __LINE__);
LoadClassesFromFile();
}
}
// 1. Load labelMap and engine
labelMap.clear();
if (!_classes.empty())
labelMap = VectorToCommaSeparatedString(_classes);
// Load the TensorRT engine file
if (this->_loadEngineOnCreation && !engineAlreadyLoaded) {
auto succ = m_trtEngine->buildLoadNetwork(_modelFilePath, SUB_VALS, DIV_VALS, NORMALIZE, m_maxSlotsPerGpu);
if (!succ) {
const std::string errMsg = "Error: Unable to load TensorRT engine weights into memory. " + _modelFilePath;
this->_logger.LogError("TENSORRTCL::Initialize", errMsg, __FILE__, __LINE__);
_modelLoadValid = false;
return false;
}
}
_modelLoadValid = true;
_isInitialized = true;
return true;
}
catch (std::exception& e) {
this->_logger.LogFatal("TENSORRTCL::Initialize", e.what(), __FILE__, __LINE__);
return false;
}
}
std::vector<Object> TENSORRTCL::RunInference(const cv::Mat& inputImgBGR) {
return RunInference(inputImgBGR, "CustomCam");
}
std::vector<Object> TENSORRTCL::RunInference(const cv::Mat& inputImgBGR,const std::string& camera_id)
{
// Validate state under brief lock
{
std::lock_guard<std::recursive_mutex> lock(_mutex);
if (!_modelLoadValid) {
_logger.LogError("TENSORRTCL::RunInference",
"Cannot load the TensorRT model. Please check if it exists",
__FILE__, __LINE__);
return {};
}
if (!_licenseValid) {
_logger.LogError("TENSORRTCL::RunInference",
"Runtime license is not valid or expired. Please contact ANSCENTER",
__FILE__, __LINE__);
return {};
}
if (!_isInitialized) {
_logger.LogError("TENSORRTCL::RunInference",
"Model is not initialized",
__FILE__, __LINE__);
return {};
}
if (inputImgBGR.empty() || inputImgBGR.cols < 5 || inputImgBGR.rows < 5) {
return {};
}
}
try {
return DetectObjects(inputImgBGR, camera_id);
}
catch (const std::exception& e) {
_logger.LogFatal("TENSORRTCL::RunInference", e.what(), __FILE__, __LINE__);
return {};
}
}
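// Minimal usage sketch (illustrative; assumes a fully initialized instance):
//
//   cv::Mat frame = cv::imread("test.jpg");     // BGR, as RunInference expects
//   auto objects = classifier.RunInference(frame, "cam-01");
//   for (const auto& obj : objects) {
//       // obj.className / obj.confidence carry the top-1 classification
//   }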
std::vector<std::vector<Object>> TENSORRTCL::RunInferencesBatch(const std::vector<cv::Mat>& inputs, const std::string& camera_id) {
// Validate state under brief lock
{
std::lock_guard<std::recursive_mutex> lock(_mutex);
if (!_modelLoadValid) {
this->_logger.LogFatal("TENSORRTCL::RunInferencesBatch",
"Cannot load the TensorRT model. Please check if it exists", __FILE__, __LINE__);
return {};
}
if (!_licenseValid) {
this->_logger.LogFatal("TENSORRTCL::RunInferencesBatch",
"Runtime license is not valid or expired. Please contact ANSCENTER", __FILE__, __LINE__);
return {};
}
if (!_isInitialized) {
this->_logger.LogFatal("TENSORRTCL::RunInferencesBatch",
"Engine not initialized", __FILE__, __LINE__);
return {};
}
if (inputs.empty()) return {};
}
try {
return DetectObjectsBatch(inputs, camera_id);
}
catch (const std::exception& e) {
this->_logger.LogFatal("TENSORRTCL::RunInferencesBatch", e.what(), __FILE__, __LINE__);
return {};
}
}
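// Batch usage sketch (illustrative; one result vector per input image):
//
//   std::vector<cv::Mat> frames{ img0, img1, img2 };
//   auto perImage = classifier.RunInferencesBatch(frames, "cam-01");
//   // On success perImage.size() == frames.size(); failed chunks are
//   // padded with empty inner vectors (see DetectObjectsBatch below).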
TENSORRTCL::~TENSORRTCL() {
try {
Destroy();
}
catch (std::exception& e) {
this->_logger.LogError("TENSORRTCL::~TENSORRTCL()", e.what(), __FILE__, __LINE__);
}
}
bool TENSORRTCL::Destroy() {
try {
m_trtEngine.reset(); // Releases the current engine and sets m_trtEngine to nullptr.
return true;
}
catch (std::exception& e) {
this->_logger.LogError("TENSORRTCL::~TENSORRTCL()", e.what(), __FILE__, __LINE__);
return false;
}
}
// private
std::vector<Object> TENSORRTCL::DetectObjects(const cv::Mat& inputImage, const std::string& camera_id) {
try {
// --- 1. Set GPU device context (the engine must exist before any use) ---
if (!m_trtEngine) {
this->_logger.LogFatal("TENSORRTCL::DetectObjects", "TensorRT engine is not created", __FILE__, __LINE__);
return {};
}
m_trtEngine->setDeviceContext();
// --- 1b. CUDA context health check ---
if (!m_nv12Helper.isCudaContextHealthy(_logger, "TENSORRTCL")) {
return {};
}
// --- 2. Preprocess under lock ---
// Try NV12 fast path first (classification: direct resize, no letterbox).
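// Fallback order: (1) NV12 GPU fast path, (2) full-resolution BGR image
// handed back by the helper, (3) plain CPU Preprocess() on the original input.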
ImageMetadata meta;
std::vector<std::vector<cv::cuda::GpuMat>> input;
{
std::lock_guard<std::recursive_mutex> lock(_mutex);
const int inferenceGpu = m_trtEngine->getPreferredDeviceIndex(); // engine validated above
const auto& inputDims = m_trtEngine->getInputDims();
const int inputW = inputDims[0].d[2];
const int inputH = inputDims[0].d[1];
auto nv12 = m_nv12Helper.tryNV12(inputImage, inferenceGpu, inputW, inputH,
NV12PreprocessHelper::classificationLauncher(),
_logger, "TENSORRTCL");
if (nv12.succeeded) {
meta.imgWidth = nv12.metaWidth;
meta.imgHeight = nv12.metaHeight;
meta.ratio = 1.f; // classification: no letterbox
input = {{ std::move(nv12.gpuRGB) }};
}
else if (nv12.useBgrFullRes) {
input = Preprocess(nv12.bgrFullResImg, meta);
}
if (input.empty()) {
input = Preprocess(inputImage, meta);
}
m_nv12Helper.tickInference();
}
if (input.empty()) return {};
// Phase 2: Inference — mutex released; pool dispatches to idle GPU slot
std::vector<std::vector<std::vector<float>>> featureVectors;
auto succ = m_trtEngine->runInference(input, featureVectors);
if (!succ) {
this->_logger.LogFatal("TENSORRTCL::DetectObjects", "Error running inference", __FILE__, __LINE__);
return {};
}
// Phase 3: Postprocess under brief lock
std::lock_guard<std::recursive_mutex> lock(_mutex);
std::vector<float> featureVector;
Engine<float>::transformOutput(featureVectors, featureVector);
return Postprocess(featureVector, camera_id, meta);
}
catch (std::exception& e) {
this->_logger.LogFatal("TENSORRTCL::DetectObjects", e.what(), __FILE__, __LINE__);
return {};
}
}
std::vector<std::vector<cv::cuda::GpuMat>> TENSORRTCL::Preprocess(const cv::Mat& inputImage, ImageMetadata& outMeta) {
try {
if (!_licenseValid) {
this->_logger.LogFatal("TENSORRTCL::Preprocess", "Invalid license", __FILE__, __LINE__);
return {};
}
if (inputImage.empty()) {
this->_logger.LogFatal("TENSORRTCL::Preprocess", "Input image is empty", __FILE__, __LINE__);
return {};
}
if ((inputImage.cols < 5) || (inputImage.rows < 5)) {
this->_logger.LogFatal("TENSORRTCL::Preprocess",
"Input image is too small (Width: " + std::to_string(inputImage.cols) +
", Height: " + std::to_string(inputImage.rows) + ")",
__FILE__, __LINE__);
return {};
}
// Populate the input vectors
const auto& inputDims = m_trtEngine->getInputDims();
const int inputH = inputDims[0].d[1];
const int inputW = inputDims[0].d[2];
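// Assumed engine input layout is CHW: d[0] = channels, d[1] = height, d[2] = width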
// --- CPU preprocessing: resize + BGR->RGB before GPU upload ---
cv::Mat srcImg = inputImage;
if (srcImg.channels() == 1) {
cv::cvtColor(srcImg, srcImg, cv::COLOR_GRAY2BGR);
}
// These parameters will be used in the post-processing stage
outMeta.imgHeight = srcImg.rows;
outMeta.imgWidth = srcImg.cols;
if (outMeta.imgHeight <= 0 || outMeta.imgWidth <= 0) {
_logger.LogFatal("TENSORRTCL::Preprocess", "Image height or width is zero", __FILE__, __LINE__);
return {};
}
outMeta.ratio = 1.f;
// Classification: direct CPU resize (no letterbox padding)
cv::Mat cpuResized;
if (srcImg.rows != inputH || srcImg.cols != inputW) {
cv::resize(srcImg, cpuResized, cv::Size(inputW, inputH), 0, 0, cv::INTER_LINEAR);
} else {
cpuResized = srcImg;
}
// CPU BGR -> RGB
cv::Mat cpuRGB;
cv::cvtColor(cpuResized, cpuRGB, cv::COLOR_BGR2RGB);
// Upload small image to GPU
cv::cuda::Stream stream;
cv::cuda::GpuMat gpuResized;
gpuResized.upload(cpuRGB, stream);
stream.waitForCompletion();
// Convert to format expected by our inference engine
std::vector<cv::cuda::GpuMat> input{ std::move(gpuResized) };
std::vector<std::vector<cv::cuda::GpuMat>> inputs{ std::move(input) };
return inputs;
}
catch (const std::exception& e) {
this->_logger.LogFatal("TENSORRTCL::Preprocess", e.what(), __FILE__, __LINE__);
return {};
}
}
std::vector<Object> TENSORRTCL::Postprocess(std::vector<float>& featureVector, const std::string& camera_id, const ImageMetadata& meta) {
std::vector<Object> outputs;
try {
// Check if output is already a probability distribution (sums to ~1.0).
// Some models include a Softmax layer; applying softmax again would
// flatten the distribution and cause wrong classifications.
if (featureVector.empty()) {
this->_logger.LogFatal("TENSORRTCL::Postprocess", "Feature vector is empty", __FILE__, __LINE__);
return outputs;
}
float rawSum = 0.f;
bool allNonNeg = true;
for (const auto& v : featureVector) {
rawSum += v;
if (v < 0.f) allNonNeg = false;
}
const bool alreadyNormalized = (allNonNeg && rawSum > 0.9f && rawSum < 1.1f);
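// Example: a softmaxed 3-class output such as {0.7, 0.2, 0.1} sums to 1.0
// with no negatives, so it is treated as already normalized; raw logits such
// as {4.1, -1.3, 0.2} fail the check and go through the softmax below.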
if (!alreadyNormalized) {
// Raw logits — apply softmax
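// Subtracting the max logit before exp() is the numerically stable softmax:
// exp(v - maxLogit) <= 1, so large logits cannot overflow a float.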
float maxLogit = *std::max_element(featureVector.begin(), featureVector.end());
float sumExp = 0.f;
for (auto& v : featureVector) {
v = std::exp(v - maxLogit);
sumExp += v;
}
for (auto& v : featureVector)
v /= sumExp;
}
auto max_idx = std::max_element(featureVector.begin(), featureVector.end());
int class_id = static_cast<int>(std::distance(featureVector.begin(), max_idx));
float score = *max_idx;
const int classNameSize = static_cast<int>(_classes.size());
Object clsResult;
clsResult.classId = class_id;
if (!_classes.empty()) {
if (clsResult.classId < classNameSize) {
clsResult.className = _classes[clsResult.classId];
}
else {
clsResult.className = _classes[classNameSize - 1]; // Use last valid class name if out of range
}
}
else {
clsResult.className = "Unknown"; // Fallback if _classes is empty
}
clsResult.confidence = score;
if (meta.imgWidth > 20 && meta.imgHeight > 20) {
clsResult.box = cv::Rect(10, 10, meta.imgWidth - 20, meta.imgHeight - 20);
}
else {
clsResult.box = cv::Rect(0, 0, meta.imgWidth, meta.imgHeight);
}
clsResult.polygon = ANSUtilityHelper::RectToNormalizedPolygon(clsResult.box, meta.imgWidth, meta.imgHeight);
clsResult.cameraId = camera_id;
outputs.push_back(clsResult);
return outputs;
//EnqueueDetection(objects, camera_id);
}
catch (std::exception& e) {
this->_logger.LogFatal("TENSORRTCL::Postproces", e.what(), __FILE__, __LINE__);
return outputs;
}
}
std::vector<std::vector<Object>> TENSORRTCL::DetectObjectsBatch(const std::vector<cv::Mat>& inputImages, const std::string& camera_id)
{
// Validate under brief lock
{
std::lock_guard<std::recursive_mutex> lock(_mutex);
if (!m_trtEngine) {
_logger.LogFatal("TENSORRTCL::DetectObjectsBatch",
"TensorRT engine is not created", __FILE__, __LINE__);
return {};
}
if (inputImages.empty()) {
_logger.LogFatal("TENSORRTCL::DetectObjectsBatch",
"Empty input images vector", __FILE__, __LINE__);
return {};
}
}
// Auto-split if batch exceeds engine capacity
const int maxBatch = m_options.maxBatchSize > 0 ? m_options.maxBatchSize : 1;
if (static_cast<int>(inputImages.size()) > maxBatch) {
const size_t numImages = inputImages.size();
std::vector<std::vector<Object>> allResults;
allResults.reserve(numImages);
// Process chunks sequentially to avoid GPU contention on the same engine
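// e.g. 10 images with maxBatch == 4 are processed as chunks of 4, 4, and 2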
for (size_t start = 0; start < numImages; start += static_cast<size_t>(maxBatch)) {
const size_t end = std::min(start + static_cast<size_t>(maxBatch), numImages);
std::vector<cv::Mat> chunk(inputImages.begin() + start, inputImages.begin() + end);
auto chunkResults = DetectObjectsBatch(chunk, camera_id);
if (chunkResults.size() == chunk.size()) {
for (auto& r : chunkResults) allResults.push_back(std::move(r));
}
else {
// Chunk failed or returned wrong size — pad with empty results
_logger.LogError("TENSORRTCL::DetectObjectsBatch",
"Chunk returned " + std::to_string(chunkResults.size()) +
" results, expected " + std::to_string(chunk.size()) +
". Padding with empty results.", __FILE__, __LINE__);
for (auto& r : chunkResults) allResults.push_back(std::move(r));
for (size_t pad = chunkResults.size(); pad < chunk.size(); ++pad) {
allResults.push_back({});
}
}
}
return allResults;
}
_logger.LogDebug("TENSORRTCL::DetectObjectsBatch",
"Processing batch of " + std::to_string(inputImages.size()) + " images",
__FILE__, __LINE__);
// Phase 1: Preprocess under brief lock
BatchMetadata metadata;
std::vector<std::vector<cv::cuda::GpuMat>> inputs;
{
std::lock_guard<std::recursive_mutex> lock(_mutex);
inputs = PreprocessBatch(inputImages, metadata);
}
if (inputs.empty() || inputs[0].empty()) {
_logger.LogFatal("TENSORRTCL::DetectObjectsBatch",
"Preprocessing failed", __FILE__, __LINE__);
return {};
}
// Phase 2: Inference — mutex released; pool dispatches to idle GPU slot
std::vector<std::vector<std::vector<float>>> featureVectors;
bool succ = m_trtEngine->runInference(inputs, featureVectors);
if (!succ) {
_logger.LogFatal("TENSORRTCL::DetectObjectsBatch",
"Error running batch inference", __FILE__, __LINE__);
return {};
}
// Phase 3: Parallel postprocessing
const size_t numBatch = featureVectors.size();
std::vector<std::vector<Object>> batchDetections(numBatch);
std::vector<std::future<std::vector<Object>>> postFutures;
postFutures.reserve(numBatch);
for (size_t batchIdx = 0; batchIdx < numBatch; ++batchIdx) {
const auto& batchOutput = featureVectors[batchIdx];
std::vector<float> fv = batchOutput.empty() ? std::vector<float>{} : batchOutput[0];
postFutures.push_back(std::async(std::launch::async,
[this, fv = std::move(fv), cid = camera_id, idx = batchIdx, &metadata]() mutable {
return PostprocessBatch(fv, cid, idx, metadata);
}));
}
for (size_t i = 0; i < numBatch; ++i)
batchDetections[i] = postFutures[i].get();
_logger.LogDebug("TENSORRTCL::DetectObjectsBatch",
"Batch processing complete. Images: " + std::to_string(numBatch),
__FILE__, __LINE__);
return batchDetections;
}
std::vector<std::vector<cv::cuda::GpuMat>> TENSORRTCL::PreprocessBatch(const std::vector<cv::Mat>& inputImages, BatchMetadata& outMetadata)
{
try {
// Validate license
if (!_licenseValid) {
_logger.LogFatal("TENSORRTCL::PreprocessBatch",
"Invalid license", __FILE__, __LINE__);
return {};
}
// Validate input
if (inputImages.empty()) {
_logger.LogFatal("TENSORRTCL::PreprocessBatch",
"Input images vector is empty", __FILE__, __LINE__);
return {};
}
size_t batchSize = inputImages.size();
// Get model input dimensions
const auto& inputDims = m_trtEngine->getInputDims();
const int inputH = inputDims[0].d[1];
const int inputW = inputDims[0].d[2];
_logger.LogDebug("TENSORRTCL::PreprocessBatch",
"Preprocessing " + std::to_string(batchSize) + " images to " +
std::to_string(inputW) + "x" + std::to_string(inputH),
__FILE__, __LINE__);
// Create CUDA stream for async operations
cv::cuda::Stream stream;
// Store ALL images in a SINGLE batch vector
std::vector<cv::cuda::GpuMat> batchedImages;
batchedImages.reserve(batchSize);
// Store image dimensions for postprocessing
outMetadata.imgHeights.clear();
outMetadata.imgWidths.clear();
outMetadata.ratios.clear();
outMetadata.imgHeights.reserve(batchSize);
outMetadata.imgWidths.reserve(batchSize);
outMetadata.ratios.reserve(batchSize);
// Process each image
for (size_t i = 0; i < batchSize; ++i) {
const cv::Mat& inputImage = inputImages[i];
// Validate individual image
if (inputImage.empty()) {
_logger.LogFatal("TENSORRTCL::PreprocessBatch",
"Input image at index " + std::to_string(i) + " is empty",
__FILE__, __LINE__);
return {};
}
if (inputImage.cols < 5 || inputImage.rows < 5) {
_logger.LogFatal("TENSORRTCL::PreprocessBatch",
"Image at index " + std::to_string(i) +
" is too small (Width: " + std::to_string(inputImage.cols) +
", Height: " + std::to_string(inputImage.rows) + ")",
__FILE__, __LINE__);
return {};
}
// CPU preprocessing: resize + BGR->RGB before GPU upload
cv::Mat srcImg = inputImage;
if (srcImg.channels() == 1) {
cv::Mat img3Channel;
cv::cvtColor(srcImg, img3Channel, cv::COLOR_GRAY2BGR);
srcImg = img3Channel;
}
// Store original dimensions
int imgHeight = srcImg.rows;
int imgWidth = srcImg.cols;
if (imgHeight <= 0 || imgWidth <= 0) {
_logger.LogFatal("TENSORRTCL::PreprocessBatch",
"Image at index " + std::to_string(i) + " has zero height or width",
__FILE__, __LINE__);
return {};
}
outMetadata.imgHeights.push_back(imgHeight);
outMetadata.imgWidths.push_back(imgWidth);
// Classification: ratio is always 1.0
outMetadata.ratios.push_back(1.f);
// Classification: direct CPU resize (no letterbox padding)
cv::Mat cpuResized;
if (srcImg.rows != inputH || srcImg.cols != inputW) {
cv::resize(srcImg, cpuResized, cv::Size(inputW, inputH), 0, 0, cv::INTER_LINEAR);
} else {
cpuResized = srcImg;
}
cv::Mat cpuRGB;
cv::cvtColor(cpuResized, cpuRGB, cv::COLOR_BGR2RGB);
cv::cuda::GpuMat gpuResized;
gpuResized.upload(cpuRGB, stream);
// Add to batch
batchedImages.push_back(std::move(gpuResized));
}
// Wait for all GPU operations to complete
stream.waitForCompletion();
// Return as single batched input
std::vector<std::vector<cv::cuda::GpuMat>> result;
result.push_back(std::move(batchedImages));
return result;
}
catch (const std::exception& e) {
_logger.LogFatal("TENSORRTCL::PreprocessBatch",
e.what(), __FILE__, __LINE__);
return {};
}
}
std::vector<Object> TENSORRTCL::PostprocessBatch(std::vector<float>& featureVector, const std::string& camera_id, size_t batchIdx, const BatchMetadata& metadata)
{
std::vector<Object> outputs;
try {
// Validate batch index
if (batchIdx >= metadata.imgHeights.size() ||
batchIdx >= metadata.imgWidths.size()) {
_logger.LogFatal("TENSORRTCL::PostprocessBatch",
"Batch index " + std::to_string(batchIdx) +
" out of range (stored " + std::to_string(metadata.imgHeights.size()) + " images)",
__FILE__, __LINE__);
return outputs;
}
// Validate feature vector
if (featureVector.empty()) {
_logger.LogFatal("TENSORRTCL::PostprocessBatch",
"Feature vector is empty for batch index " + std::to_string(batchIdx),
__FILE__, __LINE__);
return outputs;
}
// Get image dimensions for this batch index
int imgHeight = metadata.imgHeights[batchIdx];
int imgWidth = metadata.imgWidths[batchIdx];
// Normalize if raw logits (same logic as single-image Postprocess)
float rawSum = 0.f;
bool allNonNeg = true;
for (const auto& v : featureVector) {
rawSum += v;
if (v < 0.f) allNonNeg = false;
}
const bool alreadyNorm = (allNonNeg && rawSum > 0.9f && rawSum < 1.1f);
if (!alreadyNorm) {
float maxLogit = *std::max_element(featureVector.begin(), featureVector.end());
float sumExp = 0.f;
for (auto& v : featureVector) {
v = std::exp(v - maxLogit);
sumExp += v;
}
for (auto& v : featureVector) v /= sumExp;
}
// Find max element (classification result)
auto max_idx = std::max_element(featureVector.begin(), featureVector.end());
if (max_idx == featureVector.end()) {
_logger.LogFatal("TENSORRTCL::PostprocessBatch",
"Failed to find max element in feature vector for batch index " +
std::to_string(batchIdx),
__FILE__, __LINE__);
return outputs;
}
int class_id = static_cast<int>(std::distance(featureVector.begin(), max_idx));
float score = *max_idx;
// Create object result
Object clsResult;
clsResult.classId = class_id;
// Get class name
int classNameSize = static_cast<int>(_classes.size());
if (!_classes.empty()) {
if (class_id >= 0 && class_id < classNameSize) {
clsResult.className = _classes[class_id];
}
else {
clsResult.className = _classes[classNameSize - 1];
}
}
else {
clsResult.className = "Unknown";
}
clsResult.confidence = score;
// Create bounding box with margins
if (imgWidth > 20 && imgHeight > 20) {
clsResult.box = cv::Rect(10, 10, imgWidth - 20, imgHeight - 20);
}
else {
clsResult.box = cv::Rect(0, 0, imgWidth, imgHeight);
}
// Convert to normalized polygon
clsResult.polygon = ANSUtilityHelper::RectToNormalizedPolygon(
clsResult.box, imgWidth, imgHeight
);
clsResult.cameraId = camera_id;
outputs.push_back(std::move(clsResult));
return outputs;
}
catch (const std::exception& e) {
_logger.LogFatal("TENSORRTCL::PostprocessBatch",
"Error for batch index " + std::to_string(batchIdx) + ": " + e.what(),
__FILE__, __LINE__);
return outputs;
}
}
}
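// End-to-end usage sketch (illustrative; constructor and config field names
// are assumptions, not guaranteed by this file):
//
//   ANSCENTER::TENSORRTCL classifier;
//   ANSCENTER::ModelConfig cfg{};               // populated by the caller
//   std::string labelMap;
//   if (classifier.Initialize(licenseKey, cfg, "model.zip", "password", labelMap)) {
//       cv::Mat frame = cv::imread("test.jpg");
//       auto results = classifier.RunInference(frame, "cam-01");
//   }
//   classifier.Destroy();                       // or rely on the destructor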