Files
ANSCORE/modules/ANSODEngine/ANSRTYOLO.cpp

2377 lines
115 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#include "ANSRTYOLO.h"
#include "Utility.h"
#include "ANSLicense.h" // ANS_DBG macro for DebugView
#include <future>
#include <numeric>
#include <cmath>
#include <cstring>
#include <opencv2/cudaimgproc.hpp>
#include <opencv2/cudawarping.hpp>
#include <opencv2/core/cuda_stream_accessor.hpp>
namespace ANSCENTER {
// ====================================================================
// ANSODBase interface — OptimizeModel
// ====================================================================
/// Builds (or rebuilds) the TensorRT engine from the raw ONNX model.
/// @param fp16                 Build the engine with FP16 precision when true.
/// @param optimizedModelFolder [out] Set to the folder containing the raw model,
///                             where the optimized engine file is written.
/// @return true on success; false if the base-class step, the model file check,
///         or the engine build fails.
bool ANSRTYOLO::OptimizeModel(bool fp16, std::string& optimizedModelFolder) {
    std::lock_guard<std::recursive_mutex> lock(_mutex);
    // Base-class validation/bookkeeping runs first.
    if (!ANSODBase::OptimizeModel(fp16, optimizedModelFolder)) return false;
    if (!FileExist(_modelFilePath)) {
        _logger.LogFatal("ANSRTYOLO::OptimizeModel", "Raw model file path does not exist", __FILE__, __LINE__);
        return false;
    }
    try {
        _fp16 = fp16;
        // The optimized engine is emitted next to the raw ONNX model.
        optimizedModelFolder = GetParentFolder(_modelFilePath);
        if (!m_trtEngine) {
            // Mirror the model configuration into the TensorRT build options.
            // Note: this path creates a private engine (no shared pool), unlike
            // LoadModel/Initialize which acquire from EnginePoolManager.
            m_options.optBatchSize   = _modelConfig.gpuOptBatchSize;
            m_options.maxBatchSize   = _modelConfig.gpuMaxBatchSize;
            m_options.deviceIndex    = _modelConfig.gpuDeviceIndex;
            m_options.maxInputHeight = _modelConfig.maxInputHeight;
            m_options.minInputHeight = _modelConfig.minInputHeight;
            m_options.optInputHeight = _modelConfig.optInputHeight;
            m_options.maxInputWidth  = _modelConfig.maxInputWidth;
            m_options.minInputWidth  = _modelConfig.minInputWidth;
            m_options.optInputWidth  = _modelConfig.optInputWidth;
            m_options.engineFileDir  = optimizedModelFolder;
            m_options.precision      = (_fp16 ? Precision::FP16 : Precision::FP32);
            m_trtEngine = std::make_shared<Engine<float>>(m_options);
        }
        const bool succ = m_trtEngine->buildWithRetry(_modelFilePath, SUB_VALS, DIV_VALS, NORMALIZE);
        if (!succ) {
            _logger.LogError("ANSRTYOLO::OptimizeModel",
                "Error: Unable to build TensorRT engine. " + _modelFilePath, __FILE__, __LINE__);
            return false;
        }
        return true;
    }
    catch (const std::exception& e) { // catch by const reference (was non-const)
        _logger.LogFatal("ANSRTYOLO::OptimizeModel", e.what(), __FILE__, __LINE__);
        return false;
    }
}
// ====================================================================
// ANSODBase interface — LoadModel
// ====================================================================
/// Loads the model from an encrypted zip archive, sanitizes the configuration,
/// and (optionally) acquires a shared TensorRT engine from the pool.
/// @param modelZipFilePath Path to the model zip archive.
/// @param modelZipPassword Password for the archive.
/// @return true when the model (and engine, if _loadEngineOnCreation) is ready.
bool ANSRTYOLO::LoadModel(const std::string& modelZipFilePath,
                          const std::string& modelZipPassword) {
    std::lock_guard<std::recursive_mutex> lock(_mutex);
    try {
        _isFixedBatch = false;
        bool result = ANSODBase::LoadModel(modelZipFilePath, modelZipPassword);
        if (!result) return false;
        _modelConfig.modelType = ModelType::TENSORRT;
        // Clamp obviously-invalid configuration values to safe defaults.
        if (_modelConfig.inpHeight <= 0) _modelConfig.inpHeight = 640;
        if (_modelConfig.inpWidth <= 0) _modelConfig.inpWidth = 640;
        if (_modelConfig.modelMNSThreshold < 0.2f) _modelConfig.modelMNSThreshold = 0.5f;
        if (_modelConfig.modelConfThreshold < 0.2f) _modelConfig.modelConfThreshold = 0.5f;
        if (_modelConfig.numKPS <= 0 || _modelConfig.numKPS > 133)
            _modelConfig.numKPS = 17;
        if (_modelConfig.kpsThreshold <= 0) _modelConfig.kpsThreshold = 0.5f;
        // NOTE(review): unlike LoadModelFromFolder/Initialize, this path does not
        // set _modelConfig.precisionType — confirm whether that is intentional.
        _fp16 = true;
        TOP_K = 300;
        SEG_CHANNELS = 32;
        PROBABILITY_THRESHOLD = _modelConfig.detectionScoreThreshold;
        NMS_THRESHOLD = _modelConfig.modelMNSThreshold;
        SEGMENTATION_THRESHOLD = 0.5f;
        SEG_H = 160;
        SEG_W = 160;
        NUM_KPS = _modelConfig.numKPS;
        KPS_THRESHOLD = _modelConfig.kpsThreshold;
        // Mirror the model configuration into the TensorRT engine options.
        m_options.optBatchSize   = _modelConfig.gpuOptBatchSize;
        m_options.maxBatchSize   = _modelConfig.gpuMaxBatchSize;
        m_options.deviceIndex    = _modelConfig.gpuDeviceIndex;
        m_options.maxInputHeight = _modelConfig.maxInputHeight;
        m_options.minInputHeight = _modelConfig.minInputHeight;
        m_options.optInputHeight = _modelConfig.optInputHeight;
        m_options.maxInputWidth  = _modelConfig.maxInputWidth;
        m_options.minInputWidth  = _modelConfig.minInputWidth;
        m_options.optInputWidth  = _modelConfig.optInputWidth;
        m_options.engineFileDir  = _modelFolder;
        m_options.precision      = (_fp16 ? Precision::FP16 : Precision::FP32);
        _modelFilePath = CreateFilePath(_modelFolder, "train_last.onnx");
        // Classes + input shape come from the config file when present,
        // otherwise from classes.names (or an embedded string fallback).
        if (FileExist(_modelConfigFile)) {
            ModelType modelType;
            std::vector<int> inputShape;
            _classes = ANSUtilityHelper::GetConfigFileContent(_modelConfigFile, modelType, inputShape);
            if (inputShape.size() == 2) {
                if (inputShape[0] > 0) _modelConfig.inpHeight = inputShape[0];
                if (inputShape[1] > 0) _modelConfig.inpWidth = inputShape[1];
            }
        }
        else {
            _classFilePath = CreateFilePath(_modelFolder, "classes.names");
            std::ifstream isValid(_classFilePath);
            if (!isValid) LoadClassesFromString();
            else LoadClassesFromFile();
        }
        if (this->_loadEngineOnCreation) {
            if (!m_trtEngine) {
                // Acquire a shared engine keyed by (model path, precision, batch).
                m_poolKey = { _modelFilePath,
                    static_cast<int>(m_options.precision), m_options.maxBatchSize };
                m_trtEngine = EnginePoolManager<float>::instance().acquire(
                    m_poolKey, m_options, _modelFilePath,
                    SUB_VALS, DIV_VALS, NORMALIZE, m_maxSlotsPerGpu);
                m_usingSharedPool = (m_trtEngine != nullptr);
            }
            if (!m_trtEngine) {
                _logger.LogError("ANSRTYOLO::LoadModel",
                    "Error: Unable to load TensorRT engine. " + _modelFilePath, __FILE__, __LINE__);
                _modelLoadValid = false;
                return false;
            }
            // The pooled engine may have been built with different batch limits;
            // adopt its actual options so later batching logic is consistent.
            m_options.maxBatchSize = m_trtEngine->getOptions().maxBatchSize;
            m_options.optBatchSize = m_trtEngine->getOptions().optBatchSize;
            m_trtEngine->warmUp();
        }
        _modelLoadValid = true;
        _isInitialized = true;
        return true;
    }
    catch (const std::exception& e) { // catch by const reference (was non-const)
        _logger.LogFatal("ANSRTYOLO::LoadModel", e.what(), __FILE__, __LINE__);
        return false;
    }
}
// ====================================================================
// ANSODBase interface — LoadModelFromFolder
// ====================================================================
/// Loads a model from an already-extracted folder (no zip), sanitizes the
/// configuration, and (optionally) acquires a shared TensorRT engine.
/// @param licenseKey  Runtime license key (validated by the base class).
/// @param modelConfig Model configuration to adopt (values are clamped here).
/// @param modelName   ONNX base name without extension; defaults to "train_last".
/// @param className   Class-list file name used when no config file is present.
/// @param modelFolder Folder containing the model artifacts.
/// @param labelMap    [out] Comma-separated class labels.
/// @return true when the model (and engine, if _loadEngineOnCreation) is ready.
bool ANSRTYOLO::LoadModelFromFolder(std::string licenseKey, ModelConfig modelConfig,
                                    std::string modelName, std::string className,
                                    const std::string& modelFolder, std::string& labelMap) {
    std::lock_guard<std::recursive_mutex> lock(_mutex);
    try {
        _isFixedBatch = false;
        bool result = ANSODBase::LoadModelFromFolder(licenseKey, modelConfig,
            modelName, className, modelFolder, labelMap);
        if (!result) return false;
        _modelConfig = modelConfig;
        _modelConfig.modelType = ModelType::TENSORRT;
        // Clamp obviously-invalid configuration values to safe defaults.
        if (_modelConfig.inpHeight <= 0) _modelConfig.inpHeight = 640;
        if (_modelConfig.inpWidth <= 0) _modelConfig.inpWidth = 640;
        _modelConfig.precisionType = PrecisionType::FP32;
        if (_modelConfig.numKPS <= 0 || _modelConfig.numKPS > 133)
            _modelConfig.numKPS = 17;
        if (_modelConfig.modelMNSThreshold < 0.2f) _modelConfig.modelMNSThreshold = 0.5f;
        if (_modelConfig.modelConfThreshold < 0.2f) _modelConfig.modelConfThreshold = 0.5f;
        if (_modelConfig.kpsThreshold <= 0) _modelConfig.kpsThreshold = 0.5f;
        _fp16 = true;
        TOP_K = 300;
        SEG_CHANNELS = 32;
        PROBABILITY_THRESHOLD = _modelConfig.detectionScoreThreshold;
        NMS_THRESHOLD = _modelConfig.modelMNSThreshold;
        SEGMENTATION_THRESHOLD = 0.5f;
        SEG_H = 160;
        SEG_W = 160;
        NUM_KPS = _modelConfig.numKPS;
        KPS_THRESHOLD = _modelConfig.kpsThreshold;
        // Resolve the ONNX file name; empty modelName falls back to the default.
        std::string effectiveModelName = modelName;
        if (effectiveModelName.empty()) effectiveModelName = "train_last";
        std::string modelFullName = effectiveModelName + ".onnx";
        // Mirror the model configuration into the TensorRT engine options.
        m_options.optBatchSize   = _modelConfig.gpuOptBatchSize;
        m_options.maxBatchSize   = _modelConfig.gpuMaxBatchSize;
        m_options.deviceIndex    = _modelConfig.gpuDeviceIndex;
        m_options.maxInputHeight = _modelConfig.maxInputHeight;
        m_options.minInputHeight = _modelConfig.minInputHeight;
        m_options.optInputHeight = _modelConfig.optInputHeight;
        m_options.maxInputWidth  = _modelConfig.maxInputWidth;
        m_options.minInputWidth  = _modelConfig.minInputWidth;
        m_options.optInputWidth  = _modelConfig.optInputWidth;
        m_options.engineFileDir  = _modelFolder;
        m_options.precision      = (_fp16 ? Precision::FP16 : Precision::FP32);
        _modelFilePath = CreateFilePath(_modelFolder, modelFullName);
        // Classes + input shape come from the config file when present,
        // otherwise from the supplied class file (or an embedded fallback).
        if (FileExist(_modelConfigFile)) {
            ModelType modelType;
            std::vector<int> inputShape;
            _classes = ANSUtilityHelper::GetConfigFileContent(_modelConfigFile, modelType, inputShape);
            if (inputShape.size() == 2) {
                if (inputShape[0] > 0) _modelConfig.inpHeight = inputShape[0];
                if (inputShape[1] > 0) _modelConfig.inpWidth = inputShape[1];
            }
        }
        else {
            _classFilePath = CreateFilePath(_modelFolder, className);
            std::ifstream isValid(_classFilePath);
            if (!isValid) LoadClassesFromString();
            else LoadClassesFromFile();
        }
        labelMap.clear();
        if (!_classes.empty())
            labelMap = VectorToCommaSeparatedString(_classes);
        if (this->_loadEngineOnCreation) {
            if (!m_trtEngine) {
                // Acquire a shared engine keyed by (model path, precision, batch).
                m_poolKey = { _modelFilePath,
                    static_cast<int>(m_options.precision), m_options.maxBatchSize };
                m_trtEngine = EnginePoolManager<float>::instance().acquire(
                    m_poolKey, m_options, _modelFilePath,
                    SUB_VALS, DIV_VALS, NORMALIZE, m_maxSlotsPerGpu);
                m_usingSharedPool = (m_trtEngine != nullptr);
            }
            if (!m_trtEngine) {
                _logger.LogError("ANSRTYOLO::LoadModelFromFolder",
                    "Error: Unable to load TensorRT engine. " + _modelFilePath, __FILE__, __LINE__);
                _modelLoadValid = false;
                return false;
            }
            // Adopt the pooled engine's actual batch limits.
            m_options.maxBatchSize = m_trtEngine->getOptions().maxBatchSize;
            m_options.optBatchSize = m_trtEngine->getOptions().optBatchSize;
            m_trtEngine->warmUp();
        }
        _modelLoadValid = true;
        _isInitialized = true;
        return true;
    }
    catch (const std::exception& e) { // catch by const reference (was non-const)
        _logger.LogFatal("ANSRTYOLO::LoadModelFromFolder", e.what(), __FILE__, __LINE__);
        return false;
    }
}
// ====================================================================
// ANSODBase interface — Initialize
// ====================================================================
/// Full (re-)initialization from a model zip: validates the license via the
/// base class, sanitizes configuration, and loads the TensorRT engine unless
/// one is already active (re-initialization keeps the existing engine).
/// @param licenseKey       Runtime license key.
/// @param modelConfig      Model configuration to adopt (values clamped here).
/// @param modelZipFilePath Path to the model zip archive.
/// @param modelZipPassword Password for the archive.
/// @param labelMap         [out] Comma-separated class labels.
/// @return true when initialization succeeds.
bool ANSRTYOLO::Initialize(std::string licenseKey, ModelConfig modelConfig,
                           const std::string& modelZipFilePath,
                           const std::string& modelZipPassword,
                           std::string& labelMap) {
    std::lock_guard<std::recursive_mutex> lock(_mutex);
    try {
        // Remember whether a live engine already exists so a re-Initialize
        // does not re-acquire/warm up a second engine.
        const bool engineAlreadyLoaded = _modelLoadValid && _isInitialized && m_trtEngine != nullptr;
        _modelLoadValid = false;
        _isFixedBatch = false;
        bool result = ANSODBase::Initialize(licenseKey, modelConfig,
            modelZipFilePath, modelZipPassword, labelMap);
        if (!result) return false;
        _modelConfig = modelConfig;
        _modelConfig.modelType = ModelType::TENSORRT;
        // Clamp obviously-invalid configuration values to safe defaults.
        if (_modelConfig.inpHeight <= 0) _modelConfig.inpHeight = 640;
        if (_modelConfig.inpWidth <= 0) _modelConfig.inpWidth = 640;
        _modelConfig.precisionType = PrecisionType::FP32;
        if (_modelConfig.numKPS <= 0 || _modelConfig.numKPS > 133)
            _modelConfig.numKPS = 17;
        if (_modelConfig.modelMNSThreshold < 0.2f) _modelConfig.modelMNSThreshold = 0.5f;
        if (_modelConfig.modelConfThreshold < 0.2f) _modelConfig.modelConfThreshold = 0.5f;
        if (_modelConfig.kpsThreshold <= 0) _modelConfig.kpsThreshold = 0.5f;
        _fp16 = true;
        TOP_K = 300;
        SEG_CHANNELS = 32;
        PROBABILITY_THRESHOLD = _modelConfig.detectionScoreThreshold;
        NMS_THRESHOLD = _modelConfig.modelMNSThreshold;
        SEGMENTATION_THRESHOLD = 0.5f;
        SEG_H = 160;
        SEG_W = 160;
        NUM_KPS = _modelConfig.numKPS;
        KPS_THRESHOLD = _modelConfig.kpsThreshold;
        // Mirror the model configuration into the TensorRT engine options.
        m_options.optBatchSize   = _modelConfig.gpuOptBatchSize;
        m_options.maxBatchSize   = _modelConfig.gpuMaxBatchSize;
        m_options.deviceIndex    = _modelConfig.gpuDeviceIndex;
        m_options.maxInputHeight = _modelConfig.maxInputHeight;
        m_options.minInputHeight = _modelConfig.minInputHeight;
        m_options.optInputHeight = _modelConfig.optInputHeight;
        m_options.maxInputWidth  = _modelConfig.maxInputWidth;
        m_options.minInputWidth  = _modelConfig.minInputWidth;
        m_options.optInputWidth  = _modelConfig.optInputWidth;
        m_options.engineFileDir  = _modelFolder;
        m_options.precision      = (_fp16 ? Precision::FP16 : Precision::FP32);
        _modelFilePath = CreateFilePath(_modelFolder, "train_last.onnx");
        // Classes + input shape come from the config file when present,
        // otherwise from classes.names (or an embedded string fallback).
        if (FileExist(_modelConfigFile)) {
            ModelType modelType;
            std::vector<int> inputShape;
            _classes = ANSUtilityHelper::GetConfigFileContent(_modelConfigFile, modelType, inputShape);
            if (inputShape.size() == 2) {
                if (inputShape[0] > 0) _modelConfig.inpHeight = inputShape[0];
                if (inputShape[1] > 0) _modelConfig.inpWidth = inputShape[1];
            }
        }
        else {
            _classFilePath = CreateFilePath(_modelFolder, "classes.names");
            std::ifstream isValid(_classFilePath);
            if (!isValid) LoadClassesFromString();
            else LoadClassesFromFile();
        }
        labelMap.clear();
        if (!_classes.empty())
            labelMap = VectorToCommaSeparatedString(_classes);
        if (this->_loadEngineOnCreation && !engineAlreadyLoaded) {
            if (!m_trtEngine) {
                // Acquire a shared engine keyed by (model path, precision, batch).
                m_poolKey = { _modelFilePath,
                    static_cast<int>(m_options.precision), m_options.maxBatchSize };
                m_trtEngine = EnginePoolManager<float>::instance().acquire(
                    m_poolKey, m_options, _modelFilePath,
                    SUB_VALS, DIV_VALS, NORMALIZE, m_maxSlotsPerGpu);
                m_usingSharedPool = (m_trtEngine != nullptr);
            }
            if (!m_trtEngine) {
                _logger.LogError("ANSRTYOLO::Initialize",
                    "Error: Unable to load TensorRT engine. " + _modelFilePath, __FILE__, __LINE__);
                _modelLoadValid = false;
                return false;
            }
            // Adopt the pooled engine's actual batch limits.
            m_options.maxBatchSize = m_trtEngine->getOptions().maxBatchSize;
            m_options.optBatchSize = m_trtEngine->getOptions().optBatchSize;
            m_trtEngine->warmUp();
        }
        _modelLoadValid = true;
        _isInitialized = true;
        return true;
    }
    catch (const std::exception& e) { // catch by const reference (was non-const)
        _logger.LogFatal("ANSRTYOLO::Initialize", e.what(), __FILE__, __LINE__);
        return false;
    }
}
// ====================================================================
// RunInference / RunInferencesBatch / Destroy / Destructor
// ====================================================================
/// Convenience overload: runs inference with no camera identifier attached.
std::vector<Object> ANSRTYOLO::RunInference(const cv::Mat& inputImgBGR) {
    return RunInference(inputImgBGR, std::string());
}
/// Runs single-image object detection.
/// State checks happen under the lock; the (potentially slow) detection call
/// runs after the lock is released so concurrent callers are not serialized.
/// @param inputImgBGR BGR input frame (rejected when empty or under 10x10).
/// @param camera_id   Identifier forwarded to the detector.
/// @return Detected objects, or an empty vector on any validation/runtime error.
std::vector<Object> ANSRTYOLO::RunInference(const cv::Mat& inputImgBGR,
                                            const std::string& camera_id) {
    {
        std::lock_guard<std::recursive_mutex> guard(_mutex);
        if (!_modelLoadValid) {
            _logger.LogError("ANSRTYOLO::RunInference", "Cannot load TensorRT model", __FILE__, __LINE__);
            return {};
        }
        if (!_licenseValid) {
            _logger.LogError("ANSRTYOLO::RunInference", "Invalid license", __FILE__, __LINE__);
            return {};
        }
        if (!_isInitialized) {
            _logger.LogError("ANSRTYOLO::RunInference", "Model not initialized", __FILE__, __LINE__);
            return {};
        }
        // Silently drop degenerate frames (matches existing behavior: no log).
        const bool frameTooSmall =
            inputImgBGR.empty() || inputImgBGR.cols < 10 || inputImgBGR.rows < 10;
        if (frameTooSmall) return {};
    }
    try {
        return DetectObjects(inputImgBGR, camera_id);
    }
    catch (const std::exception& e) {
        _logger.LogFatal("ANSRTYOLO::RunInference", e.what(), __FILE__, __LINE__);
        return {};
    }
}
/// Runs batched object detection over a vector of frames.
/// Fixed-batch models delegate to the base-class implementation; dynamic
/// batches use the local DetectObjectsBatch path. Validation is done under
/// the lock, inference outside it.
/// @param inputs    Batch of BGR frames (must be non-empty).
/// @param camera_id Identifier forwarded to the detector.
/// @return One result vector per input frame, or empty on error.
std::vector<std::vector<Object>> ANSRTYOLO::RunInferencesBatch(
    const std::vector<cv::Mat>& inputs, const std::string& camera_id) {
    {
        std::lock_guard<std::recursive_mutex> guard(_mutex);
        if (!_modelLoadValid) {
            _logger.LogFatal("ANSRTYOLO::RunInferencesBatch", "Cannot load the TensorRT model", __FILE__, __LINE__);
            return {};
        }
        if (!_licenseValid) {
            _logger.LogFatal("ANSRTYOLO::RunInferencesBatch", "Runtime license is not valid or expired", __FILE__, __LINE__);
            return {};
        }
        if (!_isInitialized) {
            _logger.LogFatal("ANSRTYOLO::RunInferencesBatch", "Initialisation is not valid", __FILE__, __LINE__);
            return {};
        }
        if (inputs.empty()) {
            _logger.LogFatal("ANSRTYOLO::RunInferencesBatch", "Input images vector is empty", __FILE__, __LINE__);
            return {};
        }
    }
    try {
        return _isFixedBatch ? ANSODBase::RunInferencesBatch(inputs, camera_id)
                             : DetectObjectsBatch(inputs, camera_id);
    }
    catch (const std::exception& e) {
        _logger.LogFatal("ANSRTYOLO::RunInferencesBatch", e.what(), __FILE__, __LINE__);
        return {};
    }
}
/// Destructor: releases GPU/engine resources via Destroy().
/// Destructors must never leak exceptions, so failures are logged and swallowed.
ANSRTYOLO::~ANSRTYOLO() {
    try { Destroy(); }
    catch (const std::exception& e) { // catch by const reference (was non-const)
        _logger.LogError("ANSRTYOLO::~ANSRTYOLO()", e.what(), __FILE__, __LINE__);
    }
}
/// Releases the TensorRT engine (returning our reference to the shared pool
/// when one was acquired) and tears down the NV12 preprocessing helper.
/// Safe to call more than once.
/// @return true on success; false if cleanup threw.
bool ANSRTYOLO::Destroy() {
    try {
        if (m_usingSharedPool) {
            // Release our reference; the pooled engine is destroyed only when
            // all tasks have released it.
            EnginePoolManager<float>::instance().release(m_poolKey);
            m_usingSharedPool = false;
        }
        // Drop our shared_ptr in either case (pool may keep the engine alive).
        m_trtEngine.reset();
        m_nv12Helper.destroy();
        return true;
    }
    catch (const std::exception& e) { // catch by const reference (was non-const)
        _logger.LogError("ANSRTYOLO::Destroy()", e.what(), __FILE__, __LINE__);
        return false;
    }
}
// ====================================================================
// GPU Preprocessing — single image (pinned-memory H2D path)
//
// 1. Copy raw host image into a pinned (page-locked) buffer
// 2. Upload from pinned memory → GPU (DMA, no staging copy)
// 3. BGR→RGB colour conversion on GPU
// 4. Letterbox resize on GPU (right-bottom pad)
//
// Pinned memory eliminates the internal pageable→pinned staging
// copy that CUDA performs for normal (pageable) host memory,
// cutting the H2D transfer of a 3840×2160 BGR frame (~24 MB)
// by 60-70%.
// ====================================================================
/// GPU preprocessing for a single BGR (or grayscale) frame:
/// upload -> BGR2RGB -> letterbox resize (or plain resize for classifiers).
/// @param inputImage Host image; 1-channel frames are expanded to BGR on CPU.
/// @param outMeta    [out] Original dimensions and the letterbox scale ratio.
/// @return One-element batch of GPU mats ready for the engine, or {} on error.
std::vector<std::vector<cv::cuda::GpuMat>> ANSRTYOLO::Preprocess(
    const cv::Mat& inputImage, ImageMetadata& outMeta) {
    std::lock_guard<std::recursive_mutex> lock(_mutex);
    try {
        if (!_licenseValid) {
            _logger.LogFatal("ANSRTYOLO::Preprocess", "Invalid license", __FILE__, __LINE__);
            return {};
        }
        // Guard: the engine may not exist yet (e.g. engine load was deferred).
        // The original code dereferenced m_trtEngine unconditionally.
        if (!m_trtEngine) {
            _logger.LogFatal("ANSRTYOLO::Preprocess", "TensorRT engine is not loaded", __FILE__, __LINE__);
            return {};
        }
        const auto& inputDims = m_trtEngine->getInputDims();
        if (inputDims.empty()) { // mirrors the check PreprocessBatch performs
            _logger.LogFatal("ANSRTYOLO::Preprocess", "No input dimensions available", __FILE__, __LINE__);
            return {};
        }
        const int inputH = inputDims[0].d[1];
        const int inputW = inputDims[0].d[2];
        // Early-out if CUDA context is dead (sticky error from CUVID crash etc.)
        if (!m_nv12Helper.isCudaContextHealthy(_logger, "ANSRTYOLO")) return {};
        cv::cuda::Stream stream;
        cv::cuda::GpuMat gpuImg;
        // Resolve source Mat (handle grayscale -> BGR on CPU first)
        if (inputImage.channels() == 1) {
            cv::Mat img3Channel;
            cv::cvtColor(inputImage, img3Channel, cv::COLOR_GRAY2BGR);
            gpuImg.upload(img3Channel, stream);
        } else {
            gpuImg.upload(inputImage, stream);
        }
        // GPU: BGR -> RGB
        cv::cuda::GpuMat gpuRGB;
        cv::cuda::cvtColor(gpuImg, gpuRGB, cv::COLOR_BGR2RGB, 0, stream);
        outMeta.imgHeight = static_cast<float>(gpuRGB.rows);
        outMeta.imgWidth = static_cast<float>(gpuRGB.cols);
        if (outMeta.imgHeight > 0 && outMeta.imgWidth > 0) {
            // Inverse of the letterbox scale: maps model coords back to source.
            // NOTE(review): PreprocessBatch forces ratio = 1 for classification
            // models; this single-image path does not — confirm intended.
            outMeta.ratio = 1.f / std::min(
                inputW / static_cast<float>(gpuRGB.cols),
                inputH / static_cast<float>(gpuRGB.rows));
            // Check if model is classification (output ndims <= 2)
            const auto& outputDims = m_trtEngine->getOutputDims();
            const bool isClassification = !outputDims.empty() && outputDims[0].nbDims <= 2;
            cv::cuda::GpuMat gpuResized;
            if (gpuRGB.rows != inputH || gpuRGB.cols != inputW) {
                if (isClassification) {
                    // Classification: direct resize (no letterbox padding)
                    cv::cuda::resize(gpuRGB, gpuResized, cv::Size(inputW, inputH),
                        0, 0, cv::INTER_LINEAR, stream);
                }
                else {
                    // Detection/Seg/Pose/OBB: letterbox resize + right-bottom pad (on GPU)
                    gpuResized = Engine<float>::resizeKeepAspectRatioPadRightBottom(
                        gpuRGB, inputH, inputW);
                }
            } else {
                gpuResized = gpuRGB;
            }
            stream.waitForCompletion();
            std::vector<cv::cuda::GpuMat> input{ std::move(gpuResized) };
            std::vector<std::vector<cv::cuda::GpuMat>> inputs{ std::move(input) };
            return inputs;
        }
        else {
            _logger.LogFatal("ANSRTYOLO::Preprocess",
                "Image height or width is zero (Width: " + std::to_string(outMeta.imgWidth) +
                ", Height: " + std::to_string(outMeta.imgHeight) + ")", __FILE__, __LINE__);
            return {};
        }
    }
    catch (const std::exception& e) {
        _logger.LogWarn("ANSRTYOLO::Preprocess", std::string("Skipped frame: ") + e.what(), __FILE__, __LINE__);
        return {};
    }
}
#if 0 // PreprocessFromNV12 — moved to NV12PreprocessHelper::tryNV12()
try {
if (!gpuData || !gpuData->yPlane || !gpuData->uvPlane) {
if (!m_nv12NullLogged) {
m_nv12NullLogged = true;
_logger.LogWarn("ANSRTYOLO::PreprocessFromNV12",
"Early exit: null data — gpuData=" + std::to_string(gpuData != nullptr) +
" yPlane=" + std::to_string(gpuData ? (gpuData->yPlane != nullptr) : false) +
" uvPlane=" + std::to_string(gpuData ? (gpuData->uvPlane != nullptr) : false) +
" isCuda=" + std::to_string(gpuData ? gpuData->isCudaDevicePtr : false),
__FILE__, __LINE__);
}
return {};
}
const auto& inputDims = m_trtEngine->getInputDims();
const int inputH = inputDims[0].d[1];
const int inputW = inputDims[0].d[2];
const int frameW = gpuData->width;
const int frameH = gpuData->height;
if (frameW <= 0 || frameH <= 0) {
if (!m_nv12DimLogged) {
m_nv12DimLogged = true;
_logger.LogWarn("ANSRTYOLO::PreprocessFromNV12",
"Early exit: bad dimensions — w=" + std::to_string(frameW) +
" h=" + std::to_string(frameH),
__FILE__, __LINE__);
}
return {};
}
// Early-out if CUDA context is dead (sticky error from CUVID crash etc.)
if (m_cudaContextDead) {
if (!m_nv12DeadLogged) {
m_nv12DeadLogged = true;
_logger.LogWarn("ANSRTYOLO::PreprocessFromNV12",
"Early exit: CUDA context dead",
__FILE__, __LINE__);
}
return {};
}
// Cache flag before lock is released — gpuData may be invalidated after unlock
const bool isCudaDevice = gpuData->isCudaDevicePtr;
// ── GPU index validation for zero-copy ──
// NVDEC device pointers are only valid on the CUDA context that decoded them.
// If decode GPU != inference GPU, wrapping those pointers causes
// "illegal memory access" → sticky CUDA error → entire context dies.
// Fall back to CPU memcpy+upload path when GPUs don't match.
const int inferenceGpu = m_trtEngine ? m_trtEngine->getPreferredDeviceIndex() : 0;
const bool gpuMatch = !isCudaDevice ||
gpuData->gpuIndex < 0 || // unknown = trust it
gpuData->gpuIndex == inferenceGpu;
const bool useZeroCopy = isCudaDevice && gpuMatch;
// Local plane pointers — default to gpuData's primary planes.
// Overridden below for cross-GPU fallback (CPU NV12 instead of CUDA).
uint8_t* effYPlane = gpuData->yPlane;
uint8_t* effUvPlane = gpuData->uvPlane;
int effYLinesize = gpuData->yLinesize;
int effUvLinesize = gpuData->uvLinesize;
if (isCudaDevice && !gpuMatch) {
// Cross-GPU: NV12 decoded on one GPU, inference on another.
// CPU NV12 fallback uploads full decode-res NV12 (e.g. 3840x2160 = 12.4 MB)
// over PCIe, which is SLOWER than BGR at display-res (1920x1080 = 6.2 MB).
// Measured: CPU NV12 cross-GPU = 15-39ms preproc vs BGR = 10-20ms.
// Just fall back to BGR — it's faster for the cross-GPU case.
if (!m_gpuMismatchLogged) {
m_gpuMismatchLogged = true;
_logger.LogInfo("ANSRTYOLO::PreprocessFromNV12",
"GPU mismatch (decode GPU " + std::to_string(gpuData->gpuIndex) +
" vs inference GPU " + std::to_string(inferenceGpu) +
") — skipping NV12, using BGR (faster for cross-GPU: "
"BGR uploads " + std::to_string(displayW * displayH * 3 / 1024) +
"KB display-res vs NV12 " + std::to_string(frameW * frameH * 3 / 2 / 1024) +
"KB full-res)",
__FILE__, __LINE__);
}
if (regLock.owns_lock()) regLock.unlock();
return {}; // caller will use Preprocess(BGR) instead
}
// Diagnostic: log which path will be taken (once per instance)
// Note: cross-GPU case already returned {} above, so reaching here
// means either CUDA zero-copy (same GPU) or CPU NV12 upload (non-CUDA).
if (!m_nv12PathLogged) {
m_nv12PathLogged = true;
const char* pathName = useZeroCopy ? "CUDA_ZERO_COPY"
: "CPU_NV12_UPLOAD";
_logger.LogInfo("ANSRTYOLO::PreprocessFromNV12",
std::string("Path: ") + pathName +
" | isCuda=" + std::to_string(isCudaDevice) +
" gpuMatch=" + std::to_string(gpuMatch) +
" decodeGpu=" + std::to_string(gpuData->gpuIndex) +
" infGpu=" + std::to_string(inferenceGpu) +
" frame=" + std::to_string(frameW) + "x" + std::to_string(frameH) +
" effYLine=" + std::to_string(effYLinesize) +
" effUvLine=" + std::to_string(effUvLinesize) +
" effYPtr=0x" + std::to_string(reinterpret_cast<uintptr_t>(effYPlane)) +
" hasCpuFallback=" + std::to_string(gpuData->cpuYPlane != nullptr),
__FILE__, __LINE__);
}
cv::cuda::Stream stream;
cv::cuda::GpuMat gpuY, gpuUV;
if (useZeroCopy) {
// ── CUDA zero-copy: wrap NVDEC device pointers directly ──
// No memcpy, no device-to-device copy — data stays in NVDEC VRAM.
// The fused letterbox kernel samples only ~409K pixels from the 4K
// source (vs 8.3M full copy), completing in <1ms on RTX 5080.
// We hold the registry lock until the kernel finishes reading.
gpuY = cv::cuda::GpuMat(frameH, frameW, CV_8UC1,
effYPlane, static_cast<size_t>(effYLinesize));
gpuUV = cv::cuda::GpuMat(frameH / 2, frameW, CV_8UC1,
effUvPlane, static_cast<size_t>(effUvLinesize));
// Lock released after kernel completion (stream.waitForCompletion below)
} else {
// ── CPU path: memcpy + upload (fallback for D3D11VA / sw decode) ──
// Hold registry lock during memcpy so the AVFrame can't be freed
// by another thread calling gpu_frame_attach() on the same key.
const size_t ySize = static_cast<size_t>(frameW) * frameH;
const size_t uvSize = static_cast<size_t>(frameW) * frameH / 2;
const size_t nv12Size = ySize + uvSize;
ensurePinnedBuffer(nv12Size);
if (!m_pinnedBuf) {
if (!m_nv12PinnedLogged) {
m_nv12PinnedLogged = true;
_logger.LogWarn("ANSRTYOLO::PreprocessFromNV12",
"Early exit: pinned buffer alloc failed for " +
std::to_string(nv12Size) + " bytes",
__FILE__, __LINE__);
}
return {};
}
// Validate NV12 plane pointers before memcpy
const size_t yBufNeeded = (effYLinesize == frameW)
? ySize
: static_cast<size_t>(effYLinesize) * frameH;
const size_t uvBufNeeded = (effUvLinesize == frameW)
? uvSize
: static_cast<size_t>(effUvLinesize) * (frameH / 2);
if (!isMemoryReadable(effYPlane, std::min(yBufNeeded, (size_t)4096)) ||
!isMemoryReadable(effUvPlane, std::min(uvBufNeeded, (size_t)4096))) {
_logger.LogWarn("ANSRTYOLO::PreprocessFromNV12",
"NV12 plane pointers not readable! yPlane=0x" +
std::to_string(reinterpret_cast<uintptr_t>(effYPlane)) +
" uvPlane=0x" +
std::to_string(reinterpret_cast<uintptr_t>(effUvPlane)) +
" yLinesize=" + std::to_string(effYLinesize) +
" uvLinesize=" + std::to_string(effUvLinesize) +
" w=" + std::to_string(frameW) + " h=" + std::to_string(frameH),
__FILE__, __LINE__);
if (regLock.owns_lock()) regLock.unlock();
return {}; // fall back to BGR
}
uint8_t* dst = static_cast<uint8_t*>(m_pinnedBuf);
bool cpyOk = true;
if (effYLinesize == frameW) {
cpyOk = safeMemcpy(dst, effYPlane, ySize);
} else {
for (int row = 0; row < frameH && cpyOk; row++)
cpyOk = safeMemcpy(dst + row * frameW,
effYPlane + row * effYLinesize, frameW);
}
if (cpyOk) {
uint8_t* uvDst = dst + ySize;
if (effUvLinesize == frameW) {
cpyOk = safeMemcpy(uvDst, effUvPlane, uvSize);
} else {
for (int row = 0; row < frameH / 2 && cpyOk; row++)
cpyOk = safeMemcpy(uvDst + row * frameW,
effUvPlane + row * effUvLinesize, frameW);
}
}
if (!cpyOk) {
_logger.LogWarn("ANSRTYOLO::PreprocessFromNV12",
"Access violation during NV12 memcpy! Falling back to BGR. "
"yPlane=0x" + std::to_string(reinterpret_cast<uintptr_t>(effYPlane)) +
" uvPlane=0x" + std::to_string(reinterpret_cast<uintptr_t>(effUvPlane)) +
" yLinesize=" + std::to_string(effYLinesize) +
" uvLinesize=" + std::to_string(effUvLinesize) +
" w=" + std::to_string(frameW) + " h=" + std::to_string(frameH) +
" avframe=0x" + std::to_string(reinterpret_cast<uintptr_t>(gpuData->avframe)),
__FILE__, __LINE__);
if (regLock.owns_lock()) regLock.unlock();
return {}; // fall back to BGR
}
// NV12 data safely in pinned memory — release registry lock.
// From here on we only read from m_pinnedBuf, not from gpuData.
if (regLock.owns_lock()) regLock.unlock();
cv::Mat pinnedY(frameH, frameW, CV_8UC1, m_pinnedBuf);
cv::Mat pinnedUV(frameH / 2, frameW, CV_8UC1,
static_cast<uint8_t*>(m_pinnedBuf) + ySize);
gpuY.upload(pinnedY, stream);
gpuUV.upload(pinnedUV, stream);
}
// Use display dimensions for coordinate mapping so postprocessed
// bboxes map to the display image (1080p), not the NV12 source (4K).
const float metaW = (displayW > 0) ? static_cast<float>(displayW) : static_cast<float>(frameW);
const float metaH = (displayH > 0) ? static_cast<float>(displayH) : static_cast<float>(frameH);
outMeta.imgWidth = metaW;
outMeta.imgHeight = metaH;
if (outMeta.imgHeight > 0 && outMeta.imgWidth > 0) {
outMeta.ratio = 1.f / std::min(
inputDims[0].d[2] / metaW,
inputDims[0].d[1] / metaH);
const auto& outputDims = m_trtEngine->getOutputDims();
const bool isClassification = !outputDims.empty() && outputDims[0].nbDims <= 2;
cudaStream_t rawStream = cv::cuda::StreamAccessor::getStream(stream);
cv::cuda::GpuMat gpuResized;
if (isClassification) {
// Classification: NV12→RGB at full resolution, then simple resize
cv::cuda::GpuMat gpuRGB(frameH, frameW, CV_8UC3);
launchNV12ToRGB(
gpuY.ptr<uint8_t>(), static_cast<int>(gpuY.step),
gpuUV.ptr<uint8_t>(), static_cast<int>(gpuUV.step),
gpuRGB.ptr<uint8_t>(), static_cast<int>(gpuRGB.step),
frameW, frameH, rawStream);
cv::cuda::resize(gpuRGB, gpuResized, cv::Size(inputW, inputH),
0, 0, cv::INTER_LINEAR, stream);
} else if (frameW == inputW && frameH == inputH) {
// Source matches model input — direct NV12→RGB, no resize needed
gpuResized.create(inputH, inputW, CV_8UC3);
launchNV12ToRGB(
gpuY.ptr<uint8_t>(), static_cast<int>(gpuY.step),
gpuUV.ptr<uint8_t>(), static_cast<int>(gpuUV.step),
gpuResized.ptr<uint8_t>(), static_cast<int>(gpuResized.step),
frameW, frameH, rawStream);
} else {
// Detection: fused NV12→RGB + letterbox in a SINGLE kernel at
// output resolution (e.g. 640×640). This avoids the 24MB 4K RGB
// intermediate and processes 20× fewer pixels than separate
// convert + resize for 4K→640 downscale.
float r = std::min(static_cast<float>(inputW) / frameW,
static_cast<float>(inputH) / frameH);
int unpadW = static_cast<int>(r * frameW);
int unpadH = static_cast<int>(r * frameH);
float invScale = 1.0f / r; // maps output coords → source coords
gpuResized.create(inputH, inputW, CV_8UC3);
launchNV12ToRGBLetterbox(
gpuY.ptr<uint8_t>(), static_cast<int>(gpuY.step),
gpuUV.ptr<uint8_t>(), static_cast<int>(gpuUV.step),
gpuResized.ptr<uint8_t>(), static_cast<int>(gpuResized.step),
inputW, inputH,
frameW, frameH,
unpadW, unpadH,
invScale, rawStream);
}
stream.waitForCompletion();
// Release registry lock now that kernel is done reading NVDEC pointers
if (regLock.owns_lock()) regLock.unlock();
// Log NV12 fast-path usage once per instance
if (!m_nv12ActiveLogged) {
m_nv12ActiveLogged = true;
const char* mode = useZeroCopy ? "CUDA zero-copy" : "CPU upload";
const char* kernel = isClassification ? "separate" : "FUSED letterbox";
_logger.LogInfo("ANSRTYOLO::PreprocessFromNV12",
std::string(mode) + " ACTIVE (" + kernel + "): " +
std::to_string(frameW) + "x" + std::to_string(frameH) +
" NV12 -> " + std::to_string(inputW) + "x" + std::to_string(inputH) +
" display=" + std::to_string(displayW) + "x" + std::to_string(displayH),
__FILE__, __LINE__);
}
std::vector<cv::cuda::GpuMat> input{ std::move(gpuResized) };
std::vector<std::vector<cv::cuda::GpuMat>> inputs{ std::move(input) };
return inputs;
}
{
if (!m_nv12MetaLogged) {
m_nv12MetaLogged = true;
_logger.LogWarn("ANSRTYOLO::PreprocessFromNV12",
"Early exit: metadata dims invalid — metaW=" +
std::to_string(outMeta.imgWidth) + " metaH=" +
std::to_string(outMeta.imgHeight) +
" displayW=" + std::to_string(displayW) +
" displayH=" + std::to_string(displayH),
__FILE__, __LINE__);
}
}
return {};
}
catch (const std::exception& e) {
_logger.LogWarn("ANSRTYOLO::PreprocessFromNV12",
std::string("NV12 fast path failed, falling back to BGR: ") + e.what(),
__FILE__, __LINE__);
return {};
}
}
#endif // PreprocessFromNV12 moved to NV12PreprocessHelper
// ====================================================================
// GPU Preprocessing — batch
// ====================================================================
/// GPU preprocessing for a batch of frames:
/// per-image upload -> BGR2RGB -> letterbox/plain resize, on a single stream.
/// @param inputImages Host frames; 1- or 3-channel only.
/// @param outMetadata [out] Per-image dimensions and scale ratios.
/// @return A single batch (vector of GPU mats) for the engine, or {} on error.
std::vector<std::vector<cv::cuda::GpuMat>> ANSRTYOLO::PreprocessBatch(
    const std::vector<cv::Mat>& inputImages, BatchMetadata& outMetadata) {
    if (!_licenseValid) {
        _logger.LogError("ANSRTYOLO::PreprocessBatch", "Invalid license", __FILE__, __LINE__);
        return {};
    }
    if (inputImages.empty()) {
        _logger.LogError("ANSRTYOLO::PreprocessBatch", "Empty input images vector", __FILE__, __LINE__);
        return {};
    }
    if (!m_nv12Helper.isCudaContextHealthy(_logger, "ANSRTYOLO")) return {};
    try {
        // Guard: the engine may not exist yet (engine load can be deferred).
        // The original code dereferenced m_trtEngine unconditionally.
        if (!m_trtEngine) {
            _logger.LogError("ANSRTYOLO::PreprocessBatch", "TensorRT engine is not loaded", __FILE__, __LINE__);
            return {};
        }
        const auto& inputDims = m_trtEngine->getInputDims();
        if (inputDims.empty()) {
            _logger.LogError("ANSRTYOLO::PreprocessBatch", "No input dimensions available", __FILE__, __LINE__);
            return {};
        }
        const int inputH = inputDims[0].d[1];
        const int inputW = inputDims[0].d[2];
        if (inputH <= 0 || inputW <= 0) {
            _logger.LogError("ANSRTYOLO::PreprocessBatch", "Invalid model input dimensions", __FILE__, __LINE__);
            return {};
        }
        // Loop-invariant: model kind does not change per image, so hoist the
        // classification check out of the per-image loop (was recomputed each
        // iteration in the original).
        const auto& outputDims = m_trtEngine->getOutputDims();
        const bool isClassification = !outputDims.empty() && outputDims[0].nbDims <= 2;
        outMetadata.imgHeights.resize(inputImages.size());
        outMetadata.imgWidths.resize(inputImages.size());
        outMetadata.ratios.resize(inputImages.size());
        std::vector<cv::cuda::GpuMat> batchProcessed;
        batchProcessed.reserve(inputImages.size());
        cv::cuda::Stream stream;
        for (size_t i = 0; i < inputImages.size(); ++i) {
            const auto& inputImage = inputImages[i];
            if (inputImage.empty()) {
                _logger.LogError("ANSRTYOLO::PreprocessBatch",
                    "Empty input image at index " + std::to_string(i), __FILE__, __LINE__);
                return {};
            }
            cv::cuda::GpuMat img;
            if (inputImage.channels() == 1) {
                // Grayscale: expand to BGR on CPU before upload.
                cv::Mat img3Channel;
                cv::cvtColor(inputImage, img3Channel, cv::COLOR_GRAY2BGR);
                img.upload(img3Channel, stream);
            }
            else if (inputImage.channels() == 3) {
                img.upload(inputImage, stream);
            }
            else {
                _logger.LogError("ANSRTYOLO::PreprocessBatch",
                    "Unsupported channel count at index " + std::to_string(i), __FILE__, __LINE__);
                return {};
            }
            cv::cuda::GpuMat imgRGB;
            cv::cuda::cvtColor(img, imgRGB, cv::COLOR_BGR2RGB, 0, stream);
            outMetadata.imgHeights[i] = imgRGB.rows;
            outMetadata.imgWidths[i] = imgRGB.cols;
            if (outMetadata.imgHeights[i] <= 0 || outMetadata.imgWidths[i] <= 0) {
                _logger.LogError("ANSRTYOLO::PreprocessBatch",
                    "Invalid dimensions for image " + std::to_string(i), __FILE__, __LINE__);
                return {};
            }
            // Inverse letterbox scale (identity for classification models).
            const float scaleW = inputW / static_cast<float>(imgRGB.cols);
            const float scaleH = inputH / static_cast<float>(imgRGB.rows);
            outMetadata.ratios[i] = isClassification ? 1.f : 1.f / std::min(scaleW, scaleH);
            cv::cuda::GpuMat resized;
            if (imgRGB.rows != inputH || imgRGB.cols != inputW) {
                if (isClassification) {
                    cv::cuda::resize(imgRGB, resized, cv::Size(inputW, inputH), 0, 0, cv::INTER_LINEAR, stream);
                } else {
                    resized = Engine<float>::resizeKeepAspectRatioPadRightBottom(imgRGB, inputH, inputW);
                }
            }
            else {
                resized = imgRGB;
            }
            batchProcessed.push_back(std::move(resized));
        }
        stream.waitForCompletion();
        std::vector<std::vector<cv::cuda::GpuMat>> inputs;
        inputs.push_back(std::move(batchProcessed));
        return inputs;
    }
    catch (const std::exception& e) {
        _logger.LogWarn("ANSRTYOLO::PreprocessBatch", std::string("Skipped batch: ") + e.what(), __FILE__, __LINE__);
        return {};
    }
}
// ====================================================================
// OBB NMS helpers (Prob-IoU based) — static methods
// ====================================================================
void ANSRTYOLO::getCovarianceComponents(const OrientedBox& box,
    float& out1, float& out2, float& out3) {
    // Emits the 2D Gaussian covariance components (a, b, c) of an oriented
    // box, modelling it as a uniform distribution: per-axis variance is
    // side^2 / 12, rotated by the box angle (radians).
    // Degenerate boxes contribute a zero covariance.
    if (box.width <= 0.f || box.height <= 0.f) {
        out1 = 0.f;
        out2 = 0.f;
        out3 = 0.f;
        return;
    }
    const float varW = (box.width * box.width) / 12.0f;
    const float varH = (box.height * box.height) / 12.0f;
    const float c = std::cos(box.angle);
    const float s = std::sin(box.angle);
    const float c2 = c * c;
    const float s2 = s * s;
    // Rotate the diagonal covariance diag(varW, varH) by the box angle.
    out1 = varW * c2 + varH * s2;
    out2 = varW * s2 + varH * c2;
    out3 = (varW - varH) * (s * c);
}
std::vector<std::vector<float>> ANSRTYOLO::batchProbiou(
    const std::vector<OrientedBox>& obb1,
    const std::vector<OrientedBox>& obb2, float eps) {
    // Computes the pairwise probabilistic IoU (Prob-IoU) between two sets of
    // oriented boxes: each box is modelled as a 2D Gaussian and similarity is
    // 1 - Hellinger distance derived from the Bhattacharyya distance.
    // Returns an n1 x n2 matrix; entries stay 0 when the summed covariance
    // determinant degenerates. eps guards all divisions/logs.
    if (obb1.empty() || obb2.empty()) return {};
    const size_t n1 = obb1.size(), n2 = obb2.size();
    std::vector<std::vector<float>> iouMat(n1, std::vector<float>(n2, 0.f));
    struct CovData { float x, y, a, b, c; };
    // Precompute covariance components for BOTH sets once. The previous
    // version recomputed obb2's components n1 times inside the inner loop.
    std::vector<CovData> cov1(n1);
    for (size_t i = 0; i < n1; ++i) {
        float a, b, c;
        getCovarianceComponents(obb1[i], a, b, c);
        cov1[i] = { obb1[i].x, obb1[i].y, a, b, c };
    }
    std::vector<CovData> cov2(n2);
    for (size_t j = 0; j < n2; ++j) {
        float a, b, c;
        getCovarianceComponents(obb2[j], a, b, c);
        cov2[j] = { obb2[j].x, obb2[j].y, a, b, c };
    }
    for (size_t i = 0; i < n1; ++i) {
        for (size_t j = 0; j < n2; ++j) {
            float dx = cov1[i].x - cov2[j].x;
            float dy = cov1[i].y - cov2[j].y;
            float sA = cov1[i].a + cov2[j].a, sB = cov1[i].b + cov2[j].b, sC = cov1[i].c + cov2[j].c;
            float denom = sA * sB - sC * sC + eps;
            if (denom <= eps) continue; // degenerate covariance sum
            // Bhattacharyya distance terms.
            float t1 = ((sA*dy*dy + sB*dx*dx) * 0.25f) / denom;
            float t2 = ((sC*dx*dy) * -0.5f) / denom;
            float d1 = cov1[i].a*cov1[i].b - cov1[i].c*cov1[i].c;
            float d2 = cov2[j].a*cov2[j].b - cov2[j].c*cov2[j].c;
            float sqrtDet = std::sqrt(std::max(d1, 0.f) * std::max(d2, 0.f) + eps);
            float t3 = 0.5f * std::log((sA*sB - sC*sC) / (4.f*sqrtDet) + eps);
            float bd = std::clamp(t1 + t2 + t3, eps, 100.f);
            // Hellinger distance -> similarity in [0, 1].
            float hd = std::sqrt(1.f - std::exp(-bd) + eps);
            iouMat[i][j] = 1.f - hd;
        }
    }
    return iouMat;
}
std::vector<int> ANSRTYOLO::nmsRotatedImpl(
    const std::vector<OrientedBox>& sortedBoxes, float iouThreshold) {
    // Greedy NMS over boxes already sorted by descending score: candidate j
    // survives only if no higher-ranked box i < j overlaps it (Prob-IoU)
    // at or above the threshold. Returns kept indices into sortedBoxes.
    if (sortedBoxes.empty()) return {};
    if (sortedBoxes.size() == 1) return { 0 };
    const auto iou = batchProbiou(sortedBoxes, sortedBoxes);
    if (iou.empty()) return {};
    const int count = static_cast<int>(sortedBoxes.size());
    std::vector<int> kept;
    kept.reserve(count / 2);
    for (int cand = 0; cand < count; ++cand) {
        bool suppressed = false;
        for (int prev = 0; prev < cand && !suppressed; ++prev) {
            suppressed = (iou[prev][cand] >= iouThreshold);
        }
        if (!suppressed) kept.push_back(cand);
    }
    return kept;
}
std::vector<int> ANSRTYOLO::nmsRotated(
const std::vector<OrientedBox>& boxes,
const std::vector<float>& scores, float iouThreshold) {
if (boxes.empty() || scores.empty() || boxes.size() != scores.size()) return {};
std::vector<int> sortedIdx(boxes.size());
std::iota(sortedIdx.begin(), sortedIdx.end(), 0);
std::sort(sortedIdx.begin(), sortedIdx.end(),
[&](int a, int b) { return scores[a] > scores[b]; });
std::vector<OrientedBox> sortedBoxes;
sortedBoxes.reserve(boxes.size());
for (int i : sortedIdx) sortedBoxes.push_back(boxes[i]);
auto keepSorted = nmsRotatedImpl(sortedBoxes, iouThreshold);
std::vector<int> keepOrig;
keepOrig.reserve(keepSorted.size());
for (int si : keepSorted) keepOrig.push_back(sortedIdx[si]);
return keepOrig;
}
std::vector<cv::Point2f> ANSRTYOLO::OBBToPoints(const OrientedBox& obb) {
float angleDeg = obb.angle * 180.0f / static_cast<float>(CV_PI);
cv::RotatedRect rr(cv::Point2f(obb.x, obb.y),
cv::Size2f(obb.width, obb.height), angleDeg);
std::vector<cv::Point2f> corners(4);
rr.points(corners.data());
return corners;
}
// ====================================================================
// Detection — legacy postprocess
// ====================================================================
std::vector<Object> ANSRTYOLO::PostprocessDetection(
    std::vector<float>& featureVector,
    const std::string& camera_id, const ImageMetadata& meta) {
    // Decodes a raw (non-NMS) YOLO detection tensor shaped
    // [numChannels x numAnchors], where channels = 4 box coords (cx,cy,w,h)
    // followed by per-class scores, into Object boxes in original-image
    // coordinates, then applies class-aware NMS.
    // featureVector : flattened engine output; viewed in place via cv::Mat.
    // camera_id     : copied into every returned Object.
    // meta          : ratio / image size used to undo letterbox scaling.
    // Returns detections above detectionScoreThreshold; {} on exception.
    try {
        const auto& outputDims = m_trtEngine->getOutputDims();
        auto numChannels = outputDims[0].d[1];
        auto numAnchors = outputDims[0].d[2];
        // Derive numClasses from tensor shape (4 box coords subtracted)
        // rather than _classes.size() which may not match the model
        auto numClasses = static_cast<size_t>(numChannels - 4);
        if (!_classes.empty() && _classes.size() <= static_cast<size_t>(numChannels - 4))
            numClasses = _classes.size();
        std::vector<cv::Rect> bboxes;
        std::vector<float> scores;
        std::vector<int> labels;
        std::vector<int> indices;
        // View the flat buffer as [channels x anchors]; transpose so each row
        // is one anchor: [cx, cy, w, h, score_0 .. score_{nc-1}].
        cv::Mat output = cv::Mat(numChannels, numAnchors, CV_32F, featureVector.data());
        output = output.t();
        for (int i = 0; i < numAnchors; i++) {
            auto rowPtr = output.row(i).ptr<float>();
            auto bboxesPtr = rowPtr;
            auto scoresPtr = rowPtr + 4;
            // Best class = arg-max over the per-class scores.
            auto maxSPtr = std::max_element(scoresPtr, scoresPtr + numClasses);
            float score = *maxSPtr;
            if (score > _modelConfig.detectionScoreThreshold) {
                float x = *bboxesPtr++;
                float y = *bboxesPtr++;
                float w = *bboxesPtr++;
                float h = *bboxesPtr;
                // Center/size -> corners, scaled back to the original image by
                // meta.ratio and clamped to the image bounds.
                float x0 = std::clamp((x - 0.5f * w) * meta.ratio, 0.f, meta.imgWidth);
                float y0 = std::clamp((y - 0.5f * h) * meta.ratio, 0.f, meta.imgHeight);
                float x1 = std::clamp((x + 0.5f * w) * meta.ratio, 0.f, meta.imgWidth);
                float y1 = std::clamp((y + 0.5f * h) * meta.ratio, 0.f, meta.imgHeight);
                int label = static_cast<int>(maxSPtr - scoresPtr);
                cv::Rect_<float> bbox;
                bbox.x = x0; bbox.y = y0;
                bbox.width = x1 - x0; bbox.height = y1 - y0;
                // Re-clamp the rect so it never extends past the image.
                bbox.x = std::max(0.f, bbox.x);
                bbox.y = std::max(0.f, bbox.y);
                bbox.width = std::min(meta.imgWidth - bbox.x, bbox.width);
                bbox.height = std::min(meta.imgHeight - bbox.y, bbox.height);
                bboxes.push_back(bbox);
                labels.push_back(label);
                scores.push_back(score);
            }
        }
        // Class-aware NMS: boxes only suppress boxes of the same label.
        cv::dnn::NMSBoxesBatched(bboxes, scores, labels, PROBABILITY_THRESHOLD, NMS_THRESHOLD, indices);
        std::vector<Object> objects;
        int classNameSize = static_cast<int>(_classes.size());
        for (auto& chosenIdx : indices) {
            if (scores[chosenIdx] > _modelConfig.detectionScoreThreshold) {
                Object obj{};
                obj.confidence = scores[chosenIdx];
                obj.classId = labels[chosenIdx];
                obj.box = bboxes[chosenIdx];
                //obj.polygon = ANSUtilityHelper::RectToNormalizedPolygon(obj.box, meta.imgWidth, meta.imgHeight);
                // Out-of-range class ids clamp to the last known class name.
                if (!_classes.empty()) {
                    obj.className = (obj.classId < classNameSize)
                        ? _classes[obj.classId] : _classes[classNameSize - 1];
                }
                else { obj.className = "Unknown"; }
                obj.cameraId = camera_id;
                objects.push_back(obj);
            }
        }
        return objects;
    }
    catch (std::exception& e) {
        _logger.LogFatal("ANSRTYOLO::PostprocessDetection", e.what(), __FILE__, __LINE__);
        return {};
    }
}
// ====================================================================
// Detection — end2end postprocess
// ====================================================================
std::vector<Object> ANSRTYOLO::PostprocessDetectionE2E(
std::vector<float>& featureVector,
const std::string& camera_id, const ImageMetadata& meta) {
try {
const auto& outputDims = m_trtEngine->getOutputDims();
int numDets = outputDims[0].d[1];
int numFeat = outputDims[0].d[2]; // 6: x1,y1,x2,y2,conf,classId
std::vector<Object> results;
results.reserve(numDets);
for (int i = 0; i < numDets; ++i) {
const float* det = featureVector.data() + i * numFeat;
float conf = det[4];
if (conf <= _modelConfig.detectionScoreThreshold) continue;
int classId = static_cast<int>(det[5]);
// Scale from model input space to original image
float x0 = std::clamp(det[0] * meta.ratio, 0.f, meta.imgWidth);
float y0 = std::clamp(det[1] * meta.ratio, 0.f, meta.imgHeight);
float x1 = std::clamp(det[2] * meta.ratio, 0.f, meta.imgWidth);
float y1 = std::clamp(det[3] * meta.ratio, 0.f, meta.imgHeight);
float w = x1 - x0, h = y1 - y0;
if (w < 1.f || h < 1.f) continue;
Object obj;
obj.classId = classId;
obj.confidence = conf;
obj.box = cv::Rect(static_cast<int>(x0), static_cast<int>(y0),
static_cast<int>(w), static_cast<int>(h));
//obj.polygon = ANSUtilityHelper::RectToNormalizedPolygon(obj.box, meta.imgWidth, meta.imgHeight);
int classNameSize = static_cast<int>(_classes.size());
if (!_classes.empty() && classId >= 0 && classId < classNameSize)
obj.className = _classes[classId];
obj.cameraId = camera_id;
results.push_back(std::move(obj));
}
return results;
}
catch (std::exception& e) {
_logger.LogFatal("ANSRTYOLO::PostprocessDetectionE2E", e.what(), __FILE__, __LINE__);
return {};
}
}
// ====================================================================
// OBB — legacy postprocess
// ====================================================================
std::vector<Object> ANSRTYOLO::PostprocessOBB(
    std::vector<float>& featureVector,
    const std::string& camera_id, const ImageMetadata& meta) {
    // Decodes a raw oriented-bounding-box tensor [numChannels x numAnchors]
    // with channels = 4 box (cx,cy,w,h) + nc class scores + 1 angle (radians),
    // then applies Prob-IoU rotated NMS capped at TOP_K results.
    // Each Object carries the OBB as kps = {cx,cy,w,h,angle}, an axis-aligned
    // bounding rect in box, and the normalized, closed corner polygon.
    // Returns {} when no candidates survive or on exception.
    try {
        const auto& outputDims = m_trtEngine->getOutputDims();
        int numChannels = outputDims[0].d[1];
        int numAnchors = outputDims[0].d[2];
        int numClasses = numChannels - 5; // 4 box + nc scores + 1 angle
        if (numClasses <= 0) return {};
        // Transpose so each row is one anchor.
        cv::Mat output = cv::Mat(numChannels, numAnchors, CV_32F, featureVector.data()).t();
        struct OBBCandidate {
            OrientedBox box;
            float conf;
            int classId;
        };
        std::vector<OBBCandidate> candidates;
        candidates.reserve(numAnchors);
        for (int i = 0; i < numAnchors; ++i) {
            const float* row = output.ptr<float>(i);
            const float* scoresPtr = row + 4;
            // Arg-max over class scores.
            float maxScore = -FLT_MAX;
            int bestClass = -1;
            for (int c = 0; c < numClasses; ++c) {
                if (scoresPtr[c] > maxScore) { maxScore = scoresPtr[c]; bestClass = c; }
            }
            if (maxScore <= _modelConfig.detectionScoreThreshold) continue;
            // The angle channel sits after all class scores.
            float angle = row[4 + numClasses];
            float cx = row[0] * meta.ratio;
            float cy = row[1] * meta.ratio;
            float bw = row[2] * meta.ratio;
            float bh = row[3] * meta.ratio;
            // Only the center is clamped into the image; width/height are kept
            // as predicted, so corners may extend past the border.
            cx = std::clamp(cx, 0.f, meta.imgWidth);
            cy = std::clamp(cy, 0.f, meta.imgHeight);
            candidates.push_back({ { cx, cy, bw, bh, angle }, maxScore, bestClass });
        }
        if (candidates.empty()) return {};
        // Prob-IoU NMS
        std::vector<OrientedBox> boxes;
        std::vector<float> scores;
        boxes.reserve(candidates.size());
        scores.reserve(candidates.size());
        for (const auto& c : candidates) { boxes.push_back(c.box); scores.push_back(c.conf); }
        auto keepIdx = nmsRotated(boxes, scores, NMS_THRESHOLD);
        std::vector<Object> results;
        int classNameSize = static_cast<int>(_classes.size());
        results.reserve(std::min(static_cast<int>(keepIdx.size()), TOP_K));
        for (int idx : keepIdx) {
            if (static_cast<int>(results.size()) >= TOP_K) break;
            const auto& c = candidates[idx];
            Object obj;
            obj.classId = c.classId;
            obj.confidence = c.conf;
            obj.kps = { c.box.x, c.box.y, c.box.width, c.box.height, c.box.angle };
            auto absCorners = OBBToPoints(c.box);
            obj.box = cv::boundingRect(absCorners);
            // Normalize OBB corners to [0,1] and close the polygon
            obj.polygon.reserve(absCorners.size() + 1);
            for (const auto& pt : absCorners) {
                obj.polygon.emplace_back(
                    std::clamp(pt.x / meta.imgWidth, 0.f, 1.f),
                    std::clamp(pt.y / meta.imgHeight, 0.f, 1.f));
            }
            if (!obj.polygon.empty()) obj.polygon.push_back(obj.polygon.front()); // close
            if (!_classes.empty() && c.classId >= 0 && c.classId < classNameSize)
                obj.className = _classes[c.classId];
            obj.cameraId = camera_id;
            results.push_back(std::move(obj));
        }
        return results;
    }
    catch (std::exception& e) {
        _logger.LogFatal("ANSRTYOLO::PostprocessOBB", e.what(), __FILE__, __LINE__);
        return {};
    }
}
// ====================================================================
// OBB — end2end postprocess
// ====================================================================
std::vector<Object> ANSRTYOLO::PostprocessOBBE2E(
std::vector<float>& featureVector,
const std::string& camera_id, const ImageMetadata& meta) {
try {
const auto& outputDims = m_trtEngine->getOutputDims();
int numDets = outputDims[0].d[1];
int numFeat = outputDims[0].d[2]; // 7: cx,cy,w,h,angle,conf,classId
std::vector<Object> results;
results.reserve(numDets);
for (int i = 0; i < numDets; ++i) {
const float* det = featureVector.data() + i * numFeat;
float angle = det[4];
float conf = det[5];
if (conf <= _modelConfig.detectionScoreThreshold) continue;
float cx = det[0] * meta.ratio;
float cy = det[1] * meta.ratio;
float bw = det[2] * meta.ratio;
float bh = det[3] * meta.ratio;
int classId = static_cast<int>(det[6]);
cx = std::clamp(cx, 0.f, meta.imgWidth);
cy = std::clamp(cy, 0.f, meta.imgHeight);
OrientedBox obb{ cx, cy, bw, bh, angle };
Object obj;
obj.classId = classId;
obj.confidence = conf;
obj.kps = { cx, cy, bw, bh, angle };
auto absCorners = OBBToPoints(obb);
obj.box = cv::boundingRect(absCorners);
// Normalize OBB corners to [0,1] and close the polygon
obj.polygon.reserve(absCorners.size() + 1);
for (const auto& pt : absCorners) {
obj.polygon.emplace_back(
std::clamp(pt.x / meta.imgWidth, 0.f, 1.f),
std::clamp(pt.y / meta.imgHeight, 0.f, 1.f));
}
if (!obj.polygon.empty()) obj.polygon.push_back(obj.polygon.front()); // close
int classNameSize = static_cast<int>(_classes.size());
if (!_classes.empty() && classId >= 0 && classId < classNameSize)
obj.className = _classes[classId];
obj.cameraId = camera_id;
results.push_back(std::move(obj));
}
return results;
}
catch (std::exception& e) {
_logger.LogFatal("ANSRTYOLO::PostprocessOBBE2E", e.what(), __FILE__, __LINE__);
return {};
}
}
// ====================================================================
// Segmentation — legacy postprocess
// ====================================================================
std::vector<Object> ANSRTYOLO::PostprocessSegmentation(
    std::vector<std::vector<float>>& featureVectors,
    const std::string& camera_id, const ImageMetadata& meta) {
    // Decodes a two-output YOLO segmentation model:
    //   featureVectors[0] : detection tensor [numChannels x numAnchors] with
    //                       channels = 4 box + nc scores + SEG_CHANNELS coeffs
    //   featureVectors[1] : mask prototypes [SEG_CHANNELS x SEG_H x SEG_W]
    // Boxes are filtered/NMS'ed, then each kept detection's mask is built as
    // sigmoid(coeffs x protos), cropped in proto space and resized to the box.
    // Returns Objects with box, mask, and normalized polygon; {} on error.
    try {
        const auto& outputDims = m_trtEngine->getOutputDims();
        int numChannels = outputDims[0].d[1];
        int numAnchors = outputDims[0].d[2];
        const auto numClasses = numChannels - SEG_CHANNELS - 4;
        // Validate buffer sizes against the declared tensor shapes.
        if (featureVectors[0].size() != static_cast<size_t>(numChannels) * numAnchors) return {};
        if (featureVectors[1].size() != static_cast<size_t>(SEG_CHANNELS) * SEG_H * SEG_W) return {};
        // Transpose so each row is one anchor: [box(4), scores(nc), coeffs(32)].
        cv::Mat output = cv::Mat(numChannels, numAnchors, CV_32F, featureVectors[0].data()).t();
        cv::Mat protos = cv::Mat(SEG_CHANNELS, SEG_H * SEG_W, CV_32F, featureVectors[1].data());
        std::vector<int> labels;
        std::vector<float> scores;
        std::vector<cv::Rect> bboxes;
        std::vector<cv::Mat> maskConfs;
        std::vector<int> indices;
        for (int i = 0; i < numAnchors; i++) {
            auto rowPtr = output.row(i).ptr<float>();
            auto bboxesPtr = rowPtr;
            auto scoresPtr = rowPtr + 4;
            auto maskConfsPtr = rowPtr + 4 + numClasses;
            auto maxSPtr = std::max_element(scoresPtr, scoresPtr + numClasses);
            float score = *maxSPtr;
            if (score > _modelConfig.detectionScoreThreshold) {
                float x = *bboxesPtr++;
                float y = *bboxesPtr++;
                float w = *bboxesPtr++;
                float h = *bboxesPtr;
                // Center/size -> corners, scaled to the original image.
                float x0 = std::clamp((x - 0.5f * w) * meta.ratio, 0.f, meta.imgWidth);
                float y0 = std::clamp((y - 0.5f * h) * meta.ratio, 0.f, meta.imgHeight);
                float x1 = std::clamp((x + 0.5f * w) * meta.ratio, 0.f, meta.imgWidth);
                float y1 = std::clamp((y + 0.5f * h) * meta.ratio, 0.f, meta.imgHeight);
                int label = static_cast<int>(maxSPtr - scoresPtr);
                cv::Rect_<float> bbox;
                bbox.x = x0; bbox.y = y0;
                bbox.width = x1 - x0; bbox.height = y1 - y0;
                bbox.x = std::max(0.f, bbox.x);
                bbox.y = std::max(0.f, bbox.y);
                bbox.width = std::min(meta.imgWidth - bbox.x, bbox.width);
                bbox.height = std::min(meta.imgHeight - bbox.y, bbox.height);
                // 1 x SEG_CHANNELS view over this anchor's mask coefficients.
                cv::Mat maskConf = cv::Mat(1, SEG_CHANNELS, CV_32F, maskConfsPtr);
                bboxes.push_back(bbox);
                labels.push_back(label);
                scores.push_back(score);
                maskConfs.push_back(maskConf);
            }
        }
        cv::dnn::NMSBoxesBatched(bboxes, scores, labels, PROBABILITY_THRESHOLD, NMS_THRESHOLD, indices);
        cv::Mat masks;
        int classNameSize = static_cast<int>(_classes.size());
        std::vector<Object> objs;
        for (auto& i : indices) {
            if (scores[i] > _modelConfig.detectionScoreThreshold) {
                Object obj;
                obj.classId = labels[i];
                // Out-of-range ids clamp to the last known class name.
                if (!_classes.empty()) {
                    obj.className = (obj.classId < classNameSize)
                        ? _classes[obj.classId] : _classes[classNameSize - 1];
                }
                else { obj.className = "Unknown"; }
                obj.box = bboxes[i];
                obj.confidence = scores[i];
                obj.cameraId = camera_id;
                // Stack kept coefficient rows; row order matches objs order.
                masks.push_back(maskConfs[i]);
                objs.push_back(obj);
            }
        }
        if (!masks.empty()) {
            // [kept x 32] x [32 x SEG_H*SEG_W] -> transpose to one column per
            // kept detection.
            cv::Mat matmulRes = (masks * protos).t();
            // Apply sigmoid while still a single-channel 2D matrix
            cv::Mat negMat;
            cv::exp(-matmulRes, negMat);
            cv::Mat sigmoidFlat = 1.0 / (1.0 + negMat);
            // Now reshape into multi-channel and split
            cv::Mat sigmoidMat = sigmoidFlat.reshape(static_cast<int>(indices.size()),
                { SEG_H, SEG_W });
            std::vector<cv::Mat> maskChannels;
            cv::split(sigmoidMat, maskChannels);
            // ROI in proto space (SEG_H x SEG_W), accounting for top-left letterbox padding
            // ANSRTYOLO pads right-bottom, so content starts at (0,0) in proto space
            cv::Rect roi;
            if (meta.imgHeight > meta.imgWidth) {
                int roiW = std::min(static_cast<int>(std::round(
                    static_cast<float>(SEG_W) * meta.imgWidth / meta.imgHeight)), SEG_W);
                roi = cv::Rect(0, 0, roiW, SEG_H);
            }
            else {
                int roiH = std::min(static_cast<int>(std::round(
                    static_cast<float>(SEG_H) * meta.imgHeight / meta.imgWidth)), SEG_H);
                roi = cv::Rect(0, 0, SEG_W, roiH);
            }
            roi &= cv::Rect(0, 0, SEG_W, SEG_H);
            int imgW = static_cast<int>(meta.imgWidth);
            int imgH = static_cast<int>(meta.imgHeight);
            // Precompute scale factors from proto-ROI to original image
            const float scaleX = static_cast<float>(imgW) / roi.width;
            const float scaleY = static_cast<float>(imgH) / roi.height;
            for (size_t i = 0; i < objs.size(); i++) {
                cv::Rect safeBox = objs[i].box & cv::Rect(0, 0, imgW, imgH);
                if (safeBox.area() <= 0) continue;
                // Map bounding box back to proto-ROI space and crop there
                int px0 = std::max(static_cast<int>(std::floor(safeBox.x / scaleX)), 0);
                int py0 = std::max(static_cast<int>(std::floor(safeBox.y / scaleY)), 0);
                int px1 = std::min(static_cast<int>(std::ceil((safeBox.x + safeBox.width) / scaleX)), roi.width);
                int py1 = std::min(static_cast<int>(std::ceil((safeBox.y + safeBox.height) / scaleY)), roi.height);
                if (px1 <= px0 || py1 <= py0) continue;
                cv::Rect protoBox(roi.x + px0, roi.y + py0, px1 - px0, py1 - py0);
                protoBox &= cv::Rect(0, 0, SEG_W, SEG_H);
                if (protoBox.area() <= 0) continue;
                // Resize only the small proto crop to the bounding box size
                cv::Mat cropped = maskChannels[i](protoBox);
                cv::Mat resized;
                cv::resize(cropped, resized, cv::Size(safeBox.width, safeBox.height),
                    0, 0, cv::INTER_LINEAR);
                // NOTE(review): this path binarizes with modelConfThreshold
                // while the E2E path uses SEGMENTATION_THRESHOLD — confirm
                // which is intended.
                objs[i].mask = resized > _modelConfig.modelConfThreshold;
                objs[i].polygon = ANSUtilityHelper::MaskToNormalizedPolygon(
                    objs[i].mask, safeBox, meta.imgWidth, meta.imgHeight);
            }
        }
        // Fill polygon for objects that got masks
        for (auto& obj : objs) {
            if (obj.polygon.empty())
                obj.polygon = ANSUtilityHelper::RectToNormalizedPolygon(obj.box, meta.imgWidth, meta.imgHeight);
        }
        return objs;
    }
    catch (std::exception& e) {
        _logger.LogFatal("ANSRTYOLO::PostprocessSegmentation", e.what(), __FILE__, __LINE__);
        return {};
    }
}
// ====================================================================
// Segmentation — end2end postprocess
// ====================================================================
std::vector<Object> ANSRTYOLO::PostprocessSegE2E(
    std::vector<std::vector<float>>& featureVectors,
    const std::string& camera_id, const ImageMetadata& meta) {
    // Decodes an end-to-end segmentation model (NMS fused into the engine):
    //   featureVectors[0] : [numDets x (6 + nm)] = x1,y1,x2,y2,conf,classId
    //                       followed by nm mask coefficients
    //   featureVectors[1] : mask prototypes [nm x protoH x protoW]
    // Masks are built as sigmoid(coeffs x protos), cropped in proto space and
    // resized to each box. Returns Objects with box, mask and normalized
    // polygon; {} on error.
    try {
        if (featureVectors.size() < 2) return {};
        const auto& outputDims = m_trtEngine->getOutputDims();
        int numDets = outputDims[0].d[1];
        int numFeat = outputDims[0].d[2]; // 6 + nm
        // Proto dimensions from second output
        // NOTE(review): when nbDims <= 3 protoW falls back to d[2] (assumes a
        // square proto map) — confirm against the exported engine layout.
        int nm = outputDims[1].d[1];
        int protoH = outputDims[1].d[2];
        int protoW = (outputDims[1].nbDims > 3) ? outputDims[1].d[3] : outputDims[1].d[2];
        if (numFeat < 6 + nm) return {};
        const float* raw = featureVectors[0].data();
        std::vector<Object> objs;
        cv::Mat maskCoeffs;
        for (int i = 0; i < numDets; ++i) {
            const float* det = raw + i * numFeat;
            float conf = det[4];
            if (conf <= _modelConfig.detectionScoreThreshold) continue;
            int classId = static_cast<int>(det[5]);
            // Corners scaled back to the original image and clamped.
            float x0 = std::clamp(det[0] * meta.ratio, 0.f, meta.imgWidth);
            float y0 = std::clamp(det[1] * meta.ratio, 0.f, meta.imgHeight);
            float x1 = std::clamp(det[2] * meta.ratio, 0.f, meta.imgWidth);
            float y1 = std::clamp(det[3] * meta.ratio, 0.f, meta.imgHeight);
            float w = x1 - x0, h = y1 - y0;
            if (w < 1.f || h < 1.f) continue;
            Object obj;
            obj.classId = classId;
            obj.confidence = conf;
            obj.box = cv::Rect(static_cast<int>(x0), static_cast<int>(y0),
                static_cast<int>(w), static_cast<int>(h));
            int classNameSize = static_cast<int>(_classes.size());
            if (!_classes.empty() && classId >= 0 && classId < classNameSize)
                obj.className = _classes[classId];
            obj.cameraId = camera_id;
            objs.push_back(std::move(obj));
            // Copy this detection's nm coefficients; row order matches objs.
            cv::Mat mc(1, nm, CV_32F);
            std::memcpy(mc.ptr<float>(), det + 6, nm * sizeof(float));
            maskCoeffs.push_back(mc);
        }
        if (!objs.empty() && !maskCoeffs.empty()) {
            cv::Mat protos(nm, protoH * protoW, CV_32F, featureVectors[1].data());
            // [kept x nm] x [nm x protoH*protoW] -> one column per detection.
            cv::Mat matmulRes = (maskCoeffs * protos).t();
            // Apply sigmoid while still a single-channel 2D matrix
            cv::Mat negMat;
            cv::exp(-matmulRes, negMat);
            cv::Mat sigmoidFlat = 1.0 / (1.0 + negMat);
            // Now reshape into multi-channel and split
            cv::Mat sigmoidMat = sigmoidFlat.reshape(static_cast<int>(objs.size()),
                { protoH, protoW });
            std::vector<cv::Mat> maskChannels;
            cv::split(sigmoidMat, maskChannels);
            // ROI in proto space, accounting for top-left letterbox padding
            // ANSRTYOLO pads right-bottom, so content starts at (0,0) in proto space
            cv::Rect roi;
            if (meta.imgHeight > meta.imgWidth) {
                int roiW = std::min(static_cast<int>(std::round(
                    static_cast<float>(protoW) * meta.imgWidth / meta.imgHeight)), protoW);
                roi = cv::Rect(0, 0, roiW, protoH);
            }
            else {
                int roiH = std::min(static_cast<int>(std::round(
                    static_cast<float>(protoH) * meta.imgHeight / meta.imgWidth)), protoH);
                roi = cv::Rect(0, 0, protoW, roiH);
            }
            roi &= cv::Rect(0, 0, protoW, protoH);
            int imgW = static_cast<int>(meta.imgWidth);
            int imgH = static_cast<int>(meta.imgHeight);
            // Scale factors from proto-ROI space to the original image.
            const float scaleX = static_cast<float>(imgW) / roi.width;
            const float scaleY = static_cast<float>(imgH) / roi.height;
            for (size_t i = 0; i < objs.size(); ++i) {
                cv::Rect safebox = objs[i].box & cv::Rect(0, 0, imgW, imgH);
                if (safebox.area() <= 0) continue;
                // Map the box into proto-ROI space and crop there, so only the
                // small crop is resized back up to the box size.
                int px0 = std::max(static_cast<int>(std::floor(safebox.x / scaleX)), 0);
                int py0 = std::max(static_cast<int>(std::floor(safebox.y / scaleY)), 0);
                int px1 = std::min(static_cast<int>(std::ceil((safebox.x + safebox.width) / scaleX)), roi.width);
                int py1 = std::min(static_cast<int>(std::ceil((safebox.y + safebox.height) / scaleY)), roi.height);
                if (px1 <= px0 || py1 <= py0) continue;
                cv::Rect protoBox(roi.x + px0, roi.y + py0, px1 - px0, py1 - py0);
                protoBox &= cv::Rect(0, 0, protoW, protoH);
                if (protoBox.area() <= 0) continue;
                cv::Mat cropped = maskChannels[i](protoBox);
                cv::Mat resized;
                cv::resize(cropped, resized, cv::Size(safebox.width, safebox.height),
                    0, 0, cv::INTER_LINEAR);
                // NOTE(review): binarizes with SEGMENTATION_THRESHOLD while the
                // legacy path uses _modelConfig.modelConfThreshold — confirm
                // which is intended.
                objs[i].mask = resized > SEGMENTATION_THRESHOLD;
                objs[i].polygon = ANSUtilityHelper::MaskToNormalizedPolygon(
                    objs[i].mask, safebox, meta.imgWidth, meta.imgHeight);
            }
        }
        // Fallback: objects without a mask polygon get their box as polygon.
        for (auto& obj : objs) {
            if (obj.polygon.empty())
                obj.polygon = ANSUtilityHelper::RectToNormalizedPolygon(obj.box, meta.imgWidth, meta.imgHeight);
        }
        return objs;
    }
    catch (std::exception& e) {
        _logger.LogFatal("ANSRTYOLO::PostprocessSegE2E", e.what(), __FILE__, __LINE__);
        return {};
    }
}
// ====================================================================
// Pose — legacy postprocess
// ====================================================================
std::vector<Object> ANSRTYOLO::PostprocessPose(
    std::vector<float>& featureVector,
    const std::string& camera_id, const ImageMetadata& meta) {
    // Decodes a raw single-class pose tensor [numChannels x numAnchors] where
    // each anchor row is [cx, cy, w, h, conf, (x, y, score) * NUM_KPS].
    // Applies class-aware NMS (all labels are 0) and returns Objects whose
    // kps vector holds NUM_KPS keypoint triplets in original-image pixels.
    // Returns {} on exception.
    try {
        const auto& outputDims = m_trtEngine->getOutputDims();
        auto numChannels = outputDims[0].d[1];
        auto numAnchors = outputDims[0].d[2];
        std::vector<cv::Rect> bboxes;
        std::vector<float> scores;
        std::vector<int> labels;
        std::vector<int> indices;
        std::vector<std::vector<float>> kpss;
        // Transpose so each row is one anchor.
        cv::Mat output = cv::Mat(numChannels, numAnchors, CV_32F, featureVector.data()).t();
        for (int i = 0; i < numAnchors; i++) {
            auto rowPtr = output.row(i).ptr<float>();
            auto bboxesPtr = rowPtr;
            auto scoresPtr = rowPtr + 4;
            // Keypoints start after 4 box coords + 1 confidence.
            auto kps_ptr = rowPtr + 5;
            // Single-class model: channel 4 is the person confidence.
            float score = *scoresPtr;
            if (score > _modelConfig.detectionScoreThreshold) {
                float x = *bboxesPtr++;
                float y = *bboxesPtr++;
                float w = *bboxesPtr++;
                float h = *bboxesPtr;
                // Center/size -> corners, scaled to the original image.
                float x0 = std::clamp((x - 0.5f * w) * meta.ratio, 0.f, meta.imgWidth);
                float y0 = std::clamp((y - 0.5f * h) * meta.ratio, 0.f, meta.imgHeight);
                float x1 = std::clamp((x + 0.5f * w) * meta.ratio, 0.f, meta.imgWidth);
                float y1 = std::clamp((y + 0.5f * h) * meta.ratio, 0.f, meta.imgHeight);
                cv::Rect_<float> bbox;
                bbox.x = x0; bbox.y = y0;
                bbox.width = x1 - x0; bbox.height = y1 - y0;
                bbox.x = std::max(0.f, bbox.x);
                bbox.y = std::max(0.f, bbox.y);
                bbox.width = std::min(meta.imgWidth - bbox.x, bbox.width);
                bbox.height = std::min(meta.imgHeight - bbox.y, bbox.height);
                // Rescale keypoint coordinates; scores pass through unchanged.
                std::vector<float> kps;
                for (int k = 0; k < NUM_KPS; k++) {
                    float kpsX = std::clamp(*(kps_ptr + 3 * k) * meta.ratio, 0.f, meta.imgWidth);
                    float kpsY = std::clamp(*(kps_ptr + 3 * k + 1) * meta.ratio, 0.f, meta.imgHeight);
                    float kpsS = *(kps_ptr + 3 * k + 2);
                    kps.push_back(kpsX);
                    kps.push_back(kpsY);
                    kps.push_back(kpsS);
                }
                bboxes.push_back(bbox);
                labels.push_back(0);
                scores.push_back(score);
                kpss.push_back(kps);
            }
        }
        cv::dnn::NMSBoxesBatched(bboxes, scores, labels, PROBABILITY_THRESHOLD, NMS_THRESHOLD, indices);
        std::vector<Object> objects;
        int classNameSize = static_cast<int>(_classes.size());
        for (auto& chosenIdx : indices) {
            if (scores[chosenIdx] > _modelConfig.detectionScoreThreshold) {
                Object obj{};
                obj.confidence = scores[chosenIdx];
                obj.classId = labels[chosenIdx];
                if (!_classes.empty()) {
                    obj.className = (obj.classId < classNameSize)
                        ? _classes[obj.classId] : _classes[classNameSize - 1];
                }
                else { obj.className = "Unknown"; }
                obj.box = bboxes[chosenIdx];
                //obj.polygon = ANSUtilityHelper::RectToNormalizedPolygon(obj.box, meta.imgWidth, meta.imgHeight);
                obj.kps = kpss[chosenIdx];
                obj.cameraId = camera_id;
                objects.push_back(obj);
            }
        }
        return objects;
    }
    catch (std::exception& e) {
        _logger.LogFatal("ANSRTYOLO::PostprocessPose", e.what(), __FILE__, __LINE__);
        return {};
    }
}
// ====================================================================
// Pose — end2end postprocess
// ====================================================================
std::vector<Object> ANSRTYOLO::PostprocessPoseE2E(
std::vector<float>& featureVector,
const std::string& camera_id, const ImageMetadata& meta) {
try {
const auto& outputDims = m_trtEngine->getOutputDims();
int numDets = outputDims[0].d[1];
int numFeat = outputDims[0].d[2]; // 6 + nk*3
int nk = (numFeat - 6) / 3;
std::vector<Object> results;
results.reserve(numDets);
for (int i = 0; i < numDets; ++i) {
const float* det = featureVector.data() + i * numFeat;
float conf = det[4];
if (conf <= _modelConfig.detectionScoreThreshold) continue;
int classId = static_cast<int>(det[5]);
float x0 = std::clamp(det[0] * meta.ratio, 0.f, meta.imgWidth);
float y0 = std::clamp(det[1] * meta.ratio, 0.f, meta.imgHeight);
float x1 = std::clamp(det[2] * meta.ratio, 0.f, meta.imgWidth);
float y1 = std::clamp(det[3] * meta.ratio, 0.f, meta.imgHeight);
float w = x1 - x0, h = y1 - y0;
if (w < 1.f || h < 1.f) continue;
const float* kpsPtr = det + 6;
std::vector<float> kps;
kps.reserve(nk * 3);
for (int k = 0; k < nk; ++k) {
float kx = std::clamp(kpsPtr[3*k] * meta.ratio, 0.f, meta.imgWidth);
float ky = std::clamp(kpsPtr[3*k+1] * meta.ratio, 0.f, meta.imgHeight);
float ks = kpsPtr[3*k+2];
kps.push_back(kx);
kps.push_back(ky);
kps.push_back(ks);
}
Object obj;
obj.classId = classId;
obj.confidence = conf;
obj.box = cv::Rect(static_cast<int>(x0), static_cast<int>(y0),
static_cast<int>(w), static_cast<int>(h));
obj.kps = std::move(kps);
int classNameSize = static_cast<int>(_classes.size());
if (!_classes.empty() && classId >= 0 && classId < classNameSize)
obj.className = _classes[classId];
obj.cameraId = camera_id;
results.push_back(std::move(obj));
}
return results;
}
catch (std::exception& e) {
_logger.LogFatal("ANSRTYOLO::PostprocessPoseE2E", e.what(), __FILE__, __LINE__);
return {};
}
}
// ====================================================================
// Classification postprocess
// ====================================================================
std::vector<Object> ANSRTYOLO::PostprocessClassify(
    std::vector<float>& featureVector,
    const std::string& camera_id, const ImageMetadata& meta) {
    // Turns a classification logit/probability vector into a single Object:
    // the arg-max class with its probability, boxed over (almost) the whole
    // image. Returns exactly one Object, or {} on empty input / exception.
    try {
        const int numClasses = static_cast<int>(featureVector.size());
        if (numClasses == 0) return {};
        // Detect whether the model already emitted a softmax distribution
        // (all values non-negative, summing to ~1). Re-applying softmax to
        // probabilities would flatten them and misclassify.
        float total = 0.f;
        bool nonNegative = true;
        for (int c = 0; c < numClasses; ++c) {
            total += featureVector[c];
            if (featureVector[c] < 0.f) nonNegative = false;
        }
        const bool isProbability = (nonNegative && total > 0.9f && total < 1.1f);
        std::vector<float> probs(numClasses);
        if (isProbability) {
            std::copy(featureVector.begin(), featureVector.end(), probs.begin());
        }
        else {
            // Numerically stable softmax: shift by the max logit first.
            const float peak = *std::max_element(featureVector.begin(), featureVector.end());
            float denom = 0.f;
            for (int c = 0; c < numClasses; ++c) {
                probs[c] = std::exp(featureVector[c] - peak);
                denom += probs[c];
            }
            for (int c = 0; c < numClasses; ++c) probs[c] /= denom;
        }
        // Arg-max over probabilities.
        int topClass = 0;
        float topProb = 0.f;
        for (int c = 0; c < numClasses; ++c) {
            if (probs[c] > topProb) { topProb = probs[c]; topClass = c; }
        }
        const int imgW = static_cast<int>(meta.imgWidth);
        const int imgH = static_cast<int>(meta.imgHeight);
        Object obj;
        // Box covers the image with a 10px inset when there is room for it.
        if (imgW > 20 && imgH > 20) {
            obj.box = cv::Rect(10, 10, imgW - 20, imgH - 20);
        }
        else {
            obj.box = cv::Rect(0, 0, imgW, imgH);
        }
        //obj.polygon = ANSUtilityHelper::RectToNormalizedPolygon(obj.box, meta.imgWidth, meta.imgHeight);
        obj.classId = topClass;
        obj.confidence = topProb;
        obj.cameraId = camera_id;
        const int classNameSize = static_cast<int>(_classes.size());
        if (!_classes.empty() && topClass >= 0 && topClass < classNameSize)
            obj.className = _classes[topClass];
        return { std::move(obj) };
    }
    catch (std::exception& e) {
        _logger.LogFatal("ANSRTYOLO::PostprocessClassify", e.what(), __FILE__, __LINE__);
        return {};
    }
}
// ====================================================================
// DetectObjects — single image with auto-detection of task type
// ====================================================================
/// Runs the full single-image pipeline: preprocess (NV12 / full-res BGR /
/// CPU fallback), TensorRT inference, shape-based postprocess routing
/// (segmentation / classification / detection / OBB / pose, legacy or
/// end-to-end), coordinate rescale, then optional tracking/stabilization.
/// @param inputImage display-resolution BGR frame from the caller.
/// @param camera_id  camera identifier attached to every returned Object.
/// @return detections in inputImage coordinates; empty on any failure.
std::vector<Object> ANSRTYOLO::DetectObjects(const cv::Mat& inputImage,
    const std::string& camera_id) {
    try {
        // --- Debug timer helper (zero-cost when _debugFlag == false) ---
        using Clock = std::chrono::steady_clock;
        const bool dbg = _debugFlag;
        auto t0 = dbg ? Clock::now() : Clock::time_point{};
        auto tPrev = t0;
        // Milliseconds since the previous elapsed() call; advances tPrev.
        auto elapsed = [&]() -> double {
            auto now = Clock::now();
            double ms = std::chrono::duration<double, std::milli>(now - tPrev).count();
            tPrev = now;
            return ms;
        };
        // --- 0. Guard: engine must exist. Previously m_trtEngine was only
        // checked before setDeviceContext() and then dereferenced
        // unconditionally (getInputDims/runInference) — a null engine
        // (model never optimized/loaded) crashed instead of failing cleanly.
        if (!m_trtEngine) {
            _logger.LogError("ANSRTYOLO::DetectObjects", "TensorRT engine not initialized", __FILE__, __LINE__);
            return {};
        }
        // --- 1. Set GPU device context (multi-GPU safety) ---
        m_trtEngine->setDeviceContext();
        double msSetDevice = dbg ? elapsed() : 0;
        // --- 1b. CUDA context health check ---
        if (!m_nv12Helper.isCudaContextHealthy(_logger, "ANSRTYOLO")) {
            return {};
        }
        // --- 2. Preprocess under lock ---
        // Preference order: zero-copy NV12 on the inference GPU; full-res BGR
        // from the frame registry (coords rescaled back in step 4b); plain
        // Preprocess of the caller's image as last resort.
        ANS_DBG("YOLO", "Preprocess START %dx%d", inputImage.cols, inputImage.rows);
        ImageMetadata meta;
        std::vector<std::vector<cv::cuda::GpuMat>> input;
        bool usedNV12 = false;
        float bgrFullResScaleX = 1.0f, bgrFullResScaleY = 1.0f;
        {
            std::lock_guard<std::recursive_mutex> lock(_mutex);
            const int inferenceGpu = m_trtEngine->getPreferredDeviceIndex();
            const auto& inputDims = m_trtEngine->getInputDims();
            const int inputW = inputDims[0].d[2];
            const int inputH = inputDims[0].d[1];
            auto nv12 = m_nv12Helper.tryNV12(inputImage, inferenceGpu, inputW, inputH,
                NV12PreprocessHelper::defaultYOLOLauncher(),
                _logger, "ANSRTYOLO");
            if (nv12.succeeded) {
                meta.imgWidth = nv12.metaWidth;
                meta.imgHeight = nv12.metaHeight;
                meta.ratio = nv12.ratio;
                input = {{ std::move(nv12.gpuRGB) }};
                usedNV12 = true;
            }
            else if (nv12.useBgrFullRes) {
                // Full-res BGR path: postprocess will emit full-res coords;
                // remember the scale back to the caller's display resolution.
                input = Preprocess(nv12.bgrFullResImg, meta);
                usedNV12 = !input.empty();
                bgrFullResScaleX = nv12.bgrFullResScaleX;
                bgrFullResScaleY = nv12.bgrFullResScaleY;
            }
            if (input.empty()) {
                input = Preprocess(inputImage, meta);
            }
            m_nv12Helper.tickInference();
        }
        double msPreprocess = dbg ? elapsed() : 0;
        if (input.empty()) {
            _logger.LogWarn("ANSRTYOLO::DetectObjects", "Skipped: preprocessing returned empty input", __FILE__, __LINE__);
            return {};
        }
        // --- 3. TRT Inference (mutex released for concurrent GPU slots) ---
        ANS_DBG("YOLO", "TRT inference START nv12=%d inputSize=%dx%d",
            (int)usedNV12,
            input.empty() ? 0 : (input[0].empty() ? 0 : input[0][0].cols),
            input.empty() ? 0 : (input[0].empty() ? 0 : input[0][0].rows));
        auto _trtStart = std::chrono::steady_clock::now();
        std::vector<std::vector<std::vector<float>>> featureVectors;
        if (!m_trtEngine->runInference(input, featureVectors)) {
            ANS_DBG("YOLO", "ERROR: TRT runInference FAILED");
            _logger.LogError("ANSRTYOLO::DetectObjects", "Error running inference", __FILE__, __LINE__);
            return {};
        }
        auto _trtEnd = std::chrono::steady_clock::now();
        double _trtMs = std::chrono::duration<double, std::milli>(_trtEnd - _trtStart).count();
        if (_trtMs > 500.0) {
            ANS_DBG("YOLO", "SLOW TRT inference: %.1fms", _trtMs);
        }
        double msInference = dbg ? elapsed() : 0;
        // --- 4. Transform output ---
        // Route by output tensor shape: >=2 outputs => segmentation;
        // rank <= 2 => classification; otherwise detection/OBB/pose in
        // either legacy ([B,C,anchors]) or end-to-end ([B,dets,fields]) form.
        std::vector<Object> results;
        bool isClassification = false;
        {
            std::lock_guard<std::recursive_mutex> lock(_mutex);
            const auto& outputDims = m_trtEngine->getOutputDims();
            const size_t numOutputs = outputDims.size();
            if (numOutputs >= 2) {
                std::vector<std::vector<float>> featureVector2d;
                Engine<float>::transformOutput(featureVectors, featureVector2d);
                double msTransform = dbg ? elapsed() : 0;
                int dim1 = outputDims[0].d[1];
                int dim2 = outputDims[0].d[2];
                // E2E heuristic: detection count dominates field count.
                if (dim1 > dim2 || dim2 <= 20)
                    results = PostprocessSegE2E(featureVector2d, camera_id, meta);
                else
                    results = PostprocessSegmentation(featureVector2d, camera_id, meta);
                if (dbg) {
                    double msPostprocess = elapsed();
                    _logger.LogInfo("ANSRTYOLO::DetectObjects",
                        "[DEBUG] Seg | " + std::string(usedNV12 ? "NV12" : "BGR") +
                        " | SetDev=" + std::to_string(msSetDevice) +
                        "ms Preproc=" + std::to_string(msPreprocess) +
                        "ms Inf=" + std::to_string(msInference) +
                        "ms Transform=" + std::to_string(msTransform) +
                        "ms Postproc=" + std::to_string(msPostprocess) +
                        "ms Det=" + std::to_string(results.size()),
                        __FILE__, __LINE__);
                }
            }
            else {
                std::vector<float> featureVector;
                Engine<float>::transformOutput(featureVectors, featureVector);
                double msTransform = dbg ? elapsed() : 0;
                if (outputDims[0].nbDims <= 2) {
                    // [B, numClasses]: classification head.
                    results = PostprocessClassify(featureVector, camera_id, meta);
                    isClassification = true;
                }
                else {
                    int dim1 = outputDims[0].d[1];
                    int dim2 = outputDims[0].d[2];
                    int nc = static_cast<int>(_classes.size());
                    const bool isEndToEnd = (dim1 > dim2) || (dim2 <= 20);
                    if (isEndToEnd) {
                        // Fields per detection: 6 = box+score+class,
                        // 7 adds angle (OBB), 6+3k adds k keypoints (pose).
                        if (dim2 == 6)
                            results = PostprocessDetectionE2E(featureVector, camera_id, meta);
                        else if (dim2 == 7)
                            results = PostprocessOBBE2E(featureVector, camera_id, meta);
                        else if (dim2 > 7 && (dim2 - 6) % 3 == 0)
                            results = PostprocessPoseE2E(featureVector, camera_id, meta);
                        else
                            results = PostprocessDetectionE2E(featureVector, camera_id, meta);
                    }
                    else {
                        // Legacy layout: 4 box channels + extra channels.
                        int extra = dim1 - 4;
                        bool routed = false;
                        if (nc > 0 && nc <= extra) {
                            if (extra == nc) {
                                results = PostprocessDetection(featureVector, camera_id, meta);
                                routed = true;
                            }
                            else if (extra == nc + 1) {
                                results = PostprocessOBB(featureVector, camera_id, meta);
                                routed = true;
                            }
                            else if ((extra - nc) % 3 == 0 && (extra - nc) >= 3) {
                                results = PostprocessPose(featureVector, camera_id, meta);
                                routed = true;
                            }
                        }
                        if (!routed) {
                            // Fallback: probe the last channel for radian-range
                            // values; a dominant share indicates OBB angles.
                            if (extra >= 2) {
                                cv::Mat probe = cv::Mat(dim1, dim2, CV_32F, featureVector.data()).t();
                                int lastCol = dim1 - 1;
                                int numSamples = std::min(dim2, 100);
                                int angleCount = 0;
                                for (int s = 0; s < numSamples; ++s) {
                                    float v = probe.at<float>(s, lastCol);
                                    if (v >= -3.15f && v <= 3.15f) ++angleCount;
                                }
                                if (angleCount > numSamples * 8 / 10) {
                                    results = PostprocessOBB(featureVector, camera_id, meta);
                                    routed = true;
                                }
                            }
                            // 56 = 4 box + 1 score + 17 COCO keypoints * 3.
                            if (!routed && dim1 == 56)
                                results = PostprocessPose(featureVector, camera_id, meta);
                            else if (!routed)
                                results = PostprocessDetection(featureVector, camera_id, meta);
                        }
                    }
                }
                if (dbg) {
                    double msPostprocess = elapsed();
                    _logger.LogInfo("ANSRTYOLO::DetectObjects",
                        "[DEBUG] " + camera_id +
                        " | " + std::string(usedNV12 ? "NV12" : "BGR") +
                        " | SetDev=" + std::to_string(msSetDevice) +
                        "ms Preproc=" + std::to_string(msPreprocess) +
                        "ms Inf=" + std::to_string(msInference) +
                        "ms Transform=" + std::to_string(msTransform) +
                        "ms Postproc=" + std::to_string(msPostprocess) +
                        "ms Det=" + std::to_string(results.size()) +
                        (isClassification ? " [classify]" : " [detect]"),
                        __FILE__, __LINE__);
                }
            }
        }
        // --- 4b. Rescale coords from full-res to display-res (BGR full-res path) ---
        // When ANSVideoPlayer provides full-res BGR via the registry, Preprocess
        // and Postprocess operate in full-res coordinates. But the caller passed
        // a display-res inputImage and expects coords in that space. Remap here.
        if (bgrFullResScaleX != 1.0f || bgrFullResScaleY != 1.0f) {
            for (auto& obj : results) {
                obj.box.x = static_cast<int>(obj.box.x * bgrFullResScaleX);
                obj.box.y = static_cast<int>(obj.box.y * bgrFullResScaleY);
                obj.box.width = static_cast<int>(obj.box.width * bgrFullResScaleX);
                obj.box.height = static_cast<int>(obj.box.height * bgrFullResScaleY);
                // Rescale polygon points if present (segmentation / OBB)
                for (auto& pt : obj.polygon) {
                    pt.x *= bgrFullResScaleX;
                    pt.y *= bgrFullResScaleY;
                }
                // Rescale keypoints if present (pose: x,y,conf triplets)
                for (size_t k = 0; k + 2 < obj.kps.size(); k += 3) {
                    obj.kps[k] *= bgrFullResScaleX;
                    obj.kps[k + 1] *= bgrFullResScaleY;
                }
            }
        }
        // --- 5. Tracking + Stabilization (not meaningful for classification) ---
        if (_trackerEnabled && !isClassification) {
            results = ApplyTracking(results, camera_id);
            double msTracking = dbg ? elapsed() : 0;
            if (_stabilizationEnabled) {
                results = StabilizeDetections(results, camera_id);
            }
            double msStabilize = dbg ? elapsed() : 0;
            if (dbg) {
                _logger.LogInfo("ANSRTYOLO::DetectObjects",
                    "[DEBUG] " + camera_id +
                    " | Tracking=" + std::to_string(msTracking) +
                    "ms Stabilize=" + std::to_string(msStabilize) + "ms",
                    __FILE__, __LINE__);
            }
        }
        // --- 6. Total pipeline time ---
        if (dbg) {
            double msTotal = std::chrono::duration<double, std::milli>(Clock::now() - t0).count();
            _logger.LogInfo("ANSRTYOLO::DetectObjects",
                "[DEBUG] " + camera_id + " | TOTAL=" + std::to_string(msTotal) +
                "ms (" + std::to_string(inputImage.cols) + "x" + std::to_string(inputImage.rows) +
                ") Results=" + std::to_string(results.size()),
                __FILE__, __LINE__);
        }
        return results;
    }
    catch (std::exception& e) {
        _logger.LogFatal("ANSRTYOLO::DetectObjects", e.what(), __FILE__, __LINE__);
        return {};
    }
}
// ====================================================================
// DetectObjectsBatch — batch inference with auto-detection
// ====================================================================
/// Batch variant of DetectObjects. Oversized batches are split recursively
/// into engine-capacity chunks; undersized batches are padded to the next
/// power of two (repeating the last frame) for better TRT shape reuse.
/// Postprocessing runs one std::async task per image; tracking is applied
/// per frame afterwards.
/// @param inputImages one BGR frame per batch slot (must be non-empty).
/// @param camera_id   camera identifier attached to every returned Object.
/// @return one detection vector per input image (same order); empty on failure.
std::vector<std::vector<Object>> ANSRTYOLO::DetectObjectsBatch(
    const std::vector<cv::Mat>& inputImages, const std::string& camera_id) {
    if (inputImages.empty()) {
        _logger.LogError("ANSRTYOLO::DetectObjectsBatch", "Empty input images vector", __FILE__, __LINE__);
        return {};
    }
    // Guard: engine must exist. Previously m_trtEngine was only checked
    // before setDeviceContext() and then dereferenced unconditionally
    // (runInference/getOutputDims), which is UB when no engine was built.
    if (!m_trtEngine) {
        _logger.LogError("ANSRTYOLO::DetectObjectsBatch", "TensorRT engine not initialized", __FILE__, __LINE__);
        return {};
    }
    // Auto-split if batch exceeds engine capacity.
    // (maxBatch is guaranteed >= 1 by the ternary below.)
    const int maxBatch = m_options.maxBatchSize > 0 ? m_options.maxBatchSize : 1;
    if (static_cast<int>(inputImages.size()) > maxBatch) {
        const size_t numImages = inputImages.size();
        std::vector<std::vector<Object>> allResults;
        allResults.reserve(numImages);
        for (size_t start = 0; start < numImages; start += static_cast<size_t>(maxBatch)) {
            const size_t end = std::min(start + static_cast<size_t>(maxBatch), numImages);
            std::vector<cv::Mat> chunk(inputImages.begin() + start, inputImages.begin() + end);
            auto chunkResults = DetectObjectsBatch(chunk, camera_id);
            if (chunkResults.size() == chunk.size()) {
                for (auto& r : chunkResults) allResults.push_back(std::move(r));
            }
            else {
                // Keep output aligned with input: pad missing slots with
                // empty result vectors so indices still correspond 1:1.
                _logger.LogError("ANSRTYOLO::DetectObjectsBatch",
                    "Chunk returned " + std::to_string(chunkResults.size()) +
                    " results, expected " + std::to_string(chunk.size()), __FILE__, __LINE__);
                for (auto& r : chunkResults) allResults.push_back(std::move(r));
                for (size_t pad = chunkResults.size(); pad < chunk.size(); ++pad)
                    allResults.push_back({});
            }
        }
        return allResults;
    }
    try {
        // --- Debug timer helper (zero-cost when _debugFlag == false) ---
        using Clock = std::chrono::steady_clock;
        const bool dbg = _debugFlag;
        auto t0 = dbg ? Clock::now() : Clock::time_point{};
        auto tPrev = t0;
        // Milliseconds since the previous elapsed() call; advances tPrev.
        auto elapsed = [&]() -> double {
            auto now = Clock::now();
            double ms = std::chrono::duration<double, std::milli>(now - tPrev).count();
            tPrev = now;
            return ms;
        };
        // Ensure correct GPU context for preprocessing (multi-GPU safety)
        m_trtEngine->setDeviceContext();
        double msSetDevice = dbg ? elapsed() : 0;
        // CUDA context health check (same as DetectObjects)
        if (!m_nv12Helper.isCudaContextHealthy(_logger, "ANSRTYOLO")) {
            return {};
        }
        const size_t realCount = inputImages.size();
        // Pad batch to next power-of-2 (capped at engine capacity) by
        // repeating the last frame; padded outputs are discarded later.
        size_t paddedCount = 1;
        while (paddedCount < realCount) paddedCount *= 2;
        paddedCount = std::min(paddedCount, static_cast<size_t>(maxBatch));
        const std::vector<cv::Mat>* batchPtr = &inputImages;
        std::vector<cv::Mat> paddedImages;
        if (paddedCount > realCount) {
            paddedImages.reserve(paddedCount);
            paddedImages.insert(paddedImages.end(), inputImages.begin(), inputImages.end());
            for (size_t p = realCount; p < paddedCount; ++p)
                paddedImages.push_back(inputImages.back());
            batchPtr = &paddedImages;
        }
        double msPad = dbg ? elapsed() : 0;
        BatchMetadata metadata;
        const auto inputs = PreprocessBatch(*batchPtr, metadata);
        double msPreprocess = dbg ? elapsed() : 0;
        if (inputs.empty() || inputs[0].empty()) {
            _logger.LogWarn("ANSRTYOLO::DetectObjectsBatch", "Skipped: preprocessing failed", __FILE__, __LINE__);
            return {};
        }
        // Check for prior CUDA errors before inference.
        cudaError_t priorErr = cudaGetLastError();
        if (priorErr != cudaSuccess) {
            _logger.LogWarn("ANSRTYOLO::DetectObjectsBatch",
                std::string("Cleared prior CUDA error before inference: ")
                + cudaGetErrorString(priorErr),
                __FILE__, __LINE__);
        }
        std::vector<std::vector<std::vector<float>>> featureVectors;
        auto succ = m_trtEngine->runInference(inputs, featureVectors);
        if (!succ) {
            cudaError_t postErr = cudaPeekAtLastError();
            std::string detail = "runInference returned false, batchSize="
                + std::to_string(inputs[0].size());
            if (postErr != cudaSuccess) {
                detail += ", CUDA error: ";
                detail += cudaGetErrorString(postErr);
            }
            _logger.LogError("ANSRTYOLO::DetectObjectsBatch", detail, __FILE__, __LINE__);
            return {};
        }
        double msInference = dbg ? elapsed() : 0;
        if (featureVectors.size() != paddedCount) {
            _logger.LogError("ANSRTYOLO::DetectObjectsBatch", "Output batch size mismatch", __FILE__, __LINE__);
            return {};
        }
        // Drop outputs produced by the padding frames.
        featureVectors.resize(realCount);
        const auto& outputDims = m_trtEngine->getOutputDims();
        const size_t numOutputs = outputDims.size();
        const size_t numBatch = featureVectors.size();
        // Determine task type once (same model for all images in batch)
        int dim1 = outputDims[0].d[1];
        int dim2 = outputDims[0].d[2];
        int nc = static_cast<int>(_classes.size());
        enum class TaskType { DetLegacy, DetE2E, OBBLegacy, OBBE2E,
                              SegLegacy, SegE2E, PoseLegacy, PoseE2E, Classify };
        TaskType taskType = TaskType::DetLegacy; // default
        // E2E: dim1 > dim2 (e.g. [B,300,6]); Legacy: dim1 < dim2 (e.g. [B,84,8400])
        const bool isEndToEnd = (dim1 > dim2) || (dim2 <= 20);
        if (numOutputs >= 2) {
            taskType = isEndToEnd ? TaskType::SegE2E : TaskType::SegLegacy;
        }
        else if (outputDims[0].nbDims <= 2) {
            taskType = TaskType::Classify;
        }
        else if (isEndToEnd) {
            // Fields per detection: 6 = box+score+class, 7 adds angle (OBB),
            // 6+3k adds k keypoints (pose).
            if (dim2 == 6) taskType = TaskType::DetE2E;
            else if (dim2 == 7) taskType = TaskType::OBBE2E;
            else if (dim2 > 7 && (dim2 - 6) % 3 == 0) taskType = TaskType::PoseE2E;
            else taskType = TaskType::DetE2E;
        }
        else {
            int extra = dim1 - 4;
            bool routed = false;
            // Try class-list-based routing first (only if class count fits within tensor)
            if (nc > 0 && nc <= extra) {
                if (extra == nc) { taskType = TaskType::DetLegacy; routed = true; }
                else if (extra == nc + 1) { taskType = TaskType::OBBLegacy; routed = true; }
                else if ((extra - nc) % 3 == 0 && (extra - nc) >= 3) { taskType = TaskType::PoseLegacy; routed = true; }
            }
            // Fallback: probe last channel for angle values to detect OBB
            if (!routed && extra >= 2 && !featureVectors.empty() && !featureVectors[0].empty() && !featureVectors[0][0].empty()) {
                // Transpose first image's feature vector and probe last column
                cv::Mat raw(dim1, dim2, CV_32F, const_cast<float*>(featureVectors[0][0].data()));
                cv::Mat probe;
                cv::transpose(raw, probe); // [dim2, dim1]
                int lastCol = dim1 - 1;
                int numSamples = std::min(dim2, 100);
                int angleCount = 0;
                for (int s = 0; s < numSamples; ++s) {
                    float v = probe.at<float>(s, lastCol);
                    if (v >= -3.15f && v <= 3.15f) ++angleCount;
                }
                // Dominant radian-range share in the last channel => OBB angles.
                if (angleCount > numSamples * 8 / 10) {
                    taskType = TaskType::OBBLegacy;
                    routed = true;
                }
            }
            if (!routed) {
                // 56 = 4 box + 1 score + 17 COCO keypoints * 3.
                if (dim1 == 56) taskType = TaskType::PoseLegacy;
                else taskType = TaskType::DetLegacy;
            }
        }
        // Process each image in parallel (one async postprocess task per slot)
        std::vector<std::vector<Object>> batchDetections(numBatch);
        std::vector<std::future<std::vector<Object>>> postFutures;
        postFutures.reserve(numBatch);
        for (size_t batchIdx = 0; batchIdx < numBatch; ++batchIdx) {
            const auto& batchOutput = featureVectors[batchIdx];
            ImageMetadata imgMeta;
            imgMeta.ratio = metadata.ratios[batchIdx];
            imgMeta.imgWidth = static_cast<float>(metadata.imgWidths[batchIdx]);
            imgMeta.imgHeight = static_cast<float>(metadata.imgHeights[batchIdx]);
            switch (taskType) {
            case TaskType::SegLegacy:
            case TaskType::SegE2E: {
                // Segmentation consumes all output heads (boxes + protos).
                std::vector<std::vector<float>> fv2d;
                fv2d.reserve(batchOutput.size());
                for (const auto& out : batchOutput) fv2d.push_back(out);
                if (taskType == TaskType::SegE2E) {
                    postFutures.push_back(std::async(std::launch::async,
                        [this, fv = std::move(fv2d), cid = camera_id, m = imgMeta]() mutable {
                            return PostprocessSegE2E(fv, cid, m);
                        }));
                }
                else {
                    postFutures.push_back(std::async(std::launch::async,
                        [this, fv = std::move(fv2d), cid = camera_id, m = imgMeta]() mutable {
                            return PostprocessSegmentation(fv, cid, m);
                        }));
                }
                break;
            }
            case TaskType::Classify: {
                std::vector<float> fv = batchOutput.empty() ? std::vector<float>{} : batchOutput[0];
                postFutures.push_back(std::async(std::launch::async,
                    [this, fv = std::move(fv), cid = camera_id, m = imgMeta]() mutable {
                        return PostprocessClassify(fv, cid, m);
                    }));
                break;
            }
            default: {
                // All single-output heads consume only the first tensor.
                std::vector<float> fv = batchOutput.empty() ? std::vector<float>{} : batchOutput[0];
                switch (taskType) {
                case TaskType::DetLegacy:
                    postFutures.push_back(std::async(std::launch::async,
                        [this, fv = std::move(fv), cid = camera_id, m = imgMeta]() mutable {
                            return PostprocessDetection(fv, cid, m);
                        }));
                    break;
                case TaskType::DetE2E:
                    postFutures.push_back(std::async(std::launch::async,
                        [this, fv = std::move(fv), cid = camera_id, m = imgMeta]() mutable {
                            return PostprocessDetectionE2E(fv, cid, m);
                        }));
                    break;
                case TaskType::OBBLegacy:
                    postFutures.push_back(std::async(std::launch::async,
                        [this, fv = std::move(fv), cid = camera_id, m = imgMeta]() mutable {
                            return PostprocessOBB(fv, cid, m);
                        }));
                    break;
                case TaskType::OBBE2E:
                    postFutures.push_back(std::async(std::launch::async,
                        [this, fv = std::move(fv), cid = camera_id, m = imgMeta]() mutable {
                            return PostprocessOBBE2E(fv, cid, m);
                        }));
                    break;
                case TaskType::PoseLegacy:
                    postFutures.push_back(std::async(std::launch::async,
                        [this, fv = std::move(fv), cid = camera_id, m = imgMeta]() mutable {
                            return PostprocessPose(fv, cid, m);
                        }));
                    break;
                case TaskType::PoseE2E:
                    postFutures.push_back(std::async(std::launch::async,
                        [this, fv = std::move(fv), cid = camera_id, m = imgMeta]() mutable {
                            return PostprocessPoseE2E(fv, cid, m);
                        }));
                    break;
                default:
                    postFutures.push_back(std::async(std::launch::async,
                        [this, fv = std::move(fv), cid = camera_id, m = imgMeta]() mutable {
                            return PostprocessDetection(fv, cid, m);
                        }));
                    break;
                }
                break;
            }
            }
        }
        // Gather results (futures complete in any order; indexing keeps alignment)
        for (size_t i = 0; i < numBatch; ++i)
            batchDetections[i] = postFutures[i].get();
        // Apply tracker per frame (skip for classification models)
        if (_trackerEnabled && taskType != TaskType::Classify) {
            for (auto& results : batchDetections) {
                if (!results.empty()) {
                    results = ApplyTracking(results, camera_id);
                    if (_stabilizationEnabled) {
                        results = StabilizeDetections(results, camera_id);
                    }
                }
            }
        }
        if (dbg) {
            double msPostprocess = elapsed();
            double msTotal = std::chrono::duration<double, std::milli>(Clock::now() - t0).count();
            _logger.LogInfo("ANSRTYOLO::DetectObjectsBatch",
                "[DEBUG] " + camera_id +
                " batch=" + std::to_string(realCount) +
                " | SetDev=" + std::to_string(msSetDevice) +
                "ms Pad=" + std::to_string(msPad) +
                "ms Preproc=" + std::to_string(msPreprocess) +
                "ms Inf=" + std::to_string(msInference) +
                "ms Postproc=" + std::to_string(msPostprocess) +
                "ms TOTAL=" + std::to_string(msTotal) + "ms",
                __FILE__, __LINE__);
        }
        return batchDetections;
    }
    catch (const std::exception& e) {
        _logger.LogFatal("ANSRTYOLO::DetectObjectsBatch", e.what(), __FILE__, __LINE__);
        return {};
    }
}
} // namespace ANSCENTER