// ANSCORE/modules/ANSODEngine/ANSRTYOLO.cpp
#include "ANSRTYOLO.h"
#include "Utility.h"
#include "ANSLicense.h" // ANS_DBG macro for DebugView
#include <future>
#include <numeric>
#include <cmath>
#include <cstring>
#include <opencv2/cudaimgproc.hpp>
#include <opencv2/cudawarping.hpp>
#include <opencv2/core/cuda_stream_accessor.hpp>
namespace ANSCENTER {
// ====================================================================
// ANSODBase interface — OptimizeModel
// ====================================================================
// Builds (or rebuilds) the optimized TensorRT engine file from the raw
// ONNX model on disk. On success `optimizedModelFolder` receives the
// directory that holds the generated engine file.
//
// Parameters:
//   fp16                 - build the engine with FP16 precision when true
//   optimizedModelFolder - out: folder containing the optimized engine
// Returns false on any validation, build, or runtime failure (logged).
bool ANSRTYOLO::OptimizeModel(bool fp16, std::string& optimizedModelFolder) {
    std::lock_guard<std::recursive_mutex> lock(_mutex);
    // Let the base class run its own validation first.
    if (!ANSODBase::OptimizeModel(fp16, optimizedModelFolder)) return false;
    if (!FileExist(_modelFilePath)) {
        _logger.LogFatal("ANSRTYOLO::OptimizeModel", "Raw model file path does not exist", __FILE__, __LINE__);
        return false;
    }
    try {
        _fp16 = fp16;
        optimizedModelFolder = GetParentFolder(_modelFilePath);
        if (!m_trtEngine) {
            // First build on this instance: mirror the model configuration
            // into the TensorRT engine options before constructing it.
            m_options.deviceIndex = _modelConfig.gpuDeviceIndex;
            m_options.optBatchSize = _modelConfig.gpuOptBatchSize;
            m_options.maxBatchSize = _modelConfig.gpuMaxBatchSize;
            m_options.minInputHeight = _modelConfig.minInputHeight;
            m_options.optInputHeight = _modelConfig.optInputHeight;
            m_options.maxInputHeight = _modelConfig.maxInputHeight;
            m_options.minInputWidth = _modelConfig.minInputWidth;
            m_options.optInputWidth = _modelConfig.optInputWidth;
            m_options.maxInputWidth = _modelConfig.maxInputWidth;
            m_options.engineFileDir = optimizedModelFolder;
            m_options.precision = _fp16 ? Precision::FP16 : Precision::FP32;
            m_trtEngine = std::make_shared<Engine<float>>(m_options);
        }
        const bool built = m_trtEngine->buildWithRetry(_modelFilePath, SUB_VALS, DIV_VALS, NORMALIZE);
        if (built) return true;
        _logger.LogError("ANSRTYOLO::OptimizeModel",
            "Error: Unable to build TensorRT engine. " + _modelFilePath, __FILE__, __LINE__);
        return false;
    }
    catch (std::exception& e) {
        _logger.LogFatal("ANSRTYOLO::OptimizeModel", e.what(), __FILE__, __LINE__);
        return false;
    }
}
// ====================================================================
// ANSODBase interface — LoadModel
// ====================================================================
// Loads a packaged model archive and prepares the TensorRT runtime state.
// The base class extracts the zip; this override normalises the config,
// mirrors it into the engine options, resolves the class-label list and
// (optionally) acquires a shared TensorRT engine from the pool.
//
// Parameters:
//   modelZipFilePath - path to the packaged model archive
//   modelZipPassword - password the base class uses to extract it
// Returns true on success; failures are logged and return false.
bool ANSRTYOLO::LoadModel(const std::string& modelZipFilePath,
const std::string& modelZipPassword) {
std::lock_guard<std::recursive_mutex> lock(_mutex);
try {
_isFixedBatch = false;
// Base class extracts the archive and populates _modelFolder/_modelConfigFile.
bool result = ANSODBase::LoadModel(modelZipFilePath, modelZipPassword);
if (!result) return false;
_modelConfig.modelType = ModelType::TENSORRT;
// Clamp configuration values to safe defaults.
if (_modelConfig.inpHeight <= 0) _modelConfig.inpHeight = 640;
if (_modelConfig.inpWidth <= 0) _modelConfig.inpWidth = 640;
if (_modelConfig.modelMNSThreshold < 0.2f) _modelConfig.modelMNSThreshold = 0.5f;
if (_modelConfig.modelConfThreshold < 0.2f) _modelConfig.modelConfThreshold = 0.5f;
// Keypoint count outside (0, 133] falls back to 17 (presumably the COCO
// pose layout — TODO confirm against the model zoo).
if (_modelConfig.numKPS <= 0 || _modelConfig.numKPS > 133)
_modelConfig.numKPS = 17;
if (_modelConfig.kpsThreshold <= 0) _modelConfig.kpsThreshold = 0.5f;
// FP16 is forced on for this backend regardless of the supplied config.
_fp16 = true;
// Post-processing constants (detection cap, mask decoding geometry).
TOP_K = 300;
SEG_CHANNELS = 32;
PROBABILITY_THRESHOLD = _modelConfig.detectionScoreThreshold;
NMS_THRESHOLD = _modelConfig.modelMNSThreshold;
SEGMENTATION_THRESHOLD = 0.5f;
SEG_H = 160;
SEG_W = 160;
NUM_KPS = _modelConfig.numKPS;
KPS_THRESHOLD = _modelConfig.kpsThreshold;
// Mirror the model configuration into the TensorRT engine options.
m_options.optBatchSize = _modelConfig.gpuOptBatchSize;
m_options.maxBatchSize = _modelConfig.gpuMaxBatchSize;
m_options.deviceIndex = _modelConfig.gpuDeviceIndex;
m_options.maxInputHeight = _modelConfig.maxInputHeight;
m_options.minInputHeight = _modelConfig.minInputHeight;
m_options.optInputHeight = _modelConfig.optInputHeight;
m_options.maxInputWidth = _modelConfig.maxInputWidth;
m_options.minInputWidth = _modelConfig.minInputWidth;
m_options.optInputWidth = _modelConfig.optInputWidth;
m_options.engineFileDir = _modelFolder;
m_options.precision = (_fp16 ? Precision::FP16 : Precision::FP32);
// Packaged models always use this fixed ONNX file name.
_modelFilePath = CreateFilePath(_modelFolder, "train_last.onnx");
if (FileExist(_modelConfigFile)) {
// Config file present: read classes plus optional input shape override.
ModelType modelType;
std::vector<int> inputShape;
_classes = ANSUtilityHelper::GetConfigFileContent(_modelConfigFile, modelType, inputShape);
if (inputShape.size() == 2) {
if (inputShape[0] > 0) _modelConfig.inpHeight = inputShape[0];
if (inputShape[1] > 0) _modelConfig.inpWidth = inputShape[1];
}
}
else {
// No config file: fall back to a classes.names file, then to the
// built-in class-string default.
_classFilePath = CreateFilePath(_modelFolder, "classes.names");
std::ifstream isValid(_classFilePath);
if (!isValid) LoadClassesFromString();
else LoadClassesFromFile();
}
if (this->_loadEngineOnCreation) {
if (!m_trtEngine) {
// Engines are shared between tasks via a pool keyed on
// (model path, precision, max batch size).
m_poolKey = { _modelFilePath,
static_cast<int>(m_options.precision), m_options.maxBatchSize };
m_trtEngine = EnginePoolManager<float>::instance().acquire(
m_poolKey, m_options, _modelFilePath,
SUB_VALS, DIV_VALS, NORMALIZE, m_maxSlotsPerGpu);
m_usingSharedPool = (m_trtEngine != nullptr);
}
if (!m_trtEngine) {
_logger.LogError("ANSRTYOLO::LoadModel",
"Error: Unable to load TensorRT engine. " + _modelFilePath, __FILE__, __LINE__);
_modelLoadValid = false;
return false;
}
// Adopt the batch sizes the (possibly pre-existing) engine was built with.
m_options.maxBatchSize = m_trtEngine->getOptions().maxBatchSize;
m_options.optBatchSize = m_trtEngine->getOptions().optBatchSize;
m_trtEngine->warmUp();
}
_modelLoadValid = true;
_isInitialized = true;
return true;
}
catch (std::exception& e) {
_logger.LogFatal("ANSRTYOLO::LoadModel", e.what(), __FILE__, __LINE__);
return false;
}
}
// ====================================================================
// ANSODBase interface — LoadModelFromFolder
// ====================================================================
// Loads a model directly from an on-disk folder (no zip extraction).
// Mirrors LoadModel but takes the model/class file names from the caller
// and reports the resolved class list back through `labelMap`.
//
// Parameters:
//   licenseKey  - runtime license forwarded to the base class
//   modelConfig - configuration copied into _modelConfig (then clamped)
//   modelName   - ONNX base name; empty falls back to "train_last"
//   className   - class-list file name used when no config file exists
//   modelFolder - folder containing the model files
//   labelMap    - out: comma-separated class names (cleared first)
// Returns true on success; failures are logged and return false.
bool ANSRTYOLO::LoadModelFromFolder(std::string licenseKey, ModelConfig modelConfig,
std::string modelName, std::string className,
const std::string& modelFolder, std::string& labelMap) {
std::lock_guard<std::recursive_mutex> lock(_mutex);
try {
_isFixedBatch = false;
// Base class validates the license and folder layout.
bool result = ANSODBase::LoadModelFromFolder(licenseKey, modelConfig,
modelName, className, modelFolder, labelMap);
if (!result) return false;
_modelConfig = modelConfig;
_modelConfig.modelType = ModelType::TENSORRT;
// Clamp configuration values to safe defaults.
if (_modelConfig.inpHeight <= 0) _modelConfig.inpHeight = 640;
if (_modelConfig.inpWidth <= 0) _modelConfig.inpWidth = 640;
_modelConfig.precisionType = PrecisionType::FP32;
// Keypoint count outside (0, 133] falls back to 17.
if (_modelConfig.numKPS <= 0 || _modelConfig.numKPS > 133)
_modelConfig.numKPS = 17;
if (_modelConfig.modelMNSThreshold < 0.2f) _modelConfig.modelMNSThreshold = 0.5f;
if (_modelConfig.modelConfThreshold < 0.2f) _modelConfig.modelConfThreshold = 0.5f;
if (_modelConfig.kpsThreshold <= 0) _modelConfig.kpsThreshold = 0.5f;
// FP16 is forced on for this backend (note: precisionType above says FP32;
// the engine actually builds with FP16 — see m_options.precision below).
_fp16 = true;
// Post-processing constants (detection cap, mask decoding geometry).
TOP_K = 300;
SEG_CHANNELS = 32;
PROBABILITY_THRESHOLD = _modelConfig.detectionScoreThreshold;
NMS_THRESHOLD = _modelConfig.modelMNSThreshold;
SEGMENTATION_THRESHOLD = 0.5f;
SEG_H = 160;
SEG_W = 160;
NUM_KPS = _modelConfig.numKPS;
KPS_THRESHOLD = _modelConfig.kpsThreshold;
// Resolve the ONNX file name from the caller-supplied model name.
std::string _modelName = modelName;
if (_modelName.empty()) _modelName = "train_last";
std::string modelFullName = _modelName + ".onnx";
// Mirror the model configuration into the TensorRT engine options.
m_options.optBatchSize = _modelConfig.gpuOptBatchSize;
m_options.maxBatchSize = _modelConfig.gpuMaxBatchSize;
m_options.deviceIndex = _modelConfig.gpuDeviceIndex;
m_options.maxInputHeight = _modelConfig.maxInputHeight;
m_options.minInputHeight = _modelConfig.minInputHeight;
m_options.optInputHeight = _modelConfig.optInputHeight;
m_options.maxInputWidth = _modelConfig.maxInputWidth;
m_options.minInputWidth = _modelConfig.minInputWidth;
m_options.optInputWidth = _modelConfig.optInputWidth;
m_options.engineFileDir = _modelFolder;
m_options.precision = (_fp16 ? Precision::FP16 : Precision::FP32);
_modelFilePath = CreateFilePath(_modelFolder, modelFullName);
if (FileExist(_modelConfigFile)) {
// Config file present: read classes plus optional input shape override.
ModelType modelType;
std::vector<int> inputShape;
_classes = ANSUtilityHelper::GetConfigFileContent(_modelConfigFile, modelType, inputShape);
if (inputShape.size() == 2) {
if (inputShape[0] > 0) _modelConfig.inpHeight = inputShape[0];
if (inputShape[1] > 0) _modelConfig.inpWidth = inputShape[1];
}
}
else {
// No config file: fall back to the caller-named class file, then to
// the built-in class-string default.
_classFilePath = CreateFilePath(_modelFolder, className);
std::ifstream isValid(_classFilePath);
if (!isValid) LoadClassesFromString();
else LoadClassesFromFile();
}
// Report the resolved class list back to the caller.
labelMap.clear();
if (!_classes.empty())
labelMap = VectorToCommaSeparatedString(_classes);
if (this->_loadEngineOnCreation) {
if (!m_trtEngine) {
// Engines are shared between tasks via a pool keyed on
// (model path, precision, max batch size).
m_poolKey = { _modelFilePath,
static_cast<int>(m_options.precision), m_options.maxBatchSize };
m_trtEngine = EnginePoolManager<float>::instance().acquire(
m_poolKey, m_options, _modelFilePath,
SUB_VALS, DIV_VALS, NORMALIZE, m_maxSlotsPerGpu);
m_usingSharedPool = (m_trtEngine != nullptr);
}
if (!m_trtEngine) {
_logger.LogError("ANSRTYOLO::LoadModelFromFolder",
"Error: Unable to load TensorRT engine. " + _modelFilePath, __FILE__, __LINE__);
_modelLoadValid = false;
return false;
}
// Adopt the batch sizes the (possibly pre-existing) engine was built with.
m_options.maxBatchSize = m_trtEngine->getOptions().maxBatchSize;
m_options.optBatchSize = m_trtEngine->getOptions().optBatchSize;
m_trtEngine->warmUp();
}
_modelLoadValid = true;
_isInitialized = true;
return true;
}
catch (std::exception& e) {
_logger.LogFatal("ANSRTYOLO::LoadModelFromFolder", e.what(), __FILE__, __LINE__);
return false;
}
}
// ====================================================================
// ANSODBase interface — Initialize
// ====================================================================
// (Re-)initializes the detector from a packaged model archive. Largely
// mirrors LoadModel, but supports being called on an already-initialized
// instance: when a healthy engine is already loaded it skips re-acquiring
// the engine pool (see `engineAlreadyLoaded` below).
//
// Parameters:
//   licenseKey       - runtime license forwarded to the base class
//   modelConfig      - configuration copied into _modelConfig (then clamped)
//   modelZipFilePath - path to the packaged model archive
//   modelZipPassword - password the base class uses to extract it
//   labelMap         - out: comma-separated class names (cleared first)
// Returns true on success; failures are logged and return false.
bool ANSRTYOLO::Initialize(std::string licenseKey, ModelConfig modelConfig,
const std::string& modelZipFilePath,
const std::string& modelZipPassword,
std::string& labelMap) {
std::lock_guard<std::recursive_mutex> lock(_mutex);
try {
// Remember whether a usable engine already exists so a re-Initialize
// does not re-acquire (and double-reference) the shared engine pool.
const bool engineAlreadyLoaded = _modelLoadValid && _isInitialized && m_trtEngine != nullptr;
// Mark invalid while re-configuring; restored at the end on success.
_modelLoadValid = false;
_isFixedBatch = false;
bool result = ANSODBase::Initialize(licenseKey, modelConfig,
modelZipFilePath, modelZipPassword, labelMap);
if (!result) return false;
_modelConfig = modelConfig;
_modelConfig.modelType = ModelType::TENSORRT;
// Clamp configuration values to safe defaults.
if (_modelConfig.inpHeight <= 0) _modelConfig.inpHeight = 640;
if (_modelConfig.inpWidth <= 0) _modelConfig.inpWidth = 640;
_modelConfig.precisionType = PrecisionType::FP32;
// Keypoint count outside (0, 133] falls back to 17.
if (_modelConfig.numKPS <= 0 || _modelConfig.numKPS > 133)
_modelConfig.numKPS = 17;
if (_modelConfig.modelMNSThreshold < 0.2f) _modelConfig.modelMNSThreshold = 0.5f;
if (_modelConfig.modelConfThreshold < 0.2f) _modelConfig.modelConfThreshold = 0.5f;
if (_modelConfig.kpsThreshold <= 0) _modelConfig.kpsThreshold = 0.5f;
// FP16 is forced on for this backend regardless of the supplied config.
_fp16 = true;
// Post-processing constants (detection cap, mask decoding geometry).
TOP_K = 300;
SEG_CHANNELS = 32;
PROBABILITY_THRESHOLD = _modelConfig.detectionScoreThreshold;
NMS_THRESHOLD = _modelConfig.modelMNSThreshold;
SEGMENTATION_THRESHOLD = 0.5f;
SEG_H = 160;
SEG_W = 160;
NUM_KPS = _modelConfig.numKPS;
KPS_THRESHOLD = _modelConfig.kpsThreshold;
// Mirror the model configuration into the TensorRT engine options.
m_options.optBatchSize = _modelConfig.gpuOptBatchSize;
m_options.maxBatchSize = _modelConfig.gpuMaxBatchSize;
m_options.deviceIndex = _modelConfig.gpuDeviceIndex;
m_options.maxInputHeight = _modelConfig.maxInputHeight;
m_options.minInputHeight = _modelConfig.minInputHeight;
m_options.optInputHeight = _modelConfig.optInputHeight;
m_options.maxInputWidth = _modelConfig.maxInputWidth;
m_options.minInputWidth = _modelConfig.minInputWidth;
m_options.optInputWidth = _modelConfig.optInputWidth;
m_options.engineFileDir = _modelFolder;
m_options.precision = (_fp16 ? Precision::FP16 : Precision::FP32);
// Packaged models always use this fixed ONNX file name.
_modelFilePath = CreateFilePath(_modelFolder, "train_last.onnx");
if (FileExist(_modelConfigFile)) {
// Config file present: read classes plus optional input shape override.
ModelType modelType;
std::vector<int> inputShape;
_classes = ANSUtilityHelper::GetConfigFileContent(_modelConfigFile, modelType, inputShape);
if (inputShape.size() == 2) {
if (inputShape[0] > 0) _modelConfig.inpHeight = inputShape[0];
if (inputShape[1] > 0) _modelConfig.inpWidth = inputShape[1];
}
}
else {
// No config file: fall back to a classes.names file, then to the
// built-in class-string default.
_classFilePath = CreateFilePath(_modelFolder, "classes.names");
std::ifstream isValid(_classFilePath);
if (!isValid) LoadClassesFromString();
else LoadClassesFromFile();
}
// Report the resolved class list back to the caller.
labelMap.clear();
if (!_classes.empty())
labelMap = VectorToCommaSeparatedString(_classes);
if (this->_loadEngineOnCreation && !engineAlreadyLoaded) {
if (!m_trtEngine) {
// Engines are shared between tasks via a pool keyed on
// (model path, precision, max batch size).
m_poolKey = { _modelFilePath,
static_cast<int>(m_options.precision), m_options.maxBatchSize };
m_trtEngine = EnginePoolManager<float>::instance().acquire(
m_poolKey, m_options, _modelFilePath,
SUB_VALS, DIV_VALS, NORMALIZE, m_maxSlotsPerGpu);
m_usingSharedPool = (m_trtEngine != nullptr);
}
if (!m_trtEngine) {
_logger.LogError("ANSRTYOLO::Initialize",
"Error: Unable to load TensorRT engine. " + _modelFilePath, __FILE__, __LINE__);
_modelLoadValid = false;
return false;
}
// Adopt the batch sizes the (possibly pre-existing) engine was built with.
m_options.maxBatchSize = m_trtEngine->getOptions().maxBatchSize;
m_options.optBatchSize = m_trtEngine->getOptions().optBatchSize;
m_trtEngine->warmUp();
}
_modelLoadValid = true;
_isInitialized = true;
return true;
}
catch (std::exception& e) {
_logger.LogFatal("ANSRTYOLO::Initialize", e.what(), __FILE__, __LINE__);
return false;
}
}
// ====================================================================
// RunInference / RunInferencesBatch / Destroy / Destructor
// ====================================================================
// Convenience overload: runs inference without a camera identifier by
// forwarding to the camera-aware variant with an empty id.
std::vector<Object> ANSRTYOLO::RunInference(const cv::Mat& inputImgBGR) {
    return RunInference(inputImgBGR, std::string());
}
// Runs single-image inference for the given camera. All shared-state
// preconditions are checked under the lock, which is released before the
// (potentially slow) detection call. Returns an empty vector on any
// precondition failure, degenerate input, or exception.
std::vector<Object> ANSRTYOLO::RunInference(const cv::Mat& inputImgBGR,
    const std::string& camera_id) {
    {
        std::lock_guard<std::recursive_mutex> guard(_mutex);
        // Collapse the precondition chain into a single failure message;
        // check order (model, license, init) matches the logging contract.
        const char* failure = nullptr;
        if (!_modelLoadValid) failure = "Cannot load TensorRT model";
        else if (!_licenseValid) failure = "Invalid license";
        else if (!_isInitialized) failure = "Model not initialized";
        if (failure) {
            _logger.LogError("ANSRTYOLO::RunInference", failure, __FILE__, __LINE__);
            return {};
        }
        // Silently skip empty or degenerate (tiny) frames.
        if (inputImgBGR.empty() || inputImgBGR.cols < 10 || inputImgBGR.rows < 10)
            return {};
    }
    try { return DetectObjects(inputImgBGR, camera_id); }
    catch (const std::exception& e) {
        _logger.LogFatal("ANSRTYOLO::RunInference", e.what(), __FILE__, __LINE__);
        return {};
    }
}
// Runs batch inference over a vector of images. Preconditions are checked
// under the lock, which is released before dispatching to either the
// base-class fixed-batch path or the dynamic batch detector. Returns an
// empty vector on any precondition failure or exception.
std::vector<std::vector<Object>> ANSRTYOLO::RunInferencesBatch(
    const std::vector<cv::Mat>& inputs, const std::string& camera_id) {
    {
        std::lock_guard<std::recursive_mutex> guard(_mutex);
        // Collapse the precondition chain into a single fatal-log message;
        // check order (model, license, init, input) matches the contract.
        const char* failure = nullptr;
        if (!_modelLoadValid) failure = "Cannot load the TensorRT model";
        else if (!_licenseValid) failure = "Runtime license is not valid or expired";
        else if (!_isInitialized) failure = "Initialisation is not valid";
        else if (inputs.empty()) failure = "Input images vector is empty";
        if (failure) {
            _logger.LogFatal("ANSRTYOLO::RunInferencesBatch", failure, __FILE__, __LINE__);
            return {};
        }
    }
    try {
        // Fixed-batch engines go through the base-class batching path.
        return _isFixedBatch ? ANSODBase::RunInferencesBatch(inputs, camera_id)
                             : DetectObjectsBatch(inputs, camera_id);
    }
    catch (const std::exception& e) {
        _logger.LogFatal("ANSRTYOLO::RunInferencesBatch", e.what(), __FILE__, __LINE__);
        return {};
    }
}
// Destructor: releases the engine via Destroy(). A destructor must never
// let an exception escape (that would call std::terminate), so in addition
// to catching std::exception by const reference (was non-const), a
// catch-all guards against non-standard exception types from Destroy().
ANSRTYOLO::~ANSRTYOLO() {
    try { Destroy(); }
    catch (const std::exception& e) {
        _logger.LogError("ANSRTYOLO::~ANSRTYOLO()", e.what(), __FILE__, __LINE__);
    }
    catch (...) {
        // Non-std exception: swallow it — logging here is best-effort only.
    }
}
// Releases the TensorRT engine — either this instance's reference into the
// shared engine pool, or a privately-owned engine — and tears down the
// NV12 preprocessing helper. Returns false only if cleanup itself throws.
// Fix: catch std::exception by const reference (Core Guidelines E.15).
bool ANSRTYOLO::Destroy() {
    try {
        if (m_usingSharedPool) {
            // Release our reference to the shared pool.
            // Pool is destroyed only when all tasks release it.
            EnginePoolManager<float>::instance().release(m_poolKey);
            m_trtEngine.reset(); // drop shared_ptr (pool may survive)
            m_usingSharedPool = false;
        }
        else {
            // Privately-owned engine: this reset destroys it.
            m_trtEngine.reset();
        }
        m_nv12Helper.destroy();
        return true;
    }
    catch (const std::exception& e) {
        _logger.LogError("ANSRTYOLO::Destroy()", e.what(), __FILE__, __LINE__);
        return false;
    }
}
// ====================================================================
// Preprocessing — single image
//
// Current implementation resizes + converts BGR→RGB on the CPU, then
// uploads only the small model-input-sized image to the GPU. This cuts
// the PCIe transfer from ~24 MB (4K BGR) to ~1.2 MB and avoids the
// concurrent-upload contention described in the function body.
// (This header previously described a pinned-memory full-resolution
// upload path that the function no longer uses.)
// ====================================================================
// Preprocesses one BGR (or grayscale) frame into the GPU tensor layout the
// engine expects: CPU resize/letterbox to model input size, CPU BGR→RGB,
// then a single small GPU upload. Fills `outMeta` with the source
// dimensions and the letterbox ratio used to map detections back.
// Returns {} on license failure, dead CUDA context, bad input, or error.
// Fixes: removed stray blame-timestamp lines embedded in the body (not
// valid C++), and added the same unsupported-channel guard that
// PreprocessBatch already has (previously a 4-channel input relied on
// cv::cvtColor throwing).
std::vector<std::vector<cv::cuda::GpuMat>> ANSRTYOLO::Preprocess(
    const cv::Mat& inputImage, ImageMetadata& outMeta) {
    std::lock_guard<std::recursive_mutex> lock(_mutex);
    try {
        if (!_licenseValid) {
            _logger.LogFatal("ANSRTYOLO::Preprocess", "Invalid license", __FILE__, __LINE__);
            return {};
        }
        const auto& inputDims = m_trtEngine->getInputDims();
        const int inputH = inputDims[0].d[1];
        const int inputW = inputDims[0].d[2];
        // Early-out if CUDA context is dead (sticky error from CUVID crash etc.)
        if (!m_nv12Helper.isCudaContextHealthy(_logger, "ANSRTYOLO")) return {};
        // --- CPU preprocessing: resize + BGR→RGB before GPU upload ---
        // Reduces PCIe transfer from 25 MB (4K BGR) to 1.2 MB (640×640 RGB).
        // With 12 AI tasks uploading concurrently, this eliminates the WDDM
        // SRW lock convoy that causes 400-580ms preprocess spikes.
        cv::Mat srcImg = inputImage;
        if (srcImg.channels() == 1) {
            cv::cvtColor(srcImg, srcImg, cv::COLOR_GRAY2BGR);
        } else if (srcImg.channels() != 3) {
            // Consistent with PreprocessBatch: reject unsupported layouts
            // up front instead of letting cv::cvtColor throw below.
            _logger.LogError("ANSRTYOLO::Preprocess",
                "Unsupported channel count", __FILE__, __LINE__);
            return {};
        }
        outMeta.imgHeight = static_cast<float>(srcImg.rows);
        outMeta.imgWidth = static_cast<float>(srcImg.cols);
        if (outMeta.imgHeight > 0 && outMeta.imgWidth > 0) {
            // Inverse of the letterbox scale: maps model-space coordinates
            // back to source-image coordinates in postprocessing.
            outMeta.ratio = 1.f / std::min(
                inputW / static_cast<float>(srcImg.cols),
                inputH / static_cast<float>(srcImg.rows));
            const auto& outputDims = m_trtEngine->getOutputDims();
            // Heuristic: an output tensor with <= 2 dims is a classification
            // head, which takes a plain resize instead of a letterbox.
            const bool isClassification = !outputDims.empty() && outputDims[0].nbDims <= 2;
            // CPU resize to model input size
            cv::Mat cpuResized;
            if (srcImg.rows != inputH || srcImg.cols != inputW) {
                if (isClassification) {
                    cv::resize(srcImg, cpuResized, cv::Size(inputW, inputH), 0, 0, cv::INTER_LINEAR);
                } else {
                    cpuResized = Engine<float>::cpuResizeKeepAspectRatioPadRightBottom(srcImg, inputH, inputW);
                }
            } else {
                cpuResized = srcImg;
            }
            // CPU BGR → RGB
            cv::Mat cpuRGB;
            cv::cvtColor(cpuResized, cpuRGB, cv::COLOR_BGR2RGB);
            // Upload small image to GPU (1.2 MB instead of 25 MB for 4K)
            cv::cuda::Stream stream;
            cv::cuda::GpuMat gpuResized;
            gpuResized.upload(cpuRGB, stream);
            stream.waitForCompletion();
            std::vector<cv::cuda::GpuMat> input{ std::move(gpuResized) };
            std::vector<std::vector<cv::cuda::GpuMat>> inputs{ std::move(input) };
            return inputs;
        }
        else {
            _logger.LogFatal("ANSRTYOLO::Preprocess",
                "Image height or width is zero (Width: " + std::to_string(outMeta.imgWidth) +
                ", Height: " + std::to_string(outMeta.imgHeight) + ")", __FILE__, __LINE__);
            return {};
        }
    }
    catch (const std::exception& e) {
        _logger.LogWarn("ANSRTYOLO::Preprocess", std::string("Skipped frame: ") + e.what(), __FILE__, __LINE__);
        return {};
    }
}
#if 0 // PreprocessFromNV12 — moved to NV12PreprocessHelper::tryNV12()
try {
if (!gpuData || !gpuData->yPlane || !gpuData->uvPlane) {
if (!m_nv12NullLogged) {
m_nv12NullLogged = true;
_logger.LogWarn("ANSRTYOLO::PreprocessFromNV12",
"Early exit: null data — gpuData=" + std::to_string(gpuData != nullptr) +
" yPlane=" + std::to_string(gpuData ? (gpuData->yPlane != nullptr) : false) +
" uvPlane=" + std::to_string(gpuData ? (gpuData->uvPlane != nullptr) : false) +
" isCuda=" + std::to_string(gpuData ? gpuData->isCudaDevicePtr : false),
__FILE__, __LINE__);
}
return {};
}
const auto& inputDims = m_trtEngine->getInputDims();
const int inputH = inputDims[0].d[1];
const int inputW = inputDims[0].d[2];
const int frameW = gpuData->width;
const int frameH = gpuData->height;
if (frameW <= 0 || frameH <= 0) {
if (!m_nv12DimLogged) {
m_nv12DimLogged = true;
_logger.LogWarn("ANSRTYOLO::PreprocessFromNV12",
"Early exit: bad dimensions — w=" + std::to_string(frameW) +
" h=" + std::to_string(frameH),
__FILE__, __LINE__);
}
return {};
}
// Early-out if CUDA context is dead (sticky error from CUVID crash etc.)
if (m_cudaContextDead) {
if (!m_nv12DeadLogged) {
m_nv12DeadLogged = true;
_logger.LogWarn("ANSRTYOLO::PreprocessFromNV12",
"Early exit: CUDA context dead",
__FILE__, __LINE__);
}
return {};
}
// Cache flag before lock is released — gpuData may be invalidated after unlock
const bool isCudaDevice = gpuData->isCudaDevicePtr;
// ── GPU index validation for zero-copy ──
// NVDEC device pointers are only valid on the CUDA context that decoded them.
// If decode GPU != inference GPU, wrapping those pointers causes
// "illegal memory access" → sticky CUDA error → entire context dies.
// Fall back to CPU memcpy+upload path when GPUs don't match.
const int inferenceGpu = m_trtEngine ? m_trtEngine->getPreferredDeviceIndex() : 0;
const bool gpuMatch = !isCudaDevice ||
gpuData->gpuIndex < 0 || // unknown = trust it
gpuData->gpuIndex == inferenceGpu;
const bool useZeroCopy = isCudaDevice && gpuMatch;
// Local plane pointers — default to gpuData's primary planes.
// Overridden below for cross-GPU fallback (CPU NV12 instead of CUDA).
uint8_t* effYPlane = gpuData->yPlane;
uint8_t* effUvPlane = gpuData->uvPlane;
int effYLinesize = gpuData->yLinesize;
int effUvLinesize = gpuData->uvLinesize;
if (isCudaDevice && !gpuMatch) {
// Cross-GPU: NV12 decoded on one GPU, inference on another.
// CPU NV12 fallback uploads full decode-res NV12 (e.g. 3840x2160 = 12.4 MB)
// over PCIe, which is SLOWER than BGR at display-res (1920x1080 = 6.2 MB).
// Measured: CPU NV12 cross-GPU = 15-39ms preproc vs BGR = 10-20ms.
// Just fall back to BGR — it's faster for the cross-GPU case.
if (!m_gpuMismatchLogged) {
m_gpuMismatchLogged = true;
_logger.LogInfo("ANSRTYOLO::PreprocessFromNV12",
"GPU mismatch (decode GPU " + std::to_string(gpuData->gpuIndex) +
" vs inference GPU " + std::to_string(inferenceGpu) +
") — skipping NV12, using BGR (faster for cross-GPU: "
"BGR uploads " + std::to_string(displayW * displayH * 3 / 1024) +
"KB display-res vs NV12 " + std::to_string(frameW * frameH * 3 / 2 / 1024) +
"KB full-res)",
__FILE__, __LINE__);
}
if (regLock.owns_lock()) regLock.unlock();
return {}; // caller will use Preprocess(BGR) instead
}
// Diagnostic: log which path will be taken (once per instance)
// Note: cross-GPU case already returned {} above, so reaching here
// means either CUDA zero-copy (same GPU) or CPU NV12 upload (non-CUDA).
if (!m_nv12PathLogged) {
m_nv12PathLogged = true;
const char* pathName = useZeroCopy ? "CUDA_ZERO_COPY"
: "CPU_NV12_UPLOAD";
_logger.LogInfo("ANSRTYOLO::PreprocessFromNV12",
std::string("Path: ") + pathName +
" | isCuda=" + std::to_string(isCudaDevice) +
" gpuMatch=" + std::to_string(gpuMatch) +
" decodeGpu=" + std::to_string(gpuData->gpuIndex) +
" infGpu=" + std::to_string(inferenceGpu) +
" frame=" + std::to_string(frameW) + "x" + std::to_string(frameH) +
" effYLine=" + std::to_string(effYLinesize) +
" effUvLine=" + std::to_string(effUvLinesize) +
" effYPtr=0x" + std::to_string(reinterpret_cast<uintptr_t>(effYPlane)) +
" hasCpuFallback=" + std::to_string(gpuData->cpuYPlane != nullptr),
__FILE__, __LINE__);
}
cv::cuda::Stream stream;
cv::cuda::GpuMat gpuY, gpuUV;
if (useZeroCopy) {
// ── CUDA zero-copy: wrap NVDEC device pointers directly ──
// No memcpy, no device-to-device copy — data stays in NVDEC VRAM.
// The fused letterbox kernel samples only ~409K pixels from the 4K
// source (vs 8.3M full copy), completing in <1ms on RTX 5080.
// We hold the registry lock until the kernel finishes reading.
gpuY = cv::cuda::GpuMat(frameH, frameW, CV_8UC1,
effYPlane, static_cast<size_t>(effYLinesize));
gpuUV = cv::cuda::GpuMat(frameH / 2, frameW, CV_8UC1,
effUvPlane, static_cast<size_t>(effUvLinesize));
// Lock released after kernel completion (stream.waitForCompletion below)
} else {
// ── CPU path: memcpy + upload (fallback for D3D11VA / sw decode) ──
// Hold registry lock during memcpy so the AVFrame can't be freed
// by another thread calling gpu_frame_attach() on the same key.
const size_t ySize = static_cast<size_t>(frameW) * frameH;
const size_t uvSize = static_cast<size_t>(frameW) * frameH / 2;
const size_t nv12Size = ySize + uvSize;
ensurePinnedBuffer(nv12Size);
if (!m_pinnedBuf) {
if (!m_nv12PinnedLogged) {
m_nv12PinnedLogged = true;
_logger.LogWarn("ANSRTYOLO::PreprocessFromNV12",
"Early exit: pinned buffer alloc failed for " +
std::to_string(nv12Size) + " bytes",
__FILE__, __LINE__);
}
return {};
}
// Validate NV12 plane pointers before memcpy
const size_t yBufNeeded = (effYLinesize == frameW)
? ySize
: static_cast<size_t>(effYLinesize) * frameH;
const size_t uvBufNeeded = (effUvLinesize == frameW)
? uvSize
: static_cast<size_t>(effUvLinesize) * (frameH / 2);
if (!isMemoryReadable(effYPlane, std::min(yBufNeeded, (size_t)4096)) ||
!isMemoryReadable(effUvPlane, std::min(uvBufNeeded, (size_t)4096))) {
_logger.LogWarn("ANSRTYOLO::PreprocessFromNV12",
"NV12 plane pointers not readable! yPlane=0x" +
std::to_string(reinterpret_cast<uintptr_t>(effYPlane)) +
" uvPlane=0x" +
std::to_string(reinterpret_cast<uintptr_t>(effUvPlane)) +
" yLinesize=" + std::to_string(effYLinesize) +
" uvLinesize=" + std::to_string(effUvLinesize) +
" w=" + std::to_string(frameW) + " h=" + std::to_string(frameH),
__FILE__, __LINE__);
if (regLock.owns_lock()) regLock.unlock();
return {}; // fall back to BGR
}
uint8_t* dst = static_cast<uint8_t*>(m_pinnedBuf);
bool cpyOk = true;
if (effYLinesize == frameW) {
cpyOk = safeMemcpy(dst, effYPlane, ySize);
} else {
for (int row = 0; row < frameH && cpyOk; row++)
cpyOk = safeMemcpy(dst + row * frameW,
effYPlane + row * effYLinesize, frameW);
}
if (cpyOk) {
uint8_t* uvDst = dst + ySize;
if (effUvLinesize == frameW) {
cpyOk = safeMemcpy(uvDst, effUvPlane, uvSize);
} else {
for (int row = 0; row < frameH / 2 && cpyOk; row++)
cpyOk = safeMemcpy(uvDst + row * frameW,
effUvPlane + row * effUvLinesize, frameW);
}
}
if (!cpyOk) {
_logger.LogWarn("ANSRTYOLO::PreprocessFromNV12",
"Access violation during NV12 memcpy! Falling back to BGR. "
"yPlane=0x" + std::to_string(reinterpret_cast<uintptr_t>(effYPlane)) +
" uvPlane=0x" + std::to_string(reinterpret_cast<uintptr_t>(effUvPlane)) +
" yLinesize=" + std::to_string(effYLinesize) +
" uvLinesize=" + std::to_string(effUvLinesize) +
" w=" + std::to_string(frameW) + " h=" + std::to_string(frameH) +
" avframe=0x" + std::to_string(reinterpret_cast<uintptr_t>(gpuData->avframe)),
__FILE__, __LINE__);
if (regLock.owns_lock()) regLock.unlock();
return {}; // fall back to BGR
}
// NV12 data safely in pinned memory — release registry lock.
// From here on we only read from m_pinnedBuf, not from gpuData.
if (regLock.owns_lock()) regLock.unlock();
cv::Mat pinnedY(frameH, frameW, CV_8UC1, m_pinnedBuf);
cv::Mat pinnedUV(frameH / 2, frameW, CV_8UC1,
static_cast<uint8_t*>(m_pinnedBuf) + ySize);
gpuY.upload(pinnedY, stream);
gpuUV.upload(pinnedUV, stream);
}
// Use display dimensions for coordinate mapping so postprocessed
// bboxes map to the display image (1080p), not the NV12 source (4K).
const float metaW = (displayW > 0) ? static_cast<float>(displayW) : static_cast<float>(frameW);
const float metaH = (displayH > 0) ? static_cast<float>(displayH) : static_cast<float>(frameH);
outMeta.imgWidth = metaW;
outMeta.imgHeight = metaH;
if (outMeta.imgHeight > 0 && outMeta.imgWidth > 0) {
outMeta.ratio = 1.f / std::min(
inputDims[0].d[2] / metaW,
inputDims[0].d[1] / metaH);
const auto& outputDims = m_trtEngine->getOutputDims();
const bool isClassification = !outputDims.empty() && outputDims[0].nbDims <= 2;
cudaStream_t rawStream = cv::cuda::StreamAccessor::getStream(stream);
cv::cuda::GpuMat gpuResized;
if (isClassification) {
// Classification: NV12→RGB at full resolution, then simple resize
cv::cuda::GpuMat gpuRGB(frameH, frameW, CV_8UC3);
launchNV12ToRGB(
gpuY.ptr<uint8_t>(), static_cast<int>(gpuY.step),
gpuUV.ptr<uint8_t>(), static_cast<int>(gpuUV.step),
gpuRGB.ptr<uint8_t>(), static_cast<int>(gpuRGB.step),
frameW, frameH, rawStream);
cv::cuda::resize(gpuRGB, gpuResized, cv::Size(inputW, inputH),
0, 0, cv::INTER_LINEAR, stream);
} else if (frameW == inputW && frameH == inputH) {
// Source matches model input — direct NV12→RGB, no resize needed
gpuResized.create(inputH, inputW, CV_8UC3);
launchNV12ToRGB(
gpuY.ptr<uint8_t>(), static_cast<int>(gpuY.step),
gpuUV.ptr<uint8_t>(), static_cast<int>(gpuUV.step),
gpuResized.ptr<uint8_t>(), static_cast<int>(gpuResized.step),
frameW, frameH, rawStream);
} else {
// Detection: fused NV12→RGB + letterbox in a SINGLE kernel at
// output resolution (e.g. 640×640). This avoids the 24MB 4K RGB
// intermediate and processes 20× fewer pixels than separate
// convert + resize for 4K→640 downscale.
float r = std::min(static_cast<float>(inputW) / frameW,
static_cast<float>(inputH) / frameH);
int unpadW = static_cast<int>(r * frameW);
int unpadH = static_cast<int>(r * frameH);
float invScale = 1.0f / r; // maps output coords → source coords
gpuResized.create(inputH, inputW, CV_8UC3);
launchNV12ToRGBLetterbox(
gpuY.ptr<uint8_t>(), static_cast<int>(gpuY.step),
gpuUV.ptr<uint8_t>(), static_cast<int>(gpuUV.step),
gpuResized.ptr<uint8_t>(), static_cast<int>(gpuResized.step),
inputW, inputH,
frameW, frameH,
unpadW, unpadH,
invScale, rawStream);
}
stream.waitForCompletion();
// Release registry lock now that kernel is done reading NVDEC pointers
if (regLock.owns_lock()) regLock.unlock();
// Log NV12 fast-path usage once per instance
if (!m_nv12ActiveLogged) {
m_nv12ActiveLogged = true;
const char* mode = useZeroCopy ? "CUDA zero-copy" : "CPU upload";
const char* kernel = isClassification ? "separate" : "FUSED letterbox";
_logger.LogInfo("ANSRTYOLO::PreprocessFromNV12",
std::string(mode) + " ACTIVE (" + kernel + "): " +
std::to_string(frameW) + "x" + std::to_string(frameH) +
" NV12 -> " + std::to_string(inputW) + "x" + std::to_string(inputH) +
" display=" + std::to_string(displayW) + "x" + std::to_string(displayH),
__FILE__, __LINE__);
}
std::vector<cv::cuda::GpuMat> input{ std::move(gpuResized) };
std::vector<std::vector<cv::cuda::GpuMat>> inputs{ std::move(input) };
return inputs;
}
{
if (!m_nv12MetaLogged) {
m_nv12MetaLogged = true;
_logger.LogWarn("ANSRTYOLO::PreprocessFromNV12",
"Early exit: metadata dims invalid — metaW=" +
std::to_string(outMeta.imgWidth) + " metaH=" +
std::to_string(outMeta.imgHeight) +
" displayW=" + std::to_string(displayW) +
" displayH=" + std::to_string(displayH),
__FILE__, __LINE__);
}
}
return {};
}
catch (const std::exception& e) {
_logger.LogWarn("ANSRTYOLO::PreprocessFromNV12",
std::string("NV12 fast path failed, falling back to BGR: ") + e.what(),
__FILE__, __LINE__);
return {};
}
}
#endif // PreprocessFromNV12 moved to NV12PreprocessHelper
// ====================================================================
// GPU Preprocessing — batch
// ====================================================================
std::vector<std::vector<cv::cuda::GpuMat>> ANSRTYOLO::PreprocessBatch(
const std::vector<cv::Mat>& inputImages, BatchMetadata& outMetadata) {
if (!_licenseValid) {
_logger.LogError("ANSRTYOLO::PreprocessBatch", "Invalid license", __FILE__, __LINE__);
return {};
}
if (inputImages.empty()) {
_logger.LogError("ANSRTYOLO::PreprocessBatch", "Empty input images vector", __FILE__, __LINE__);
return {};
}
if (!m_nv12Helper.isCudaContextHealthy(_logger, "ANSRTYOLO")) return {};
try {
const auto& inputDims = m_trtEngine->getInputDims();
if (inputDims.empty()) {
_logger.LogError("ANSRTYOLO::PreprocessBatch", "No input dimensions available", __FILE__, __LINE__);
return {};
}
const int inputH = inputDims[0].d[1];
const int inputW = inputDims[0].d[2];
if (inputH <= 0 || inputW <= 0) {
_logger.LogError("ANSRTYOLO::PreprocessBatch", "Invalid model input dimensions", __FILE__, __LINE__);
return {};
}
outMetadata.imgHeights.resize(inputImages.size());
outMetadata.imgWidths.resize(inputImages.size());
outMetadata.ratios.resize(inputImages.size());
std::vector<cv::cuda::GpuMat> batchProcessed;
batchProcessed.reserve(inputImages.size());
cv::cuda::Stream stream;
for (size_t i = 0; i < inputImages.size(); ++i) {
const auto& inputImage = inputImages[i];
if (inputImage.empty()) {
_logger.LogError("ANSRTYOLO::PreprocessBatch",
"Empty input image at index " + std::to_string(i), __FILE__, __LINE__);
return {};
}
// CPU preprocessing: resize + BGR→RGB before GPU upload
cv::Mat srcImg = inputImage;
if (srcImg.channels() == 1) {
cv::cvtColor(srcImg, srcImg, cv::COLOR_GRAY2BGR);
} else if (srcImg.channels() != 3) {
2026-03-28 16:54:11 +11:00
_logger.LogError("ANSRTYOLO::PreprocessBatch",
"Unsupported channel count at index " + std::to_string(i), __FILE__, __LINE__);
return {};
}
outMetadata.imgHeights[i] = srcImg.rows;
outMetadata.imgWidths[i] = srcImg.cols;
2026-03-28 16:54:11 +11:00
if (outMetadata.imgHeights[i] <= 0 || outMetadata.imgWidths[i] <= 0) {
_logger.LogError("ANSRTYOLO::PreprocessBatch",
"Invalid dimensions for image " + std::to_string(i), __FILE__, __LINE__);
return {};
}
const auto& outputDims = m_trtEngine->getOutputDims();
const bool isClassification = !outputDims.empty() && outputDims[0].nbDims <= 2;
const float scaleW = inputW / static_cast<float>(srcImg.cols);
const float scaleH = inputH / static_cast<float>(srcImg.rows);
2026-03-28 16:54:11 +11:00
outMetadata.ratios[i] = isClassification ? 1.f : 1.f / std::min(scaleW, scaleH);
cv::Mat cpuResized;
if (srcImg.rows != inputH || srcImg.cols != inputW) {
2026-03-28 16:54:11 +11:00
if (isClassification) {
cv::resize(srcImg, cpuResized, cv::Size(inputW, inputH), 0, 0, cv::INTER_LINEAR);
2026-03-28 16:54:11 +11:00
} else {
cpuResized = Engine<float>::cpuResizeKeepAspectRatioPadRightBottom(srcImg, inputH, inputW);
2026-03-28 16:54:11 +11:00
}
} else {
cpuResized = srcImg;
2026-03-28 16:54:11 +11:00
}
cv::Mat cpuRGB;
cv::cvtColor(cpuResized, cpuRGB, cv::COLOR_BGR2RGB);
cv::cuda::GpuMat gpuResized;
gpuResized.upload(cpuRGB, stream);
batchProcessed.push_back(std::move(gpuResized));
2026-03-28 16:54:11 +11:00
}
stream.waitForCompletion();
std::vector<std::vector<cv::cuda::GpuMat>> inputs;
inputs.push_back(std::move(batchProcessed));
return inputs;
}
catch (const std::exception& e) {
_logger.LogWarn("ANSRTYOLO::PreprocessBatch", std::string("Skipped batch: ") + e.what(), __FILE__, __LINE__);
return {};
}
}
// ====================================================================
// OBB NMS helpers (Prob-IoU based) — static methods
// ====================================================================
void ANSRTYOLO::getCovarianceComponents(const OrientedBox& box,
float& out1, float& out2, float& out3) {
if (box.width <= 0.f || box.height <= 0.f) {
out1 = out2 = out3 = 0.f;
return;
}
const float vw = (box.width * box.width) / 12.0f;
const float vh = (box.height * box.height) / 12.0f;
const float cosT = std::cos(box.angle);
const float sinT = std::sin(box.angle);
const float cos2 = cosT * cosT;
const float sin2 = sinT * sinT;
const float sc = sinT * cosT;
out1 = vw * cos2 + vh * sin2;
out2 = vw * sin2 + vh * cos2;
out3 = (vw - vh) * sc;
}
std::vector<std::vector<float>> ANSRTYOLO::batchProbiou(
const std::vector<OrientedBox>& obb1,
const std::vector<OrientedBox>& obb2, float eps) {
if (obb1.empty() || obb2.empty()) return {};
const size_t n1 = obb1.size(), n2 = obb2.size();
std::vector<std::vector<float>> iouMat(n1, std::vector<float>(n2, 0.f));
struct CovData { float x, y, a, b, c; };
std::vector<CovData> cov1(n1);
for (size_t i = 0; i < n1; ++i) {
float a, b, c;
getCovarianceComponents(obb1[i], a, b, c);
cov1[i] = { obb1[i].x, obb1[i].y, a, b, c };
}
for (size_t i = 0; i < n1; ++i) {
for (size_t j = 0; j < n2; ++j) {
float a2, b2, c2;
getCovarianceComponents(obb2[j], a2, b2, c2);
float dx = cov1[i].x - obb2[j].x;
float dy = cov1[i].y - obb2[j].y;
float sA = cov1[i].a + a2, sB = cov1[i].b + b2, sC = cov1[i].c + c2;
float denom = sA * sB - sC * sC + eps;
if (denom <= eps) continue;
float t1 = ((sA*dy*dy + sB*dx*dx) * 0.25f) / denom;
float t2 = ((sC*dx*dy) * -0.5f) / denom;
float d1 = cov1[i].a*cov1[i].b - cov1[i].c*cov1[i].c;
float d2 = a2*b2 - c2*c2;
float sqrtDet = std::sqrt(std::max(d1, 0.f) * std::max(d2, 0.f) + eps);
float t3 = 0.5f * std::log((sA*sB - sC*sC) / (4.f*sqrtDet) + eps);
float bd = std::clamp(t1 + t2 + t3, eps, 100.f);
float hd = std::sqrt(1.f - std::exp(-bd) + eps);
iouMat[i][j] = 1.f - hd;
}
}
return iouMat;
}
std::vector<int> ANSRTYOLO::nmsRotatedImpl(
const std::vector<OrientedBox>& sortedBoxes, float iouThreshold) {
if (sortedBoxes.empty()) return {};
if (sortedBoxes.size() == 1) return { 0 };
auto iouMat = batchProbiou(sortedBoxes, sortedBoxes);
if (iouMat.empty()) return {};
const int n = static_cast<int>(sortedBoxes.size());
std::vector<int> keep;
keep.reserve(n / 2);
for (int j = 0; j < n; ++j) {
bool shouldKeep = true;
for (int i = 0; i < j; ++i) {
if (iouMat[i][j] >= iouThreshold) { shouldKeep = false; break; }
}
if (shouldKeep) keep.push_back(j);
}
return keep;
}
std::vector<int> ANSRTYOLO::nmsRotated(
const std::vector<OrientedBox>& boxes,
const std::vector<float>& scores, float iouThreshold) {
if (boxes.empty() || scores.empty() || boxes.size() != scores.size()) return {};
std::vector<int> sortedIdx(boxes.size());
std::iota(sortedIdx.begin(), sortedIdx.end(), 0);
std::sort(sortedIdx.begin(), sortedIdx.end(),
[&](int a, int b) { return scores[a] > scores[b]; });
std::vector<OrientedBox> sortedBoxes;
sortedBoxes.reserve(boxes.size());
for (int i : sortedIdx) sortedBoxes.push_back(boxes[i]);
auto keepSorted = nmsRotatedImpl(sortedBoxes, iouThreshold);
std::vector<int> keepOrig;
keepOrig.reserve(keepSorted.size());
for (int si : keepSorted) keepOrig.push_back(sortedIdx[si]);
return keepOrig;
}
std::vector<cv::Point2f> ANSRTYOLO::OBBToPoints(const OrientedBox& obb) {
float angleDeg = obb.angle * 180.0f / static_cast<float>(CV_PI);
cv::RotatedRect rr(cv::Point2f(obb.x, obb.y),
cv::Size2f(obb.width, obb.height), angleDeg);
std::vector<cv::Point2f> corners(4);
rr.points(corners.data());
return corners;
}
// ====================================================================
// Detection — legacy postprocess
// ====================================================================
    std::vector<Object> ANSRTYOLO::PostprocessDetection(
        std::vector<float>& featureVector,
        const std::string& camera_id, const ImageMetadata& meta) {
        // Legacy (non-end2end) YOLO detection decoder.
        // featureVector holds a [numChannels x numAnchors] tensor; each anchor
        // column is [cx, cy, w, h, per-class scores...]. Candidates above the
        // score threshold are converted to image-space corner boxes (scaled by
        // the letterbox ratio, clamped to the image), then class-batched NMS
        // picks the final detections. Returns one Object per kept box.
        try {
            const auto& outputDims = m_trtEngine->getOutputDims();
            auto numChannels = outputDims[0].d[1];
            auto numAnchors = outputDims[0].d[2];
            // Derive numClasses from tensor shape (4 box coords subtracted)
            // rather than _classes.size() which may not match the model
            auto numClasses = static_cast<size_t>(numChannels - 4);
            if (!_classes.empty() && _classes.size() <= static_cast<size_t>(numChannels - 4))
                numClasses = _classes.size();
            std::vector<cv::Rect> bboxes;
            std::vector<float> scores;
            std::vector<int> labels;
            std::vector<int> indices;
            // Wrap the raw buffer without copying, then transpose so each row
            // is one anchor (row-major iteration below).
            cv::Mat output = cv::Mat(numChannels, numAnchors, CV_32F, featureVector.data());
            output = output.t();
            for (int i = 0; i < numAnchors; i++) {
                auto rowPtr = output.row(i).ptr<float>();
                auto bboxesPtr = rowPtr;       // [cx, cy, w, h]
                auto scoresPtr = rowPtr + 4;   // per-class scores
                auto maxSPtr = std::max_element(scoresPtr, scoresPtr + numClasses);
                float score = *maxSPtr;
                if (score > _modelConfig.detectionScoreThreshold) {
                    // Center/size -> corners, mapped back to the original
                    // image via the letterbox ratio and clamped in-bounds.
                    float x = *bboxesPtr++;
                    float y = *bboxesPtr++;
                    float w = *bboxesPtr++;
                    float h = *bboxesPtr;
                    float x0 = std::clamp((x - 0.5f * w) * meta.ratio, 0.f, meta.imgWidth);
                    float y0 = std::clamp((y - 0.5f * h) * meta.ratio, 0.f, meta.imgHeight);
                    float x1 = std::clamp((x + 0.5f * w) * meta.ratio, 0.f, meta.imgWidth);
                    float y1 = std::clamp((y + 0.5f * h) * meta.ratio, 0.f, meta.imgHeight);
                    // Argmax offset within the scores span == class index.
                    int label = static_cast<int>(maxSPtr - scoresPtr);
                    cv::Rect_<float> bbox;
                    bbox.x = x0; bbox.y = y0;
                    bbox.width = x1 - x0; bbox.height = y1 - y0;
                    bbox.x = std::max(0.f, bbox.x);
                    bbox.y = std::max(0.f, bbox.y);
                    bbox.width = std::min(meta.imgWidth - bbox.x, bbox.width);
                    bbox.height = std::min(meta.imgHeight - bbox.y, bbox.height);
                    bboxes.push_back(bbox);
                    labels.push_back(label);
                    scores.push_back(score);
                }
            }
            // Per-class NMS: boxes of different classes never suppress each other.
            cv::dnn::NMSBoxesBatched(bboxes, scores, labels, PROBABILITY_THRESHOLD, NMS_THRESHOLD, indices);
            std::vector<Object> objects;
            int classNameSize = static_cast<int>(_classes.size());
            for (auto& chosenIdx : indices) {
                if (scores[chosenIdx] > _modelConfig.detectionScoreThreshold) {
                    Object obj{};
                    obj.confidence = scores[chosenIdx];
                    obj.classId = labels[chosenIdx];
                    obj.box = bboxes[chosenIdx];
                    //obj.polygon = ANSUtilityHelper::RectToNormalizedPolygon(obj.box, meta.imgWidth, meta.imgHeight);
                    // Out-of-range class ids fall back to the last known name.
                    if (!_classes.empty()) {
                        obj.className = (obj.classId < classNameSize)
                            ? _classes[obj.classId] : _classes[classNameSize - 1];
                    }
                    else { obj.className = "Unknown"; }
                    obj.cameraId = camera_id;
                    objects.push_back(obj);
                }
            }
            return objects;
        }
        catch (std::exception& e) {
            _logger.LogFatal("ANSRTYOLO::PostprocessDetection", e.what(), __FILE__, __LINE__);
            return {};
        }
    }
// ====================================================================
// Detection — end2end postprocess
// ====================================================================
std::vector<Object> ANSRTYOLO::PostprocessDetectionE2E(
std::vector<float>& featureVector,
const std::string& camera_id, const ImageMetadata& meta) {
try {
const auto& outputDims = m_trtEngine->getOutputDims();
int numDets = outputDims[0].d[1];
int numFeat = outputDims[0].d[2]; // 6: x1,y1,x2,y2,conf,classId
std::vector<Object> results;
results.reserve(numDets);
for (int i = 0; i < numDets; ++i) {
const float* det = featureVector.data() + i * numFeat;
float conf = det[4];
if (conf <= _modelConfig.detectionScoreThreshold) continue;
int classId = static_cast<int>(det[5]);
// Scale from model input space to original image
float x0 = std::clamp(det[0] * meta.ratio, 0.f, meta.imgWidth);
float y0 = std::clamp(det[1] * meta.ratio, 0.f, meta.imgHeight);
float x1 = std::clamp(det[2] * meta.ratio, 0.f, meta.imgWidth);
float y1 = std::clamp(det[3] * meta.ratio, 0.f, meta.imgHeight);
float w = x1 - x0, h = y1 - y0;
if (w < 1.f || h < 1.f) continue;
Object obj;
obj.classId = classId;
obj.confidence = conf;
obj.box = cv::Rect(static_cast<int>(x0), static_cast<int>(y0),
static_cast<int>(w), static_cast<int>(h));
//obj.polygon = ANSUtilityHelper::RectToNormalizedPolygon(obj.box, meta.imgWidth, meta.imgHeight);
int classNameSize = static_cast<int>(_classes.size());
if (!_classes.empty() && classId >= 0 && classId < classNameSize)
obj.className = _classes[classId];
obj.cameraId = camera_id;
results.push_back(std::move(obj));
}
return results;
}
catch (std::exception& e) {
_logger.LogFatal("ANSRTYOLO::PostprocessDetectionE2E", e.what(), __FILE__, __LINE__);
return {};
}
}
// ====================================================================
// OBB — legacy postprocess
// ====================================================================
    std::vector<Object> ANSRTYOLO::PostprocessOBB(
        std::vector<float>& featureVector,
        const std::string& camera_id, const ImageMetadata& meta) {
        // Legacy (non-end2end) oriented-bounding-box decoder.
        // featureVector is a [numChannels x numAnchors] tensor; each anchor
        // column is [cx, cy, w, h, per-class scores..., angle(rad)].
        // Candidates over the score threshold are rescaled to image space,
        // deduplicated with Prob-IoU rotated NMS, and capped at TOP_K.
        // Each result carries kps = {cx, cy, w, h, angle}, an axis-aligned
        // bounding box, and a closed, normalized 4-corner polygon.
        try {
            const auto& outputDims = m_trtEngine->getOutputDims();
            int numChannels = outputDims[0].d[1];
            int numAnchors = outputDims[0].d[2];
            int numClasses = numChannels - 5; // 4 box + nc scores + 1 angle
            if (numClasses <= 0) return {};
            // Transpose so each row is one anchor.
            cv::Mat output = cv::Mat(numChannels, numAnchors, CV_32F, featureVector.data()).t();
            struct OBBCandidate {
                OrientedBox box;
                float conf;
                int classId;
            };
            std::vector<OBBCandidate> candidates;
            candidates.reserve(numAnchors);
            for (int i = 0; i < numAnchors; ++i) {
                const float* row = output.ptr<float>(i);
                const float* scoresPtr = row + 4;
                // Manual argmax over the class scores.
                float maxScore = -FLT_MAX;
                int bestClass = -1;
                for (int c = 0; c < numClasses; ++c) {
                    if (scoresPtr[c] > maxScore) { maxScore = scoresPtr[c]; bestClass = c; }
                }
                if (maxScore <= _modelConfig.detectionScoreThreshold) continue;
                // Angle sits after the class scores in the channel layout.
                float angle = row[4 + numClasses];
                float cx = row[0] * meta.ratio;
                float cy = row[1] * meta.ratio;
                float bw = row[2] * meta.ratio;
                float bh = row[3] * meta.ratio;
                // NOTE(review): only the center is clamped to the image;
                // width/height/angle are kept as-is — confirm intended.
                cx = std::clamp(cx, 0.f, meta.imgWidth);
                cy = std::clamp(cy, 0.f, meta.imgHeight);
                candidates.push_back({ { cx, cy, bw, bh, angle }, maxScore, bestClass });
            }
            if (candidates.empty()) return {};
            // Prob-IoU NMS
            std::vector<OrientedBox> boxes;
            std::vector<float> scores;
            boxes.reserve(candidates.size());
            scores.reserve(candidates.size());
            for (const auto& c : candidates) { boxes.push_back(c.box); scores.push_back(c.conf); }
            auto keepIdx = nmsRotated(boxes, scores, NMS_THRESHOLD);
            std::vector<Object> results;
            int classNameSize = static_cast<int>(_classes.size());
            results.reserve(std::min(static_cast<int>(keepIdx.size()), TOP_K));
            for (int idx : keepIdx) {
                if (static_cast<int>(results.size()) >= TOP_K) break; // cap output count
                const auto& c = candidates[idx];
                Object obj;
                obj.classId = c.classId;
                obj.confidence = c.conf;
                // kps doubles as OBB storage: {cx, cy, w, h, angle}.
                obj.kps = { c.box.x, c.box.y, c.box.width, c.box.height, c.box.angle };
                auto absCorners = OBBToPoints(c.box);
                obj.box = cv::boundingRect(absCorners);
                // Normalize OBB corners to [0,1] and close the polygon
                obj.polygon.reserve(absCorners.size() + 1);
                for (const auto& pt : absCorners) {
                    obj.polygon.emplace_back(
                        std::clamp(pt.x / meta.imgWidth, 0.f, 1.f),
                        std::clamp(pt.y / meta.imgHeight, 0.f, 1.f));
                }
                if (!obj.polygon.empty()) obj.polygon.push_back(obj.polygon.front()); // close
                if (!_classes.empty() && c.classId >= 0 && c.classId < classNameSize)
                    obj.className = _classes[c.classId];
                obj.cameraId = camera_id;
                results.push_back(std::move(obj));
            }
            return results;
        }
        catch (std::exception& e) {
            _logger.LogFatal("ANSRTYOLO::PostprocessOBB", e.what(), __FILE__, __LINE__);
            return {};
        }
    }
// ====================================================================
// OBB — end2end postprocess
// ====================================================================
std::vector<Object> ANSRTYOLO::PostprocessOBBE2E(
std::vector<float>& featureVector,
const std::string& camera_id, const ImageMetadata& meta) {
try {
const auto& outputDims = m_trtEngine->getOutputDims();
int numDets = outputDims[0].d[1];
int numFeat = outputDims[0].d[2]; // 7: cx,cy,w,h,angle,conf,classId
std::vector<Object> results;
results.reserve(numDets);
for (int i = 0; i < numDets; ++i) {
const float* det = featureVector.data() + i * numFeat;
float angle = det[4];
float conf = det[5];
if (conf <= _modelConfig.detectionScoreThreshold) continue;
float cx = det[0] * meta.ratio;
float cy = det[1] * meta.ratio;
float bw = det[2] * meta.ratio;
float bh = det[3] * meta.ratio;
int classId = static_cast<int>(det[6]);
cx = std::clamp(cx, 0.f, meta.imgWidth);
cy = std::clamp(cy, 0.f, meta.imgHeight);
OrientedBox obb{ cx, cy, bw, bh, angle };
Object obj;
obj.classId = classId;
obj.confidence = conf;
obj.kps = { cx, cy, bw, bh, angle };
auto absCorners = OBBToPoints(obb);
obj.box = cv::boundingRect(absCorners);
// Normalize OBB corners to [0,1] and close the polygon
obj.polygon.reserve(absCorners.size() + 1);
for (const auto& pt : absCorners) {
obj.polygon.emplace_back(
std::clamp(pt.x / meta.imgWidth, 0.f, 1.f),
std::clamp(pt.y / meta.imgHeight, 0.f, 1.f));
}
if (!obj.polygon.empty()) obj.polygon.push_back(obj.polygon.front()); // close
int classNameSize = static_cast<int>(_classes.size());
if (!_classes.empty() && classId >= 0 && classId < classNameSize)
obj.className = _classes[classId];
obj.cameraId = camera_id;
results.push_back(std::move(obj));
}
return results;
}
catch (std::exception& e) {
_logger.LogFatal("ANSRTYOLO::PostprocessOBBE2E", e.what(), __FILE__, __LINE__);
return {};
}
}
// ====================================================================
// Segmentation — legacy postprocess
// ====================================================================
    std::vector<Object> ANSRTYOLO::PostprocessSegmentation(
        std::vector<std::vector<float>>& featureVectors,
        const std::string& camera_id, const ImageMetadata& meta) {
        // Legacy (non-end2end) instance-segmentation decoder.
        // featureVectors[0]: [numChannels x numAnchors] detection tensor where
        // each anchor column is [cx, cy, w, h, class scores..., SEG_CHANNELS
        // mask coefficients]. featureVectors[1]: mask prototypes
        // [SEG_CHANNELS x SEG_H*SEG_W]. After per-class NMS, each kept box's
        // mask is (coeffs x protos) -> sigmoid -> crop in proto space ->
        // resize to the box -> threshold, then traced into a polygon.
        try {
            const auto& outputDims = m_trtEngine->getOutputDims();
            int numChannels = outputDims[0].d[1];
            int numAnchors = outputDims[0].d[2];
            const auto numClasses = numChannels - SEG_CHANNELS - 4;
            // Sanity-check buffer sizes against the declared tensor shapes.
            if (featureVectors[0].size() != static_cast<size_t>(numChannels) * numAnchors) return {};
            if (featureVectors[1].size() != static_cast<size_t>(SEG_CHANNELS) * SEG_H * SEG_W) return {};
            // Transpose so each row is one anchor; protos wrap buffer 1 in place.
            cv::Mat output = cv::Mat(numChannels, numAnchors, CV_32F, featureVectors[0].data()).t();
            cv::Mat protos = cv::Mat(SEG_CHANNELS, SEG_H * SEG_W, CV_32F, featureVectors[1].data());
            std::vector<int> labels;
            std::vector<float> scores;
            std::vector<cv::Rect> bboxes;
            std::vector<cv::Mat> maskConfs;
            std::vector<int> indices;
            for (int i = 0; i < numAnchors; i++) {
                auto rowPtr = output.row(i).ptr<float>();
                auto bboxesPtr = rowPtr;                     // [cx, cy, w, h]
                auto scoresPtr = rowPtr + 4;                 // class scores
                auto maskConfsPtr = rowPtr + 4 + numClasses; // mask coefficients
                auto maxSPtr = std::max_element(scoresPtr, scoresPtr + numClasses);
                float score = *maxSPtr;
                if (score > _modelConfig.detectionScoreThreshold) {
                    // Center/size -> corners, scaled by the letterbox ratio
                    // and clamped to the original image bounds.
                    float x = *bboxesPtr++;
                    float y = *bboxesPtr++;
                    float w = *bboxesPtr++;
                    float h = *bboxesPtr;
                    float x0 = std::clamp((x - 0.5f * w) * meta.ratio, 0.f, meta.imgWidth);
                    float y0 = std::clamp((y - 0.5f * h) * meta.ratio, 0.f, meta.imgHeight);
                    float x1 = std::clamp((x + 0.5f * w) * meta.ratio, 0.f, meta.imgWidth);
                    float y1 = std::clamp((y + 0.5f * h) * meta.ratio, 0.f, meta.imgHeight);
                    int label = static_cast<int>(maxSPtr - scoresPtr);
                    cv::Rect_<float> bbox;
                    bbox.x = x0; bbox.y = y0;
                    bbox.width = x1 - x0; bbox.height = y1 - y0;
                    bbox.x = std::max(0.f, bbox.x);
                    bbox.y = std::max(0.f, bbox.y);
                    bbox.width = std::min(meta.imgWidth - bbox.x, bbox.width);
                    bbox.height = std::min(meta.imgHeight - bbox.y, bbox.height);
                    // Header over the row's coefficients (no copy; copied
                    // later into `masks` via push_back).
                    cv::Mat maskConf = cv::Mat(1, SEG_CHANNELS, CV_32F, maskConfsPtr);
                    bboxes.push_back(bbox);
                    labels.push_back(label);
                    scores.push_back(score);
                    maskConfs.push_back(maskConf);
                }
            }
            // Per-class NMS on the axis-aligned boxes.
            cv::dnn::NMSBoxesBatched(bboxes, scores, labels, PROBABILITY_THRESHOLD, NMS_THRESHOLD, indices);
            cv::Mat masks;
            int classNameSize = static_cast<int>(_classes.size());
            std::vector<Object> objs;
            for (auto& i : indices) {
                if (scores[i] > _modelConfig.detectionScoreThreshold) {
                    Object obj;
                    obj.classId = labels[i];
                    // Out-of-range class ids fall back to the last known name.
                    if (!_classes.empty()) {
                        obj.className = (obj.classId < classNameSize)
                            ? _classes[obj.classId] : _classes[classNameSize - 1];
                    }
                    else { obj.className = "Unknown"; }
                    obj.box = bboxes[i];
                    obj.confidence = scores[i];
                    obj.cameraId = camera_id;
                    masks.push_back(maskConfs[i]);
                    objs.push_back(obj);
                }
            }
            if (!masks.empty()) {
                // [kept x SEG_CHANNELS] x [SEG_CHANNELS x SEG_H*SEG_W]
                // -> one flattened mask per kept object (transposed).
                cv::Mat matmulRes = (masks * protos).t();
                // Apply sigmoid while still a single-channel 2D matrix
                cv::Mat negMat;
                cv::exp(-matmulRes, negMat);
                cv::Mat sigmoidFlat = 1.0 / (1.0 + negMat);
                // Now reshape into multi-channel and split
                cv::Mat sigmoidMat = sigmoidFlat.reshape(static_cast<int>(indices.size()),
                    { SEG_H, SEG_W });
                std::vector<cv::Mat> maskChannels;
                cv::split(sigmoidMat, maskChannels);
                // ROI in proto space (SEG_H x SEG_W), accounting for top-left letterbox padding
                // ANSRTYOLO pads right-bottom, so content starts at (0,0) in proto space
                cv::Rect roi;
                if (meta.imgHeight > meta.imgWidth) {
                    int roiW = std::min(static_cast<int>(std::round(
                        static_cast<float>(SEG_W) * meta.imgWidth / meta.imgHeight)), SEG_W);
                    roi = cv::Rect(0, 0, roiW, SEG_H);
                }
                else {
                    int roiH = std::min(static_cast<int>(std::round(
                        static_cast<float>(SEG_H) * meta.imgHeight / meta.imgWidth)), SEG_H);
                    roi = cv::Rect(0, 0, SEG_W, roiH);
                }
                roi &= cv::Rect(0, 0, SEG_W, SEG_H);
                int imgW = static_cast<int>(meta.imgWidth);
                int imgH = static_cast<int>(meta.imgHeight);
                // Precompute scale factors from proto-ROI to original image
                const float scaleX = static_cast<float>(imgW) / roi.width;
                const float scaleY = static_cast<float>(imgH) / roi.height;
                for (size_t i = 0; i < objs.size(); i++) {
                    cv::Rect safeBox = objs[i].box & cv::Rect(0, 0, imgW, imgH);
                    if (safeBox.area() <= 0) continue;
                    // Map bounding box back to proto-ROI space and crop there
                    int px0 = std::max(static_cast<int>(std::floor(safeBox.x / scaleX)), 0);
                    int py0 = std::max(static_cast<int>(std::floor(safeBox.y / scaleY)), 0);
                    int px1 = std::min(static_cast<int>(std::ceil((safeBox.x + safeBox.width) / scaleX)), roi.width);
                    int py1 = std::min(static_cast<int>(std::ceil((safeBox.y + safeBox.height) / scaleY)), roi.height);
                    if (px1 <= px0 || py1 <= py0) continue;
                    cv::Rect protoBox(roi.x + px0, roi.y + py0, px1 - px0, py1 - py0);
                    protoBox &= cv::Rect(0, 0, SEG_W, SEG_H);
                    if (protoBox.area() <= 0) continue;
                    // Resize only the small proto crop to the bounding box size
                    cv::Mat cropped = maskChannels[i](protoBox);
                    cv::Mat resized;
                    cv::resize(cropped, resized, cv::Size(safeBox.width, safeBox.height),
                        0, 0, cv::INTER_LINEAR);
                    // Binarize the per-object mask, then trace it into a
                    // normalized polygon.
                    objs[i].mask = resized > _modelConfig.modelConfThreshold;
                    objs[i].polygon = ANSUtilityHelper::MaskToNormalizedPolygon(
                        objs[i].mask, safeBox, meta.imgWidth, meta.imgHeight);
                }
            }
            // Fill polygon for objects that got masks
            for (auto& obj : objs) {
                if (obj.polygon.empty())
                    obj.polygon = ANSUtilityHelper::RectToNormalizedPolygon(obj.box, meta.imgWidth, meta.imgHeight);
            }
            return objs;
        }
        catch (std::exception& e) {
            _logger.LogFatal("ANSRTYOLO::PostprocessSegmentation", e.what(), __FILE__, __LINE__);
            return {};
        }
    }
// ====================================================================
// Segmentation — end2end postprocess
// ====================================================================
    std::vector<Object> ANSRTYOLO::PostprocessSegE2E(
        std::vector<std::vector<float>>& featureVectors,
        const std::string& camera_id, const ImageMetadata& meta) {
        // End2end instance-segmentation decoder: NMS already ran inside the
        // engine. featureVectors[0]: [numDets x numFeat] rows laid out as
        // [x1, y1, x2, y2, conf, classId, nm mask coefficients].
        // featureVectors[1]: mask prototypes [nm x protoH*protoW].
        // Each kept box's mask is (coeffs x protos) -> sigmoid -> crop in
        // proto space -> resize to the box -> threshold -> polygon.
        try {
            if (featureVectors.size() < 2) return {};
            const auto& outputDims = m_trtEngine->getOutputDims();
            int numDets = outputDims[0].d[1];
            int numFeat = outputDims[0].d[2]; // 6 + nm
            // Proto dimensions from second output
            int nm = outputDims[1].d[1];
            int protoH = outputDims[1].d[2];
            int protoW = (outputDims[1].nbDims > 3) ? outputDims[1].d[3] : outputDims[1].d[2];
            if (numFeat < 6 + nm) return {};
            const float* raw = featureVectors[0].data();
            std::vector<Object> objs;
            cv::Mat maskCoeffs; // one row of nm coefficients per kept object
            for (int i = 0; i < numDets; ++i) {
                const float* det = raw + i * numFeat;
                float conf = det[4];
                if (conf <= _modelConfig.detectionScoreThreshold) continue;
                int classId = static_cast<int>(det[5]);
                // Corner box scaled by the letterbox ratio, clamped in-bounds.
                float x0 = std::clamp(det[0] * meta.ratio, 0.f, meta.imgWidth);
                float y0 = std::clamp(det[1] * meta.ratio, 0.f, meta.imgHeight);
                float x1 = std::clamp(det[2] * meta.ratio, 0.f, meta.imgWidth);
                float y1 = std::clamp(det[3] * meta.ratio, 0.f, meta.imgHeight);
                float w = x1 - x0, h = y1 - y0;
                if (w < 1.f || h < 1.f) continue; // drop degenerate boxes
                Object obj;
                obj.classId = classId;
                obj.confidence = conf;
                obj.box = cv::Rect(static_cast<int>(x0), static_cast<int>(y0),
                    static_cast<int>(w), static_cast<int>(h));
                int classNameSize = static_cast<int>(_classes.size());
                if (!_classes.empty() && classId >= 0 && classId < classNameSize)
                    obj.className = _classes[classId];
                obj.cameraId = camera_id;
                objs.push_back(std::move(obj));
                cv::Mat mc(1, nm, CV_32F);
                std::memcpy(mc.ptr<float>(), det + 6, nm * sizeof(float));
                maskCoeffs.push_back(mc);
            }
            if (!objs.empty() && !maskCoeffs.empty()) {
                // [kept x nm] x [nm x protoH*protoW] -> one flattened mask per
                // kept object (transposed).
                cv::Mat protos(nm, protoH * protoW, CV_32F, featureVectors[1].data());
                cv::Mat matmulRes = (maskCoeffs * protos).t();
                // Apply sigmoid while still a single-channel 2D matrix
                cv::Mat negMat;
                cv::exp(-matmulRes, negMat);
                cv::Mat sigmoidFlat = 1.0 / (1.0 + negMat);
                // Now reshape into multi-channel and split
                cv::Mat sigmoidMat = sigmoidFlat.reshape(static_cast<int>(objs.size()),
                    { protoH, protoW });
                std::vector<cv::Mat> maskChannels;
                cv::split(sigmoidMat, maskChannels);
                // ROI in proto space, accounting for top-left letterbox padding
                // ANSRTYOLO pads right-bottom, so content starts at (0,0) in proto space
                cv::Rect roi;
                if (meta.imgHeight > meta.imgWidth) {
                    int roiW = std::min(static_cast<int>(std::round(
                        static_cast<float>(protoW) * meta.imgWidth / meta.imgHeight)), protoW);
                    roi = cv::Rect(0, 0, roiW, protoH);
                }
                else {
                    int roiH = std::min(static_cast<int>(std::round(
                        static_cast<float>(protoH) * meta.imgHeight / meta.imgWidth)), protoH);
                    roi = cv::Rect(0, 0, protoW, roiH);
                }
                roi &= cv::Rect(0, 0, protoW, protoH);
                int imgW = static_cast<int>(meta.imgWidth);
                int imgH = static_cast<int>(meta.imgHeight);
                // Scale factors from proto-ROI space to the original image.
                const float scaleX = static_cast<float>(imgW) / roi.width;
                const float scaleY = static_cast<float>(imgH) / roi.height;
                for (size_t i = 0; i < objs.size(); ++i) {
                    cv::Rect safebox = objs[i].box & cv::Rect(0, 0, imgW, imgH);
                    if (safebox.area() <= 0) continue;
                    // Map the box back to proto-ROI space and crop there so
                    // only the small crop gets resized.
                    int px0 = std::max(static_cast<int>(std::floor(safebox.x / scaleX)), 0);
                    int py0 = std::max(static_cast<int>(std::floor(safebox.y / scaleY)), 0);
                    int px1 = std::min(static_cast<int>(std::ceil((safebox.x + safebox.width) / scaleX)), roi.width);
                    int py1 = std::min(static_cast<int>(std::ceil((safebox.y + safebox.height) / scaleY)), roi.height);
                    if (px1 <= px0 || py1 <= py0) continue;
                    cv::Rect protoBox(roi.x + px0, roi.y + py0, px1 - px0, py1 - py0);
                    protoBox &= cv::Rect(0, 0, protoW, protoH);
                    if (protoBox.area() <= 0) continue;
                    cv::Mat cropped = maskChannels[i](protoBox);
                    cv::Mat resized;
                    cv::resize(cropped, resized, cv::Size(safebox.width, safebox.height),
                        0, 0, cv::INTER_LINEAR);
                    // NOTE(review): the legacy path thresholds masks with
                    // _modelConfig.modelConfThreshold; this path uses the fixed
                    // SEGMENTATION_THRESHOLD — confirm the divergence is intended.
                    objs[i].mask = resized > SEGMENTATION_THRESHOLD;
                    objs[i].polygon = ANSUtilityHelper::MaskToNormalizedPolygon(
                        objs[i].mask, safebox, meta.imgWidth, meta.imgHeight);
                }
            }
            // Objects without a traced mask polygon fall back to the box shape.
            for (auto& obj : objs) {
                if (obj.polygon.empty())
                    obj.polygon = ANSUtilityHelper::RectToNormalizedPolygon(obj.box, meta.imgWidth, meta.imgHeight);
            }
            return objs;
        }
        catch (std::exception& e) {
            _logger.LogFatal("ANSRTYOLO::PostprocessSegE2E", e.what(), __FILE__, __LINE__);
            return {};
        }
    }
// ====================================================================
// Pose — legacy postprocess
// ====================================================================
    std::vector<Object> ANSRTYOLO::PostprocessPose(
        std::vector<float>& featureVector,
        const std::string& camera_id, const ImageMetadata& meta) {
        // Legacy (non-end2end) pose decoder for a single-class pose model.
        // featureVector is a [numChannels x numAnchors] tensor; each anchor
        // column is [cx, cy, w, h, conf, NUM_KPS x (kx, ky, kscore)].
        // Candidates over the score threshold become image-space boxes with
        // rescaled keypoints; class-batched NMS (all label 0) deduplicates.
        try {
            const auto& outputDims = m_trtEngine->getOutputDims();
            auto numChannels = outputDims[0].d[1];
            auto numAnchors = outputDims[0].d[2];
            std::vector<cv::Rect> bboxes;
            std::vector<float> scores;
            std::vector<int> labels;
            std::vector<int> indices;
            std::vector<std::vector<float>> kpss;
            // Transpose so each row is one anchor.
            cv::Mat output = cv::Mat(numChannels, numAnchors, CV_32F, featureVector.data()).t();
            for (int i = 0; i < numAnchors; i++) {
                auto rowPtr = output.row(i).ptr<float>();
                auto bboxesPtr = rowPtr;      // [cx, cy, w, h]
                auto scoresPtr = rowPtr + 4;  // single objectness/person score
                auto kps_ptr = rowPtr + 5;    // NUM_KPS triplets follow
                float score = *scoresPtr;
                if (score > _modelConfig.detectionScoreThreshold) {
                    // Center/size -> corners, scaled by the letterbox ratio
                    // and clamped to the original image bounds.
                    float x = *bboxesPtr++;
                    float y = *bboxesPtr++;
                    float w = *bboxesPtr++;
                    float h = *bboxesPtr;
                    float x0 = std::clamp((x - 0.5f * w) * meta.ratio, 0.f, meta.imgWidth);
                    float y0 = std::clamp((y - 0.5f * h) * meta.ratio, 0.f, meta.imgHeight);
                    float x1 = std::clamp((x + 0.5f * w) * meta.ratio, 0.f, meta.imgWidth);
                    float y1 = std::clamp((y + 0.5f * h) * meta.ratio, 0.f, meta.imgHeight);
                    cv::Rect_<float> bbox;
                    bbox.x = x0; bbox.y = y0;
                    bbox.width = x1 - x0; bbox.height = y1 - y0;
                    bbox.x = std::max(0.f, bbox.x);
                    bbox.y = std::max(0.f, bbox.y);
                    bbox.width = std::min(meta.imgWidth - bbox.x, bbox.width);
                    bbox.height = std::min(meta.imgHeight - bbox.y, bbox.height);
                    // Keypoints: coords rescaled/clamped; the per-keypoint
                    // visibility score is passed through unchanged.
                    std::vector<float> kps;
                    for (int k = 0; k < NUM_KPS; k++) {
                        float kpsX = std::clamp(*(kps_ptr + 3 * k) * meta.ratio, 0.f, meta.imgWidth);
                        float kpsY = std::clamp(*(kps_ptr + 3 * k + 1) * meta.ratio, 0.f, meta.imgHeight);
                        float kpsS = *(kps_ptr + 3 * k + 2);
                        kps.push_back(kpsX);
                        kps.push_back(kpsY);
                        kps.push_back(kpsS);
                    }
                    bboxes.push_back(bbox);
                    labels.push_back(0); // single-class model
                    scores.push_back(score);
                    kpss.push_back(kps);
                }
            }
            cv::dnn::NMSBoxesBatched(bboxes, scores, labels, PROBABILITY_THRESHOLD, NMS_THRESHOLD, indices);
            std::vector<Object> objects;
            int classNameSize = static_cast<int>(_classes.size());
            for (auto& chosenIdx : indices) {
                if (scores[chosenIdx] > _modelConfig.detectionScoreThreshold) {
                    Object obj{};
                    obj.confidence = scores[chosenIdx];
                    obj.classId = labels[chosenIdx];
                    // Out-of-range class ids fall back to the last known name.
                    if (!_classes.empty()) {
                        obj.className = (obj.classId < classNameSize)
                            ? _classes[obj.classId] : _classes[classNameSize - 1];
                    }
                    else { obj.className = "Unknown"; }
                    obj.box = bboxes[chosenIdx];
                    //obj.polygon = ANSUtilityHelper::RectToNormalizedPolygon(obj.box, meta.imgWidth, meta.imgHeight);
                    obj.kps = kpss[chosenIdx];
                    obj.cameraId = camera_id;
                    objects.push_back(obj);
                }
            }
            return objects;
        }
        catch (std::exception& e) {
            _logger.LogFatal("ANSRTYOLO::PostprocessPose", e.what(), __FILE__, __LINE__);
            return {};
        }
    }
// ====================================================================
// Pose — end2end postprocess
// ====================================================================
std::vector<Object> ANSRTYOLO::PostprocessPoseE2E(
std::vector<float>& featureVector,
const std::string& camera_id, const ImageMetadata& meta) {
try {
const auto& outputDims = m_trtEngine->getOutputDims();
int numDets = outputDims[0].d[1];
int numFeat = outputDims[0].d[2]; // 6 + nk*3
int nk = (numFeat - 6) / 3;
std::vector<Object> results;
results.reserve(numDets);
for (int i = 0; i < numDets; ++i) {
const float* det = featureVector.data() + i * numFeat;
float conf = det[4];
if (conf <= _modelConfig.detectionScoreThreshold) continue;
int classId = static_cast<int>(det[5]);
float x0 = std::clamp(det[0] * meta.ratio, 0.f, meta.imgWidth);
float y0 = std::clamp(det[1] * meta.ratio, 0.f, meta.imgHeight);
float x1 = std::clamp(det[2] * meta.ratio, 0.f, meta.imgWidth);
float y1 = std::clamp(det[3] * meta.ratio, 0.f, meta.imgHeight);
float w = x1 - x0, h = y1 - y0;
if (w < 1.f || h < 1.f) continue;
const float* kpsPtr = det + 6;
std::vector<float> kps;
kps.reserve(nk * 3);
for (int k = 0; k < nk; ++k) {
float kx = std::clamp(kpsPtr[3*k] * meta.ratio, 0.f, meta.imgWidth);
float ky = std::clamp(kpsPtr[3*k+1] * meta.ratio, 0.f, meta.imgHeight);
float ks = kpsPtr[3*k+2];
kps.push_back(kx);
kps.push_back(ky);
kps.push_back(ks);
}
Object obj;
obj.classId = classId;
obj.confidence = conf;
obj.box = cv::Rect(static_cast<int>(x0), static_cast<int>(y0),
static_cast<int>(w), static_cast<int>(h));
obj.kps = std::move(kps);
int classNameSize = static_cast<int>(_classes.size());
if (!_classes.empty() && classId >= 0 && classId < classNameSize)
obj.className = _classes[classId];
obj.cameraId = camera_id;
results.push_back(std::move(obj));
}
return results;
}
catch (std::exception& e) {
_logger.LogFatal("ANSRTYOLO::PostprocessPoseE2E", e.what(), __FILE__, __LINE__);
return {};
}
}
// ====================================================================
// Classification postprocess
// ====================================================================
// Converts a raw classifier output vector into a single Object carrying
// the top-1 class id, name, and probability. Handles both models that
// already end in a Softmax layer and models that emit raw logits.
std::vector<Object> ANSRTYOLO::PostprocessClassify(
    std::vector<float>& featureVector,
    const std::string& camera_id, const ImageMetadata& meta) {
    try {
        const int numClasses = static_cast<int>(featureVector.size());
        if (numClasses == 0) return {};
        // Some exported models include a Softmax layer, so the output is
        // already a probability distribution (all non-negative, sums to ~1).
        // Applying softmax a second time would flatten that distribution and
        // cause wrong classifications — detect this case and skip softmax.
        const bool nonNegative = std::all_of(
            featureVector.begin(), featureVector.end(),
            [](float v) { return v >= 0.f; });
        const float total = std::accumulate(
            featureVector.begin(), featureVector.end(), 0.f);
        const bool looksNormalized = nonNegative && total > 0.9f && total < 1.1f;
        std::vector<float> probs;
        if (looksNormalized) {
            // Already probabilities — use them verbatim.
            probs = featureVector;
        }
        else {
            // Numerically stable softmax: shift by the max logit before exp.
            probs.resize(numClasses);
            const float peak = *std::max_element(featureVector.begin(), featureVector.end());
            float denom = 0.f;
            for (int i = 0; i < numClasses; ++i) {
                probs[i] = std::exp(featureVector[i] - peak);
                denom += probs[i];
            }
            for (float& p : probs) p /= denom;
        }
        // Argmax — max_element returns the first maximum, matching a strict
        // '>' scan over the vector.
        const auto bestIt = std::max_element(probs.begin(), probs.end());
        const int bestClass = static_cast<int>(bestIt - probs.begin());
        const float bestProb = *bestIt;
        // Classification has no localization, so report a near-full-frame
        // box (inset 10px per side when the frame is large enough).
        const int imgW = static_cast<int>(meta.imgWidth);
        const int imgH = static_cast<int>(meta.imgHeight);
        Object obj;
        obj.box = (imgW > 20 && imgH > 20)
            ? cv::Rect(10, 10, imgW - 20, imgH - 20)
            : cv::Rect(0, 0, imgW, imgH);
        //obj.polygon = ANSUtilityHelper::RectToNormalizedPolygon(obj.box, meta.imgWidth, meta.imgHeight);
        obj.classId = bestClass;
        obj.confidence = bestProb;
        obj.cameraId = camera_id;
        // Only attach a class name when the index is within the class list.
        if (!_classes.empty() &&
            bestClass >= 0 && bestClass < static_cast<int>(_classes.size()))
            obj.className = _classes[bestClass];
        return { std::move(obj) };
    }
    catch (std::exception& e) {
        _logger.LogFatal("ANSRTYOLO::PostprocessClassify", e.what(), __FILE__, __LINE__);
        return {};
    }
}
// ====================================================================
// DetectObjects — single image with auto-detection of task type
// ====================================================================
/// @brief Full single-image pipeline: bind GPU context, health-check CUDA,
///        preprocess (NV12 fast path or BGR fallback), run TensorRT
///        inference, route the output tensor to the matching postprocessor
///        (segmentation / classification / detection / OBB / pose, each in
///        legacy or end-to-end layout), then apply optional tracking and
///        stabilization.
/// @param inputImage  Input frame (BGR, display resolution).
/// @param camera_id   Camera identifier stamped onto every returned Object
///                    and used as the tracking/stabilization key.
/// @return Detections in inputImage's coordinate space; empty vector on any
///         failure (unhealthy CUDA context, empty preprocess output,
///         inference error, or exception).
std::vector<Object> ANSRTYOLO::DetectObjects(const cv::Mat& inputImage,
    const std::string& camera_id) {
    try {
        // --- Debug timer helper ---
2026-03-28 16:54:11 +11:00
        using Clock = std::chrono::steady_clock;
        const bool dbg = _debugFlag;
        auto t0 = Clock::now(); // Always set — used by ANS_DBG timing output
2026-03-28 16:54:11 +11:00
        auto tPrev = t0;
        // Returns milliseconds since the previous call (or t0 on first use)
        // and advances the marker — per-stage timings when dbg is enabled.
        auto elapsed = [&]() -> double {
            auto now = Clock::now();
            double ms = std::chrono::duration<double, std::milli>(now - tPrev).count();
            tPrev = now;
            return ms;
        };
        // --- 1. Set GPU device context ---
        // Multi-GPU safety: bind this thread to the engine's device before
        // any CUDA work below.
        if (m_trtEngine) {
            m_trtEngine->setDeviceContext();
        }
        double msSetDevice = dbg ? elapsed() : 0;
        // --- 1b. CUDA context health check ---
        if (!m_nv12Helper.isCudaContextHealthy(_logger, "ANSRTYOLO")) {
            return {};
        }
        // --- 2. Preprocess under lock ---
2026-04-04 20:19:54 +11:00
        ANS_DBG("YOLO", "Preprocess START %dx%d", inputImage.cols, inputImage.rows);
2026-03-28 16:54:11 +11:00
        ImageMetadata meta;
        std::vector<std::vector<cv::cuda::GpuMat>> input;
        bool usedNV12 = false;
        // Scale factors used only on the BGR full-res path (stage 4b).
        float bgrFullResScaleX = 1.0f, bgrFullResScaleY = 1.0f;
        {
            std::lock_guard<std::recursive_mutex> lock(_mutex);
            const int inferenceGpu = m_trtEngine ? m_trtEngine->getPreferredDeviceIndex() : 0;
            // NOTE(review): getInputDims() is called unguarded here; this
            // assumes m_trtEngine is non-null once the pipeline runs — only
            // the two calls above are null-checked. Confirm against callers.
            const auto& inputDims = m_trtEngine->getInputDims();
            const int inputW = inputDims[0].d[2];
            const int inputH = inputDims[0].d[1];
            // Fast path: consume an NV12 surface for this frame directly on
            // the inference GPU, avoiding a CPU round-trip.
            auto nv12 = m_nv12Helper.tryNV12(inputImage, inferenceGpu, inputW, inputH,
                NV12PreprocessHelper::defaultYOLOLauncher(),
                _logger, "ANSRTYOLO");
            if (nv12.succeeded) {
                meta.imgWidth = nv12.metaWidth;
                meta.imgHeight = nv12.metaHeight;
                meta.ratio = nv12.ratio;
                input = {{ std::move(nv12.gpuRGB) }};
                usedNV12 = true;
            }
            else if (nv12.useBgrFullRes) {
                // Helper supplied a full-resolution BGR frame instead; its
                // coordinates are remapped back to display-res in stage 4b.
                input = Preprocess(nv12.bgrFullResImg, meta);
                usedNV12 = !input.empty();
                bgrFullResScaleX = nv12.bgrFullResScaleX;
                bgrFullResScaleY = nv12.bgrFullResScaleY;
            }
            // Fallback: plain preprocessing of the caller-provided image.
            if (input.empty()) {
                input = Preprocess(inputImage, meta);
            }
            m_nv12Helper.tickInference();
        }
        double msPreprocess = dbg ? elapsed() : 0;
        if (input.empty()) {
            _logger.LogWarn("ANSRTYOLO::DetectObjects", "Skipped: preprocessing returned empty input", __FILE__, __LINE__);
            return {};
        }
        // --- 3. TRT Inference (mutex released for concurrent GPU slots) ---
2026-04-04 20:19:54 +11:00
        ANS_DBG("YOLO", "TRT inference START nv12=%d inputSize=%dx%d",
            (int)usedNV12,
            input.empty() ? 0 : (input[0].empty() ? 0 : input[0][0].cols),
            input.empty() ? 0 : (input[0].empty() ? 0 : input[0][0].rows));
        auto _trtStart = std::chrono::steady_clock::now();
2026-03-28 16:54:11 +11:00
        std::vector<std::vector<std::vector<float>>> featureVectors;
        if (!m_trtEngine->runInference(input, featureVectors)) {
2026-04-04 20:19:54 +11:00
            ANS_DBG("YOLO", "ERROR: TRT runInference FAILED");
2026-03-28 16:54:11 +11:00
            _logger.LogError("ANSRTYOLO::DetectObjects", "Error running inference", __FILE__, __LINE__);
            return {};
        }
2026-04-04 20:19:54 +11:00
        auto _trtEnd = std::chrono::steady_clock::now();
        double _trtMs = std::chrono::duration<double, std::milli>(_trtEnd - _trtStart).count();
        // Flag abnormally slow inference passes to DebugView.
        if (_trtMs > 500.0) {
            ANS_DBG("YOLO", "SLOW TRT inference: %.1fms", _trtMs);
        }
2026-03-28 16:54:11 +11:00
        double msInference = dbg ? elapsed() : 0;
        // --- 4. Transform output ---
        // Route the raw output tensor(s) to the correct postprocessor by
        // inspecting the output dimensions (see the matching routing logic
        // in DetectObjectsBatch).
        std::vector<Object> results;
        bool isClassification = false;
        {
            std::lock_guard<std::recursive_mutex> lock(_mutex);
            const auto& outputDims = m_trtEngine->getOutputDims();
            const size_t numOutputs = outputDims.size();
            if (numOutputs >= 2) {
                // Two or more output tensors → segmentation model.
                std::vector<std::vector<float>> featureVector2d;
                Engine<float>::transformOutput(featureVectors, featureVector2d);
                double msTransform = dbg ? elapsed() : 0;
                int dim1 = outputDims[0].d[1];
                int dim2 = outputDims[0].d[2];
                // Heuristic: dim1 > dim2 or a small dim2 indicates the
                // end-to-end (NMS-included) output layout.
                if (dim1 > dim2 || dim2 <= 20)
                    results = PostprocessSegE2E(featureVector2d, camera_id, meta);
                else
                    results = PostprocessSegmentation(featureVector2d, camera_id, meta);
                if (dbg) {
                    double msPostprocess = elapsed();
                    _logger.LogInfo("ANSRTYOLO::DetectObjects",
                        "[DEBUG] Seg | " + std::string(usedNV12 ? "NV12" : "BGR") +
                        " | SetDev=" + std::to_string(msSetDevice) +
                        "ms Preproc=" + std::to_string(msPreprocess) +
                        "ms Inf=" + std::to_string(msInference) +
                        "ms Transform=" + std::to_string(msTransform) +
                        "ms Postproc=" + std::to_string(msPostprocess) +
                        "ms Det=" + std::to_string(results.size()),
                        __FILE__, __LINE__);
                }
            }
            else {
                std::vector<float> featureVector;
                Engine<float>::transformOutput(featureVectors, featureVector);
                double msTransform = dbg ? elapsed() : 0;
                // Rank <= 2 output → classification head.
                if (outputDims[0].nbDims <= 2) {
                    results = PostprocessClassify(featureVector, camera_id, meta);
                    isClassification = true;
                }
                else {
                    int dim1 = outputDims[0].d[1];
                    int dim2 = outputDims[0].d[2];
                    int nc = static_cast<int>(_classes.size());
                    // Same E2E-vs-legacy heuristic as above.
                    const bool isEndToEnd = (dim1 > dim2) || (dim2 <= 20);
                    if (isEndToEnd) {
                        // Per-detection width: 6 = detection, 7 = OBB,
                        // 6 + 3k = pose keypoint triplets.
                        if (dim2 == 6)
                            results = PostprocessDetectionE2E(featureVector, camera_id, meta);
                        else if (dim2 == 7)
                            results = PostprocessOBBE2E(featureVector, camera_id, meta);
                        else if (dim2 > 7 && (dim2 - 6) % 3 == 0)
                            results = PostprocessPoseE2E(featureVector, camera_id, meta);
                        else
                            results = PostprocessDetectionE2E(featureVector, camera_id, meta);
                    }
                    else {
                        // Legacy layout: first 4 channels are box coords;
                        // classify the remainder against the class count.
                        int extra = dim1 - 4;
                        bool routed = false;
                        if (nc > 0 && nc <= extra) {
                            if (extra == nc) {
                                results = PostprocessDetection(featureVector, camera_id, meta);
                                routed = true;
                            }
                            else if (extra == nc + 1) {
                                // One extra channel beyond classes → angle → OBB.
                                results = PostprocessOBB(featureVector, camera_id, meta);
                                routed = true;
                            }
                            else if ((extra - nc) % 3 == 0 && (extra - nc) >= 3) {
                                // Trailing (x, y, conf) triplets → pose.
                                results = PostprocessPose(featureVector, camera_id, meta);
                                routed = true;
                            }
                        }
                        if (!routed) {
                            // Fallback probe: if the last channel holds mostly
                            // radian-range values (|v| <= ~pi), treat as OBB.
                            if (extra >= 2) {
                                cv::Mat probe = cv::Mat(dim1, dim2, CV_32F, featureVector.data()).t();
                                int lastCol = dim1 - 1;
                                int numSamples = std::min(dim2, 100);
                                int angleCount = 0;
                                for (int s = 0; s < numSamples; ++s) {
                                    float v = probe.at<float>(s, lastCol);
                                    if (v >= -3.15f && v <= 3.15f) ++angleCount;
                                }
                                if (angleCount > numSamples * 8 / 10) {
                                    results = PostprocessOBB(featureVector, camera_id, meta);
                                    routed = true;
                                }
                            }
                            // dim1 == 56 matches the standard 17-keypoint pose
                            // layout (4 box + 1 conf + 17*3); otherwise detect.
                            if (!routed && dim1 == 56)
                                results = PostprocessPose(featureVector, camera_id, meta);
                            else if (!routed)
                                results = PostprocessDetection(featureVector, camera_id, meta);
                        }
                    }
                }
                if (dbg) {
                    double msPostprocess = elapsed();
                    _logger.LogInfo("ANSRTYOLO::DetectObjects",
                        "[DEBUG] " + camera_id +
                        " | " + std::string(usedNV12 ? "NV12" : "BGR") +
                        " | SetDev=" + std::to_string(msSetDevice) +
                        "ms Preproc=" + std::to_string(msPreprocess) +
                        "ms Inf=" + std::to_string(msInference) +
                        "ms Transform=" + std::to_string(msTransform) +
                        "ms Postproc=" + std::to_string(msPostprocess) +
                        "ms Det=" + std::to_string(results.size()) +
                        (isClassification ? " [classify]" : " [detect]"),
                        __FILE__, __LINE__);
                }
            }
        }
        // --- 4b. Rescale coords from full-res to display-res (BGR full-res path) ---
        // When ANSVideoPlayer provides full-res BGR via the registry, Preprocess
        // and Postprocess operate in full-res coordinates. But the caller passed
        // a display-res inputImage and expects coords in that space. Remap here.
        if (bgrFullResScaleX != 1.0f || bgrFullResScaleY != 1.0f) {
            for (auto& obj : results) {
                obj.box.x = static_cast<int>(obj.box.x * bgrFullResScaleX);
                obj.box.y = static_cast<int>(obj.box.y * bgrFullResScaleY);
                obj.box.width = static_cast<int>(obj.box.width * bgrFullResScaleX);
                obj.box.height = static_cast<int>(obj.box.height * bgrFullResScaleY);
                // Rescale polygon points if present (segmentation / OBB)
                for (auto& pt : obj.polygon) {
                    pt.x *= bgrFullResScaleX;
                    pt.y *= bgrFullResScaleY;
                }
                // Rescale keypoints if present (pose: x,y,conf triplets)
                for (size_t k = 0; k + 2 < obj.kps.size(); k += 3) {
                    obj.kps[k] *= bgrFullResScaleX;
                    obj.kps[k + 1] *= bgrFullResScaleY;
                }
            }
        }
        // --- 5. Tracking + Stabilization ---
        // Skipped for classification results, which carry no real boxes.
        if (_trackerEnabled && !isClassification) {
            results = ApplyTracking(results, camera_id);
            double msTracking = dbg ? elapsed() : 0;
            if (_stabilizationEnabled) {
                results = StabilizeDetections(results, camera_id);
            }
            double msStabilize = dbg ? elapsed() : 0;
            if (dbg) {
                _logger.LogInfo("ANSRTYOLO::DetectObjects",
                    "[DEBUG] " + camera_id +
                    " | Tracking=" + std::to_string(msTracking) +
                    "ms Stabilize=" + std::to_string(msStabilize) + "ms",
                    __FILE__, __LINE__);
            }
        }
        // --- 6. Total pipeline time ---
        {
2026-03-28 16:54:11 +11:00
            double msTotal = std::chrono::duration<double, std::milli>(Clock::now() - t0).count();
            if (dbg) {
                _logger.LogInfo("ANSRTYOLO::DetectObjects",
                    "[DEBUG] " + camera_id + " | TOTAL=" + std::to_string(msTotal) +
                    "ms (" + std::to_string(inputImage.cols) + "x" + std::to_string(inputImage.rows) +
                    ") Results=" + std::to_string(results.size()),
                    __FILE__, __LINE__);
            }
            // DebugView output — controlled by ANSCORE_DEBUGVIEW
            // msPreproc here spans t0.._trtStart, i.e. setDevice + preprocess.
            double msPreproc = std::chrono::duration<double, std::milli>(_trtStart - t0).count();
            ANS_DBG("YOLO_Timing", "cam=%s total=%.1fms preproc=%.1fms inf=%.1fms %dx%d det=%zu %s",
                camera_id.c_str(), msTotal, msPreproc, _trtMs,
                inputImage.cols, inputImage.rows, results.size(),
                usedNV12 ? "NV12" : "BGR");
2026-03-28 16:54:11 +11:00
        }
        return results;
    }
    catch (std::exception& e) {
        _logger.LogFatal("ANSRTYOLO::DetectObjects", e.what(), __FILE__, __LINE__);
        return {};
    }
}
// ====================================================================
// DetectObjectsBatch — batch inference with auto-detection
// ====================================================================
/// @brief Batched variant of DetectObjects: preprocess all frames, run one
///        TensorRT inference, determine the task type once from the output
///        dimensions, then postprocess each image on a separate async task.
///        Oversized batches are recursively split into maxBatch-sized chunks;
///        undersized batches are padded to the next power of two (duplicating
///        the last frame) before inference and trimmed afterwards.
/// @param inputImages  Frames to process (BGR). Must be non-empty.
/// @param camera_id    Camera identifier stamped onto every returned Object
///                     and used as the tracking key for every frame.
/// @return One result vector per input image (same order); empty on failure.
///
/// Fix vs. previous revision: the ANS_DBG timing line passed a size_t
/// (realCount) through a "%d" varargs specifier, which is undefined behavior
/// on 64-bit targets; it now uses "%zu" (matching DetectObjects' usage).
std::vector<std::vector<Object>> ANSRTYOLO::DetectObjectsBatch(
    const std::vector<cv::Mat>& inputImages, const std::string& camera_id) {
    if (inputImages.empty()) {
        _logger.LogError("ANSRTYOLO::DetectObjectsBatch", "Empty input images vector", __FILE__, __LINE__);
        return {};
    }
    // Auto-split if batch exceeds engine capacity: recurse on maxBatch-sized
    // chunks and concatenate. If a chunk comes back short, pad with empty
    // result slots so output size always matches input size.
    const int maxBatch = m_options.maxBatchSize > 0 ? m_options.maxBatchSize : 1;
    if (static_cast<int>(inputImages.size()) > maxBatch && maxBatch > 0) {
        const size_t numImages = inputImages.size();
        std::vector<std::vector<Object>> allResults;
        allResults.reserve(numImages);
        for (size_t start = 0; start < numImages; start += static_cast<size_t>(maxBatch)) {
            const size_t end = std::min(start + static_cast<size_t>(maxBatch), numImages);
            std::vector<cv::Mat> chunk(inputImages.begin() + start, inputImages.begin() + end);
            auto chunkResults = DetectObjectsBatch(chunk, camera_id);
            if (chunkResults.size() == chunk.size()) {
                for (auto& r : chunkResults) allResults.push_back(std::move(r));
            }
            else {
                _logger.LogError("ANSRTYOLO::DetectObjectsBatch",
                    "Chunk returned " + std::to_string(chunkResults.size()) +
                    " results, expected " + std::to_string(chunk.size()), __FILE__, __LINE__);
                for (auto& r : chunkResults) allResults.push_back(std::move(r));
                for (size_t pad = chunkResults.size(); pad < chunk.size(); ++pad)
                    allResults.push_back({});
            }
        }
        return allResults;
    }
    try {
        // --- Debug timer helper (same pattern as DetectObjects) ---
        using Clock = std::chrono::steady_clock;
        const bool dbg = _debugFlag;
        auto t0 = Clock::now(); // Always set — used by ANS_DBG timing output
        auto tPrev = t0;
        auto elapsed = [&]() -> double {
            auto now = Clock::now();
            double ms = std::chrono::duration<double, std::milli>(now - tPrev).count();
            tPrev = now;
            return ms;
        };
        // Ensure correct GPU context for preprocessing (multi-GPU safety)
        if (m_trtEngine) {
            m_trtEngine->setDeviceContext();
        }
        double msSetDevice = dbg ? elapsed() : 0;
        // CUDA context health check (same as DetectObjects)
        if (!m_nv12Helper.isCudaContextHealthy(_logger, "ANSRTYOLO")) {
            return {};
        }
        const size_t realCount = inputImages.size();
        // Pad batch to next power-of-2 (clamped to engine max) by repeating
        // the last frame; padded outputs are discarded after inference.
        size_t paddedCount = 1;
        while (paddedCount < realCount) paddedCount *= 2;
        paddedCount = std::min(paddedCount, static_cast<size_t>(maxBatch));
        const std::vector<cv::Mat>* batchPtr = &inputImages;
        std::vector<cv::Mat> paddedImages;
        if (paddedCount > realCount) {
            paddedImages.reserve(paddedCount);
            paddedImages.insert(paddedImages.end(), inputImages.begin(), inputImages.end());
            for (size_t p = realCount; p < paddedCount; ++p)
                paddedImages.push_back(inputImages.back());
            batchPtr = &paddedImages;
        }
        double msPad = dbg ? elapsed() : 0;
        BatchMetadata metadata;
        const auto inputs = PreprocessBatch(*batchPtr, metadata);
        double msPreprocess = dbg ? elapsed() : 0;
        if (inputs.empty() || inputs[0].empty()) {
            _logger.LogWarn("ANSRTYOLO::DetectObjectsBatch", "Skipped: preprocessing failed", __FILE__, __LINE__);
            return {};
        }
        // Check for prior CUDA errors before inference. cudaGetLastError()
        // also clears the sticky error so it is not misattributed to us.
        cudaError_t priorErr = cudaGetLastError();
        if (priorErr != cudaSuccess) {
            _logger.LogWarn("ANSRTYOLO::DetectObjectsBatch",
                std::string("Cleared prior CUDA error before inference: ")
                + cudaGetErrorString(priorErr),
                __FILE__, __LINE__);
        }
        std::vector<std::vector<std::vector<float>>> featureVectors;
        auto succ = m_trtEngine->runInference(inputs, featureVectors);
        if (!succ) {
            // Peek (not clear) so the error remains visible to callers.
            cudaError_t postErr = cudaPeekAtLastError();
            std::string detail = "runInference returned false, batchSize="
                + std::to_string(inputs[0].size());
            if (postErr != cudaSuccess) {
                detail += ", CUDA error: ";
                detail += cudaGetErrorString(postErr);
            }
            _logger.LogError("ANSRTYOLO::DetectObjectsBatch", detail, __FILE__, __LINE__);
            return {};
        }
        double msInference = dbg ? elapsed() : 0;
        if (featureVectors.size() != paddedCount) {
            _logger.LogError("ANSRTYOLO::DetectObjectsBatch", "Output batch size mismatch", __FILE__, __LINE__);
            return {};
        }
        // Drop outputs produced for padding frames.
        featureVectors.resize(realCount);
        const auto& outputDims = m_trtEngine->getOutputDims();
        const size_t numOutputs = outputDims.size();
        const size_t numBatch = featureVectors.size();
        // Determine task type once (same model for all images in batch)
        int dim1 = outputDims[0].d[1];
        int dim2 = outputDims[0].d[2];
        int nc = static_cast<int>(_classes.size());
        enum class TaskType { DetLegacy, DetE2E, OBBLegacy, OBBE2E,
                              SegLegacy, SegE2E, PoseLegacy, PoseE2E, Classify };
        TaskType taskType = TaskType::DetLegacy; // default
        // E2E: dim1 > dim2 (e.g. [B,300,6]); Legacy: dim1 < dim2 (e.g. [B,84,8400])
        const bool isEndToEnd = (dim1 > dim2) || (dim2 <= 20);
        if (numOutputs >= 2) {
            // Multiple output tensors → segmentation model.
            taskType = isEndToEnd ? TaskType::SegE2E : TaskType::SegLegacy;
        }
        else if (outputDims[0].nbDims <= 2) {
            // Rank <= 2 output → classification head.
            taskType = TaskType::Classify;
        }
        else if (isEndToEnd) {
            // Per-detection width: 6 = det, 7 = OBB, 6 + 3k = pose triplets.
            if (dim2 == 6) taskType = TaskType::DetE2E;
            else if (dim2 == 7) taskType = TaskType::OBBE2E;
            else if (dim2 > 7 && (dim2-6) % 3 == 0) taskType = TaskType::PoseE2E;
            else taskType = TaskType::DetE2E;
        }
        else {
            int extra = dim1 - 4;
            bool routed = false;
            // Try class-list-based routing first (only if class count fits within tensor)
            if (nc > 0 && nc <= extra) {
                if (extra == nc) { taskType = TaskType::DetLegacy; routed = true; }
                else if (extra == nc + 1) { taskType = TaskType::OBBLegacy; routed = true; }
                else if ((extra-nc) % 3 == 0 && (extra-nc) >= 3) { taskType = TaskType::PoseLegacy; routed = true; }
            }
            // Fallback: probe last channel for angle values to detect OBB
            if (!routed && extra >= 2 && !featureVectors.empty() && !featureVectors[0].empty() && !featureVectors[0][0].empty()) {
                // Transpose first image's feature vector and probe last column.
                // (featureVectors is non-const, so .data() is already float*.)
                cv::Mat raw(dim1, dim2, CV_32F, featureVectors[0][0].data());
                cv::Mat probe;
                cv::transpose(raw, probe); // [dim2, dim1]
                int lastCol = dim1 - 1;
                int numSamples = std::min(dim2, 100);
                int angleCount = 0;
                for (int s = 0; s < numSamples; ++s) {
                    float v = probe.at<float>(s, lastCol);
                    if (v >= -3.15f && v <= 3.15f) ++angleCount;
                }
                // >80% of probed values in radian range → treat as OBB angles.
                if (angleCount > numSamples * 8 / 10) {
                    taskType = TaskType::OBBLegacy;
                    routed = true;
                }
            }
            if (!routed) {
                // dim1 == 56 matches the standard 17-keypoint pose layout.
                if (dim1 == 56) taskType = TaskType::PoseLegacy;
                else taskType = TaskType::DetLegacy;
            }
        }
        // Process each image in parallel: one async postprocess task per
        // frame, each capturing its own copy of the feature data + metadata.
        std::vector<std::vector<Object>> batchDetections(numBatch);
        std::vector<std::future<std::vector<Object>>> postFutures;
        postFutures.reserve(numBatch);
        for (size_t batchIdx = 0; batchIdx < numBatch; ++batchIdx) {
            const auto& batchOutput = featureVectors[batchIdx];
            ImageMetadata imgMeta;
            imgMeta.ratio = metadata.ratios[batchIdx];
            imgMeta.imgWidth = static_cast<float>(metadata.imgWidths[batchIdx]);
            imgMeta.imgHeight = static_cast<float>(metadata.imgHeights[batchIdx]);
            switch (taskType) {
            case TaskType::SegLegacy:
            case TaskType::SegE2E: {
                // Segmentation consumes all output tensors for this image.
                std::vector<std::vector<float>> fv2d;
                fv2d.reserve(batchOutput.size());
                for (const auto& out : batchOutput) fv2d.push_back(out);
                if (taskType == TaskType::SegE2E) {
                    postFutures.push_back(std::async(std::launch::async,
                        [this, fv = std::move(fv2d), cid = camera_id, m = imgMeta]() mutable {
                            return PostprocessSegE2E(fv, cid, m);
                        }));
                }
                else {
                    postFutures.push_back(std::async(std::launch::async,
                        [this, fv = std::move(fv2d), cid = camera_id, m = imgMeta]() mutable {
                            return PostprocessSegmentation(fv, cid, m);
                        }));
                }
                break;
            }
            case TaskType::Classify: {
                std::vector<float> fv = batchOutput.empty() ? std::vector<float>{} : batchOutput[0];
                postFutures.push_back(std::async(std::launch::async,
                    [this, fv = std::move(fv), cid = camera_id, m = imgMeta]() mutable {
                        return PostprocessClassify(fv, cid, m);
                    }));
                break;
            }
            default: {
                // Single-tensor tasks: detection / OBB / pose, legacy or E2E.
                std::vector<float> fv = batchOutput.empty() ? std::vector<float>{} : batchOutput[0];
                switch (taskType) {
                case TaskType::DetLegacy:
                    postFutures.push_back(std::async(std::launch::async,
                        [this, fv = std::move(fv), cid = camera_id, m = imgMeta]() mutable {
                            return PostprocessDetection(fv, cid, m);
                        }));
                    break;
                case TaskType::DetE2E:
                    postFutures.push_back(std::async(std::launch::async,
                        [this, fv = std::move(fv), cid = camera_id, m = imgMeta]() mutable {
                            return PostprocessDetectionE2E(fv, cid, m);
                        }));
                    break;
                case TaskType::OBBLegacy:
                    postFutures.push_back(std::async(std::launch::async,
                        [this, fv = std::move(fv), cid = camera_id, m = imgMeta]() mutable {
                            return PostprocessOBB(fv, cid, m);
                        }));
                    break;
                case TaskType::OBBE2E:
                    postFutures.push_back(std::async(std::launch::async,
                        [this, fv = std::move(fv), cid = camera_id, m = imgMeta]() mutable {
                            return PostprocessOBBE2E(fv, cid, m);
                        }));
                    break;
                case TaskType::PoseLegacy:
                    postFutures.push_back(std::async(std::launch::async,
                        [this, fv = std::move(fv), cid = camera_id, m = imgMeta]() mutable {
                            return PostprocessPose(fv, cid, m);
                        }));
                    break;
                case TaskType::PoseE2E:
                    postFutures.push_back(std::async(std::launch::async,
                        [this, fv = std::move(fv), cid = camera_id, m = imgMeta]() mutable {
                            return PostprocessPoseE2E(fv, cid, m);
                        }));
                    break;
                default:
                    postFutures.push_back(std::async(std::launch::async,
                        [this, fv = std::move(fv), cid = camera_id, m = imgMeta]() mutable {
                            return PostprocessDetection(fv, cid, m);
                        }));
                    break;
                }
                break;
            }
            }
        }
        // Gather results (futures keep input order, so results stay aligned).
        for (size_t i = 0; i < numBatch; ++i)
            batchDetections[i] = postFutures[i].get();
        // Apply tracker per frame (skip for classification models)
        if (_trackerEnabled && taskType != TaskType::Classify) {
            for (auto& results : batchDetections) {
                if (!results.empty()) {
                    results = ApplyTracking(results, camera_id);
                    if (_stabilizationEnabled) {
                        results = StabilizeDetections(results, camera_id);
                    }
                }
            }
        }
        {
            double msPostprocess = dbg ? elapsed() : 0;
            double msTotal = std::chrono::duration<double, std::milli>(Clock::now() - t0).count();
            if (dbg) {
                _logger.LogInfo("ANSRTYOLO::DetectObjectsBatch",
                    "[DEBUG] " + camera_id +
                    " batch=" + std::to_string(realCount) +
                    " | SetDev=" + std::to_string(msSetDevice) +
                    "ms Pad=" + std::to_string(msPad) +
                    "ms Preproc=" + std::to_string(msPreprocess) +
                    "ms Inf=" + std::to_string(msInference) +
                    "ms Postproc=" + std::to_string(msPostprocess) +
                    "ms TOTAL=" + std::to_string(msTotal) + "ms",
                    __FILE__, __LINE__);
            }
            // NOTE: msPreprocess/msInference are 0 when _debugFlag is off,
            // so this DebugView line only carries real stage timings in
            // debug mode. "%zu" matches realCount's size_t type (the old
            // "%d" was UB on 64-bit targets).
            ANS_DBG("YOLO_Timing", "cam=%s batch=%zu total=%.1fms preproc=%.1fms inf=%.1fms",
                camera_id.c_str(), realCount, msTotal, msPreprocess, msInference);
        }
        return batchDetections;
    }
    catch (const std::exception& e) {
        _logger.LogFatal("ANSRTYOLO::DetectObjectsBatch", e.what(), __FILE__, __LINE__);
        return {};
    }
}
} // namespace ANSCENTER