#include "ANSRTYOLO.h"
#include "Utility.h"
#include "ANSLicense.h" // ANS_DBG macro for DebugView
// NOTE(review): the seven #include directives below lost their header names —
// angle-bracket text was stripped from this file (likely by an HTML/XML
// export). The same stripping removed template arguments throughout this
// translation unit (std::lock_guard<...>, std::vector<...>, static_cast<...>,
// std::make_shared<...>, etc.). Restore them from version control before
// compiling; tokens are preserved here as-is.
#include
#include
#include
#include
#include
#include
#include

namespace ANSCENTER {

// ====================================================================
// ANSODBase interface — OptimizeModel
// ====================================================================
// Builds (or rebuilds) the TensorRT engine from the raw ONNX model.
// fp16                  — request FP16 precision for the engine build.
// optimizedModelFolder  — out: set to the folder containing the raw model,
//                         where the serialized engine is written.
// Returns false on missing model file, failed base-class call, failed
// engine build, or any std::exception (logged as fatal).
bool ANSRTYOLO::OptimizeModel(bool fp16, std::string& optimizedModelFolder) {
    // NOTE(review): template argument stripped — presumably std::lock_guard<std::mutex>.
    std::lock_guard lock(_mutex);
    if (!ANSODBase::OptimizeModel(fp16, optimizedModelFolder)) return false;
    if (!FileExist(_modelFilePath)) {
        _logger.LogFatal("ANSRTYOLO::OptimizeModel", "Raw model file path does not exist", __FILE__, __LINE__);
        return false;
    }
    try {
        _fp16 = fp16;
        // The engine is emitted next to the raw model file.
        optimizedModelFolder = GetParentFolder(_modelFilePath);
        if (!m_trtEngine) {
            // Copy GPU/batch/shape limits from the model config into the
            // TensorRT engine options before constructing the engine.
            m_options.optBatchSize = _modelConfig.gpuOptBatchSize;
            m_options.maxBatchSize = _modelConfig.gpuMaxBatchSize;
            m_options.deviceIndex = _modelConfig.gpuDeviceIndex;
            m_options.maxInputHeight = _modelConfig.maxInputHeight;
            m_options.minInputHeight = _modelConfig.minInputHeight;
            m_options.optInputHeight = _modelConfig.optInputHeight;
            m_options.maxInputWidth = _modelConfig.maxInputWidth;
            m_options.minInputWidth = _modelConfig.minInputWidth;
            m_options.optInputWidth = _modelConfig.optInputWidth;
            m_options.engineFileDir = optimizedModelFolder;
            m_options.precision = (_fp16 ? Precision::FP16 : Precision::FP32);
            // NOTE(review): make_shared's template argument was stripped —
            // likely std::make_shared<Engine<...>>; confirm against header.
            m_trtEngine = std::make_shared>(m_options);
        }
        auto succ = m_trtEngine->buildWithRetry(_modelFilePath, SUB_VALS, DIV_VALS, NORMALIZE);
        if (!succ) {
            _logger.LogError("ANSRTYOLO::OptimizeModel", "Error: Unable to build TensorRT engine. " + _modelFilePath, __FILE__, __LINE__);
            return false;
        }
        return true;
    }
    catch (std::exception& e) {
        _logger.LogFatal("ANSRTYOLO::OptimizeModel", e.what(), __FILE__, __LINE__);
        return false;
    }
}

// ====================================================================
// ANSODBase interface — LoadModel
// ====================================================================
// Loads a model packaged as an encrypted zip: delegates extraction to the
// base class, sanitizes config values, configures engine options, resolves
// class names, then (optionally) acquires a shared TensorRT engine from the
// pool and warms it up. Returns false on any failure (logged).
bool ANSRTYOLO::LoadModel(const std::string& modelZipFilePath, const std::string& modelZipPassword) {
    std::lock_guard lock(_mutex);
    try {
        _isFixedBatch = false;
        bool result = ANSODBase::LoadModel(modelZipFilePath, modelZipPassword);
        if (!result) return false;
        _modelConfig.modelType = ModelType::TENSORRT;
        // Clamp obviously-invalid config values to safe defaults.
        if (_modelConfig.inpHeight <= 0) _modelConfig.inpHeight = 640;
        if (_modelConfig.inpWidth <= 0) _modelConfig.inpWidth = 640;
        if (_modelConfig.modelMNSThreshold < 0.2f) _modelConfig.modelMNSThreshold = 0.5f;
        if (_modelConfig.modelConfThreshold < 0.2f) _modelConfig.modelConfThreshold = 0.5f;
        // 133 keypoints is the COCO-WholeBody maximum; 17 is the COCO default.
        if (_modelConfig.numKPS <= 0 || _modelConfig.numKPS > 133) _modelConfig.numKPS = 17;
        if (_modelConfig.kpsThreshold <= 0) _modelConfig.kpsThreshold = 0.5f;
        _fp16 = true;
        // Mirror config into the postprocess tuning members.
        TOP_K = 300;
        SEG_CHANNELS = 32;
        PROBABILITY_THRESHOLD = _modelConfig.detectionScoreThreshold;
        NMS_THRESHOLD = _modelConfig.modelMNSThreshold;
        SEGMENTATION_THRESHOLD = 0.5f;
        SEG_H = 160;
        SEG_W = 160;
        NUM_KPS = _modelConfig.numKPS;
        KPS_THRESHOLD = _modelConfig.kpsThreshold;
        m_options.optBatchSize = _modelConfig.gpuOptBatchSize;
        m_options.maxBatchSize = _modelConfig.gpuMaxBatchSize;
        m_options.deviceIndex = _modelConfig.gpuDeviceIndex;
        m_options.maxInputHeight = _modelConfig.maxInputHeight;
        m_options.minInputHeight = _modelConfig.minInputHeight;
        m_options.optInputHeight = _modelConfig.optInputHeight;
        m_options.maxInputWidth = _modelConfig.maxInputWidth;
        m_options.minInputWidth = _modelConfig.minInputWidth;
        m_options.optInputWidth = _modelConfig.optInputWidth;
        m_options.engineFileDir = _modelFolder;
        m_options.precision = (_fp16 ? Precision::FP16 : Precision::FP32);
        _modelFilePath = CreateFilePath(_modelFolder, "train_last.onnx");
        if (FileExist(_modelConfigFile)) {
            // Config file present: read classes and (optionally) input shape.
            ModelType modelType;
            // NOTE(review): element type stripped — presumably std::vector<int>.
            std::vector inputShape;
            _classes = ANSUtilityHelper::GetConfigFileContent(_modelConfigFile, modelType, inputShape);
            if (inputShape.size() == 2) {
                if (inputShape[0] > 0) _modelConfig.inpHeight = inputShape[0];
                if (inputShape[1] > 0) _modelConfig.inpWidth = inputShape[1];
            }
        }
        else {
            // No config file: fall back to a classes.names file, else baked-in defaults.
            _classFilePath = CreateFilePath(_modelFolder, "classes.names");
            std::ifstream isValid(_classFilePath);
            if (!isValid) LoadClassesFromString();
            else LoadClassesFromFile();
        }
        if (this->_loadEngineOnCreation) {
            if (!m_trtEngine) {
                // Key identifies a shareable engine: model path + precision + max batch.
                m_poolKey = { _modelFilePath, static_cast(m_options.precision), m_options.maxBatchSize };
                m_trtEngine = EnginePoolManager::instance().acquire(
                    m_poolKey, m_options, _modelFilePath,
                    SUB_VALS, DIV_VALS, NORMALIZE, m_maxSlotsPerGpu);
                m_usingSharedPool = (m_trtEngine != nullptr);
            }
            if (!m_trtEngine) {
                _logger.LogError("ANSRTYOLO::LoadModel", "Error: Unable to load TensorRT engine. " + _modelFilePath, __FILE__, __LINE__);
                _modelLoadValid = false;
                return false;
            }
            // The pooled engine may have been built with different batch limits —
            // adopt its actual values so later inference matches the engine.
            m_options.maxBatchSize = m_trtEngine->getOptions().maxBatchSize;
            m_options.optBatchSize = m_trtEngine->getOptions().optBatchSize;
            m_trtEngine->warmUp();
        }
        _modelLoadValid = true;
        _isInitialized = true;
        return true;
    }
    catch (std::exception& e) {
        _logger.LogFatal("ANSRTYOLO::LoadModel", e.what(), __FILE__, __LINE__);
        return false;
    }
}

// ====================================================================
// ANSODBase interface — LoadModelFromFolder
// ====================================================================
// Same flow as LoadModel but from an unpacked folder: modelName selects the
// ONNX file ("train_last" when empty), className names the classes file, and
// labelMap is filled with a comma-separated class list on success.
bool ANSRTYOLO::LoadModelFromFolder(std::string licenseKey, ModelConfig modelConfig, std::string modelName,
    std::string className, const std::string& modelFolder, std::string& labelMap) {
    std::lock_guard lock(_mutex);
    try {
        _isFixedBatch = false;
        bool result = ANSODBase::LoadModelFromFolder(licenseKey, modelConfig, modelName, className, modelFolder, labelMap);
        if (!result) return false;
        _modelConfig = modelConfig;
        _modelConfig.modelType = ModelType::TENSORRT;
        // Clamp obviously-invalid config values to safe defaults.
        if (_modelConfig.inpHeight <= 0) _modelConfig.inpHeight = 640;
        if (_modelConfig.inpWidth <= 0) _modelConfig.inpWidth = 640;
        _modelConfig.precisionType = PrecisionType::FP32;
        if (_modelConfig.numKPS <= 0 || _modelConfig.numKPS > 133) _modelConfig.numKPS = 17;
        if (_modelConfig.modelMNSThreshold < 0.2f) _modelConfig.modelMNSThreshold = 0.5f;
        if (_modelConfig.modelConfThreshold < 0.2f) _modelConfig.modelConfThreshold = 0.5f;
        if (_modelConfig.kpsThreshold <= 0) _modelConfig.kpsThreshold = 0.5f;
        _fp16 = true;
        // Mirror config into the postprocess tuning members.
        TOP_K = 300;
        SEG_CHANNELS = 32;
        PROBABILITY_THRESHOLD = _modelConfig.detectionScoreThreshold;
        NMS_THRESHOLD = _modelConfig.modelMNSThreshold;
        SEGMENTATION_THRESHOLD = 0.5f;
        SEG_H = 160;
        SEG_W = 160;
        NUM_KPS = _modelConfig.numKPS;
        KPS_THRESHOLD = _modelConfig.kpsThreshold;
        // Empty modelName falls back to the default export name.
        std::string _modelName = modelName;
        if (_modelName.empty()) _modelName = "train_last";
        std::string modelFullName = _modelName + ".onnx";
        m_options.optBatchSize = _modelConfig.gpuOptBatchSize;
        m_options.maxBatchSize = _modelConfig.gpuMaxBatchSize;
        m_options.deviceIndex = _modelConfig.gpuDeviceIndex;
        m_options.maxInputHeight = _modelConfig.maxInputHeight;
        m_options.minInputHeight = _modelConfig.minInputHeight;
        m_options.optInputHeight = _modelConfig.optInputHeight;
        m_options.maxInputWidth = _modelConfig.maxInputWidth;
        m_options.minInputWidth = _modelConfig.minInputWidth;
        m_options.optInputWidth = _modelConfig.optInputWidth;
        m_options.engineFileDir = _modelFolder;
        m_options.precision = (_fp16 ? Precision::FP16 : Precision::FP32);
        _modelFilePath = CreateFilePath(_modelFolder, modelFullName);
        if (FileExist(_modelConfigFile)) {
            ModelType modelType;
            // NOTE(review): element type stripped — presumably std::vector<int>.
            std::vector inputShape;
            _classes = ANSUtilityHelper::GetConfigFileContent(_modelConfigFile, modelType, inputShape);
            if (inputShape.size() == 2) {
                if (inputShape[0] > 0) _modelConfig.inpHeight = inputShape[0];
                if (inputShape[1] > 0) _modelConfig.inpWidth = inputShape[1];
            }
        }
        else {
            // Unlike LoadModel, the classes file name is caller-supplied here.
            _classFilePath = CreateFilePath(_modelFolder, className);
            std::ifstream isValid(_classFilePath);
            if (!isValid) LoadClassesFromString();
            else LoadClassesFromFile();
        }
        labelMap.clear();
        if (!_classes.empty()) labelMap = VectorToCommaSeparatedString(_classes);
        if (this->_loadEngineOnCreation) {
            if (!m_trtEngine) {
                m_poolKey = { _modelFilePath, static_cast(m_options.precision), m_options.maxBatchSize };
                m_trtEngine = EnginePoolManager::instance().acquire(
                    m_poolKey, m_options, _modelFilePath,
                    SUB_VALS, DIV_VALS, NORMALIZE, m_maxSlotsPerGpu);
                m_usingSharedPool = (m_trtEngine != nullptr);
            }
            if (!m_trtEngine) {
                _logger.LogError("ANSRTYOLO::LoadModelFromFolder", "Error: Unable to load TensorRT engine. " + _modelFilePath, __FILE__, __LINE__);
                _modelLoadValid = false;
                return false;
            }
            m_options.maxBatchSize = m_trtEngine->getOptions().maxBatchSize;
            m_options.optBatchSize = m_trtEngine->getOptions().optBatchSize;
            m_trtEngine->warmUp();
        }
        _modelLoadValid = true;
        _isInitialized = true;
        return true;
    }
    catch (std::exception& e) {
        _logger.LogFatal("ANSRTYOLO::LoadModelFromFolder", e.what(), __FILE__, __LINE__);
        return false;
    }
}

// ====================================================================
// ANSODBase interface — Initialize
// ====================================================================
// Full initialization from a zip package. Re-entrant: if a valid engine is
// already loaded, engine acquisition/warm-up is skipped (engineAlreadyLoaded)
// while config, classes, and labelMap are still refreshed.
bool ANSRTYOLO::Initialize(std::string licenseKey, ModelConfig modelConfig,
    const std::string& modelZipFilePath, const std::string& modelZipPassword, std::string& labelMap) {
    std::lock_guard lock(_mutex);
    try {
        // Remember whether a healthy engine exists before invalidating state.
        const bool engineAlreadyLoaded = _modelLoadValid && _isInitialized && m_trtEngine != nullptr;
        _modelLoadValid = false;
        _isFixedBatch = false;
        bool result = ANSODBase::Initialize(licenseKey, modelConfig, modelZipFilePath, modelZipPassword, labelMap);
        if (!result) return false;
        _modelConfig = modelConfig;
        _modelConfig.modelType = ModelType::TENSORRT;
        // Clamp obviously-invalid config values to safe defaults.
        if (_modelConfig.inpHeight <= 0) _modelConfig.inpHeight = 640;
        if (_modelConfig.inpWidth <= 0) _modelConfig.inpWidth = 640;
        _modelConfig.precisionType = PrecisionType::FP32;
        if (_modelConfig.numKPS <= 0 || _modelConfig.numKPS > 133) _modelConfig.numKPS = 17;
        if (_modelConfig.modelMNSThreshold < 0.2f) _modelConfig.modelMNSThreshold = 0.5f;
        if (_modelConfig.modelConfThreshold < 0.2f) _modelConfig.modelConfThreshold = 0.5f;
        if (_modelConfig.kpsThreshold <= 0) _modelConfig.kpsThreshold = 0.5f;
        _fp16 = true;
        // Mirror config into the postprocess tuning members.
        TOP_K = 300;
        SEG_CHANNELS = 32;
        PROBABILITY_THRESHOLD = _modelConfig.detectionScoreThreshold;
        NMS_THRESHOLD = _modelConfig.modelMNSThreshold;
        SEGMENTATION_THRESHOLD = 0.5f;
        SEG_H = 160;
        SEG_W = 160;
        NUM_KPS = _modelConfig.numKPS;
        KPS_THRESHOLD = _modelConfig.kpsThreshold;
        m_options.optBatchSize = _modelConfig.gpuOptBatchSize;
        m_options.maxBatchSize = _modelConfig.gpuMaxBatchSize;
        m_options.deviceIndex = _modelConfig.gpuDeviceIndex;
        m_options.maxInputHeight = _modelConfig.maxInputHeight;
        m_options.minInputHeight = _modelConfig.minInputHeight;
        m_options.optInputHeight = _modelConfig.optInputHeight;
        m_options.maxInputWidth = _modelConfig.maxInputWidth;
        m_options.minInputWidth = _modelConfig.minInputWidth;
        m_options.optInputWidth = _modelConfig.optInputWidth;
        m_options.engineFileDir = _modelFolder;
        m_options.precision = (_fp16 ? Precision::FP16 : Precision::FP32);
        _modelFilePath = CreateFilePath(_modelFolder, "train_last.onnx");
        if (FileExist(_modelConfigFile)) {
            ModelType modelType;
            // NOTE(review): element type stripped — presumably std::vector<int>.
            std::vector inputShape;
            _classes = ANSUtilityHelper::GetConfigFileContent(_modelConfigFile, modelType, inputShape);
            if (inputShape.size() == 2) {
                if (inputShape[0] > 0) _modelConfig.inpHeight = inputShape[0];
                if (inputShape[1] > 0) _modelConfig.inpWidth = inputShape[1];
            }
        }
        else {
            _classFilePath = CreateFilePath(_modelFolder, "classes.names");
            std::ifstream isValid(_classFilePath);
            if (!isValid) LoadClassesFromString();
            else LoadClassesFromFile();
        }
        labelMap.clear();
        if (!_classes.empty()) labelMap = VectorToCommaSeparatedString(_classes);
        // Skip engine acquisition when a valid engine survived re-initialization.
        if (this->_loadEngineOnCreation && !engineAlreadyLoaded) {
            if (!m_trtEngine) {
                m_poolKey = { _modelFilePath, static_cast(m_options.precision), m_options.maxBatchSize };
                m_trtEngine = EnginePoolManager::instance().acquire(
                    m_poolKey, m_options, _modelFilePath,
                    SUB_VALS, DIV_VALS, NORMALIZE, m_maxSlotsPerGpu);
                m_usingSharedPool = (m_trtEngine != nullptr);
            }
            if (!m_trtEngine) {
                _logger.LogError("ANSRTYOLO::Initialize", "Error: Unable to load TensorRT engine. " + _modelFilePath, __FILE__, __LINE__);
                _modelLoadValid = false;
                return false;
            }
            m_options.maxBatchSize = m_trtEngine->getOptions().maxBatchSize;
            m_options.optBatchSize = m_trtEngine->getOptions().optBatchSize;
            m_trtEngine->warmUp();
        }
        _modelLoadValid = true;
        _isInitialized = true;
        return true;
    }
    catch (std::exception& e) {
        _logger.LogFatal("ANSRTYOLO::Initialize", e.what(), __FILE__, __LINE__);
        return false;
    }
}

// ====================================================================
// RunInference / RunInferencesBatch / Destroy / Destructor
// ====================================================================

// Convenience overload: run inference with no camera identifier.
// NOTE(review): the vector element type was stripped; confirm against header.
std::vector ANSRTYOLO::RunInference(const cv::Mat& inputImgBGR) {
    return RunInference(inputImgBGR, "");
}

// Validates state under the lock (model loaded, license valid, initialized,
// image at least 10x10), then runs DetectObjects OUTSIDE the lock so the
// mutex is not held during GPU work. Returns {} on any failure.
std::vector ANSRTYOLO::RunInference(const cv::Mat& inputImgBGR, const std::string& camera_id) {
    {
        std::lock_guard lock(_mutex);
        if (!_modelLoadValid) {
            _logger.LogError("ANSRTYOLO::RunInference", "Cannot load TensorRT model", __FILE__, __LINE__);
            return {};
        }
        if (!_licenseValid) {
            _logger.LogError("ANSRTYOLO::RunInference", "Invalid license", __FILE__, __LINE__);
            return {};
        }
        if (!_isInitialized) {
            _logger.LogError("ANSRTYOLO::RunInference", "Model not initialized", __FILE__, __LINE__);
            return {};
        }
        if (inputImgBGR.empty() || inputImgBGR.cols < 10 || inputImgBGR.rows < 10) return {};
    }
    try {
        return DetectObjects(inputImgBGR, camera_id);
    }
    catch (const std::exception& e) {
        _logger.LogFatal("ANSRTYOLO::RunInference", e.what(), __FILE__, __LINE__);
        return {};
    }
}

// Batch variant of RunInference. Same validate-under-lock / run-outside-lock
// pattern; fixed-batch engines are delegated to the base implementation.
std::vector> ANSRTYOLO::RunInferencesBatch(
    const std::vector& inputs, const std::string& camera_id) {
    {
        std::lock_guard lock(_mutex);
        if (!_modelLoadValid) {
            _logger.LogFatal("ANSRTYOLO::RunInferencesBatch", "Cannot load the TensorRT model", __FILE__, __LINE__);
            return {};
        }
        if (!_licenseValid) {
            _logger.LogFatal("ANSRTYOLO::RunInferencesBatch", "Runtime license is not valid or expired", __FILE__, __LINE__);
            return {};
        }
        if (!_isInitialized) {
            _logger.LogFatal("ANSRTYOLO::RunInferencesBatch", "Initialisation is not valid", __FILE__, __LINE__);
            return {};
        }
        if (inputs.empty()) {
            _logger.LogFatal("ANSRTYOLO::RunInferencesBatch", "Input images vector is empty", __FILE__, __LINE__);
            return {};
        }
    }
    try {
        if (_isFixedBatch) return ANSODBase::RunInferencesBatch(inputs, camera_id);
        else return DetectObjectsBatch(inputs, camera_id);
    }
    catch (const std::exception& e) {
        _logger.LogFatal("ANSRTYOLO::RunInferencesBatch", e.what(), __FILE__, __LINE__);
        return {};
    }
}

// Destructor delegates to Destroy(); exceptions are swallowed (logged) so the
// destructor itself never throws.
ANSRTYOLO::~ANSRTYOLO() {
    try {
        Destroy();
    }
    catch (std::exception& e) {
        _logger.LogError("ANSRTYOLO::~ANSRTYOLO()", e.what(), __FILE__, __LINE__);
    }
}

// Releases the engine (via the shared pool when pooled) and the NV12 helper.
// Returns false only if cleanup itself throws.
bool ANSRTYOLO::Destroy() {
    try {
        if (m_usingSharedPool) {
            // Release our reference to the shared pool.
            // Pool is destroyed only when all tasks release it.
            EnginePoolManager::instance().release(m_poolKey);
            m_trtEngine.reset(); // drop shared_ptr (pool may survive)
            m_usingSharedPool = false;
        }
        else {
            m_trtEngine.reset();
        }
        m_nv12Helper.destroy();
        return true;
    }
    catch (std::exception& e) {
        _logger.LogError("ANSRTYOLO::Destroy()", e.what(), __FILE__, __LINE__);
        return false;
    }
}

// ====================================================================
// GPU Preprocessing — single image (pinned-memory H2D path)
//
// 1. Copy raw host image into a pinned (page-locked) buffer
// 2. Upload from pinned memory → GPU (DMA, no staging copy)
// 3. BGR→RGB colour conversion on GPU
// 4. Letterbox resize on GPU (right-bottom pad)
//
// Pinned memory eliminates the internal pageable→pinned staging
// copy that CUDA performs for normal (pageable) host memory,
// cutting the H2D transfer of a 3840×2160 BGR frame (~24 MB)
// by 60-70%.
// ====================================================================
// Uploads one BGR (or grayscale) host image to the GPU, converts to RGB,
// and resizes to the engine's input size (direct resize for classification
// models, letterbox right/bottom-pad otherwise). Fills outMeta with the
// source dimensions and the inverse scale ratio used for box un-mapping.
// Returns a one-element batch of GPU mats, or {} on failure.
// NOTE(review): template arguments were stripped from this file —
// std::vector<...>, static_cast<float>(...), Engine<...>:: below; restore
// from version control before compiling.
std::vector> ANSRTYOLO::Preprocess(
    const cv::Mat& inputImage, ImageMetadata& outMeta) {
    std::lock_guard lock(_mutex);
    try {
        if (!_licenseValid) {
            _logger.LogFatal("ANSRTYOLO::Preprocess", "Invalid license", __FILE__, __LINE__);
            return {};
        }
        const auto& inputDims = m_trtEngine->getInputDims();
        const int inputH = inputDims[0].d[1];
        const int inputW = inputDims[0].d[2];
        // Early-out if CUDA context is dead (sticky error from CUVID crash etc.)
        if (!m_nv12Helper.isCudaContextHealthy(_logger, "ANSRTYOLO")) return {};
        cv::cuda::Stream stream;
        cv::cuda::GpuMat gpuImg;
        // Resolve source Mat (handle grayscale → BGR on CPU first)
        if (inputImage.channels() == 1) {
            cv::Mat img3Channel;
            cv::cvtColor(inputImage, img3Channel, cv::COLOR_GRAY2BGR);
            gpuImg.upload(img3Channel, stream);
        }
        else {
            gpuImg.upload(inputImage, stream);
        }
        // GPU: BGR → RGB
        cv::cuda::GpuMat gpuRGB;
        cv::cuda::cvtColor(gpuImg, gpuRGB, cv::COLOR_BGR2RGB, 0, stream);
        outMeta.imgHeight = static_cast(gpuRGB.rows);
        outMeta.imgWidth = static_cast(gpuRGB.cols);
        if (outMeta.imgHeight > 0 && outMeta.imgWidth > 0) {
            // ratio maps model-input coordinates back to source coordinates.
            outMeta.ratio = 1.f / std::min(
                inputDims[0].d[2] / static_cast(gpuRGB.cols),
                inputDims[0].d[1] / static_cast(gpuRGB.rows));
            // Check if model is classification (output ndims <= 2)
            const auto& outputDims = m_trtEngine->getOutputDims();
            const bool isClassification = !outputDims.empty() && outputDims[0].nbDims <= 2;
            cv::cuda::GpuMat gpuResized;
            if (gpuRGB.rows != inputH || gpuRGB.cols != inputW) {
                if (isClassification) {
                    // Classification: direct resize (no letterbox padding)
                    cv::cuda::resize(gpuRGB, gpuResized, cv::Size(inputW, inputH),
                        0, 0, cv::INTER_LINEAR, stream);
                }
                else {
                    // Detection/Seg/Pose/OBB: letterbox resize + right-bottom pad (on GPU)
                    gpuResized = Engine::resizeKeepAspectRatioPadRightBottom(
                        gpuRGB, inputH, inputW);
                }
            }
            else {
                gpuResized = gpuRGB;
            }
            stream.waitForCompletion();
            std::vector input{ std::move(gpuResized) };
            std::vector> inputs{ std::move(input) };
            return inputs;
        }
        else {
            _logger.LogFatal("ANSRTYOLO::Preprocess",
                "Image height or width is zero (Width: " + std::to_string(outMeta.imgWidth) +
                ", Height: " + std::to_string(outMeta.imgHeight) + ")", __FILE__, __LINE__);
            return {};
        }
    }
    catch (const std::exception& e) {
        _logger.LogWarn("ANSRTYOLO::Preprocess", std::string("Skipped frame: ") + e.what(), __FILE__, __LINE__);
        return {};
    }
}

// Dead code below: retained verbatim for reference only (disabled by #if 0).
// The NV12 fast path now lives in NV12PreprocessHelper::tryNV12().
#if 0 // PreprocessFromNV12 — moved to NV12PreprocessHelper::tryNV12()
    try {
        if (!gpuData || !gpuData->yPlane || !gpuData->uvPlane) {
            if (!m_nv12NullLogged) {
                m_nv12NullLogged = true;
                _logger.LogWarn("ANSRTYOLO::PreprocessFromNV12",
                    "Early exit: null data — gpuData=" + std::to_string(gpuData != nullptr) +
                    " yPlane=" + std::to_string(gpuData ? (gpuData->yPlane != nullptr) : false) +
                    " uvPlane=" + std::to_string(gpuData ? (gpuData->uvPlane != nullptr) : false) +
                    " isCuda=" + std::to_string(gpuData ? gpuData->isCudaDevicePtr : false),
                    __FILE__, __LINE__);
            }
            return {};
        }
        const auto& inputDims = m_trtEngine->getInputDims();
        const int inputH = inputDims[0].d[1];
        const int inputW = inputDims[0].d[2];
        const int frameW = gpuData->width;
        const int frameH = gpuData->height;
        if (frameW <= 0 || frameH <= 0) {
            if (!m_nv12DimLogged) {
                m_nv12DimLogged = true;
                _logger.LogWarn("ANSRTYOLO::PreprocessFromNV12",
                    "Early exit: bad dimensions — w=" + std::to_string(frameW) +
                    " h=" + std::to_string(frameH), __FILE__, __LINE__);
            }
            return {};
        }
        // Early-out if CUDA context is dead (sticky error from CUVID crash etc.)
        if (m_cudaContextDead) {
            if (!m_nv12DeadLogged) {
                m_nv12DeadLogged = true;
                _logger.LogWarn("ANSRTYOLO::PreprocessFromNV12",
                    "Early exit: CUDA context dead", __FILE__, __LINE__);
            }
            return {};
        }
        // Cache flag before lock is released — gpuData may be invalidated after unlock
        const bool isCudaDevice = gpuData->isCudaDevicePtr;
        // ── GPU index validation for zero-copy ──
        // NVDEC device pointers are only valid on the CUDA context that decoded them.
        // If decode GPU != inference GPU, wrapping those pointers causes
        // "illegal memory access" → sticky CUDA error → entire context dies.
        // Fall back to CPU memcpy+upload path when GPUs don't match.
        const int inferenceGpu = m_trtEngine ? m_trtEngine->getPreferredDeviceIndex() : 0;
        const bool gpuMatch = !isCudaDevice ||
            gpuData->gpuIndex < 0 || // unknown = trust it
            gpuData->gpuIndex == inferenceGpu;
        const bool useZeroCopy = isCudaDevice && gpuMatch;
        // Local plane pointers — default to gpuData's primary planes.
        // Overridden below for cross-GPU fallback (CPU NV12 instead of CUDA).
        uint8_t* effYPlane = gpuData->yPlane;
        uint8_t* effUvPlane = gpuData->uvPlane;
        int effYLinesize = gpuData->yLinesize;
        int effUvLinesize = gpuData->uvLinesize;
        if (isCudaDevice && !gpuMatch) {
            // Cross-GPU: NV12 decoded on one GPU, inference on another.
            // CPU NV12 fallback uploads full decode-res NV12 (e.g. 3840x2160 = 12.4 MB)
            // over PCIe, which is SLOWER than BGR at display-res (1920x1080 = 6.2 MB).
            // Measured: CPU NV12 cross-GPU = 15-39ms preproc vs BGR = 10-20ms.
            // Just fall back to BGR — it's faster for the cross-GPU case.
            if (!m_gpuMismatchLogged) {
                m_gpuMismatchLogged = true;
                _logger.LogInfo("ANSRTYOLO::PreprocessFromNV12",
                    "GPU mismatch (decode GPU " + std::to_string(gpuData->gpuIndex) +
                    " vs inference GPU " + std::to_string(inferenceGpu) +
                    ") — skipping NV12, using BGR (faster for cross-GPU: "
                    "BGR uploads " + std::to_string(displayW * displayH * 3 / 1024) +
                    "KB display-res vs NV12 " + std::to_string(frameW * frameH * 3 / 2 / 1024) +
                    "KB full-res)", __FILE__, __LINE__);
            }
            if (regLock.owns_lock()) regLock.unlock();
            return {}; // caller will use Preprocess(BGR) instead
        }
        // Diagnostic: log which path will be taken (once per instance)
        // Note: cross-GPU case already returned {} above, so reaching here
        // means either CUDA zero-copy (same GPU) or CPU NV12 upload (non-CUDA).
        if (!m_nv12PathLogged) {
            m_nv12PathLogged = true;
            const char* pathName = useZeroCopy ? "CUDA_ZERO_COPY" : "CPU_NV12_UPLOAD";
            _logger.LogInfo("ANSRTYOLO::PreprocessFromNV12",
                std::string("Path: ") + pathName +
                " | isCuda=" + std::to_string(isCudaDevice) +
                " gpuMatch=" + std::to_string(gpuMatch) +
                " decodeGpu=" + std::to_string(gpuData->gpuIndex) +
                " infGpu=" + std::to_string(inferenceGpu) +
                " frame=" + std::to_string(frameW) + "x" + std::to_string(frameH) +
                " effYLine=" + std::to_string(effYLinesize) +
                " effUvLine=" + std::to_string(effUvLinesize) +
                " effYPtr=0x" + std::to_string(reinterpret_cast(effYPlane)) +
                " hasCpuFallback=" + std::to_string(gpuData->cpuYPlane != nullptr),
                __FILE__, __LINE__);
        }
        cv::cuda::Stream stream;
        cv::cuda::GpuMat gpuY, gpuUV;
        if (useZeroCopy) {
            // ── CUDA zero-copy: wrap NVDEC device pointers directly ──
            // No memcpy, no device-to-device copy — data stays in NVDEC VRAM.
            // The fused letterbox kernel samples only ~409K pixels from the 4K
            // source (vs 8.3M full copy), completing in <1ms on RTX 5080.
            // We hold the registry lock until the kernel finishes reading.
            gpuY = cv::cuda::GpuMat(frameH, frameW, CV_8UC1, effYPlane, static_cast(effYLinesize));
            gpuUV = cv::cuda::GpuMat(frameH / 2, frameW, CV_8UC1, effUvPlane, static_cast(effUvLinesize));
            // Lock released after kernel completion (stream.waitForCompletion below)
        }
        else {
            // ── CPU path: memcpy + upload (fallback for D3D11VA / sw decode) ──
            // Hold registry lock during memcpy so the AVFrame can't be freed
            // by another thread calling gpu_frame_attach() on the same key.
            const size_t ySize = static_cast(frameW) * frameH;
            const size_t uvSize = static_cast(frameW) * frameH / 2;
            const size_t nv12Size = ySize + uvSize;
            ensurePinnedBuffer(nv12Size);
            if (!m_pinnedBuf) {
                if (!m_nv12PinnedLogged) {
                    m_nv12PinnedLogged = true;
                    _logger.LogWarn("ANSRTYOLO::PreprocessFromNV12",
                        "Early exit: pinned buffer alloc failed for " +
                        std::to_string(nv12Size) + " bytes", __FILE__, __LINE__);
                }
                return {};
            }
            // Validate NV12 plane pointers before memcpy
            const size_t yBufNeeded = (effYLinesize == frameW) ? ySize : static_cast(effYLinesize) * frameH;
            const size_t uvBufNeeded = (effUvLinesize == frameW) ? uvSize : static_cast(effUvLinesize) * (frameH / 2);
            if (!isMemoryReadable(effYPlane, std::min(yBufNeeded, (size_t)4096)) ||
                !isMemoryReadable(effUvPlane, std::min(uvBufNeeded, (size_t)4096))) {
                _logger.LogWarn("ANSRTYOLO::PreprocessFromNV12",
                    "NV12 plane pointers not readable! yPlane=0x" +
                    std::to_string(reinterpret_cast(effYPlane)) +
                    " uvPlane=0x" + std::to_string(reinterpret_cast(effUvPlane)) +
                    " yLinesize=" + std::to_string(effYLinesize) +
                    " uvLinesize=" + std::to_string(effUvLinesize) +
                    " w=" + std::to_string(frameW) + " h=" + std::to_string(frameH),
                    __FILE__, __LINE__);
                if (regLock.owns_lock()) regLock.unlock();
                return {}; // fall back to BGR
            }
            uint8_t* dst = static_cast(m_pinnedBuf);
            bool cpyOk = true;
            if (effYLinesize == frameW) {
                cpyOk = safeMemcpy(dst, effYPlane, ySize);
            }
            else {
                for (int row = 0; row < frameH && cpyOk; row++)
                    cpyOk = safeMemcpy(dst + row * frameW, effYPlane + row * effYLinesize, frameW);
            }
            if (cpyOk) {
                uint8_t* uvDst = dst + ySize;
                if (effUvLinesize == frameW) {
                    cpyOk = safeMemcpy(uvDst, effUvPlane, uvSize);
                }
                else {
                    for (int row = 0; row < frameH / 2 && cpyOk; row++)
                        cpyOk = safeMemcpy(uvDst + row * frameW, effUvPlane + row * effUvLinesize, frameW);
                }
            }
            if (!cpyOk) {
                _logger.LogWarn("ANSRTYOLO::PreprocessFromNV12",
                    "Access violation during NV12 memcpy! Falling back to BGR. "
                    "yPlane=0x" + std::to_string(reinterpret_cast(effYPlane)) +
                    " uvPlane=0x" + std::to_string(reinterpret_cast(effUvPlane)) +
                    " yLinesize=" + std::to_string(effYLinesize) +
                    " uvLinesize=" + std::to_string(effUvLinesize) +
                    " w=" + std::to_string(frameW) + " h=" + std::to_string(frameH) +
                    " avframe=0x" + std::to_string(reinterpret_cast(gpuData->avframe)),
                    __FILE__, __LINE__);
                if (regLock.owns_lock()) regLock.unlock();
                return {}; // fall back to BGR
            }
            // NV12 data safely in pinned memory — release registry lock.
            // From here on we only read from m_pinnedBuf, not from gpuData.
            if (regLock.owns_lock()) regLock.unlock();
            cv::Mat pinnedY(frameH, frameW, CV_8UC1, m_pinnedBuf);
            cv::Mat pinnedUV(frameH / 2, frameW, CV_8UC1, static_cast(m_pinnedBuf) + ySize);
            gpuY.upload(pinnedY, stream);
            gpuUV.upload(pinnedUV, stream);
        }
        // Use display dimensions for coordinate mapping so postprocessed
        // bboxes map to the display image (1080p), not the NV12 source (4K).
        const float metaW = (displayW > 0) ? static_cast(displayW) : static_cast(frameW);
        const float metaH = (displayH > 0) ? static_cast(displayH) : static_cast(frameH);
        outMeta.imgWidth = metaW;
        outMeta.imgHeight = metaH;
        if (outMeta.imgHeight > 0 && outMeta.imgWidth > 0) {
            outMeta.ratio = 1.f / std::min(
                inputDims[0].d[2] / metaW,
                inputDims[0].d[1] / metaH);
            const auto& outputDims = m_trtEngine->getOutputDims();
            const bool isClassification = !outputDims.empty() && outputDims[0].nbDims <= 2;
            cudaStream_t rawStream = cv::cuda::StreamAccessor::getStream(stream);
            cv::cuda::GpuMat gpuResized;
            if (isClassification) {
                // Classification: NV12→RGB at full resolution, then simple resize
                cv::cuda::GpuMat gpuRGB(frameH, frameW, CV_8UC3);
                launchNV12ToRGB(
                    gpuY.ptr(), static_cast(gpuY.step),
                    gpuUV.ptr(), static_cast(gpuUV.step),
                    gpuRGB.ptr(), static_cast(gpuRGB.step),
                    frameW, frameH, rawStream);
                cv::cuda::resize(gpuRGB, gpuResized, cv::Size(inputW, inputH), 0, 0, cv::INTER_LINEAR, stream);
            }
            else if (frameW == inputW && frameH == inputH) {
                // Source matches model input — direct NV12→RGB, no resize needed
                gpuResized.create(inputH, inputW, CV_8UC3);
                launchNV12ToRGB(
                    gpuY.ptr(), static_cast(gpuY.step),
                    gpuUV.ptr(), static_cast(gpuUV.step),
                    gpuResized.ptr(), static_cast(gpuResized.step),
                    frameW, frameH, rawStream);
            }
            else {
                // Detection: fused NV12→RGB + letterbox in a SINGLE kernel at
                // output resolution (e.g. 640×640). This avoids the 24MB 4K RGB
                // intermediate and processes 20× fewer pixels than separate
                // convert + resize for 4K→640 downscale.
                float r = std::min(static_cast(inputW) / frameW, static_cast(inputH) / frameH);
                int unpadW = static_cast(r * frameW);
                int unpadH = static_cast(r * frameH);
                float invScale = 1.0f / r; // maps output coords → source coords
                gpuResized.create(inputH, inputW, CV_8UC3);
                launchNV12ToRGBLetterbox(
                    gpuY.ptr(), static_cast(gpuY.step),
                    gpuUV.ptr(), static_cast(gpuUV.step),
                    gpuResized.ptr(), static_cast(gpuResized.step),
                    inputW, inputH, frameW, frameH, unpadW, unpadH, invScale, rawStream);
            }
            stream.waitForCompletion();
            // Release registry lock now that kernel is done reading NVDEC pointers
            if (regLock.owns_lock()) regLock.unlock();
            // Log NV12 fast-path usage once per instance
            if (!m_nv12ActiveLogged) {
                m_nv12ActiveLogged = true;
                const char* mode = useZeroCopy ? "CUDA zero-copy" : "CPU upload";
                const char* kernel = isClassification ? "separate" : "FUSED letterbox";
                _logger.LogInfo("ANSRTYOLO::PreprocessFromNV12",
                    std::string(mode) + " ACTIVE (" + kernel + "): " +
                    std::to_string(frameW) + "x" + std::to_string(frameH) +
                    " NV12 -> " + std::to_string(inputW) + "x" + std::to_string(inputH) +
                    " display=" + std::to_string(displayW) + "x" + std::to_string(displayH),
                    __FILE__, __LINE__);
            }
            std::vector input{ std::move(gpuResized) };
            std::vector> inputs{ std::move(input) };
            return inputs;
        }
        {
            if (!m_nv12MetaLogged) {
                m_nv12MetaLogged = true;
                _logger.LogWarn("ANSRTYOLO::PreprocessFromNV12",
                    "Early exit: metadata dims invalid — metaW=" + std::to_string(outMeta.imgWidth) +
                    " metaH=" + std::to_string(outMeta.imgHeight) +
                    " displayW=" + std::to_string(displayW) +
                    " displayH=" + std::to_string(displayH), __FILE__, __LINE__);
            }
        }
        return {};
    }
    catch (const std::exception& e) {
        _logger.LogWarn("ANSRTYOLO::PreprocessFromNV12",
            std::string("NV12 fast path failed, falling back to BGR: ") + e.what(),
            __FILE__, __LINE__);
        return {};
    }
}
#endif // PreprocessFromNV12 moved to NV12PreprocessHelper

// ====================================================================
// GPU Preprocessing — batch
// ====================================================================
// Batch variant of Preprocess: uploads each image, converts BGR→RGB, and
// resizes (direct for classification, letterbox pad otherwise) on a single
// CUDA stream. Fills per-image heights/widths/ratios in outMetadata.
// Returns a single batch of GPU mats, or {} on any invalid image.
// NOTE(review): template arguments were stripped from this file
// (std::vector<...>, static_cast<float>, Engine<...>::); restore from
// version control before compiling.
std::vector> ANSRTYOLO::PreprocessBatch(
    const std::vector& inputImages, BatchMetadata& outMetadata) {
    if (!_licenseValid) {
        _logger.LogError("ANSRTYOLO::PreprocessBatch", "Invalid license", __FILE__, __LINE__);
        return {};
    }
    if (inputImages.empty()) {
        _logger.LogError("ANSRTYOLO::PreprocessBatch", "Empty input images vector", __FILE__, __LINE__);
        return {};
    }
    if (!m_nv12Helper.isCudaContextHealthy(_logger, "ANSRTYOLO")) return {};
    try {
        const auto& inputDims = m_trtEngine->getInputDims();
        if (inputDims.empty()) {
            _logger.LogError("ANSRTYOLO::PreprocessBatch", "No input dimensions available", __FILE__, __LINE__);
            return {};
        }
        const int inputH = inputDims[0].d[1];
        const int inputW = inputDims[0].d[2];
        if (inputH <= 0 || inputW <= 0) {
            _logger.LogError("ANSRTYOLO::PreprocessBatch", "Invalid model input dimensions", __FILE__, __LINE__);
            return {};
        }
        outMetadata.imgHeights.resize(inputImages.size());
        outMetadata.imgWidths.resize(inputImages.size());
        outMetadata.ratios.resize(inputImages.size());
        std::vector batchProcessed;
        batchProcessed.reserve(inputImages.size());
        cv::cuda::Stream stream;
        for (size_t i = 0; i < inputImages.size(); ++i) {
            const auto& inputImage = inputImages[i];
            if (inputImage.empty()) {
                _logger.LogError("ANSRTYOLO::PreprocessBatch",
                    "Empty input image at index " + std::to_string(i), __FILE__, __LINE__);
                return {};
            }
            cv::cuda::GpuMat img;
            // Grayscale is expanded to 3-channel BGR on the CPU before upload.
            if (inputImage.channels() == 1) {
                cv::Mat img3Channel;
                cv::cvtColor(inputImage, img3Channel, cv::COLOR_GRAY2BGR);
                img.upload(img3Channel, stream);
            }
            else if (inputImage.channels() == 3) {
                img.upload(inputImage, stream);
            }
            else {
                _logger.LogError("ANSRTYOLO::PreprocessBatch",
                    "Unsupported channel count at index " + std::to_string(i), __FILE__, __LINE__);
                return {};
            }
            cv::cuda::GpuMat imgRGB;
            cv::cuda::cvtColor(img, imgRGB, cv::COLOR_BGR2RGB, 0, stream);
            outMetadata.imgHeights[i] = imgRGB.rows;
            outMetadata.imgWidths[i] = imgRGB.cols;
            if (outMetadata.imgHeights[i] <= 0 || outMetadata.imgWidths[i] <= 0) {
                _logger.LogError("ANSRTYOLO::PreprocessBatch",
                    "Invalid dimensions for image " + std::to_string(i), __FILE__, __LINE__);
                return {};
            }
            const auto& outputDims = m_trtEngine->getOutputDims();
            const bool isClassification = !outputDims.empty() && outputDims[0].nbDims <= 2;
            const float scaleW = inputW / static_cast(imgRGB.cols);
            const float scaleH = inputH / static_cast(imgRGB.rows);
            // Classification has no letterbox, so its coordinate ratio is 1.
            outMetadata.ratios[i] = isClassification ? 1.f : 1.f / std::min(scaleW, scaleH);
            cv::cuda::GpuMat resized;
            if (imgRGB.rows != inputH || imgRGB.cols != inputW) {
                if (isClassification) {
                    cv::cuda::resize(imgRGB, resized, cv::Size(inputW, inputH),
                        0, 0, cv::INTER_LINEAR, stream);
                }
                else {
                    resized = Engine::resizeKeepAspectRatioPadRightBottom(imgRGB, inputH, inputW);
                }
            }
            else {
                resized = imgRGB;
            }
            batchProcessed.push_back(std::move(resized));
        }
        stream.waitForCompletion();
        std::vector> inputs;
        inputs.push_back(std::move(batchProcessed));
        return inputs;
    }
    catch (const std::exception& e) {
        _logger.LogWarn("ANSRTYOLO::PreprocessBatch", std::string("Skipped batch: ") + e.what(), __FILE__, __LINE__);
        return {};
    }
}

// ====================================================================
// OBB NMS helpers (Prob-IoU based) — static methods
// ====================================================================

// Computes the 2x2 Gaussian covariance components (out1, out2, out3) of an
// oriented box modeled as a uniform distribution: variances w²/12 and h²/12
// rotated by the box angle. Outputs zeros for degenerate (non-positive) sizes.
void ANSRTYOLO::getCovarianceComponents(const OrientedBox& box, float& out1, float& out2, float& out3) {
    if (box.width <= 0.f || box.height <= 0.f) {
        out1 = out2 = out3 = 0.f;
        return;
    }
    const float vw = (box.width * box.width) / 12.0f;
    const float vh = (box.height * box.height) / 12.0f;
    const float cosT = std::cos(box.angle);
    const float sinT = std::sin(box.angle);
    const float cos2 = cosT * cosT;
    const float sin2 = sinT * sinT;
    const float sc = sinT * cosT;
    out1 = vw * cos2 + vh * sin2;
    out2 = vw * sin2 + vh * cos2;
    out3 = (vw - vh) * sc;
}

// Pairwise probabilistic IoU matrix between two sets of oriented boxes,
// via the Bhattacharyya distance between their Gaussian approximations
// (iou = 1 - Hellinger distance). eps guards divisions/logs/sqrt.
std::vector> ANSRTYOLO::batchProbiou(
    const std::vector& obb1, const std::vector& obb2, float eps) {
    if (obb1.empty() || obb2.empty()) return {};
    const size_t n1 = obb1.size(), n2 = obb2.size();
    std::vector> iouMat(n1, std::vector(n2, 0.f));
    struct CovData { float x, y, a, b, c; };
    // Precompute covariances for the first set; the second set's are
    // recomputed per pair.
    std::vector cov1(n1);
    for (size_t i = 0; i < n1; ++i) {
        float a, b, c;
        getCovarianceComponents(obb1[i], a, b, c);
        cov1[i] = { obb1[i].x, obb1[i].y, a, b, c };
    }
    for (size_t i = 0; i < n1; ++i) {
        for (size_t j = 0; j < n2; ++j) {
            float a2, b2, c2;
            getCovarianceComponents(obb2[j], a2, b2, c2);
            float dx = cov1[i].x - obb2[j].x;
            float dy = cov1[i].y - obb2[j].y;
            float sA = cov1[i].a + a2, sB = cov1[i].b + b2, sC = cov1[i].c + c2;
            float denom = sA * sB - sC * sC + eps;
            if (denom <= eps) continue; // degenerate pair: leave IoU at 0
            // t1+t2: Mahalanobis-like term; t3: covariance-shape term.
            float t1 = ((sA*dy*dy + sB*dx*dx) * 0.25f) / denom;
            float t2 = ((sC*dx*dy) * -0.5f) / denom;
            float d1 = cov1[i].a*cov1[i].b - cov1[i].c*cov1[i].c;
            float d2 = a2*b2 - c2*c2;
            float sqrtDet = std::sqrt(std::max(d1, 0.f) * std::max(d2, 0.f) + eps);
            float t3 = 0.5f * std::log((sA*sB - sC*sC) / (4.f*sqrtDet) + eps);
            float bd = std::clamp(t1 + t2 + t3, eps, 100.f);
            float hd = std::sqrt(1.f - std::exp(-bd) + eps);
            iouMat[i][j] = 1.f - hd;
        }
    }
    return iouMat;
}

// Greedy NMS over boxes already sorted by descending score: keep box j unless
// some earlier-kept candidate i < j overlaps it at >= iouThreshold.
// Returns kept indices into sortedBoxes.
std::vector ANSRTYOLO::nmsRotatedImpl(
    const std::vector& sortedBoxes, float iouThreshold) {
    if (sortedBoxes.empty()) return {};
    if (sortedBoxes.size() == 1) return { 0 };
    auto iouMat = batchProbiou(sortedBoxes, sortedBoxes);
    if (iouMat.empty()) return {};
    const int n = static_cast(sortedBoxes.size());
    std::vector keep;
    keep.reserve(n / 2);
    for (int j = 0; j < n; ++j) {
        bool shouldKeep = true;
        for (int i = 0; i < j; ++i) {
            if (iouMat[i][j] >= iouThreshold) {
                shouldKeep = false;
                break;
            }
        }
        if (shouldKeep) keep.push_back(j);
    }
    return keep;
}

// Public rotated-NMS entry point: sorts by descending score, runs
// nmsRotatedImpl, and maps kept indices back to the caller's ordering.
// Returns {} on empty or mismatched inputs.
std::vector ANSRTYOLO::nmsRotated(
    const std::vector& boxes, const std::vector& scores, float iouThreshold) {
    if (boxes.empty() || scores.empty() || boxes.size() != scores.size()) return {};
    std::vector sortedIdx(boxes.size());
    std::iota(sortedIdx.begin(), sortedIdx.end(), 0);
    std::sort(sortedIdx.begin(), sortedIdx.end(),
        [&](int a, int b) { return scores[a] > scores[b]; });
    std::vector sortedBoxes;
    sortedBoxes.reserve(boxes.size());
    for (int i : sortedIdx) sortedBoxes.push_back(boxes[i]);
    auto keepSorted = nmsRotatedImpl(sortedBoxes, iouThreshold);
    std::vector keepOrig;
    keepOrig.reserve(keepSorted.size());
    for (int si : keepSorted) keepOrig.push_back(sortedIdx[si]);
    return keepOrig;
}

// Converts an oriented box (angle in radians) to its 4 corner points via
// cv::RotatedRect (which takes degrees).
std::vector ANSRTYOLO::OBBToPoints(const OrientedBox& obb) {
    float angleDeg = obb.angle * 180.0f / static_cast(CV_PI);
    cv::RotatedRect rr(cv::Point2f(obb.x, obb.y), cv::Size2f(obb.width, obb.height), angleDeg);
    std::vector corners(4);
    rr.points(corners.data());
    return corners;
}

// ====================================================================
// Detection — legacy postprocess
// ====================================================================
// NOTE(review): this function is truncated at the end of this file chunk
// (it continues beyond the visible text); preserved verbatim up to the cut.
std::vector ANSRTYOLO::PostprocessDetection(
    std::vector& featureVector, const std::string& camera_id, const ImageMetadata& meta) {
    try {
        const auto& outputDims = m_trtEngine->getOutputDims();
        auto numChannels = outputDims[0].d[1];
        auto numAnchors = outputDims[0].d[2];
        // Derive numClasses from tensor shape (4 box coords subtracted)
        // rather than _classes.size() which may not match the model
        auto numClasses = static_cast(numChannels - 4);
        if (!_classes.empty() && _classes.size() <= static_cast(numChannels - 4))
            numClasses = _classes.size();
        std::vector bboxes;
        std::vector scores;
        std::vector labels;
        std::vector indices;
        cv::Mat output = cv::Mat(numChannels, numAnchors, CV_32F, featureVector.data());
        output = output.t();
        for (int i = 0; i < numAnchors; i++) {
            auto rowPtr = output.row(i).ptr();
            auto bboxesPtr = rowPtr;
            auto scoresPtr = rowPtr + 4;
            auto maxSPtr = std::max_element(scoresPtr, scoresPtr + numClasses);
            float score = *maxSPtr;
            if (score > _modelConfig.detectionScoreThreshold) {
                float x = *bboxesPtr++;
                float y = *bboxesPtr++;
                float w = *bboxesPtr++;
                float h = *bboxesPtr;
                float x0 = std::clamp((x - 
0.5f * w) * meta.ratio, 0.f, meta.imgWidth); float y0 = std::clamp((y - 0.5f * h) * meta.ratio, 0.f, meta.imgHeight); float x1 = std::clamp((x + 0.5f * w) * meta.ratio, 0.f, meta.imgWidth); float y1 = std::clamp((y + 0.5f * h) * meta.ratio, 0.f, meta.imgHeight); int label = static_cast(maxSPtr - scoresPtr); cv::Rect_ bbox; bbox.x = x0; bbox.y = y0; bbox.width = x1 - x0; bbox.height = y1 - y0; bbox.x = std::max(0.f, bbox.x); bbox.y = std::max(0.f, bbox.y); bbox.width = std::min(meta.imgWidth - bbox.x, bbox.width); bbox.height = std::min(meta.imgHeight - bbox.y, bbox.height); bboxes.push_back(bbox); labels.push_back(label); scores.push_back(score); } } cv::dnn::NMSBoxesBatched(bboxes, scores, labels, PROBABILITY_THRESHOLD, NMS_THRESHOLD, indices); std::vector objects; int classNameSize = static_cast(_classes.size()); for (auto& chosenIdx : indices) { if (scores[chosenIdx] > _modelConfig.detectionScoreThreshold) { Object obj{}; obj.confidence = scores[chosenIdx]; obj.classId = labels[chosenIdx]; obj.box = bboxes[chosenIdx]; //obj.polygon = ANSUtilityHelper::RectToNormalizedPolygon(obj.box, meta.imgWidth, meta.imgHeight); if (!_classes.empty()) { obj.className = (obj.classId < classNameSize) ? 
_classes[obj.classId] : _classes[classNameSize - 1]; } else { obj.className = "Unknown"; } obj.cameraId = camera_id; objects.push_back(obj); } } return objects; } catch (std::exception& e) { _logger.LogFatal("ANSRTYOLO::PostprocessDetection", e.what(), __FILE__, __LINE__); return {}; } } // ==================================================================== // Detection — end2end postprocess // ==================================================================== std::vector ANSRTYOLO::PostprocessDetectionE2E( std::vector& featureVector, const std::string& camera_id, const ImageMetadata& meta) { try { const auto& outputDims = m_trtEngine->getOutputDims(); int numDets = outputDims[0].d[1]; int numFeat = outputDims[0].d[2]; // 6: x1,y1,x2,y2,conf,classId std::vector results; results.reserve(numDets); for (int i = 0; i < numDets; ++i) { const float* det = featureVector.data() + i * numFeat; float conf = det[4]; if (conf <= _modelConfig.detectionScoreThreshold) continue; int classId = static_cast(det[5]); // Scale from model input space to original image float x0 = std::clamp(det[0] * meta.ratio, 0.f, meta.imgWidth); float y0 = std::clamp(det[1] * meta.ratio, 0.f, meta.imgHeight); float x1 = std::clamp(det[2] * meta.ratio, 0.f, meta.imgWidth); float y1 = std::clamp(det[3] * meta.ratio, 0.f, meta.imgHeight); float w = x1 - x0, h = y1 - y0; if (w < 1.f || h < 1.f) continue; Object obj; obj.classId = classId; obj.confidence = conf; obj.box = cv::Rect(static_cast(x0), static_cast(y0), static_cast(w), static_cast(h)); //obj.polygon = ANSUtilityHelper::RectToNormalizedPolygon(obj.box, meta.imgWidth, meta.imgHeight); int classNameSize = static_cast(_classes.size()); if (!_classes.empty() && classId >= 0 && classId < classNameSize) obj.className = _classes[classId]; obj.cameraId = camera_id; results.push_back(std::move(obj)); } return results; } catch (std::exception& e) { _logger.LogFatal("ANSRTYOLO::PostprocessDetectionE2E", e.what(), __FILE__, __LINE__); return {}; } } // 
==================================================================== // OBB — legacy postprocess // ==================================================================== std::vector ANSRTYOLO::PostprocessOBB( std::vector& featureVector, const std::string& camera_id, const ImageMetadata& meta) { try { const auto& outputDims = m_trtEngine->getOutputDims(); int numChannels = outputDims[0].d[1]; int numAnchors = outputDims[0].d[2]; int numClasses = numChannels - 5; // 4 box + nc scores + 1 angle if (numClasses <= 0) return {}; cv::Mat output = cv::Mat(numChannels, numAnchors, CV_32F, featureVector.data()).t(); struct OBBCandidate { OrientedBox box; float conf; int classId; }; std::vector candidates; candidates.reserve(numAnchors); for (int i = 0; i < numAnchors; ++i) { const float* row = output.ptr(i); const float* scoresPtr = row + 4; float maxScore = -FLT_MAX; int bestClass = -1; for (int c = 0; c < numClasses; ++c) { if (scoresPtr[c] > maxScore) { maxScore = scoresPtr[c]; bestClass = c; } } if (maxScore <= _modelConfig.detectionScoreThreshold) continue; float angle = row[4 + numClasses]; float cx = row[0] * meta.ratio; float cy = row[1] * meta.ratio; float bw = row[2] * meta.ratio; float bh = row[3] * meta.ratio; cx = std::clamp(cx, 0.f, meta.imgWidth); cy = std::clamp(cy, 0.f, meta.imgHeight); candidates.push_back({ { cx, cy, bw, bh, angle }, maxScore, bestClass }); } if (candidates.empty()) return {}; // Prob-IoU NMS std::vector boxes; std::vector scores; boxes.reserve(candidates.size()); scores.reserve(candidates.size()); for (const auto& c : candidates) { boxes.push_back(c.box); scores.push_back(c.conf); } auto keepIdx = nmsRotated(boxes, scores, NMS_THRESHOLD); std::vector results; int classNameSize = static_cast(_classes.size()); results.reserve(std::min(static_cast(keepIdx.size()), TOP_K)); for (int idx : keepIdx) { if (static_cast(results.size()) >= TOP_K) break; const auto& c = candidates[idx]; Object obj; obj.classId = c.classId; obj.confidence = c.conf; 
obj.kps = { c.box.x, c.box.y, c.box.width, c.box.height, c.box.angle }; auto absCorners = OBBToPoints(c.box); obj.box = cv::boundingRect(absCorners); // Normalize OBB corners to [0,1] and close the polygon obj.polygon.reserve(absCorners.size() + 1); for (const auto& pt : absCorners) { obj.polygon.emplace_back( std::clamp(pt.x / meta.imgWidth, 0.f, 1.f), std::clamp(pt.y / meta.imgHeight, 0.f, 1.f)); } if (!obj.polygon.empty()) obj.polygon.push_back(obj.polygon.front()); // close if (!_classes.empty() && c.classId >= 0 && c.classId < classNameSize) obj.className = _classes[c.classId]; obj.cameraId = camera_id; results.push_back(std::move(obj)); } return results; } catch (std::exception& e) { _logger.LogFatal("ANSRTYOLO::PostprocessOBB", e.what(), __FILE__, __LINE__); return {}; } } // ==================================================================== // OBB — end2end postprocess // ==================================================================== std::vector ANSRTYOLO::PostprocessOBBE2E( std::vector& featureVector, const std::string& camera_id, const ImageMetadata& meta) { try { const auto& outputDims = m_trtEngine->getOutputDims(); int numDets = outputDims[0].d[1]; int numFeat = outputDims[0].d[2]; // 7: cx,cy,w,h,angle,conf,classId std::vector results; results.reserve(numDets); for (int i = 0; i < numDets; ++i) { const float* det = featureVector.data() + i * numFeat; float angle = det[4]; float conf = det[5]; if (conf <= _modelConfig.detectionScoreThreshold) continue; float cx = det[0] * meta.ratio; float cy = det[1] * meta.ratio; float bw = det[2] * meta.ratio; float bh = det[3] * meta.ratio; int classId = static_cast(det[6]); cx = std::clamp(cx, 0.f, meta.imgWidth); cy = std::clamp(cy, 0.f, meta.imgHeight); OrientedBox obb{ cx, cy, bw, bh, angle }; Object obj; obj.classId = classId; obj.confidence = conf; obj.kps = { cx, cy, bw, bh, angle }; auto absCorners = OBBToPoints(obb); obj.box = cv::boundingRect(absCorners); // Normalize OBB corners to [0,1] and 
close the polygon obj.polygon.reserve(absCorners.size() + 1); for (const auto& pt : absCorners) { obj.polygon.emplace_back( std::clamp(pt.x / meta.imgWidth, 0.f, 1.f), std::clamp(pt.y / meta.imgHeight, 0.f, 1.f)); } if (!obj.polygon.empty()) obj.polygon.push_back(obj.polygon.front()); // close int classNameSize = static_cast(_classes.size()); if (!_classes.empty() && classId >= 0 && classId < classNameSize) obj.className = _classes[classId]; obj.cameraId = camera_id; results.push_back(std::move(obj)); } return results; } catch (std::exception& e) { _logger.LogFatal("ANSRTYOLO::PostprocessOBBE2E", e.what(), __FILE__, __LINE__); return {}; } } // ==================================================================== // Segmentation — legacy postprocess // ==================================================================== std::vector ANSRTYOLO::PostprocessSegmentation( std::vector>& featureVectors, const std::string& camera_id, const ImageMetadata& meta) { try { const auto& outputDims = m_trtEngine->getOutputDims(); int numChannels = outputDims[0].d[1]; int numAnchors = outputDims[0].d[2]; const auto numClasses = numChannels - SEG_CHANNELS - 4; if (featureVectors[0].size() != static_cast(numChannels) * numAnchors) return {}; if (featureVectors[1].size() != static_cast(SEG_CHANNELS) * SEG_H * SEG_W) return {}; cv::Mat output = cv::Mat(numChannels, numAnchors, CV_32F, featureVectors[0].data()).t(); cv::Mat protos = cv::Mat(SEG_CHANNELS, SEG_H * SEG_W, CV_32F, featureVectors[1].data()); std::vector labels; std::vector scores; std::vector bboxes; std::vector maskConfs; std::vector indices; for (int i = 0; i < numAnchors; i++) { auto rowPtr = output.row(i).ptr(); auto bboxesPtr = rowPtr; auto scoresPtr = rowPtr + 4; auto maskConfsPtr = rowPtr + 4 + numClasses; auto maxSPtr = std::max_element(scoresPtr, scoresPtr + numClasses); float score = *maxSPtr; if (score > _modelConfig.detectionScoreThreshold) { float x = *bboxesPtr++; float y = *bboxesPtr++; float w = *bboxesPtr++; 
float h = *bboxesPtr; float x0 = std::clamp((x - 0.5f * w) * meta.ratio, 0.f, meta.imgWidth); float y0 = std::clamp((y - 0.5f * h) * meta.ratio, 0.f, meta.imgHeight); float x1 = std::clamp((x + 0.5f * w) * meta.ratio, 0.f, meta.imgWidth); float y1 = std::clamp((y + 0.5f * h) * meta.ratio, 0.f, meta.imgHeight); int label = static_cast(maxSPtr - scoresPtr); cv::Rect_ bbox; bbox.x = x0; bbox.y = y0; bbox.width = x1 - x0; bbox.height = y1 - y0; bbox.x = std::max(0.f, bbox.x); bbox.y = std::max(0.f, bbox.y); bbox.width = std::min(meta.imgWidth - bbox.x, bbox.width); bbox.height = std::min(meta.imgHeight - bbox.y, bbox.height); cv::Mat maskConf = cv::Mat(1, SEG_CHANNELS, CV_32F, maskConfsPtr); bboxes.push_back(bbox); labels.push_back(label); scores.push_back(score); maskConfs.push_back(maskConf); } } cv::dnn::NMSBoxesBatched(bboxes, scores, labels, PROBABILITY_THRESHOLD, NMS_THRESHOLD, indices); cv::Mat masks; int classNameSize = static_cast(_classes.size()); std::vector objs; for (auto& i : indices) { if (scores[i] > _modelConfig.detectionScoreThreshold) { Object obj; obj.classId = labels[i]; if (!_classes.empty()) { obj.className = (obj.classId < classNameSize) ? 
_classes[obj.classId] : _classes[classNameSize - 1]; } else { obj.className = "Unknown"; } obj.box = bboxes[i]; obj.confidence = scores[i]; obj.cameraId = camera_id; masks.push_back(maskConfs[i]); objs.push_back(obj); } } if (!masks.empty()) { cv::Mat matmulRes = (masks * protos).t(); // Apply sigmoid while still a single-channel 2D matrix cv::Mat negMat; cv::exp(-matmulRes, negMat); cv::Mat sigmoidFlat = 1.0 / (1.0 + negMat); // Now reshape into multi-channel and split cv::Mat sigmoidMat = sigmoidFlat.reshape(static_cast(indices.size()), { SEG_H, SEG_W }); std::vector maskChannels; cv::split(sigmoidMat, maskChannels); // ROI in proto space (SEG_H x SEG_W), accounting for top-left letterbox padding // ANSRTYOLO pads right-bottom, so content starts at (0,0) in proto space cv::Rect roi; if (meta.imgHeight > meta.imgWidth) { int roiW = std::min(static_cast(std::round( static_cast(SEG_W) * meta.imgWidth / meta.imgHeight)), SEG_W); roi = cv::Rect(0, 0, roiW, SEG_H); } else { int roiH = std::min(static_cast(std::round( static_cast(SEG_H) * meta.imgHeight / meta.imgWidth)), SEG_H); roi = cv::Rect(0, 0, SEG_W, roiH); } roi &= cv::Rect(0, 0, SEG_W, SEG_H); int imgW = static_cast(meta.imgWidth); int imgH = static_cast(meta.imgHeight); // Precompute scale factors from proto-ROI to original image const float scaleX = static_cast(imgW) / roi.width; const float scaleY = static_cast(imgH) / roi.height; for (size_t i = 0; i < objs.size(); i++) { cv::Rect safeBox = objs[i].box & cv::Rect(0, 0, imgW, imgH); if (safeBox.area() <= 0) continue; // Map bounding box back to proto-ROI space and crop there int px0 = std::max(static_cast(std::floor(safeBox.x / scaleX)), 0); int py0 = std::max(static_cast(std::floor(safeBox.y / scaleY)), 0); int px1 = std::min(static_cast(std::ceil((safeBox.x + safeBox.width) / scaleX)), roi.width); int py1 = std::min(static_cast(std::ceil((safeBox.y + safeBox.height) / scaleY)), roi.height); if (px1 <= px0 || py1 <= py0) continue; cv::Rect protoBox(roi.x + 
px0, roi.y + py0, px1 - px0, py1 - py0); protoBox &= cv::Rect(0, 0, SEG_W, SEG_H); if (protoBox.area() <= 0) continue; // Resize only the small proto crop to the bounding box size cv::Mat cropped = maskChannels[i](protoBox); cv::Mat resized; cv::resize(cropped, resized, cv::Size(safeBox.width, safeBox.height), 0, 0, cv::INTER_LINEAR); objs[i].mask = resized > _modelConfig.modelConfThreshold; objs[i].polygon = ANSUtilityHelper::MaskToNormalizedPolygon( objs[i].mask, safeBox, meta.imgWidth, meta.imgHeight); } } // Fill polygon for objects that got masks for (auto& obj : objs) { if (obj.polygon.empty()) obj.polygon = ANSUtilityHelper::RectToNormalizedPolygon(obj.box, meta.imgWidth, meta.imgHeight); } return objs; } catch (std::exception& e) { _logger.LogFatal("ANSRTYOLO::PostprocessSegmentation", e.what(), __FILE__, __LINE__); return {}; } } // ==================================================================== // Segmentation — end2end postprocess // ==================================================================== std::vector ANSRTYOLO::PostprocessSegE2E( std::vector>& featureVectors, const std::string& camera_id, const ImageMetadata& meta) { try { if (featureVectors.size() < 2) return {}; const auto& outputDims = m_trtEngine->getOutputDims(); int numDets = outputDims[0].d[1]; int numFeat = outputDims[0].d[2]; // 6 + nm // Proto dimensions from second output int nm = outputDims[1].d[1]; int protoH = outputDims[1].d[2]; int protoW = (outputDims[1].nbDims > 3) ? 
outputDims[1].d[3] : outputDims[1].d[2]; if (numFeat < 6 + nm) return {}; const float* raw = featureVectors[0].data(); std::vector objs; cv::Mat maskCoeffs; for (int i = 0; i < numDets; ++i) { const float* det = raw + i * numFeat; float conf = det[4]; if (conf <= _modelConfig.detectionScoreThreshold) continue; int classId = static_cast(det[5]); float x0 = std::clamp(det[0] * meta.ratio, 0.f, meta.imgWidth); float y0 = std::clamp(det[1] * meta.ratio, 0.f, meta.imgHeight); float x1 = std::clamp(det[2] * meta.ratio, 0.f, meta.imgWidth); float y1 = std::clamp(det[3] * meta.ratio, 0.f, meta.imgHeight); float w = x1 - x0, h = y1 - y0; if (w < 1.f || h < 1.f) continue; Object obj; obj.classId = classId; obj.confidence = conf; obj.box = cv::Rect(static_cast(x0), static_cast(y0), static_cast(w), static_cast(h)); int classNameSize = static_cast(_classes.size()); if (!_classes.empty() && classId >= 0 && classId < classNameSize) obj.className = _classes[classId]; obj.cameraId = camera_id; objs.push_back(std::move(obj)); cv::Mat mc(1, nm, CV_32F); std::memcpy(mc.ptr(), det + 6, nm * sizeof(float)); maskCoeffs.push_back(mc); } if (!objs.empty() && !maskCoeffs.empty()) { cv::Mat protos(nm, protoH * protoW, CV_32F, featureVectors[1].data()); cv::Mat matmulRes = (maskCoeffs * protos).t(); // Apply sigmoid while still a single-channel 2D matrix cv::Mat negMat; cv::exp(-matmulRes, negMat); cv::Mat sigmoidFlat = 1.0 / (1.0 + negMat); // Now reshape into multi-channel and split cv::Mat sigmoidMat = sigmoidFlat.reshape(static_cast(objs.size()), { protoH, protoW }); std::vector maskChannels; cv::split(sigmoidMat, maskChannels); // ROI in proto space, accounting for top-left letterbox padding // ANSRTYOLO pads right-bottom, so content starts at (0,0) in proto space cv::Rect roi; if (meta.imgHeight > meta.imgWidth) { int roiW = std::min(static_cast(std::round( static_cast(protoW) * meta.imgWidth / meta.imgHeight)), protoW); roi = cv::Rect(0, 0, roiW, protoH); } else { int roiH = 
std::min(static_cast(std::round( static_cast(protoH) * meta.imgHeight / meta.imgWidth)), protoH); roi = cv::Rect(0, 0, protoW, roiH); } roi &= cv::Rect(0, 0, protoW, protoH); int imgW = static_cast(meta.imgWidth); int imgH = static_cast(meta.imgHeight); const float scaleX = static_cast(imgW) / roi.width; const float scaleY = static_cast(imgH) / roi.height; for (size_t i = 0; i < objs.size(); ++i) { cv::Rect safebox = objs[i].box & cv::Rect(0, 0, imgW, imgH); if (safebox.area() <= 0) continue; int px0 = std::max(static_cast(std::floor(safebox.x / scaleX)), 0); int py0 = std::max(static_cast(std::floor(safebox.y / scaleY)), 0); int px1 = std::min(static_cast(std::ceil((safebox.x + safebox.width) / scaleX)), roi.width); int py1 = std::min(static_cast(std::ceil((safebox.y + safebox.height) / scaleY)), roi.height); if (px1 <= px0 || py1 <= py0) continue; cv::Rect protoBox(roi.x + px0, roi.y + py0, px1 - px0, py1 - py0); protoBox &= cv::Rect(0, 0, protoW, protoH); if (protoBox.area() <= 0) continue; cv::Mat cropped = maskChannels[i](protoBox); cv::Mat resized; cv::resize(cropped, resized, cv::Size(safebox.width, safebox.height), 0, 0, cv::INTER_LINEAR); objs[i].mask = resized > SEGMENTATION_THRESHOLD; objs[i].polygon = ANSUtilityHelper::MaskToNormalizedPolygon( objs[i].mask, safebox, meta.imgWidth, meta.imgHeight); } } for (auto& obj : objs) { if (obj.polygon.empty()) obj.polygon = ANSUtilityHelper::RectToNormalizedPolygon(obj.box, meta.imgWidth, meta.imgHeight); } return objs; } catch (std::exception& e) { _logger.LogFatal("ANSRTYOLO::PostprocessSegE2E", e.what(), __FILE__, __LINE__); return {}; } } // ==================================================================== // Pose — legacy postprocess // ==================================================================== std::vector ANSRTYOLO::PostprocessPose( std::vector& featureVector, const std::string& camera_id, const ImageMetadata& meta) { try { const auto& outputDims = m_trtEngine->getOutputDims(); auto numChannels 
= outputDims[0].d[1]; auto numAnchors = outputDims[0].d[2]; std::vector bboxes; std::vector scores; std::vector labels; std::vector indices; std::vector> kpss; cv::Mat output = cv::Mat(numChannels, numAnchors, CV_32F, featureVector.data()).t(); for (int i = 0; i < numAnchors; i++) { auto rowPtr = output.row(i).ptr(); auto bboxesPtr = rowPtr; auto scoresPtr = rowPtr + 4; auto kps_ptr = rowPtr + 5; float score = *scoresPtr; if (score > _modelConfig.detectionScoreThreshold) { float x = *bboxesPtr++; float y = *bboxesPtr++; float w = *bboxesPtr++; float h = *bboxesPtr; float x0 = std::clamp((x - 0.5f * w) * meta.ratio, 0.f, meta.imgWidth); float y0 = std::clamp((y - 0.5f * h) * meta.ratio, 0.f, meta.imgHeight); float x1 = std::clamp((x + 0.5f * w) * meta.ratio, 0.f, meta.imgWidth); float y1 = std::clamp((y + 0.5f * h) * meta.ratio, 0.f, meta.imgHeight); cv::Rect_ bbox; bbox.x = x0; bbox.y = y0; bbox.width = x1 - x0; bbox.height = y1 - y0; bbox.x = std::max(0.f, bbox.x); bbox.y = std::max(0.f, bbox.y); bbox.width = std::min(meta.imgWidth - bbox.x, bbox.width); bbox.height = std::min(meta.imgHeight - bbox.y, bbox.height); std::vector kps; for (int k = 0; k < NUM_KPS; k++) { float kpsX = std::clamp(*(kps_ptr + 3 * k) * meta.ratio, 0.f, meta.imgWidth); float kpsY = std::clamp(*(kps_ptr + 3 * k + 1) * meta.ratio, 0.f, meta.imgHeight); float kpsS = *(kps_ptr + 3 * k + 2); kps.push_back(kpsX); kps.push_back(kpsY); kps.push_back(kpsS); } bboxes.push_back(bbox); labels.push_back(0); scores.push_back(score); kpss.push_back(kps); } } cv::dnn::NMSBoxesBatched(bboxes, scores, labels, PROBABILITY_THRESHOLD, NMS_THRESHOLD, indices); std::vector objects; int classNameSize = static_cast(_classes.size()); for (auto& chosenIdx : indices) { if (scores[chosenIdx] > _modelConfig.detectionScoreThreshold) { Object obj{}; obj.confidence = scores[chosenIdx]; obj.classId = labels[chosenIdx]; if (!_classes.empty()) { obj.className = (obj.classId < classNameSize) ? 
_classes[obj.classId] : _classes[classNameSize - 1]; } else { obj.className = "Unknown"; } obj.box = bboxes[chosenIdx]; //obj.polygon = ANSUtilityHelper::RectToNormalizedPolygon(obj.box, meta.imgWidth, meta.imgHeight); obj.kps = kpss[chosenIdx]; obj.cameraId = camera_id; objects.push_back(obj); } } return objects; } catch (std::exception& e) { _logger.LogFatal("ANSRTYOLO::PostprocessPose", e.what(), __FILE__, __LINE__); return {}; } } // ==================================================================== // Pose — end2end postprocess // ==================================================================== std::vector ANSRTYOLO::PostprocessPoseE2E( std::vector& featureVector, const std::string& camera_id, const ImageMetadata& meta) { try { const auto& outputDims = m_trtEngine->getOutputDims(); int numDets = outputDims[0].d[1]; int numFeat = outputDims[0].d[2]; // 6 + nk*3 int nk = (numFeat - 6) / 3; std::vector results; results.reserve(numDets); for (int i = 0; i < numDets; ++i) { const float* det = featureVector.data() + i * numFeat; float conf = det[4]; if (conf <= _modelConfig.detectionScoreThreshold) continue; int classId = static_cast(det[5]); float x0 = std::clamp(det[0] * meta.ratio, 0.f, meta.imgWidth); float y0 = std::clamp(det[1] * meta.ratio, 0.f, meta.imgHeight); float x1 = std::clamp(det[2] * meta.ratio, 0.f, meta.imgWidth); float y1 = std::clamp(det[3] * meta.ratio, 0.f, meta.imgHeight); float w = x1 - x0, h = y1 - y0; if (w < 1.f || h < 1.f) continue; const float* kpsPtr = det + 6; std::vector kps; kps.reserve(nk * 3); for (int k = 0; k < nk; ++k) { float kx = std::clamp(kpsPtr[3*k] * meta.ratio, 0.f, meta.imgWidth); float ky = std::clamp(kpsPtr[3*k+1] * meta.ratio, 0.f, meta.imgHeight); float ks = kpsPtr[3*k+2]; kps.push_back(kx); kps.push_back(ky); kps.push_back(ks); } Object obj; obj.classId = classId; obj.confidence = conf; obj.box = cv::Rect(static_cast(x0), static_cast(y0), static_cast(w), static_cast(h)); obj.kps = std::move(kps); int 
classNameSize = static_cast(_classes.size()); if (!_classes.empty() && classId >= 0 && classId < classNameSize) obj.className = _classes[classId]; obj.cameraId = camera_id; results.push_back(std::move(obj)); } return results; } catch (std::exception& e) { _logger.LogFatal("ANSRTYOLO::PostprocessPoseE2E", e.what(), __FILE__, __LINE__); return {}; } } // ==================================================================== // Classification postprocess // ==================================================================== std::vector ANSRTYOLO::PostprocessClassify( std::vector& featureVector, const std::string& camera_id, const ImageMetadata& meta) { try { const int nc = static_cast(featureVector.size()); if (nc == 0) return {}; // Check if output is already a probability distribution (sums to ~1.0). // Some models include a Softmax layer; applying softmax again would // flatten the distribution and cause wrong classifications. float rawSum = 0.f; bool allNonNeg = true; for (int i = 0; i < nc; ++i) { rawSum += featureVector[i]; if (featureVector[i] < 0.f) allNonNeg = false; } const bool alreadyNormalized = (allNonNeg && rawSum > 0.9f && rawSum < 1.1f); std::vector probs(nc); if (alreadyNormalized) { for (int i = 0; i < nc; ++i) probs[i] = featureVector[i]; } else { float maxVal = *std::max_element(featureVector.begin(), featureVector.end()); float sumExp = 0.f; for (int i = 0; i < nc; ++i) { probs[i] = std::exp(featureVector[i] - maxVal); sumExp += probs[i]; } for (int i = 0; i < nc; ++i) probs[i] /= sumExp; } int bestClass = 0; float bestProb = 0.f; for (int i = 0; i < nc; ++i) { if (probs[i] > bestProb) { bestProb = probs[i]; bestClass = i; } } const int imgW = static_cast(meta.imgWidth); const int imgH = static_cast(meta.imgHeight); Object obj; if (imgW > 20 && imgH > 20) { obj.box = cv::Rect(10, 10, imgW - 20, imgH - 20); } else { obj.box = cv::Rect(0, 0, imgW, imgH); } //obj.polygon = ANSUtilityHelper::RectToNormalizedPolygon(obj.box, meta.imgWidth, 
meta.imgHeight); obj.classId = bestClass; obj.confidence = bestProb; obj.cameraId = camera_id; int classNameSize = static_cast(_classes.size()); if (!_classes.empty() && bestClass >= 0 && bestClass < classNameSize) obj.className = _classes[bestClass]; return { std::move(obj) }; } catch (std::exception& e) { _logger.LogFatal("ANSRTYOLO::PostprocessClassify", e.what(), __FILE__, __LINE__); return {}; } } // ==================================================================== // DetectObjects — single image with auto-detection of task type // ==================================================================== std::vector ANSRTYOLO::DetectObjects(const cv::Mat& inputImage, const std::string& camera_id) { try { // --- Debug timer helper (zero-cost when _debugFlag == false) --- using Clock = std::chrono::steady_clock; const bool dbg = _debugFlag; auto t0 = dbg ? Clock::now() : Clock::time_point{}; auto tPrev = t0; auto elapsed = [&]() -> double { auto now = Clock::now(); double ms = std::chrono::duration(now - tPrev).count(); tPrev = now; return ms; }; // --- 1. Set GPU device context --- if (m_trtEngine) { m_trtEngine->setDeviceContext(); } double msSetDevice = dbg ? elapsed() : 0; // --- 1b. CUDA context health check --- if (!m_nv12Helper.isCudaContextHealthy(_logger, "ANSRTYOLO")) { return {}; } // --- 2. Preprocess under lock --- ANS_DBG("YOLO", "Preprocess START %dx%d", inputImage.cols, inputImage.rows); ImageMetadata meta; std::vector> input; bool usedNV12 = false; float bgrFullResScaleX = 1.0f, bgrFullResScaleY = 1.0f; { std::lock_guard lock(_mutex); const int inferenceGpu = m_trtEngine ? 
m_trtEngine->getPreferredDeviceIndex() : 0; const auto& inputDims = m_trtEngine->getInputDims(); const int inputW = inputDims[0].d[2]; const int inputH = inputDims[0].d[1]; auto nv12 = m_nv12Helper.tryNV12(inputImage, inferenceGpu, inputW, inputH, NV12PreprocessHelper::defaultYOLOLauncher(), _logger, "ANSRTYOLO"); if (nv12.succeeded) { meta.imgWidth = nv12.metaWidth; meta.imgHeight = nv12.metaHeight; meta.ratio = nv12.ratio; input = {{ std::move(nv12.gpuRGB) }}; usedNV12 = true; } else if (nv12.useBgrFullRes) { input = Preprocess(nv12.bgrFullResImg, meta); usedNV12 = !input.empty(); bgrFullResScaleX = nv12.bgrFullResScaleX; bgrFullResScaleY = nv12.bgrFullResScaleY; } if (input.empty()) { input = Preprocess(inputImage, meta); } m_nv12Helper.tickInference(); } double msPreprocess = dbg ? elapsed() : 0; if (input.empty()) { _logger.LogWarn("ANSRTYOLO::DetectObjects", "Skipped: preprocessing returned empty input", __FILE__, __LINE__); return {}; } // --- 3. TRT Inference (mutex released for concurrent GPU slots) --- ANS_DBG("YOLO", "TRT inference START nv12=%d inputSize=%dx%d", (int)usedNV12, input.empty() ? 0 : (input[0].empty() ? 0 : input[0][0].cols), input.empty() ? 0 : (input[0].empty() ? 0 : input[0][0].rows)); auto _trtStart = std::chrono::steady_clock::now(); std::vector>> featureVectors; if (!m_trtEngine->runInference(input, featureVectors)) { ANS_DBG("YOLO", "ERROR: TRT runInference FAILED"); _logger.LogError("ANSRTYOLO::DetectObjects", "Error running inference", __FILE__, __LINE__); return {}; } auto _trtEnd = std::chrono::steady_clock::now(); double _trtMs = std::chrono::duration(_trtEnd - _trtStart).count(); if (_trtMs > 500.0) { ANS_DBG("YOLO", "SLOW TRT inference: %.1fms", _trtMs); } double msInference = dbg ? elapsed() : 0; // --- 4. 
Transform output --- std::vector results; bool isClassification = false; { std::lock_guard lock(_mutex); const auto& outputDims = m_trtEngine->getOutputDims(); const size_t numOutputs = outputDims.size(); if (numOutputs >= 2) { std::vector> featureVector2d; Engine::transformOutput(featureVectors, featureVector2d); double msTransform = dbg ? elapsed() : 0; int dim1 = outputDims[0].d[1]; int dim2 = outputDims[0].d[2]; if (dim1 > dim2 || dim2 <= 20) results = PostprocessSegE2E(featureVector2d, camera_id, meta); else results = PostprocessSegmentation(featureVector2d, camera_id, meta); if (dbg) { double msPostprocess = elapsed(); _logger.LogInfo("ANSRTYOLO::DetectObjects", "[DEBUG] Seg | " + std::string(usedNV12 ? "NV12" : "BGR") + " | SetDev=" + std::to_string(msSetDevice) + "ms Preproc=" + std::to_string(msPreprocess) + "ms Inf=" + std::to_string(msInference) + "ms Transform=" + std::to_string(msTransform) + "ms Postproc=" + std::to_string(msPostprocess) + "ms Det=" + std::to_string(results.size()), __FILE__, __LINE__); } } else { std::vector featureVector; Engine::transformOutput(featureVectors, featureVector); double msTransform = dbg ? 
elapsed() : 0; if (outputDims[0].nbDims <= 2) { results = PostprocessClassify(featureVector, camera_id, meta); isClassification = true; } else { int dim1 = outputDims[0].d[1]; int dim2 = outputDims[0].d[2]; int nc = static_cast(_classes.size()); const bool isEndToEnd = (dim1 > dim2) || (dim2 <= 20); if (isEndToEnd) { if (dim2 == 6) results = PostprocessDetectionE2E(featureVector, camera_id, meta); else if (dim2 == 7) results = PostprocessOBBE2E(featureVector, camera_id, meta); else if (dim2 > 7 && (dim2 - 6) % 3 == 0) results = PostprocessPoseE2E(featureVector, camera_id, meta); else results = PostprocessDetectionE2E(featureVector, camera_id, meta); } else { int extra = dim1 - 4; bool routed = false; if (nc > 0 && nc <= extra) { if (extra == nc) { results = PostprocessDetection(featureVector, camera_id, meta); routed = true; } else if (extra == nc + 1) { results = PostprocessOBB(featureVector, camera_id, meta); routed = true; } else if ((extra - nc) % 3 == 0 && (extra - nc) >= 3) { results = PostprocessPose(featureVector, camera_id, meta); routed = true; } } if (!routed) { if (extra >= 2) { cv::Mat probe = cv::Mat(dim1, dim2, CV_32F, featureVector.data()).t(); int lastCol = dim1 - 1; int numSamples = std::min(dim2, 100); int angleCount = 0; for (int s = 0; s < numSamples; ++s) { float v = probe.at(s, lastCol); if (v >= -3.15f && v <= 3.15f) ++angleCount; } if (angleCount > numSamples * 8 / 10) { results = PostprocessOBB(featureVector, camera_id, meta); routed = true; } } if (!routed && dim1 == 56) results = PostprocessPose(featureVector, camera_id, meta); else if (!routed) results = PostprocessDetection(featureVector, camera_id, meta); } } } if (dbg) { double msPostprocess = elapsed(); _logger.LogInfo("ANSRTYOLO::DetectObjects", "[DEBUG] " + camera_id + " | " + std::string(usedNV12 ? 
"NV12" : "BGR") + " | SetDev=" + std::to_string(msSetDevice) + "ms Preproc=" + std::to_string(msPreprocess) + "ms Inf=" + std::to_string(msInference) + "ms Transform=" + std::to_string(msTransform) + "ms Postproc=" + std::to_string(msPostprocess) + "ms Det=" + std::to_string(results.size()) + (isClassification ? " [classify]" : " [detect]"), __FILE__, __LINE__); } } } // --- 4b. Rescale coords from full-res to display-res (BGR full-res path) --- // When ANSVideoPlayer provides full-res BGR via the registry, Preprocess // and Postprocess operate in full-res coordinates. But the caller passed // a display-res inputImage and expects coords in that space. Remap here. if (bgrFullResScaleX != 1.0f || bgrFullResScaleY != 1.0f) { for (auto& obj : results) { obj.box.x = static_cast(obj.box.x * bgrFullResScaleX); obj.box.y = static_cast(obj.box.y * bgrFullResScaleY); obj.box.width = static_cast(obj.box.width * bgrFullResScaleX); obj.box.height = static_cast(obj.box.height * bgrFullResScaleY); // Rescale polygon points if present (segmentation / OBB) for (auto& pt : obj.polygon) { pt.x *= bgrFullResScaleX; pt.y *= bgrFullResScaleY; } // Rescale keypoints if present (pose: x,y,conf triplets) for (size_t k = 0; k + 2 < obj.kps.size(); k += 3) { obj.kps[k] *= bgrFullResScaleX; obj.kps[k + 1] *= bgrFullResScaleY; } } } // --- 5. Tracking + Stabilization --- if (_trackerEnabled && !isClassification) { results = ApplyTracking(results, camera_id); double msTracking = dbg ? elapsed() : 0; if (_stabilizationEnabled) { results = StabilizeDetections(results, camera_id); } double msStabilize = dbg ? elapsed() : 0; if (dbg) { _logger.LogInfo("ANSRTYOLO::DetectObjects", "[DEBUG] " + camera_id + " | Tracking=" + std::to_string(msTracking) + "ms Stabilize=" + std::to_string(msStabilize) + "ms", __FILE__, __LINE__); } } // --- 6. 
// Total pipeline time ---
    if (dbg) {
        double msTotal = std::chrono::duration(Clock::now() - t0).count();
        _logger.LogInfo("ANSRTYOLO::DetectObjects",
            "[DEBUG] " + camera_id + " | TOTAL=" + std::to_string(msTotal) + "ms (" +
            std::to_string(inputImage.cols) + "x" + std::to_string(inputImage.rows) +
            ") Results=" + std::to_string(results.size()), __FILE__, __LINE__);
    }
    return results;
    }
    catch (std::exception& e) {
        // Any exception in the detection pipeline is logged as fatal and the
        // frame is dropped (empty result) rather than propagated to the caller.
        _logger.LogFatal("ANSRTYOLO::DetectObjects", e.what(), __FILE__, __LINE__);
        return {};
    }
}

// ====================================================================
// DetectObjectsBatch — batch inference with auto-detection
// ====================================================================
/// Runs one TensorRT inference pass over a batch of images and returns one
/// detection list per input image, in the same order as `inputImages`.
///
/// Pipeline: (1) recursive auto-split when the batch exceeds engine capacity,
/// (2) pad the batch, (3) PreprocessBatch, (4) single batched runInference,
/// (5) route outputs to the right Postprocess* by inspecting output tensor
/// shapes, (6) per-image postprocessing fanned out via std::async,
/// (7) optional tracking/stabilization per frame.
///
/// @param inputImages  Images to run through the model (BGR cv::Mat assumed,
///                     matching DetectObjects — TODO confirm).
/// @param camera_id    Camera identifier; forwarded to postprocessing and to
///                     the tracker.
/// @return One detection vector per input image; empty vector on any failure
///         (empty input, preprocessing failure, inference failure, batch-size
///         mismatch).
///
/// NOTE(review): unlike DetectObjects, this method does not take _mutex;
/// concurrent callers would race on m_trtEngine/metadata — confirm callers
/// serialize batch inference externally.
std::vector> ANSRTYOLO::DetectObjectsBatch(
    const std::vector& inputImages,
    const std::string& camera_id) {
    if (inputImages.empty()) {
        _logger.LogError("ANSRTYOLO::DetectObjectsBatch", "Empty input images vector", __FILE__, __LINE__);
        return {};
    }
    // Auto-split if batch exceeds engine capacity
    // (maxBatch is clamped to >= 1 here, so the `maxBatch > 0` test below is redundant.)
    const int maxBatch = m_options.maxBatchSize > 0 ? m_options.maxBatchSize : 1;
    if (static_cast(inputImages.size()) > maxBatch && maxBatch > 0) {
        const size_t numImages = inputImages.size();
        std::vector> allResults;
        allResults.reserve(numImages);
        // Process the oversized batch in engine-sized chunks via recursion;
        // each recursive call takes the non-splitting path below.
        for (size_t start = 0; start < numImages; start += static_cast(maxBatch)) {
            const size_t end = std::min(start + static_cast(maxBatch), numImages);
            std::vector chunk(inputImages.begin() + start, inputImages.begin() + end);
            auto chunkResults = DetectObjectsBatch(chunk, camera_id);
            if (chunkResults.size() == chunk.size()) {
                for (auto& r : chunkResults) allResults.push_back(std::move(r));
            } else {
                // Size mismatch (e.g. chunk failed and returned {}): log it,
                // keep whatever came back, and pad with empty per-image results
                // so output indices stay aligned with inputImages.
                _logger.LogError("ANSRTYOLO::DetectObjectsBatch",
                    "Chunk returned " + std::to_string(chunkResults.size()) +
                    " results, expected " + std::to_string(chunk.size()), __FILE__, __LINE__);
                for (auto& r : chunkResults) allResults.push_back(std::move(r));
                for (size_t pad = chunkResults.size(); pad < chunk.size(); ++pad) allResults.push_back({});
            }
        }
        return allResults;
    }
    try {
        // --- Debug timer helper ---
        // `elapsed()` returns ms since the previous call (lap timer), used to
        // attribute time to each pipeline stage when _debugFlag is set.
        using Clock = std::chrono::steady_clock;
        const bool dbg = _debugFlag;
        auto t0 = dbg ? Clock::now() : Clock::time_point{};
        auto tPrev = t0;
        auto elapsed = [&]() -> double {
            auto now = Clock::now();
            double ms = std::chrono::duration(now - tPrev).count();
            tPrev = now;
            return ms;
        };
        // Ensure correct GPU context for preprocessing (multi-GPU safety)
        if (m_trtEngine) { m_trtEngine->setDeviceContext(); }
        double msSetDevice = dbg ? elapsed() : 0;
        // CUDA context health check (same as DetectObjects)
        if (!m_nv12Helper.isCudaContextHealthy(_logger, "ANSRTYOLO")) { return {}; }
        const size_t realCount = inputImages.size();
        // Pad batch to next power-of-2
        // (presumably to match the engine's optimization profiles / avoid
        // shape-switch overhead — TODO confirm against engine build options).
        // Padding duplicates the last input image; padded outputs are dropped
        // after inference (see featureVectors.resize(realCount) below).
        size_t paddedCount = 1;
        while (paddedCount < realCount) paddedCount *= 2;
        paddedCount = std::min(paddedCount, static_cast(maxBatch));
        const std::vector* batchPtr = &inputImages;
        std::vector paddedImages;
        if (paddedCount > realCount) {
            paddedImages.reserve(paddedCount);
            paddedImages.insert(paddedImages.end(), inputImages.begin(), inputImages.end());
            for (size_t p = realCount; p < paddedCount; ++p) paddedImages.push_back(inputImages.back());
            batchPtr = &paddedImages;
        }
        double msPad = dbg ? elapsed() : 0;
        BatchMetadata metadata;
        const auto inputs = PreprocessBatch(*batchPtr, metadata);
        double msPreprocess = dbg ? elapsed() : 0;
        if (inputs.empty() || inputs[0].empty()) {
            _logger.LogWarn("ANSRTYOLO::DetectObjectsBatch", "Skipped: preprocessing failed", __FILE__, __LINE__);
            return {};
        }
        // Check for prior CUDA errors before inference.
        // cudaGetLastError() also CLEARS the sticky error state, so a stale
        // error from earlier GPU work cannot be misattributed to this call.
        cudaError_t priorErr = cudaGetLastError();
        if (priorErr != cudaSuccess) {
            _logger.LogWarn("ANSRTYOLO::DetectObjectsBatch",
                std::string("Cleared prior CUDA error before inference: ") + cudaGetErrorString(priorErr),
                __FILE__, __LINE__);
        }
        std::vector>> featureVectors;
        auto succ = m_trtEngine->runInference(inputs, featureVectors);
        if (!succ) {
            // Peek (not get) so the CUDA error state is reported without being
            // cleared here.
            cudaError_t postErr = cudaPeekAtLastError();
            std::string detail = "runInference returned false, batchSize=" + std::to_string(inputs[0].size());
            if (postErr != cudaSuccess) { detail += ", CUDA error: "; detail += cudaGetErrorString(postErr); }
            _logger.LogError("ANSRTYOLO::DetectObjectsBatch", detail, __FILE__, __LINE__);
            return {};
        }
        double msInference = dbg ? elapsed() : 0;
        if (featureVectors.size() != paddedCount) {
            _logger.LogError("ANSRTYOLO::DetectObjectsBatch", "Output batch size mismatch", __FILE__, __LINE__);
            return {};
        }
        // Drop the outputs that correspond to padding images.
        featureVectors.resize(realCount);
        const auto& outputDims = m_trtEngine->getOutputDims();
        const size_t numOutputs = outputDims.size();
        const size_t numBatch = featureVectors.size();
        // Determine task type once (same model for all images in batch)
        int dim1 = outputDims[0].d[1];
        int dim2 = outputDims[0].d[2];
        int nc = static_cast(_classes.size());
        enum class TaskType { DetLegacy, DetE2E, OBBLegacy, OBBE2E, SegLegacy, SegE2E, PoseLegacy, PoseE2E, Classify };
        TaskType taskType = TaskType::DetLegacy; // default
        // E2E: dim1 > dim2 (e.g. [B,300,6]); Legacy: dim1 < dim2 (e.g. [B,84,8400])
        const bool isEndToEnd = (dim1 > dim2) || (dim2 <= 20);
        if (numOutputs >= 2) {
            // Two or more output tensors => segmentation model (det head + proto masks).
            taskType = isEndToEnd ? TaskType::SegE2E : TaskType::SegLegacy;
        } else if (outputDims[0].nbDims <= 2) {
            // Rank <= 2 output ([B, numClasses]) => classification model.
            taskType = TaskType::Classify;
        } else if (isEndToEnd) {
            // E2E per-detection row widths: 6 = det (box+score+class),
            // 7 = OBB (adds angle), 6+3k = pose (adds k keypoint triplets).
            if (dim2 == 6) taskType = TaskType::DetE2E;
            else if (dim2 == 7) taskType = TaskType::OBBE2E;
            else if (dim2 > 7 && (dim2-6) % 3 == 0) taskType = TaskType::PoseE2E;
            else taskType = TaskType::DetE2E;
        } else {
            // Legacy layout: dim1 = 4 box coords + extra channels.
            int extra = dim1 - 4;
            bool routed = false;
            // Try class-list-based routing first (only if class count fits within tensor)
            if (nc > 0 && nc <= extra) {
                if (extra == nc) { taskType = TaskType::DetLegacy; routed = true; }
                else if (extra == nc + 1) { taskType = TaskType::OBBLegacy; routed = true; }
                else if ((extra-nc) % 3 == 0 && (extra-nc) >= 3) { taskType = TaskType::PoseLegacy; routed = true; }
            }
            // Fallback: probe last channel for angle values to detect OBB
            if (!routed && extra >= 2 && !featureVectors.empty() && !featureVectors[0].empty() && !featureVectors[0][0].empty()) {
                // Transpose first image's feature vector and probe last column.
                // No-copy cv::Mat wrapper over the raw output buffer; const_cast
                // is safe because the data is only read.
                cv::Mat raw(dim1, dim2, CV_32F, const_cast(featureVectors[0][0].data()));
                cv::Mat probe;
                cv::transpose(raw, probe); // [dim2, dim1]
                int lastCol = dim1 - 1;
                int numSamples = std::min(dim2, 100);
                int angleCount = 0;
                for (int s = 0; s < numSamples; ++s) {
                    float v = probe.at(s, lastCol);
                    // 3.15 ~ pi: values in [-pi, pi] look like OBB angles.
                    if (v >= -3.15f && v <= 3.15f) ++angleCount;
                }
                // If >80% of sampled values are angle-like, treat as OBB.
                if (angleCount > numSamples * 8 / 10) { taskType = TaskType::OBBLegacy; routed = true; }
            }
            if (!routed) {
                // dim1 == 56 matches COCO pose (4 box + 1 conf + 17*3 kps + ...)
                // — heuristic carried over from DetectObjects; TODO confirm.
                if (dim1 == 56) taskType = TaskType::PoseLegacy;
                else taskType = TaskType::DetLegacy;
            }
        }
        // Process each image in parallel — one std::async postprocess task per
        // image; each lambda owns a moved copy of its feature data, so tasks
        // share no mutable state besides `this`.
        std::vector> batchDetections(numBatch);
        std::vector>> postFutures;
        postFutures.reserve(numBatch);
        for (size_t batchIdx = 0; batchIdx < numBatch; ++batchIdx) {
            const auto& batchOutput = featureVectors[batchIdx];
            // Per-image letterbox metadata produced by PreprocessBatch, needed
            // to map detections back to original image coordinates.
            ImageMetadata imgMeta;
            imgMeta.ratio = metadata.ratios[batchIdx];
            imgMeta.imgWidth = static_cast(metadata.imgWidths[batchIdx]);
            imgMeta.imgHeight = static_cast(metadata.imgHeights[batchIdx]);
            switch (taskType) {
            case TaskType::SegLegacy:
            case TaskType::SegE2E: {
                // Segmentation needs all output tensors (detections + masks).
                std::vector> fv2d;
                fv2d.reserve(batchOutput.size());
                for (const auto& out : batchOutput) fv2d.push_back(out);
                if (taskType == TaskType::SegE2E) {
                    postFutures.push_back(std::async(std::launch::async,
                        [this, fv = std::move(fv2d), cid = camera_id, m = imgMeta]() mutable { return PostprocessSegE2E(fv, cid, m); }));
                } else {
                    postFutures.push_back(std::async(std::launch::async,
                        [this, fv = std::move(fv2d), cid = camera_id, m = imgMeta]() mutable { return PostprocessSegmentation(fv, cid, m); }));
                }
                break;
            }
            case TaskType::Classify: {
                std::vector fv = batchOutput.empty() ? std::vector{} : batchOutput[0];
                postFutures.push_back(std::async(std::launch::async,
                    [this, fv = std::move(fv), cid = camera_id, m = imgMeta]() mutable { return PostprocessClassify(fv, cid, m); }));
                break;
            }
            default: {
                // All remaining task types consume only the first output tensor.
                std::vector fv = batchOutput.empty() ? std::vector{} : batchOutput[0];
                switch (taskType) {
                case TaskType::DetLegacy:
                    postFutures.push_back(std::async(std::launch::async,
                        [this, fv = std::move(fv), cid = camera_id, m = imgMeta]() mutable { return PostprocessDetection(fv, cid, m); }));
                    break;
                case TaskType::DetE2E:
                    postFutures.push_back(std::async(std::launch::async,
                        [this, fv = std::move(fv), cid = camera_id, m = imgMeta]() mutable { return PostprocessDetectionE2E(fv, cid, m); }));
                    break;
                case TaskType::OBBLegacy:
                    postFutures.push_back(std::async(std::launch::async,
                        [this, fv = std::move(fv), cid = camera_id, m = imgMeta]() mutable { return PostprocessOBB(fv, cid, m); }));
                    break;
                case TaskType::OBBE2E:
                    postFutures.push_back(std::async(std::launch::async,
                        [this, fv = std::move(fv), cid = camera_id, m = imgMeta]() mutable { return PostprocessOBBE2E(fv, cid, m); }));
                    break;
                case TaskType::PoseLegacy:
                    postFutures.push_back(std::async(std::launch::async,
                        [this, fv = std::move(fv), cid = camera_id, m = imgMeta]() mutable { return PostprocessPose(fv, cid, m); }));
                    break;
                case TaskType::PoseE2E:
                    postFutures.push_back(std::async(std::launch::async,
                        [this, fv = std::move(fv), cid = camera_id, m = imgMeta]() mutable { return PostprocessPoseE2E(fv, cid, m); }));
                    break;
                default:
                    postFutures.push_back(std::async(std::launch::async,
                        [this, fv = std::move(fv), cid = camera_id, m = imgMeta]() mutable { return PostprocessDetection(fv, cid, m); }));
                    break;
                }
                break;
            }
            }
        }
        // Gather results
        // (.get() blocks in input order and rethrows any postprocess exception,
        // which is then caught by the enclosing try/catch.)
        for (size_t i = 0; i < numBatch; ++i) batchDetections[i] = postFutures[i].get();
        // Apply tracker per frame (skip for classification models)
        // NOTE(review): all frames in the batch are fed to the tracker under
        // the same camera_id, sequentially — confirm this matches the tracker's
        // expectation when a batch mixes frames.
        if (_trackerEnabled && taskType != TaskType::Classify) {
            for (auto& results : batchDetections) {
                if (!results.empty()) {
                    results = ApplyTracking(results, camera_id);
                    if (_stabilizationEnabled) { results = StabilizeDetections(results, camera_id); }
                }
            }
        }
        if (dbg) {
            double msPostprocess = elapsed();
            double msTotal = std::chrono::duration(Clock::now() - t0).count();
            _logger.LogInfo("ANSRTYOLO::DetectObjectsBatch",
                "[DEBUG] " + camera_id + " batch=" + std::to_string(realCount) +
                " | SetDev=" + std::to_string(msSetDevice) + "ms Pad=" + std::to_string(msPad) +
                "ms Preproc=" + std::to_string(msPreprocess) + "ms Inf=" + std::to_string(msInference) +
                "ms Postproc=" + std::to_string(msPostprocess) + "ms TOTAL=" + std::to_string(msTotal) + "ms",
                __FILE__, __LINE__);
        }
        return batchDetections;
    }
    catch (const std::exception& e) {
        // Same policy as DetectObjects: log fatal and return an empty result
        // instead of propagating.
        _logger.LogFatal("ANSRTYOLO::DetectObjectsBatch", e.what(), __FILE__, __LINE__);
        return {};
    }
}
} // namespace ANSCENTER