#include "ANSYOLOOD.h"
#include "Utility.h"
#include "EPLoader.h"
#include "ANSGpuFrameRegistry.h"
#include "NV12PreprocessHelper.h" // tl_currentGpuFrame()

#ifdef USEONNXOV
//#include
#endif

namespace ANSCENTER {

    // Class-agnostic non-maximum suppression over axis-aligned boxes.
    // Keeps the highest-scoring boxes and suppresses any remaining box whose
    // IoU with an already-kept box exceeds nmsThreshold. Indices of the kept
    // boxes (into boundingBoxes) are written to `indices` in descending-score
    // order. Boxes scoring below scoreThreshold are dropped up front.
    void NMSBoxes(const std::vector<BoundingBox>& boundingBoxes,
                  const std::vector<float>& scores,
                  float scoreThreshold,
                  float nmsThreshold,
                  std::vector<int>& indices)
    {
        indices.clear();
        const size_t numBoxes = boundingBoxes.size();
        if (numBoxes == 0) {
            return;
        }

        // Step 1: Filter out boxes with scores below the threshold
        // and create a list of candidate indices.
        std::vector<int> sortedIndices;
        sortedIndices.reserve(numBoxes);
        for (size_t i = 0; i < numBoxes; ++i) {
            if (scores[i] >= scoreThreshold) {
                sortedIndices.push_back(static_cast<int>(i));
            }
        }

        // If no boxes remain after thresholding
        if (sortedIndices.empty()) {
            return;
        }

        // Sort the indices based on scores in descending order
        std::sort(sortedIndices.begin(), sortedIndices.end(),
                  [&scores](int idx1, int idx2) { return scores[idx1] > scores[idx2]; });

        // Step 2: Precompute the areas of all boxes
        std::vector<float> areas(numBoxes, 0.0f);
        for (size_t i = 0; i < numBoxes; ++i) {
            areas[i] = boundingBoxes[i].width * boundingBoxes[i].height;
        }

        // Step 3: Suppression mask to mark boxes that are suppressed
        std::vector<bool> suppressed(numBoxes, false);

        // Step 4: Greedy pass — accept in score order, suppress high-IoU overlaps
        for (size_t i = 0; i < sortedIndices.size(); ++i) {
            const int currentIdx = sortedIndices[i];
            if (suppressed[currentIdx]) {
                continue;
            }

            // Select the current box as a valid detection
            indices.push_back(currentIdx);

            const BoundingBox& currentBox = boundingBoxes[currentIdx];
            const float x1_max = currentBox.x;
            const float y1_max = currentBox.y;
            const float x2_max = currentBox.x + currentBox.width;
            const float y2_max = currentBox.y + currentBox.height;
            const float area_current = areas[currentIdx];

            // Compare IoU of the current box with the rest
            for (size_t j = i + 1; j < sortedIndices.size(); ++j) {
                const int compareIdx = sortedIndices[j];
                if (suppressed[compareIdx]) {
                    continue;
                }
                const BoundingBox& compareBox = boundingBoxes[compareIdx];
                const float x1 = std::max(x1_max, static_cast<float>(compareBox.x));
                const float y1 = std::max(y1_max, static_cast<float>(compareBox.y));
                const float x2 = std::min(x2_max, static_cast<float>(compareBox.x + compareBox.width));
                const float y2 = std::min(y2_max, static_cast<float>(compareBox.y + compareBox.height));
                const float interWidth = x2 - x1;
                const float interHeight = y2 - y1;
                if (interWidth <= 0 || interHeight <= 0) {
                    continue;
                }
                const float intersection = interWidth * interHeight;
                const float unionArea = area_current + areas[compareIdx] - intersection;
                const float iou = (unionArea > 0.0f) ? (intersection / unionArea) : 0.0f;
                if (iou > nmsThreshold) {
                    suppressed[compareIdx] = true;
                }
            }
        }
    }
    // End of YOLO V8

    // Utility functions

    // Scans the per-class scores of a YOLOv5-style output row
    // (layout: x, y, w, h, objectness, class scores...) starting at `it`
    // and returns the best class confidence / class id via the out-params.
    // NOTE(review): bestClassId is initialised to 5 (not 0 or -1); if every
    // class score is <= 0 the caller receives classId 5 — looks like a latent
    // bug, confirm intended sentinel before changing.
    void YOLOOD::getBestClassInfo(std::vector<float>::iterator it, const int& numClasses,
                                  float& bestConf, int& bestClassId)
    {
        try {
            // first 5 element are box and obj confidence
            bestClassId = 5;
            bestConf = 0;
            for (int i = 5; i < numClasses + 5; i++) {
                if (it[i] > bestConf) {
                    bestConf = it[i];
                    bestClassId = i - 5;
                }
            }
        }
        catch (std::exception& e) {
            this->_logger.LogFatal("YOLOOD::getBestClassInfo", e.what(), __FILE__, __LINE__);
        }
    }

    // Product of all tensor-shape dimensions (total element count).
    // Returns 0 for an empty shape.
    size_t YOLOOD::vectorProduct(const std::vector<int64_t>& vector)
    {
        try {
            if (vector.empty()) return 0;
            size_t product = 1;
            for (const auto& element : vector) product *= element;
            return product;
        }
        catch (std::exception& e) {
            this->_logger.LogFatal("YOLOOD::vectorProduct", e.what(), __FILE__, __LINE__);
            return 0;
        }
    }

    // Letterbox resize: scale `image` to fit newShape while preserving aspect
    // ratio, then pad the borders with `color`. When auto_ is set the padding
    // is reduced modulo `stride` (stride-aligned output); scaleFill stretches
    // to fill instead; scaleUp=false never enlarges the image.
    void YOLOOD::letterbox(const cv::Mat& image, cv::Mat& outImage,
                           const cv::Size& newShape, const cv::Scalar& color,
                           bool auto_, bool scaleFill, bool scaleUp, int stride)
    {
        try {
            cv::Size shape = image.size();
            float r = std::min((float)newShape.height / (float)shape.height,
                               (float)newShape.width / (float)shape.width);
            if (!scaleUp) r = std::min(r, 1.0f);
            int newUnpad[2]{ (int)std::round((float)shape.width * r),
                             (int)std::round((float)shape.height *
r) }; auto dw = (float)(newShape.width - newUnpad[0]); auto dh = (float)(newShape.height - newUnpad[1]); if (auto_) { dw = (float)((int)dw % stride); dh = (float)((int)dh % stride); } else if (scaleFill) { dw = 0.0f; dh = 0.0f; newUnpad[0] = newShape.width; newUnpad[1] = newShape.height; } dw /= 2.0f; dh /= 2.0f; // Fix: Use OR instead of AND, and handle the else case if (shape.width != newUnpad[0] || shape.height != newUnpad[1]) { cv::resize(image, outImage, cv::Size(newUnpad[0], newUnpad[1])); } else { outImage = image.clone(); } // Fix: Better padding calculation int top = (int)dh; int bottom = newShape.height - newUnpad[1] - top; int left = (int)dw; int right = newShape.width - newUnpad[0] - left; cv::copyMakeBorder(outImage, outImage, top, bottom, left, right, cv::BORDER_CONSTANT, color); } catch (std::exception& e) { this->_logger.LogFatal("YOLOOD::letterbox", e.what(), __FILE__, __LINE__); } } void YOLOOD::scaleCoords(const cv::Size& imageShape, cv::Rect& coords, const cv::Size& imageOriginalShape) { try { float gain = std::min((float)imageShape.height / (float)imageOriginalShape.height, (float)imageShape.width / (float)imageOriginalShape.width); int pad[2] = { (int)(((float)imageShape.width - (float)imageOriginalShape.width * gain) / 2.0f), (int)(((float)imageShape.height - (float)imageOriginalShape.height * gain) / 2.0f) }; coords.x = (int)std::round(((float)(coords.x - pad[0]) / gain)); coords.y = (int)std::round(((float)(coords.y - pad[1]) / gain)); coords.width = (int)std::round(((float)coords.width / gain)); coords.height = (int)std::round(((float)coords.height / gain)); } catch (std::exception& e) { this->_logger.LogFatal("YOLOOD::scaleCoords", e.what(), __FILE__, __LINE__); } } bool YOLOOD::loadYoloModel(const std::string& modelPath, const bool& isGPU, const cv::Size& inputSize) { try { const auto& ep = ANSCENTER::EPLoader::Current(); if (Ort::Global::api_ == nullptr) Ort::InitApi(static_cast(EPLoader::GetOrtApiRaw())); std::cout << "[YOLOOD] EP 
ready: " << ANSCENTER::EPLoader::EngineTypeName(ep.type) << std::endl; env = Ort::Env(OrtLoggingLevel::ORT_LOGGING_LEVEL_WARNING, "ONNX_DETECTION"); sessionOptions = Ort::SessionOptions(); sessionOptions.SetIntraOpNumThreads( std::min(6, static_cast(std::thread::hardware_concurrency()))); sessionOptions.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_ALL); // ── Log available providers ───────────────────────────────────────── std::vector availableProviders = Ort::GetAvailableProviders(); std::cout << "Available Execution Providers:" << std::endl; for (const auto& p : availableProviders) std::cout << " - " << p << std::endl; // ── Attach EP based on runtime-detected hardware ──────────────────── // No #ifdef, no isGPU flag — EPLoader owns the decision if (isGPU) { bool attached = false; switch (ep.type) { case ANSCENTER::EngineType::NVIDIA_GPU: { auto cuda_it = std::find(availableProviders.begin(), availableProviders.end(), "CUDAExecutionProvider"); if (cuda_it == availableProviders.end()) { this->_logger.LogError("YOLOOD::loadYoloModel. CUDA EP failed:", "CUDAExecutionProvider not in DLL — " "check ep/cuda/ has the CUDA ORT build.", __FILE__, __LINE__); break; } try { OrtCUDAProviderOptionsV2* cuda_options = nullptr; Ort::GetApi().CreateCUDAProviderOptions(&cuda_options); const char* keys[] = { "device_id" }; const char* values[] = { "0" }; Ort::GetApi().UpdateCUDAProviderOptions(cuda_options, keys, values, 1); sessionOptions.AppendExecutionProvider_CUDA_V2(*cuda_options); Ort::GetApi().ReleaseCUDAProviderOptions(cuda_options); std::cout << "[YOLOOD] CUDA EP attached." << std::endl; attached = true; } catch (const Ort::Exception& e) { this->_logger.LogError("YOLOOD::loadYoloModel. 
CUDA EP failed:", e.what(), __FILE__, __LINE__); } break; } case ANSCENTER::EngineType::AMD_GPU: { auto it = std::find(availableProviders.begin(), availableProviders.end(), "DmlExecutionProvider"); if (it == availableProviders.end()) { this->_logger.LogError("YOLOOD::loadYoloModel. DirectML EP failed:", "DmlExecutionProvider not in DLL — " "check ep/directml/ has the DirectML ORT build.", __FILE__, __LINE__); break; } try { std::unordered_map opts = { { "device_id", "0" } }; sessionOptions.AppendExecutionProvider("DML", opts); std::cout << "[YOLOOD] DirectML EP attached." << std::endl; attached = true; } catch (const Ort::Exception& e) { this->_logger.LogError("YOLOOD::loadYoloModel. DirectML EP failed:", e.what(), __FILE__, __LINE__); } break; } case ANSCENTER::EngineType::OPENVINO_GPU: { auto it = std::find(availableProviders.begin(), availableProviders.end(), "OpenVINOExecutionProvider"); if (it == availableProviders.end()) { this->_logger.LogError("YOLOOD::loadYoloModel", "OpenVINOExecutionProvider not in DLL — " "check ep/openvino/ has the OpenVINO ORT build.", __FILE__, __LINE__); break; } // Try device configs in priority order const std::string precision = "FP16"; const std::string numberOfThreads = "8"; const std::string numberOfStreams = "8"; std::vector> try_configs = { { {"device_type","AUTO:NPU,GPU"}, {"precision",precision}, {"num_of_threads",numberOfThreads}, {"num_streams",numberOfStreams}, {"enable_opencl_throttling","False"}, {"enable_qdq_optimizer","True"} }, { {"device_type","GPU.0"}, {"precision",precision}, {"num_of_threads",numberOfThreads}, {"num_streams",numberOfStreams}, {"enable_opencl_throttling","False"}, {"enable_qdq_optimizer","True"} }, { {"device_type","GPU.1"}, {"precision",precision}, {"num_of_threads",numberOfThreads}, {"num_streams",numberOfStreams}, {"enable_opencl_throttling","False"}, {"enable_qdq_optimizer","True"} }, { {"device_type","AUTO:GPU,CPU"}, {"precision",precision}, {"num_of_threads",numberOfThreads}, 
{"num_streams",numberOfStreams}, {"enable_opencl_throttling","False"}, {"enable_qdq_optimizer","True"} } }; for (const auto& config : try_configs) { try { sessionOptions.AppendExecutionProvider_OpenVINO_V2(config); std::cout << "[YOLOOD] OpenVINO EP attached (" << config.at("device_type") << ")." << std::endl; attached = true; break; } catch (const Ort::Exception& e) { // try next config this->_logger.LogError("YOLOOD::loadYoloModel", e.what(), __FILE__, __LINE__); } } if (!attached) std::cerr << "[YOLOOD] OpenVINO EP: all device configs failed." << std::endl; break; } default: break; } if (!attached) { std::cerr << "[YOLOOD] No GPU EP attached — running on CPU." << std::endl; this->_logger.LogFatal("YOLOOD::loadYoloModel","GPU EP not attached. Running on CPU.", __FILE__, __LINE__); } } else { std::cout << "[YOLOOD] Inference device: CPU (isGPU=false)" << std::endl; } // ── Load model ────────────────────────────────────────────────────── std::wstring w_modelPath = String2WString(modelPath.c_str()); session = Ort::Session(env, w_modelPath.c_str(), sessionOptions); Ort::AllocatorWithDefaultOptions allocator; // ── Input shape ───────────────────────────────────────────────────── Ort::TypeInfo inputTypeInfo = session.GetInputTypeInfo(0); std::vector inputTensorShape = inputTypeInfo.GetTensorTypeAndShapeInfo().GetShape(); this->isDynamicInputShape = (inputTensorShape.size() >= 4 && inputTensorShape[2] == -1 && inputTensorShape[3] == -1); if (this->isDynamicInputShape) std::cout << "Dynamic input shape detected." 
<< std::endl; for (auto shape : inputTensorShape) std::cout << "Input shape: " << shape << std::endl; // ── Node names ────────────────────────────────────────────────────── size_t inputNodesNum = session.GetInputCount(); for (size_t i = 0; i < inputNodesNum; i++) { Ort::AllocatedStringPtr name = session.GetInputNameAllocated(i, allocator); char* buf = new char[50]; strcpy(buf, name.get()); inputNodeNames.push_back(buf); } size_t outputNodesNum = session.GetOutputCount(); for (size_t i = 0; i < outputNodesNum; i++) { Ort::AllocatedStringPtr name = session.GetOutputNameAllocated(i, allocator); char* buf = new char[50]; strcpy(buf, name.get()); outputNodeNames.push_back(buf); } this->inputImageShape = cv::Size2f(inputSize); std::cout << "Input image shape: " << inputImageShape << std::endl; return true; } catch (const std::exception& e) { this->_logger.LogFatal("YOLOOD::loadYoloModel", e.what(), __FILE__, __LINE__); return false; } } void YOLOOD::preprocessing(const cv::Mat& image,std::vector& blob,std::vector& inputTensorShape) { m_imgWidth = image.cols; m_imgHeight = image.rows; try { // Convert grayscale to BGR if needed cv::Mat processedImage; if (image.channels() == 1) { cv::cvtColor(image, processedImage, cv::COLOR_GRAY2BGR); } else { processedImage = image; // Shallow copy - safe } // Convert BGR to RGB cv::Mat rgbImage; cv::cvtColor(processedImage, rgbImage, cv::COLOR_BGR2RGB); // Resize with letterbox (SEPARATE output buffer!) 
cv::Mat resizedImage; letterbox(rgbImage, resizedImage, inputImageShape, cv::Scalar(114, 114, 114), isDynamicInputShape, false, true, 32); if (resizedImage.empty()) { _logger.LogError("YOLOOD::preprocessing", "Letterbox operation failed", __FILE__, __LINE__); return; } // Update tensor shape inputTensorShape = { 1, 3, resizedImage.rows, resizedImage.cols }; // Normalize to [0, 1] (FIX: Use 1.0f for float division) cv::Mat floatImage; resizedImage.convertTo(floatImage, CV_32FC3, 1.0f / 255.0f); // Convert HWC to CHW const size_t channelSize = floatImage.rows * floatImage.cols; blob.resize(channelSize * 3); std::vector channels(3); cv::split(floatImage, channels); for (int c = 0; c < 3; ++c) { std::memcpy(blob.data() + c * channelSize, channels[c].data, channelSize * sizeof(float)); } } catch (const std::exception& e) { _logger.LogFatal("YOLOOD::preprocessing", e.what(), __FILE__, __LINE__); } } cv::Mat YOLOOD::preprocessv11(const cv::Mat& image,std::vector& blob,std::vector& inputTensorShape) { try { // Validation if (image.empty()) { _logger.LogError("YOLOOD::preprocessv11", "Empty image provided", __FILE__, __LINE__); return cv::Mat(); } // Convert grayscale to BGR if needed cv::Mat processedImage; if (image.channels() == 1) { cv::cvtColor(image, processedImage, cv::COLOR_GRAY2BGR); } else { processedImage = image; // Shallow copy - safe! 
} // Resize with letterbox cv::Mat resizedImage; letterbox(processedImage, resizedImage, inputImageShape, cv::Scalar(114, 114, 114), isDynamicInputShape, false, true, 32); if (resizedImage.empty()) { _logger.LogError("YOLOOD::preprocessv11", "Letterbox operation failed", __FILE__, __LINE__); return cv::Mat(); } // Convert BGR to RGB cv::Mat rgbImage; cv::cvtColor(resizedImage, rgbImage, cv::COLOR_BGR2RGB); // Normalize to [0, 1] cv::Mat floatImage; rgbImage.convertTo(floatImage, CV_32FC3, 1.0f / 255.0f); // Update input tensor shape const int height = floatImage.rows; const int width = floatImage.cols; const int channels = floatImage.channels(); inputTensorShape = { 1, channels, height, width }; // Allocate blob const size_t totalSize = height * width * channels; blob.resize(totalSize); // Convert HWC to CHW efficiently std::vector rgbChannels(channels); cv::split(floatImage, rgbChannels); // Copy channels to blob in CHW order const size_t channelSize = height * width; for (int c = 0; c < channels; ++c) { std::memcpy(blob.data() + c * channelSize, rgbChannels[c].data, channelSize * sizeof(float)); } return floatImage; // Return for visualization if needed } catch (const std::exception& e) { _logger.LogFatal("YOLOOD::preprocessv11", e.what(), __FILE__, __LINE__); return cv::Mat(); } } std::vector YOLOOD::postprocessing(const cv::Size& resizedImageShape,const cv::Size& originalImageShape,std::vector& outputTensors,const float& confThreshold,const float& iouThreshold) { try { // Get raw output pointer (NO COPY!) 
const float* rawOutput = outputTensors[0].GetTensorData(); std::vector outputShape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape(); const int numClasses = static_cast(outputShape[2]) - 5; const int numDetections = static_cast(outputShape[1]); const int stride = static_cast(outputShape[2]); // Pre-allocate (assume ~10% detections pass threshold) const size_t estimatedDetections = std::min(numDetections / 10, 1000); std::vector boxes; std::vector confs; std::vector classIds; boxes.reserve(estimatedDetections); confs.reserve(estimatedDetections); classIds.reserve(estimatedDetections); // Parse detections using raw pointer (NO COPY!) for (int i = 0; i < numDetections; ++i) { const float* detection = rawOutput + i * stride; const float objectness = detection[4]; if (objectness <= confThreshold) { continue; } // Get best class float maxClassScore = 0.0f; int bestClassId = 0; for (int c = 0; c < numClasses; ++c) { const float classScore = detection[5 + c]; if (classScore > maxClassScore) { maxClassScore = classScore; bestClassId = c; } } const float confidence = objectness * maxClassScore; if (confidence <= confThreshold) { continue; } // Parse bounding box (center format -> corner format) const int centerX = static_cast(detection[0]); const int centerY = static_cast(detection[1]); const int width = static_cast(detection[2]); const int height = static_cast(detection[3]); const int left = centerX - width / 2; const int top = centerY - height / 2; boxes.emplace_back(left, top, width, height); confs.push_back(confidence); classIds.push_back(bestClassId); } // NMS std::vector indices; cv::dnn::NMSBoxes(boxes, confs, confThreshold, iouThreshold, indices); // Build final detections std::vector detections; detections.reserve(indices.size()); const int numClasses_safe = static_cast(_classes.size()); for (int idx : indices) { Object det; // Scale coordinates det.box = boxes[idx]; scaleCoords(resizedImageShape, det.box, originalImageShape); det.confidence = confs[idx]; 
det.classId = classIds[idx]; // Set class name safely if (!_classes.empty()) { det.className = (det.classId < numClasses_safe) ? _classes[det.classId] : _classes[numClasses_safe - 1]; } else { det.className = "Unknown"; } detections.push_back(std::move(det)); } return detections; } catch (const std::exception& e) { _logger.LogFatal("YOLOOD::postprocessing", e.what(), __FILE__, __LINE__); return {}; } } BoundingBox YOLOOD::scaleCoordsv11(const cv::Size& imageShape,BoundingBox coords,const cv::Size& imageOriginalShape,bool p_Clip) { // Calculate scale factor const float gain = std::min( static_cast(imageShape.width) / imageOriginalShape.width, static_cast(imageShape.height) / imageOriginalShape.height ); const float invGain = 1.0f / gain; // Calculate padding (once per image, but recalculated here) const float padX = (imageShape.width - imageOriginalShape.width * gain) * 0.5f; const float padY = (imageShape.height - imageOriginalShape.height * gain) * 0.5f; // Scale coordinates BoundingBox result; result.x = static_cast((coords.x - padX) * invGain + 0.5f); result.y = static_cast((coords.y - padY) * invGain + 0.5f); result.width = static_cast(coords.width * invGain + 0.5f); result.height = static_cast(coords.height * invGain + 0.5f); // Clamp to image bounds if requested if (p_Clip) { result.x = clamp(result.x, 0, imageOriginalShape.width); result.y = clamp(result.y, 0, imageOriginalShape.height); result.width = clamp(result.width, 0, imageOriginalShape.width - result.x); result.height = clamp(result.height, 0, imageOriginalShape.height - result.y); } return result; } std::vector YOLOOD::postprocessv11(const cv::Size& originalImageSize,const cv::Size& resizedImageShape,const std::vector& outputTensors,float confThreshold,float iouThreshold) { try { // Get raw output const float* rawOutput = outputTensors[0].GetTensorData(); const std::vector outputShape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape(); const size_t numFeatures = outputShape[1]; const size_t 
numDetections = outputShape[2]; const int numClasses = static_cast(numFeatures) - 4; // Early exit if (numDetections == 0 || numClasses <= 0) { return {}; } // Calculate scaling parameters ONCE const float gain = std::min( static_cast(resizedImageShape.width) / originalImageSize.width, static_cast(resizedImageShape.height) / originalImageSize.height ); const float invGain = 1.0f / gain; const float padX = (resizedImageShape.width - originalImageSize.width * gain) * 0.5f; const float padY = (resizedImageShape.height - originalImageSize.height * gain) * 0.5f; const int maxX = originalImageSize.width; const int maxY = originalImageSize.height; // Pre-allocate (estimate ~10% pass threshold) const size_t estimatedCount = std::min(numDetections / 10, size_t(1000)); std::vector boxes; std::vector nmsBoxes; std::vector confs; std::vector classIds; boxes.reserve(estimatedCount); nmsBoxes.reserve(estimatedCount); confs.reserve(estimatedCount); classIds.reserve(estimatedCount); // Parse detections for (size_t d = 0; d < numDetections; ++d) { // Extract coordinates const float centerX = rawOutput[0 * numDetections + d]; const float centerY = rawOutput[1 * numDetections + d]; const float width = rawOutput[2 * numDetections + d]; const float height = rawOutput[3 * numDetections + d]; // Find best class int classId = 0; float maxScore = rawOutput[4 * numDetections + d]; for (int c = 1; c < numClasses; ++c) { const float score = rawOutput[(4 + c) * numDetections + d]; if (score > maxScore) { maxScore = score; classId = c; } } // Threshold check if (maxScore <= confThreshold) { continue; } // Convert to corner format and scale inline const float left = (centerX - width * 0.5f - padX) * invGain; const float top = (centerY - height * 0.5f - padY) * invGain; const float scaledWidth = width * invGain; const float scaledHeight = height * invGain; // Round and clamp to original image bounds const int x = clamp(static_cast(left + 0.5f), 0, maxX); const int y = clamp(static_cast(top + 
0.5f), 0, maxY); const int w = clamp(static_cast(scaledWidth + 0.5f), 0, maxX - x); const int h = clamp(static_cast(scaledHeight + 0.5f), 0, maxY - y); // Store scaled box boxes.emplace_back(x, y, w, h); // Create NMS box with class offset (class-specific NMS) const int nmsOffset = classId * 7680; nmsBoxes.emplace_back(x + nmsOffset, y + nmsOffset, w, h); confs.push_back(maxScore); classIds.push_back(classId); } // Apply class-specific NMS std::vector indices; cv::dnn::NMSBoxes(nmsBoxes, confs, confThreshold, iouThreshold, indices); // Build final detections std::vector detections; detections.reserve(indices.size()); const int numClassNames = static_cast(_classes.size()); for (int idx : indices) { Object det; // Use original scaled box (not NMS box) det.box.x = boxes[idx].x; det.box.y = boxes[idx].y; det.box.width = boxes[idx].width; det.box.height = boxes[idx].height; det.confidence = confs[idx]; det.classId = classIds[idx]; // Set class name if (!_classes.empty() && det.classId < numClassNames) { det.className = _classes[det.classId]; } else { det.className = _classes.empty() ? 
"Unknown" : _classes[numClassNames - 1]; } // Generate normalized polygon det.polygon = ANSUtilityHelper::RectToNormalizedPolygon( det.box, m_imgWidth, m_imgHeight ); detections.emplace_back(std::move(det)); } return detections; } catch (const std::exception& e) { _logger.LogFatal("YOLOOD::postprocessv11", e.what(), __FILE__, __LINE__); return {}; } } std::vector YOLOOD::detect(cv::Mat& image, const float& confThreshold, const float& iouThreshold) { if (image.empty()) { this->_logger.LogFatal("YOLOOD::detect", "Error: Empty image provided to detector", __FILE__, __LINE__); return {}; } try { // Step 1: Preprocessing - reuse member buffer to avoid repeated allocations std::vector inputTensorShape; inputTensorShape.reserve(4); this->_blob.clear(); // Clear but keep capacity this->preprocessing(image, this->_blob, inputTensorShape); // Step 2: Validate input tensor shape if (inputTensorShape.size() != 4 || inputTensorShape[2] <= 0 || inputTensorShape[3] <= 0) { this->_logger.LogFatal("YOLOOD::detect", "Invalid input tensor shape!", __FILE__, __LINE__); return {}; } const size_t inputTensorSize = vectorProduct(inputTensorShape); if (this->_blob.size() != inputTensorSize) { this->_logger.LogFatal("YOLOOD::detect", "Mismatch in input tensor size!", __FILE__, __LINE__); return {}; } // Step 3: Create ONNX Tensor - static MemoryInfo avoids repeated creation static const Ort::MemoryInfo memoryInfo = Ort::MemoryInfo::CreateCpu( OrtAllocatorType::OrtArenaAllocator, OrtMemType::OrtMemTypeDefault ); std::array inputTensors = { Ort::Value::CreateTensor( memoryInfo, this->_blob.data(), this->_blob.size(), inputTensorShape.data(), inputTensorShape.size() ) }; // Step 4: Run ONNX Inference std::vector outputTensors = this->session.Run( Ort::RunOptions{ nullptr }, inputNodeNames.data(), inputTensors.data(), inputNodeNames.size(), outputNodeNames.data(), outputNodeNames.size() ); // Step 5: Postprocessing - return directly for RVO const cv::Size resizedShape( 
static_cast(inputTensorShape[3]), static_cast(inputTensorShape[2]) ); return this->postprocessing(resizedShape, image.size(), outputTensors, confThreshold, iouThreshold); } catch (const std::exception& e) { this->_logger.LogFatal("YOLOOD::detect", e.what(), __FILE__, __LINE__); return {}; } } std::vector YOLOOD::detectv11(const cv::Mat& image, float confThreshold, float iouThreshold) { if (image.empty()) { this->_logger.LogFatal("YOLOOD::detectv11", "Error: Empty image provided to detector", __FILE__, __LINE__); return {}; } try { // Define input tensor shape (batch, channels, height, width) std::vector inputTensorShape = { 1, 3, inputImageShape.height, inputImageShape.width }; // Preprocess using reusable member buffer this->_blob.clear(); cv::Mat preprocessedImage = preprocessv11(image, this->_blob, inputTensorShape); // Validate blob size const size_t inputTensorSize = vectorProduct(inputTensorShape); if (this->_blob.size() != inputTensorSize) { this->_logger.LogFatal("YOLOOD::detectv11", "Error: Blob size mismatch with expected input tensor size", __FILE__, __LINE__); return {}; } // Static memory info - created once, reused across calls static const Ort::MemoryInfo memoryInfo = Ort::MemoryInfo::CreateCpu( OrtArenaAllocator, OrtMemTypeDefault ); // Create input tensor Ort::Value inputTensor = Ort::Value::CreateTensor( memoryInfo, this->_blob.data(), inputTensorSize, inputTensorShape.data(), inputTensorShape.size() ); // Run inference std::vector outputTensors = this->session.Run( Ort::RunOptions{ nullptr }, inputNodeNames.data(), &inputTensor, inputNodeNames.size(), outputNodeNames.data(), outputNodeNames.size() ); // Postprocess and return directly for RVO const cv::Size resizedImageShape( static_cast(inputTensorShape[3]), static_cast(inputTensorShape[2]) ); return postprocessv11(image.size(), resizedImageShape, outputTensors, confThreshold, iouThreshold); } catch (const std::exception& e) { this->_logger.LogFatal("YOLOOD::detectv11", e.what(), __FILE__, 
__LINE__); return {}; } } // YOLOOD bool YOLOOD::OptimizeModel(bool fp16, std::string& optimizedModelFolder) { std::lock_guard lock(_mutex); try { if (!ANSODBase::OptimizeModel(fp16, optimizedModelFolder)) { return false; } if (FileExist(_modelFilePath)) { optimizedModelFolder = GetParentFolder(_modelFilePath); this->_logger.LogDebug("YOLOOD::OptimizeModel", "This model is optimized. No need other optimization.", __FILE__, __LINE__); return true; } else { optimizedModelFolder = ""; this->_logger.LogFatal("YOLOOD::OptimizeModel", "This model is not exist. Please check the model path again.", __FILE__, __LINE__); return false; } } catch (std::exception& e) { this->_logger.LogFatal("YOLOOD::OptimizeModel", e.what(), __FILE__, __LINE__); return false; } } bool YOLOOD::LoadModel(const std::string& modelZipFilePath, const std::string& modelZipPassword) { std::lock_guard lock(_mutex); try { bool result = ANSODBase::LoadModel(modelZipFilePath, modelZipPassword); if (!result) return false; _modelConfig.detectionType = ANSCENTER::DetectionType::DETECTION; _modelConfig.inpHeight = 640; _modelConfig.inpWidth = 640; if (_modelConfig.modelMNSThreshold < 0.2) _modelConfig.modelMNSThreshold = 0.5; if (_modelConfig.modelConfThreshold < 0.2) _modelConfig.modelConfThreshold = 0.5; _letterBoxForSquare = true; // 0. Check if the configuration file exist if (FileExist(_modelConfigFile)) { ModelType modelType; std::vector inputShape; _classes = ANSUtilityHelper::GetConfigFileContent(_modelConfigFile, modelType, inputShape); if (inputShape.size() == 2) { if (inputShape[0] > 0)_modelConfig.inpHeight = inputShape[0]; if (inputShape[1] > 0)_modelConfig.inpWidth = inputShape[1]; } _modelConfig.modelType = modelType; if (_modelConfig.modelType == ModelType::YOLOV4) { _isDarkNet = true; _modelFilePath = CreateFilePath(_modelFolder, "train_last.weights"); this->_logger.LogDebug("YOLOOD::Initialize. 
Loading YoloV4 weight", _modelFilePath, __FILE__, __LINE__); } else if (_modelConfig.modelType == ModelType::YOLOV5) { _isDarkNet = false; _modelFilePath = CreateFilePath(_modelFolder, "train_last.onnx"); this->_logger.LogDebug("YOLOOD::Initialize. Loading YoloV5 weight", _modelFilePath, __FILE__, __LINE__); } else if (_modelConfig.modelType == ModelType::YOLOV8) { _isDarkNet = false; _modelFilePath = CreateFilePath(_modelFolder, "train_last.onnx"); this->_logger.LogDebug("YOLOOD::Initialize. Loading YoloV8 weight", _modelFilePath, __FILE__, __LINE__); } else { this->_logger.LogError("YOLOOD::Initialize. Model Type is not valid. Please check the model config file", _modelConfigFile, __FILE__, __LINE__); return false; } if (!FileExist(_modelFilePath)) { this->_logger.LogError("YOLOOD::Initialize. Model file is not exist", _modelFilePath, __FILE__, __LINE__); return false; } } else {// This is old version of model zip file std::string weightfile = CreateFilePath(_modelFolder, "train_last.weights"); if (FileExist(weightfile)) { // This is the darknet Yolov4 _modelFilePath = weightfile; _classFilePath = CreateFilePath(_modelFolder, "classes.names"); this->_logger.LogDebug("YOLOOD::Initialize. Loading YoloV4 weight", _modelFilePath, __FILE__, __LINE__); _modelConfig.modelType = ModelType::YOLOV4; _isDarkNet = true; _modelConfig.inpHeight = 416; _modelConfig.inpWidth = 416; } else { _isDarkNet = false; std::string onnxfile = CreateFilePath(_modelFolder, "train_last.onnx"); if (std::filesystem::exists(onnxfile)) { _modelFilePath = onnxfile; _classFilePath = CreateFilePath(_modelFolder, "classes.names"); this->_logger.LogDebug("YOLOOD::Initialize. Loading YoloV8 weight", _modelFilePath, __FILE__, __LINE__); } else { this->_logger.LogError("YOLOOD::Initialize. Model file is not exist", _modelFilePath, __FILE__, __LINE__); return false; } } //std::ifstream isValidFileName(_classFilePath); if (FileExist(_classFilePath)) { this->_logger.LogDebug("YOLOOD::Initialize. 
Load classes from file", _classFilePath, __FILE__, __LINE__); LoadClassesFromFile(); } else { this->_logger.LogDebug("YOLOOD::Initialize. Load classes from string", _classFilePath, __FILE__, __LINE__); LoadClassesFromString(); } } _isInitialized = true; return true; } catch (std::exception& e) { this->_logger.LogFatal("YOLOOD::LoadModel", e.what(), __FILE__, __LINE__); return false; } } bool YOLOOD::LoadModelFromFolder(std::string licenseKey, ModelConfig modelConfig, std::string modelName, std::string className, const std::string& modelFolder, std::string& labelMap) { std::lock_guard lock(_mutex); try { bool result = ANSODBase::LoadModelFromFolder(licenseKey, modelConfig, modelName, className, modelFolder, labelMap); if (!result) return false; std::string _modelName = modelName; if (_modelName.empty()) { _modelName = "train_last"; } std::string weightFileName = _modelName + ".weights"; _modelConfig = modelConfig; _modelConfig.detectionType = ANSCENTER::DetectionType::DETECTION; _modelConfig.inpHeight = 640; _modelConfig.inpWidth = 640; if (_modelConfig.modelMNSThreshold < 0.2) _modelConfig.modelMNSThreshold = 0.5; if (_modelConfig.modelConfThreshold < 0.2) _modelConfig.modelConfThreshold = 0.5; _letterBoxForSquare = true; // 0. Check if the configuration file exist if (FileExist(_modelConfigFile)) { ModelType modelType; std::vector inputShape; _classes = ANSUtilityHelper::GetConfigFileContent(_modelConfigFile, modelType, inputShape); if (inputShape.size() == 2) { if (inputShape[0] > 0)_modelConfig.inpHeight = inputShape[0]; if (inputShape[1] > 0)_modelConfig.inpWidth = inputShape[1]; } _modelConfig.modelType = modelType; if (_modelConfig.modelType == ModelType::YOLOV4) { _isDarkNet = true; _modelFilePath = CreateFilePath(_modelFolder, weightFileName); this->_logger.LogDebug("YOLOOD::Initialize. 
Loading YoloV4 weight", _modelFilePath, __FILE__, __LINE__); } else if (_modelConfig.modelType == ModelType::YOLOV5) { _isDarkNet = false; weightFileName = _modelName + ".onnx"; _modelFilePath = CreateFilePath(_modelFolder, weightFileName); this->_logger.LogDebug("YOLOOD::Initialize. Loading YoloV5 weight", _modelFilePath, __FILE__, __LINE__); } else if (_modelConfig.modelType == ModelType::YOLOV8) { _isDarkNet = false; weightFileName = _modelName + ".onnx"; _modelFilePath = CreateFilePath(_modelFolder, weightFileName); this->_logger.LogDebug("YOLOOD::Initialize. Loading YoloV8 weight", _modelFilePath, __FILE__, __LINE__); } else { this->_logger.LogError("YOLOOD::Initialize. Model Type is not valid. Please check the model config file", _modelConfigFile, __FILE__, __LINE__); return false; } if (!FileExist(_modelFilePath)) { this->_logger.LogError("YOLOOD::Initialize. Model file is not exist", _modelFilePath, __FILE__, __LINE__); return false; } } else {// This is old version of model zip file weightFileName = _modelName + ".weights"; std::string weightfile = CreateFilePath(_modelFolder, weightFileName); if (FileExist(weightfile)) { // This is the darknet Yolov4 _modelFilePath = weightfile; _classFilePath = CreateFilePath(_modelFolder, className); this->_logger.LogDebug("YOLOOD::Initialize. Loading YoloV4 weight", _modelFilePath, __FILE__, __LINE__); _modelConfig.modelType = ModelType::YOLOV4; _isDarkNet = true; _modelConfig.inpHeight = 416; _modelConfig.inpWidth = 416; } else { _isDarkNet = false; weightFileName = _modelName + ".onnx"; std::string onnxfile = CreateFilePath(_modelFolder, weightFileName); if (std::filesystem::exists(onnxfile)) { // This is the yovoV5 or yolov8 format _modelFilePath = onnxfile; _classFilePath = CreateFilePath(_modelFolder, className); this->_logger.LogDebug("YOLOOD::Initialize. Loading YoloV8 weight", _modelFilePath, __FILE__, __LINE__); } else { this->_logger.LogError("YOLOOD::Initialize. 
Model file is not exist", _modelFilePath, __FILE__, __LINE__); return false; } } std::ifstream isValidFileName(_classFilePath); if (!isValidFileName) { this->_logger.LogDebug("YOLOOD::Initialize. Load classes from string", _classFilePath, __FILE__, __LINE__); LoadClassesFromString(); } else { this->_logger.LogDebug("YOLOOD::Initialize. Load classes from file", _classFilePath, __FILE__, __LINE__); LoadClassesFromFile(); } } // 1. Load labelMap and engine labelMap.clear(); if (!_classes.empty()) labelMap = VectorToCommaSeparatedString(_classes); if (_isDarkNet) { LoadDarknetNetwork(); } else { #ifdef USEOPENCVDNN if (_modelConfig.modelType == ModelType::YOLOV8) LoadOnnxNetwork(); else loadYoloModel(_modelFilePath, true, cv::Size(_modelConfig.inpWidth, _modelConfig.inpHeight));//Assume that GPU is available #else loadYoloModel(_modelFilePath, true, cv::Size(_modelConfig.inpWidth, _modelConfig.inpHeight));//Assume that GPU is available #endif } _isInitialized = true; return true; } catch (std::exception& e) { this->_logger.LogFatal("YOLOOD::LoadModel", e.what(), __FILE__, __LINE__); return false; } } bool YOLOOD::Initialize(std::string licenseKey, ModelConfig modelConfig, const std::string& modelZipFilePath, const std::string& modelZipPassword, std::string& labelMap) { std::lock_guard lock(_mutex); try { bool result = ANSODBase::Initialize(licenseKey, modelConfig, modelZipFilePath, modelZipPassword, labelMap); if (!result) return false; // Parsing for YOLO only here _modelConfig = modelConfig; _modelConfig.detectionType = ANSCENTER::DetectionType::DETECTION; _modelConfig.inpHeight = 640; _modelConfig.inpWidth = 640; if (_modelConfig.modelMNSThreshold < 0.2) _modelConfig.modelMNSThreshold = 0.5; if (_modelConfig.modelConfThreshold < 0.2) _modelConfig.modelConfThreshold = 0.5; _letterBoxForSquare = true; // 0. 
Check if the configuration file exist if (FileExist(_modelConfigFile)) { ModelType modelType; std::vector inputShape; _classes = ANSUtilityHelper::GetConfigFileContent(_modelConfigFile, modelType, inputShape); if (inputShape.size() == 2) { if (inputShape[0] > 0)_modelConfig.inpHeight = inputShape[0]; if (inputShape[1] > 0)_modelConfig.inpWidth = inputShape[1]; } _modelConfig.modelType = modelType; if (_modelConfig.modelType == ModelType::YOLOV4) { _isDarkNet = true; _modelFilePath = CreateFilePath(_modelFolder, "train_last.weights"); this->_logger.LogDebug("YOLOOD::Initialize. Loading YoloV4 weight", _modelFilePath, __FILE__, __LINE__); } else if (_modelConfig.modelType == ModelType::YOLOV5) { _isDarkNet = false; _modelFilePath = CreateFilePath(_modelFolder, "train_last.onnx"); this->_logger.LogDebug("YOLOOD::Initialize. Loading YoloV5 weight", _modelFilePath, __FILE__, __LINE__); } else if (_modelConfig.modelType == ModelType::YOLOV8) { _isDarkNet = false; _modelFilePath = CreateFilePath(_modelFolder, "train_last.onnx"); this->_logger.LogDebug("YOLOOD::Initialize. Loading YoloV8 weight", _modelFilePath, __FILE__, __LINE__); } else { this->_logger.LogError("YOLOOD::Initialize. Model Type is not valid. Please check the model config file", _modelConfigFile, __FILE__, __LINE__); return false; } if (!FileExist(_modelFilePath)) { this->_logger.LogError("YOLOOD::Initialize. Model file is not exist", _modelFilePath, __FILE__, __LINE__); return false; } } else {// This is old version of model zip file std::string weightfile = CreateFilePath(_modelFolder, "train_last.weights"); if (FileExist(weightfile)) { // This is the darknet Yolov4 _modelFilePath = weightfile; _classFilePath = CreateFilePath(_modelFolder, "classes.names"); this->_logger.LogDebug("YOLOOD::Initialize. 
Loading YoloV4 weight", _modelFilePath, __FILE__, __LINE__); _modelConfig.modelType = ModelType::YOLOV4; _isDarkNet = true; _modelConfig.inpHeight = 416; _modelConfig.inpWidth = 416; } else { _isDarkNet = false; std::string onnxfile = CreateFilePath(_modelFolder, "train_last.onnx"); if (std::filesystem::exists(onnxfile)) { _modelFilePath = onnxfile; _classFilePath = CreateFilePath(_modelFolder, "classes.names"); this->_logger.LogDebug("YOLOOD::Initialize. Loading YoloV8/Yolov5 weight", _modelFilePath, __FILE__, __LINE__); } else { this->_logger.LogError("YOLOOD::Initialize. Model file is not exist", _modelFilePath, __FILE__, __LINE__); return false; } } if (FileExist(_classFilePath)) { this->_logger.LogDebug("YOLOOD::Initialize. Load classes from file", _classFilePath, __FILE__, __LINE__); LoadClassesFromFile(); } else { this->_logger.LogDebug("YOLOOD::Initialize. Load classes from string", _classFilePath, __FILE__, __LINE__); LoadClassesFromString(); } } // 1. Load labelMap and engine labelMap.clear(); if (!_classes.empty()) labelMap = VectorToCommaSeparatedString(_classes); if (_isDarkNet) { LoadDarknetNetwork(); } else { #ifdef USEOPENCVDNN if (_modelConfig.modelType == ModelType::YOLOV8) LoadOnnxNetwork(); else loadYoloModel(_modelFilePath, true, cv::Size(_modelConfig.inpWidth, _modelConfig.inpHeight));//Assume that GPU is available #else loadYoloModel(_modelFilePath, true, cv::Size(_modelConfig.inpWidth, _modelConfig.inpHeight));//Assume that GPU is available #endif } _isInitialized = true; return true; } catch (std::exception& e) { this->_logger.LogFatal("YOLOOD::Initialize", e.what(), __FILE__, __LINE__); return false; } } std::vector YOLOOD::RunInference(const cv::Mat& input) { return RunInference(input, "CustomCam"); } std::vector YOLOOD::RunInference(const cv::Mat& input, const std::string& camera_id) { std::lock_guard lock(_mutex); // Early validation - before try block if (!_licenseValid) { this->_logger.LogError("YOLOOD::RunInference", "Invalid 
License", __FILE__, __LINE__); return {}; } if (!_isInitialized) { this->_logger.LogError("YOLOOD::RunInference", "Model is not initialized", __FILE__, __LINE__); return {}; } if (input.empty() || input.cols < 10 || input.rows < 10) { return {}; } try { // --- NV12 fast path: try to get full-res BGR from GPU NV12 frame --- cv::Mat inferenceImage = input; float bgrScaleX = 1.0f, bgrScaleY = 1.0f; { auto* gpuData = tl_currentGpuFrame(); if (gpuData && gpuData->width > 0 && gpuData->height > 0) { if (gpuData->cpuYPlane && gpuData->cpuUvPlane && gpuData->cpuYLinesize >= gpuData->width && gpuData->cpuUvLinesize >= gpuData->width) { const int fw = gpuData->width; const int fh = gpuData->height; // NV12 requires even dimensions if ((fw % 2) == 0 && (fh % 2) == 0) { try { cv::Mat yPlane(fh, fw, CV_8UC1, gpuData->cpuYPlane, static_cast(gpuData->cpuYLinesize)); cv::Mat uvPlane(fh / 2, fw / 2, CV_8UC2, gpuData->cpuUvPlane, static_cast(gpuData->cpuUvLinesize)); cv::Mat fullResBGR; cv::cvtColorTwoPlane(yPlane, uvPlane, fullResBGR, cv::COLOR_YUV2BGR_NV12); if (!fullResBGR.empty()) { bgrScaleX = static_cast(input.cols) / fullResBGR.cols; bgrScaleY = static_cast(input.rows) / fullResBGR.rows; inferenceImage = fullResBGR; } } catch (...) 
{ /* NV12 conversion failed — fall back to input */ } } } } } std::vector ret; if (_isDarkNet) { ret = RunInferenceFromDarkNet(inferenceImage, camera_id); } else if (_modelConfig.modelType == ModelType::YOLOV5) { ret = RunInferenceFromYoloV5(inferenceImage, camera_id); } else { #ifdef USEOPENCVDNN ret = RunInferenceFromONNX(inferenceImage, camera_id); #else ret = RunInferenceFromYoloV8(inferenceImage, camera_id); #endif } // --- Rescale coordinates from full-res back to display-res --- if (bgrScaleX != 1.0f || bgrScaleY != 1.0f) { for (auto& obj : ret) { obj.box.x = static_cast(obj.box.x * bgrScaleX); obj.box.y = static_cast(obj.box.y * bgrScaleY); obj.box.width = static_cast(obj.box.width * bgrScaleX); obj.box.height = static_cast(obj.box.height * bgrScaleY); for (size_t k = 0; k < obj.kps.size(); k += 2) { obj.kps[k] *= bgrScaleX; // x if (k + 1 < obj.kps.size()) obj.kps[k + 1] *= bgrScaleY; // y } for (auto& pt : obj.polygon) { pt.x *= bgrScaleX; pt.y *= bgrScaleY; } } } if (_trackerEnabled) { ret = ApplyTracking(ret, camera_id); if (_stabilizationEnabled) ret = StabilizeDetections(ret, camera_id); } return ret; } catch (const std::exception& e) { this->_logger.LogFatal("YOLOOD::RunInference", e.what(), __FILE__, __LINE__); return {}; } } YOLOOD::~YOLOOD() { try { this->_logger.LogDebug("YOLOOD::~YOLOOD()", "Release YOLOOD ", __FILE__, __LINE__); } catch (std::exception& e) { this->_logger.LogFatal("YOLOOD::~YOLOOD()", e.what(), __FILE__, __LINE__); } } bool YOLOOD::Destroy() { try { _net.reset(); return true; } catch (std::exception& e) { this->_logger.LogFatal("YOLOOD::Destroy()", e.what(), __FILE__, __LINE__); return false; } } std::vector YOLOOD::RunInferenceFromYoloV5(const cv::Mat& input, const std::string& camera_id) { // Early validation - no need for try block if (input.empty() || input.data == nullptr) { return {}; } if (input.cols < 10 || input.rows < 10) { return {}; } try { // Color conversion only if needed cv::Mat frame; if (input.channels() == 1) 
{ cv::cvtColor(input, frame, cv::COLOR_GRAY2BGR); } else { frame = input; // Shallow copy (shares data, no allocation) } return detect(frame, _modelConfig.detectionScoreThreshold, _modelConfig.modelMNSThreshold); } catch (const std::exception& e) { this->_logger.LogFatal("YOLOOD::RunInferenceFromYoloV5", e.what(), __FILE__, __LINE__); return {}; } } std::vector YOLOOD::RunInferenceFromYoloV8(const cv::Mat& input, const std::string& camera_id) { // Early validation - before try block if (input.empty() || input.data == nullptr) { return {}; } if (input.cols < 10 || input.rows < 10) { return {}; } try { // Color conversion using reusable buffer cv::Mat* modelInputPtr; if (input.channels() == 1) { cv::cvtColor(input, this->_frameBuffer, cv::COLOR_GRAY2BGR); modelInputPtr = &this->_frameBuffer; } else { modelInputPtr = const_cast(&input); } cv::Mat& modelInput = *modelInputPtr; // Model shape and letterbox const cv::Size2f modelShape(_modelConfig.inpWidth, _modelConfig.inpHeight); if (_letterBoxForSquare && modelShape.width == modelShape.height) { modelInput = ANSUtilityHelper::FormatToSquare(modelInput); } // Create blob cv::Mat blob; cv::dnn::blobFromImage(modelInput, blob, 1.0 / 255.0, modelShape, cv::Scalar(), true, false); // Create input tensor - static MemoryInfo static const Ort::MemoryInfo memoryInfo = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault); const std::array inputShape = { 1, blob.size[1], blob.size[2], blob.size[3] }; Ort::Value inputTensor = Ort::Value::CreateTensor( memoryInfo, reinterpret_cast(blob.data), blob.total(), inputShape.data(), inputShape.size() ); // Run inference auto outputTensors = session.Run( Ort::RunOptions{ nullptr }, inputNodeNames.data(), &inputTensor, inputNodeNames.size(), outputNodeNames.data(), outputNodeNames.size() ); // Parse output const float* rawOutput = outputTensors[0].GetTensorData(); const std::vector outputShape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape(); const int dimensions = 
static_cast(outputShape[1]); // 4 + num_classes const int rows = static_cast(outputShape[2]); // 8400 const int numClasses = dimensions - 4; const int maxWidth = modelInput.cols; const int maxHeight = modelInput.rows; cv::Mat outputMat(dimensions, rows, CV_32F, const_cast(rawOutput)); cv::transpose(outputMat, outputMat); float* data = reinterpret_cast(outputMat.data); const float x_factor = static_cast(modelInput.cols) / modelShape.width; const float y_factor = static_cast(modelInput.rows) / modelShape.height; // Pre-allocate vectors std::vector class_ids; std::vector confidences; std::vector boxes; const size_t estimatedDetections = std::min(static_cast(rows / 10), size_t{ 100 }); class_ids.reserve(estimatedDetections); confidences.reserve(estimatedDetections); boxes.reserve(estimatedDetections); const float scoreThreshold = _modelConfig.detectionScoreThreshold; for (int i = 0; i < rows; ++i) { float* classes_scores = data + 4; cv::Mat scores(1, numClasses, CV_32FC1, classes_scores); cv::Point class_id; double maxClassScore; cv::minMaxLoc(scores, nullptr, &maxClassScore, nullptr, &class_id); if (maxClassScore >= scoreThreshold) { confidences.push_back(static_cast(maxClassScore)); class_ids.push_back(class_id.x); const float x = data[0]; const float y = data[1]; const float w = data[2]; const float h = data[3]; int left = static_cast((x - 0.5f * w) * x_factor); int top = static_cast((y - 0.5f * h) * y_factor); int width = static_cast(w * x_factor); int height = static_cast(h * y_factor); // Clamp to image bounds left = std::max(0, left); top = std::max(0, top); width = std::min(maxWidth - left - 5, width); height = std::min(maxHeight - top - 5, height); boxes.emplace_back(left, top, width, height); } data += dimensions; } // NMS std::vector nms_result; cv::dnn::NMSBoxes(boxes, confidences, _modelConfig.modelConfThreshold, _modelConfig.modelMNSThreshold, nms_result); // Build final detections std::vector detections; detections.reserve(nms_result.size()); const 
size_t classNameSize = _classes.size(); const int inputCols = input.cols; const int inputRows = input.rows; for (const int idx : nms_result) { if (confidences[idx] > scoreThreshold) { Object result; result.classId = class_ids[idx]; result.confidence = confidences[idx]; // Safe class name lookup if (classNameSize > 0) { const size_t safeIdx = static_cast(result.classId) < classNameSize ? static_cast(result.classId) : classNameSize - 1; result.className = _classes[safeIdx]; } else { result.className = "Unknown"; } result.box = boxes[idx]; result.polygon = ANSUtilityHelper::RectToNormalizedPolygon(result.box, inputCols, inputRows); result.cameraId = camera_id; detections.push_back(std::move(result)); } } return detections; } catch (const std::exception& e) { this->_logger.LogFatal("YOLOOD::RunInferenceFromYoloV8", e.what(), __FILE__, __LINE__); return {}; } } std::vector YOLOOD::RunInferenceFromYoloV11(const cv::Mat& input, [[maybe_unused]] const std::string& camera_id) { // Early validation - before try block if (input.empty() || input.data == nullptr) { this->_logger.LogError("YOLOOD::RunInferenceFromYoloV11", "Input image is empty or null", __FILE__, __LINE__); return {}; } if (input.cols < 10 || input.rows < 10) { this->_logger.LogError("YOLOOD::RunInferenceFromYoloV11", "Input image too small: " + std::to_string(input.cols) + "x" + std::to_string(input.rows), __FILE__, __LINE__); return {}; } const int channels = input.channels(); if (channels != 1 && channels != 3) { this->_logger.LogError("YOLOOD::RunInferenceFromYoloV11", "Unsupported channel count: " + std::to_string(channels), __FILE__, __LINE__); return {}; } try { // Color conversion if needed if (channels == 1) { cv::cvtColor(input, this->_frameBuffer, cv::COLOR_GRAY2BGR); return detectv11(this->_frameBuffer, _modelConfig.detectionScoreThreshold, _modelConfig.modelMNSThreshold); } return detectv11(input, _modelConfig.detectionScoreThreshold, _modelConfig.modelMNSThreshold); } catch (const cv::Exception& 
e) { this->_logger.LogFatal("YOLOOD::RunInferenceFromYoloV11", "OpenCV error: " + std::string(e.what()), __FILE__, __LINE__); } catch (const std::exception& e) { this->_logger.LogFatal("YOLOOD::RunInferenceFromYoloV11", "Standard error: " + std::string(e.what()), __FILE__, __LINE__); } return {}; } std::vector YOLOOD::RunInferenceFromONNX(const cv::Mat& input, const std::string& camera_id) { // Early validation - before try block if (input.empty() || input.data == nullptr) { return {}; } if (input.cols < 10 || input.rows < 10) { return {}; } try { // Color conversion using reusable buffer cv::Mat* modelInputPtr; if (input.channels() == 1) { cv::cvtColor(input, this->_frameBuffer, cv::COLOR_GRAY2BGR); modelInputPtr = &this->_frameBuffer; } else { modelInputPtr = const_cast(&input); // Safe: we don't modify it } cv::Mat& modelInput = *modelInputPtr; // Model shape const cv::Size2f modelShape(_modelConfig.inpWidth, _modelConfig.inpHeight); if (_letterBoxForSquare && modelShape.width == modelShape.height) { modelInput = ANSUtilityHelper::FormatToSquare(modelInput); } // Blob creation and inference cv::Mat blob; cv::dnn::blobFromImage(modelInput, blob, 1.0 / 255.0, modelShape, cv::Scalar(), true, false); _net->setInput(blob); std::vector outputs; _net->forward(outputs, _net->getUnconnectedOutLayersNames()); // Parse output dimensions const int rows = outputs[0].size[2]; const int dimensions = outputs[0].size[1]; const int maxWidth = modelInput.cols; const int maxHeight = modelInput.rows; outputs[0] = outputs[0].reshape(1, dimensions); cv::transpose(outputs[0], outputs[0]); float* data = reinterpret_cast(outputs[0].data); const float x_factor = static_cast(modelInput.cols) / modelShape.width; const float y_factor = static_cast(modelInput.rows) / modelShape.height; // Pre-allocate with estimated capacity std::vector class_ids; std::vector confidences; std::vector boxes; const size_t estimatedDetections = std::min(static_cast(rows / 10), size_t{ 100 }); 
class_ids.reserve(estimatedDetections); confidences.reserve(estimatedDetections); boxes.reserve(estimatedDetections); const size_t numClasses = _classes.size(); const float scoreThreshold = _modelConfig.detectionScoreThreshold; for (int i = 0; i < rows; ++i) { float* classes_scores = data + 4; cv::Mat scores(1, static_cast(numClasses), CV_32FC1, classes_scores); cv::Point class_id; double maxClassScore; cv::minMaxLoc(scores, nullptr, &maxClassScore, nullptr, &class_id); if (maxClassScore >= scoreThreshold) { confidences.push_back(static_cast(maxClassScore)); class_ids.push_back(class_id.x); const float x = data[0]; const float y = data[1]; const float w = data[2]; const float h = data[3]; int left = static_cast((x - 0.5f * w) * x_factor); int top = static_cast((y - 0.5f * h) * y_factor); int width = static_cast(w * x_factor); int height = static_cast(h * y_factor); // Clamp to image bounds left = std::max(0, left); top = std::max(0, top); width = std::min(maxWidth - left - 5, width); height = std::min(maxHeight - top - 5, height); boxes.emplace_back(left, top, width, height); } data += dimensions; } // NMS std::vector nms_result; cv::dnn::NMSBoxes(boxes, confidences, _modelConfig.modelConfThreshold, _modelConfig.modelMNSThreshold, nms_result); // Build final detections std::vector detections; detections.reserve(nms_result.size()); for (const int idx : nms_result) { if (confidences[idx] > scoreThreshold) { Object result; result.classId = class_ids[idx]; result.confidence = confidences[idx]; result.className = _classes[result.classId]; result.box = boxes[idx]; result.polygon = ANSUtilityHelper::RectToNormalizedPolygon(result.box, input.cols, input.rows); result.cameraId = camera_id; detections.push_back(std::move(result)); } } return detections; } catch (const std::exception& e) { this->_logger.LogFatal("YOLOOD::RunInferenceFromONNX", e.what(), __FILE__, __LINE__); return {}; } } std::vector YOLOOD::RunInferenceFromDarkNet(const cv::Mat& input, const std::string& 
camera_id) { // Early validation - before try block if (input.empty() || input.data == nullptr) { return {}; } if (input.cols < 10 || input.rows < 10) { return {}; } try { // Color conversion using reusable buffer cv::Mat* modelInputPtr; if (input.channels() == 1) { cv::cvtColor(input, this->_frameBuffer, cv::COLOR_GRAY2BGR); modelInputPtr = &this->_frameBuffer; } else { modelInputPtr = const_cast(&input); } cv::Mat& modelInput = *modelInputPtr; // Model shape const cv::Size2f modelShape(_modelConfig.inpWidth, _modelConfig.inpHeight); if (_letterBoxForSquare && modelShape.width == modelShape.height) { modelInput = ANSUtilityHelper::FormatToSquare(modelInput); } // Setup detection model cv::dnn::DetectionModel model(*_net); model.setInputParams(1.0 / 255.0, cv::Size(static_cast(_modelConfig.inpHeight), static_cast(_modelConfig.inpHeight)), cv::Scalar(), true); // Run detection std::vector classIds; std::vector scores; std::vector boxes; model.detect(modelInput, classIds, scores, boxes, _modelConfig.detectionScoreThreshold, _modelConfig.modelMNSThreshold); // Build results std::vector detections; detections.reserve(classIds.size()); const float scoreThreshold = _modelConfig.detectionScoreThreshold; const int maxWidth = modelInput.cols; const int maxHeight = modelInput.rows; const int inputCols = input.cols; const int inputRows = input.rows; for (size_t i = 0; i < classIds.size(); ++i) { if (scores[i] > scoreThreshold) { Object result; result.classId = classIds[i]; result.confidence = scores[i]; result.className = _classes[result.classId]; // Clamp box to image bounds cv::Rect& box = boxes[i]; box.x = std::max(0, box.x); box.y = std::max(0, box.y); box.width = std::min(maxWidth - box.x - 5, box.width); box.height = std::min(maxHeight - box.y - 5, box.height); result.box = box; result.polygon = ANSUtilityHelper::RectToNormalizedPolygon(box, inputCols, inputRows); result.cameraId = camera_id; detections.push_back(std::move(result)); } } return detections; } catch (const 
std::exception& e) { this->_logger.LogFatal("YOLOOD::RunInferenceFromDarkNet", e.what(), __FILE__, __LINE__); return {}; } } void YOLOOD::SetEngineType() { try { EngineType engineType = ANSLicenseHelper::CheckHardwareInformation(); switch (engineType) { case EngineType::NVIDIA_GPU: // NVIDIA CUDA this->_logger.LogDebug("YOLOOD::SetEngineType", "Running on CUDA GPU", __FILE__, __LINE__); _net->setPreferableBackend(cv::dnn::DNN_BACKEND_CUDA); _net->setPreferableTarget(cv::dnn::DNN_TARGET_CUDA); break; default: // Normal CPU this->_logger.LogDebug("YOLOOD::SetEngineType", "Running on CPU", __FILE__, __LINE__); _net->setPreferableBackend(cv::dnn::DNN_BACKEND_OPENCV); _net->setPreferableTarget(cv::dnn::DNN_TARGET_CPU); break; } } catch (const std::exception& e) { this->_logger.LogFatal("YOLOOD::SetEngineType()", e.what(), __FILE__, __LINE__); } } void YOLOOD::LoadOnnxNetwork() { //For Yolov5 and Yolov8 try { _net = std::make_shared(cv::dnn::readNetFromONNX(_modelFilePath)); _isDarkNet = false; _modelConfig.inpHeight = 640; _modelConfig.inpWidth = 640; SetEngineType(); } catch (const std::exception& e) { this->_logger.LogFatal("YOLOOD::LoadOnnxNetwork()", e.what(), __FILE__, __LINE__); } } void YOLOOD::LoadDarknetNetwork() { try { // Retrieve the cfg file std::string modelCfgFile = CreateFilePath(_modelFolder, "test.cfg");//It is always test.cfg _net = std::make_shared(cv::dnn::readNetFromDarknet(modelCfgFile, _modelFilePath)); _modelConfig.inpHeight = 416; _modelConfig.inpWidth = 416; _isDarkNet = true; SetEngineType(); } catch (const std::exception& e) { this->_logger.LogFatal("YOLOOD::LoadDarknetNetwork()", e.what(), __FILE__, __LINE__); } } }