// ANSCORE/engines/ONNXEngine/ONNXEngine.cpp
// NOTE(review): the original capture began with file-viewer chrome
// ("Files", "Raw Blame History", a Unicode-ambiguity warning) that is not
// part of the source; it has been replaced by this header comment.
#include "ONNXEngine.h"
#include "EPLoader.h"
#include "Utility.h"
#include <algorithm>
#include <cstdlib>
#include <filesystem>
#include <fstream>
#include <limits>
#include <memory>
#include <system_error>
namespace ANSCENTER {
// ====================================================================
// BasicOrtHandler — constructors
// ====================================================================
std::string BasicOrtHandler::QueryModelInputName(const std::string& onnxPath)
{
try {
// Make sure the Ort API pointer is initialised in THIS DLL.
if (Ort::Global<void>::api_ == nullptr) {
Ort::InitApi(static_cast<const OrtApi*>(EPLoader::GetOrtApiRaw()));
}
Ort::Env env(ORT_LOGGING_LEVEL_ERROR, "QueryModelInputName");
Ort::SessionOptions opts;
opts.SetIntraOpNumThreads(1);
opts.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_DISABLE_ALL);
// Intentionally NOT attaching CUDA/TRT EP — CPU is fastest
// for a no-inference metadata read.
std::wstring wpath(onnxPath.begin(), onnxPath.end());
Ort::Session session(env, wpath.c_str(), opts);
Ort::AllocatorWithDefaultOptions alloc;
auto inName = session.GetInputNameAllocated(0, alloc);
return std::string(inName.get());
}
catch (const Ort::Exception& e) {
std::cerr << "[QueryModelInputName] ORT exception: "
<< e.what() << " (path=" << onnxPath << ")" << std::endl;
return "";
}
catch (const std::exception& e) {
std::cerr << "[QueryModelInputName] std exception: "
<< e.what() << " (path=" << onnxPath << ")" << std::endl;
return "";
}
}
// Construct a handler for the given model; the engine type is left as -1
// so initialize_handler() resolves it via EPLoader auto-detection.
// NOTE(review): log_id stores the caller's string buffer pointer — it
// dangles if the caller's string is destroyed before this handler;
// confirm all call sites pass long-lived strings.
BasicOrtHandler::BasicOrtHandler(const std::string& _onnx_path,
unsigned int _num_threads)
: log_id(_onnx_path.data()),
num_threads(_num_threads),
m_engineType(static_cast<EngineType>(-1)),
onnx_path_w(_onnx_path.begin(), _onnx_path.end()) // ← stored as member
{
onnx_path = onnx_path_w.c_str(); // ← safe, member owns storage
initialize_handler();
}
// Construct with an explicitly chosen execution provider (engineType)
// instead of EPLoader auto-detection.
// NOTE(review): log_id keeps a pointer into the caller's string — see the
// dangling-pointer caveat on the first constructor.
BasicOrtHandler::BasicOrtHandler(const std::string& _onnx_path,
EngineType engineType,
unsigned int _num_threads)
: log_id(_onnx_path.data()),
num_threads(_num_threads),
m_engineType(engineType),
onnx_path_w(_onnx_path.begin(), _onnx_path.end()) // ← stored as member
{
onnx_path = onnx_path_w.c_str(); // ← safe, member owns storage
initialize_handler();
}
// Construct with an explicit engine type AND per-handler options (TRT
// cache dir, FP16, profile shapes, cuDNN workspace policy, ...).
BasicOrtHandler::BasicOrtHandler(const std::string& _onnx_path,
EngineType engineType,
const OrtHandlerOptions& options,
unsigned int _num_threads)
: log_id(_onnx_path.data()),
num_threads(_num_threads),
m_engineType(engineType),
m_handlerOptions(options),
onnx_path_w(_onnx_path.begin(), _onnx_path.end())
{
onnx_path = onnx_path_w.c_str();
initialize_handler();
}
// Construct with per-handler options while letting EPLoader auto-detect
// the execution provider (engine type -1).
BasicOrtHandler::BasicOrtHandler(const std::string& _onnx_path,
const OrtHandlerOptions& options,
unsigned int _num_threads)
: log_id(_onnx_path.data()),
num_threads(_num_threads),
m_engineType(static_cast<EngineType>(-1)), // EPLoader auto-detect
m_handlerOptions(options),
onnx_path_w(_onnx_path.begin(), _onnx_path.end())
{
onnx_path = onnx_path_w.c_str();
initialize_handler();
}
BasicOrtHandler::~BasicOrtHandler()
{
    // Tear down in reverse order of creation. `delete nullptr` is a
    // no-op, so no null checks are needed; nulling the members afterwards
    // keeps any accidental re-entry harmless.
    delete ort_session;
    ort_session = nullptr;
    delete memory_info_handler;
    memory_info_handler = nullptr;
    delete ort_env;
    ort_env = nullptr;
}
// ====================================================================
// EP appenders
// ====================================================================
// Attach the CUDA execution provider to `session_options`.
// Returns true on success, false on any ORT failure (caller falls back).
//
// Fixes over the previous version:
//  1. The OrtStatus* returned by CreateCUDAProviderOptions /
//     UpdateCUDAProviderOptions was silently dropped, so failures there
//     went undetected — they now go through Ort::ThrowOnError.
//  2. The options struct leaked when Update/Append threw before
//     ReleaseCUDAProviderOptions ran — it is now owned by a unique_ptr
//     with a custom deleter, released on every path.
bool BasicOrtHandler::TryAppendCUDA(Ort::SessionOptions& session_options)
{
    try {
        OrtCUDAProviderOptionsV2* raw = nullptr;
        Ort::ThrowOnError(Ort::GetApi().CreateCUDAProviderOptions(&raw));
        auto releaser = [](OrtCUDAProviderOptionsV2* p) {
            Ort::GetApi().ReleaseCUDAProviderOptions(p);
        };
        std::unique_ptr<OrtCUDAProviderOptionsV2, decltype(releaser)>
            cuda_options(raw, releaser);
        // Memory-safe GPU configuration for multi-model environments:
        // - arena_extend_strategy = 1 (kSameAsRequested) to avoid
        //   pre-allocating huge GPU memory blocks that may exceed VRAM
        // - cudnn_conv_algo_search = HEURISTIC for faster session init
        // - cudnn_conv_use_max_workspace defaults to "0" to prevent
        //   CUDNN_BACKEND_API_FAILED when TRT engines already occupy
        //   most VRAM on the same GPU. OCR sub-models that need fast
        //   convs opt into "1" via OrtHandlerOptions::useMaxCudnnWorkspace
        // - gpu_mem_limit — cap ONNX Runtime's GPU memory arena to 2 GB
        //   so it doesn't compete with TensorRT for the remaining VRAM
        const char* maxWorkspace =
            m_handlerOptions.useMaxCudnnWorkspace ? "1" : "0";
        const char* keys[] = {
            "device_id",
            "arena_extend_strategy",
            "cudnn_conv_algo_search",
            "cudnn_conv_use_max_workspace",
            "gpu_mem_limit"
        };
        const char* values[] = {
            "0",
            "1",          // kSameAsRequested
            "HEURISTIC",  // avoid exhaustive algo search on large model
            maxWorkspace, // "1" for OCR (perf), "0" elsewhere (safety)
            "2147483648"  // 2 GB arena limit
        };
        Ort::ThrowOnError(Ort::GetApi().UpdateCUDAProviderOptions(
            cuda_options.get(), keys, values, 5));
        session_options.AppendExecutionProvider_CUDA_V2(*cuda_options);
        std::cout << "[ORT] CUDA EP attached (arena=SameAsRequested, "
            "cudnn=HEURISTIC, maxWorkspace=" << maxWorkspace
            << ", memLimit=2GB)." << std::endl;
        return true;
    }
    catch (const Ort::Exception& e) {
        std::cerr << "[ORT] CUDA EP failed: " << e.what() << std::endl;
        return false;
    }
}
// Attach the TensorRT execution provider with on-disk engine/timing
// caching and (optionally) dynamic-shape profiles.
// Returns true on success, false on any failure (caller falls back).
//
// Fixes over the previous version:
//  1. OrtStatus* results from Create/Update were silently dropped —
//     they now go through Ort::ThrowOnError.
//  2. The options struct leaked when Update/Append threw before
//     ReleaseTensorRTProviderOptions ran — now owned by a unique_ptr.
//  3. A dedicated "timing.cache" file path was computed but never used
//     (dead code); removed. NOTE(review): ORT receives cacheDir for
//     trt_timing_cache_path, which the EP treats as a location it keys
//     per GPU — confirm against the ORT TensorRT EP docs if changed.
bool BasicOrtHandler::TryAppendTensorRT(Ort::SessionOptions& session_options)
{
    try {
        OrtTensorRTProviderOptionsV2* raw = nullptr;
        Ort::ThrowOnError(Ort::GetApi().CreateTensorRTProviderOptions(&raw));
        auto releaser = [](OrtTensorRTProviderOptionsV2* p) {
            Ort::GetApi().ReleaseTensorRTProviderOptions(p);
        };
        std::unique_ptr<OrtTensorRTProviderOptionsV2, decltype(releaser)>
            trt_options(raw, releaser);
        // Cache built engines on disk so subsequent runs skip the
        // multi-minute build. Engines are keyed on (model hash, GPU
        // arch, shape profile) so changing any of those triggers
        // a rebuild automatically.
        std::string cacheDir = m_handlerOptions.trtEngineCacheDir;
        if (cacheDir.empty()) {
            // %TEMP%\ANSCENTER\TRTEngineCache
            const char* tmp = std::getenv("TEMP");
            if (!tmp) tmp = std::getenv("TMP");
            if (!tmp) tmp = ".";
            std::filesystem::path p(tmp);
            p /= "ANSCENTER";
            p /= "TRTEngineCache";
            std::error_code ec;
            std::filesystem::create_directories(p, ec);
            cacheDir = p.string();
        }
        // Builder options tuned for *fast first-run*:
        // - opt_level 1: builds in seconds, ~5-10 % runtime cost vs 3
        // - workspace 1 GB: leaves room for CUDA EP arena and the
        //   LPD's own TRT engine on the same GPU
        // - timing cache: persists kernel timings between runs so
        //   builds at new shapes get progressively faster
        // - profile shapes (if set): build ONE dynamic-shape
        //   engine that handles all (batch, width) combos instead
        //   of rebuilding per unique input. Critical for variable
        //   batch workloads — without this, TRT EP rebuilds every
        //   time runtime sees a new shape pair, causing 60-90 s
        //   hangs mid-stream.
        const bool haveProfile = !m_handlerOptions.trtProfileMinShapes.empty()
            && !m_handlerOptions.trtProfileOptShapes.empty()
            && !m_handlerOptions.trtProfileMaxShapes.empty();
        // Build the key/value arrays. We always set the first 8 keys;
        // the profile shapes are appended only when provided.
        std::vector<const char*> keys = {
            "device_id",
            "trt_fp16_enable",
            "trt_engine_cache_enable",
            "trt_engine_cache_path",
            "trt_max_workspace_size",
            "trt_builder_optimization_level",
            "trt_timing_cache_enable",
            "trt_timing_cache_path"
        };
        std::vector<const char*> values = {
            "0",
            m_handlerOptions.trtFP16 ? "1" : "0",
            "1",
            cacheDir.c_str(),
            "1073741824", // 1 GB build workspace
            "1",          // fast build (was "3")
            "1",
            cacheDir.c_str()
        };
        if (haveProfile) {
            keys.push_back("trt_profile_min_shapes");
            values.push_back(m_handlerOptions.trtProfileMinShapes.c_str());
            keys.push_back("trt_profile_opt_shapes");
            values.push_back(m_handlerOptions.trtProfileOptShapes.c_str());
            keys.push_back("trt_profile_max_shapes");
            values.push_back(m_handlerOptions.trtProfileMaxShapes.c_str());
        }
        Ort::ThrowOnError(Ort::GetApi().UpdateTensorRTProviderOptions(
            trt_options.get(), keys.data(), values.data(), keys.size()));
        session_options.AppendExecutionProvider_TensorRT_V2(*trt_options);
        std::cout << "[ORT] TensorRT EP attached (fp16="
            << (m_handlerOptions.trtFP16 ? "1" : "0")
            << ", cache=" << cacheDir
            << ", profile=" << (haveProfile ? "dynamic" : "static")
            << ")." << std::endl;
        if (haveProfile) {
            std::cout << "[ORT] profile min: "
                << m_handlerOptions.trtProfileMinShapes << std::endl
                << "[ORT] profile opt: "
                << m_handlerOptions.trtProfileOptShapes << std::endl
                << "[ORT] profile max: "
                << m_handlerOptions.trtProfileMaxShapes << std::endl;
        }
        return true;
    }
    catch (const Ort::Exception& e) {
        std::cerr << "[ORT] TensorRT EP failed: " << e.what() << std::endl;
        return false;
    }
    catch (const std::exception& e) {
        std::cerr << "[ORT] TensorRT EP failed (std): " << e.what() << std::endl;
        return false;
    }
}
bool BasicOrtHandler::TryAppendDirectML(Ort::SessionOptions& session_options)
{
    // Attach DirectML on adapter 0. AppendExecutionProvider("DML") is the
    // correct API for DirectML — there is no V2 variant, so the
    // string-based option map is intentional here.
    const std::unordered_map<std::string, std::string> dmlOptions{
        { "device_id", "0" }
    };
    try {
        session_options.AppendExecutionProvider("DML", dmlOptions);
    }
    catch (const Ort::Exception& e) {
        std::cerr << "[ORT] DirectML EP failed: " << e.what() << std::endl;
        return false;
    }
    std::cout << "[ORT] DirectML EP attached (device 0)." << std::endl;
    return true;
}
// Attach the OpenVINO execution provider, trying device configs in
// priority order (NPU+GPU → discrete GPU → second GPU → GPU+CPU) and
// returning on the first that attaches. Returns false if all fail.
bool BasicOrtHandler::TryAppendOpenVINO(Ort::SessionOptions& session_options)
{
// Use AppendExecutionProvider_OpenVINO_V2 instead of the generic string API,
// matching the pattern used in YOLOOD/YOLO12OD/ANSONNXCL etc.
// Try device configs in priority order, falling back gracefully.
//
// NPU availability is probed once per process. If AUTO:NPU,GPU fails on
// the first call, we skip it for all subsequent models to avoid repeated
// "Failed to load shared library" errors cluttering the log.
// NOTE(review): these probe flags are plain (non-atomic) function-local
// statics — assumed to only be touched from one initialisation thread;
// confirm if handlers can be constructed concurrently.
static bool s_npuProbed = false;
static bool s_npuAvailable = false;
// Fixed EP settings shared by every candidate device config.
const std::string precision = "FP16";
const std::string numberOfThreads = "4";
const std::string numberOfStreams = "4";
auto makeConfig = [&](const std::string& device) {
return std::unordered_map<std::string, std::string>{
{"device_type", device}, {"precision", precision},
{"num_of_threads", numberOfThreads}, {"num_streams", numberOfStreams},
{"enable_opencl_throttling", "False"}, {"enable_qdq_optimizer", "True"}
};
};
// Candidate configs, most preferred first.
std::vector<std::unordered_map<std::string, std::string>> try_configs;
// Only try NPU if it hasn't been probed yet or was previously available
if (!s_npuProbed || s_npuAvailable) {
try_configs.push_back(makeConfig("AUTO:NPU,GPU"));
}
try_configs.push_back(makeConfig("GPU.0"));
try_configs.push_back(makeConfig("GPU.1"));
try_configs.push_back(makeConfig("AUTO:GPU,CPU"));
// First config that attaches wins; a throwing config falls through to
// the next candidate.
for (const auto& config : try_configs) {
try {
session_options.AppendExecutionProvider_OpenVINO_V2(config);
const auto& device = config.at("device_type");
std::cout << "[ORT] OpenVINO EP attached ("
<< device << ", " << precision << ")." << std::endl;
ANS_DBG("OrtHandler", "OpenVINO EP attached: %s", device.c_str());
// If NPU config succeeded, mark it available
if (device.find("NPU") != std::string::npos) {
s_npuProbed = true;
s_npuAvailable = true;
}
return true;
}
catch (const Ort::Exception& e) {
const auto& device = config.at("device_type");
// If NPU config failed, remember so we skip it next time
if (device.find("NPU") != std::string::npos) {
if (!s_npuProbed) {
std::cout << "[ORT] NPU not available — skipping NPU configs for subsequent models." << std::endl;
ANS_DBG("OrtHandler", "NPU not available, will skip in future");
}
s_npuProbed = true;
s_npuAvailable = false;
} else {
std::cerr << "[ORT] OpenVINO EP failed for device "
<< device << ": " << e.what() << std::endl;
}
}
}
// Every candidate threw — the caller will fall back to CPU.
std::cerr << "[ORT] OpenVINO EP: all device configs failed." << std::endl;
return false;
}
// ====================================================================
// initialize_handler
// ====================================================================
// Resolve the execution provider, build session options, attach the EP,
// create the Ort::Session (with CPU fallback and special handling for
// large external-data models), then cache input/output node names and
// shapes for inference.
//
// Fixes over the previous version:
//  1. The "resolved engine" debug line tested m_engineType == -1 AFTER
//     m_engineType had been overwritten with the resolved value, so it
//     always reported "explicit". The auto-detect flag is now captured
//     before the assignment.
//  2. The working directory was only restored on the success path; any
//     exception out of session creation left the process CWD pointing at
//     the model directory. A small RAII guard now restores it on every
//     exit path.
void BasicOrtHandler::initialize_handler()
{
    ANS_DBG("OrtHandler", "initialize_handler: m_engineType=%d", static_cast<int>(m_engineType));
    const auto& epInfo = EPLoader::Current();
    ANS_DBG("OrtHandler", "initialize_handler: EPLoader type=%d dir=%s",
        static_cast<int>(epInfo.type), epInfo.libraryDir.c_str());
    if (Ort::Global<void>::api_ == nullptr)
        Ort::InitApi(static_cast<const OrtApi*>(EPLoader::GetOrtApiRaw()));
    std::cout << "[ORT] api_ = " << (void*)Ort::Global<void>::api_ << std::endl;
    // Capture auto-detect status BEFORE m_engineType is overwritten (fix #1).
    const bool autoDetected = (static_cast<int>(m_engineType) == -1);
    EngineType engine = autoDetected ? epInfo.type : m_engineType;
    // Persist the resolved engine type so subclasses (e.g. ONNXYOLO)
    // can branch on the actual EP at inference time (IoBinding for DML).
    m_engineType = engine;
    ANS_DBG("OrtHandler", "initialize_handler: resolved engine=%d (from %s)",
        static_cast<int>(engine),
        autoDetected ? "EPLoader" : "explicit");
    ort_env = new Ort::Env(ORT_LOGGING_LEVEL_ERROR, log_id);
    memory_info_handler = new Ort::MemoryInfo(
        Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault));
    Ort::SessionOptions session_options;
    session_options.SetIntraOpNumThreads(num_threads);
    // Start with full optimization — will be downgraded to DISABLE_ALL
    // later if we detect a large external data file (e.g. SAM3's 3.3 GB
    // .onnx_data). Normal small models keep ORT_ENABLE_ALL.
    session_options.SetGraphOptimizationLevel(
        GraphOptimizationLevel::ORT_ENABLE_ALL);
    session_options.SetLogSeverityLevel(4);
    // DirectML REQUIRES these two settings per ORT documentation:
    // - DisableMemPattern: DML manages its own memory; ORT's memory
    //   pattern optimization conflicts with DML's D3D12 allocator.
    // - ORT_SEQUENTIAL: DML uses a single command queue and cannot
    //   handle parallel execution mode — doing so causes deadlocks
    //   when synchronizing GPU→CPU data transfers.
    if (engine == EngineType::AMD_GPU) {
        session_options.DisableMemPattern();
        session_options.SetExecutionMode(ExecutionMode::ORT_SEQUENTIAL);
        // DirectML 1.15.4 (the latest; Microsoft has moved DirectML into
        // sustained engineering only) has a deterministic crash path in
        // amdkmdag.sys +0xf03d on RDNA2 iGPUs (Radeon 680M on Ryzen 6000)
        // when ORT_ENABLE_ALL applies layout-reorder transforms to
        // YOLO-style conv graphs. Downgrade to EXTENDED on DML: still
        // keeps constant folding and Conv+BN+ReLU fusion (the big wins),
        // drops the risky layout transforms. Perf impact on YOLO is
        // typically under 5%.
        session_options.SetGraphOptimizationLevel(
            GraphOptimizationLevel::ORT_ENABLE_EXTENDED);
        ANS_DBG("OrtHandler",
            "DirectML: DisableMemPattern + ORT_SEQUENTIAL + EXTENDED opt");
    }
    std::vector<std::string> available = Ort::GetAvailableProviders();
    std::cout << "[ORT] Available providers: ";
    for (auto& p : available) std::cout << p << " ";
    std::cout << std::endl;
    auto hasProvider = [&](const std::string& name) -> bool {
        return std::find(available.begin(), available.end(), name)
            != available.end();
    };
    bool epAttached = false;
    switch (engine)
    {
    // --------------------------------------------------------
    case EngineType::NVIDIA_GPU:
        // Try TensorRT EP first when explicitly requested. Falls
        // through to CUDA EP if TRT is missing or option creation
        // fails. Both EPs may be attached at once — ORT picks TRT
        // for nodes it supports and CUDA for the rest.
        if (m_handlerOptions.preferTensorRT
            && hasProvider("TensorrtExecutionProvider")) {
            ANS_DBG("OrtHandler", "Trying TensorRT EP...");
            if (TryAppendTensorRT(session_options)) {
                epAttached = true;
            }
            else {
                std::cerr << "[ORT] TensorRT EP attach failed — "
                    "falling back to CUDA EP." << std::endl;
            }
        }
        ANS_DBG("OrtHandler", "Trying CUDA EP...");
        if (hasProvider("CUDAExecutionProvider")) {
            if (TryAppendCUDA(session_options)) {
                epAttached = true;
            }
        }
        if (!epAttached) {
            std::cerr << "[ORT] CUDA EP unavailable — falling back to CPU."
                << std::endl;
            ANS_DBG("OrtHandler", "CUDA EP FAILED — fallback to CPU");
        }
        break;
    // --------------------------------------------------------
    case EngineType::AMD_GPU:
        ANS_DBG("OrtHandler", "Trying DirectML EP...");
        if (hasProvider("DmlExecutionProvider"))
            epAttached = TryAppendDirectML(session_options);
        if (!epAttached) {
            std::cerr << "[ORT] DirectML EP unavailable — falling back to CPU."
                << std::endl;
            ANS_DBG("OrtHandler", "DirectML EP FAILED — fallback to CPU");
        }
        break;
    // --------------------------------------------------------
    case EngineType::OPENVINO_GPU:
        ANS_DBG("OrtHandler", "Trying OpenVINO EP...");
        if (hasProvider("OpenVINOExecutionProvider"))
            epAttached = TryAppendOpenVINO(session_options);
        if (!epAttached) {
            std::cerr << "[ORT] OpenVINO EP unavailable — falling back to CPU."
                << std::endl;
            ANS_DBG("OrtHandler", "OpenVINO EP FAILED — fallback to CPU");
        }
        break;
    // --------------------------------------------------------
    case EngineType::CPU:
    default:
        std::cout << "[ORT] Using CPU EP." << std::endl;
        ANS_DBG("OrtHandler", "Using CPU EP");
        epAttached = true;
        break;
    }
    if (!epAttached) {
        std::cout << "[ORT] Running on CPU EP (fallback)." << std::endl;
        ANS_DBG("OrtHandler", "EP not attached — running on CPU fallback");
    } else {
        ANS_DBG("OrtHandler", "EP attached successfully");
    }
    // ----------------------------------------------------------------
    // Create session
    // ----------------------------------------------------------------
    // ORT resolves external data files (e.g. .onnx_data) relative to
    // the CWD rather than the model file's directory. Temporarily
    // switch CWD so ORT can locate them.
    //
    // Additionally, ORT's internal memory-mapping of very large
    // external data files (>2 GB) can crash with an access violation
    // on Windows. When we detect a large .onnx_data file, we
    // pre-load it with standard file I/O and pass the buffer via
    // AddExternalInitializersFromFilesInMemory() so ORT never
    // memory-maps the file itself.
    // ----------------------------------------------------------------
    std::filesystem::path modelFsPath(onnx_path); // wchar_t*
    std::filesystem::path modelDir = modelFsPath.parent_path();
    std::filesystem::path prevCwd = std::filesystem::current_path();
    // RAII guard restoring the CWD on every exit path (fix #2). The
    // explicit restore further down still runs first on success; this
    // guard then restores the same path again, which is harmless.
    struct CwdRestorer {
        std::filesystem::path saved;
        ~CwdRestorer() {
            std::error_code ec;
            std::filesystem::current_path(saved, ec); // best-effort
        }
    } cwdRestorer{ prevCwd };
    if (!modelDir.empty() && std::filesystem::is_directory(modelDir)) {
        std::filesystem::current_path(modelDir);
        std::cout << "[ORT] CWD -> " << modelDir.string() << std::endl;
    }
    // --- Pre-load external data files if they exist -----------------
    // Keep the buffer alive across session creation (must outlive the
    // Ort::Session constructor call).
    std::vector<char> extDataBuffer;
    {
        // Build the expected external-data filename:
        //   <model_stem>.onnx_data  (e.g. anssam3.onnx_data)
        std::filesystem::path extDataPath =
            modelDir / (modelFsPath.stem().wstring() + L".onnx_data");
        if (std::filesystem::exists(extDataPath)) {
            auto fileSize = std::filesystem::file_size(extDataPath);
            std::cout << "[ORT] External data file found: "
                << extDataPath.string()
                << " (" << (fileSize / (1024*1024)) << " MB)" << std::endl;
            // Read entire file into memory with standard I/O.
            // This avoids ORT's internal memory-mapping which can crash
            // with access violation for files > 2 GB on Windows.
            try {
                std::ifstream ifs(extDataPath, std::ios::binary);
                if (!ifs) {
                    std::cerr << "[ORT] ERROR: Could not open external data file."
                        << std::endl;
                } else {
                    extDataBuffer.resize(static_cast<size_t>(fileSize));
                    std::cout << "[ORT] Reading external data into memory..."
                        << std::endl;
                    ifs.read(extDataBuffer.data(), static_cast<std::streamsize>(fileSize));
                    ifs.close();
                    std::cout << "[ORT] External data loaded ("
                        << extDataBuffer.size() << " bytes)." << std::endl;
                    // Tell ORT to use our in-memory buffer instead of
                    // memory-mapping the file.
                    std::vector<std::basic_string<ORTCHAR_T>> extFileNames = {
                        extDataPath.filename().wstring()
                    };
                    std::vector<char*> extBuffers = { extDataBuffer.data() };
                    std::vector<size_t> extLengths = { extDataBuffer.size() };
                    session_options.AddExternalInitializersFromFilesInMemory(
                        extFileNames, extBuffers, extLengths);
                    std::cout << "[ORT] External initializers registered."
                        << std::endl;
                    // Large external-data models crash ORT's CUDA graph
                    // optimization passes. Disable all optimization for
                    // these models only. Normal small models (SCRFD, YOLO,
                    // GlintArcFace, etc.) keep ORT_ENABLE_ALL.
                    session_options.SetGraphOptimizationLevel(
                        GraphOptimizationLevel::ORT_DISABLE_ALL);
                    std::cout << "[ORT] Graph optimization set to DISABLE_ALL "
                        "(large external data detected)." << std::endl;
                }
            }
            catch (const std::bad_alloc&) {
                std::cerr << "[ORT] WARNING: Could not allocate "
                    << (fileSize / (1024*1024)) << " MB for external data. "
                    << "Falling back to ORT file mapping." << std::endl;
                extDataBuffer.clear();
                extDataBuffer.shrink_to_fit();
            }
        }
    }
    // --- Load the .onnx model file into a memory buffer too ----------
    // This avoids ORT opening/mapping ANY files during CreateSession.
    std::vector<char> modelBuffer;
    bool useModelBuffer = false;
    if (!extDataBuffer.empty()) {
        // External data was pre-loaded, so also load the .onnx itself
        try {
            auto modelFileSize = std::filesystem::file_size(modelFsPath);
            modelBuffer.resize(static_cast<size_t>(modelFileSize));
            std::ifstream mifs(modelFsPath, std::ios::binary);
            if (mifs) {
                mifs.read(modelBuffer.data(), static_cast<std::streamsize>(modelFileSize));
                mifs.close();
                useModelBuffer = true;
                std::cout << "[ORT] Model proto loaded into memory ("
                    << modelBuffer.size() << " bytes)." << std::endl;
            }
        }
        catch (const std::exception& e) {
            std::cerr << "[ORT] WARNING: Could not read model file into memory: "
                << e.what() << ". Using file path." << std::endl;
        }
    }
    // --- Attempt session creation (with CUDA → CPU fallback) --------
    auto createSession = [&](Ort::SessionOptions& opts, const char* label) {
        std::cout << "[ORT] Creating session (" << label << ")..." << std::endl;
        if (useModelBuffer) {
            ort_session = new Ort::Session(*ort_env,
                modelBuffer.data(), modelBuffer.size(), opts);
        } else {
            ort_session = new Ort::Session(*ort_env, onnx_path, opts);
        }
        std::cout << "[ORT] Session created OK (" << label << ")." << std::endl;
    };
    ANS_DBG("OrtHandler", "Creating session for model: %ls", onnx_path);
    try {
        createSession(session_options, "primary EP");
        ANS_DBG("OrtHandler", "Session created OK with primary EP");
    }
    catch (const Ort::Exception& e) {
        ANS_DBG("OrtHandler", "Session FAILED with primary EP: %s", e.what());
        std::cerr << "[ORT] Session creation FAILED with primary EP: "
            << e.what() << std::endl;
        // If we were using a GPU EP, fall back to CPU
        if (engine != EngineType::CPU && epAttached) {
            ANS_DBG("OrtHandler", "Retrying with CPU fallback...");
            std::cerr << "[ORT] Retrying with CPU EP (fallback)..." << std::endl;
            // Build fresh session options — no GPU EP, no graph opt
            Ort::SessionOptions cpuOpts;
            cpuOpts.SetIntraOpNumThreads(num_threads);
            cpuOpts.SetGraphOptimizationLevel(
                GraphOptimizationLevel::ORT_DISABLE_ALL);
            cpuOpts.SetLogSeverityLevel(4);
            // Re-register the in-memory external data if we have it
            if (!extDataBuffer.empty()) {
                std::filesystem::path extDataPath =
                    modelDir / (modelFsPath.stem().wstring() + L".onnx_data");
                std::vector<std::basic_string<ORTCHAR_T>> extFileNames = {
                    extDataPath.filename().wstring()
                };
                std::vector<char*> extBuffers = { extDataBuffer.data() };
                std::vector<size_t> extLengths = { extDataBuffer.size() };
                cpuOpts.AddExternalInitializersFromFilesInMemory(
                    extFileNames, extBuffers, extLengths);
            }
            createSession(cpuOpts, "CPU fallback");
        } else {
            throw; // re-throw if already on CPU (CwdRestorer restores CWD)
        }
    }
    catch (const std::exception& e) {
        ANS_DBG("OrtHandler", "Session FAILED (std::exception): %s", e.what());
        std::cerr << "[ORT] Session creation FAILED (std::exception): "
            << e.what() << std::endl;
        throw; // CwdRestorer restores CWD
    }
    // Restore previous CWD & release buffers
    std::filesystem::current_path(prevCwd);
    extDataBuffer.clear();
    extDataBuffer.shrink_to_fit();
    modelBuffer.clear();
    modelBuffer.shrink_to_fit();
    Ort::Allocator allocator(*ort_session, *memory_info_handler);
    std::cout << "[ORT] Allocator created OK." << std::endl;
    // Input: cache name, dims and flattened element count (dynamic dims
    // <= 0 are skipped in the product).
    input_node_names.resize(1);
    input_node_names_.resize(1);
    input_node_names_[0] = OrtCompatiableGetInputName(0, allocator, ort_session);
    input_node_names[0] = input_node_names_[0].data();
    Ort::TypeInfo type_info = ort_session->GetInputTypeInfo(0);
    auto tensor_info = type_info.GetTensorTypeAndShapeInfo();
    input_tensor_size = 1;
    input_node_dims = tensor_info.GetShape();
    for (auto dim : input_node_dims) {
        if (dim > 0) input_tensor_size *= static_cast<size_t>(dim);
    }
    input_values_handler.resize(input_tensor_size);
    // Outputs: cache every output's name and shape.
    num_outputs = static_cast<int>(ort_session->GetOutputCount());
    output_node_names.resize(num_outputs);
    output_node_names_.resize(num_outputs);
    for (int i = 0; i < num_outputs; ++i) {
        output_node_names_[i] =
            OrtCompatiableGetOutputName(i, allocator, ort_session);
        output_node_names[i] = output_node_names_[i].data();
        output_node_dims.push_back(
            ort_session->GetOutputTypeInfo(i)
                .GetTensorTypeAndShapeInfo().GetShape());
    }
}
// ====================================================================
// GlintArcFace
// ====================================================================
Ort::Value GlintArcFace::transform(const cv::Mat& mat)
{
    // Preprocess one face crop: resize to the network's HxW, BGR→RGB,
    // float32, normalize in place, then wrap as a CHW tensor. A dynamic
    // batch dim (-1) is pinned to 1.
    if (mat.empty())
        throw std::runtime_error("GlintArcFace::transform — input is empty.");
    const int net_w = static_cast<int>(input_node_dims.at(3));
    const int net_h = static_cast<int>(input_node_dims.at(2));
    cv::Mat rgb;
    cv::resize(mat, rgb, cv::Size(net_w, net_h));
    cv::cvtColor(rgb, rgb, cv::COLOR_BGR2RGB);
    if (rgb.type() != CV_32FC3)
        rgb.convertTo(rgb, CV_32FC3);
    utils::transform::normalize_inplace(rgb, mean_val, scale_val);
    std::vector<int64_t> tensor_shape = input_node_dims;
    if (tensor_shape[0] == -1)
        tensor_shape[0] = 1;
    return utils::transform::create_tensor(
        rgb, tensor_shape, *memory_info_handler,
        input_values_handler, utils::transform::CHW);
}
// Preprocess every crop (resize → RGB → float32 → normalize) and pack the
// whole batch into one NCHW tensor.
// @param images input face crops; all must be non-empty
// @throws std::runtime_error on an empty batch or an empty image
//
// Rewritten to mirror GlintCosFace::transformBatch: a fresh mat per
// iteration instead of three scratch mats reused across iterations, and
// the redundant post-normalize clone removed (the mat is moved into the
// batch instead). Pixel output is identical.
Ort::Value GlintArcFace::transformBatch(const std::vector<cv::Mat>& images)
{
    if (images.empty())
        throw std::runtime_error("GlintArcFace::transformBatch — batch is empty.");
    const int width = input_node_dims.at(3);
    const int height = input_node_dims.at(2);
    std::vector<cv::Mat> batch;
    batch.reserve(images.size());
    for (const auto& mat : images) {
        if (mat.empty())
            throw std::runtime_error("GlintArcFace::transformBatch — empty image in batch.");
        cv::Mat canvas;
        cv::resize(mat, canvas, cv::Size(width, height));
        cv::cvtColor(canvas, canvas, cv::COLOR_BGR2RGB);
        if (canvas.type() != CV_32FC3)
            canvas.convertTo(canvas, CV_32FC3);
        utils::transform::normalize_inplace(canvas, mean_val, scale_val);
        batch.push_back(std::move(canvas));
    }
    std::vector<int64_t> shape = input_node_dims;
    shape[0] = static_cast<int64_t>(images.size());
    return utils::transform::create_tensor_batch(
        batch, shape, *memory_info_handler,
        input_values_handler, utils::transform::CHW);
}
void GlintArcFace::detect(const cv::Mat& mat, types::FaceContent& face_content)
{
    // Run a single face crop through the network and store the
    // L2-normalised embedding into face_content. No-op on empty input.
    if (mat.empty()) return;
    Ort::Value input_tensor = transform(mat);
    auto outputs = ort_session->Run(
        Ort::RunOptions{ nullptr },
        input_node_names.data(), &input_tensor, 1,
        output_node_names.data(), num_outputs);
    const auto dim =
        static_cast<unsigned int>(output_node_dims.at(0).at(1));
    const float* raw = outputs.at(0).GetTensorMutableData<float>();
    std::vector<float> emb(raw, raw + dim);
    cv::normalize(emb, emb);
    face_content.embedding = std::move(emb);
    face_content.dim = dim;
    face_content.flag = true;
}
// Batched variant of detect(): runs all crops through ONE session Run and
// L2-normalises each row of the [batch, hidden_dim] output.
// @param images input face crops (resized/converted in transformBatch)
// @param face_contents filled with one embedding per input image
// @throws Ort::Exception re-thrown after clearing any partial results
void GlintArcFace::detectBatch(const std::vector<cv::Mat>& images,
std::vector<types::FaceContent>& face_contents)
{
if (images.empty()) return;
const size_t batch_size = images.size();
face_contents.clear();
face_contents.reserve(batch_size);
try {
Ort::Value input_tensor = transformBatch(images);
auto output_tensors = ort_session->Run(
Ort::RunOptions{ nullptr },
input_node_names.data(), &input_tensor, 1,
output_node_names.data(), num_outputs);
// Release the batch-sized preprocessing buffer right after Run.
// NOTE(review): assumes utils::transform re-grows this buffer on the
// next create_tensor/create_tensor_batch call — confirm.
input_values_handler.clear();
input_values_handler.shrink_to_fit();
const float* vals = output_tensors[0].GetTensorData<float>();
const unsigned int hidden_dim =
static_cast<unsigned int>(output_node_dims.at(0).at(1));
face_contents.resize(batch_size);
for (size_t i = 0; i < batch_size; ++i) {
// Wrap row i of the output without copying, then L2-normalise.
cv::Mat emb_mat(1, hidden_dim, CV_32F,
const_cast<float*>(vals + i * hidden_dim));
cv::Mat emb_norm;
cv::normalize(emb_mat, emb_norm);
face_contents[i].embedding = std::vector<float>(
emb_norm.begin<float>(), emb_norm.end<float>());
face_contents[i].dim = hidden_dim;
face_contents[i].flag = true;
}
}
catch (const Ort::Exception&) {
// Never hand back a partially-filled batch.
face_contents.clear();
throw;
}
}
// ====================================================================
// GlintCosFace
// ====================================================================
Ort::Value GlintCosFace::transform(const cv::Mat& mat)
{
    // Preprocess one face crop: resize to net input, BGR→RGB, float32,
    // normalize, then wrap as a CHW tensor (dynamic batch pinned to 1).
    if (mat.empty())
        throw std::runtime_error("GlintCosFace::transform — input is empty.");
    const cv::Size net_size(input_node_dims.at(3), input_node_dims.at(2));
    cv::Mat rgb;
    cv::resize(mat, rgb, net_size);
    cv::cvtColor(rgb, rgb, cv::COLOR_BGR2RGB);
    rgb.convertTo(rgb, CV_32FC3);
    utils::transform::normalize_inplace(rgb, mean_val, scale_val);
    std::vector<int64_t> tensor_shape = input_node_dims;
    if (tensor_shape[0] == -1)
        tensor_shape[0] = 1;
    return utils::transform::create_tensor(
        rgb, tensor_shape, *memory_info_handler,
        input_values_handler, utils::transform::CHW);
}
Ort::Value GlintCosFace::transformBatch(const std::vector<cv::Mat>& images)
{
    // Preprocess each crop and pack the whole batch into one NCHW tensor.
    // Throws on an empty batch or any empty image.
    if (images.empty())
        throw std::runtime_error("GlintCosFace::transformBatch — batch is empty.");
    const cv::Size net_size(input_node_dims.at(3), input_node_dims.at(2));
    std::vector<cv::Mat> prepared;
    prepared.reserve(images.size());
    for (const auto& img : images) {
        if (img.empty())
            throw std::runtime_error("GlintCosFace::transformBatch — empty image in batch.");
        cv::Mat work;
        cv::resize(img, work, net_size);
        cv::cvtColor(work, work, cv::COLOR_BGR2RGB);
        work.convertTo(work, CV_32FC3);
        utils::transform::normalize_inplace(work, mean_val, scale_val);
        prepared.emplace_back(std::move(work));
    }
    std::vector<int64_t> tensor_shape = input_node_dims;
    tensor_shape[0] = static_cast<int64_t>(images.size());
    return utils::transform::create_tensor_batch(
        prepared, tensor_shape, *memory_info_handler,
        input_values_handler, utils::transform::CHW);
}
void GlintCosFace::detect(const cv::Mat& mat, types::FaceContent& face_content)
{
    // Single-crop embedding extraction with L2 normalisation.
    // No-op on empty input.
    if (mat.empty()) return;
    Ort::Value input_tensor = transform(mat);
    auto outputs = ort_session->Run(
        Ort::RunOptions{ nullptr },
        input_node_names.data(), &input_tensor, 1,
        output_node_names.data(), num_outputs);
    const auto dim =
        static_cast<unsigned int>(output_node_dims.at(0).at(1));
    const float* raw = outputs.at(0).GetTensorMutableData<float>();
    std::vector<float> emb(raw, raw + dim);
    cv::normalize(emb, emb);
    face_content.embedding = std::move(emb);
    face_content.dim = dim;
    face_content.flag = true;
}
void GlintCosFace::detectBatch(const std::vector<cv::Mat>& images,
    std::vector<types::FaceContent>& face_contents)
{
    // Batched embedding extraction: one session Run, then one
    // L2-normalised FaceContent per input image.
    if (images.empty()) return;
    face_contents.clear();
    face_contents.reserve(images.size());
    Ort::Value input_tensor = transformBatch(images);
    auto outputs = ort_session->Run(
        Ort::RunOptions{ nullptr },
        input_node_names.data(), &input_tensor, 1,
        output_node_names.data(), num_outputs);
    const float* raw = outputs.at(0).GetTensorMutableData<float>();
    const auto dim =
        static_cast<unsigned int>(output_node_dims.at(0).at(1));
    for (size_t i = 0; i < images.size(); ++i) {
        // Row i of the [batch, dim] output.
        const float* first = raw + i * dim;
        std::vector<float> emb(first, first + dim);
        cv::normalize(emb, emb);
        types::FaceContent fc;
        fc.embedding = std::move(emb);
        fc.dim = dim;
        fc.flag = true;
        face_contents.emplace_back(std::move(fc));
    }
}
// ====================================================================
// SCRFD — constructors
// ====================================================================
// Construct an SCRFD face detector with the default execution provider.
// @param _onnx_path   path to the SCRFD ONNX model file
// @param _num_threads intra-op thread count forwarded to ORT
SCRFD::SCRFD(const std::string& _onnx_path, unsigned int _num_threads)
    : BasicOrtHandler(_onnx_path, _num_threads)
{
    // Derive the FPN head layout from the output count discovered by the base class.
    initial_context();
}
// Construct an SCRFD face detector on a specific execution provider.
// @param _onnx_path   path to the SCRFD ONNX model file
// @param engineType   execution provider selection forwarded to the base handler
// @param _num_threads intra-op thread count forwarded to ORT
SCRFD::SCRFD(const std::string& _onnx_path,
    EngineType engineType,
    unsigned int _num_threads)
    : BasicOrtHandler(_onnx_path, engineType, _num_threads)
{
    // Derive the FPN head layout from the output count discovered by the base class.
    initial_context();
}
void SCRFD::initial_context()
{
    // Configure the FPN head layout from the number of model outputs
    // discovered at session-load time:
    //   6 outputs -> score + box heads per stride (no keypoints)
    //   9 outputs -> score + box + landmark heads per stride
    switch (num_outputs) {
    case 6:
        fmc = 3;
        feat_stride_fpn = { 8, 16, 32 };
        num_anchors = 2;
        use_kps = false;
        break;
    case 9:
        fmc = 3;
        feat_stride_fpn = { 8, 16, 32 };
        num_anchors = 2;
        use_kps = true;
        break;
    default:
        // Unknown layout: leave the members at their prior values,
        // matching the original behaviour.
        break;
    }
}
void SCRFD::resize_unscale(const cv::Mat& mat, cv::Mat& mat_rs,
    int target_height, int target_width,
    SCRFDScaleParams& scale_params)
{
    // Letterbox resize: shrink the image to fit the target canvas while
    // keeping aspect ratio, centre it, and pad the border with black.
    // The ratio and offsets are recorded in `scale_params` so detections
    // can later be mapped back to the source image.
    if (mat.empty()) return;

    const float scale = std::min(
        static_cast<float>(target_width) / mat.cols,
        static_cast<float>(target_height) / mat.rows);
    const int scaled_w = static_cast<int>(mat.cols * scale);
    const int scaled_h = static_cast<int>(mat.rows * scale);
    const int pad_x = (target_width - scaled_w) / 2;
    const int pad_y = (target_height - scaled_h) / 2;

    mat_rs = cv::Mat(target_height, target_width, CV_8UC3, cv::Scalar(0, 0, 0));
    cv::Mat shrunk;
    cv::resize(mat, shrunk, cv::Size(scaled_w, scaled_h));
    shrunk.copyTo(mat_rs(cv::Rect(pad_x, pad_y, scaled_w, scaled_h)));

    scale_params.ratio = scale;
    scale_params.dw = pad_x;
    scale_params.dh = pad_y;
    scale_params.flag = true;
}
Ort::Value SCRFD::transform(const cv::Mat& mat_rs)
{
    // BGR -> RGB, per-channel normalisation, then pack into a CHW
    // input tensor. Works on a copy so the caller's mat is untouched.
    cv::Mat rgb = mat_rs.clone();
    cv::cvtColor(rgb, rgb, cv::COLOR_BGR2RGB);
    utils::transform::normalize_inplace(rgb, mean_vals, scale_vals);
    return utils::transform::create_tensor(
        rgb, input_node_dims, *memory_info_handler,
        input_values_handler, utils::transform::CHW);
}
Ort::Value SCRFD::transformBatch(const std::vector<cv::Mat>& images)
{
    // Preprocess every image (resize -> RGB -> float -> normalise) and
    // pack them all into a single [N,C,H,W] tensor.
    // Throws std::runtime_error on an empty batch or an empty image.
    if (images.empty())
        throw std::runtime_error("SCRFD::transformBatch — batch is empty.");

    const cv::Size net_size(
        static_cast<int>(input_node_dims.at(3)),
        static_cast<int>(input_node_dims.at(2)));

    std::vector<cv::Mat> prepared;
    prepared.reserve(images.size());
    for (const auto& image : images) {
        if (image.empty())
            throw std::runtime_error("SCRFD::transformBatch — empty image in batch.");
        cv::Mat work;
        cv::resize(image, work, net_size);
        cv::cvtColor(work, work, cv::COLOR_BGR2RGB);
        work.convertTo(work, CV_32FC3);
        utils::transform::normalize_inplace(work, mean_vals, scale_vals);
        prepared.push_back(std::move(work));
    }

    // Same dims as a single input, with the batch dimension widened.
    std::vector<int64_t> batch_dims = input_node_dims;
    batch_dims[0] = static_cast<int64_t>(images.size());
    return utils::transform::create_tensor_batch(
        prepared, batch_dims, *memory_info_handler,
        input_values_handler, utils::transform::CHW);
}
void SCRFD::detect(const cv::Mat& mat,
    std::vector<types::BoxfWithLandmarks>& detected_boxes_kps,
    float score_threshold, float iou_threshold, unsigned int topk)
{
    // Full detection pipeline: letterbox -> forward pass -> per-stride
    // decoding -> NMS. Kept detections land in `detected_boxes_kps`.
    if (mat.empty()) return;

    const float src_h = static_cast<float>(mat.rows);
    const float src_w = static_cast<float>(mat.cols);
    const int net_h = static_cast<int>(input_node_dims.at(2));
    const int net_w = static_cast<int>(input_node_dims.at(3));

    SCRFDScaleParams scale_params;
    cv::Mat letterboxed;
    resize_unscale(mat, letterboxed, net_h, net_w, scale_params);

    Ort::Value input_tensor = transform(letterboxed);
    auto output_tensors = ort_session->Run(
        Ort::RunOptions{ nullptr },
        input_node_names.data(), &input_tensor, 1,
        output_node_names.data(), num_outputs);

    std::vector<types::BoxfWithLandmarks> candidates;
    generate_bboxes_kps(scale_params, candidates, output_tensors,
        score_threshold, src_h, src_w);
    nms_bboxes_kps(candidates, detected_boxes_kps, iou_threshold, topk);
}
void SCRFD::generate_points(int target_height, int target_width)
{
    // Lazily build the anchor-centre grid for every FPN stride.
    // Each grid cell is repeated `num_anchors` times so the point list
    // stays 1:1 with the flattened network predictions. The grid is
    // built once and cached (guarded by center_points_is_update).
    if (center_points_is_update) return;

    for (auto stride : feat_stride_fpn) {
        const unsigned int cols = target_width / stride;
        const unsigned int rows = target_height / stride;
        auto& points = center_points[stride];
        for (unsigned int row = 0; row < rows; ++row) {
            for (unsigned int col = 0; col < cols; ++col) {
                for (unsigned int a = 0; a < num_anchors; ++a) {
                    SCRFDPoint pt;
                    pt.cx = static_cast<float>(col);
                    pt.cy = static_cast<float>(row);
                    pt.stride = static_cast<float>(stride);
                    points.push_back(pt);
                }
            }
        }
    }
    center_points_is_update = true;
}
// Decode all strides' raw outputs into image-space detections.
// Output tensor ordering is stride-major: indices 0..2 are the score
// heads for strides 8/16/32, 3..5 the matching box heads, and — for
// 9-output models (use_kps) — 6..8 the keypoint heads.
// NOTE(review): this ordering is assumed from the exported model;
// confirm against the ONNX graph if the model is re-exported.
void SCRFD::generate_bboxes_kps(const SCRFDScaleParams& scale_params,
    std::vector<types::BoxfWithLandmarks>& bbox_kps_collection,
    std::vector<Ort::Value>& output_tensors,
    float score_threshold,
    float img_height, float img_width)
{
    // The anchor grid is generated for the network input size (cached
    // after the first call).
    const float input_height = static_cast<float>(input_node_dims.at(2));
    const float input_width = static_cast<float>(input_node_dims.at(3));
    generate_points(static_cast<int>(input_height),
        static_cast<int>(input_width));
    bbox_kps_collection.clear();
    if (use_kps) {
        // score / box / keypoint heads for each stride.
        generate_bboxes_kps_single_stride(scale_params,
            output_tensors.at(0), output_tensors.at(3), output_tensors.at(6),
            8, score_threshold, img_height, img_width, bbox_kps_collection);
        generate_bboxes_kps_single_stride(scale_params,
            output_tensors.at(1), output_tensors.at(4), output_tensors.at(7),
            16, score_threshold, img_height, img_width, bbox_kps_collection);
        generate_bboxes_kps_single_stride(scale_params,
            output_tensors.at(2), output_tensors.at(5), output_tensors.at(8),
            32, score_threshold, img_height, img_width, bbox_kps_collection);
    }
    else {
        // score / box heads only (6-output models).
        generate_bboxes_single_stride(scale_params,
            output_tensors.at(0), output_tensors.at(3),
            8, score_threshold, img_height, img_width, bbox_kps_collection);
        generate_bboxes_single_stride(scale_params,
            output_tensors.at(1), output_tensors.at(4),
            16, score_threshold, img_height, img_width, bbox_kps_collection);
        generate_bboxes_single_stride(scale_params,
            output_tensors.at(2), output_tensors.at(5),
            32, score_threshold, img_height, img_width, bbox_kps_collection);
    }
}
// Decode one stride's score/box heads into image-space boxes.
// The number of anchor points is taken from dim 1 of the score tensor;
// bbox_pred supplies 4 values per point (distances from the anchor
// centre, in stride units — presumably left/top/right/bottom; confirm
// against the SCRFD export). Decoded boxes are un-letterboxed via
// scale_params, clipped to the source image, and appended to the
// cross-stride collection.
void SCRFD::generate_bboxes_single_stride(
    const SCRFDScaleParams& scale_params,
    Ort::Value& score_pred, Ort::Value& bbox_pred,
    unsigned int stride, float score_threshold,
    float img_height, float img_width,
    std::vector<types::BoxfWithLandmarks>& bbox_kps_collection)
{
    // Allow proportionally more pre-NMS candidates at larger strides.
    unsigned int nms_pre_ = std::max(nms_pre, (stride / 8) * nms_pre);
    auto stride_dims = score_pred.GetTypeInfo().GetTensorTypeAndShapeInfo().GetShape();
    const unsigned int num_points = static_cast<unsigned int>(stride_dims.at(1));
    const float* score_ptr = score_pred.GetTensorMutableData<float>();
    const float* bbox_ptr = bbox_pred.GetTensorMutableData<float>();
    // Letterbox parameters used to map back to source-image coordinates.
    float ratio = scale_params.ratio;
    int dw = scale_params.dw;
    int dh = scale_params.dh;
    unsigned int count = 0;
    auto& stride_points = center_points[stride];
    for (unsigned int i = 0; i < num_points; ++i) {
        if (score_ptr[i] < score_threshold) continue;
        const auto& point = stride_points.at(i);
        const float* offsets = bbox_ptr + i * 4;
        // Distance-to-box decode, then undo padding/scale.
        float x1 = ((point.cx - offsets[0]) * point.stride - dw) / ratio;
        float y1 = ((point.cy - offsets[1]) * point.stride - dh) / ratio;
        float x2 = ((point.cx + offsets[2]) * point.stride - dw) / ratio;
        float y2 = ((point.cy + offsets[3]) * point.stride - dh) / ratio;
        types::BoxfWithLandmarks box_kps;
        // Clip to the source image bounds.
        box_kps.box.x1 = std::max(0.f, x1);
        box_kps.box.y1 = std::max(0.f, y1);
        box_kps.box.x2 = std::min(img_width - 1.f, x2);
        box_kps.box.y2 = std::min(img_height - 1.f, y2);
        box_kps.box.score = score_ptr[i];
        box_kps.box.label = 1;
        box_kps.box.label_text = "face";
        box_kps.box.flag = true;
        box_kps.flag = true;
        bbox_kps_collection.push_back(box_kps);
        // Hard cap on candidates accepted for this stride (note: the
        // break fires after max_nms has been exceeded, not reached).
        if (++count > max_nms) break;
    }
    // Truncate the whole running collection (all strides so far) to the
    // best nms_pre_ candidates by score.
    if (bbox_kps_collection.size() > nms_pre_) {
        std::sort(bbox_kps_collection.begin(), bbox_kps_collection.end(),
            [](const types::BoxfWithLandmarks& a, const types::BoxfWithLandmarks& b) {
                return a.box.score > b.box.score; });
        bbox_kps_collection.resize(nms_pre_);
    }
}
// Decode one stride's score/box/keypoint heads into image-space boxes
// with 5-point landmarks. Identical to generate_bboxes_single_stride
// plus keypoint decoding: kps_pred supplies 10 floats per anchor point
// (presumably 5 x/y offsets from the anchor centre — confirm against
// the SCRFD export).
void SCRFD::generate_bboxes_kps_single_stride(
    const SCRFDScaleParams& scale_params,
    Ort::Value& score_pred, Ort::Value& bbox_pred, Ort::Value& kps_pred,
    unsigned int stride, float score_threshold,
    float img_height, float img_width,
    std::vector<types::BoxfWithLandmarks>& bbox_kps_collection)
{
    // Allow proportionally more pre-NMS candidates at larger strides.
    unsigned int nms_pre_ = std::max(nms_pre, (stride / 8) * nms_pre);
    auto stride_dims = score_pred.GetTypeInfo().GetTensorTypeAndShapeInfo().GetShape();
    const unsigned int num_points = static_cast<unsigned int>(stride_dims.at(1));
    const float* score_ptr = score_pred.GetTensorMutableData<float>();
    const float* bbox_ptr = bbox_pred.GetTensorMutableData<float>();
    const float* kps_ptr = kps_pred.GetTensorMutableData<float>();
    // Letterbox parameters used to map back to source-image coordinates.
    float ratio = scale_params.ratio;
    int dw = scale_params.dw;
    int dh = scale_params.dh;
    unsigned int count = 0;
    auto& stride_points = center_points[stride];
    for (unsigned int i = 0; i < num_points; ++i) {
        if (score_ptr[i] < score_threshold) continue;
        const auto& point = stride_points.at(i);
        const float* offsets = bbox_ptr + i * 4;
        // Distance-to-box decode, then undo padding/scale.
        float x1 = ((point.cx - offsets[0]) * point.stride - dw) / ratio;
        float y1 = ((point.cy - offsets[1]) * point.stride - dh) / ratio;
        float x2 = ((point.cx + offsets[2]) * point.stride - dw) / ratio;
        float y2 = ((point.cy + offsets[3]) * point.stride - dh) / ratio;
        types::BoxfWithLandmarks box_kps;
        // Clip to the source image bounds.
        box_kps.box.x1 = std::max(0.f, x1);
        box_kps.box.y1 = std::max(0.f, y1);
        box_kps.box.x2 = std::min(img_width - 1.f, x2);
        box_kps.box.y2 = std::min(img_height - 1.f, y2);
        box_kps.box.score = score_ptr[i];
        box_kps.box.label = 1;
        box_kps.box.label_text = "face";
        box_kps.box.flag = true;
        // Decode the 5 landmark points, each clamped to image bounds.
        const float* kps_offsets = kps_ptr + i * 10;
        for (unsigned int j = 0; j < 10; j += 2) {
            cv::Point2f kp;
            kp.x = std::min(std::max(0.f,
                ((point.cx + kps_offsets[j]) * point.stride - dw) / ratio),
                img_width - 1.f);
            kp.y = std::min(std::max(0.f,
                ((point.cy + kps_offsets[j + 1]) * point.stride - dh) / ratio),
                img_height - 1.f);
            box_kps.landmarks.points.push_back(kp);
        }
        box_kps.landmarks.flag = true;
        box_kps.flag = true;
        bbox_kps_collection.push_back(box_kps);
        // Hard cap on candidates accepted for this stride (note: the
        // break fires after max_nms has been exceeded, not reached).
        if (++count > max_nms) break;
    }
    // Truncate the whole running collection (all strides so far) to the
    // best nms_pre_ candidates by score.
    if (bbox_kps_collection.size() > nms_pre_) {
        std::sort(bbox_kps_collection.begin(), bbox_kps_collection.end(),
            [](const types::BoxfWithLandmarks& a, const types::BoxfWithLandmarks& b) {
                return a.box.score > b.box.score; });
        bbox_kps_collection.resize(nms_pre_);
    }
}
void SCRFD::nms_bboxes_kps(std::vector<types::BoxfWithLandmarks>& input,
    std::vector<types::BoxfWithLandmarks>& output,
    float iou_threshold, unsigned int topk)
{
    // Greedy hard-NMS: repeatedly keep the best-scoring remaining box
    // and suppress every later box overlapping it above the threshold.
    // Stops once `topk` boxes have been kept. `input` is sorted in place.
    if (input.empty()) return;

    std::sort(input.begin(), input.end(),
        [](const types::BoxfWithLandmarks& a, const types::BoxfWithLandmarks& b) {
            return a.box.score > b.box.score; });

    const unsigned int total = static_cast<unsigned int>(input.size());
    std::vector<int> suppressed(total, 0);
    unsigned int kept = 0;
    for (unsigned int i = 0; i < total; ++i) {
        if (suppressed[i]) continue;
        suppressed[i] = 1;
        output.push_back(input[i]);
        for (unsigned int j = i + 1; j < total; ++j) {
            if (suppressed[j]) continue;
            if (input[i].box.iou_of(input[j].box) > iou_threshold)
                suppressed[j] = 1;
        }
        if (++kept >= topk) break;
    }
}
// ====================================================================
// MOVINET
// ====================================================================
// Construct a MoViNet action recogniser with default input parameters
// and the default execution provider.
MOVINET::MOVINET(const std::string& _onnx_path, unsigned int _num_threads)
    : BasicOrtHandler(_onnx_path, _num_threads)
{
    // Cache the model's input/output tensor names for Run().
    init_io_names();
}
// Construct a MoViNet action recogniser with explicit clip geometry.
// @param _temporal number of frames per clip (T)
// @param _width    frame width (W)
// @param _height   frame height (H)
// @param _channels colour channels per frame (C)
MOVINET::MOVINET(const std::string& _onnx_path,
    int _temporal, int _width, int _height, int _channels,
    unsigned int _num_threads)
    : BasicOrtHandler(_onnx_path, _num_threads)
{
    input_params.temporal = _temporal;
    input_params.width = _width;
    input_params.height = _height;
    input_params.channels = _channels;
    // Cache the model's input/output tensor names for Run().
    init_io_names();
}
// Construct a MoViNet action recogniser on a specific execution
// provider, keeping the default input parameters.
MOVINET::MOVINET(const std::string& _onnx_path,
    EngineType engineType,
    unsigned int _num_threads)
    : BasicOrtHandler(_onnx_path, engineType, _num_threads)
{
    // Cache the model's input/output tensor names for Run().
    init_io_names();
}
// Construct a MoViNet action recogniser on a specific execution
// provider with explicit clip geometry (T/W/H/C).
MOVINET::MOVINET(const std::string& _onnx_path,
    EngineType engineType,
    int _temporal, int _width, int _height, int _channels,
    unsigned int _num_threads)
    : BasicOrtHandler(_onnx_path, engineType, _num_threads)
{
    input_params.temporal = _temporal;
    input_params.width = _width;
    input_params.height = _height;
    input_params.channels = _channels;
    // Cache the model's input/output tensor names for Run().
    init_io_names();
}
// Cache the model's first input and output tensor names as std::string.
// GetInputNameAllocated returns an owning AllocatedStringPtr temporary;
// the std::string copy is taken before that temporary is destroyed at
// the end of the full expression, so this is lifetime-safe.
void MOVINET::init_io_names()
{
    Ort::AllocatorWithDefaultOptions allocator;
    _MoviNetInputName =
        ort_session->GetInputNameAllocated(0, allocator).get();
    _MoviNetOutputName =
        ort_session->GetOutputNameAllocated(0, allocator).get();
}
Ort::Value MOVINET::transform(const std::deque<cv::Mat>& frames)
{
    // Pack a clip of exactly `temporal` frames into a [1,C,T,H,W]
    // float tensor. Throws std::runtime_error on a size mismatch.
    if (frames.size() != static_cast<size_t>(input_params.temporal))
        throw std::runtime_error("MOVINET::transform — frame count != temporal length.");

    const std::vector<int64_t> clip_shape = {
        1,
        input_params.channels,
        input_params.temporal,
        input_params.height,
        input_params.width
    };
    auto cpu_mem = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);
    return utils::transform::create_video_tensor_5d(
        frames, clip_shape, cpu_mem, input_tensor_values);
}
std::pair<int, float> MOVINET::post_processing(const float* p)
{
    // Softmax over the raw logits (max-subtracted for numerical
    // stability), then return {argmax label, its probability}.
    const int num_classes = output_params.num_classes;
    const float peak = *std::max_element(p, p + num_classes);

    std::vector<float> probs(num_classes);
    float denom = 0.f;
    for (int i = 0; i < num_classes; ++i) {
        probs[i] = std::exp(p[i] - peak);
        denom += probs[i];
    }
    for (float& v : probs) v /= denom;

    const auto best = std::max_element(probs.begin(), probs.end());
    const int label = static_cast<int>(best - probs.begin());
    return { label, *best };
}
void MOVINET::inference(const std::deque<cv::Mat>& frames,
    std::pair<int, float>& out_result)
{
    // Run action recognition over a clip of frames. On an invalid
    // frame count the result is flagged as {-1, 0}.
    const bool valid_clip =
        !frames.empty() &&
        frames.size() == static_cast<size_t>(input_params.temporal);
    if (!valid_clip) {
        std::cerr << "[MOVINET] Invalid frame count." << std::endl;
        out_result = { -1, 0.f };
        return;
    }

    Ort::Value input_tensor = transform(frames);
    const char* in_names[] = { _MoviNetInputName.c_str() };
    const char* out_names[] = { _MoviNetOutputName.c_str() };
    auto outputs = ort_session->Run(
        Ort::RunOptions{ nullptr },
        in_names, &input_tensor, 1, out_names, 1);
    // Softmax + argmax over the single logits output.
    out_result = post_processing(outputs[0].GetTensorData<float>());
}
Ort::Value MOVINET::transform(const cv::Mat& mat)
{
    // Single-image convenience: replicate the frame `temporal` times to
    // form a static clip, then reuse the clip-based transform.
    std::deque<cv::Mat> clip;
    for (int t = 0; t < input_params.temporal; ++t)
        clip.push_back(mat.clone());
    return transform(clip);
}
// Build a clip from an arbitrary image batch by cycling through the
// images until `temporal` frames are collected.
// NOTE(review): with an empty `images`, `frames` stays empty and the
// deque-based transform() will throw (frame count != temporal) —
// confirm callers never pass an empty vector here.
Ort::Value MOVINET::transformBatch(const std::vector<cv::Mat>& images)
{
    std::deque<cv::Mat> frames;
    if (!images.empty()) {
        // i % images.size() repeats the batch cyclically to fill T frames.
        for (int i = 0; i < input_params.temporal; ++i)
            frames.push_back(images[i % images.size()].clone());
    }
    return transform(frames);
}
// ====================================================================
// utils::transform
// ====================================================================
Ort::Value ANSCENTER::utils::transform::create_tensor(
    const cv::Mat& mat,
    const std::vector<int64_t>& tensor_dims,
    const Ort::MemoryInfo& memory_info_handler,
    std::vector<float>& tensor_value_handler,
    unsigned int data_format)
{
    // Copy a single image into a 4-D ORT tensor: [1,C,H,W] for CHW or
    // [1,H,W,C] for HWC. Pixel data is staged in tensor_value_handler,
    // which must outlive the returned Ort::Value (CreateTensor wraps the
    // buffer, it does not copy). Returns a null Ort::Value on an empty
    // image, a non-4D dims vector, a batch dim != 1, or a channel
    // mismatch between the image and the dims.
    if (mat.empty() || tensor_dims.size() != 4 || tensor_dims.at(0) != 1)
        return Ort::Value(nullptr);
    const unsigned int channels = mat.channels();
    // Work in float32; convert only when necessary.
    cv::Mat mat_ref;
    if (mat.type() != CV_32FC(channels))
        mat.convertTo(mat_ref, CV_32FC(channels));
    else
        mat_ref = mat;
    if (data_format == CHW) {
        const unsigned int H = tensor_dims.at(2);
        const unsigned int W = tensor_dims.at(3);
        const unsigned int C = tensor_dims.at(1);
        if (C != channels) return Ort::Value(nullptr);
        const unsigned int total = C * H * W;
        tensor_value_handler.resize(total);
        cv::Mat resized;
        if (H != static_cast<unsigned int>(mat_ref.rows) ||
            W != static_cast<unsigned int>(mat_ref.cols))
            cv::resize(mat_ref, resized, cv::Size(W, H));
        else
            resized = mat_ref;
        // HWC -> CHW: cv::split always yields one continuous plane per
        // channel, so the per-plane memcpy below is safe.
        std::vector<cv::Mat> chans;
        cv::split(resized, chans);
        for (unsigned int c = 0; c < C; ++c)
            std::memcpy(tensor_value_handler.data() + c * H * W,
                chans[c].data, H * W * sizeof(float));
        return Ort::Value::CreateTensor<float>(
            memory_info_handler, tensor_value_handler.data(),
            total, tensor_dims.data(), tensor_dims.size());
    }
    // HWC layout: dims are [1,H,W,C].
    const unsigned int H = tensor_dims.at(1);
    const unsigned int W = tensor_dims.at(2);
    const unsigned int C = tensor_dims.at(3);
    const unsigned int total = C * H * W;
    if (C != channels) return Ort::Value(nullptr);
    tensor_value_handler.resize(total);
    cv::Mat resized;
    if (H != static_cast<unsigned int>(mat_ref.rows) ||
        W != static_cast<unsigned int>(mat_ref.cols))
        cv::resize(mat_ref, resized, cv::Size(W, H));
    else
        resized = mat_ref;
    // Bug fix: if the caller passed an ROI view already in CV_32FC
    // (no convertTo, no resize), `resized` can be non-continuous and a
    // flat memcpy would read row padding. Force a packed copy first.
    if (!resized.isContinuous())
        resized = resized.clone();
    std::memcpy(tensor_value_handler.data(), resized.data, total * sizeof(float));
    return Ort::Value::CreateTensor<float>(
        memory_info_handler, tensor_value_handler.data(),
        total, tensor_dims.data(), tensor_dims.size());
}
Ort::Value ANSCENTER::utils::transform::create_tensor_batch(
    const std::vector<cv::Mat>& batch_mats,
    const std::vector<int64_t>& tensor_dims,
    const Ort::MemoryInfo& memory_info_handler,
    std::vector<float>& tensor_value_handler,
    unsigned int data_format)
{
    // Copy a batch of images into a single 4-D ORT tensor whose dims
    // come from tensor_dims ([N,C,H,W]; for HWC the same buffer is
    // filled in the mats' native HWC order). The staging buffer must
    // outlive the returned Ort::Value (CreateTensor does not copy).
    // Returns a null Ort::Value on an empty batch, non-4D dims, an
    // empty image, or a channel mismatch.
    if (batch_mats.empty() || tensor_dims.size() != 4)
        return Ort::Value(nullptr);
    const size_t N = batch_mats.size();
    const unsigned int C = tensor_dims.at(1);
    const unsigned int H = tensor_dims.at(2);
    const unsigned int W = tensor_dims.at(3);
    const unsigned int image_size = C * H * W;
    const unsigned int total = static_cast<unsigned int>(N) * image_size;
    tensor_value_handler.resize(total);
    for (size_t b = 0; b < N; ++b) {
        const cv::Mat& mat = batch_mats[b];
        if (mat.empty() || static_cast<unsigned int>(mat.channels()) != C)
            return Ort::Value(nullptr);
        // Work in float32; convert only when necessary.
        cv::Mat mat_ref;
        if (mat.type() != CV_32FC(C))
            mat.convertTo(mat_ref, CV_32FC(C));
        else
            mat_ref = mat;
        cv::Mat resized;
        if (static_cast<unsigned int>(mat_ref.rows) != H ||
            static_cast<unsigned int>(mat_ref.cols) != W)
            cv::resize(mat_ref, resized, cv::Size(W, H));
        else
            resized = mat_ref;
        // Bug fix: both branches below access the pixel data as one flat
        // packed array (ptr<float>(0) with linear indexing / memcpy). A
        // non-continuous mat (e.g. an ROI view passed in already as
        // CV_32FC) would silently read row padding — force continuity.
        if (!resized.isContinuous())
            resized = resized.clone();
        const size_t batch_offset = b * image_size;
        if (data_format == CHW) {
            // Interleaved HWC -> planar CHW gather.
            const float* src = resized.ptr<float>(0);
            const size_t plane_size = H * W;
            for (unsigned int c = 0; c < C; ++c) {
                float* dst = tensor_value_handler.data() + batch_offset + c * plane_size;
                for (size_t i = 0; i < plane_size; ++i)
                    dst[i] = src[i * C + c];
            }
        }
        else {
            // HWC: layout already matches, straight copy.
            std::memcpy(tensor_value_handler.data() + batch_offset,
                resized.data, image_size * sizeof(float));
        }
    }
    return Ort::Value::CreateTensor<float>(
        memory_info_handler, tensor_value_handler.data(),
        total, tensor_dims.data(), tensor_dims.size());
}
// Pack T frames into a [1,C,T,H,W] float tensor (planar layout:
// channel-major, then time, then rows). Each frame is converted to
// float, resized to WxH if needed, and swapped BGR->RGB before the
// channel split. Throws on a bad dims vector or frame-count mismatch.
// NOTE(review): the 1/255 scaling is applied only when a frame is NOT
// already CV_32FC(C) — pre-converted float frames are packed as-is;
// confirm callers never mix the two conventions.
// NOTE(review): cvtColor(BGR2RGB) requires 3-channel input, so this
// effectively assumes C == 3 — verify against the model.
Ort::Value ANSCENTER::utils::transform::create_video_tensor_5d(
    const std::deque<cv::Mat>& frames,
    const std::vector<int64_t>& tensor_dims,
    const Ort::MemoryInfo& memory_info_handler,
    std::vector<float>& tensor_value_handler)
{
    if (tensor_dims.size() != 5 || tensor_dims[0] != 1)
        throw std::runtime_error("create_video_tensor_5d: expect [1,C,T,H,W]");
    const unsigned int C = tensor_dims[1];
    const unsigned int T = tensor_dims[2];
    const unsigned int H = tensor_dims[3];
    const unsigned int W = tensor_dims[4];
    if (frames.size() != T)
        throw std::runtime_error("create_video_tensor_5d: frame count != T");
    const size_t total = static_cast<size_t>(C) * T * H * W;
    tensor_value_handler.resize(total);
    for (unsigned int t = 0; t < T; ++t) {
        // Convert to float32 (with 1/255 scaling) only when needed.
        cv::Mat frame_ref;
        if (frames[t].type() != CV_32FC(C))
            frames[t].convertTo(frame_ref, CV_32FC(C), 1.0 / 255.0);
        else
            frame_ref = frames[t];
        cv::Mat resized;
        if (static_cast<unsigned int>(frame_ref.rows) != H ||
            static_cast<unsigned int>(frame_ref.cols) != W)
            cv::resize(frame_ref, resized, cv::Size(W, H));
        else
            resized = frame_ref;
        cv::cvtColor(resized, resized, cv::COLOR_BGR2RGB);
        // split() yields continuous per-channel planes, safe to memcpy.
        std::vector<cv::Mat> chans;
        cv::split(resized, chans);
        for (unsigned int c = 0; c < C; ++c) {
            // Destination offset: channel plane c, frame t within it.
            float* dst = tensor_value_handler.data()
                + c * (T * H * W) + t * (H * W);
            std::memcpy(dst, chans[c].data, H * W * sizeof(float));
        }
    }
    return Ort::Value::CreateTensor<float>(
        memory_info_handler, tensor_value_handler.data(),
        total, tensor_dims.data(), tensor_dims.size());
}
cv::Mat ANSCENTER::utils::transform::normalize(
    const cv::Mat& mat, float mean, float scale)
{
    // Return (mat - mean) * scale as a new float32 image; the input
    // itself is never modified.
    cv::Mat as_float;
    if (mat.type() == CV_32FC3) as_float = mat;
    else mat.convertTo(as_float, CV_32FC3);
    return (as_float - mean) * scale;
}
cv::Mat ANSCENTER::utils::transform::normalize(
    const cv::Mat& mat, const float* mean, const float* scale)
{
    // Per-channel (x - mean[c]) * scale[c]; always returns a fresh
    // float32 copy, leaving the input untouched.
    cv::Mat result;
    if (mat.type() == CV_32FC3) result = mat.clone();
    else mat.convertTo(result, CV_32FC3);
    for (int row = 0; row < result.rows; ++row) {
        cv::Vec3f* px = result.ptr<cv::Vec3f>(row);
        for (int col = 0; col < result.cols; ++col) {
            for (int c = 0; c < 3; ++c)
                px[col][c] = (px[col][c] - mean[c]) * scale[c];
        }
    }
    return result;
}
// Out-parameter wrapper around the scalar normalize(): writes
// (inmat - mean) * scale into outmat as float32.
void ANSCENTER::utils::transform::normalize(
    const cv::Mat& inmat, cv::Mat& outmat, float mean, float scale)
{
    outmat = ANSCENTER::utils::transform::normalize(inmat, mean, scale);
}
// In-place scalar normalisation: mat <- (mat - mean) * scale, after
// converting to float32 when needed.
void ANSCENTER::utils::transform::normalize_inplace(
    cv::Mat& mat_inplace, float mean, float scale)
{
    if (mat_inplace.type() != CV_32FC3)
        mat_inplace.convertTo(mat_inplace, CV_32FC3);
    // Delegates to the out-parameter overload with src == dst.
    ANSCENTER::utils::transform::normalize(mat_inplace, mat_inplace, mean, scale);
}
void ANSCENTER::utils::transform::normalize_inplace(
    cv::Mat& mat_inplace, const float* mean, const float* scale)
{
    // In-place per-channel normalisation: x <- (x - mean[c]) * scale[c],
    // converting to float32 first when needed.
    if (mat_inplace.type() != CV_32FC3)
        mat_inplace.convertTo(mat_inplace, CV_32FC3);
    for (int row = 0; row < mat_inplace.rows; ++row) {
        cv::Vec3f* px = mat_inplace.ptr<cv::Vec3f>(row);
        for (int col = 0; col < mat_inplace.cols; ++col) {
            for (int c = 0; c < 3; ++c)
                px[col][c] = (px[col][c] - mean[c]) * scale[c];
        }
    }
}
// ====================================================================
// BoundingBoxType template method implementations
// ====================================================================
// Intersection-over-union with another box (converted to this box's
// value/score types first). Width/height use the inclusive "+1" pixel
// convention, matching width()/height() below.
// NOTE(review): non-overlapping boxes return
// std::numeric_limits<value_type>::min() — for floating-point types a
// tiny positive number rather than 0. Threshold comparisons are
// unaffected, but exact-zero checks would be.
template<typename T1, typename T2>
template<typename O1, typename O2>
inline typename ANSCENTER::types::BoundingBoxType<T1, T2>::value_type
ANSCENTER::types::BoundingBoxType<T1, T2>::iou_of(
    const BoundingBoxType<O1, O2>& other) const
{
    auto tbox = other.template convert_type<value_type, score_type>();
    // Intersection rectangle corners.
    value_type ix1 = std::max(x1, tbox.x1);
    value_type iy1 = std::max(y1, tbox.y1);
    value_type ix2 = std::min(x2, tbox.x2);
    value_type iy2 = std::min(y2, tbox.y2);
    value_type iw = ix2 - ix1 + static_cast<value_type>(1);
    value_type ih = iy2 - iy1 + static_cast<value_type>(1);
    if (iw <= 0 || ih <= 0)
        return std::numeric_limits<value_type>::min();
    value_type inter = iw * ih;
    // IoU = intersection / union.
    return inter / (area() + tbox.area() - inter);
}
// The box as an integer cv::Rect (x, y, width, height).
template<typename T1, typename T2>
inline ::cv::Rect ANSCENTER::types::BoundingBoxType<T1, T2>::rect() const
{
    auto b = convert_type<int>();
    return ::cv::Rect(b.x1, b.y1, b.width(), b.height());
}
// Top-left corner as an integer point.
template<typename T1, typename T2>
inline ::cv::Point2i ANSCENTER::types::BoundingBoxType<T1, T2>::tl() const
{
    auto b = convert_type<int>();
    return ::cv::Point2i(b.x1, b.y1);
}
// Bottom-right corner as an integer point.
template<typename T1, typename T2>
inline ::cv::Point2i ANSCENTER::types::BoundingBoxType<T1, T2>::rb() const
{
    auto b = convert_type<int>();
    return ::cv::Point2i(b.x2, b.y2);
}
// Box width, inclusive of both edge pixels (hence the +1).
template<typename T1, typename T2>
inline typename ANSCENTER::types::BoundingBoxType<T1, T2>::value_type
ANSCENTER::types::BoundingBoxType<T1, T2>::width() const
{
    return x2 - x1 + static_cast<value_type>(1);
}
// Box height, inclusive of both edge pixels (hence the +1).
template<typename T1, typename T2>
inline typename ANSCENTER::types::BoundingBoxType<T1, T2>::value_type
ANSCENTER::types::BoundingBoxType<T1, T2>::height() const
{
    return y2 - y1 + static_cast<value_type>(1);
}
// Box area using the inclusive width/height convention; fabs guards
// against a degenerate (inverted) box producing a negative product.
template<typename T1, typename T2>
inline typename ANSCENTER::types::BoundingBoxType<T1, T2>::value_type
ANSCENTER::types::BoundingBoxType<T1, T2>::area() const
{
    return std::fabs(width() * height());
}
} // namespace ANSCENTER