// ONNX Runtime engine handlers — BasicOrtHandler and face-embedding models.
#include "ONNXEngine.h"
|
||
#include "EPLoader.h"
|
||
#include "Utility.h"
|
||
|
||
#include <algorithm>
|
||
#include <limits>
|
||
#include <filesystem>
|
||
#include <fstream>
|
||
#include <cstdlib>
|
||
#include <system_error>
|
||
|
||
namespace ANSCENTER {
|
||
|
||
// ====================================================================
|
||
// BasicOrtHandler — constructors
|
||
// ====================================================================
|
||
|
||
std::string BasicOrtHandler::QueryModelInputName(const std::string& onnxPath)
|
||
{
|
||
try {
|
||
// Make sure the Ort API pointer is initialised in THIS DLL.
|
||
if (Ort::Global<void>::api_ == nullptr) {
|
||
Ort::InitApi(static_cast<const OrtApi*>(EPLoader::GetOrtApiRaw()));
|
||
}
|
||
|
||
Ort::Env env(ORT_LOGGING_LEVEL_ERROR, "QueryModelInputName");
|
||
Ort::SessionOptions opts;
|
||
opts.SetIntraOpNumThreads(1);
|
||
opts.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_DISABLE_ALL);
|
||
// Intentionally NOT attaching CUDA/TRT EP — CPU is fastest
|
||
// for a no-inference metadata read.
|
||
|
||
std::wstring wpath(onnxPath.begin(), onnxPath.end());
|
||
Ort::Session session(env, wpath.c_str(), opts);
|
||
|
||
Ort::AllocatorWithDefaultOptions alloc;
|
||
auto inName = session.GetInputNameAllocated(0, alloc);
|
||
return std::string(inName.get());
|
||
}
|
||
catch (const Ort::Exception& e) {
|
||
std::cerr << "[QueryModelInputName] ORT exception: "
|
||
<< e.what() << " (path=" << onnxPath << ")" << std::endl;
|
||
return "";
|
||
}
|
||
catch (const std::exception& e) {
|
||
std::cerr << "[QueryModelInputName] std exception: "
|
||
<< e.what() << " (path=" << onnxPath << ")" << std::endl;
|
||
return "";
|
||
}
|
||
}
|
||
|
||
// Constructs a handler with engine auto-detection (m_engineType = -1
// lets EPLoader pick the EP in initialize_handler()).
//
// NOTE(review): log_id stores a pointer into the caller's string; it is
// used during initialize_handler() (Ort::Env copies the id) but may
// dangle afterwards — confirm it is not read later.
BasicOrtHandler::BasicOrtHandler(const std::string& _onnx_path,
                                 unsigned int _num_threads)
    : log_id(_onnx_path.data()),
      num_threads(_num_threads),
      m_engineType(static_cast<EngineType>(-1)),
      // std::filesystem::path does a proper narrow->wide conversion;
      // the former wstring(begin, end) byte-widening broke non-ASCII paths.
      onnx_path_w(std::filesystem::path(_onnx_path).wstring())
{
    onnx_path = onnx_path_w.c_str(); // safe: member owns the storage
    initialize_handler();
}
|
||
|
||
// Constructs a handler with an explicitly requested engine type.
//
// NOTE(review): log_id stores a pointer into the caller's string; it is
// used during initialize_handler() (Ort::Env copies the id) but may
// dangle afterwards — confirm it is not read later.
BasicOrtHandler::BasicOrtHandler(const std::string& _onnx_path,
                                 EngineType engineType,
                                 unsigned int _num_threads)
    : log_id(_onnx_path.data()),
      num_threads(_num_threads),
      m_engineType(engineType),
      // std::filesystem::path does a proper narrow->wide conversion;
      // the former wstring(begin, end) byte-widening broke non-ASCII paths.
      onnx_path_w(std::filesystem::path(_onnx_path).wstring())
{
    onnx_path = onnx_path_w.c_str(); // safe: member owns the storage
    initialize_handler();
}
|
||
|
||
// Constructs a handler with an explicit engine type and per-model
// tuning options (TRT cache dir, FP16, profile shapes, ...).
//
// NOTE(review): log_id stores a pointer into the caller's string; it is
// used during initialize_handler() (Ort::Env copies the id) but may
// dangle afterwards — confirm it is not read later.
BasicOrtHandler::BasicOrtHandler(const std::string& _onnx_path,
                                 EngineType engineType,
                                 const OrtHandlerOptions& options,
                                 unsigned int _num_threads)
    : log_id(_onnx_path.data()),
      num_threads(_num_threads),
      m_engineType(engineType),
      m_handlerOptions(options),
      // std::filesystem::path does a proper narrow->wide conversion;
      // the former wstring(begin, end) byte-widening broke non-ASCII paths.
      onnx_path_w(std::filesystem::path(_onnx_path).wstring())
{
    onnx_path = onnx_path_w.c_str(); // safe: member owns the storage
    initialize_handler();
}
|
||
|
||
// Constructs a handler with tuning options and engine auto-detection
// (m_engineType = -1 lets EPLoader pick the EP).
//
// NOTE(review): log_id stores a pointer into the caller's string; it is
// used during initialize_handler() (Ort::Env copies the id) but may
// dangle afterwards — confirm it is not read later.
BasicOrtHandler::BasicOrtHandler(const std::string& _onnx_path,
                                 const OrtHandlerOptions& options,
                                 unsigned int _num_threads)
    : log_id(_onnx_path.data()),
      num_threads(_num_threads),
      m_engineType(static_cast<EngineType>(-1)), // EPLoader auto-detect
      m_handlerOptions(options),
      // std::filesystem::path does a proper narrow->wide conversion;
      // the former wstring(begin, end) byte-widening broke non-ASCII paths.
      onnx_path_w(std::filesystem::path(_onnx_path).wstring())
{
    onnx_path = onnx_path_w.c_str(); // safe: member owns the storage
    initialize_handler();
}
|
||
|
||
// Tears down the ORT objects in reverse order of creation: the session
// must be destroyed before the environment it was created from.
// `delete nullptr` is a well-defined no-op, so no null checks are needed.
BasicOrtHandler::~BasicOrtHandler()
{
    delete ort_session;
    ort_session = nullptr;

    delete memory_info_handler;
    memory_info_handler = nullptr;

    delete ort_env;
    ort_env = nullptr;
}
|
||
|
||
// ====================================================================
|
||
// EP appenders
|
||
// ====================================================================
|
||
|
||
// Attaches the CUDA execution provider to `session_options`.
// Returns true on success, false (after logging) on failure.
bool BasicOrtHandler::TryAppendCUDA(Ort::SessionOptions& session_options)
{
    // Declared outside the try so the catch block can release it — the
    // original code leaked the options struct whenever Update/Append threw.
    OrtCUDAProviderOptionsV2* cuda_options = nullptr;
    try {
        Ort::GetApi().CreateCUDAProviderOptions(&cuda_options);

        // Memory-safe GPU configuration for multi-model environments:
        // - arena_extend_strategy = 1 (kSameAsRequested) to avoid
        //   pre-allocating huge GPU memory blocks that may exceed VRAM
        // - cudnn_conv_algo_search = HEURISTIC for faster session init
        // - cudnn_conv_use_max_workspace defaults to "0" to prevent
        //   CUDNN_BACKEND_API_FAILED when TRT engines already occupy
        //   most VRAM on the same GPU. OCR sub-models that need fast
        //   convs opt into "1" via OrtHandlerOptions::useMaxCudnnWorkspace
        // - gpu_mem_limit — cap ONNX Runtime's GPU memory arena to 2 GB
        //   so it doesn't compete with TensorRT for the remaining VRAM
        const char* maxWorkspace =
            m_handlerOptions.useMaxCudnnWorkspace ? "1" : "0";

        const char* keys[] = {
            "device_id",
            "arena_extend_strategy",
            "cudnn_conv_algo_search",
            "cudnn_conv_use_max_workspace",
            "gpu_mem_limit"
        };
        const char* values[] = {
            "0",
            "1",           // kSameAsRequested
            "HEURISTIC",   // avoid exhaustive algo search on large model
            maxWorkspace,  // "1" for OCR (perf), "0" elsewhere (safety)
            "2147483648"   // 2 GB arena limit
        };
        Ort::GetApi().UpdateCUDAProviderOptions(
            cuda_options, keys, values, 5);

        session_options.AppendExecutionProvider_CUDA_V2(*cuda_options);
        Ort::GetApi().ReleaseCUDAProviderOptions(cuda_options);
        cuda_options = nullptr;

        std::cout << "[ORT] CUDA EP attached (arena=SameAsRequested, "
                     "cudnn=HEURISTIC, maxWorkspace=" << maxWorkspace
                  << ", memLimit=2GB)." << std::endl;
        return true;
    }
    catch (const Ort::Exception& e) {
        // Release the options struct if we failed after creating it.
        if (cuda_options) {
            Ort::GetApi().ReleaseCUDAProviderOptions(cuda_options);
        }
        std::cerr << "[ORT] CUDA EP failed: " << e.what() << std::endl;
        return false;
    }
}
|
||
|
||
// Attaches the TensorRT execution provider to `session_options`,
// configuring on-disk engine/timing caches and (optionally) dynamic
// shape profiles. Returns true on success, false (after logging).
bool BasicOrtHandler::TryAppendTensorRT(Ort::SessionOptions& session_options)
{
    // Declared outside the try so the catch blocks can release it — the
    // original code leaked the options struct whenever Update/Append threw.
    OrtTensorRTProviderOptionsV2* trt_options = nullptr;
    try {
        Ort::GetApi().CreateTensorRTProviderOptions(&trt_options);

        // Cache built engines on disk so subsequent runs skip the
        // multi-minute build. Engines are keyed on (model hash, GPU
        // arch, shape profile) so changing any of those triggers
        // a rebuild automatically.
        std::string cacheDir = m_handlerOptions.trtEngineCacheDir;
        if (cacheDir.empty()) {
            // %TEMP%\ANSCENTER\TRTEngineCache
            const char* tmp = std::getenv("TEMP");
            if (!tmp) tmp = std::getenv("TMP");
            if (!tmp) tmp = ".";
            std::filesystem::path p(tmp);
            p /= "ANSCENTER";
            p /= "TRTEngineCache";
            std::error_code ec;
            std::filesystem::create_directories(p, ec);
            cacheDir = p.string();
        }

        // Builder options tuned for *fast first-run*:
        // - opt_level 1: builds in seconds, ~5–10 % runtime cost vs 3
        // - workspace 1 GB: leaves room for CUDA EP arena and the
        //   LPD's own TRT engine on the same GPU
        // - timing cache: persists kernel timings between runs so
        //   builds at new shapes get progressively faster. The cache
        //   path is a directory (same as the engine cache); ORT names
        //   the cache file itself. (An unused "timing.cache" file path
        //   the old code computed has been removed as dead code.)
        // - profile shapes (if set): build ONE dynamic-shape
        //   engine that handles all (batch, width) combos instead
        //   of rebuilding per unique input. Critical for variable
        //   batch workloads — without this, TRT EP rebuilds every
        //   time runtime sees a new shape pair, causing 60-90 s
        //   hangs mid-stream.
        const bool haveProfile = !m_handlerOptions.trtProfileMinShapes.empty()
                              && !m_handlerOptions.trtProfileOptShapes.empty()
                              && !m_handlerOptions.trtProfileMaxShapes.empty();

        // Build the key/value arrays. We always set the first 8 keys;
        // the profile shapes are appended only when provided.
        std::vector<const char*> keys = {
            "device_id",
            "trt_fp16_enable",
            "trt_engine_cache_enable",
            "trt_engine_cache_path",
            "trt_max_workspace_size",
            "trt_builder_optimization_level",
            "trt_timing_cache_enable",
            "trt_timing_cache_path"
        };
        std::vector<const char*> values = {
            "0",
            m_handlerOptions.trtFP16 ? "1" : "0",
            "1",
            cacheDir.c_str(),
            "1073741824",  // 1 GB build workspace
            "1",           // fast build (was "3")
            "1",
            cacheDir.c_str()
        };

        if (haveProfile) {
            keys.push_back("trt_profile_min_shapes");
            values.push_back(m_handlerOptions.trtProfileMinShapes.c_str());
            keys.push_back("trt_profile_opt_shapes");
            values.push_back(m_handlerOptions.trtProfileOptShapes.c_str());
            keys.push_back("trt_profile_max_shapes");
            values.push_back(m_handlerOptions.trtProfileMaxShapes.c_str());
        }

        Ort::GetApi().UpdateTensorRTProviderOptions(
            trt_options, keys.data(), values.data(), keys.size());

        session_options.AppendExecutionProvider_TensorRT_V2(*trt_options);
        Ort::GetApi().ReleaseTensorRTProviderOptions(trt_options);
        trt_options = nullptr;

        std::cout << "[ORT] TensorRT EP attached (fp16="
                  << (m_handlerOptions.trtFP16 ? "1" : "0")
                  << ", cache=" << cacheDir
                  << ", profile=" << (haveProfile ? "dynamic" : "static")
                  << ")." << std::endl;
        if (haveProfile) {
            std::cout << "[ORT] profile min: "
                      << m_handlerOptions.trtProfileMinShapes << std::endl
                      << "[ORT] profile opt: "
                      << m_handlerOptions.trtProfileOptShapes << std::endl
                      << "[ORT] profile max: "
                      << m_handlerOptions.trtProfileMaxShapes << std::endl;
        }
        return true;
    }
    catch (const Ort::Exception& e) {
        if (trt_options) {
            Ort::GetApi().ReleaseTensorRTProviderOptions(trt_options);
        }
        std::cerr << "[ORT] TensorRT EP failed: " << e.what() << std::endl;
        return false;
    }
    catch (const std::exception& e) {
        if (trt_options) {
            Ort::GetApi().ReleaseTensorRTProviderOptions(trt_options);
        }
        std::cerr << "[ORT] TensorRT EP failed (std): " << e.what() << std::endl;
        return false;
    }
}
|
||
// Attaches the DirectML execution provider (adapter 0) to the given
// session options. Returns true on success, false (after logging).
//
// AppendExecutionProvider("DML") is the correct API for DirectML —
// there is no V2 variant, so the string-based map is intentional here.
bool BasicOrtHandler::TryAppendDirectML(Ort::SessionOptions& session_options)
{
    try {
        const std::unordered_map<std::string, std::string> dmlConfig{
            { "device_id", "0" }
        };
        session_options.AppendExecutionProvider("DML", dmlConfig);
        std::cout << "[ORT] DirectML EP attached (device 0)." << std::endl;
        return true;
    }
    catch (const Ort::Exception& e) {
        std::cerr << "[ORT] DirectML EP failed: " << e.what() << std::endl;
        return false;
    }
}
|
||
// Attaches the OpenVINO execution provider, trying device configs in
// priority order (NPU -> discrete GPU -> iGPU -> GPU/CPU auto) and
// falling back gracefully. Returns true as soon as one config attaches;
// false if every config fails.
//
// Use AppendExecutionProvider_OpenVINO_V2 instead of the generic string API,
// matching the pattern used in YOLOOD/YOLO12OD/ANSONNXCL etc.
//
// NPU availability is probed once per process. If AUTO:NPU,GPU fails on
// the first call, we skip it for all subsequent models to avoid repeated
// "Failed to load shared library" errors cluttering the log.
//
// NOTE(review): the two probe flags below are plain (non-atomic)
// function-local statics that are mutated after initialization — not
// data-race safe if multiple handlers initialize concurrently. Confirm
// handler construction is single-threaded, or make these std::atomic.
bool BasicOrtHandler::TryAppendOpenVINO(Ort::SessionOptions& session_options)
{
    static bool s_npuProbed = false;     // has the NPU path been tried yet?
    static bool s_npuAvailable = false;  // did the NPU path succeed?

    // Fixed EP tuning shared by every device config below.
    const std::string precision = "FP16";
    const std::string numberOfThreads = "4";
    const std::string numberOfStreams = "4";

    // Builds one OpenVINO option map for the given device string.
    auto makeConfig = [&](const std::string& device) {
        return std::unordered_map<std::string, std::string>{
            {"device_type", device}, {"precision", precision},
            {"num_of_threads", numberOfThreads}, {"num_streams", numberOfStreams},
            {"enable_opencl_throttling", "False"}, {"enable_qdq_optimizer", "True"}
        };
    };

    std::vector<std::unordered_map<std::string, std::string>> try_configs;

    // Only try NPU if it hasn't been probed yet or was previously available
    if (!s_npuProbed || s_npuAvailable) {
        try_configs.push_back(makeConfig("AUTO:NPU,GPU"));
    }
    try_configs.push_back(makeConfig("GPU.0"));
    try_configs.push_back(makeConfig("GPU.1"));
    try_configs.push_back(makeConfig("AUTO:GPU,CPU"));

    for (const auto& config : try_configs) {
        try {
            session_options.AppendExecutionProvider_OpenVINO_V2(config);
            const auto& device = config.at("device_type");
            std::cout << "[ORT] OpenVINO EP attached ("
                      << device << ", " << precision << ")." << std::endl;
            ANS_DBG("OrtHandler", "OpenVINO EP attached: %s", device.c_str());

            // If NPU config succeeded, mark it available
            if (device.find("NPU") != std::string::npos) {
                s_npuProbed = true;
                s_npuAvailable = true;
            }
            return true;
        }
        catch (const Ort::Exception& e) {
            const auto& device = config.at("device_type");

            // If NPU config failed, remember so we skip it next time
            if (device.find("NPU") != std::string::npos) {
                if (!s_npuProbed) {
                    std::cout << "[ORT] NPU not available — skipping NPU configs for subsequent models." << std::endl;
                    ANS_DBG("OrtHandler", "NPU not available, will skip in future");
                }
                s_npuProbed = true;
                s_npuAvailable = false;
            } else {
                std::cerr << "[ORT] OpenVINO EP failed for device "
                          << device << ": " << e.what() << std::endl;
            }
        }
    }
    std::cerr << "[ORT] OpenVINO EP: all device configs failed." << std::endl;
    return false;
}
|
||
|
||
// ====================================================================
|
||
// initialize_handler
|
||
// ====================================================================
|
||
|
||
// One-time handler setup: resolves the engine type, builds session
// options, attaches the chosen execution provider (with CPU fallback),
// creates the Ort::Session (handling large external-data models), and
// caches input/output node names and shapes.
//
// NOTE(review): this function temporarily changes the PROCESS-WIDE
// current working directory — not safe if multiple handlers initialize
// concurrently, or if another thread reads relative paths meanwhile.
void BasicOrtHandler::initialize_handler()
{
    ANS_DBG("OrtHandler", "initialize_handler: m_engineType=%d", static_cast<int>(m_engineType));
    const auto& epInfo = EPLoader::Current();
    ANS_DBG("OrtHandler", "initialize_handler: EPLoader type=%d dir=%s",
            static_cast<int>(epInfo.type), epInfo.libraryDir.c_str());
    // Make sure the Ort API pointer is initialised in THIS DLL.
    if (Ort::Global<void>::api_ == nullptr)
        Ort::InitApi(static_cast<const OrtApi*>(EPLoader::GetOrtApiRaw()));

    std::cout << "[ORT] api_ = " << (void*)Ort::Global<void>::api_ << std::endl;

    // -1 means "no explicit engine requested" — use EPLoader's detection.
    EngineType engine = (static_cast<int>(m_engineType) == -1)
        ? epInfo.type : m_engineType;
    // Persist the resolved engine type so subclasses (e.g. ONNXYOLO)
    // can branch on the actual EP at inference time (IoBinding for DML).
    m_engineType = engine;
    // NOTE(review): m_engineType was just overwritten above, so the
    // ternary below is always false and this log always prints
    // "explicit" — capture the auto-detect flag before the assignment
    // if the distinction matters.
    ANS_DBG("OrtHandler", "initialize_handler: resolved engine=%d (from %s)",
            static_cast<int>(engine),
            (static_cast<int>(m_engineType) == -1) ? "EPLoader" : "explicit");

    ort_env = new Ort::Env(ORT_LOGGING_LEVEL_ERROR, log_id);
    memory_info_handler = new Ort::MemoryInfo(
        Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault));

    Ort::SessionOptions session_options;
    session_options.SetIntraOpNumThreads(num_threads);
    // Start with full optimization — will be downgraded to DISABLE_ALL
    // later if we detect a large external data file (e.g. SAM3's 3.3 GB
    // .onnx_data). Normal small models keep ORT_ENABLE_ALL.
    session_options.SetGraphOptimizationLevel(
        GraphOptimizationLevel::ORT_ENABLE_ALL);
    session_options.SetLogSeverityLevel(4);

    // DirectML REQUIRES these two settings per ORT documentation:
    // - DisableMemPattern: DML manages its own memory; ORT's memory
    //   pattern optimization conflicts with DML's D3D12 allocator.
    // - ORT_SEQUENTIAL: DML uses a single command queue and cannot
    //   handle parallel execution mode — doing so causes deadlocks
    //   when synchronizing GPU→CPU data transfers.
    if (engine == EngineType::AMD_GPU) {
        session_options.DisableMemPattern();
        session_options.SetExecutionMode(ExecutionMode::ORT_SEQUENTIAL);

        // DirectML 1.15.4 (the latest; Microsoft has moved DirectML into
        // sustained engineering only) has a deterministic crash path in
        // amdkmdag.sys +0xf03d on RDNA2 iGPUs (Radeon 680M on Ryzen 6000)
        // when ORT_ENABLE_ALL applies layout-reorder transforms to
        // YOLO-style conv graphs. Downgrade to EXTENDED on DML: still
        // keeps constant folding and Conv+BN+ReLU fusion (the big wins),
        // drops the risky layout transforms. Perf impact on YOLO is
        // typically under 5%.
        session_options.SetGraphOptimizationLevel(
            GraphOptimizationLevel::ORT_ENABLE_EXTENDED);

        ANS_DBG("OrtHandler",
                "DirectML: DisableMemPattern + ORT_SEQUENTIAL + EXTENDED opt");
    }

    // Log which providers this ORT build actually ships with.
    std::vector<std::string> available = Ort::GetAvailableProviders();
    std::cout << "[ORT] Available providers: ";
    for (auto& p : available) std::cout << p << " ";
    std::cout << std::endl;
    //std::cout << "[ORT] Selected engine : "
    //          << EPLoader::EngineTypeToString(engine) << std::endl;

    // True when the named provider is compiled into this ORT build.
    auto hasProvider = [&](const std::string& name) -> bool {
        return std::find(available.begin(), available.end(), name)
            != available.end();
    };

    bool epAttached = false;

    switch (engine)
    {
        // --------------------------------------------------------
    case EngineType::NVIDIA_GPU:
        // Try TensorRT EP first when explicitly requested. Falls
        // through to CUDA EP if TRT is missing or option creation
        // fails. Both EPs may be attached at once — ORT picks TRT
        // for nodes it supports and CUDA for the rest.
        if (m_handlerOptions.preferTensorRT
            && hasProvider("TensorrtExecutionProvider")) {
            ANS_DBG("OrtHandler", "Trying TensorRT EP...");
            if (TryAppendTensorRT(session_options)) {
                epAttached = true;
            }
            else {
                std::cerr << "[ORT] TensorRT EP attach failed — "
                             "falling back to CUDA EP." << std::endl;
            }
        }

        ANS_DBG("OrtHandler", "Trying CUDA EP...");
        if (hasProvider("CUDAExecutionProvider")) {
            if (TryAppendCUDA(session_options)) {
                epAttached = true;
            }
        }
        if (!epAttached) {
            std::cerr << "[ORT] CUDA EP unavailable — falling back to CPU."
                      << std::endl;
            ANS_DBG("OrtHandler", "CUDA EP FAILED — fallback to CPU");
        }
        break;

        // --------------------------------------------------------
    case EngineType::AMD_GPU:
        ANS_DBG("OrtHandler", "Trying DirectML EP...");
        if (hasProvider("DmlExecutionProvider"))
            epAttached = TryAppendDirectML(session_options);
        if (!epAttached) {
            std::cerr << "[ORT] DirectML EP unavailable — falling back to CPU."
                      << std::endl;
            ANS_DBG("OrtHandler", "DirectML EP FAILED — fallback to CPU");
        }
        break;

        // --------------------------------------------------------
    case EngineType::OPENVINO_GPU:
        ANS_DBG("OrtHandler", "Trying OpenVINO EP...");
        if (hasProvider("OpenVINOExecutionProvider"))
            epAttached = TryAppendOpenVINO(session_options);
        if (!epAttached) {
            std::cerr << "[ORT] OpenVINO EP unavailable — falling back to CPU."
                      << std::endl;
            ANS_DBG("OrtHandler", "OpenVINO EP FAILED — fallback to CPU");
        }
        break;

        // --------------------------------------------------------
    case EngineType::CPU:
    default:
        std::cout << "[ORT] Using CPU EP." << std::endl;
        ANS_DBG("OrtHandler", "Using CPU EP");
        epAttached = true;
        break;
    }

    if (!epAttached) {
        std::cout << "[ORT] Running on CPU EP (fallback)." << std::endl;
        ANS_DBG("OrtHandler", "EP not attached — running on CPU fallback");
    } else {
        ANS_DBG("OrtHandler", "EP attached successfully");
    }

    // ----------------------------------------------------------------
    // Create session
    // ----------------------------------------------------------------
    // ORT resolves external data files (e.g. .onnx_data) relative to
    // the CWD rather than the model file's directory. Temporarily
    // switch CWD so ORT can locate them.
    //
    // Additionally, ORT's internal memory-mapping of very large
    // external data files (>2 GB) can crash with an access violation
    // on Windows. When we detect a large .onnx_data file, we
    // pre-load it with standard file I/O and pass the buffer via
    // AddExternalInitializersFromFilesInMemory() so ORT never
    // memory-maps the file itself.
    // ----------------------------------------------------------------
    std::filesystem::path modelFsPath(onnx_path); // wchar_t*
    std::filesystem::path modelDir = modelFsPath.parent_path();
    std::filesystem::path prevCwd = std::filesystem::current_path();

    if (!modelDir.empty() && std::filesystem::is_directory(modelDir)) {
        std::filesystem::current_path(modelDir);
        std::cout << "[ORT] CWD -> " << modelDir.string() << std::endl;
    }

    // --- Pre-load external data files if they exist -----------------
    // Keep the buffer alive across session creation (must outlive the
    // Ort::Session constructor call).
    std::vector<char> extDataBuffer;
    {
        // Build the expected external-data filename:
        //   <model_stem>.onnx_data  (e.g. anssam3.onnx_data)
        std::filesystem::path extDataPath =
            modelDir / (modelFsPath.stem().wstring() + L".onnx_data");

        if (std::filesystem::exists(extDataPath)) {
            auto fileSize = std::filesystem::file_size(extDataPath);
            std::cout << "[ORT] External data file found: "
                      << extDataPath.string()
                      << " (" << (fileSize / (1024*1024)) << " MB)" << std::endl;

            // Read entire file into memory with standard I/O.
            // This avoids ORT's internal memory-mapping which can crash
            // with access violation for files > 2 GB on Windows.
            try {
                std::ifstream ifs(extDataPath, std::ios::binary);
                if (!ifs) {
                    std::cerr << "[ORT] ERROR: Could not open external data file."
                              << std::endl;
                } else {
                    extDataBuffer.resize(static_cast<size_t>(fileSize));
                    std::cout << "[ORT] Reading external data into memory..."
                              << std::endl;
                    ifs.read(extDataBuffer.data(), static_cast<std::streamsize>(fileSize));
                    ifs.close();
                    std::cout << "[ORT] External data loaded ("
                              << extDataBuffer.size() << " bytes)." << std::endl;

                    // Tell ORT to use our in-memory buffer instead of
                    // memory-mapping the file.
                    std::vector<std::basic_string<ORTCHAR_T>> extFileNames = {
                        extDataPath.filename().wstring()
                    };
                    std::vector<char*> extBuffers = { extDataBuffer.data() };
                    std::vector<size_t> extLengths = { extDataBuffer.size() };

                    session_options.AddExternalInitializersFromFilesInMemory(
                        extFileNames, extBuffers, extLengths);
                    std::cout << "[ORT] External initializers registered."
                              << std::endl;

                    // Large external-data models crash ORT's CUDA graph
                    // optimization passes. Disable all optimization for
                    // these models only. Normal small models (SCRFD, YOLO,
                    // GlintArcFace, etc.) keep ORT_ENABLE_ALL.
                    session_options.SetGraphOptimizationLevel(
                        GraphOptimizationLevel::ORT_DISABLE_ALL);
                    std::cout << "[ORT] Graph optimization set to DISABLE_ALL "
                                 "(large external data detected)." << std::endl;
                }
            }
            catch (const std::bad_alloc&) {
                // Not enough host RAM for the pre-load — let ORT map the
                // file itself and hope the size stays under the 2 GB issue.
                std::cerr << "[ORT] WARNING: Could not allocate "
                          << (fileSize / (1024*1024)) << " MB for external data. "
                          << "Falling back to ORT file mapping." << std::endl;
                extDataBuffer.clear();
                extDataBuffer.shrink_to_fit();
            }
        }
    }

    // --- Load the .onnx model file into a memory buffer too ----------
    // This avoids ORT opening/mapping ANY files during CreateSession.
    std::vector<char> modelBuffer;
    bool useModelBuffer = false;
    if (!extDataBuffer.empty()) {
        // External data was pre-loaded, so also load the .onnx itself
        try {
            auto modelFileSize = std::filesystem::file_size(modelFsPath);
            modelBuffer.resize(static_cast<size_t>(modelFileSize));
            std::ifstream mifs(modelFsPath, std::ios::binary);
            if (mifs) {
                mifs.read(modelBuffer.data(), static_cast<std::streamsize>(modelFileSize));
                mifs.close();
                useModelBuffer = true;
                std::cout << "[ORT] Model proto loaded into memory ("
                          << modelBuffer.size() << " bytes)." << std::endl;
            }
        }
        catch (const std::exception& e) {
            std::cerr << "[ORT] WARNING: Could not read model file into memory: "
                      << e.what() << ". Using file path." << std::endl;
        }
    }

    // --- Attempt session creation (with CUDA → CPU fallback) --------
    // Creates the session either from the in-memory model proto (when
    // external data was pre-loaded) or directly from the file path.
    auto createSession = [&](Ort::SessionOptions& opts, const char* label) {
        std::cout << "[ORT] Creating session (" << label << ")..." << std::endl;
        if (useModelBuffer) {
            ort_session = new Ort::Session(*ort_env,
                modelBuffer.data(), modelBuffer.size(), opts);
        } else {
            ort_session = new Ort::Session(*ort_env, onnx_path, opts);
        }
        std::cout << "[ORT] Session created OK (" << label << ")." << std::endl;
    };

    ANS_DBG("OrtHandler", "Creating session for model: %ls", onnx_path);
    try {
        createSession(session_options, "primary EP");
        ANS_DBG("OrtHandler", "Session created OK with primary EP");
    }
    catch (const Ort::Exception& e) {
        ANS_DBG("OrtHandler", "Session FAILED with primary EP: %s", e.what());
        std::cerr << "[ORT] Session creation FAILED with primary EP: "
                  << e.what() << std::endl;

        // If we were using a GPU EP, fall back to CPU
        if (engine != EngineType::CPU && epAttached) {
            ANS_DBG("OrtHandler", "Retrying with CPU fallback...");
            std::cerr << "[ORT] Retrying with CPU EP (fallback)..." << std::endl;

            // Build fresh session options — no GPU EP, no graph opt
            Ort::SessionOptions cpuOpts;
            cpuOpts.SetIntraOpNumThreads(num_threads);
            cpuOpts.SetGraphOptimizationLevel(
                GraphOptimizationLevel::ORT_DISABLE_ALL);
            cpuOpts.SetLogSeverityLevel(4);

            // Re-register the in-memory external data if we have it
            if (!extDataBuffer.empty()) {
                std::filesystem::path extDataPath =
                    modelDir / (modelFsPath.stem().wstring() + L".onnx_data");
                std::vector<std::basic_string<ORTCHAR_T>> extFileNames = {
                    extDataPath.filename().wstring()
                };
                std::vector<char*> extBuffers = { extDataBuffer.data() };
                std::vector<size_t> extLengths = { extDataBuffer.size() };
                cpuOpts.AddExternalInitializersFromFilesInMemory(
                    extFileNames, extBuffers, extLengths);
            }

            createSession(cpuOpts, "CPU fallback");
        } else {
            throw; // re-throw if already on CPU
        }
    }
    catch (const std::exception& e) {
        ANS_DBG("OrtHandler", "Session FAILED (std::exception): %s", e.what());
        std::cerr << "[ORT] Session creation FAILED (std::exception): "
                  << e.what() << std::endl;
        throw;
    }

    // Restore previous CWD & release buffers
    std::filesystem::current_path(prevCwd);
    extDataBuffer.clear();
    extDataBuffer.shrink_to_fit();
    modelBuffer.clear();
    modelBuffer.shrink_to_fit();
    Ort::Allocator allocator(*ort_session, *memory_info_handler);
    std::cout << "[ORT] Allocator created OK." << std::endl;

    // Input
    input_node_names.resize(1);
    input_node_names_.resize(1);
    input_node_names_[0] = OrtCompatiableGetInputName(0, allocator, ort_session);
    input_node_names[0] = input_node_names_[0].data();

    Ort::TypeInfo type_info = ort_session->GetInputTypeInfo(0);
    auto tensor_info = type_info.GetTensorTypeAndShapeInfo();
    input_tensor_size = 1;
    input_node_dims = tensor_info.GetShape();
    // Dynamic dims are reported as -1; skip them when sizing the
    // staging buffer (only positive extents multiply in).
    for (auto dim : input_node_dims) {
        if (dim > 0) input_tensor_size *= static_cast<size_t>(dim);
    }
    input_values_handler.resize(input_tensor_size);

    // Outputs
    num_outputs = static_cast<int>(ort_session->GetOutputCount());
    output_node_names.resize(num_outputs);
    output_node_names_.resize(num_outputs);
    for (int i = 0; i < num_outputs; ++i) {
        output_node_names_[i] =
            OrtCompatiableGetOutputName(i, allocator, ort_session);
        output_node_names[i] = output_node_names_[i].data();
        output_node_dims.push_back(
            ort_session->GetOutputTypeInfo(i)
                .GetTensorTypeAndShapeInfo().GetShape());
    }
}
|
||
|
||
// ====================================================================
|
||
// GlintArcFace
|
||
// ====================================================================
|
||
|
||
// Preprocesses one BGR image into a CHW float input tensor:
// resize -> RGB -> float32 -> normalize (mean/scale).
Ort::Value GlintArcFace::transform(const cv::Mat& mat)
{
    if (mat.empty())
        throw std::runtime_error("GlintArcFace::transform — input is empty.");

    const cv::Size target(input_node_dims.at(3), input_node_dims.at(2));
    cv::Mat prepared;
    cv::resize(mat, prepared, target);
    cv::cvtColor(prepared, prepared, cv::COLOR_BGR2RGB);
    if (prepared.type() != CV_32FC3)
        prepared.convertTo(prepared, CV_32FC3);

    utils::transform::normalize_inplace(prepared, mean_val, scale_val);

    // A dynamic batch dimension (-1) is pinned to 1 for single-image input.
    std::vector<int64_t> tensor_shape(input_node_dims);
    if (tensor_shape[0] == -1) tensor_shape[0] = 1;

    return utils::transform::create_tensor(
        prepared, tensor_shape, *memory_info_handler,
        input_values_handler, utils::transform::CHW);
}
|
||
|
||
// Preprocesses a batch of BGR images into one CHW float tensor whose
// leading dimension equals the batch size. Throws if the batch or any
// member image is empty.
Ort::Value GlintArcFace::transformBatch(const std::vector<cv::Mat>& images)
{
    if (images.empty())
        throw std::runtime_error("GlintArcFace::transformBatch — batch is empty.");

    const cv::Size target(input_node_dims.at(3), input_node_dims.at(2));

    std::vector<cv::Mat> prepared;
    prepared.reserve(images.size());

    // Scratch mats reused across iterations; each result is cloned
    // into `prepared` so the batch owns its own pixel data.
    cv::Mat resized, rgb, floated;
    for (const auto& img : images) {
        if (img.empty())
            throw std::runtime_error("GlintArcFace::transformBatch — empty image in batch.");
        cv::resize(img, resized, target);
        cv::cvtColor(resized, rgb, cv::COLOR_BGR2RGB);
        if (rgb.type() == CV_32FC3) floated = rgb.clone();
        else rgb.convertTo(floated, CV_32FC3);
        utils::transform::normalize_inplace(floated, mean_val, scale_val);
        prepared.push_back(floated.clone());
    }

    std::vector<int64_t> tensor_shape(input_node_dims);
    tensor_shape[0] = static_cast<int64_t>(images.size());

    return utils::transform::create_tensor_batch(
        prepared, tensor_shape, *memory_info_handler,
        input_values_handler, utils::transform::CHW);
}
|
||
|
||
// Computes an L2-normalized face embedding for a single image and
// fills `face_content` (embedding, dim, flag). No-op on empty input.
void GlintArcFace::detect(const cv::Mat& mat, types::FaceContent& face_content)
{
    if (mat.empty()) return;

    Ort::Value input_tensor = transform(mat);
    auto outputs = ort_session->Run(
        Ort::RunOptions{ nullptr },
        input_node_names.data(), &input_tensor, 1,
        output_node_names.data(), num_outputs);

    // Embedding length comes from output 0's second dimension.
    const auto hidden_dim =
        static_cast<unsigned int>(output_node_dims.at(0).at(1));
    const float* raw = outputs.at(0).GetTensorMutableData<float>();

    std::vector<float> embedding(raw, raw + hidden_dim);
    cv::normalize(embedding, embedding);

    face_content.embedding = std::move(embedding);
    face_content.dim = hidden_dim;
    face_content.flag = true;
}
|
||
|
||
// Runs the embedding model on a whole batch in one session call and
// fills one L2-normalized FaceContent per input image. On ORT failure
// the output vector is cleared and the exception re-thrown.
void GlintArcFace::detectBatch(const std::vector<cv::Mat>& images,
                               std::vector<types::FaceContent>& face_contents)
{
    if (images.empty()) return;

    const size_t batch_size = images.size();
    face_contents.clear();
    face_contents.reserve(batch_size);

    try {
        Ort::Value batch_tensor = transformBatch(images);
        auto outputs = ort_session->Run(
            Ort::RunOptions{ nullptr },
            input_node_names.data(), &batch_tensor, 1,
            output_node_names.data(), num_outputs);

        // The staging buffer is only needed while Run() executes;
        // release it before copying the embeddings out.
        input_values_handler.clear();
        input_values_handler.shrink_to_fit();

        const float* raw = outputs[0].GetTensorData<float>();
        const auto hidden_dim =
            static_cast<unsigned int>(output_node_dims.at(0).at(1));

        face_contents.resize(batch_size);
        for (size_t i = 0; i < batch_size; ++i) {
            // Wrap row i of the output without copying, then L2-normalize
            // into a separate mat.
            cv::Mat row(1, hidden_dim, CV_32F,
                        const_cast<float*>(raw + i * hidden_dim));
            cv::Mat normalized;
            cv::normalize(row, normalized);

            face_contents[i].embedding = std::vector<float>(
                normalized.begin<float>(), normalized.end<float>());
            face_contents[i].dim = hidden_dim;
            face_contents[i].flag = true;
        }
    }
    catch (const Ort::Exception&) {
        face_contents.clear();
        throw;
    }
}
|
||
|
||
// ====================================================================
|
||
// GlintCosFace
|
||
// ====================================================================
|
||
|
||
Ort::Value GlintCosFace::transform(const cv::Mat& mat)
{
    // Preprocess one BGR image into a CHW float tensor:
    // resize → BGR→RGB → float32 → (x - mean) * scale.
    if (mat.empty())
        throw std::runtime_error("GlintCosFace::transform — input is empty.");

    const cv::Size net_size(input_node_dims.at(3), input_node_dims.at(2));
    cv::Mat canvas;
    cv::resize(mat, canvas, net_size);
    cv::cvtColor(canvas, canvas, cv::COLOR_BGR2RGB);
    canvas.convertTo(canvas, CV_32FC3);
    utils::transform::normalize_inplace(canvas, mean_val, scale_val);

    std::vector<int64_t> shape(input_node_dims);
    if (shape[0] == -1) shape[0] = 1;   // dynamic batch dim → single image

    return utils::transform::create_tensor(
        canvas, shape, *memory_info_handler,
        input_values_handler, utils::transform::CHW);
}
|
||
|
||
Ort::Value GlintCosFace::transformBatch(const std::vector<cv::Mat>& images)
{
    // Preprocess every image identically to transform() and pack them
    // into one N×C×H×W tensor.
    if (images.empty())
        throw std::runtime_error("GlintCosFace::transformBatch — batch is empty.");

    const cv::Size net_size(input_node_dims.at(3), input_node_dims.at(2));

    std::vector<cv::Mat> prepared;
    prepared.reserve(images.size());
    for (size_t i = 0; i < images.size(); ++i) {
        if (images[i].empty())
            throw std::runtime_error("GlintCosFace::transformBatch — empty image in batch.");
        cv::Mat img;
        cv::resize(images[i], img, net_size);
        cv::cvtColor(img, img, cv::COLOR_BGR2RGB);
        img.convertTo(img, CV_32FC3);
        utils::transform::normalize_inplace(img, mean_val, scale_val);
        prepared.emplace_back(std::move(img));
    }

    std::vector<int64_t> shape(input_node_dims);
    shape[0] = static_cast<int64_t>(prepared.size());

    return utils::transform::create_tensor_batch(
        prepared, shape, *memory_info_handler,
        input_values_handler, utils::transform::CHW);
}
|
||
|
||
void GlintCosFace::detect(const cv::Mat& mat, types::FaceContent& face_content)
{
    // Single-image embedding: preprocess, run, L2-normalise.
    if (mat.empty()) return;

    Ort::Value input_tensor = transform(mat);
    auto output_tensors = ort_session->Run(
        Ort::RunOptions{ nullptr },
        input_node_names.data(), &input_tensor, 1,
        output_node_names.data(), num_outputs);

    const float* feat =
        output_tensors.at(0).GetTensorMutableData<float>();
    const auto dim =
        static_cast<unsigned int>(output_node_dims.at(0).at(1));

    std::vector<float> feature(feat, feat + dim);
    cv::normalize(feature, feature);

    face_content.embedding = std::move(feature);
    face_content.dim = dim;
    face_content.flag = true;
}
|
||
|
||
void GlintCosFace::detectBatch(const std::vector<cv::Mat>& images,
    std::vector<types::FaceContent>& face_contents)
{
    // Batched embedding extraction.
    //
    // Changes vs the previous version (consistency with
    // GlintArcFace::detectBatch):
    //  * the ORT call is wrapped in try/catch so a failed Run() leaves
    //    face_contents empty instead of partially filled, and re-throws;
    //  * the preprocessing staging buffer is released once Run() returns
    //    (its contents were already copied into the input tensor).
    if (images.empty()) return;
    const size_t batch_size = images.size();
    face_contents.clear();
    face_contents.reserve(batch_size);

    try {
        Ort::Value input_tensor = transformBatch(images);
        auto output_tensors = ort_session->Run(
            Ort::RunOptions{ nullptr },
            input_node_names.data(), &input_tensor, 1,
            output_node_names.data(), num_outputs);

        // Free the staging data eagerly; it is not needed any more.
        input_values_handler.clear();
        input_values_handler.shrink_to_fit();

        const float* vals =
            output_tensors.at(0).GetTensorMutableData<float>();
        const unsigned int hidden_dim =
            static_cast<unsigned int>(output_node_dims.at(0).at(1));

        for (size_t i = 0; i < batch_size; ++i) {
            // Copy out the i-th embedding row and L2-normalise it.
            std::vector<float> embedding(vals + i * hidden_dim,
                vals + (i + 1) * hidden_dim);
            cv::normalize(embedding, embedding);

            types::FaceContent fc;
            fc.embedding = std::move(embedding);
            fc.dim = hidden_dim;
            fc.flag = true;
            face_contents.emplace_back(std::move(fc));
        }
    }
    catch (const Ort::Exception&) {
        face_contents.clear();
        throw;
    }
}
|
||
|
||
// ====================================================================
|
||
// SCRFD — constructors
|
||
// ====================================================================
|
||
|
||
SCRFD::SCRFD(const std::string& _onnx_path, unsigned int _num_threads)
    : BasicOrtHandler(_onnx_path, _num_threads)
{
    // Derive FPN / anchor configuration from the loaded model's outputs.
    initial_context();
}
|
||
|
||
SCRFD::SCRFD(const std::string& _onnx_path,
    EngineType engineType,
    unsigned int _num_threads)
    : BasicOrtHandler(_onnx_path, engineType, _num_threads)
{
    // Same as the default ctor, but with an explicit execution provider.
    initial_context();
}
|
||
|
||
void SCRFD::initial_context()
|
||
{
|
||
if (num_outputs == 6) {
|
||
fmc = 3; feat_stride_fpn = { 8, 16, 32 }; num_anchors = 2; use_kps = false;
|
||
}
|
||
else if (num_outputs == 9) {
|
||
fmc = 3; feat_stride_fpn = { 8, 16, 32 }; num_anchors = 2; use_kps = true;
|
||
}
|
||
}
|
||
|
||
void SCRFD::resize_unscale(const cv::Mat& mat, cv::Mat& mat_rs,
    int target_height, int target_width,
    SCRFDScaleParams& scale_params)
{
    // Letterbox: scale to fit while keeping the aspect ratio, centre the
    // result on a black canvas, and record how to map detections back.
    if (mat.empty()) return;

    const int src_h = mat.rows;
    const int src_w = mat.cols;

    mat_rs = cv::Mat(target_height, target_width, CV_8UC3, cv::Scalar(0, 0, 0));

    const float r = std::min(
        static_cast<float>(target_width) / src_w,
        static_cast<float>(target_height) / src_h);

    const int scaled_w = static_cast<int>(src_w * r);
    const int scaled_h = static_cast<int>(src_h * r);
    const int pad_x = (target_width - scaled_w) / 2;
    const int pad_y = (target_height - scaled_h) / 2;

    cv::Mat scaled;
    cv::resize(mat, scaled, cv::Size(scaled_w, scaled_h));
    scaled.copyTo(mat_rs(cv::Rect(pad_x, pad_y, scaled_w, scaled_h)));

    scale_params.ratio = r;
    scale_params.dw = pad_x;
    scale_params.dh = pad_y;
    scale_params.flag = true;
}
|
||
|
||
Ort::Value SCRFD::transform(const cv::Mat& mat_rs)
{
    // Input is already letterboxed: swap channel order, normalise, pack.
    cv::Mat rgb;
    cv::cvtColor(mat_rs, rgb, cv::COLOR_BGR2RGB);
    utils::transform::normalize_inplace(rgb, mean_vals, scale_vals);
    return utils::transform::create_tensor(
        rgb, input_node_dims, *memory_info_handler,
        input_values_handler, utils::transform::CHW);
}
|
||
|
||
Ort::Value SCRFD::transformBatch(const std::vector<cv::Mat>& images)
{
    // Batch preprocessing: plain resize (no letterbox here), BGR→RGB,
    // float32, per-channel normalisation, then pack as N×C×H×W.
    if (images.empty())
        throw std::runtime_error("SCRFD::transformBatch — batch is empty.");

    const cv::Size net_size(input_node_dims.at(3), input_node_dims.at(2));

    std::vector<cv::Mat> prepared;
    prepared.reserve(images.size());
    for (size_t i = 0; i < images.size(); ++i) {
        if (images[i].empty())
            throw std::runtime_error("SCRFD::transformBatch — empty image in batch.");
        cv::Mat img;
        cv::resize(images[i], img, net_size);
        cv::cvtColor(img, img, cv::COLOR_BGR2RGB);
        img.convertTo(img, CV_32FC3);
        utils::transform::normalize_inplace(img, mean_vals, scale_vals);
        prepared.emplace_back(std::move(img));
    }

    std::vector<int64_t> shape(input_node_dims);
    shape[0] = static_cast<int64_t>(prepared.size());

    return utils::transform::create_tensor_batch(
        prepared, shape, *memory_info_handler,
        input_values_handler, utils::transform::CHW);
}
|
||
|
||
void SCRFD::detect(const cv::Mat& mat,
|
||
std::vector<types::BoxfWithLandmarks>& detected_boxes_kps,
|
||
float score_threshold, float iou_threshold, unsigned int topk)
|
||
{
|
||
if (mat.empty()) return;
|
||
|
||
float img_height = static_cast<float>(mat.rows);
|
||
float img_width = static_cast<float>(mat.cols);
|
||
int target_height = static_cast<int>(input_node_dims.at(2));
|
||
int target_width = static_cast<int>(input_node_dims.at(3));
|
||
|
||
cv::Mat mat_rs;
|
||
SCRFDScaleParams scale_params;
|
||
resize_unscale(mat, mat_rs, target_height, target_width, scale_params);
|
||
|
||
Ort::Value input_tensor = transform(mat_rs);
|
||
auto output_tensors = ort_session->Run(
|
||
Ort::RunOptions{ nullptr },
|
||
input_node_names.data(), &input_tensor, 1,
|
||
output_node_names.data(), num_outputs);
|
||
|
||
std::vector<types::BoxfWithLandmarks> bbox_kps_collection;
|
||
generate_bboxes_kps(scale_params, bbox_kps_collection,
|
||
output_tensors, score_threshold,
|
||
img_height, img_width);
|
||
nms_bboxes_kps(bbox_kps_collection, detected_boxes_kps,
|
||
iou_threshold, topk);
|
||
}
|
||
|
||
void SCRFD::generate_points(int target_height, int target_width)
|
||
{
|
||
if (center_points_is_update) return;
|
||
for (auto stride : feat_stride_fpn) {
|
||
unsigned int num_grid_w = target_width / stride;
|
||
unsigned int num_grid_h = target_height / stride;
|
||
for (unsigned int i = 0; i < num_grid_h; ++i) {
|
||
for (unsigned int j = 0; j < num_grid_w; ++j) {
|
||
for (unsigned int k = 0; k < num_anchors; ++k) {
|
||
SCRFDPoint pt;
|
||
pt.cx = static_cast<float>(j);
|
||
pt.cy = static_cast<float>(i);
|
||
pt.stride = static_cast<float>(stride);
|
||
center_points[stride].push_back(pt);
|
||
}
|
||
}
|
||
}
|
||
}
|
||
center_points_is_update = true;
|
||
}
|
||
|
||
void SCRFD::generate_bboxes_kps(const SCRFDScaleParams& scale_params,
|
||
std::vector<types::BoxfWithLandmarks>& bbox_kps_collection,
|
||
std::vector<Ort::Value>& output_tensors,
|
||
float score_threshold,
|
||
float img_height, float img_width)
|
||
{
|
||
const float input_height = static_cast<float>(input_node_dims.at(2));
|
||
const float input_width = static_cast<float>(input_node_dims.at(3));
|
||
generate_points(static_cast<int>(input_height),
|
||
static_cast<int>(input_width));
|
||
bbox_kps_collection.clear();
|
||
|
||
if (use_kps) {
|
||
generate_bboxes_kps_single_stride(scale_params,
|
||
output_tensors.at(0), output_tensors.at(3), output_tensors.at(6),
|
||
8, score_threshold, img_height, img_width, bbox_kps_collection);
|
||
generate_bboxes_kps_single_stride(scale_params,
|
||
output_tensors.at(1), output_tensors.at(4), output_tensors.at(7),
|
||
16, score_threshold, img_height, img_width, bbox_kps_collection);
|
||
generate_bboxes_kps_single_stride(scale_params,
|
||
output_tensors.at(2), output_tensors.at(5), output_tensors.at(8),
|
||
32, score_threshold, img_height, img_width, bbox_kps_collection);
|
||
}
|
||
else {
|
||
generate_bboxes_single_stride(scale_params,
|
||
output_tensors.at(0), output_tensors.at(3),
|
||
8, score_threshold, img_height, img_width, bbox_kps_collection);
|
||
generate_bboxes_single_stride(scale_params,
|
||
output_tensors.at(1), output_tensors.at(4),
|
||
16, score_threshold, img_height, img_width, bbox_kps_collection);
|
||
generate_bboxes_single_stride(scale_params,
|
||
output_tensors.at(2), output_tensors.at(5),
|
||
32, score_threshold, img_height, img_width, bbox_kps_collection);
|
||
}
|
||
}
|
||
|
||
void SCRFD::generate_bboxes_single_stride(
|
||
const SCRFDScaleParams& scale_params,
|
||
Ort::Value& score_pred, Ort::Value& bbox_pred,
|
||
unsigned int stride, float score_threshold,
|
||
float img_height, float img_width,
|
||
std::vector<types::BoxfWithLandmarks>& bbox_kps_collection)
|
||
{
|
||
unsigned int nms_pre_ = std::max(nms_pre, (stride / 8) * nms_pre);
|
||
auto stride_dims = score_pred.GetTypeInfo().GetTensorTypeAndShapeInfo().GetShape();
|
||
const unsigned int num_points = static_cast<unsigned int>(stride_dims.at(1));
|
||
const float* score_ptr = score_pred.GetTensorMutableData<float>();
|
||
const float* bbox_ptr = bbox_pred.GetTensorMutableData<float>();
|
||
|
||
float ratio = scale_params.ratio;
|
||
int dw = scale_params.dw;
|
||
int dh = scale_params.dh;
|
||
|
||
unsigned int count = 0;
|
||
auto& stride_points = center_points[stride];
|
||
|
||
for (unsigned int i = 0; i < num_points; ++i) {
|
||
if (score_ptr[i] < score_threshold) continue;
|
||
const auto& point = stride_points.at(i);
|
||
const float* offsets = bbox_ptr + i * 4;
|
||
|
||
float x1 = ((point.cx - offsets[0]) * point.stride - dw) / ratio;
|
||
float y1 = ((point.cy - offsets[1]) * point.stride - dh) / ratio;
|
||
float x2 = ((point.cx + offsets[2]) * point.stride - dw) / ratio;
|
||
float y2 = ((point.cy + offsets[3]) * point.stride - dh) / ratio;
|
||
|
||
types::BoxfWithLandmarks box_kps;
|
||
box_kps.box.x1 = std::max(0.f, x1);
|
||
box_kps.box.y1 = std::max(0.f, y1);
|
||
box_kps.box.x2 = std::min(img_width - 1.f, x2);
|
||
box_kps.box.y2 = std::min(img_height - 1.f, y2);
|
||
box_kps.box.score = score_ptr[i];
|
||
box_kps.box.label = 1;
|
||
box_kps.box.label_text = "face";
|
||
box_kps.box.flag = true;
|
||
box_kps.flag = true;
|
||
bbox_kps_collection.push_back(box_kps);
|
||
|
||
if (++count > max_nms) break;
|
||
}
|
||
|
||
if (bbox_kps_collection.size() > nms_pre_) {
|
||
std::sort(bbox_kps_collection.begin(), bbox_kps_collection.end(),
|
||
[](const types::BoxfWithLandmarks& a, const types::BoxfWithLandmarks& b) {
|
||
return a.box.score > b.box.score; });
|
||
bbox_kps_collection.resize(nms_pre_);
|
||
}
|
||
}
|
||
|
||
// Decode one FPN level of a keypoint-enabled SCRFD model: boxes via
// distance-to-box regression around the cached anchor centres, plus 5
// landmark points per detection (10 floats: x,y pairs). Coordinates are
// mapped back through the letterbox transform and clamped to the
// original image. Appends into bbox_kps_collection; may sort and trim
// it (including entries from previous strides — preserved behaviour).
void SCRFD::generate_bboxes_kps_single_stride(
    const SCRFDScaleParams& scale_params,
    Ort::Value& score_pred, Ort::Value& bbox_pred, Ort::Value& kps_pred,
    unsigned int stride, float score_threshold,
    float img_height, float img_width,
    std::vector<types::BoxfWithLandmarks>& bbox_kps_collection)
{
    // Larger strides keep more pre-NMS candidates.
    unsigned int nms_pre_ = std::max(nms_pre, (stride / 8) * nms_pre);
    // Point count is taken from dim 1 of the score tensor
    // (assumes shape [N, num_points, ...] — TODO confirm against model).
    auto stride_dims = score_pred.GetTypeInfo().GetTensorTypeAndShapeInfo().GetShape();
    const unsigned int num_points = static_cast<unsigned int>(stride_dims.at(1));
    const float* score_ptr = score_pred.GetTensorMutableData<float>();
    const float* bbox_ptr = bbox_pred.GetTensorMutableData<float>();
    const float* kps_ptr = kps_pred.GetTensorMutableData<float>();

    // Letterbox parameters used to map network coords → source coords.
    float ratio = scale_params.ratio;
    int dw = scale_params.dw;
    int dh = scale_params.dh;

    unsigned int count = 0;
    auto& stride_points = center_points[stride];

    for (unsigned int i = 0; i < num_points; ++i) {
        if (score_ptr[i] < score_threshold) continue;
        const auto& point = stride_points.at(i);
        // 4 box offsets per point: left, top, right, bottom distances.
        const float* offsets = bbox_ptr + i * 4;

        // Undo stride scaling and letterbox padding, then unscale.
        float x1 = ((point.cx - offsets[0]) * point.stride - dw) / ratio;
        float y1 = ((point.cy - offsets[1]) * point.stride - dh) / ratio;
        float x2 = ((point.cx + offsets[2]) * point.stride - dw) / ratio;
        float y2 = ((point.cy + offsets[3]) * point.stride - dh) / ratio;

        types::BoxfWithLandmarks box_kps;
        // Clamp the box to the original image bounds.
        box_kps.box.x1 = std::max(0.f, x1);
        box_kps.box.y1 = std::max(0.f, y1);
        box_kps.box.x2 = std::min(img_width - 1.f, x2);
        box_kps.box.y2 = std::min(img_height - 1.f, y2);
        box_kps.box.score = score_ptr[i];
        box_kps.box.label = 1;
        box_kps.box.label_text = "face";
        box_kps.box.flag = true;

        // 10 keypoint offsets per point: 5 (x, y) pairs.
        const float* kps_offsets = kps_ptr + i * 10;
        for (unsigned int j = 0; j < 10; j += 2) {
            cv::Point2f kp;
            // Same inverse letterbox mapping as the box, clamped.
            kp.x = std::min(std::max(0.f,
                ((point.cx + kps_offsets[j]) * point.stride - dw) / ratio),
                img_width - 1.f);
            kp.y = std::min(std::max(0.f,
                ((point.cy + kps_offsets[j + 1]) * point.stride - dh) / ratio),
                img_height - 1.f);
            box_kps.landmarks.points.push_back(kp);
        }
        box_kps.landmarks.flag = true;
        box_kps.flag = true;
        bbox_kps_collection.push_back(box_kps);

        // Hard cap on candidates pushed by this stride.
        if (++count > max_nms) break;
    }

    // Keep only the top-scoring nms_pre_ candidates before NMS.
    if (bbox_kps_collection.size() > nms_pre_) {
        std::sort(bbox_kps_collection.begin(), bbox_kps_collection.end(),
            [](const types::BoxfWithLandmarks& a, const types::BoxfWithLandmarks& b) {
                return a.box.score > b.box.score; });
        bbox_kps_collection.resize(nms_pre_);
    }
}
|
||
|
||
void SCRFD::nms_bboxes_kps(std::vector<types::BoxfWithLandmarks>& input,
    std::vector<types::BoxfWithLandmarks>& output,
    float iou_threshold, unsigned int topk)
{
    // Greedy score-ordered non-maximum suppression, capped at topk.
    if (input.empty()) return;

    std::sort(input.begin(), input.end(),
        [](const types::BoxfWithLandmarks& lhs, const types::BoxfWithLandmarks& rhs) {
            return lhs.box.score > rhs.box.score; });

    const unsigned int n = static_cast<unsigned int>(input.size());
    std::vector<int> suppressed(n, 0);
    unsigned int kept = 0;

    for (unsigned int i = 0; i < n; ++i) {
        if (suppressed[i]) continue;
        output.push_back(input[i]);
        suppressed[i] = 1;

        // Suppress every lower-scored box overlapping the kept one.
        for (unsigned int j = i + 1; j < n; ++j) {
            if (suppressed[j]) continue;
            if (input[i].box.iou_of(input[j].box) > iou_threshold)
                suppressed[j] = 1;
        }
        if (++kept >= topk) break;
    }
}
|
||
|
||
// ====================================================================
|
||
// MOVINET
|
||
// ====================================================================
|
||
|
||
MOVINET::MOVINET(const std::string& _onnx_path, unsigned int _num_threads)
    : BasicOrtHandler(_onnx_path, _num_threads)
{
    // Default clip geometry; only the I/O names need resolving.
    init_io_names();
}
|
||
|
||
MOVINET::MOVINET(const std::string& _onnx_path,
    int _temporal, int _width, int _height, int _channels,
    unsigned int _num_threads)
    : BasicOrtHandler(_onnx_path, _num_threads)
{
    // Caller-supplied clip geometry overrides the defaults.
    input_params.channels = _channels;
    input_params.temporal = _temporal;
    input_params.height = _height;
    input_params.width = _width;
    init_io_names();
}
|
||
|
||
MOVINET::MOVINET(const std::string& _onnx_path,
    EngineType engineType,
    unsigned int _num_threads)
    : BasicOrtHandler(_onnx_path, engineType, _num_threads)
{
    // Explicit execution provider; default clip geometry.
    init_io_names();
}
|
||
|
||
MOVINET::MOVINET(const std::string& _onnx_path,
    EngineType engineType,
    int _temporal, int _width, int _height, int _channels,
    unsigned int _num_threads)
    : BasicOrtHandler(_onnx_path, engineType, _num_threads)
{
    // Explicit execution provider plus caller-supplied clip geometry.
    input_params.channels = _channels;
    input_params.temporal = _temporal;
    input_params.height = _height;
    input_params.width = _width;
    init_io_names();
}
|
||
|
||
void MOVINET::init_io_names()
|
||
{
|
||
Ort::AllocatorWithDefaultOptions allocator;
|
||
_MoviNetInputName =
|
||
ort_session->GetInputNameAllocated(0, allocator).get();
|
||
_MoviNetOutputName =
|
||
ort_session->GetOutputNameAllocated(0, allocator).get();
|
||
}
|
||
|
||
Ort::Value MOVINET::transform(const std::deque<cv::Mat>& frames)
{
    // Pack one clip into a [1, C, T, H, W] CPU tensor.
    if (frames.size() != static_cast<size_t>(input_params.temporal))
        throw std::runtime_error("MOVINET::transform — frame count != temporal length.");

    const std::vector<int64_t> shape{
        1,
        input_params.channels,
        input_params.temporal,
        input_params.height,
        input_params.width,
    };
    auto cpu_mem = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);
    return utils::transform::create_video_tensor_5d(
        frames, shape, cpu_mem, input_tensor_values);
}
|
||
|
||
std::pair<int, float> MOVINET::post_processing(const float* p)
|
||
{
|
||
const int C = output_params.num_classes;
|
||
float m = *std::max_element(p, p + C);
|
||
float s = 0.f;
|
||
std::vector<float> prob(C);
|
||
for (int i = 0; i < C; ++i) s += (prob[i] = std::exp(p[i] - m));
|
||
for (float& v : prob) v /= s;
|
||
int label = static_cast<int>(
|
||
std::max_element(prob.begin(), prob.end()) - prob.begin());
|
||
return { label, prob[label] };
|
||
}
|
||
|
||
void MOVINET::inference(const std::deque<cv::Mat>& frames,
    std::pair<int, float>& out_result)
{
    // Run one clip through the network; {-1, 0.f} signals bad input.
    const size_t want = static_cast<size_t>(input_params.temporal);
    if (frames.empty() || frames.size() != want) {
        std::cerr << "[MOVINET] Invalid frame count." << std::endl;
        out_result = { -1, 0.f };
        return;
    }

    Ort::Value input_tensor = transform(frames);
    const char* input_names[] = { _MoviNetInputName.c_str() };
    const char* output_names[] = { _MoviNetOutputName.c_str() };

    auto outputs = ort_session->Run(
        Ort::RunOptions{ nullptr },
        input_names, &input_tensor, 1, output_names, 1);

    out_result = post_processing(outputs[0].GetTensorData<float>());
}
|
||
|
||
Ort::Value MOVINET::transform(const cv::Mat& mat)
{
    // Single image → pseudo-clip: replicate the same frame T times.
    std::deque<cv::Mat> clip;
    for (int t = 0; t < input_params.temporal; ++t)
        clip.push_back(mat.clone());
    return transform(clip);
}
|
||
|
||
Ort::Value MOVINET::transformBatch(const std::vector<cv::Mat>& images)
{
    // Build a temporal clip by cycling through the supplied frames
    // until the model's temporal window is filled.
    //
    // Fix: an empty batch previously fell through to transform() with
    // zero frames, producing the misleading error "frame count !=
    // temporal length". Report the real cause, consistent with the
    // other transformBatch implementations in this file.
    if (images.empty())
        throw std::runtime_error("MOVINET::transformBatch — batch is empty.");

    std::deque<cv::Mat> frames;
    for (int i = 0; i < input_params.temporal; ++i)
        frames.push_back(images[i % images.size()].clone());
    return transform(frames);
}
|
||
|
||
// ====================================================================
|
||
// utils::transform
|
||
// ====================================================================
|
||
|
||
// Pack a single image into a 4-D float tensor whose storage lives in
// tensor_value_handler (the returned Ort::Value only references it —
// the caller must keep the buffer alive until after Run()).
//
// data_format == CHW expects dims [1,C,H,W] and writes planar data;
// otherwise dims are read as [1,H,W,C] and the interleaved OpenCV
// layout is copied straight through. Returns Ort::Value(nullptr) on
// any validation failure (empty input, wrong rank, batch != 1,
// channel mismatch).
Ort::Value ANSCENTER::utils::transform::create_tensor(
    const cv::Mat& mat,
    const std::vector<int64_t>& tensor_dims,
    const Ort::MemoryInfo& memory_info_handler,
    std::vector<float>& tensor_value_handler,
    unsigned int data_format)
{
    if (mat.empty() || tensor_dims.size() != 4 || tensor_dims.at(0) != 1)
        return Ort::Value(nullptr);

    // Promote to float32 if needed (no value scaling applied here).
    const unsigned int channels = mat.channels();
    cv::Mat mat_ref;
    if (mat.type() != CV_32FC(channels))
        mat.convertTo(mat_ref, CV_32FC(channels));
    else
        mat_ref = mat;

    if (data_format == CHW) {
        const unsigned int H = tensor_dims.at(2);
        const unsigned int W = tensor_dims.at(3);
        const unsigned int C = tensor_dims.at(1);
        if (C != channels) return Ort::Value(nullptr);

        const unsigned int total = C * H * W;
        tensor_value_handler.resize(total);

        // Resize only when the image doesn't already match the tensor.
        cv::Mat resized;
        if (H != static_cast<unsigned int>(mat_ref.rows) ||
            W != static_cast<unsigned int>(mat_ref.cols))
            cv::resize(mat_ref, resized, cv::Size(W, H));
        else
            resized = mat_ref;

        // Interleaved HWC → planar CHW via per-channel split + memcpy.
        // NOTE(review): assumes each split plane is continuous — true
        // for cv::split output.
        std::vector<cv::Mat> chans;
        cv::split(resized, chans);
        for (unsigned int c = 0; c < C; ++c)
            std::memcpy(tensor_value_handler.data() + c * H * W,
                chans[c].data, H * W * sizeof(float));

        return Ort::Value::CreateTensor<float>(
            memory_info_handler, tensor_value_handler.data(),
            total, tensor_dims.data(), tensor_dims.size());
    }

    // HWC
    const unsigned int H = tensor_dims.at(1);
    const unsigned int W = tensor_dims.at(2);
    const unsigned int C = tensor_dims.at(3);
    const unsigned int total = C * H * W;
    if (C != channels) return Ort::Value(nullptr);
    tensor_value_handler.resize(total);

    cv::Mat resized;
    if (H != static_cast<unsigned int>(mat_ref.rows) ||
        W != static_cast<unsigned int>(mat_ref.cols))
        cv::resize(mat_ref, resized, cv::Size(W, H));
    else
        resized = mat_ref;

    // Interleaved layout already matches HWC; one straight copy.
    // NOTE(review): assumes `resized` is continuous — TODO confirm for
    // non-resized caller-provided Mats (e.g. ROI views).
    std::memcpy(tensor_value_handler.data(), resized.data, total * sizeof(float));
    return Ort::Value::CreateTensor<float>(
        memory_info_handler, tensor_value_handler.data(),
        total, tensor_dims.data(), tensor_dims.size());
}
|
||
|
||
Ort::Value ANSCENTER::utils::transform::create_tensor_batch(
    const std::vector<cv::Mat>& batch_mats,
    const std::vector<int64_t>& tensor_dims,
    const Ort::MemoryInfo& memory_info_handler,
    std::vector<float>& tensor_value_handler,
    unsigned int data_format)
{
    // Pack N preprocessed images into one contiguous float tensor
    // stored in tensor_value_handler (the returned Ort::Value only
    // references that buffer; keep it alive until after Run()).
    //
    // Fixes vs the previous version:
    //  * tensor_dims were always read as NCHW; for HWC requests the
    //    channel count was taken from dims[1] (the height), breaking
    //    both the channel check and the layout. Dims are now
    //    interpreted according to `data_format` ([N,C,H,W] vs [N,H,W,C]).
    //  * raw-pointer pixel reads now go through a guaranteed-continuous
    //    Mat (clone() when needed) before flat indexing.
    if (batch_mats.empty() || tensor_dims.size() != 4)
        return Ort::Value(nullptr);

    const size_t N = batch_mats.size();
    const bool chw = (data_format == CHW);
    const unsigned int C = static_cast<unsigned int>(chw ? tensor_dims.at(1) : tensor_dims.at(3));
    const unsigned int H = static_cast<unsigned int>(chw ? tensor_dims.at(2) : tensor_dims.at(1));
    const unsigned int W = static_cast<unsigned int>(chw ? tensor_dims.at(3) : tensor_dims.at(2));
    const size_t image_size = static_cast<size_t>(C) * H * W;
    const size_t total = N * image_size;

    tensor_value_handler.resize(total);

    for (size_t b = 0; b < N; ++b) {
        const cv::Mat& mat = batch_mats[b];
        if (mat.empty() || static_cast<unsigned int>(mat.channels()) != C)
            return Ort::Value(nullptr);

        // Promote to float32 if necessary (no value scaling here).
        cv::Mat mat_ref;
        if (mat.type() != CV_32FC(C))
            mat.convertTo(mat_ref, CV_32FC(C));
        else
            mat_ref = mat;

        // Resize only when the image doesn't already match the tensor.
        cv::Mat resized;
        if (static_cast<unsigned int>(mat_ref.rows) != H ||
            static_cast<unsigned int>(mat_ref.cols) != W)
            cv::resize(mat_ref, resized, cv::Size(W, H));
        else
            resized = mat_ref;

        // Flat pointer access below requires one continuous buffer.
        if (!resized.isContinuous()) resized = resized.clone();

        const size_t batch_offset = b * image_size;

        if (chw) {
            // Interleaved HWC pixels → planar CHW.
            const float* src = resized.ptr<float>(0);
            const size_t plane_size = static_cast<size_t>(H) * W;
            for (unsigned int c = 0; c < C; ++c) {
                float* dst = tensor_value_handler.data() + batch_offset + c * plane_size;
                for (size_t i = 0; i < plane_size; ++i)
                    dst[i] = src[i * C + c];
            }
        }
        else {
            // HWC: OpenCV's interleaved layout already matches.
            std::memcpy(tensor_value_handler.data() + batch_offset,
                resized.data, image_size * sizeof(float));
        }
    }

    return Ort::Value::CreateTensor<float>(
        memory_info_handler, tensor_value_handler.data(),
        total, tensor_dims.data(), tensor_dims.size());
}
|
||
|
||
// Pack T frames into a [1, C, T, H, W] float tensor backed by
// tensor_value_handler (the Ort::Value references that buffer — keep
// it alive until after Run()). Frames are converted to float32 (scaled
// by 1/255 only when conversion is needed — NOTE(review): frames that
// are already CV_32FC(C) are NOT rescaled; confirm callers feed 8-bit
// frames), resized if needed, and BGR→RGB swapped.
Ort::Value ANSCENTER::utils::transform::create_video_tensor_5d(
    const std::deque<cv::Mat>& frames,
    const std::vector<int64_t>& tensor_dims,
    const Ort::MemoryInfo& memory_info_handler,
    std::vector<float>& tensor_value_handler)
{
    if (tensor_dims.size() != 5 || tensor_dims[0] != 1)
        throw std::runtime_error("create_video_tensor_5d: expect [1,C,T,H,W]");

    const unsigned int C = tensor_dims[1];
    const unsigned int T = tensor_dims[2];
    const unsigned int H = tensor_dims[3];
    const unsigned int W = tensor_dims[4];

    if (frames.size() != T)
        throw std::runtime_error("create_video_tensor_5d: frame count != T");

    const size_t total = static_cast<size_t>(C) * T * H * W;
    tensor_value_handler.resize(total);

    for (unsigned int t = 0; t < T; ++t) {
        // Promote to float32; 1/255 scaling applies only on conversion.
        cv::Mat frame_ref;
        if (frames[t].type() != CV_32FC(C))
            frames[t].convertTo(frame_ref, CV_32FC(C), 1.0 / 255.0);
        else
            frame_ref = frames[t];

        // Resize only when the frame doesn't already match H×W.
        cv::Mat resized;
        if (static_cast<unsigned int>(frame_ref.rows) != H ||
            static_cast<unsigned int>(frame_ref.cols) != W)
            cv::resize(frame_ref, resized, cv::Size(W, H));
        else
            resized = frame_ref;

        // NOTE(review): BGR2RGB assumes C == 3 — cvtColor would throw
        // otherwise; confirm channel count at the call sites.
        cv::cvtColor(resized, resized, cv::COLOR_BGR2RGB);

        // Channel-major layout: plane offset c*(T*H*W), frame offset
        // t*(H*W) within each channel plane.
        std::vector<cv::Mat> chans;
        cv::split(resized, chans);
        for (unsigned int c = 0; c < C; ++c) {
            float* dst = tensor_value_handler.data()
                + c * (T * H * W) + t * (H * W);
            std::memcpy(dst, chans[c].data, H * W * sizeof(float));
        }
    }

    return Ort::Value::CreateTensor<float>(
        memory_info_handler, tensor_value_handler.data(),
        total, tensor_dims.data(), tensor_dims.size());
}
|
||
|
||
cv::Mat ANSCENTER::utils::transform::normalize(
    const cv::Mat& mat, float mean, float scale)
{
    // (x - mean) * scale across all channels; promotes to float32 first.
    cv::Mat as_float;
    if (mat.type() == CV_32FC3) as_float = mat;
    else mat.convertTo(as_float, CV_32FC3);
    return (as_float - mean) * scale;
}
|
||
|
||
cv::Mat ANSCENTER::utils::transform::normalize(
    const cv::Mat& mat, const float* mean, const float* scale)
{
    // Per-channel (x - mean[c]) * scale[c]; returns a new matrix.
    cv::Mat out;
    if (mat.type() == CV_32FC3) out = mat.clone();
    else mat.convertTo(out, CV_32FC3);

    for (int r = 0; r < out.rows; ++r) {
        cv::Vec3f* row = out.ptr<cv::Vec3f>(r);
        for (int c = 0; c < out.cols; ++c)
            for (int ch = 0; ch < 3; ++ch)
                row[c][ch] = (row[c][ch] - mean[ch]) * scale[ch];
    }
    return out;
}
|
||
|
||
void ANSCENTER::utils::transform::normalize(
|
||
const cv::Mat& inmat, cv::Mat& outmat, float mean, float scale)
|
||
{
|
||
outmat = ANSCENTER::utils::transform::normalize(inmat, mean, scale);
|
||
}
|
||
|
||
void ANSCENTER::utils::transform::normalize_inplace(
    cv::Mat& mat_inplace, float mean, float scale)
{
    // Ensure float32 storage, then replace contents with (x-mean)*scale.
    if (mat_inplace.type() != CV_32FC3)
        mat_inplace.convertTo(mat_inplace, CV_32FC3);
    mat_inplace = ANSCENTER::utils::transform::normalize(mat_inplace, mean, scale);
}
|
||
|
||
void ANSCENTER::utils::transform::normalize_inplace(
    cv::Mat& mat_inplace, const float* mean, const float* scale)
{
    // In-place per-channel normalisation: x[c] = (x[c]-mean[c])*scale[c].
    if (mat_inplace.type() != CV_32FC3)
        mat_inplace.convertTo(mat_inplace, CV_32FC3);

    for (int r = 0; r < mat_inplace.rows; ++r) {
        cv::Vec3f* px = mat_inplace.ptr<cv::Vec3f>(r);
        for (int c = 0; c < mat_inplace.cols; ++c)
            for (int ch = 0; ch < 3; ++ch)
                px[c][ch] = (px[c][ch] - mean[ch]) * scale[ch];
    }
}
|
||
|
||
// ====================================================================
|
||
// BoundingBoxType template method implementations
|
||
// ====================================================================
|
||
|
||
template<typename T1, typename T2>
|
||
template<typename O1, typename O2>
|
||
inline typename ANSCENTER::types::BoundingBoxType<T1, T2>::value_type
|
||
ANSCENTER::types::BoundingBoxType<T1, T2>::iou_of(
|
||
const BoundingBoxType<O1, O2>& other) const
|
||
{
|
||
auto tbox = other.template convert_type<value_type, score_type>();
|
||
value_type ix1 = std::max(x1, tbox.x1);
|
||
value_type iy1 = std::max(y1, tbox.y1);
|
||
value_type ix2 = std::min(x2, tbox.x2);
|
||
value_type iy2 = std::min(y2, tbox.y2);
|
||
value_type iw = ix2 - ix1 + static_cast<value_type>(1);
|
||
value_type ih = iy2 - iy1 + static_cast<value_type>(1);
|
||
if (iw <= 0 || ih <= 0)
|
||
return std::numeric_limits<value_type>::min();
|
||
value_type inter = iw * ih;
|
||
return inter / (area() + tbox.area() - inter);
|
||
}
|
||
|
||
template<typename T1, typename T2>
|
||
inline ::cv::Rect ANSCENTER::types::BoundingBoxType<T1, T2>::rect() const
|
||
{
|
||
auto b = convert_type<int>();
|
||
return ::cv::Rect(b.x1, b.y1, b.width(), b.height());
|
||
}
|
||
|
||
template<typename T1, typename T2>
|
||
inline ::cv::Point2i ANSCENTER::types::BoundingBoxType<T1, T2>::tl() const
|
||
{
|
||
auto b = convert_type<int>();
|
||
return ::cv::Point2i(b.x1, b.y1);
|
||
}
|
||
|
||
template<typename T1, typename T2>
|
||
inline ::cv::Point2i ANSCENTER::types::BoundingBoxType<T1, T2>::rb() const
|
||
{
|
||
auto b = convert_type<int>();
|
||
return ::cv::Point2i(b.x2, b.y2);
|
||
}
|
||
|
||
template<typename T1, typename T2>
|
||
inline typename ANSCENTER::types::BoundingBoxType<T1, T2>::value_type
|
||
ANSCENTER::types::BoundingBoxType<T1, T2>::width() const
|
||
{
|
||
return x2 - x1 + static_cast<value_type>(1);
|
||
}
|
||
|
||
template<typename T1, typename T2>
|
||
inline typename ANSCENTER::types::BoundingBoxType<T1, T2>::value_type
|
||
ANSCENTER::types::BoundingBoxType<T1, T2>::height() const
|
||
{
|
||
return y2 - y1 + static_cast<value_type>(1);
|
||
}
|
||
|
||
template<typename T1, typename T2>
|
||
inline typename ANSCENTER::types::BoundingBoxType<T1, T2>::value_type
|
||
ANSCENTER::types::BoundingBoxType<T1, T2>::area() const
|
||
{
|
||
return std::fabs(width() * height());
|
||
}
|
||
|
||
} // namespace ANSCENTER
|