Improve ALPR_OCR performance
This commit is contained in:
@@ -6,6 +6,8 @@
|
||||
#include <cstdlib>
#include <filesystem>
#include <fstream>
#include <limits>
#include <memory>
#include <system_error>
|
||||
|
||||
namespace ANSCENTER {
|
||||
|
||||
@@ -13,6 +15,40 @@ namespace ANSCENTER {
|
||||
// BasicOrtHandler — constructors
|
||||
// ====================================================================
|
||||
|
||||
std::string BasicOrtHandler::QueryModelInputName(const std::string& onnxPath)
|
||||
{
|
||||
try {
|
||||
// Make sure the Ort API pointer is initialised in THIS DLL.
|
||||
if (Ort::Global<void>::api_ == nullptr) {
|
||||
Ort::InitApi(static_cast<const OrtApi*>(EPLoader::GetOrtApiRaw()));
|
||||
}
|
||||
|
||||
Ort::Env env(ORT_LOGGING_LEVEL_ERROR, "QueryModelInputName");
|
||||
Ort::SessionOptions opts;
|
||||
opts.SetIntraOpNumThreads(1);
|
||||
opts.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_DISABLE_ALL);
|
||||
// Intentionally NOT attaching CUDA/TRT EP — CPU is fastest
|
||||
// for a no-inference metadata read.
|
||||
|
||||
std::wstring wpath(onnxPath.begin(), onnxPath.end());
|
||||
Ort::Session session(env, wpath.c_str(), opts);
|
||||
|
||||
Ort::AllocatorWithDefaultOptions alloc;
|
||||
auto inName = session.GetInputNameAllocated(0, alloc);
|
||||
return std::string(inName.get());
|
||||
}
|
||||
catch (const Ort::Exception& e) {
|
||||
std::cerr << "[QueryModelInputName] ORT exception: "
|
||||
<< e.what() << " (path=" << onnxPath << ")" << std::endl;
|
||||
return "";
|
||||
}
|
||||
catch (const std::exception& e) {
|
||||
std::cerr << "[QueryModelInputName] std exception: "
|
||||
<< e.what() << " (path=" << onnxPath << ")" << std::endl;
|
||||
return "";
|
||||
}
|
||||
}
|
||||
|
||||
BasicOrtHandler::BasicOrtHandler(const std::string& _onnx_path,
|
||||
unsigned int _num_threads)
|
||||
: log_id(_onnx_path.data()),
|
||||
@@ -36,6 +72,33 @@ namespace ANSCENTER {
|
||||
initialize_handler();
|
||||
}
|
||||
|
||||
// Construct a handler for an explicitly chosen execution-engine type,
// with per-model tuning knobs supplied via OrtHandlerOptions.
BasicOrtHandler::BasicOrtHandler(const std::string& _onnx_path,
    EngineType engineType,
    const OrtHandlerOptions& options,
    unsigned int _num_threads)
    : log_id(_onnx_path.data())
    , num_threads(_num_threads)
    , m_engineType(engineType)
    , m_handlerOptions(options)
    , onnx_path_w(_onnx_path.begin(), _onnx_path.end())
{
    // onnx_path_w owns the widened copy; onnx_path merely aliases its
    // buffer, so it stays valid for the handler's lifetime.
    onnx_path = onnx_path_w.c_str();
    initialize_handler();
}
|
||||
|
||||
// Convenience overload: no engine type supplied. m_engineType is set to
// the sentinel -1 so EPLoader auto-detects the best available provider.
BasicOrtHandler::BasicOrtHandler(const std::string& _onnx_path,
    const OrtHandlerOptions& options,
    unsigned int _num_threads)
    : log_id(_onnx_path.data())
    , num_threads(_num_threads)
    , m_engineType(static_cast<EngineType>(-1)) // EPLoader auto-detect
    , m_handlerOptions(options)
    , onnx_path_w(_onnx_path.begin(), _onnx_path.end())
{
    // Alias the member-owned wide string; see engineType overload.
    onnx_path = onnx_path_w.c_str();
    initialize_handler();
}
|
||||
|
||||
BasicOrtHandler::~BasicOrtHandler()
|
||||
{
|
||||
if (ort_session) {
|
||||
@@ -66,11 +129,15 @@ namespace ANSCENTER {
|
||||
// - arena_extend_strategy = 1 (kSameAsRequested) to avoid
|
||||
// pre-allocating huge GPU memory blocks that may exceed VRAM
|
||||
// - cudnn_conv_algo_search = HEURISTIC for faster session init
|
||||
// - cudnn_conv_use_max_workspace = 0 — use minimal cuDNN workspace
|
||||
// to prevent CUDNN_BACKEND_API_FAILED when TRT engines already
|
||||
// occupy most VRAM on the same GPU
|
||||
// - cudnn_conv_use_max_workspace defaults to "0" to prevent
|
||||
// CUDNN_BACKEND_API_FAILED when TRT engines already occupy
|
||||
// most VRAM on the same GPU. OCR sub-models that need fast
|
||||
// convs opt into "1" via OrtHandlerOptions::useMaxCudnnWorkspace
|
||||
// - gpu_mem_limit — cap ONNX Runtime's GPU memory arena to 2 GB
|
||||
// so it doesn't compete with TensorRT for the remaining VRAM
|
||||
const char* maxWorkspace =
|
||||
m_handlerOptions.useMaxCudnnWorkspace ? "1" : "0";
|
||||
|
||||
const char* keys[] = {
|
||||
"device_id",
|
||||
"arena_extend_strategy",
|
||||
@@ -82,7 +149,7 @@ namespace ANSCENTER {
|
||||
"0",
|
||||
"1", // kSameAsRequested
|
||||
"HEURISTIC", // avoid exhaustive algo search on large model
|
||||
"0", // minimal cuDNN workspace (prevents OOM)
|
||||
maxWorkspace, // "1" for OCR (perf), "0" elsewhere (safety)
|
||||
"2147483648" // 2 GB arena limit
|
||||
};
|
||||
Ort::GetApi().UpdateCUDAProviderOptions(
|
||||
@@ -92,7 +159,8 @@ namespace ANSCENTER {
|
||||
Ort::GetApi().ReleaseCUDAProviderOptions(cuda_options);
|
||||
|
||||
std::cout << "[ORT] CUDA EP attached (arena=SameAsRequested, "
|
||||
"cudnn=HEURISTIC, maxWorkspace=0, memLimit=2GB)." << std::endl;
|
||||
"cudnn=HEURISTIC, maxWorkspace=" << maxWorkspace
|
||||
<< ", memLimit=2GB)." << std::endl;
|
||||
return true;
|
||||
}
|
||||
catch (const Ort::Exception& e) {
|
||||
@@ -100,6 +168,113 @@ namespace ANSCENTER {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/// Attach the TensorRT execution provider to `session_options`.
/// Returns true on success, false on any failure (error is logged and the
/// caller is expected to fall back to the CUDA EP).
///
/// Fixes vs. previous revision:
/// - OrtStatus* results from Create/UpdateTensorRTProviderOptions were
///   silently discarded; they now go through Ort::ThrowOnError so a bad
///   option key/value is reported and triggers the documented fallback.
/// - trt_options leaked if AppendExecutionProvider_TensorRT_V2 threw;
///   it is now released by an RAII guard on every path.
/// - Removed dead timingCachePath/timingCacheStr locals: the code passes
///   cacheDir for "trt_timing_cache_path" (ORT treats it as a directory
///   and writes a per-GPU timing file inside it), so the file-path
///   variables were never used.
bool BasicOrtHandler::TryAppendTensorRT(Ort::SessionOptions& session_options)
{
    try {
        OrtTensorRTProviderOptionsV2* raw_options = nullptr;
        Ort::ThrowOnError(
            Ort::GetApi().CreateTensorRTProviderOptions(&raw_options));

        // RAII guard: ReleaseTensorRTProviderOptions must run even if a
        // call below throws.
        auto releaseOpts = [](OrtTensorRTProviderOptionsV2* p) {
            if (p) Ort::GetApi().ReleaseTensorRTProviderOptions(p);
        };
        std::unique_ptr<OrtTensorRTProviderOptionsV2, decltype(releaseOpts)>
            trt_options(raw_options, releaseOpts);

        // Cache built engines on disk so subsequent runs skip the
        // multi-minute build. Engines are keyed on (model hash, GPU
        // arch, shape profile) so changing any of those triggers
        // a rebuild automatically.
        std::string cacheDir = m_handlerOptions.trtEngineCacheDir;
        if (cacheDir.empty()) {
            // %TEMP%\ANSCENTER\TRTEngineCache
            const char* tmp = std::getenv("TEMP");
            if (!tmp) tmp = std::getenv("TMP");
            if (!tmp) tmp = ".";
            std::filesystem::path p(tmp);
            p /= "ANSCENTER";
            p /= "TRTEngineCache";
            std::error_code ec;
            std::filesystem::create_directories(p, ec);
            cacheDir = p.string();
        }

        // Builder options tuned for *fast first-run*:
        //  - opt_level 1: builds in seconds, ~5-10 % runtime cost vs 3
        //  - workspace 1 GB: leaves room for CUDA EP arena and the
        //    LPD's own TRT engine on the same GPU
        //  - timing cache: persists kernel timings between runs (stored
        //    inside cacheDir) so builds at new shapes get faster
        //  - profile shapes (if set): build ONE dynamic-shape engine
        //    that handles all (batch, width) combos instead of
        //    rebuilding per unique input. Critical for variable batch
        //    workloads — without this, TRT EP rebuilds every time
        //    runtime sees a new shape pair, causing 60-90 s hangs
        //    mid-stream.
        const bool haveProfile = !m_handlerOptions.trtProfileMinShapes.empty()
            && !m_handlerOptions.trtProfileOptShapes.empty()
            && !m_handlerOptions.trtProfileMaxShapes.empty();

        // Build the key/value arrays. The first 8 keys are always set;
        // profile shapes are appended only when provided.
        std::vector<const char*> keys = {
            "device_id",
            "trt_fp16_enable",
            "trt_engine_cache_enable",
            "trt_engine_cache_path",
            "trt_max_workspace_size",
            "trt_builder_optimization_level",
            "trt_timing_cache_enable",
            "trt_timing_cache_path"
        };
        std::vector<const char*> values = {
            "0",
            m_handlerOptions.trtFP16 ? "1" : "0",
            "1",
            cacheDir.c_str(),
            "1073741824", // 1 GB build workspace
            "1",          // fast build (was "3")
            "1",
            cacheDir.c_str()
        };

        if (haveProfile) {
            keys.push_back("trt_profile_min_shapes");
            values.push_back(m_handlerOptions.trtProfileMinShapes.c_str());
            keys.push_back("trt_profile_opt_shapes");
            values.push_back(m_handlerOptions.trtProfileOptShapes.c_str());
            keys.push_back("trt_profile_max_shapes");
            values.push_back(m_handlerOptions.trtProfileMaxShapes.c_str());
        }

        Ort::ThrowOnError(Ort::GetApi().UpdateTensorRTProviderOptions(
            trt_options.get(), keys.data(), values.data(), keys.size()));

        session_options.AppendExecutionProvider_TensorRT_V2(*trt_options);

        std::cout << "[ORT] TensorRT EP attached (fp16="
                  << (m_handlerOptions.trtFP16 ? "1" : "0")
                  << ", cache=" << cacheDir
                  << ", profile=" << (haveProfile ? "dynamic" : "static")
                  << ")." << std::endl;
        if (haveProfile) {
            std::cout << "[ORT] profile min: "
                      << m_handlerOptions.trtProfileMinShapes << std::endl
                      << "[ORT] profile opt: "
                      << m_handlerOptions.trtProfileOptShapes << std::endl
                      << "[ORT] profile max: "
                      << m_handlerOptions.trtProfileMaxShapes << std::endl;
        }
        return true;
    }
    catch (const Ort::Exception& e) {
        std::cerr << "[ORT] TensorRT EP failed: " << e.what() << std::endl;
        return false;
    }
    catch (const std::exception& e) {
        std::cerr << "[ORT] TensorRT EP failed (std): " << e.what() << std::endl;
        return false;
    }
}
|
||||
bool BasicOrtHandler::TryAppendDirectML(Ort::SessionOptions& session_options)
|
||||
{
|
||||
try {
|
||||
@@ -267,9 +442,28 @@ namespace ANSCENTER {
|
||||
{
|
||||
// --------------------------------------------------------
|
||||
case EngineType::NVIDIA_GPU:
|
||||
// Try TensorRT EP first when explicitly requested. Falls
|
||||
// through to CUDA EP if TRT is missing or option creation
|
||||
// fails. Both EPs may be attached at once — ORT picks TRT
|
||||
// for nodes it supports and CUDA for the rest.
|
||||
if (m_handlerOptions.preferTensorRT
|
||||
&& hasProvider("TensorrtExecutionProvider")) {
|
||||
ANS_DBG("OrtHandler", "Trying TensorRT EP...");
|
||||
if (TryAppendTensorRT(session_options)) {
|
||||
epAttached = true;
|
||||
}
|
||||
else {
|
||||
std::cerr << "[ORT] TensorRT EP attach failed — "
|
||||
"falling back to CUDA EP." << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
ANS_DBG("OrtHandler", "Trying CUDA EP...");
|
||||
if (hasProvider("CUDAExecutionProvider"))
|
||||
epAttached = TryAppendCUDA(session_options);
|
||||
if (hasProvider("CUDAExecutionProvider")) {
|
||||
if (TryAppendCUDA(session_options)) {
|
||||
epAttached = true;
|
||||
}
|
||||
}
|
||||
if (!epAttached) {
|
||||
std::cerr << "[ORT] CUDA EP unavailable — falling back to CPU."
|
||||
<< std::endl;
|
||||
|
||||
Reference in New Issue
Block a user