Improve ALPR_OCR performance

This commit is contained in:
2026-04-14 20:30:21 +10:00
parent 3349b45ade
commit f9a0af8949
18 changed files with 991 additions and 77 deletions

View File

@@ -6,6 +6,8 @@
#include <limits>
#include <filesystem>
#include <fstream>
#include <cstdlib>
#include <system_error>
namespace ANSCENTER {
@@ -13,6 +15,40 @@ namespace ANSCENTER {
// BasicOrtHandler — constructors
// ====================================================================
std::string BasicOrtHandler::QueryModelInputName(const std::string& onnxPath)
{
try {
// Make sure the Ort API pointer is initialised in THIS DLL.
if (Ort::Global<void>::api_ == nullptr) {
Ort::InitApi(static_cast<const OrtApi*>(EPLoader::GetOrtApiRaw()));
}
Ort::Env env(ORT_LOGGING_LEVEL_ERROR, "QueryModelInputName");
Ort::SessionOptions opts;
opts.SetIntraOpNumThreads(1);
opts.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_DISABLE_ALL);
// Intentionally NOT attaching CUDA/TRT EP — CPU is fastest
// for a no-inference metadata read.
std::wstring wpath(onnxPath.begin(), onnxPath.end());
Ort::Session session(env, wpath.c_str(), opts);
Ort::AllocatorWithDefaultOptions alloc;
auto inName = session.GetInputNameAllocated(0, alloc);
return std::string(inName.get());
}
catch (const Ort::Exception& e) {
std::cerr << "[QueryModelInputName] ORT exception: "
<< e.what() << " (path=" << onnxPath << ")" << std::endl;
return "";
}
catch (const std::exception& e) {
std::cerr << "[QueryModelInputName] std exception: "
<< e.what() << " (path=" << onnxPath << ")" << std::endl;
return "";
}
}
BasicOrtHandler::BasicOrtHandler(const std::string& _onnx_path,
unsigned int _num_threads)
: log_id(_onnx_path.data()),
@@ -36,6 +72,33 @@ namespace ANSCENTER {
initialize_handler();
}
// Construct a handler for the given ONNX model with an explicitly
// selected execution-provider engine type and per-handler options.
// Delegates all session/EP setup to initialize_handler().
// NOTE(review): log_id is initialised from _onnx_path.data(); if log_id
// is a raw char pointer this dangles once the caller's string dies —
// confirm log_id owns a copy (e.g. std::string).
// NOTE(review): onnx_path_w(_onnx_path.begin(), _onnx_path.end()) is a
// byte-wise narrow->wide copy that is only correct for ASCII paths —
// confirm non-ASCII model paths are not required.
BasicOrtHandler::BasicOrtHandler(const std::string& _onnx_path,
EngineType engineType,
const OrtHandlerOptions& options,
unsigned int _num_threads)
: log_id(_onnx_path.data()),
num_threads(_num_threads),
m_engineType(engineType),
m_handlerOptions(options),
onnx_path_w(_onnx_path.begin(), _onnx_path.end())
{
// onnx_path points into the member onnx_path_w, so it stays valid
// for the lifetime of this handler.
onnx_path = onnx_path_w.c_str();
initialize_handler();
}
// Convenience constructor: same as the EngineType overload but lets
// EPLoader auto-detect the best available engine. The sentinel
// static_cast<EngineType>(-1) deliberately encodes "auto-detect";
// initialize_handler()/EPLoader are expected to recognise it.
// NOTE(review): log_id is initialised from _onnx_path.data(); if log_id
// is a raw char pointer this dangles once the caller's string dies —
// confirm log_id owns a copy (e.g. std::string).
// NOTE(review): the begin()/end() narrow->wide copy only handles ASCII
// paths — confirm non-ASCII model paths are not required.
BasicOrtHandler::BasicOrtHandler(const std::string& _onnx_path,
const OrtHandlerOptions& options,
unsigned int _num_threads)
: log_id(_onnx_path.data()),
num_threads(_num_threads),
m_engineType(static_cast<EngineType>(-1)), // EPLoader auto-detect
m_handlerOptions(options),
onnx_path_w(_onnx_path.begin(), _onnx_path.end())
{
// onnx_path points into the member onnx_path_w, so it stays valid
// for the lifetime of this handler.
onnx_path = onnx_path_w.c_str();
initialize_handler();
}
BasicOrtHandler::~BasicOrtHandler()
{
if (ort_session) {
@@ -66,11 +129,15 @@ namespace ANSCENTER {
// - arena_extend_strategy = 1 (kSameAsRequested) to avoid
// pre-allocating huge GPU memory blocks that may exceed VRAM
// - cudnn_conv_algo_search = HEURISTIC for faster session init
// - cudnn_conv_use_max_workspace = 0 — use minimal cuDNN workspace
// to prevent CUDNN_BACKEND_API_FAILED when TRT engines already
// occupy most VRAM on the same GPU
// - cudnn_conv_use_max_workspace defaults to "0" to prevent
// CUDNN_BACKEND_API_FAILED when TRT engines already occupy
// most VRAM on the same GPU. OCR sub-models that need fast
// convs opt into "1" via OrtHandlerOptions::useMaxCudnnWorkspace
// - gpu_mem_limit — cap ONNX Runtime's GPU memory arena to 2 GB
// so it doesn't compete with TensorRT for the remaining VRAM
const char* maxWorkspace =
m_handlerOptions.useMaxCudnnWorkspace ? "1" : "0";
const char* keys[] = {
"device_id",
"arena_extend_strategy",
@@ -82,7 +149,7 @@ namespace ANSCENTER {
"0",
"1", // kSameAsRequested
"HEURISTIC", // avoid exhaustive algo search on large model
"0", // minimal cuDNN workspace (prevents OOM)
maxWorkspace, // "1" for OCR (perf), "0" elsewhere (safety)
"2147483648" // 2 GB arena limit
};
Ort::GetApi().UpdateCUDAProviderOptions(
@@ -92,7 +159,8 @@ namespace ANSCENTER {
Ort::GetApi().ReleaseCUDAProviderOptions(cuda_options);
std::cout << "[ORT] CUDA EP attached (arena=SameAsRequested, "
"cudnn=HEURISTIC, maxWorkspace=0, memLimit=2GB)." << std::endl;
"cudnn=HEURISTIC, maxWorkspace=" << maxWorkspace
<< ", memLimit=2GB)." << std::endl;
return true;
}
catch (const Ort::Exception& e) {
@@ -100,6 +168,113 @@ namespace ANSCENTER {
return false;
}
}
// Attempt to attach the TensorRT execution provider to the given
// session options. Returns true on success, false on any failure
// (caller falls back to CUDA/CPU). Never throws.
bool BasicOrtHandler::TryAppendTensorRT(Ort::SessionOptions& session_options)
{
    try {
        // The C API reports failure via OrtStatus*; check it so a
        // failed creation surfaces as an Ort::Exception instead of a
        // null-pointer crash further down.
        OrtTensorRTProviderOptionsV2* trt_options = nullptr;
        Ort::ThrowOnError(Ort::GetApi().CreateTensorRTProviderOptions(&trt_options));

        // Cache built engines on disk so subsequent runs skip the
        // multi-minute build. Engines are keyed on (model hash, GPU
        // arch, shape profile) so changing any of those triggers
        // a rebuild automatically.
        std::string cacheDir = m_handlerOptions.trtEngineCacheDir;
        if (cacheDir.empty()) {
            // Default to %TEMP%\ANSCENTER\TRTEngineCache, falling back
            // to %TMP% and finally the current directory.
            const char* tmp = std::getenv("TEMP");
            if (!tmp) tmp = std::getenv("TMP");
            if (!tmp) tmp = ".";
            std::filesystem::path p(tmp);
            p /= "ANSCENTER";
            p /= "TRTEngineCache";
            std::error_code ec;
            // Best effort — a failure here just means TRT rebuilds the
            // engine each run; it must not abort EP attachment.
            std::filesystem::create_directories(p, ec);
            cacheDir = p.string();
        }

        // Builder options tuned for *fast first-run*:
        //   - opt_level 1: builds in seconds, ~5-10 % runtime cost vs 3
        //   - workspace 1 GB: leaves room for CUDA EP arena and the
        //     LPD's own TRT engine on the same GPU
        //   - timing cache: persists kernel timings between runs so
        //     builds at new shapes get progressively faster. The
        //     timing-cache path is the cache *directory*: ORT derives
        //     a per-GPU file name inside it, so we pass cacheDir (a
        //     hand-built "timing.cache" file path would be wrong).
        //   - profile shapes (if set): build ONE dynamic-shape
        //     engine that handles all (batch, width) combos instead
        //     of rebuilding per unique input. Critical for variable
        //     batch workloads — without this, TRT EP rebuilds every
        //     time runtime sees a new shape pair, causing 60-90 s
        //     hangs mid-stream.
        const bool haveProfile = !m_handlerOptions.trtProfileMinShapes.empty()
            && !m_handlerOptions.trtProfileOptShapes.empty()
            && !m_handlerOptions.trtProfileMaxShapes.empty();

        // Build the key/value arrays. The first 8 keys are always set;
        // the profile shapes are appended only when provided. All
        // pointed-to strings (cacheDir, the option members) outlive the
        // Update call below.
        std::vector<const char*> keys = {
            "device_id",
            "trt_fp16_enable",
            "trt_engine_cache_enable",
            "trt_engine_cache_path",
            "trt_max_workspace_size",
            "trt_builder_optimization_level",
            "trt_timing_cache_enable",
            "trt_timing_cache_path"
        };
        std::vector<const char*> values = {
            "0",
            m_handlerOptions.trtFP16 ? "1" : "0",
            "1",
            cacheDir.c_str(),
            "1073741824",  // 1 GB build workspace
            "1",           // fast build (was "3")
            "1",
            cacheDir.c_str()
        };
        if (haveProfile) {
            keys.push_back("trt_profile_min_shapes");
            values.push_back(m_handlerOptions.trtProfileMinShapes.c_str());
            keys.push_back("trt_profile_opt_shapes");
            values.push_back(m_handlerOptions.trtProfileOptShapes.c_str());
            keys.push_back("trt_profile_max_shapes");
            values.push_back(m_handlerOptions.trtProfileMaxShapes.c_str());
        }

        // Check the update status explicitly: attaching a partially
        // configured EP (silently ignored failure) is worse than the
        // CUDA fallback. Release before throwing so the options object
        // is not leaked on the error path.
        OrtStatus* updateStatus = Ort::GetApi().UpdateTensorRTProviderOptions(
            trt_options, keys.data(), values.data(), keys.size());
        if (updateStatus != nullptr) {
            Ort::GetApi().ReleaseTensorRTProviderOptions(trt_options);
            Ort::ThrowOnError(updateStatus);
        }
        // Append may throw Ort::Exception; the options object would then
        // leak once, which is acceptable on this rare error path.
        session_options.AppendExecutionProvider_TensorRT_V2(*trt_options);
        Ort::GetApi().ReleaseTensorRTProviderOptions(trt_options);

        std::cout << "[ORT] TensorRT EP attached (fp16="
            << (m_handlerOptions.trtFP16 ? "1" : "0")
            << ", cache=" << cacheDir
            << ", profile=" << (haveProfile ? "dynamic" : "static")
            << ")." << std::endl;
        if (haveProfile) {
            std::cout << "[ORT] profile min: "
                << m_handlerOptions.trtProfileMinShapes << std::endl
                << "[ORT] profile opt: "
                << m_handlerOptions.trtProfileOptShapes << std::endl
                << "[ORT] profile max: "
                << m_handlerOptions.trtProfileMaxShapes << std::endl;
        }
        return true;
    }
    catch (const Ort::Exception& e) {
        std::cerr << "[ORT] TensorRT EP failed: " << e.what() << std::endl;
        return false;
    }
    catch (const std::exception& e) {
        std::cerr << "[ORT] TensorRT EP failed (std): " << e.what() << std::endl;
        return false;
    }
}
bool BasicOrtHandler::TryAppendDirectML(Ort::SessionOptions& session_options)
{
try {
@@ -267,9 +442,28 @@ namespace ANSCENTER {
{
// --------------------------------------------------------
case EngineType::NVIDIA_GPU:
// Try TensorRT EP first when explicitly requested. Falls
// through to CUDA EP if TRT is missing or option creation
// fails. Both EPs may be attached at once — ORT picks TRT
// for nodes it supports and CUDA for the rest.
if (m_handlerOptions.preferTensorRT
&& hasProvider("TensorrtExecutionProvider")) {
ANS_DBG("OrtHandler", "Trying TensorRT EP...");
if (TryAppendTensorRT(session_options)) {
epAttached = true;
}
else {
std::cerr << "[ORT] TensorRT EP attach failed — "
"falling back to CUDA EP." << std::endl;
}
}
ANS_DBG("OrtHandler", "Trying CUDA EP...");
if (hasProvider("CUDAExecutionProvider"))
epAttached = TryAppendCUDA(session_options);
if (hasProvider("CUDAExecutionProvider")) {
if (TryAppendCUDA(session_options)) {
epAttached = true;
}
}
if (!epAttached) {
std::cerr << "[ORT] CUDA EP unavailable — falling back to CPU."
<< std::endl;