Improve ALPR_OCR performance
This commit is contained in:
@@ -248,6 +248,46 @@ namespace ANSCENTER {
|
||||
return std::string(ort_session->GetOutputNameAllocated(index, allocator).get());
|
||||
}
|
||||
|
||||
// ====================================================================
// High-perf options for OCR sub-models that need TRT EP and full
// cuDNN workspace. Default-constructed = identical to the legacy
// behavior (CUDA EP only, minimal cuDNN workspace).
// ====================================================================
struct OrtHandlerOptions {
    // Try to attach TensorRT EP before CUDA EP (NVIDIA only).
    // Falls back to CUDA EP automatically if TRT EP creation or session
    // creation fails. Engines are cached on disk for fast reload.
    bool preferTensorRT = false;

    // Use the largest cuDNN conv workspace. cuDNN can then pick fast
    // algorithms (Winograd, implicit-precomp-GEMM with big workspaces).
    // Defaults off because some deployments share VRAM with TRT engines
    // and need the minimal-workspace mode to avoid OOM.
    bool useMaxCudnnWorkspace = false;

    // Where to cache built TRT engines. Empty → default
    // %TEMP%/ANSCENTER/TRTEngineCache. Only used when preferTensorRT.
    std::string trtEngineCacheDir;

    // FP16 builds for TRT EP. Recommended for inference; ignored if
    // preferTensorRT is false.
    bool trtFP16 = true;

    // Dynamic-shape profile for TRT EP. When set, TRT builds ONE
    // engine that handles every input shape in the [min..max] range
    // instead of rebuilding per unique shape. Critical for models
    // that see many (batch_size, spatial) combinations at runtime.
    //
    // Format: "input_name:d0xd1xd2xd3[,input2:...]"
    // e.g. "x:1x3x48x320" for batch=1, C=3, H=48, W=320
    //
    // All three fields must be set together. An empty min implies
    // no profile (fall back to static-shape-per-unique-input mode).
    std::string trtProfileMinShapes;
    std::string trtProfileOptShapes;
    std::string trtProfileMaxShapes;
};
|
||||
|
||||
// ====================================================================
|
||||
// BasicOrtHandler
|
||||
// ====================================================================
|
||||
@@ -280,6 +320,9 @@ namespace ANSCENTER {
|
||||
const unsigned int num_threads;
|
||||
EngineType m_engineType;
|
||||
|
||||
// Per-session high-perf options. Default = legacy behavior.
|
||||
OrtHandlerOptions m_handlerOptions;
|
||||
|
||||
protected:
|
||||
// Default: hardware auto-detection via ANSLicenseHelper through EPLoader
|
||||
explicit BasicOrtHandler(const std::string& _onnx_path,
|
||||
@@ -290,6 +333,19 @@ namespace ANSCENTER {
|
||||
EngineType engineType,
|
||||
unsigned int _num_threads = 1);
|
||||
|
||||
// Engine override + per-session high-perf options (TRT EP, max
|
||||
// cuDNN workspace, etc.). Used by OCR sub-models that need
|
||||
// shape-stable, high-throughput inference.
|
||||
explicit BasicOrtHandler(const std::string& _onnx_path,
|
||||
EngineType engineType,
|
||||
const OrtHandlerOptions& options,
|
||||
unsigned int _num_threads = 1);
|
||||
|
||||
// Auto-detect engine via EPLoader, but with high-perf options.
|
||||
explicit BasicOrtHandler(const std::string& _onnx_path,
|
||||
const OrtHandlerOptions& options,
|
||||
unsigned int _num_threads = 1);
|
||||
|
||||
virtual ~BasicOrtHandler();
|
||||
|
||||
BasicOrtHandler(const BasicOrtHandler&) = delete;
|
||||
@@ -298,6 +354,13 @@ namespace ANSCENTER {
|
||||
// Resolved EP type (after EPLoader fallback). Subclasses use this
|
||||
// to branch on actual EP at inference time.
|
||||
EngineType getEngineType() const { return m_engineType; }
|
||||
|
||||
// Spin up a tiny CPU-only ORT session just long enough to read
|
||||
// the name of the model's first input, then tear it down. Used
|
||||
// by callers that need to build TRT profile-shape strings
|
||||
// (which require the input name) BEFORE the real session is
|
||||
// created. Returns an empty string on failure.
|
||||
static std::string QueryModelInputName(const std::string& onnxPath);
|
||||
private:
|
||||
void initialize_handler();
|
||||
protected:
|
||||
@@ -306,6 +369,7 @@ namespace ANSCENTER {
|
||||
|
||||
// EP-specific session option builders
|
||||
bool TryAppendCUDA(Ort::SessionOptions& opts);
|
||||
bool TryAppendTensorRT(Ort::SessionOptions& opts);
|
||||
bool TryAppendDirectML(Ort::SessionOptions& opts);
|
||||
bool TryAppendOpenVINO(Ort::SessionOptions& opts);
|
||||
};
|
||||
|
||||
Reference in New Issue
Block a user