Support tracker to improve ALPR_OCR
This commit is contained in:
@@ -8,60 +8,62 @@
|
||||
namespace ANSCENTER {
|
||||
namespace onnxocr {
|
||||
|
||||
bool PaddleOCRV5Engine::Initialize(const std::string& detModelPath,
|
||||
const std::string& clsModelPath,
|
||||
const std::string& recModelPath,
|
||||
const std::string& dictPath,
|
||||
bool preferTensorRT) {
|
||||
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
||||
ModelLoadingGuard mlg(_modelLoading);
|
||||
// ============================================================================
|
||||
// Per-backend OCR option builders
|
||||
//
|
||||
// Each backend (NVIDIA / AMD / Intel / CPU) has its own helper that returns
|
||||
// a fully-populated set of OrtHandlerOptions for the detector, classifier,
|
||||
// and recognizer sub-models. PaddleOCRV5Engine::Initialize dispatches to the
|
||||
// correct helper based on the engine type that EPLoader resolved at startup.
|
||||
//
|
||||
// Adding a new backend optimization is a strictly contained change: touch
|
||||
// only that backend's builder. The others — especially NVIDIA, which is
|
||||
// hand-tuned and should not regress — stay untouched.
|
||||
// ============================================================================
|
||||
|
||||
// High-perf options. The OCR sub-models split into two groups:
|
||||
//
|
||||
// 1. Detector — its input shape varies continuously with every
|
||||
// plate-ROI aspect ratio. TRT EP is a poor fit because it
|
||||
// builds a fresh engine for each unique shape (minutes each).
|
||||
// We keep it on CUDA EP with the largest cuDNN workspace and
|
||||
// let cuDNN HEURISTIC handle the per-shape algo selection.
|
||||
//
|
||||
// 2. Classifier + Recognizer — fixed-bucket shapes (cls is
|
||||
// [1,3,80,160], rec is [1,3,48,{320,480,640,960}]). These
|
||||
// benefit massively from TRT EP because the engine is built
|
||||
// once per shape and reused forever.
|
||||
namespace {
|
||||
|
||||
struct PerModelOcrOptions {
|
||||
OrtHandlerOptions detectorOpts;
|
||||
// Detector uses CUDA EP with *conservative* cuDNN workspace.
|
||||
// Empirical: on VRAM-constrained GPUs (LPD TRT engine + rec TRT
|
||||
// engine + ORT arena in play) the max-workspace mode causes cuDNN
|
||||
// to pick Winograd/implicit-precomp-GEMM variants that silently
|
||||
// fall back to slow NO-WORKSPACE algorithms when the big workspace
|
||||
// can't be allocated. With "0" cuDNN picks algorithms that are
|
||||
// known to fit and runs ~10x faster in practice.
|
||||
detectorOpts.useMaxCudnnWorkspace = false;
|
||||
detectorOpts.preferTensorRT = false; // never TRT for the detector
|
||||
|
||||
// Classifier (fixed [1,3,80,160]): TRT with no profile is fine.
|
||||
OrtHandlerOptions classifierOpts;
|
||||
classifierOpts.useMaxCudnnWorkspace = true;
|
||||
classifierOpts.preferTensorRT = preferTensorRT;
|
||||
classifierOpts.trtFP16 = true;
|
||||
|
||||
// Recognizer: needs a DYNAMIC profile so one TRT engine covers every
|
||||
// (batch, bucket_width) pair we generate at runtime. Without this,
|
||||
// each new shape triggers a ~80s engine rebuild mid-stream when a
|
||||
// new plate appears or the plate count changes.
|
||||
//
|
||||
// Profile range:
|
||||
// batch : 1 .. 16 (16 plates worth of crops is generous)
|
||||
// H : 48 (fixed)
|
||||
// W : 320 .. 960 (covers all 4 recognizer buckets)
|
||||
//
|
||||
// Query the actual input name from the .onnx file instead of
|
||||
// hardcoding — PaddleOCR usually exports it as "x" but the name can
|
||||
// vary across model versions.
|
||||
OrtHandlerOptions recognizerOpts;
|
||||
recognizerOpts.useMaxCudnnWorkspace = true;
|
||||
recognizerOpts.preferTensorRT = preferTensorRT;
|
||||
recognizerOpts.trtFP16 = true;
|
||||
};
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
// NVIDIA — LOCKED. Do NOT modify this helper unless fixing a specific
|
||||
// NVIDIA-observable regression.
|
||||
//
|
||||
// The OCR sub-models split into two groups:
|
||||
// 1. Detector — variable input shape per plate-ROI aspect. TRT EP is a
|
||||
// poor fit (one engine build per unique shape, minutes each). Runs on
|
||||
// CUDA EP with *conservative* cuDNN workspace: empirical measurements
|
||||
// showed that max-workspace mode forces cuDNN to pick Winograd/
|
||||
// implicit-precomp-GEMM variants that silently fall back to slow
|
||||
// NO-WORKSPACE algorithms when the big workspace can't be allocated
|
||||
// under VRAM pressure (LPD TRT engine + rec TRT engine + ORT arena).
|
||||
// 2. Classifier + Recognizer — TRT EP. Classifier has fixed shape so no
|
||||
// profile is needed. Recognizer gets a dynamic profile
|
||||
// [batch=1..16, W=320..960] so a single pre-built engine handles every
|
||||
// runtime shape without mid-stream rebuilds (fixes 60–90 s hangs).
|
||||
// ----------------------------------------------------------------------------
|
||||
static PerModelOcrOptions BuildNvidiaOcrOptions(
|
||||
const std::string& recModelPath,
|
||||
bool preferTensorRT) {
|
||||
PerModelOcrOptions opts;
|
||||
|
||||
// Detector: CUDA EP, conservative workspace, never TRT.
|
||||
opts.detectorOpts.useMaxCudnnWorkspace = false;
|
||||
opts.detectorOpts.preferTensorRT = false;
|
||||
|
||||
// Classifier: TRT EP, no profile (fixed [1,3,80,160]).
|
||||
opts.classifierOpts.useMaxCudnnWorkspace = true;
|
||||
opts.classifierOpts.preferTensorRT = preferTensorRT;
|
||||
opts.classifierOpts.trtFP16 = true;
|
||||
|
||||
// Recognizer: TRT EP with dynamic shape profile.
|
||||
opts.recognizerOpts.useMaxCudnnWorkspace = true;
|
||||
opts.recognizerOpts.preferTensorRT = preferTensorRT;
|
||||
opts.recognizerOpts.trtFP16 = true;
|
||||
if (preferTensorRT) {
|
||||
std::string recInputName = BasicOrtHandler::QueryModelInputName(recModelPath);
|
||||
if (recInputName.empty()) {
|
||||
@@ -72,10 +74,80 @@ bool PaddleOCRV5Engine::Initialize(const std::string& detModelPath,
|
||||
std::cout << "[PaddleOCRV5Engine] Recognizer input name: '"
|
||||
<< recInputName << "' — building TRT dynamic profile "
|
||||
<< "[batch=1..16, W=320..960]" << std::endl;
|
||||
recognizerOpts.trtProfileMinShapes = recInputName + ":1x3x48x320";
|
||||
recognizerOpts.trtProfileOptShapes = recInputName + ":4x3x48x480";
|
||||
recognizerOpts.trtProfileMaxShapes = recInputName + ":16x3x48x960";
|
||||
opts.recognizerOpts.trtProfileMinShapes = recInputName + ":1x3x48x320";
|
||||
opts.recognizerOpts.trtProfileOptShapes = recInputName + ":4x3x48x480";
|
||||
opts.recognizerOpts.trtProfileMaxShapes = recInputName + ":16x3x48x960";
|
||||
}
|
||||
return opts;
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
// Intel (OpenVINO EP) — placeholder.
|
||||
//
|
||||
// Returns default-constructed options: no backend-specific tuning applied
|
||||
// yet. When adding Intel optimizations (OpenVINO cache_dir, explicit device
|
||||
// selection, INT8 paths, etc.), add the corresponding fields to the Intel
|
||||
// section of OrtHandlerOptions and populate them here.
|
||||
// ----------------------------------------------------------------------------
|
||||
static PerModelOcrOptions BuildIntelOcrOptions() {
|
||||
return PerModelOcrOptions{}; // defaults everywhere
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
// AMD (DirectML EP / MIGraphX EP) — placeholder.
|
||||
//
|
||||
// Returns default-constructed options: no backend-specific tuning applied
|
||||
// yet. When adding AMD optimizations (graph opt gate for RDNA3+ desktop
|
||||
// cards, MIGraphX cache on Linux, etc.), add the corresponding fields to
|
||||
// the AMD section of OrtHandlerOptions and populate them here.
|
||||
// ----------------------------------------------------------------------------
|
||||
static PerModelOcrOptions BuildAmdOcrOptions() {
|
||||
return PerModelOcrOptions{}; // defaults everywhere
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
// CPU / unknown hardware — no tuning.
|
||||
// ----------------------------------------------------------------------------
|
||||
static PerModelOcrOptions BuildDefaultOcrOptions() {
|
||||
return PerModelOcrOptions{}; // defaults everywhere
|
||||
}
|
||||
|
||||
// Dispatch entry point used by Initialize().
|
||||
static PerModelOcrOptions BuildOcrOptionsForBackend(
|
||||
const std::string& recModelPath,
|
||||
bool preferTensorRT) {
|
||||
const EngineType backend = EPLoader::Current().type;
|
||||
switch (backend) {
|
||||
case EngineType::NVIDIA_GPU:
|
||||
return BuildNvidiaOcrOptions(recModelPath, preferTensorRT);
|
||||
case EngineType::AMD_GPU:
|
||||
return BuildAmdOcrOptions();
|
||||
case EngineType::OPENVINO_GPU:
|
||||
return BuildIntelOcrOptions();
|
||||
case EngineType::CPU:
|
||||
default:
|
||||
return BuildDefaultOcrOptions();
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace (anonymous)
|
||||
|
||||
bool PaddleOCRV5Engine::Initialize(const std::string& detModelPath,
|
||||
const std::string& clsModelPath,
|
||||
const std::string& recModelPath,
|
||||
const std::string& dictPath,
|
||||
bool preferTensorRT) {
|
||||
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
||||
ModelLoadingGuard mlg(_modelLoading);
|
||||
|
||||
// Dispatch to the correct per-backend option builder. The NVIDIA path
|
||||
// is fully locked-in; AMD/Intel/CPU paths currently return defaults
|
||||
// and are the place to add future backend-specific tuning.
|
||||
const PerModelOcrOptions opts =
|
||||
BuildOcrOptionsForBackend(recModelPath, preferTensorRT);
|
||||
const OrtHandlerOptions& detectorOpts = opts.detectorOpts;
|
||||
const OrtHandlerOptions& classifierOpts = opts.classifierOpts;
|
||||
const OrtHandlerOptions& recognizerOpts = opts.recognizerOpts;
|
||||
|
||||
try {
|
||||
// Initialize detector (also triggers EPLoader init in BasicOrtHandler)
|
||||
|
||||
Reference in New Issue
Block a user