Improve ALPR_OCR performance
This commit is contained in:
@@ -11,13 +11,75 @@ namespace onnxocr {
|
||||
bool PaddleOCRV5Engine::Initialize(const std::string& detModelPath,
|
||||
const std::string& clsModelPath,
|
||||
const std::string& recModelPath,
|
||||
const std::string& dictPath) {
|
||||
const std::string& dictPath,
|
||||
bool preferTensorRT) {
|
||||
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
||||
ModelLoadingGuard mlg(_modelLoading);
|
||||
|
||||
// High-perf options. The OCR sub-models split into two groups:
|
||||
//
|
||||
// 1. Detector — its input shape varies continuously with every
|
||||
// plate-ROI aspect ratio. TRT EP is a poor fit because it
|
||||
// builds a fresh engine for each unique shape (minutes each).
|
||||
//      We keep it on CUDA EP with a conservative cuDNN workspace and
|
||||
// let cuDNN HEURISTIC handle the per-shape algo selection.
|
||||
//
|
||||
// 2. Classifier + Recognizer — fixed-bucket shapes (cls is
|
||||
// [1,3,80,160], rec is [1,3,48,{320,480,640,960}]). These
|
||||
// benefit massively from TRT EP because the engine is built
|
||||
// once per shape and reused forever.
|
||||
OrtHandlerOptions detectorOpts;
|
||||
// Detector uses CUDA EP with *conservative* cuDNN workspace.
|
||||
// Empirical: on VRAM-constrained GPUs (LPD TRT engine + rec TRT
|
||||
// engine + ORT arena in play) the max-workspace mode causes cuDNN
|
||||
// to pick Winograd/implicit-precomp-GEMM variants that silently
|
||||
// fall back to slow NO-WORKSPACE algorithms when the big workspace
|
||||
// can't be allocated. With "0" cuDNN picks algorithms that are
|
||||
// known to fit and runs ~10x faster in practice.
|
||||
detectorOpts.useMaxCudnnWorkspace = false;
|
||||
detectorOpts.preferTensorRT = false; // never TRT for the detector
|
||||
|
||||
// Classifier (fixed [1,3,80,160]): TRT with no profile is fine.
|
||||
OrtHandlerOptions classifierOpts;
|
||||
classifierOpts.useMaxCudnnWorkspace = true;
|
||||
classifierOpts.preferTensorRT = preferTensorRT;
|
||||
classifierOpts.trtFP16 = true;
|
||||
|
||||
// Recognizer: needs a DYNAMIC profile so one TRT engine covers every
|
||||
// (batch, bucket_width) pair we generate at runtime. Without this,
|
||||
// each new shape triggers a ~80s engine rebuild mid-stream when a
|
||||
// new plate appears or the plate count changes.
|
||||
//
|
||||
// Profile range:
|
||||
// batch : 1 .. 16 (16 plates worth of crops is generous)
|
||||
// H : 48 (fixed)
|
||||
// W : 320 .. 960 (covers all 4 recognizer buckets)
|
||||
//
|
||||
// Query the actual input name from the .onnx file instead of
|
||||
// hardcoding — PaddleOCR usually exports it as "x" but the name can
|
||||
// vary across model versions.
|
||||
OrtHandlerOptions recognizerOpts;
|
||||
recognizerOpts.useMaxCudnnWorkspace = true;
|
||||
recognizerOpts.preferTensorRT = preferTensorRT;
|
||||
recognizerOpts.trtFP16 = true;
|
||||
if (preferTensorRT) {
|
||||
std::string recInputName = BasicOrtHandler::QueryModelInputName(recModelPath);
|
||||
if (recInputName.empty()) {
|
||||
std::cerr << "[PaddleOCRV5Engine] Could not query recognizer "
|
||||
"input name — defaulting to 'x'" << std::endl;
|
||||
recInputName = "x";
|
||||
}
|
||||
std::cout << "[PaddleOCRV5Engine] Recognizer input name: '"
|
||||
<< recInputName << "' — building TRT dynamic profile "
|
||||
<< "[batch=1..16, W=320..960]" << std::endl;
|
||||
recognizerOpts.trtProfileMinShapes = recInputName + ":1x3x48x320";
|
||||
recognizerOpts.trtProfileOptShapes = recInputName + ":4x3x48x480";
|
||||
recognizerOpts.trtProfileMaxShapes = recInputName + ":16x3x48x960";
|
||||
}
|
||||
|
||||
try {
|
||||
// Initialize detector (also triggers EPLoader init in BasicOrtHandler)
|
||||
detector_ = std::make_unique<ONNXOCRDetector>(detModelPath);
|
||||
detector_ = std::make_unique<ONNXOCRDetector>(detModelPath, detectorOpts);
|
||||
std::cout << "[PaddleOCRV5Engine] Detector initialized: " << detModelPath << std::endl;
|
||||
|
||||
// Ensure this DLL's copy of Ort::Global<void>::api_ is initialized.
|
||||
@@ -29,7 +91,7 @@ bool PaddleOCRV5Engine::Initialize(const std::string& detModelPath,
|
||||
|
||||
// Initialize classifier (optional)
|
||||
if (!clsModelPath.empty()) {
|
||||
classifier_ = std::make_unique<ONNXOCRClassifier>(clsModelPath);
|
||||
classifier_ = std::make_unique<ONNXOCRClassifier>(clsModelPath, classifierOpts);
|
||||
std::cout << "[PaddleOCRV5Engine] Classifier initialized: " << clsModelPath << std::endl;
|
||||
}
|
||||
else {
|
||||
@@ -38,13 +100,26 @@ bool PaddleOCRV5Engine::Initialize(const std::string& detModelPath,
|
||||
}
|
||||
|
||||
// Initialize recognizer
|
||||
recognizer_ = std::make_unique<ONNXOCRRecognizer>(recModelPath);
|
||||
recognizer_ = std::make_unique<ONNXOCRRecognizer>(recModelPath, recognizerOpts);
|
||||
if (!recognizer_->LoadDictionary(dictPath)) {
|
||||
std::cerr << "[PaddleOCRV5Engine] Failed to load dictionary" << std::endl;
|
||||
return false;
|
||||
}
|
||||
std::cout << "[PaddleOCRV5Engine] Recognizer initialized: " << recModelPath << std::endl;
|
||||
|
||||
// Pre-warm classifier (fixed [1,3,80,160]) and recognizer (4
|
||||
// bucket widths) so the first frame doesn't pay the cuDNN/TRT
|
||||
// algorithm-selection tax. The detector is intentionally NOT
|
||||
// warmed up: its input shape varies continuously with each
|
||||
// plate-ROI aspect ratio, so a warmup at any single canonical
|
||||
// shape would cost minutes (TRT) or be useless (CUDA cache miss
|
||||
// on the real frame anyway). Real frames will pay the per-shape
|
||||
// cuDNN HEURISTIC cost on first use.
|
||||
std::cout << "[PaddleOCRV5Engine] Warming up OCR pipeline..." << std::endl;
|
||||
if (classifier_) classifier_->Warmup();
|
||||
if (recognizer_) recognizer_->Warmup();
|
||||
std::cout << "[PaddleOCRV5Engine] Warmup complete" << std::endl;
|
||||
|
||||
_initialized = true;
|
||||
std::cout << "[PaddleOCRV5Engine] Pipeline initialized successfully" << std::endl;
|
||||
return true;
|
||||
@@ -140,5 +215,18 @@ TextLine PaddleOCRV5Engine::recognizeOnly(const cv::Mat& croppedImage) {
|
||||
return recognizer_->Recognize(croppedImage);
|
||||
}
|
||||
|
||||
/// Recognizes text in a batch of already-cropped images.
///
/// @param croppedImages  Crops to run through the recognizer.
/// @return One TextLine per input crop. When the engine cannot run (a model
///         load is in progress, the lock could not be acquired, the engine is
///         not initialized, there is no recognizer, or the input is empty)
///         the result is a vector of value-initialized TextLine entries with
///         the same size as the input.
std::vector<TextLine> PaddleOCRV5Engine::recognizeMany(const std::vector<cv::Mat>& croppedImages) {
    // Placeholder result used by every bail-out path: same length as the
    // input, each element value-initialized.
    const auto emptyResults = [&croppedImages] {
        return std::vector<TextLine>(croppedImages.size());
    };

    // Cheap check first: skip the lock entirely while a model load is flagged.
    if (_modelLoading.load()) {
        return emptyResults();
    }

    bool canRun = false;
    {
        // The lock only covers the readiness checks; it is released at the
        // end of this scope, before the recognizer is invoked.
        // NOTE(review): presumably intentional so inference does not run
        // under the engine lock — confirm against the other entry points.
        auto guard = TryLockWithTimeout("PaddleOCRV5Engine::recognizeMany");
        canRun = guard.owns_lock()
              && _initialized
              && recognizer_
              && !croppedImages.empty();
    }

    if (!canRun) {
        return emptyResults();
    }

    // Delegates to the bucketed, batched path in ONNXOCRRecognizer.
    return recognizer_->RecognizeBatch(croppedImages);
}
|
||||
|
||||
} // namespace onnxocr
|
||||
} // namespace ANSCENTER
|
||||
|
||||
Reference in New Issue
Block a user