Files
ANSCORE/modules/ANSOCR/ANSONNXOCR/PaddleOCRV5Engine.cpp
Tuan Nghia Nguyen 7778f8c214 Two-part fix
Fix 1 — Chunk oversized bucket groups (the correctness fix)
ONNXOCRRecognizer::RecognizeBatch now slices each bucket group into chunks of ≤ kRecMaxBatch before submitting to TRT. A frame with 30 crops in bucket 320 produces two back-to-back batched calls (24 + 6), both within the profile, both on the fast path.

Fix 2 — Raise the profile max from 16 to 24 (the performance fix)
The old profile max was 16; your real scenes routinely hit 24. Raising the profile max to 24 means the common 12-plate scene (24 crops) fits in a single batched call with no chunking needed. Scenes with > 24 crops now use chunking, but that's rare.
2026-04-15 07:27:55 +10:00

310 lines
13 KiB
C++
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#include "PaddleOCRV5Engine.h"
#include "EPLoader.h"
#include <opencv2/imgproc.hpp>
#include <iostream>
#include <algorithm>
namespace ANSCENTER {
namespace onnxocr {
// ============================================================================
// Per-backend OCR option builders
//
// Each backend (NVIDIA / AMD / Intel / CPU) has its own helper that returns
// a fully-populated set of OrtHandlerOptions for the detector, classifier,
// and recognizer sub-models. PaddleOCRV5Engine::Initialize dispatches to the
// correct helper based on the engine type that EPLoader resolved at startup.
//
// Adding a new backend optimization is a strictly contained change: touch
// only that backend's builder. The others — especially NVIDIA, which is
// hand-tuned and should not regress — stay untouched.
// ============================================================================
namespace {
// Bundle of ONNX Runtime session options, one per OCR sub-model. Produced
// by the per-backend builder helpers below and consumed by Initialize(),
// which hands each member to the matching sub-model constructor.
struct PerModelOcrOptions {
    OrtHandlerOptions detectorOpts;    // text detector session options
    OrtHandlerOptions classifierOpts;  // orientation classifier session options
    OrtHandlerOptions recognizerOpts;  // text recognizer session options
};
// ----------------------------------------------------------------------------
// NVIDIA — LOCKED. Do NOT modify this helper unless fixing a specific
// NVIDIA-observable regression.
//
// The OCR sub-models split into two groups:
//   1. Detector — variable input shape per plate-ROI aspect. TRT EP is a
//      poor fit (one engine build per unique shape, minutes each). Runs on
//      CUDA EP with *conservative* cuDNN workspace: empirical measurements
//      showed that max-workspace mode forces cuDNN to pick Winograd/
//      implicit-precomp-GEMM variants that silently fall back to slow
//      NO-WORKSPACE algorithms when the big workspace can't be allocated
//      under VRAM pressure (LPD TRT engine + rec TRT engine + ORT arena).
//   2. Classifier + Recognizer — TRT EP. Classifier has fixed shape so no
//      profile is needed. Recognizer gets a dynamic profile
//      [batch=1..kRecMaxBatch, W=320..960] so a single pre-built engine
//      handles every runtime shape without mid-stream rebuilds (fixes
//      60-90 s hangs).
//
// @param recModelPath   path to the recognizer ONNX model; used to query
//                       the model's input tensor name for the TRT profile.
// @param preferTensorRT when true, classifier/recognizer request the TRT EP
//                       and the recognizer gets the dynamic shape profile.
// @return fully-populated option set for all three sub-models.
// ----------------------------------------------------------------------------
static PerModelOcrOptions BuildNvidiaOcrOptions(
    const std::string& recModelPath,
    bool preferTensorRT) {
    PerModelOcrOptions opts;
    // Detector: CUDA EP, conservative workspace, never TRT (see banner).
    opts.detectorOpts.useMaxCudnnWorkspace = false;
    opts.detectorOpts.preferTensorRT = false;
    // Classifier: TRT EP, no profile (fixed [1,3,80,160]).
    opts.classifierOpts.useMaxCudnnWorkspace = true;
    opts.classifierOpts.preferTensorRT = preferTensorRT;
    opts.classifierOpts.trtFP16 = true;
    // Recognizer: TRT EP with dynamic shape profile. The max-batch
    // dimension is kRecMaxBatch (defined in ONNXOCRTypes.h) — the same
    // constant that ONNXOCRRecognizer::RecognizeBatch uses to chunk
    // oversized bucket groups. Keeping them in lockstep ensures the
    // recognizer never submits a shape that falls outside the TRT profile.
    opts.recognizerOpts.useMaxCudnnWorkspace = true;
    opts.recognizerOpts.preferTensorRT = preferTensorRT;
    opts.recognizerOpts.trtFP16 = true;
    if (preferTensorRT) {
        // The profile strings must name the model's actual input tensor;
        // fall back to PaddleOCR's conventional name 'x' if the query fails.
        std::string recInputName = BasicOrtHandler::QueryModelInputName(recModelPath);
        if (recInputName.empty()) {
            std::cerr << "[PaddleOCRV5Engine] Could not query recognizer "
                "input name — defaulting to 'x'" << std::endl;
            recInputName = "x";
        }
        const std::string maxB = std::to_string(kRecMaxBatch);
        std::cout << "[PaddleOCRV5Engine] Recognizer input name: '"
            << recInputName << "' — building TRT dynamic profile "
            << "[batch=1.." << maxB << ", W=320..960]" << std::endl;
        // min/opt/max shapes: NxCxHxW. H is fixed at 48 (PaddleOCR rec
        // input height); W spans the four bucket widths 320..960.
        opts.recognizerOpts.trtProfileMinShapes = recInputName + ":1x3x48x320";
        opts.recognizerOpts.trtProfileOptShapes = recInputName + ":4x3x48x480";
        opts.recognizerOpts.trtProfileMaxShapes = recInputName + ":" + maxB + "x3x48x960";
    }
    return opts;
}
// ----------------------------------------------------------------------------
// Intel (OpenVINO EP) — placeholder, no tuning applied yet.
//
// When Intel-specific optimizations land (OpenVINO cache_dir, explicit
// device selection, INT8 paths, etc.), extend the Intel section of
// OrtHandlerOptions and populate those fields here. Until then every
// sub-model simply receives default-constructed options.
// ----------------------------------------------------------------------------
static PerModelOcrOptions BuildIntelOcrOptions() {
    PerModelOcrOptions defaults{};
    return defaults;
}
// ----------------------------------------------------------------------------
// AMD (DirectML EP / MIGraphX EP) — placeholder, no tuning applied yet.
//
// When AMD-specific optimizations land (graph opt gate for RDNA3+ desktop
// cards, MIGraphX cache on Linux, etc.), extend the AMD section of
// OrtHandlerOptions and populate those fields here. Until then every
// sub-model simply receives default-constructed options.
// ----------------------------------------------------------------------------
static PerModelOcrOptions BuildAmdOcrOptions() {
    PerModelOcrOptions defaults{};
    return defaults;
}
// ----------------------------------------------------------------------------
// CPU / unknown hardware — no tuning; default options for every sub-model.
// ----------------------------------------------------------------------------
static PerModelOcrOptions BuildDefaultOcrOptions() {
    PerModelOcrOptions defaults{};
    return defaults;
}
// Dispatch entry point used by Initialize(): routes to the builder that
// matches the engine type EPLoader resolved at startup. Anything that is
// not an NVIDIA / AMD / Intel GPU falls through to the untuned defaults.
static PerModelOcrOptions BuildOcrOptionsForBackend(
    const std::string& recModelPath,
    bool preferTensorRT) {
    const EngineType backend = EPLoader::Current().type;
    if (backend == EngineType::NVIDIA_GPU) {
        return BuildNvidiaOcrOptions(recModelPath, preferTensorRT);
    }
    if (backend == EngineType::AMD_GPU) {
        return BuildAmdOcrOptions();
    }
    if (backend == EngineType::OPENVINO_GPU) {
        return BuildIntelOcrOptions();
    }
    // EngineType::CPU and any unrecognized backend.
    return BuildDefaultOcrOptions();
}
} // namespace (anonymous)
// ----------------------------------------------------------------------------
// Initialize — constructs the detector / classifier / recognizer sessions,
// loads the character dictionary, and pre-warms the fixed-shape models.
//
// @param detModelPath   detector ONNX model path (required).
// @param clsModelPath   orientation classifier ONNX model path; an empty
//                       string skips the classifier entirely.
// @param recModelPath   recognizer ONNX model path (required).
// @param dictPath       character dictionary for the recognizer.
// @param preferTensorRT forwarded to the per-backend option builder (only
//                       the NVIDIA path consults it).
// @return true when all sub-models initialized and warmup completed; false
//         on any failure, with all partially-constructed state torn down.
//
// Serialized by _mutex; ModelLoadingGuard presumably flags _modelLoading
// for the duration so the inference entry points bail out early — confirm
// against its definition. NOTE: construction order matters below and must
// not be changed.
// ----------------------------------------------------------------------------
bool PaddleOCRV5Engine::Initialize(const std::string& detModelPath,
                                   const std::string& clsModelPath,
                                   const std::string& recModelPath,
                                   const std::string& dictPath,
                                   bool preferTensorRT) {
    std::lock_guard<std::recursive_mutex> lock(_mutex);
    ModelLoadingGuard mlg(_modelLoading);
    // Dispatch to the correct per-backend option builder. The NVIDIA path
    // is fully locked-in; AMD/Intel/CPU paths currently return defaults
    // and are the place to add future backend-specific tuning.
    const PerModelOcrOptions opts =
        BuildOcrOptionsForBackend(recModelPath, preferTensorRT);
    const OrtHandlerOptions& detectorOpts = opts.detectorOpts;
    const OrtHandlerOptions& classifierOpts = opts.classifierOpts;
    const OrtHandlerOptions& recognizerOpts = opts.recognizerOpts;
    try {
        // Initialize detector FIRST (also triggers EPLoader init in
        // BasicOrtHandler) — later steps depend on that side effect.
        detector_ = std::make_unique<ONNXOCRDetector>(detModelPath, detectorOpts);
        std::cout << "[PaddleOCRV5Engine] Detector initialized: " << detModelPath << std::endl;
        // Ensure this DLL's copy of Ort::Global<void>::api_ is initialized.
        // BasicOrtHandler sets it in ONNXEngine.dll, but each DLL has its own
        // inline-static copy. Without this, inference calls from ANSOCR.dll crash.
        if (Ort::Global<void>::api_ == nullptr) {
            Ort::InitApi(static_cast<const OrtApi*>(EPLoader::GetOrtApiRaw()));
        }
        // Initialize classifier (optional — empty path means "no classifier",
        // and ocr() skips the orientation pass entirely).
        if (!clsModelPath.empty()) {
            classifier_ = std::make_unique<ONNXOCRClassifier>(clsModelPath, classifierOpts);
            std::cout << "[PaddleOCRV5Engine] Classifier initialized: " << clsModelPath << std::endl;
        }
        else {
            classifier_.reset();
            std::cout << "[PaddleOCRV5Engine] Classifier skipped (no model path)" << std::endl;
        }
        // Initialize recognizer; a missing/bad dictionary is a hard failure
        // (recognition output would be garbage without it).
        recognizer_ = std::make_unique<ONNXOCRRecognizer>(recModelPath, recognizerOpts);
        if (!recognizer_->LoadDictionary(dictPath)) {
            std::cerr << "[PaddleOCRV5Engine] Failed to load dictionary" << std::endl;
            return false;
        }
        std::cout << "[PaddleOCRV5Engine] Recognizer initialized: " << recModelPath << std::endl;
        // Pre-warm classifier (fixed [1,3,80,160]) and recognizer (4
        // bucket widths) so the first frame doesn't pay the cuDNN/TRT
        // algorithm-selection tax. The detector is intentionally NOT
        // warmed up: its input shape varies continuously with each
        // plate-ROI aspect ratio, so a warmup at any single canonical
        // shape would cost minutes (TRT) or be useless (CUDA cache miss
        // on the real frame anyway). Real frames will pay the per-shape
        // cuDNN HEURISTIC cost on first use.
        std::cout << "[PaddleOCRV5Engine] Warming up OCR pipeline..." << std::endl;
        if (classifier_) classifier_->Warmup();
        if (recognizer_) recognizer_->Warmup();
        std::cout << "[PaddleOCRV5Engine] Warmup complete" << std::endl;
        _initialized = true;
        std::cout << "[PaddleOCRV5Engine] Pipeline initialized successfully" << std::endl;
        return true;
    }
    catch (const std::exception& e) {
        // Any sub-model constructor may throw; tear everything down so the
        // engine is left in a consistent "not initialized" state.
        std::cerr << "[PaddleOCRV5Engine] Initialization failed: " << e.what() << std::endl;
        detector_.reset();
        classifier_.reset();
        recognizer_.reset();
        _initialized = false;
        return false;
    }
}
// Full detect → (optional) classify → recognize pipeline for one frame.
//
// @param img input image (BGR). Empty input yields an empty result.
// @return one OCRPredictResult per detection box that produced a usable
//         crop; empty when the model is (re)loading, the engine lock can't
//         be acquired in time, the engine is uninitialized, or nothing is
//         detected.
std::vector<OCRPredictResult> PaddleOCRV5Engine::ocr(const cv::Mat& img) {
    if (_modelLoading.load()) return {};
    std::vector<OCRPredictResult> results;
    {
        auto lk = TryLockWithTimeout("PaddleOCRV5Engine::ocr");
        if (!lk.owns_lock()) return results;
        if (!_initialized || img.empty()) return results;
    }
    // _mutex released — heavy pipeline runs lock-free

    // Step 1: Text Detection
    auto boxes = detector_->Detect(img, _maxSideLen, _detDbThresh, _detBoxThresh,
                                   _detUnclipRatio, _useDilation);
    if (boxes.empty()) {
        return results;
    }

    // Step 2: Crop detected text regions. GetRotateCropImage can return an
    // empty Mat (e.g. a degenerate quad), and such crops are dropped — so
    // record which box each KEPT crop came from. Joining through this index
    // keeps text lines paired with their true boxes; a positional join
    // against `boxes` would shift every result after a dropped crop onto
    // the wrong box.
    std::vector<cv::Mat> croppedImages;
    std::vector<size_t> cropBoxIdx;  // croppedImages[i] was cut from boxes[cropBoxIdx[i]]
    croppedImages.reserve(boxes.size());
    cropBoxIdx.reserve(boxes.size());
    for (size_t b = 0; b < boxes.size(); ++b) {
        cv::Mat cropped = GetRotateCropImage(img, boxes[b]);
        if (!cropped.empty()) {
            croppedImages.push_back(cropped);
            cropBoxIdx.push_back(b);
        }
    }

    // Step 3: Classification (optional orientation pass)
    std::vector<int> cls_labels(croppedImages.size(), 0);
    std::vector<float> cls_scores(croppedImages.size(), 0.0f);
    if (classifier_) {
        classifier_->Classify(croppedImages, cls_labels, cls_scores, _clsThresh);
        // Rotate images classified as upside-down (label=1 and score > threshold)
        for (size_t i = 0; i < croppedImages.size(); i++) {
            if (cls_labels[i] % 2 == 1 && cls_scores[i] > _clsThresh) {
                cv::rotate(croppedImages[i], croppedImages[i], cv::ROTATE_180);
            }
        }
    }

    // Step 4: Text Recognition (bucketed + batched inside the recognizer)
    auto textLines = recognizer_->RecognizeBatch(croppedImages);

    // Step 5: Combine results — each text line maps back to its ORIGINAL
    // detection box via cropBoxIdx, not by position in `boxes`.
    const size_t count = std::min(textLines.size(), cropBoxIdx.size());
    results.reserve(count);
    for (size_t i = 0; i < count; i++) {
        const auto& srcBox = boxes[cropBoxIdx[i]];
        OCRPredictResult result;
        // Convert TextBox points to box format [[x0,y0], [x1,y1], [x2,y2], [x3,y3]]
        result.box.resize(4);
        for (int j = 0; j < 4; j++) {
            result.box[j] = {
                static_cast<int>(srcBox.points[j].x),
                static_cast<int>(srcBox.points[j].y)
            };
        }
        result.text = textLines[i].text;
        result.score = textLines[i].score;
        result.cls_label = cls_labels[i];
        result.cls_score = cls_scores[i];
        results.push_back(std::move(result));
    }
    return results;
}
// Recognize a single pre-cropped text image, bypassing detection and
// classification. Returns an empty TextLine when the model is (re)loading,
// the lock can't be acquired, the engine is uninitialized, or the input
// is empty.
TextLine PaddleOCRV5Engine::recognizeOnly(const cv::Mat& croppedImage) {
    const TextLine blank{ "", 0.0f };
    if (_modelLoading.load()) return blank;
    {
        auto guard = TryLockWithTimeout("PaddleOCRV5Engine::recognizeOnly");
        const bool ready = guard.owns_lock() && _initialized &&
                           recognizer_ && !croppedImage.empty();
        if (!ready) return blank;
    }
    // Lock released — inference runs without holding _mutex.
    return recognizer_->Recognize(croppedImage);
}
// Recognize a batch of pre-cropped text images, bypassing detection and
// classification. On any bail-out condition the caller still receives one
// default-constructed TextLine per input, so positional alignment with
// croppedImages is always preserved.
std::vector<TextLine> PaddleOCRV5Engine::recognizeMany(const std::vector<cv::Mat>& croppedImages) {
    std::vector<TextLine> blanks(croppedImages.size());
    if (_modelLoading.load()) return blanks;
    {
        auto guard = TryLockWithTimeout("PaddleOCRV5Engine::recognizeMany");
        if (!guard.owns_lock() || !_initialized || !recognizer_ || croppedImages.empty()) {
            return blanks;
        }
    }
    // Lock released — delegates to the bucketed, batched path in
    // ONNXOCRRecognizer.
    return recognizer_->RecognizeBatch(croppedImages);
}
} // namespace onnxocr
} // namespace ANSCENTER