Fix 1 — Chunk oversized bucket groups (the correctness fix) ONNXOCRRecognizer::RecognizeBatch now slices each bucket group into chunks of ≤ kRecMaxBatch before submitting to TRT. A frame with 30 crops in bucket 320 produces two back-to-back batched calls (24 + 6), both within the profile, both on the fast path. Fix 2 — Raise the profile max from 16 to 24 (the performance fix) The old profile max was 16; your real scenes routinely hit 24. Raising the profile max to 24 means the common 12-plate scene (24 crops) fits in a single batched call with no chunking needed. Scenes with > 24 crops now use chunking, but that's rare.
310 lines
13 KiB
C++
310 lines
13 KiB
C++
#include "PaddleOCRV5Engine.h"
|
||
#include "EPLoader.h"
|
||
|
||
#include <opencv2/imgproc.hpp>
|
||
#include <iostream>
|
||
#include <algorithm>
|
||
|
||
namespace ANSCENTER {
|
||
namespace onnxocr {
|
||
|
||
// ============================================================================
|
||
// Per-backend OCR option builders
|
||
//
|
||
// Each backend (NVIDIA / AMD / Intel / CPU) has its own helper that returns
|
||
// a fully-populated set of OrtHandlerOptions for the detector, classifier,
|
||
// and recognizer sub-models. PaddleOCRV5Engine::Initialize dispatches to the
|
||
// correct helper based on the engine type that EPLoader resolved at startup.
|
||
//
|
||
// Adding a new backend optimization is a strictly contained change: touch
|
||
// only that backend's builder. The others — especially NVIDIA, which is
|
||
// hand-tuned and should not regress — stay untouched.
|
||
// ============================================================================
|
||
|
||
namespace {
|
||
|
||
struct PerModelOcrOptions {
|
||
OrtHandlerOptions detectorOpts;
|
||
OrtHandlerOptions classifierOpts;
|
||
OrtHandlerOptions recognizerOpts;
|
||
};
|
||
|
||
// ----------------------------------------------------------------------------
|
||
// NVIDIA — LOCKED. Do NOT modify this helper unless fixing a specific
|
||
// NVIDIA-observable regression.
|
||
//
|
||
// The OCR sub-models split into two groups:
|
||
// 1. Detector — variable input shape per plate-ROI aspect. TRT EP is a
|
||
// poor fit (one engine build per unique shape, minutes each). Runs on
|
||
// CUDA EP with *conservative* cuDNN workspace: empirical measurements
|
||
// showed that max-workspace mode forces cuDNN to pick Winograd/
|
||
// implicit-precomp-GEMM variants that silently fall back to slow
|
||
// NO-WORKSPACE algorithms when the big workspace can't be allocated
|
||
// under VRAM pressure (LPD TRT engine + rec TRT engine + ORT arena).
|
||
// 2. Classifier + Recognizer — TRT EP. Classifier has fixed shape so no
|
||
// profile is needed. Recognizer gets a dynamic profile
|
||
// [batch=1..kRecMaxBatch, W=320..960] so a single pre-built engine handles every
|
||
// runtime shape without mid-stream rebuilds (fixes 60–90 s hangs).
|
||
// ----------------------------------------------------------------------------
|
||
// Builds the NVIDIA option set for the three OCR sub-models.
//
// recModelPath   - recognizer model file; only read (to query the model's
//                  input name) when preferTensorRT is true.
// preferTensorRT - forwarded to the classifier and recognizer options; the
//                  detector never uses TensorRT.
//
// Returns the populated PerModelOcrOptions. The recognizer's TRT profile
// strings are only set when preferTensorRT is true.
static PerModelOcrOptions BuildNvidiaOcrOptions(
        const std::string& recModelPath,
        bool preferTensorRT) {
    PerModelOcrOptions bundle;

    // Detector: CUDA EP with the conservative cuDNN workspace, never TRT.
    OrtHandlerOptions& det = bundle.detectorOpts;
    det.preferTensorRT       = false;
    det.useMaxCudnnWorkspace = false;

    // Classifier: TRT EP (when requested), FP16, fixed [1,3,80,160] input so
    // no shape profile is required.
    OrtHandlerOptions& cls = bundle.classifierOpts;
    cls.trtFP16              = true;
    cls.preferTensorRT       = preferTensorRT;
    cls.useMaxCudnnWorkspace = true;

    // Recognizer: TRT EP (when requested), FP16. The dynamic profile below
    // caps the batch dimension at kRecMaxBatch, the same constant the
    // recognizer uses to chunk oversized bucket groups, so no submitted
    // shape falls outside the profile.
    OrtHandlerOptions& rec = bundle.recognizerOpts;
    rec.trtFP16              = true;
    rec.preferTensorRT       = preferTensorRT;
    rec.useMaxCudnnWorkspace = true;

    // Without TensorRT there is no profile to describe.
    if (!preferTensorRT) {
        return bundle;
    }

    // The profile strings are keyed by the model's input tensor name.
    std::string inputName = BasicOrtHandler::QueryModelInputName(recModelPath);
    if (inputName.empty()) {
        std::cerr << "[PaddleOCRV5Engine] Could not query recognizer "
                     "input name — defaulting to 'x'" << std::endl;
        inputName = "x";
    }

    const std::string maxB = std::to_string(kRecMaxBatch);
    std::cout << "[PaddleOCRV5Engine] Recognizer input name: '"
              << inputName << "' — building TRT dynamic profile "
              << "[batch=1.." << maxB << ", W=320..960]" << std::endl;

    // min / opt / max shapes, each formatted as "<input>:<N>x3x48x<W>".
    const std::string shapePrefix = inputName + ":";
    rec.trtProfileMinShapes = shapePrefix + "1x3x48x320";
    rec.trtProfileOptShapes = shapePrefix + "4x3x48x480";
    rec.trtProfileMaxShapes = shapePrefix + maxB + "x3x48x960";

    return bundle;
}
|
||
|
||
// ----------------------------------------------------------------------------
|
||
// Intel (OpenVINO EP) — placeholder.
|
||
//
|
||
// Returns default-constructed options: no backend-specific tuning applied
|
||
// yet. When adding Intel optimizations (OpenVINO cache_dir, explicit device
|
||
// selection, INT8 paths, etc.), add the corresponding fields to the Intel
|
||
// section of OrtHandlerOptions and populate them here.
|
||
// ----------------------------------------------------------------------------
|
||
// Intel builder: no backend-specific tuning yet, so all three sub-model
// option sets are returned value-initialized.
static PerModelOcrOptions BuildIntelOcrOptions() {
    PerModelOcrOptions untuned{};
    return untuned;
}
|
||
|
||
// ----------------------------------------------------------------------------
|
||
// AMD (DirectML EP / MIGraphX EP) — placeholder.
|
||
//
|
||
// Returns default-constructed options: no backend-specific tuning applied
|
||
// yet. When adding AMD optimizations (graph opt gate for RDNA3+ desktop
|
||
// cards, MIGraphX cache on Linux, etc.), add the corresponding fields to
|
||
// the AMD section of OrtHandlerOptions and populate them here.
|
||
// ----------------------------------------------------------------------------
|
||
// AMD builder: no backend-specific tuning yet, so all three sub-model
// option sets are returned value-initialized.
static PerModelOcrOptions BuildAmdOcrOptions() {
    return {};
}
|
||
|
||
// ----------------------------------------------------------------------------
|
||
// CPU / unknown hardware — no tuning.
|
||
// ----------------------------------------------------------------------------
|
||
// CPU / unrecognized-backend builder: every sub-model gets
// value-initialized options.
static PerModelOcrOptions BuildDefaultOcrOptions() {
    const PerModelOcrOptions defaults{};
    return defaults;
}
|
||
|
||
// Dispatch entry point used by Initialize().
|
||
// Selects the per-backend option builder that matches the engine type
// currently reported by EPLoader.
//
// recModelPath / preferTensorRT are only consumed by the NVIDIA builder;
// the other builders take no arguments.
static PerModelOcrOptions BuildOcrOptionsForBackend(
        const std::string& recModelPath,
        bool preferTensorRT) {
    const EngineType activeEngine = EPLoader::Current().type;

    if (activeEngine == EngineType::NVIDIA_GPU) {
        return BuildNvidiaOcrOptions(recModelPath, preferTensorRT);
    }
    if (activeEngine == EngineType::AMD_GPU) {
        return BuildAmdOcrOptions();
    }
    if (activeEngine == EngineType::OPENVINO_GPU) {
        return BuildIntelOcrOptions();
    }

    // EngineType::CPU and any engine type not listed above.
    return BuildDefaultOcrOptions();
}
|
||
|
||
} // namespace (anonymous)
|
||
|
||
bool PaddleOCRV5Engine::Initialize(const std::string& detModelPath,
|
||
const std::string& clsModelPath,
|
||
const std::string& recModelPath,
|
||
const std::string& dictPath,
|
||
bool preferTensorRT) {
|
||
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
||
ModelLoadingGuard mlg(_modelLoading);
|
||
|
||
// Dispatch to the correct per-backend option builder. The NVIDIA path
|
||
// is fully locked-in; AMD/Intel/CPU paths currently return defaults
|
||
// and are the place to add future backend-specific tuning.
|
||
const PerModelOcrOptions opts =
|
||
BuildOcrOptionsForBackend(recModelPath, preferTensorRT);
|
||
const OrtHandlerOptions& detectorOpts = opts.detectorOpts;
|
||
const OrtHandlerOptions& classifierOpts = opts.classifierOpts;
|
||
const OrtHandlerOptions& recognizerOpts = opts.recognizerOpts;
|
||
|
||
try {
|
||
// Initialize detector (also triggers EPLoader init in BasicOrtHandler)
|
||
detector_ = std::make_unique<ONNXOCRDetector>(detModelPath, detectorOpts);
|
||
std::cout << "[PaddleOCRV5Engine] Detector initialized: " << detModelPath << std::endl;
|
||
|
||
// Ensure this DLL's copy of Ort::Global<void>::api_ is initialized.
|
||
// BasicOrtHandler sets it in ONNXEngine.dll, but each DLL has its own
|
||
// inline-static copy. Without this, inference calls from ANSOCR.dll crash.
|
||
if (Ort::Global<void>::api_ == nullptr) {
|
||
Ort::InitApi(static_cast<const OrtApi*>(EPLoader::GetOrtApiRaw()));
|
||
}
|
||
|
||
// Initialize classifier (optional)
|
||
if (!clsModelPath.empty()) {
|
||
classifier_ = std::make_unique<ONNXOCRClassifier>(clsModelPath, classifierOpts);
|
||
std::cout << "[PaddleOCRV5Engine] Classifier initialized: " << clsModelPath << std::endl;
|
||
}
|
||
else {
|
||
classifier_.reset();
|
||
std::cout << "[PaddleOCRV5Engine] Classifier skipped (no model path)" << std::endl;
|
||
}
|
||
|
||
// Initialize recognizer
|
||
recognizer_ = std::make_unique<ONNXOCRRecognizer>(recModelPath, recognizerOpts);
|
||
if (!recognizer_->LoadDictionary(dictPath)) {
|
||
std::cerr << "[PaddleOCRV5Engine] Failed to load dictionary" << std::endl;
|
||
return false;
|
||
}
|
||
std::cout << "[PaddleOCRV5Engine] Recognizer initialized: " << recModelPath << std::endl;
|
||
|
||
// Pre-warm classifier (fixed [1,3,80,160]) and recognizer (4
|
||
// bucket widths) so the first frame doesn't pay the cuDNN/TRT
|
||
// algorithm-selection tax. The detector is intentionally NOT
|
||
// warmed up: its input shape varies continuously with each
|
||
// plate-ROI aspect ratio, so a warmup at any single canonical
|
||
// shape would cost minutes (TRT) or be useless (CUDA cache miss
|
||
// on the real frame anyway). Real frames will pay the per-shape
|
||
// cuDNN HEURISTIC cost on first use.
|
||
std::cout << "[PaddleOCRV5Engine] Warming up OCR pipeline..." << std::endl;
|
||
if (classifier_) classifier_->Warmup();
|
||
if (recognizer_) recognizer_->Warmup();
|
||
std::cout << "[PaddleOCRV5Engine] Warmup complete" << std::endl;
|
||
|
||
_initialized = true;
|
||
std::cout << "[PaddleOCRV5Engine] Pipeline initialized successfully" << std::endl;
|
||
return true;
|
||
}
|
||
catch (const std::exception& e) {
|
||
std::cerr << "[PaddleOCRV5Engine] Initialization failed: " << e.what() << std::endl;
|
||
detector_.reset();
|
||
classifier_.reset();
|
||
recognizer_.reset();
|
||
_initialized = false;
|
||
return false;
|
||
}
|
||
}
|
||
|
||
/// Runs the full OCR pipeline on one image: text detection, cropping,
/// optional orientation classification, and text recognition.
///
/// @param img  Image handed to the detector and to the crop helper.
/// @return One OCRPredictResult per detected box that yielded a non-empty
///         crop and a recognition result, each carrying the box that
///         produced its text. Empty when models are loading, the lock from
///         TryLockWithTimeout is not owned, the engine is not initialized,
///         a required sub-model is missing, img is empty, or nothing was
///         detected.
std::vector<OCRPredictResult> PaddleOCRV5Engine::ocr(const cv::Mat& img) {
    if (_modelLoading.load()) return {};

    std::vector<OCRPredictResult> results;

    {
        auto lk = TryLockWithTimeout("PaddleOCRV5Engine::ocr");
        if (!lk.owns_lock()) return results;
        // detector_/recognizer_ are dereferenced below; check them here the
        // same way recognizeOnly()/recognizeMany() check recognizer_.
        if (!_initialized || !detector_ || !recognizer_ || img.empty()) return results;
    }
    // _mutex released — heavy pipeline runs lock-free

    // Step 1: Text Detection
    auto boxes = detector_->Detect(img, _maxSideLen, _detDbThresh, _detBoxThresh, _detUnclipRatio, _useDilation);
    if (boxes.empty()) {
        return results;
    }

    // Step 2: Crop detected text regions.
    // Empty crops are dropped, so crop index k no longer equals box index k
    // once one is skipped. cropBoxIndex[k] records which box crop k came
    // from so step 5 pairs each text line with the right box.
    std::vector<cv::Mat> croppedImages;
    std::vector<size_t> cropBoxIndex;
    croppedImages.reserve(boxes.size());
    cropBoxIndex.reserve(boxes.size());
    for (size_t b = 0; b < boxes.size(); ++b) {
        cv::Mat cropped = GetRotateCropImage(img, boxes[b]);
        if (cropped.empty()) {
            continue;
        }
        croppedImages.push_back(cropped);
        cropBoxIndex.push_back(b);
    }

    // No usable crop: nothing to classify or recognize.
    if (croppedImages.empty()) {
        return results;
    }

    // Step 3: Classification (optional)
    std::vector<int> cls_labels(croppedImages.size(), 0);
    std::vector<float> cls_scores(croppedImages.size(), 0.0f);

    if (classifier_) {
        classifier_->Classify(croppedImages, cls_labels, cls_scores, _clsThresh);

        // Rotate crops classified as upside-down (odd label and score
        // above the threshold) before recognition.
        for (size_t i = 0; i < croppedImages.size(); i++) {
            if (cls_labels[i] % 2 == 1 && cls_scores[i] > _clsThresh) {
                cv::rotate(croppedImages[i], croppedImages[i], cv::ROTATE_180);
            }
        }
    }

    // Step 4: Text Recognition
    auto textLines = recognizer_->RecognizeBatch(croppedImages);

    // Step 5: Combine results. Iterate over crops (not boxes) and map each
    // one back to its source box.
    const size_t resultCount = std::min(cropBoxIndex.size(), textLines.size());
    results.reserve(resultCount);
    for (size_t i = 0; i < resultCount; i++) {
        const auto& srcBox = boxes[cropBoxIndex[i]];

        OCRPredictResult result;

        // Convert TextBox points to box format [[x0,y0], [x1,y1], [x2,y2], [x3,y3]]
        result.box.resize(4);
        for (int j = 0; j < 4; j++) {
            result.box[j] = {
                static_cast<int>(srcBox.points[j].x),
                static_cast<int>(srcBox.points[j].y)
            };
        }

        result.text = textLines[i].text;
        result.score = textLines[i].score;
        result.cls_label = cls_labels[i];
        result.cls_score = cls_scores[i];

        results.push_back(result);
    }

    return results;
}
|
||
|
||
/// Recognizes the text in a single, already-cropped image.
///
/// @param croppedImage  Crop forwarded to the recognizer.
/// @return The recognizer's result, or an empty TextLine ("" / 0.0f) when
///         models are loading, the lock from TryLockWithTimeout is not
///         owned, the engine is not initialized, there is no recognizer,
///         or the crop is empty.
TextLine PaddleOCRV5Engine::recognizeOnly(const cv::Mat& croppedImage) {
    const auto blankLine = []() -> TextLine { return { "", 0.0f }; };

    bool ready = false;
    if (!_modelLoading.load()) {
        // The lock only covers the state checks; it is released at the end
        // of this scope, before inference runs.
        auto guard = TryLockWithTimeout("PaddleOCRV5Engine::recognizeOnly");
        ready = guard.owns_lock() && _initialized && recognizer_ && !croppedImage.empty();
    }

    if (!ready) {
        return blankLine();
    }
    return recognizer_->Recognize(croppedImage);
}
|
||
|
||
/// Recognizes the text in a batch of already-cropped images.
///
/// @param croppedImages  Crops forwarded to the recognizer's batched path.
/// @return The recognizer's RecognizeBatch result, or a vector of
///         default-constructed TextLine entries (one per input) when models
///         are loading, the lock from TryLockWithTimeout is not owned, the
///         engine is not initialized, there is no recognizer, or the input
///         is empty.
std::vector<TextLine> PaddleOCRV5Engine::recognizeMany(const std::vector<cv::Mat>& croppedImages) {
    bool ready = false;
    if (!_modelLoading.load()) {
        // The lock only covers the state checks; it is released at the end
        // of this scope, before inference runs.
        auto guard = TryLockWithTimeout("PaddleOCRV5Engine::recognizeMany");
        ready = guard.owns_lock() && _initialized && recognizer_ && !croppedImages.empty();
    }

    if (!ready) {
        return std::vector<TextLine>(croppedImages.size());
    }

    // Delegates to the bucketed, batched path in ONNXOCRRecognizer.
    return recognizer_->RecognizeBatch(croppedImages);
}
|
||
|
||
} // namespace onnxocr
|
||
} // namespace ANSCENTER
|