Files
ANSCORE/modules/ANSOCR/ANSONNXOCR/PaddleOCRV5Engine.cpp

233 lines
9.6 KiB
C++
Raw Normal View History

2026-03-28 16:54:11 +11:00
#include "PaddleOCRV5Engine.h"
#include "EPLoader.h"
#include <opencv2/imgproc.hpp>
#include <iostream>
#include <algorithm>
namespace ANSCENTER {
namespace onnxocr {
bool PaddleOCRV5Engine::Initialize(const std::string& detModelPath,
const std::string& clsModelPath,
const std::string& recModelPath,
2026-04-14 20:30:21 +10:00
const std::string& dictPath,
bool preferTensorRT) {
2026-03-28 16:54:11 +11:00
std::lock_guard<std::recursive_mutex> lock(_mutex);
2026-04-13 20:38:40 +10:00
ModelLoadingGuard mlg(_modelLoading);
2026-03-28 16:54:11 +11:00
2026-04-14 20:30:21 +10:00
// High-perf options. The OCR sub-models split into two groups:
//
// 1. Detector — its input shape varies continuously with every
// plate-ROI aspect ratio. TRT EP is a poor fit because it
// builds a fresh engine for each unique shape (minutes each).
// We keep it on CUDA EP with the largest cuDNN workspace and
// let cuDNN HEURISTIC handle the per-shape algo selection.
//
// 2. Classifier + Recognizer — fixed-bucket shapes (cls is
// [1,3,80,160], rec is [1,3,48,{320,480,640,960}]). These
// benefit massively from TRT EP because the engine is built
// once per shape and reused forever.
OrtHandlerOptions detectorOpts;
// Detector uses CUDA EP with *conservative* cuDNN workspace.
// Empirical: on VRAM-constrained GPUs (LPD TRT engine + rec TRT
// engine + ORT arena in play) the max-workspace mode causes cuDNN
// to pick Winograd/implicit-precomp-GEMM variants that silently
// fall back to slow NO-WORKSPACE algorithms when the big workspace
// can't be allocated. With "0" cuDNN picks algorithms that are
// known to fit and runs ~10x faster in practice.
detectorOpts.useMaxCudnnWorkspace = false;
detectorOpts.preferTensorRT = false; // never TRT for the detector
// Classifier (fixed [1,3,80,160]): TRT with no profile is fine.
OrtHandlerOptions classifierOpts;
classifierOpts.useMaxCudnnWorkspace = true;
classifierOpts.preferTensorRT = preferTensorRT;
classifierOpts.trtFP16 = true;
// Recognizer: needs a DYNAMIC profile so one TRT engine covers every
// (batch, bucket_width) pair we generate at runtime. Without this,
// each new shape triggers a ~80s engine rebuild mid-stream when a
// new plate appears or the plate count changes.
//
// Profile range:
// batch : 1 .. 16 (16 plates worth of crops is generous)
// H : 48 (fixed)
// W : 320 .. 960 (covers all 4 recognizer buckets)
//
// Query the actual input name from the .onnx file instead of
// hardcoding — PaddleOCR usually exports it as "x" but the name can
// vary across model versions.
OrtHandlerOptions recognizerOpts;
recognizerOpts.useMaxCudnnWorkspace = true;
recognizerOpts.preferTensorRT = preferTensorRT;
recognizerOpts.trtFP16 = true;
if (preferTensorRT) {
std::string recInputName = BasicOrtHandler::QueryModelInputName(recModelPath);
if (recInputName.empty()) {
std::cerr << "[PaddleOCRV5Engine] Could not query recognizer "
"input name — defaulting to 'x'" << std::endl;
recInputName = "x";
}
std::cout << "[PaddleOCRV5Engine] Recognizer input name: '"
<< recInputName << "' — building TRT dynamic profile "
<< "[batch=1..16, W=320..960]" << std::endl;
recognizerOpts.trtProfileMinShapes = recInputName + ":1x3x48x320";
recognizerOpts.trtProfileOptShapes = recInputName + ":4x3x48x480";
recognizerOpts.trtProfileMaxShapes = recInputName + ":16x3x48x960";
}
2026-03-28 16:54:11 +11:00
try {
// Initialize detector (also triggers EPLoader init in BasicOrtHandler)
2026-04-14 20:30:21 +10:00
detector_ = std::make_unique<ONNXOCRDetector>(detModelPath, detectorOpts);
2026-03-28 16:54:11 +11:00
std::cout << "[PaddleOCRV5Engine] Detector initialized: " << detModelPath << std::endl;
// Ensure this DLL's copy of Ort::Global<void>::api_ is initialized.
// BasicOrtHandler sets it in ONNXEngine.dll, but each DLL has its own
// inline-static copy. Without this, inference calls from ANSOCR.dll crash.
if (Ort::Global<void>::api_ == nullptr) {
Ort::InitApi(static_cast<const OrtApi*>(EPLoader::GetOrtApiRaw()));
}
// Initialize classifier (optional)
if (!clsModelPath.empty()) {
2026-04-14 20:30:21 +10:00
classifier_ = std::make_unique<ONNXOCRClassifier>(clsModelPath, classifierOpts);
2026-03-28 16:54:11 +11:00
std::cout << "[PaddleOCRV5Engine] Classifier initialized: " << clsModelPath << std::endl;
}
else {
classifier_.reset();
std::cout << "[PaddleOCRV5Engine] Classifier skipped (no model path)" << std::endl;
}
// Initialize recognizer
2026-04-14 20:30:21 +10:00
recognizer_ = std::make_unique<ONNXOCRRecognizer>(recModelPath, recognizerOpts);
2026-03-28 16:54:11 +11:00
if (!recognizer_->LoadDictionary(dictPath)) {
std::cerr << "[PaddleOCRV5Engine] Failed to load dictionary" << std::endl;
return false;
}
std::cout << "[PaddleOCRV5Engine] Recognizer initialized: " << recModelPath << std::endl;
2026-04-14 20:30:21 +10:00
// Pre-warm classifier (fixed [1,3,80,160]) and recognizer (4
// bucket widths) so the first frame doesn't pay the cuDNN/TRT
// algorithm-selection tax. The detector is intentionally NOT
// warmed up: its input shape varies continuously with each
// plate-ROI aspect ratio, so a warmup at any single canonical
// shape would cost minutes (TRT) or be useless (CUDA cache miss
// on the real frame anyway). Real frames will pay the per-shape
// cuDNN HEURISTIC cost on first use.
std::cout << "[PaddleOCRV5Engine] Warming up OCR pipeline..." << std::endl;
if (classifier_) classifier_->Warmup();
if (recognizer_) recognizer_->Warmup();
std::cout << "[PaddleOCRV5Engine] Warmup complete" << std::endl;
2026-03-28 16:54:11 +11:00
_initialized = true;
std::cout << "[PaddleOCRV5Engine] Pipeline initialized successfully" << std::endl;
return true;
}
catch (const std::exception& e) {
std::cerr << "[PaddleOCRV5Engine] Initialization failed: " << e.what() << std::endl;
detector_.reset();
classifier_.reset();
recognizer_.reset();
_initialized = false;
return false;
}
}
// Runs the full pipeline on one image: detect text boxes, crop each box,
// optionally classify orientation (rotating crops flagged as upside-down),
// recognise the text, and return one OCRPredictResult per recognised crop.
//
// Returns an empty vector when a model load is in progress, when the lock
// could not be obtained, when the engine is not initialised, when `img` is
// empty, when the detector finds no boxes, or when every crop comes back empty.
//
// Each result carries the box the crop was actually taken from: crops that
// come back empty are skipped, so the crop index and the box index can differ
// and are tracked explicitly.
std::vector<OCRPredictResult> PaddleOCRV5Engine::ocr(const cv::Mat& img) {
    if (_modelLoading.load()) return {};

    std::vector<OCRPredictResult> results;
    {
        auto lk = TryLockWithTimeout("PaddleOCRV5Engine::ocr");
        if (!lk.owns_lock()) return results;
        if (!_initialized || img.empty()) return results;
    }
    // _mutex released — heavy pipeline runs lock-free

    // Step 1: Text Detection
    auto boxes = detector_->Detect(img, _maxSideLen, _detDbThresh, _detBoxThresh, _detUnclipRatio, _useDilation);
    if (boxes.empty()) {
        return results;
    }

    // Step 2: Crop detected text regions.
    // cropToBox[k] is the index in `boxes` that croppedImages[k] came from.
    std::vector<cv::Mat> croppedImages;
    std::vector<size_t> cropToBox;
    croppedImages.reserve(boxes.size());
    cropToBox.reserve(boxes.size());
    for (size_t b = 0; b < boxes.size(); ++b) {
        cv::Mat cropped = GetRotateCropImage(img, boxes[b]);
        if (!cropped.empty()) {
            croppedImages.push_back(cropped);
            cropToBox.push_back(b);
        }
    }
    if (croppedImages.empty()) {
        // Nothing usable to classify or recognise; skip the inference calls.
        return results;
    }

    // Step 3: Classification (optional)
    std::vector<int> cls_labels(croppedImages.size(), 0);
    std::vector<float> cls_scores(croppedImages.size(), 0.0f);
    if (classifier_) {
        classifier_->Classify(croppedImages, cls_labels, cls_scores, _clsThresh);
        // Rotate images classified as upside-down (odd label and score > threshold)
        for (size_t i = 0; i < croppedImages.size(); i++) {
            if (cls_labels[i] % 2 == 1 && cls_scores[i] > _clsThresh) {
                cv::rotate(croppedImages[i], croppedImages[i], cv::ROTATE_180);
            }
        }
    }

    // Step 4: Text Recognition
    auto textLines = recognizer_->RecognizeBatch(croppedImages);

    // Step 5: Combine results. Index i walks the crops; the geometry is read
    // from the box that produced crop i, not from boxes[i].
    results.reserve(croppedImages.size());
    for (size_t i = 0; i < croppedImages.size() && i < textLines.size(); i++) {
        const auto& srcBox = boxes[cropToBox[i]];

        OCRPredictResult result;
        // Convert TextBox points to box format [[x0,y0], [x1,y1], [x2,y2], [x3,y3]]
        result.box.resize(4);
        for (int j = 0; j < 4; j++) {
            result.box[j] = {
                static_cast<int>(srcBox.points[j].x),
                static_cast<int>(srcBox.points[j].y)
            };
        }
        result.text = textLines[i].text;
        result.score = textLines[i].score;
        result.cls_label = cls_labels[i];
        result.cls_score = cls_scores[i];
        results.push_back(result);
    }
    return results;
}
// Recognises the text in a single, already-cropped image using only the
// recognizer stage (no detection, no classification).
//
// Yields an empty TextLine ("" with score 0) when a model load is in
// progress, the lock could not be obtained, the engine is not initialised,
// no recognizer exists, or the image is empty.
TextLine PaddleOCRV5Engine::recognizeOnly(const cv::Mat& croppedImage) {
    if (_modelLoading.load()) {
        return { "", 0.0f };
    }

    {
        // The lock object lives only for this readiness check; the
        // recognition call below runs after it has gone out of scope.
        auto guard = TryLockWithTimeout("PaddleOCRV5Engine::recognizeOnly");
        const bool ready = guard.owns_lock()
                        && _initialized
                        && recognizer_
                        && !croppedImage.empty();
        if (!ready) {
            return { "", 0.0f };
        }
    }

    return recognizer_->Recognize(croppedImage);
}
2026-04-14 20:30:21 +10:00
// Recognises a batch of already-cropped images using only the recognizer
// stage.
//
// On any early exit (model load in progress, lock not obtained, engine not
// initialised, no recognizer, or an empty input) the result is a vector of
// default-constructed TextLine objects with one entry per input image.
std::vector<TextLine> PaddleOCRV5Engine::recognizeMany(const std::vector<cv::Mat>& croppedImages) {
    // Built lazily so the placeholder vector is only allocated on an early exit.
    const auto placeholders = [&croppedImages]() {
        return std::vector<TextLine>(croppedImages.size());
    };

    if (_modelLoading.load()) {
        return placeholders();
    }

    {
        // The lock object lives only for this readiness check.
        auto guard = TryLockWithTimeout("PaddleOCRV5Engine::recognizeMany");
        const bool ready = guard.owns_lock()
                        && _initialized
                        && recognizer_
                        && !croppedImages.empty();
        if (!ready) {
            return placeholders();
        }
    }

    // Delegates to the bucketed, batched path in ONNXOCRRecognizer.
    return recognizer_->RecognizeBatch(croppedImages);
}
2026-03-28 16:54:11 +11:00
} // namespace onnxocr
} // namespace ANSCENTER