Improve ALPR_OCR performance

This commit is contained in:
2026-04-14 20:30:21 +10:00
parent 3349b45ade
commit f9a0af8949
18 changed files with 991 additions and 77 deletions

View File

@@ -7,6 +7,7 @@
#include <cmath>
#include <cfloat>
#include <cstring>
#include <chrono>
namespace ANSCENTER {
namespace onnxocr {
@@ -15,6 +16,12 @@ ONNXOCRRecognizer::ONNXOCRRecognizer(const std::string& onnx_path, unsigned int
: BasicOrtHandler(onnx_path, num_threads) {
}
// Constructs a recognizer from an ONNX model path with explicit ORT handler
// options; all session/provider setup is delegated to BasicOrtHandler.
ONNXOCRRecognizer::ONNXOCRRecognizer(const std::string& onnx_path,
const OrtHandlerOptions& options,
unsigned int num_threads)
: BasicOrtHandler(onnx_path, options, num_threads) {
}
bool ONNXOCRRecognizer::LoadDictionary(const std::string& dictPath) {
keys_ = LoadDict(dictPath);
if (keys_.size() < 2) {
@@ -46,6 +53,54 @@ Ort::Value ONNXOCRRecognizer::transformBatch(const std::vector<cv::Mat>& images)
return Ort::Value(nullptr);
}
// ----------------------------------------------------------------------------
// Width buckets — every recognizer input is padded up to one of these widths
// before reaching ORT. This bounds the number of distinct shapes cuDNN ever
// sees to four, so its HEURISTIC algorithm cache hits on every subsequent
// call instead of re-tuning per plate. Buckets cover the realistic range:
// 320 px → short Latin/Japanese plates (most common)
// 480 px → wider Latin plates with two rows of text
// 640 px → long single-row plates / multi-line stacked text
// 960 px → safety upper bound (== kRecImgMaxW)
// ----------------------------------------------------------------------------
static constexpr int kRecBucketWidths[] = { 320, 480, 640, 960 };  // must stay sorted ascending (RoundUpToBucket relies on it)
static constexpr int kRecNumBuckets = sizeof(kRecBucketWidths) / sizeof(kRecBucketWidths[0]);  // element count of kRecBucketWidths
// Returns the smallest bucket width that can hold `resizedW` (after capping
// it at imgMaxW_); falls back to imgMaxW_ when no bucket is wide enough.
int ONNXOCRRecognizer::RoundUpToBucket(int resizedW) const {
    const int capped = std::min(resizedW, imgMaxW_);
    const int* const first = kRecBucketWidths;
    const int* const last = kRecBucketWidths + kRecNumBuckets;
    // kRecBucketWidths is sorted ascending, so the first element >= capped
    // is exactly what a front-to-back linear scan would select.
    const int* const hit = std::lower_bound(first, last, capped);
    return (hit != last) ? *hit : imgMaxW_;
}
// Resize + normalize a single crop into a CHW float vector at width
// `bucketW`, padding with zeros on the right when needed. The returned
// vector has exactly 3*imgH*bucketW elements.
//
// ResizeRecImage is called with `bucketW` as its max-width argument, so the
// resized width is expected to be <= bucketW (the fast path below returns
// the normalized buffer unpadded when they are equal). The per-row copy
// width is clamped anyway: if the resize ever produced a wider image, the
// unclamped memcpy of resizedW floats into bucketW-stride rows would be a
// heap buffer overflow.
static std::vector<float> PreprocessCropToBucket(const cv::Mat& crop,
                                                 int imgH, int bucketW) {
    cv::Mat resized = ResizeRecImage(crop, imgH, bucketW);
    const int resizedW = resized.cols;
    resized.convertTo(resized, CV_32FC3);
    // (pixel/255 - 0.5) / 0.5 normalization, CHW layout.
    auto normalizedData = NormalizeAndPermuteCls(resized);
    if (resizedW == bucketW) {
        return normalizedData;
    }
    // Zero-pad on the right (CHW layout). Clamp so the copy can never
    // overrun either buffer regardless of resizedW.
    const int copyW = (resizedW < bucketW) ? resizedW : bucketW;
    std::vector<float> padded(static_cast<size_t>(3) * imgH * bucketW, 0.0f);
    for (int c = 0; c < 3; c++) {
        for (int y = 0; y < imgH; y++) {
            std::memcpy(
                &padded[(static_cast<size_t>(c) * imgH + y) * bucketW],
                &normalizedData[(static_cast<size_t>(c) * imgH + y) * resizedW],
                copyW * sizeof(float));
        }
    }
    return padded;
}
TextLine ONNXOCRRecognizer::Recognize(const cv::Mat& croppedImage) {
std::lock_guard<std::mutex> lock(_mutex);
// NOTE(review): the span below is a unified-diff fragment whose +/- markers
// were stripped, so pre-change and post-change lines are interleaved. The
// hunk header on the next line marks elided context — the early-return
// guard body belonging to the `}` after it is not visible here.
@@ -54,52 +109,27 @@ TextLine ONNXOCRRecognizer::Recognize(const cv::Mat& croppedImage) {
}
try {
// Preprocess: resize to fixed height, proportional width
// Step 1: aspect-preserving resize to height=imgH_, width capped
// at imgMaxW_. Then round resized width up to the next bucket.
cv::Mat resized = ResizeRecImage(croppedImage, imgH_, imgMaxW_);
int resizedW = resized.cols;
const int bucketW = RoundUpToBucket(resized.cols);
resized.convertTo(resized, CV_32FC3);
// Recognition uses (pixel/255 - 0.5) / 0.5 normalization (same as classifier)
auto normalizedData = NormalizeAndPermuteCls(resized);
// NOTE(review): post-change path — this single call replaces the manual
// normalize/pad sequence that appears above and below it.
std::vector<float> inputData = PreprocessCropToBucket(croppedImage, imgH_, bucketW);
// Pad to at least kRecImgW width (matching official PaddleOCR behavior)
// Official PaddleOCR: padding_im = np.zeros((C, H, W)), then copies normalized
// image into left portion. Padding value = 0.0 in normalized space.
// NOTE(review): everything from here to the `}` before "Create input tensor"
// is the removed pre-change padding path (it redeclares `inputData`).
int imgW = std::max(resizedW, kRecImgW);
std::vector<float> inputData;
if (imgW > resizedW) {
// Zero-pad on the right (CHW layout)
inputData.resize(3 * imgH_ * imgW, 0.0f);
for (int c = 0; c < 3; c++) {
for (int y = 0; y < imgH_; y++) {
std::memcpy(
&inputData[c * imgH_ * imgW + y * imgW],
&normalizedData[c * imgH_ * resizedW + y * resizedW],
resizedW * sizeof(float));
}
}
} else {
inputData = std::move(normalizedData);
}
// Create input tensor with (possibly padded) width
// NOTE(review): old shape (imgW) on the next line, new shape (bucketW) after.
std::array<int64_t, 4> inputShape = { 1, 3, imgH_, imgW };
std::array<int64_t, 4> inputShape = { 1, 3, imgH_, bucketW };
Ort::Value inputTensor = Ort::Value::CreateTensor<float>(
*memory_info_handler, inputData.data(), inputData.size(),
inputShape.data(), inputShape.size());
// Run inference
auto outputTensors = ort_session->Run(
Ort::RunOptions{ nullptr },
input_node_names.data(), &inputTensor, 1,
output_node_names.data(), num_outputs);
// Get output
float* outputData = outputTensors[0].GetTensorMutableData<float>();
auto outputShape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape();
// NOTE(review): the next two lines are byte-identical — old and new
// revisions of the same statement (likely only indentation changed).
int seqLen = static_cast<int>(outputShape[1]);
int seqLen = static_cast<int>(outputShape[1]);
int numClasses = static_cast<int>(outputShape[2]);
return CTCDecode(outputData, seqLen, numClasses);
@@ -110,18 +140,162 @@ TextLine ONNXOCRRecognizer::Recognize(const cv::Mat& croppedImage) {
}
}
std::vector<TextLine> ONNXOCRRecognizer::RecognizeBatch(const std::vector<cv::Mat>& croppedImages) {
std::vector<TextLine> results;
results.reserve(croppedImages.size());
void ONNXOCRRecognizer::RunBatchAtWidth(const std::vector<cv::Mat>& crops,
const std::vector<size_t>& origIndices,
int bucketW,
std::vector<TextLine>& out) {
if (crops.empty()) return;
// Process one at a time (dynamic width per image)
for (size_t i = 0; i < croppedImages.size(); i++) {
results.push_back(Recognize(croppedImages[i]));
try {
const size_t batchN = crops.size();
const size_t perImage = static_cast<size_t>(3) * imgH_ * bucketW;
// Stack N preprocessed crops into one [N,3,H,W] buffer
std::vector<float> batchInput(batchN * perImage, 0.0f);
for (size_t i = 0; i < batchN; ++i) {
auto img = PreprocessCropToBucket(crops[i], imgH_, bucketW);
std::memcpy(&batchInput[i * perImage], img.data(),
perImage * sizeof(float));
}
std::array<int64_t, 4> inputShape = {
static_cast<int64_t>(batchN), 3,
static_cast<int64_t>(imgH_),
static_cast<int64_t>(bucketW)
};
Ort::Value inputTensor = Ort::Value::CreateTensor<float>(
*memory_info_handler, batchInput.data(), batchInput.size(),
inputShape.data(), inputShape.size());
auto outputTensors = ort_session->Run(
Ort::RunOptions{ nullptr },
input_node_names.data(), &inputTensor, 1,
output_node_names.data(), num_outputs);
float* outputData = outputTensors[0].GetTensorMutableData<float>();
auto outputShape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape();
// Expected output: [N, seqLen, numClasses]
if (outputShape.size() < 3) {
std::cerr << "[ONNXOCRRecognizer] Unexpected batch output rank: "
<< outputShape.size() << std::endl;
return;
}
const int outBatch = static_cast<int>(outputShape[0]);
const int seqLen = static_cast<int>(outputShape[1]);
const int numClasses = static_cast<int>(outputShape[2]);
const size_t perRow = static_cast<size_t>(seqLen) * numClasses;
for (int i = 0; i < outBatch && i < static_cast<int>(batchN); ++i) {
TextLine tl = CTCDecode(outputData + i * perRow, seqLen, numClasses);
out[origIndices[i]] = std::move(tl);
}
}
catch (const Ort::Exception& e) {
// ORT will throw if the model doesn't support a batch dimension > 1.
// Fall back to per-image inference for this group.
std::cerr << "[ONNXOCRRecognizer] Batch inference failed at bucketW="
<< bucketW << " (" << e.what()
<< ") — falling back to single-image path." << std::endl;
for (size_t i = 0; i < crops.size(); ++i) {
// Direct call (we already hold _mutex via the public RecognizeBatch
// wrapper). Replicate the single-image preprocessing here to avoid
// re-entering Recognize() and double-locking the mutex.
try {
cv::Mat resized = ResizeRecImage(crops[i], imgH_, imgMaxW_);
int singleBucket = RoundUpToBucket(resized.cols);
auto inputData = PreprocessCropToBucket(crops[i], imgH_, singleBucket);
std::array<int64_t, 4> inputShape = { 1, 3, imgH_, singleBucket };
Ort::Value inputTensor = Ort::Value::CreateTensor<float>(
*memory_info_handler, inputData.data(), inputData.size(),
inputShape.data(), inputShape.size());
auto outputTensors = ort_session->Run(
Ort::RunOptions{ nullptr },
input_node_names.data(), &inputTensor, 1,
output_node_names.data(), num_outputs);
float* outData = outputTensors[0].GetTensorMutableData<float>();
auto outShape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape();
int seqLen = static_cast<int>(outShape[1]);
int numClasses = static_cast<int>(outShape[2]);
out[origIndices[i]] = CTCDecode(outData, seqLen, numClasses);
} catch (const Ort::Exception& e2) {
std::cerr << "[ONNXOCRRecognizer] Single-image fallback also failed: "
<< e2.what() << std::endl;
out[origIndices[i]] = {};
}
}
}
}
// Recognizes a batch of plate crops. Crops are grouped by the bucket width
// their resized form rounds up to, and each non-empty group is run as one
// batched inference. Empty crops (and any crop whose inference fails) are
// returned as default-constructed TextLine entries, in input order.
std::vector<TextLine> ONNXOCRRecognizer::RecognizeBatch(const std::vector<cv::Mat>& croppedImages) {
    std::lock_guard<std::mutex> lock(_mutex);
    std::vector<TextLine> results(croppedImages.size());
    if (!ort_session || croppedImages.empty() || keys_.empty()) return results;
    // Partition the inputs by target bucket width.
    std::vector<std::vector<cv::Mat>> bucketCrops(kRecNumBuckets);
    std::vector<std::vector<size_t>> bucketIndices(kRecNumBuckets);
    for (size_t idx = 0; idx < croppedImages.size(); ++idx) {
        const cv::Mat& crop = croppedImages[idx];
        if (crop.empty()) continue;
        cv::Mat scaled = ResizeRecImage(crop, imgH_, imgMaxW_);
        const int targetW = RoundUpToBucket(scaled.cols);
        // Map the width back to its bucket slot; default to the widest slot
        // when targetW is not one of the listed bucket widths.
        int slot = kRecNumBuckets - 1;
        for (int b = 0; b < kRecNumBuckets; ++b) {
            if (kRecBucketWidths[b] == targetW) {
                slot = b;
                break;
            }
        }
        bucketCrops[slot].push_back(crop);
        bucketIndices[slot].push_back(idx);
    }
    // One batched inference per populated bucket.
    for (int b = 0; b < kRecNumBuckets; ++b) {
        if (!bucketCrops[b].empty()) {
            RunBatchAtWidth(bucketCrops[b], bucketIndices[b], kRecBucketWidths[b], results);
        }
    }
    return results;
}
// One-time warmup: runs a dummy inference at every bucket width so the
// backend can tune/cache its kernels before real traffic arrives. Safe to
// call repeatedly; only the first call does any work.
void ONNXOCRRecognizer::Warmup() {
    std::lock_guard<std::mutex> lock(_mutex);
    if (_warmedUp || !ort_session) return;
    // Mid-grey 3-channel dummy, larger than any bucket so every warmup pass
    // exercises the downscale path.
    const int widestBucket = kRecBucketWidths[kRecNumBuckets - 1];
    cv::Mat dummy(imgH_ * 2, widestBucket * 2, CV_8UC3, cv::Scalar(128, 128, 128));
    for (const int bucketW : kRecBucketWidths) {
        try {
            auto inputData = PreprocessCropToBucket(dummy, imgH_, bucketW);
            std::array<int64_t, 4> inputShape = { 1, 3, imgH_, bucketW };
            Ort::Value inputTensor = Ort::Value::CreateTensor<float>(
                *memory_info_handler, inputData.data(), inputData.size(),
                inputShape.data(), inputShape.size());
            const auto started = std::chrono::high_resolution_clock::now();
            (void)ort_session->Run(
                Ort::RunOptions{ nullptr },
                input_node_names.data(), &inputTensor, 1,
                output_node_names.data(), num_outputs);
            const auto finished = std::chrono::high_resolution_clock::now();
            const double ms =
                std::chrono::duration<double, std::milli>(finished - started).count();
            std::cout << "[ONNXOCRRecognizer] Warmup bucketW=" << bucketW
                      << " " << ms << " ms" << std::endl;
        }
        catch (const Ort::Exception& e) {
            std::cerr << "[ONNXOCRRecognizer] Warmup failed at bucketW="
                      << bucketW << ": " << e.what() << std::endl;
        }
    }
    _warmedUp = true;
}
TextLine ONNXOCRRecognizer::CTCDecode(const float* outputData, int seqLen, int numClasses) {
TextLine result;
std::string text;