2026-03-28 16:54:11 +11:00
|
|
|
#include "ONNXOCRRecognizer.h"
|
|
|
|
|
|
|
|
|
|
#include <opencv2/imgproc.hpp>
|
|
|
|
|
#include <iostream>
|
|
|
|
|
#include <algorithm>
|
|
|
|
|
#include <numeric>
|
|
|
|
|
#include <cmath>
|
|
|
|
|
#include <cfloat>
|
|
|
|
|
#include <cstring>
|
2026-04-14 20:30:21 +10:00
|
|
|
#include <chrono>
|
2026-03-28 16:54:11 +11:00
|
|
|
|
|
|
|
|
namespace ANSCENTER {
|
|
|
|
|
namespace onnxocr {
|
|
|
|
|
|
|
|
|
|
// Construct a recognizer from a model path using the handler's default
// session options and the given intra-op thread count.
ONNXOCRRecognizer::ONNXOCRRecognizer(const std::string& onnx_path, unsigned int num_threads)
    : BasicOrtHandler(onnx_path, num_threads) {}
|
|
|
|
|
|
2026-04-14 20:30:21 +10:00
|
|
|
// Construct a recognizer from a model path with caller-supplied ORT
// handler options (execution provider, profiles, etc.).
ONNXOCRRecognizer::ONNXOCRRecognizer(const std::string& onnx_path,
                                     const OrtHandlerOptions& options,
                                     unsigned int num_threads)
    : BasicOrtHandler(onnx_path, options, num_threads) {}
|
|
|
|
|
|
2026-03-28 16:54:11 +11:00
|
|
|
// Load the character dictionary used by CTC decoding.
// Returns false (and logs to stderr) when the file yields fewer than two
// entries — a usable dictionary needs at least the blank plus one
// character. On success logs the entry count and returns true.
bool ONNXOCRRecognizer::LoadDictionary(const std::string& dictPath) {
    keys_ = LoadDict(dictPath);

    const bool usable = keys_.size() >= 2;
    if (usable) {
        std::cout << "[ONNXOCRRecognizer] Loaded dictionary with " << keys_.size()
                  << " characters from: " << dictPath << std::endl;
    } else {
        std::cerr << "[ONNXOCRRecognizer] Failed to load dictionary: " << dictPath << std::endl;
    }
    return usable;
}
|
|
|
|
|
|
|
|
|
|
// BasicOrtHandler hook. Recognition normally goes through its own
// dynamic-width preprocessing, so this path is effectively unused; it
// still produces a valid tensor at the handler's declared input dims.
Ort::Value ONNXOCRRecognizer::transform(const cv::Mat& mat) {
    cv::Mat canvas = ResizeRecImage(mat, imgH_, imgMaxW_);
    canvas.convertTo(canvas, CV_32FC3);

    const auto chw = NormalizeAndPermuteCls(canvas);
    input_values_handler.assign(chw.begin(), chw.end());

    return Ort::Value::CreateTensor<float>(
        *memory_info_handler,
        input_values_handler.data(), input_values_handler.size(),
        input_node_dims.data(), input_node_dims.size());
}
|
|
|
|
|
|
|
|
|
|
// Unused BasicOrtHandler hook: the recognizer handles batching itself
// (dynamic widths), so this simply delegates the first image to
// transform(), or returns a null tensor for an empty input.
Ort::Value ONNXOCRRecognizer::transformBatch(const std::vector<cv::Mat>& images) {
    if (images.empty()) {
        return Ort::Value(nullptr);
    }
    return transform(images.front());
}
|
|
|
|
|
|
2026-04-14 20:30:21 +10:00
|
|
|
// ----------------------------------------------------------------------------
|
|
|
|
|
// Width buckets — every recognizer input is padded up to one of these widths
|
|
|
|
|
// before reaching ORT. This bounds the number of distinct shapes cuDNN ever
|
|
|
|
|
// sees to four, so its HEURISTIC algorithm cache hits on every subsequent
|
|
|
|
|
// call instead of re-tuning per plate. Buckets cover the realistic range:
|
|
|
|
|
// 320 px → short Latin/Japanese plates (most common)
|
|
|
|
|
// 480 px → wider Latin plates with two rows of text
|
|
|
|
|
// 640 px → long single-row plates / multi-line stacked text
|
|
|
|
|
// 960 px → safety upper bound (== kRecImgMaxW)
|
|
|
|
|
// ----------------------------------------------------------------------------
|
|
|
|
|
static constexpr int kRecBucketWidths[] = { 320, 480, 640, 960 };
// Number of entries in kRecBucketWidths (stays in sync automatically).
static constexpr int kRecNumBuckets = sizeof(kRecBucketWidths) / sizeof(kRecBucketWidths[0]);
|
|
|
|
|
|
|
|
|
|
int ONNXOCRRecognizer::RoundUpToBucket(int resizedW) const {
|
|
|
|
|
const int capped = std::min(resizedW, imgMaxW_);
|
|
|
|
|
for (int b = 0; b < kRecNumBuckets; ++b) {
|
|
|
|
|
if (kRecBucketWidths[b] >= capped) return kRecBucketWidths[b];
|
|
|
|
|
}
|
|
|
|
|
return imgMaxW_;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Resize + normalize a single crop into a CHW float vector at width
|
|
|
|
|
// `bucketW`, padding with zeros on the right when needed. The returned
|
|
|
|
|
// vector has exactly 3*imgH_*bucketW elements.
|
|
|
|
|
static std::vector<float> PreprocessCropToBucket(const cv::Mat& crop,
|
|
|
|
|
int imgH, int bucketW) {
|
|
|
|
|
cv::Mat resized = ResizeRecImage(crop, imgH, bucketW);
|
|
|
|
|
int resizedW = resized.cols;
|
|
|
|
|
resized.convertTo(resized, CV_32FC3);
|
|
|
|
|
auto normalizedData = NormalizeAndPermuteCls(resized);
|
|
|
|
|
|
|
|
|
|
if (resizedW == bucketW) {
|
|
|
|
|
return normalizedData;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Zero-pad on the right (CHW layout)
|
|
|
|
|
std::vector<float> padded(3 * imgH * bucketW, 0.0f);
|
|
|
|
|
for (int c = 0; c < 3; c++) {
|
|
|
|
|
for (int y = 0; y < imgH; y++) {
|
|
|
|
|
std::memcpy(
|
|
|
|
|
&padded[c * imgH * bucketW + y * bucketW],
|
|
|
|
|
&normalizedData[c * imgH * resizedW + y * resizedW],
|
|
|
|
|
resizedW * sizeof(float));
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
return padded;
|
|
|
|
|
}
|
|
|
|
|
|
2026-03-28 16:54:11 +11:00
|
|
|
// Run recognition on a single pre-cropped text image.
// Serialized on _mutex, so concurrent callers are safe. Returns a
// default-constructed TextLine when the session, input image, or
// dictionary is unavailable, or when ORT throws.
TextLine ONNXOCRRecognizer::Recognize(const cv::Mat& croppedImage) {
    std::lock_guard<std::mutex> lock(_mutex);

    if (!ort_session || croppedImage.empty() || keys_.empty()) {
        return {};
    }

    try {
        // Step 1: aspect-preserving resize to height=imgH_, width capped
        // at imgMaxW_. Then round resized width up to the next bucket.
        cv::Mat resized = ResizeRecImage(croppedImage, imgH_, imgMaxW_);
        const int bucketW = RoundUpToBucket(resized.cols);

        // Step 2: normalize the ORIGINAL crop straight into the bucket
        // width (PreprocessCropToBucket re-runs the resize internally and
        // zero-pads on the right up to bucketW).
        std::vector<float> inputData = PreprocessCropToBucket(croppedImage, imgH_, bucketW);

        // Step 3: wrap the host buffer as a [1,3,H,W] tensor. inputData
        // must outlive Run() — CreateTensor does not copy the buffer.
        std::array<int64_t, 4> inputShape = { 1, 3, imgH_, bucketW };
        Ort::Value inputTensor = Ort::Value::CreateTensor<float>(
            *memory_info_handler, inputData.data(), inputData.size(),
            inputShape.data(), inputShape.size());

        auto outputTensors = ort_session->Run(
            Ort::RunOptions{ nullptr },
            input_node_names.data(), &inputTensor, 1,
            output_node_names.data(), num_outputs);

        // Output is read as [batch, seqLen, numClasses] CTC probabilities.
        float* outputData = outputTensors[0].GetTensorMutableData<float>();
        auto outputShape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape();

        int seqLen = static_cast<int>(outputShape[1]);
        int numClasses = static_cast<int>(outputShape[2]);

        return CTCDecode(outputData, seqLen, numClasses);
    }
    catch (const Ort::Exception& e) {
        std::cerr << "[ONNXOCRRecognizer] Inference failed: " << e.what() << std::endl;
        return {};
    }
}
|
|
|
|
|
|
2026-04-14 20:30:21 +10:00
|
|
|
// Run one batched inference over `crops`, all padded to the same bucket
// width, writing each decoded TextLine into out[origIndices[i]].
// Caller must already hold _mutex and must have sized `out` so every
// entry of origIndices is a valid index. If the batched Run() throws
// (e.g. the engine rejects a batch dimension > 1), each crop is retried
// individually so one bad batch cannot lose the whole group.
void ONNXOCRRecognizer::RunBatchAtWidth(const std::vector<cv::Mat>& crops,
                                        const std::vector<size_t>& origIndices,
                                        int bucketW,
                                        std::vector<TextLine>& out) {
    if (crops.empty()) return;

    try {
        const size_t batchN = crops.size();
        // Element count of one [3, imgH_, bucketW] image.
        const size_t perImage = static_cast<size_t>(3) * imgH_ * bucketW;

        // Stack N preprocessed crops into one [N,3,H,W] buffer
        std::vector<float> batchInput(batchN * perImage, 0.0f);
        for (size_t i = 0; i < batchN; ++i) {
            auto img = PreprocessCropToBucket(crops[i], imgH_, bucketW);
            std::memcpy(&batchInput[i * perImage], img.data(),
                        perImage * sizeof(float));
        }

        std::array<int64_t, 4> inputShape = {
            static_cast<int64_t>(batchN), 3,
            static_cast<int64_t>(imgH_),
            static_cast<int64_t>(bucketW)
        };
        // batchInput must stay alive until Run() returns — CreateTensor
        // wraps the buffer without copying it.
        Ort::Value inputTensor = Ort::Value::CreateTensor<float>(
            *memory_info_handler, batchInput.data(), batchInput.size(),
            inputShape.data(), inputShape.size());

        auto outputTensors = ort_session->Run(
            Ort::RunOptions{ nullptr },
            input_node_names.data(), &inputTensor, 1,
            output_node_names.data(), num_outputs);

        float* outputData = outputTensors[0].GetTensorMutableData<float>();
        auto outputShape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape();

        // Expected output: [N, seqLen, numClasses]
        if (outputShape.size() < 3) {
            std::cerr << "[ONNXOCRRecognizer] Unexpected batch output rank: "
                      << outputShape.size() << std::endl;
            return;
        }

        const int outBatch = static_cast<int>(outputShape[0]);
        const int seqLen = static_cast<int>(outputShape[1]);
        const int numClasses = static_cast<int>(outputShape[2]);
        // Stride (in floats) between consecutive images in the output.
        const size_t perRow = static_cast<size_t>(seqLen) * numClasses;

        // Guard on outBatch in case the model returns fewer rows than
        // requested; decode each row back into its original slot.
        for (int i = 0; i < outBatch && i < static_cast<int>(batchN); ++i) {
            TextLine tl = CTCDecode(outputData + i * perRow, seqLen, numClasses);
            out[origIndices[i]] = std::move(tl);
        }
    }
    catch (const Ort::Exception& e) {
        // ORT will throw if the model doesn't support a batch dimension > 1.
        // Fall back to per-image inference for this group.
        std::cerr << "[ONNXOCRRecognizer] Batch inference failed at bucketW="
                  << bucketW << " (" << e.what()
                  << ") — falling back to single-image path." << std::endl;
        for (size_t i = 0; i < crops.size(); ++i) {
            // Direct call (we already hold _mutex via the public RecognizeBatch
            // wrapper). Replicate the single-image preprocessing here to avoid
            // re-entering Recognize() and double-locking the mutex.
            try {
                cv::Mat resized = ResizeRecImage(crops[i], imgH_, imgMaxW_);
                int singleBucket = RoundUpToBucket(resized.cols);
                auto inputData = PreprocessCropToBucket(crops[i], imgH_, singleBucket);
                std::array<int64_t, 4> inputShape = { 1, 3, imgH_, singleBucket };
                Ort::Value inputTensor = Ort::Value::CreateTensor<float>(
                    *memory_info_handler, inputData.data(), inputData.size(),
                    inputShape.data(), inputShape.size());
                auto outputTensors = ort_session->Run(
                    Ort::RunOptions{ nullptr },
                    input_node_names.data(), &inputTensor, 1,
                    output_node_names.data(), num_outputs);
                float* outData = outputTensors[0].GetTensorMutableData<float>();
                auto outShape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape();
                int seqLen = static_cast<int>(outShape[1]);
                int numClasses = static_cast<int>(outShape[2]);
                out[origIndices[i]] = CTCDecode(outData, seqLen, numClasses);
            } catch (const Ort::Exception& e2) {
                // Leave a default TextLine so the caller still gets an entry.
                std::cerr << "[ONNXOCRRecognizer] Single-image fallback also failed: "
                          << e2.what() << std::endl;
                out[origIndices[i]] = {};
            }
        }
    }
}
|
|
|
|
|
|
2026-03-28 16:54:11 +11:00
|
|
|
// Batch entry point: recognize every crop in `croppedImages`, returning
// one TextLine per input (empty or failed crops keep default TextLines
// in their slots). Crops are grouped by bucket width so each ORT call
// sees one of the few fixed input shapes.
std::vector<TextLine> ONNXOCRRecognizer::RecognizeBatch(const std::vector<cv::Mat>& croppedImages) {
    std::lock_guard<std::mutex> lock(_mutex);

    std::vector<TextLine> results(croppedImages.size());
    if (!ort_session || croppedImages.empty() || keys_.empty()) {
        return results;
    }

    // Group crops by their target bucket width
    std::vector<std::vector<cv::Mat>> groupCrops(kRecNumBuckets);
    std::vector<std::vector<size_t>> groupIdx(kRecNumBuckets);

    for (size_t i = 0; i < croppedImages.size(); ++i) {
        if (croppedImages[i].empty()) continue;
        // Resize only to learn this crop's target width; the real
        // preprocessing happens later in RunBatchAtWidth.
        cv::Mat resized = ResizeRecImage(croppedImages[i], imgH_, imgMaxW_);
        const int bw = RoundUpToBucket(resized.cols);
        // Find bucket index
        int bucketIdx = kRecNumBuckets - 1;
        for (int b = 0; b < kRecNumBuckets; ++b) {
            if (kRecBucketWidths[b] == bw) { bucketIdx = b; break; }
        }
        groupCrops[bucketIdx].push_back(croppedImages[i]);
        groupIdx[bucketIdx].push_back(i);
    }

    // Run batched inference per non-empty bucket, slicing each bucket
    // group into chunks of at most kRecMaxBatch crops so we never exceed
    // the TRT dynamic profile's max-batch dimension. On a busy scene with
    // (say) 30 plates all falling in bucket 320, we issue two back-to-back
    // batched calls of 24 + 6 instead of one oversized call that would
    // throw "does not satisfy any optimization profiles" and fall off
    // the fast path to the per-image fallback.
    for (int b = 0; b < kRecNumBuckets; ++b) {
        const auto& bucketCrops = groupCrops[b];
        const auto& bucketIndices = groupIdx[b];
        if (bucketCrops.empty()) continue;

        const int bucketW = kRecBucketWidths[b];
        const size_t total = bucketCrops.size();

        for (size_t start = 0; start < total; start += kRecMaxBatch) {
            const size_t end = std::min(start + static_cast<size_t>(kRecMaxBatch), total);
            std::vector<cv::Mat> chunkCrops(bucketCrops.begin() + start,
                                            bucketCrops.begin() + end);
            std::vector<size_t> chunkIdx(bucketIndices.begin() + start,
                                         bucketIndices.begin() + end);
            RunBatchAtWidth(chunkCrops, chunkIdx, bucketW, results);
        }
    }

    return results;
}
|
|
|
|
|
|
2026-04-14 20:30:21 +10:00
|
|
|
void ONNXOCRRecognizer::Warmup() {
|
|
|
|
|
std::lock_guard<std::mutex> lock(_mutex);
|
|
|
|
|
if (_warmedUp || !ort_session) return;
|
|
|
|
|
|
|
|
|
|
// Dummy 3-channel image, mid-grey, large enough to resize to imgH_
|
|
|
|
|
cv::Mat dummy(imgH_ * 2, kRecBucketWidths[kRecNumBuckets - 1] * 2,
|
|
|
|
|
CV_8UC3, cv::Scalar(128, 128, 128));
|
|
|
|
|
|
|
|
|
|
for (int b = 0; b < kRecNumBuckets; ++b) {
|
|
|
|
|
const int bucketW = kRecBucketWidths[b];
|
|
|
|
|
try {
|
|
|
|
|
auto inputData = PreprocessCropToBucket(dummy, imgH_, bucketW);
|
|
|
|
|
std::array<int64_t, 4> inputShape = { 1, 3, imgH_, bucketW };
|
|
|
|
|
Ort::Value inputTensor = Ort::Value::CreateTensor<float>(
|
|
|
|
|
*memory_info_handler, inputData.data(), inputData.size(),
|
|
|
|
|
inputShape.data(), inputShape.size());
|
|
|
|
|
|
|
|
|
|
auto t0 = std::chrono::high_resolution_clock::now();
|
|
|
|
|
(void)ort_session->Run(
|
|
|
|
|
Ort::RunOptions{ nullptr },
|
|
|
|
|
input_node_names.data(), &inputTensor, 1,
|
|
|
|
|
output_node_names.data(), num_outputs);
|
|
|
|
|
auto t1 = std::chrono::high_resolution_clock::now();
|
|
|
|
|
double ms = std::chrono::duration<double, std::milli>(t1 - t0).count();
|
|
|
|
|
std::cout << "[ONNXOCRRecognizer] Warmup bucketW=" << bucketW
|
|
|
|
|
<< " " << ms << " ms" << std::endl;
|
|
|
|
|
}
|
|
|
|
|
catch (const Ort::Exception& e) {
|
|
|
|
|
std::cerr << "[ONNXOCRRecognizer] Warmup failed at bucketW="
|
|
|
|
|
<< bucketW << ": " << e.what() << std::endl;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
_warmedUp = true;
|
|
|
|
|
}
|
|
|
|
|
|
2026-03-28 16:54:11 +11:00
|
|
|
// Greedy CTC decoding of a [seqLen, numClasses] probability matrix.
// Index 0 is the CTC blank; a character is emitted whenever the
// per-timestep argmax is non-blank, differs from the previous timestep's
// argmax (collapsing repeats), and maps to a dictionary entry. The line
// score is the mean of the emitted characters' raw argmax values (the
// model output already includes softmax).
TextLine ONNXOCRRecognizer::CTCDecode(const float* outputData, int seqLen, int numClasses) {
    TextLine result;
    std::string text;
    std::vector<float> scores;

    // Degenerate class axis: nothing can be decoded.
    if (numClasses <= 0) {
        result.text = text;
        return result;
    }

    int lastIndex = 0; // CTC blank is index 0

    for (int t = 0; t < seqLen; t++) {
        const float* timeStep = outputData + t * numClasses;

        // Argmax over classes for this timestep (first max on ties).
        const float* best = std::max_element(timeStep, timeStep + numClasses);
        const int maxIndex = static_cast<int>(best - timeStep);
        const float maxValue = *best;

        // Emit only non-blank indices that break a run of repeats and
        // fall inside the dictionary.
        if (maxIndex != 0 && maxIndex != lastIndex &&
            maxIndex < static_cast<int>(keys_.size())) {
            text += keys_[maxIndex]; // keys_[0]="#"(blank), keys_[1]=first_char, etc.
            scores.push_back(maxValue);
        }
        lastIndex = maxIndex;
    }

    result.text = text;
    if (!scores.empty()) {
        result.score = std::accumulate(scores.begin(), scores.end(), 0.0f) /
                       static_cast<float>(scores.size());
    }
    return result;
}
|
|
|
|
|
|
|
|
|
|
} // namespace onnxocr
|
|
|
|
|
} // namespace ANSCENTER
|