#include "ONNXOCRRecognizer.h"

#include <algorithm>
#include <array>
#include <cfloat>
#include <chrono>
#include <cstring>
#include <iostream>
#include <mutex>
#include <numeric>

namespace ANSCENTER {
namespace onnxocr {

ONNXOCRRecognizer::ONNXOCRRecognizer(const std::string& onnx_path,
                                     unsigned int num_threads)
    : BasicOrtHandler(onnx_path, num_threads) {
}

ONNXOCRRecognizer::ONNXOCRRecognizer(const std::string& onnx_path,
                                     const OrtHandlerOptions& options,
                                     unsigned int num_threads)
    : BasicOrtHandler(onnx_path, options, num_threads) {
}

// Load the CTC character dictionary into keys_.
// keys_[0] is the CTC blank; a usable dictionary therefore needs >= 2 entries.
bool ONNXOCRRecognizer::LoadDictionary(const std::string& dictPath) {
    keys_ = LoadDict(dictPath);
    if (keys_.size() < 2) {
        std::cerr << "[ONNXOCRRecognizer] Failed to load dictionary: "
                  << dictPath << std::endl;
        return false;
    }
    std::cout << "[ONNXOCRRecognizer] Loaded dictionary with " << keys_.size()
              << " characters from: " << dictPath << std::endl;
    return true;
}

Ort::Value ONNXOCRRecognizer::transform(const cv::Mat& mat) {
    // Not used directly - recognition uses custom preprocess with dynamic width
    cv::Mat resized = ResizeRecImage(mat, imgH_, imgMaxW_);
    resized.convertTo(resized, CV_32FC3);
    auto data = NormalizeAndPermuteCls(resized);
    input_values_handler.assign(data.begin(), data.end());
    return Ort::Value::CreateTensor<float>(
        *memory_info_handler,
        input_values_handler.data(), input_values_handler.size(),
        input_node_dims.data(), input_node_dims.size());
}

Ort::Value ONNXOCRRecognizer::transformBatch(const std::vector<cv::Mat>& images) {
    // Not used - recognizer processes single images with dynamic widths
    if (!images.empty()) {
        return transform(images[0]);
    }
    return Ort::Value(nullptr);
}

// ----------------------------------------------------------------------------
// Width buckets — every recognizer input is padded up to one of these widths
// before reaching ORT. This bounds the number of distinct shapes cuDNN ever
// sees to four, so its HEURISTIC algorithm cache hits on every subsequent
// call instead of re-tuning per plate. Buckets cover the realistic range:
//   320 px -> short Latin/Japanese plates (most common)
//   480 px -> wider Latin plates with two rows of text
//   640 px -> long single-row plates / multi-line stacked text
//   960 px -> safety upper bound (== kRecImgMaxW)
// ----------------------------------------------------------------------------
static constexpr int kRecBucketWidths[] = { 320, 480, 640, 960 };
static constexpr int kRecNumBuckets =
    sizeof(kRecBucketWidths) / sizeof(kRecBucketWidths[0]);

// Round a resized width up to the smallest bucket that can hold it,
// capping at imgMaxW_ first so oversized crops land in the last bucket.
int ONNXOCRRecognizer::RoundUpToBucket(int resizedW) const {
    const int capped = std::min(resizedW, imgMaxW_);
    for (int b = 0; b < kRecNumBuckets; ++b) {
        if (kRecBucketWidths[b] >= capped) return kRecBucketWidths[b];
    }
    return imgMaxW_;
}

// Resize + normalize a single crop into a CHW float vector at width
// `bucketW`, padding with zeros on the right when needed. The returned
// vector has exactly 3*imgH*bucketW elements.
static std::vector<float> PreprocessCropToBucket(const cv::Mat& crop,
                                                 int imgH, int bucketW) {
    cv::Mat resized = ResizeRecImage(crop, imgH, bucketW);
    int resizedW = resized.cols;
    resized.convertTo(resized, CV_32FC3);
    auto normalizedData = NormalizeAndPermuteCls(resized);
    if (resizedW == bucketW) {
        return normalizedData;
    }
    // Zero-pad on the right (CHW layout): copy each row of each channel
    // into the wider destination stride; the remainder stays 0.
    std::vector<float> padded(3 * imgH * bucketW, 0.0f);
    for (int c = 0; c < 3; c++) {
        for (int y = 0; y < imgH; y++) {
            std::memcpy(
                &padded[c * imgH * bucketW + y * bucketW],
                &normalizedData[c * imgH * resizedW + y * resizedW],
                resizedW * sizeof(float));
        }
    }
    return padded;
}

// Recognize text in a single pre-cropped text-line image.
// Returns an empty TextLine on any failure (no session, empty input,
// missing dictionary, or ORT error).
TextLine ONNXOCRRecognizer::Recognize(const cv::Mat& croppedImage) {
    std::lock_guard<std::mutex> lock(_mutex);
    if (!ort_session || croppedImage.empty() || keys_.empty()) {
        return {};
    }
    try {
        // Step 1: aspect-preserving resize to height=imgH_, width capped
        // at imgMaxW_. Then round resized width up to the next bucket.
        cv::Mat resized = ResizeRecImage(croppedImage, imgH_, imgMaxW_);
        const int bucketW = RoundUpToBucket(resized.cols);

        std::vector<float> inputData =
            PreprocessCropToBucket(croppedImage, imgH_, bucketW);

        std::array<int64_t, 4> inputShape = { 1, 3, imgH_, bucketW };
        Ort::Value inputTensor = Ort::Value::CreateTensor<float>(
            *memory_info_handler,
            inputData.data(), inputData.size(),
            inputShape.data(), inputShape.size());

        auto outputTensors = ort_session->Run(
            Ort::RunOptions{ nullptr },
            input_node_names.data(), &inputTensor, 1,
            output_node_names.data(), num_outputs);

        float* outputData = outputTensors[0].GetTensorMutableData<float>();
        auto outputShape =
            outputTensors[0].GetTensorTypeAndShapeInfo().GetShape();
        int seqLen = static_cast<int>(outputShape[1]);
        int numClasses = static_cast<int>(outputShape[2]);

        return CTCDecode(outputData, seqLen, numClasses);
    } catch (const Ort::Exception& e) {
        std::cerr << "[ONNXOCRRecognizer] Inference failed: "
                  << e.what() << std::endl;
        return {};
    }
}

// Run one batched inference for a group of crops that all share the same
// bucket width, scattering results into `out` at the crops' original
// indices. Caller must already hold _mutex.
void ONNXOCRRecognizer::RunBatchAtWidth(const std::vector<cv::Mat>& crops,
                                        const std::vector<size_t>& origIndices,
                                        int bucketW,
                                        std::vector<TextLine>& out) {
    if (crops.empty()) return;
    try {
        const size_t batchN = crops.size();
        const size_t perImage = static_cast<size_t>(3) * imgH_ * bucketW;

        // Stack N preprocessed crops into one [N,3,H,W] buffer
        std::vector<float> batchInput(batchN * perImage, 0.0f);
        for (size_t i = 0; i < batchN; ++i) {
            auto img = PreprocessCropToBucket(crops[i], imgH_, bucketW);
            std::memcpy(&batchInput[i * perImage], img.data(),
                        perImage * sizeof(float));
        }

        std::array<int64_t, 4> inputShape = {
            static_cast<int64_t>(batchN), 3,
            static_cast<int64_t>(imgH_), static_cast<int64_t>(bucketW)
        };
        Ort::Value inputTensor = Ort::Value::CreateTensor<float>(
            *memory_info_handler,
            batchInput.data(), batchInput.size(),
            inputShape.data(), inputShape.size());

        auto outputTensors = ort_session->Run(
            Ort::RunOptions{ nullptr },
            input_node_names.data(), &inputTensor, 1,
            output_node_names.data(), num_outputs);

        float* outputData = outputTensors[0].GetTensorMutableData<float>();
        auto outputShape =
            outputTensors[0].GetTensorTypeAndShapeInfo().GetShape();

        // Expected output: [N, seqLen, numClasses]
        if (outputShape.size() < 3) {
            std::cerr << "[ONNXOCRRecognizer] Unexpected batch output rank: "
                      << outputShape.size() << std::endl;
            return;
        }
        const int outBatch = static_cast<int>(outputShape[0]);
        const int seqLen = static_cast<int>(outputShape[1]);
        const int numClasses = static_cast<int>(outputShape[2]);
        const size_t perRow = static_cast<size_t>(seqLen) * numClasses;

        for (int i = 0; i < outBatch && i < static_cast<int>(batchN); ++i) {
            TextLine tl = CTCDecode(outputData + i * perRow, seqLen, numClasses);
            out[origIndices[i]] = std::move(tl);
        }
    } catch (const Ort::Exception& e) {
        // ORT will throw if the model doesn't support a batch dimension > 1.
        // Fall back to per-image inference for this group.
        std::cerr << "[ONNXOCRRecognizer] Batch inference failed at bucketW="
                  << bucketW << " (" << e.what()
                  << ") — falling back to single-image path." << std::endl;
        for (size_t i = 0; i < crops.size(); ++i) {
            // Direct call (we already hold _mutex via the public RecognizeBatch
            // wrapper). Replicate the single-image preprocessing here to avoid
            // re-entering Recognize() and double-locking the mutex.
            try {
                cv::Mat resized = ResizeRecImage(crops[i], imgH_, imgMaxW_);
                int singleBucket = RoundUpToBucket(resized.cols);
                auto inputData =
                    PreprocessCropToBucket(crops[i], imgH_, singleBucket);
                std::array<int64_t, 4> inputShape = {
                    1, 3, imgH_, singleBucket
                };
                Ort::Value inputTensor = Ort::Value::CreateTensor<float>(
                    *memory_info_handler,
                    inputData.data(), inputData.size(),
                    inputShape.data(), inputShape.size());
                auto outputTensors = ort_session->Run(
                    Ort::RunOptions{ nullptr },
                    input_node_names.data(), &inputTensor, 1,
                    output_node_names.data(), num_outputs);
                float* outData = outputTensors[0].GetTensorMutableData<float>();
                auto outShape =
                    outputTensors[0].GetTensorTypeAndShapeInfo().GetShape();
                int seqLen = static_cast<int>(outShape[1]);
                int numClasses = static_cast<int>(outShape[2]);
                out[origIndices[i]] = CTCDecode(outData, seqLen, numClasses);
            } catch (const Ort::Exception& e2) {
                std::cerr << "[ONNXOCRRecognizer] Single-image fallback also failed: "
                          << e2.what() << std::endl;
                out[origIndices[i]] = {};
            }
        }
    }
}

// Recognize a batch of pre-cropped text-line images. Crops are grouped by
// their bucket width and each group runs as one batched inference, so cuDNN
// only ever sees kRecNumBuckets distinct input shapes. Results are returned
// in the same order as the input; empty crops yield empty TextLines.
std::vector<TextLine> ONNXOCRRecognizer::RecognizeBatch(
        const std::vector<cv::Mat>& croppedImages) {
    std::lock_guard<std::mutex> lock(_mutex);
    std::vector<TextLine> results(croppedImages.size());
    if (!ort_session || croppedImages.empty() || keys_.empty()) {
        return results;
    }

    // Group crops by their target bucket width
    std::vector<std::vector<cv::Mat>> groupCrops(kRecNumBuckets);
    std::vector<std::vector<size_t>> groupIdx(kRecNumBuckets);
    for (size_t i = 0; i < croppedImages.size(); ++i) {
        if (croppedImages[i].empty()) continue;
        cv::Mat resized = ResizeRecImage(croppedImages[i], imgH_, imgMaxW_);
        const int bw = RoundUpToBucket(resized.cols);
        // Find bucket index
        int bucketIdx = kRecNumBuckets - 1;
        for (int b = 0; b < kRecNumBuckets; ++b) {
            if (kRecBucketWidths[b] == bw) { bucketIdx = b; break; }
        }
        groupCrops[bucketIdx].push_back(croppedImages[i]);
        groupIdx[bucketIdx].push_back(i);
    }

    // Run one batched inference per non-empty bucket
    for (int b = 0; b < kRecNumBuckets; ++b) {
        if (groupCrops[b].empty()) continue;
        RunBatchAtWidth(groupCrops[b], groupIdx[b], kRecBucketWidths[b],
                        results);
    }
    return results;
}

// Run one dummy inference per bucket width so downstream kernels are tuned
// before the first real frame arrives. Idempotent: only the first call does
// any work.
void ONNXOCRRecognizer::Warmup() {
    std::lock_guard<std::mutex> lock(_mutex);
    if (_warmedUp || !ort_session) return;

    // Dummy 3-channel image, mid-grey, large enough to resize to imgH_
    cv::Mat dummy(imgH_ * 2, kRecBucketWidths[kRecNumBuckets - 1] * 2,
                  CV_8UC3, cv::Scalar(128, 128, 128));

    for (int b = 0; b < kRecNumBuckets; ++b) {
        const int bucketW = kRecBucketWidths[b];
        try {
            auto inputData = PreprocessCropToBucket(dummy, imgH_, bucketW);
            std::array<int64_t, 4> inputShape = { 1, 3, imgH_, bucketW };
            Ort::Value inputTensor = Ort::Value::CreateTensor<float>(
                *memory_info_handler,
                inputData.data(), inputData.size(),
                inputShape.data(), inputShape.size());
            auto t0 = std::chrono::high_resolution_clock::now();
            (void)ort_session->Run(
                Ort::RunOptions{ nullptr },
                input_node_names.data(), &inputTensor, 1,
                output_node_names.data(), num_outputs);
            auto t1 = std::chrono::high_resolution_clock::now();
            double ms = std::chrono::duration<double, std::milli>(t1 - t0).count();
            std::cout << "[ONNXOCRRecognizer] Warmup bucketW=" << bucketW
                      << " " << ms << " ms" << std::endl;
        } catch (const Ort::Exception& e) {
            std::cerr << "[ONNXOCRRecognizer] Warmup failed at bucketW="
                      << bucketW << ": " << e.what() << std::endl;
        }
    }
    _warmedUp = true;
}

// Greedy CTC decoding of a [seqLen, numClasses] logit/probability matrix:
// per timestep take the argmax class, then drop blanks (index 0) and
// collapse consecutive repeats. The returned score is the mean of the
// argmax values over emitted characters.
TextLine ONNXOCRRecognizer::CTCDecode(const float* outputData,
                                      int seqLen, int numClasses) {
    TextLine result;
    std::string text;
    std::vector<float> scores;
    int lastIndex = 0;  // CTC blank is index 0

    for (int t = 0; t < seqLen; t++) {
        // Find argmax for this timestep
        int maxIndex = 0;
        float maxValue = -FLT_MAX;
        const float* timeStep = outputData + t * numClasses;
        for (int c = 0; c < numClasses; c++) {
            if (timeStep[c] > maxValue) {
                maxValue = timeStep[c];
                maxIndex = c;
            }
        }
        // CTC decode: skip blanks (index 0) and repeated characters
        if (maxIndex != 0 && maxIndex != lastIndex) {
            if (maxIndex > 0 && maxIndex < static_cast<int>(keys_.size())) {
                text += keys_[maxIndex];  // keys_[0]="#"(blank), keys_[1]=first_char, etc.
                // Use raw model output value as confidence (PaddleOCR v5
                // models include softmax)
                scores.push_back(maxValue);
            }
        }
        lastIndex = maxIndex;
    }

    result.text = text;
    if (!scores.empty()) {
        result.score = std::accumulate(scores.begin(), scores.end(), 0.0f)
                       / static_cast<float>(scores.size());
    }
    return result;
}

} // namespace onnxocr
} // namespace ANSCENTER