2026-03-28 16:54:11 +11:00
|
|
|
#include "ONNXOCRRecognizer.h"
|
|
|
|
|
|
|
|
|
|
#include <opencv2/imgproc.hpp>
|
|
|
|
|
#include <iostream>
|
|
|
|
|
#include <algorithm>
|
|
|
|
|
#include <numeric>
|
|
|
|
|
#include <cmath>
|
|
|
|
|
#include <cfloat>
|
|
|
|
|
#include <cstring>
|
2026-04-14 20:30:21 +10:00
|
|
|
#include <chrono>
|
2026-03-28 16:54:11 +11:00
|
|
|
|
|
|
|
|
namespace ANSCENTER {
|
|
|
|
|
namespace onnxocr {
|
|
|
|
|
|
|
|
|
|
// Construct a recognizer from a model path using the handler's default
// session options and the given intra-op thread count.
ONNXOCRRecognizer::ONNXOCRRecognizer(const std::string& onnx_path, unsigned int num_threads)
    : BasicOrtHandler(onnx_path, num_threads) {}
|
|
|
|
|
|
2026-04-14 20:30:21 +10:00
|
|
|
// Construct a recognizer from a model path with caller-supplied ORT
// handler options (execution provider, profiles, etc.).
ONNXOCRRecognizer::ONNXOCRRecognizer(const std::string& onnx_path,
                                     const OrtHandlerOptions& options,
                                     unsigned int num_threads)
    : BasicOrtHandler(onnx_path, options, num_threads) {}
|
|
|
|
|
|
2026-03-28 16:54:11 +11:00
|
|
|
// Load the character dictionary used by CTC decoding.
// Returns false (and logs to stderr) when the file yields fewer than two
// entries — a usable dictionary needs at least the blank plus one
// character. On success logs the entry count and returns true.
bool ONNXOCRRecognizer::LoadDictionary(const std::string& dictPath) {
    keys_ = LoadDict(dictPath);

    const bool usable = keys_.size() >= 2;
    if (usable) {
        std::cout << "[ONNXOCRRecognizer] Loaded dictionary with " << keys_.size()
                  << " characters from: " << dictPath << std::endl;
    } else {
        std::cerr << "[ONNXOCRRecognizer] Failed to load dictionary: " << dictPath << std::endl;
    }
    return usable;
}
|
|
|
|
|
|
|
|
|
|
// BasicOrtHandler hook. Recognition normally goes through its own
// dynamic-width preprocessing, so this path is effectively unused; it
// still produces a valid tensor at the handler's declared input dims.
Ort::Value ONNXOCRRecognizer::transform(const cv::Mat& mat) {
    cv::Mat canvas = ResizeRecImage(mat, imgH_, imgMaxW_);
    canvas.convertTo(canvas, CV_32FC3);

    const auto chw = NormalizeAndPermuteCls(canvas);
    input_values_handler.assign(chw.begin(), chw.end());

    return Ort::Value::CreateTensor<float>(
        *memory_info_handler,
        input_values_handler.data(), input_values_handler.size(),
        input_node_dims.data(), input_node_dims.size());
}
|
|
|
|
|
|
|
|
|
|
// Unused BasicOrtHandler hook: the recognizer handles batching itself
// (dynamic widths), so this simply delegates the first image to
// transform(), or returns a null tensor for an empty input.
Ort::Value ONNXOCRRecognizer::transformBatch(const std::vector<cv::Mat>& images) {
    if (images.empty()) {
        return Ort::Value(nullptr);
    }
    return transform(images.front());
}
|
|
|
|
|
|
2026-04-14 20:30:21 +10:00
|
|
|
// ----------------------------------------------------------------------------
|
|
|
|
|
// Width buckets — every recognizer input is padded up to one of these widths
|
|
|
|
|
// before reaching ORT. This bounds the number of distinct shapes cuDNN ever
|
|
|
|
|
// sees to four, so its HEURISTIC algorithm cache hits on every subsequent
|
|
|
|
|
// call instead of re-tuning per plate. Buckets cover the realistic range:
|
|
|
|
|
// 320 px → short Latin/Japanese plates (most common)
|
|
|
|
|
// 480 px → wider Latin plates with two rows of text
|
|
|
|
|
// 640 px → long single-row plates / multi-line stacked text
|
|
|
|
|
// 960 px → safety upper bound (== kRecImgMaxW)
|
|
|
|
|
// ----------------------------------------------------------------------------
|
|
|
|
|
static constexpr int kRecBucketWidths[] = { 320, 480, 640, 960 };
// Number of entries in kRecBucketWidths (stays in sync automatically).
static constexpr int kRecNumBuckets = sizeof(kRecBucketWidths) / sizeof(kRecBucketWidths[0]);
|
|
|
|
|
|
|
|
|
|
int ONNXOCRRecognizer::RoundUpToBucket(int resizedW) const {
|
|
|
|
|
const int capped = std::min(resizedW, imgMaxW_);
|
|
|
|
|
for (int b = 0; b < kRecNumBuckets; ++b) {
|
|
|
|
|
if (kRecBucketWidths[b] >= capped) return kRecBucketWidths[b];
|
|
|
|
|
}
|
|
|
|
|
return imgMaxW_;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Resize + normalize a single crop into a CHW float vector at width
|
|
|
|
|
// `bucketW`, padding with zeros on the right when needed. The returned
|
|
|
|
|
// vector has exactly 3*imgH_*bucketW elements.
|
|
|
|
|
static std::vector<float> PreprocessCropToBucket(const cv::Mat& crop,
|
|
|
|
|
int imgH, int bucketW) {
|
|
|
|
|
cv::Mat resized = ResizeRecImage(crop, imgH, bucketW);
|
|
|
|
|
int resizedW = resized.cols;
|
|
|
|
|
resized.convertTo(resized, CV_32FC3);
|
|
|
|
|
auto normalizedData = NormalizeAndPermuteCls(resized);
|
|
|
|
|
|
|
|
|
|
if (resizedW == bucketW) {
|
|
|
|
|
return normalizedData;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Zero-pad on the right (CHW layout)
|
|
|
|
|
std::vector<float> padded(3 * imgH * bucketW, 0.0f);
|
|
|
|
|
for (int c = 0; c < 3; c++) {
|
|
|
|
|
for (int y = 0; y < imgH; y++) {
|
|
|
|
|
std::memcpy(
|
|
|
|
|
&padded[c * imgH * bucketW + y * bucketW],
|
|
|
|
|
&normalizedData[c * imgH * resizedW + y * resizedW],
|
|
|
|
|
resizedW * sizeof(float));
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
return padded;
|
|
|
|
|
}
|
|
|
|
|
|
2026-03-28 16:54:11 +11:00
|
|
|
// Run recognition on a single pre-cropped text image.
// Serialized on _mutex, so concurrent callers are safe. Returns a
// default-constructed TextLine when the session, input image, or
// dictionary is unavailable, or when ORT throws.
TextLine ONNXOCRRecognizer::Recognize(const cv::Mat& croppedImage) {
    std::lock_guard<std::mutex> lock(_mutex);

    if (!ort_session || croppedImage.empty() || keys_.empty()) {
        return {};
    }

    try {
        // Step 1: aspect-preserving resize to height=imgH_, width capped
        // at imgMaxW_. Then round resized width up to the next bucket.
        cv::Mat resized = ResizeRecImage(croppedImage, imgH_, imgMaxW_);
        const int bucketW = RoundUpToBucket(resized.cols);

        // Step 2: normalize the ORIGINAL crop straight into the bucket
        // width (PreprocessCropToBucket re-runs the resize internally and
        // zero-pads on the right up to bucketW).
        std::vector<float> inputData = PreprocessCropToBucket(croppedImage, imgH_, bucketW);

        // Step 3: wrap the host buffer as a [1,3,H,W] tensor. inputData
        // must outlive Run() — CreateTensor does not copy the buffer.
        std::array<int64_t, 4> inputShape = { 1, 3, imgH_, bucketW };
        Ort::Value inputTensor = Ort::Value::CreateTensor<float>(
            *memory_info_handler, inputData.data(), inputData.size(),
            inputShape.data(), inputShape.size());

        auto outputTensors = ort_session->Run(
            Ort::RunOptions{ nullptr },
            input_node_names.data(), &inputTensor, 1,
            output_node_names.data(), num_outputs);

        // Output is read as [batch, seqLen, numClasses] CTC probabilities.
        float* outputData = outputTensors[0].GetTensorMutableData<float>();
        auto outputShape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape();

        int seqLen = static_cast<int>(outputShape[1]);
        int numClasses = static_cast<int>(outputShape[2]);

        return CTCDecode(outputData, seqLen, numClasses);
    }
    catch (const Ort::Exception& e) {
        std::cerr << "[ONNXOCRRecognizer] Inference failed: " << e.what() << std::endl;
        return {};
    }
}
|
|
|
|
|
|
2026-04-14 20:30:21 +10:00
|
|
|
// Run one batched inference over `crops`, all padded to the same bucket
// width, writing each decoded TextLine into out[origIndices[i]].
// Caller must already hold _mutex and must have sized `out` so every
// entry of origIndices is a valid index. If the batched Run() throws
// (e.g. the engine rejects a batch dimension > 1), each crop is retried
// individually so one bad batch cannot lose the whole group.
void ONNXOCRRecognizer::RunBatchAtWidth(const std::vector<cv::Mat>& crops,
                                        const std::vector<size_t>& origIndices,
                                        int bucketW,
                                        std::vector<TextLine>& out) {
    if (crops.empty()) return;

    try {
        const size_t batchN = crops.size();
        // Element count of one [3, imgH_, bucketW] image.
        const size_t perImage = static_cast<size_t>(3) * imgH_ * bucketW;

        // Stack N preprocessed crops into one [N,3,H,W] buffer
        std::vector<float> batchInput(batchN * perImage, 0.0f);
        for (size_t i = 0; i < batchN; ++i) {
            auto img = PreprocessCropToBucket(crops[i], imgH_, bucketW);
            std::memcpy(&batchInput[i * perImage], img.data(),
                        perImage * sizeof(float));
        }

        std::array<int64_t, 4> inputShape = {
            static_cast<int64_t>(batchN), 3,
            static_cast<int64_t>(imgH_),
            static_cast<int64_t>(bucketW)
        };
        // batchInput must stay alive until Run() returns — CreateTensor
        // wraps the buffer without copying it.
        Ort::Value inputTensor = Ort::Value::CreateTensor<float>(
            *memory_info_handler, batchInput.data(), batchInput.size(),
            inputShape.data(), inputShape.size());

        auto outputTensors = ort_session->Run(
            Ort::RunOptions{ nullptr },
            input_node_names.data(), &inputTensor, 1,
            output_node_names.data(), num_outputs);

        float* outputData = outputTensors[0].GetTensorMutableData<float>();
        auto outputShape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape();

        // Expected output: [N, seqLen, numClasses]
        if (outputShape.size() < 3) {
            std::cerr << "[ONNXOCRRecognizer] Unexpected batch output rank: "
                      << outputShape.size() << std::endl;
            return;
        }

        const int outBatch = static_cast<int>(outputShape[0]);
        const int seqLen = static_cast<int>(outputShape[1]);
        const int numClasses = static_cast<int>(outputShape[2]);
        // Stride (in floats) between consecutive images in the output.
        const size_t perRow = static_cast<size_t>(seqLen) * numClasses;

        // Guard on outBatch in case the model returns fewer rows than
        // requested; decode each row back into its original slot.
        for (int i = 0; i < outBatch && i < static_cast<int>(batchN); ++i) {
            TextLine tl = CTCDecode(outputData + i * perRow, seqLen, numClasses);
            out[origIndices[i]] = std::move(tl);
        }
    }
    catch (const Ort::Exception& e) {
        // ORT will throw if the model doesn't support a batch dimension > 1.
        // Fall back to per-image inference for this group.
        std::cerr << "[ONNXOCRRecognizer] Batch inference failed at bucketW="
                  << bucketW << " (" << e.what()
                  << ") — falling back to single-image path." << std::endl;
        for (size_t i = 0; i < crops.size(); ++i) {
            // Direct call (we already hold _mutex via the public RecognizeBatch
            // wrapper). Replicate the single-image preprocessing here to avoid
            // re-entering Recognize() and double-locking the mutex.
            try {
                cv::Mat resized = ResizeRecImage(crops[i], imgH_, imgMaxW_);
                int singleBucket = RoundUpToBucket(resized.cols);
                auto inputData = PreprocessCropToBucket(crops[i], imgH_, singleBucket);
                std::array<int64_t, 4> inputShape = { 1, 3, imgH_, singleBucket };
                Ort::Value inputTensor = Ort::Value::CreateTensor<float>(
                    *memory_info_handler, inputData.data(), inputData.size(),
                    inputShape.data(), inputShape.size());
                auto outputTensors = ort_session->Run(
                    Ort::RunOptions{ nullptr },
                    input_node_names.data(), &inputTensor, 1,
                    output_node_names.data(), num_outputs);
                float* outData = outputTensors[0].GetTensorMutableData<float>();
                auto outShape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape();
                int seqLen = static_cast<int>(outShape[1]);
                int numClasses = static_cast<int>(outShape[2]);
                out[origIndices[i]] = CTCDecode(outData, seqLen, numClasses);
            } catch (const Ort::Exception& e2) {
                // Leave a default TextLine so the caller still gets an entry.
                std::cerr << "[ONNXOCRRecognizer] Single-image fallback also failed: "
                          << e2.what() << std::endl;
                out[origIndices[i]] = {};
            }
        }
    }
}
|
|
|
|
|
|
2026-03-28 16:54:11 +11:00
|
|
|
// Batch entry point: recognize every crop in `croppedImages`, returning
// one TextLine per input (empty or failed crops keep default TextLines
// in their slots). Crops are grouped by bucket width so each ORT call
// sees one of the few fixed input shapes.
std::vector<TextLine> ONNXOCRRecognizer::RecognizeBatch(const std::vector<cv::Mat>& croppedImages) {
    std::lock_guard<std::mutex> lock(_mutex);

    std::vector<TextLine> results(croppedImages.size());
    if (!ort_session || croppedImages.empty() || keys_.empty()) {
        return results;
    }

    // Group crops by their target bucket width
    std::vector<std::vector<cv::Mat>> groupCrops(kRecNumBuckets);
    std::vector<std::vector<size_t>> groupIdx(kRecNumBuckets);

    for (size_t i = 0; i < croppedImages.size(); ++i) {
        if (croppedImages[i].empty()) continue;
        // Resize only to learn this crop's target width; the real
        // preprocessing happens later in RunBatchAtWidth.
        cv::Mat resized = ResizeRecImage(croppedImages[i], imgH_, imgMaxW_);
        const int bw = RoundUpToBucket(resized.cols);
        // Find bucket index
        int bucketIdx = kRecNumBuckets - 1;
        for (int b = 0; b < kRecNumBuckets; ++b) {
            if (kRecBucketWidths[b] == bw) { bucketIdx = b; break; }
        }
        groupCrops[bucketIdx].push_back(croppedImages[i]);
        groupIdx[bucketIdx].push_back(i);
    }

    // Run batched inference per non-empty bucket, slicing each bucket
    // group into chunks of at most kRecMaxBatch crops so we never exceed
    // the TRT dynamic profile's max-batch dimension. On a busy scene with
    // (say) 30 plates all falling in bucket 320, we issue two back-to-back
    // batched calls of 24 + 6 instead of one oversized call that would
    // throw "does not satisfy any optimization profiles" and fall off
    // the fast path to the per-image fallback.
    for (int b = 0; b < kRecNumBuckets; ++b) {
        const auto& bucketCrops = groupCrops[b];
        const auto& bucketIndices = groupIdx[b];
        if (bucketCrops.empty()) continue;

        const int bucketW = kRecBucketWidths[b];
        const size_t total = bucketCrops.size();

        for (size_t start = 0; start < total; start += kRecMaxBatch) {
            const size_t end = std::min(start + static_cast<size_t>(kRecMaxBatch), total);
            std::vector<cv::Mat> chunkCrops(bucketCrops.begin() + start,
                                            bucketCrops.begin() + end);
            std::vector<size_t> chunkIdx(bucketIndices.begin() + start,
                                         bucketIndices.begin() + end);
            RunBatchAtWidth(chunkCrops, chunkIdx, bucketW, results);
        }
    }

    return results;
}
|
|
|
|
|
|
2026-04-14 20:30:21 +10:00
|
|
|
void ONNXOCRRecognizer::Warmup() {
|
|
|
|
|
std::lock_guard<std::mutex> lock(_mutex);
|
|
|
|
|
if (_warmedUp || !ort_session) return;
|
|
|
|
|
|
|
|
|
|
// Dummy 3-channel image, mid-grey, large enough to resize to imgH_
|
|
|
|
|
cv::Mat dummy(imgH_ * 2, kRecBucketWidths[kRecNumBuckets - 1] * 2,
|
|
|
|
|
CV_8UC3, cv::Scalar(128, 128, 128));
|
|
|
|
|
|
|
|
|
|
for (int b = 0; b < kRecNumBuckets; ++b) {
|
|
|
|
|
const int bucketW = kRecBucketWidths[b];
|
|
|
|
|
try {
|
|
|
|
|
auto inputData = PreprocessCropToBucket(dummy, imgH_, bucketW);
|
|
|
|
|
std::array<int64_t, 4> inputShape = { 1, 3, imgH_, bucketW };
|
|
|
|
|
Ort::Value inputTensor = Ort::Value::CreateTensor<float>(
|
|
|
|
|
*memory_info_handler, inputData.data(), inputData.size(),
|
|
|
|
|
inputShape.data(), inputShape.size());
|
|
|
|
|
|
|
|
|
|
auto t0 = std::chrono::high_resolution_clock::now();
|
|
|
|
|
(void)ort_session->Run(
|
|
|
|
|
Ort::RunOptions{ nullptr },
|
|
|
|
|
input_node_names.data(), &inputTensor, 1,
|
|
|
|
|
output_node_names.data(), num_outputs);
|
|
|
|
|
auto t1 = std::chrono::high_resolution_clock::now();
|
|
|
|
|
double ms = std::chrono::duration<double, std::milli>(t1 - t0).count();
|
|
|
|
|
std::cout << "[ONNXOCRRecognizer] Warmup bucketW=" << bucketW
|
|
|
|
|
<< " " << ms << " ms" << std::endl;
|
|
|
|
|
}
|
|
|
|
|
catch (const Ort::Exception& e) {
|
|
|
|
|
std::cerr << "[ONNXOCRRecognizer] Warmup failed at bucketW="
|
|
|
|
|
<< bucketW << ": " << e.what() << std::endl;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
_warmedUp = true;
|
|
|
|
|
}
|
|
|
|
|
|
2026-03-28 16:54:11 +11:00
|
|
|
// Greedy CTC decoding of a [seqLen, numClasses] probability matrix.
// Index 0 is the CTC blank; a character is emitted whenever the
// per-timestep argmax is non-blank, differs from the previous timestep's
// argmax (collapsing repeats), and maps to a dictionary entry. The line
// score is the mean of the emitted characters' raw argmax values (the
// model output already includes softmax).
TextLine ONNXOCRRecognizer::CTCDecode(const float* outputData, int seqLen, int numClasses) {
    TextLine result;
    std::string text;
    std::vector<float> scores;

    // Degenerate class axis: nothing can be decoded.
    if (numClasses <= 0) {
        result.text = text;
        return result;
    }

    int lastIndex = 0; // CTC blank is index 0

    for (int t = 0; t < seqLen; t++) {
        const float* timeStep = outputData + t * numClasses;

        // Argmax over classes for this timestep (first max on ties).
        const float* best = std::max_element(timeStep, timeStep + numClasses);
        const int maxIndex = static_cast<int>(best - timeStep);
        const float maxValue = *best;

        // Emit only non-blank indices that break a run of repeats and
        // fall inside the dictionary.
        if (maxIndex != 0 && maxIndex != lastIndex &&
            maxIndex < static_cast<int>(keys_.size())) {
            text += keys_[maxIndex]; // keys_[0]="#"(blank), keys_[1]=first_char, etc.
            scores.push_back(maxValue);
        }
        lastIndex = maxIndex;
    }

    result.text = text;
    if (!scores.empty()) {
        result.score = std::accumulate(scores.begin(), scores.end(), 0.0f) /
                       static_cast<float>(scores.size());
    }
    return result;
}
|
|
|
|
|
|
|
|
|
|
} // namespace onnxocr
|
|
|
|
|
} // namespace ANSCENTER
|