Improve ALPR_OCR performance

2026-04-14 20:30:21 +10:00
parent 3349b45ade
commit f9a0af8949
18 changed files with 991 additions and 77 deletions
--- a/modules/ANSOCR/ANSONNXOCR/ONNXOCRClassifier.cpp
+++ b/modules/ANSOCR/ANSONNXOCR/ONNXOCRClassifier.cpp
@@ -4,6 +4,7 @@
 #include <iostream>
 #include <algorithm>
 #include <cmath>
+#include <chrono>

 namespace ANSCENTER {
 namespace onnxocr {
@@ -12,6 +13,12 @@ ONNXOCRClassifier::ONNXOCRClassifier(const std::string& onnx_path, unsigned int
    : BasicOrtHandler(onnx_path, num_threads) {
 }

+ONNXOCRClassifier::ONNXOCRClassifier(const std::string& onnx_path,
+                                     const OrtHandlerOptions& options,
+                                     unsigned int num_threads)
+    : BasicOrtHandler(onnx_path, options, num_threads) {
+}
+
 Ort::Value ONNXOCRClassifier::transform(const cv::Mat& mat) {
    cv::Mat resized;
    // Direct resize to 80x160 (PP-LCNet_x1_0_textline_ori)
@@ -103,5 +110,38 @@ void ONNXOCRClassifier::Classify(std::vector<cv::Mat>& img_list,
    }
 }

+void ONNXOCRClassifier::Warmup() {
+    std::lock_guard<std::mutex> lock(_mutex);
+    if (_warmedUp || !ort_session) return;
+
+    try {
+        cv::Mat dummy(kClsImageH * 2, kClsImageW * 2, CV_8UC3, cv::Scalar(128, 128, 128));
+        cv::Mat resized;
+        cv::resize(dummy, resized, cv::Size(kClsImageW, kClsImageH));
+        resized.convertTo(resized, CV_32FC3);
+        auto inputData = NormalizeAndPermute(resized);
+
+        std::array<int64_t, 4> inputShape = { 1, 3, kClsImageH, kClsImageW };
+        Ort::Value inputTensor = Ort::Value::CreateTensor<float>(
+            *memory_info_handler, inputData.data(), inputData.size(),
+            inputShape.data(), inputShape.size());
+
+        auto t0 = std::chrono::high_resolution_clock::now();
+        (void)ort_session->Run(
+            Ort::RunOptions{ nullptr },
+            input_node_names.data(), &inputTensor, 1,
+            output_node_names.data(), num_outputs);
+        auto t1 = std::chrono::high_resolution_clock::now();
+        double ms = std::chrono::duration<double, std::milli>(t1 - t0).count();
+        std::cout << "[ONNXOCRClassifier] Warmup [1,3,"
+                  << kClsImageH << "," << kClsImageW << "] "
+                  << ms << " ms" << std::endl;
+    }
+    catch (const Ort::Exception& e) {
+        std::cerr << "[ONNXOCRClassifier] Warmup failed: " << e.what() << std::endl;
+    }
+    _warmedUp = true;
+}
+
 } // namespace onnxocr
 } // namespace ANSCENTER
--- a/modules/ANSOCR/ANSONNXOCR/ONNXOCRClassifier.h
+++ b/modules/ANSOCR/ANSONNXOCR/ONNXOCRClassifier.h
@@ -11,6 +11,9 @@ namespace onnxocr {
 class ONNXOCRClassifier : public BasicOrtHandler {
 public:
    explicit ONNXOCRClassifier(const std::string& onnx_path, unsigned int num_threads = 1);
+    explicit ONNXOCRClassifier(const std::string& onnx_path,
+                               const OrtHandlerOptions& options,
+                               unsigned int num_threads = 1);
    ~ONNXOCRClassifier() override = default;

    // Classify text orientation for a list of cropped images
@@ -21,7 +24,12 @@ public:
                  std::vector<float>& cls_scores,
                  float cls_thresh = kClsThresh);

+    // Pre-warm cuDNN/TRT for the classifier's fixed [1,3,80,160] shape.
+    // Idempotent — no-op after the first call.
+    void Warmup();
+
 private:
+    bool _warmedUp = false;
    Ort::Value transform(const cv::Mat& mat) override;
    Ort::Value transformBatch(const std::vector<cv::Mat>& images) override;

--- a/modules/ANSOCR/ANSONNXOCR/ONNXOCRDetector.cpp
+++ b/modules/ANSOCR/ANSONNXOCR/ONNXOCRDetector.cpp
@@ -7,6 +7,7 @@
 #include <iostream>
 #include <algorithm>
 #include <cmath>
+#include <chrono>

 namespace ANSCENTER {
 namespace onnxocr {
@@ -15,6 +16,12 @@ ONNXOCRDetector::ONNXOCRDetector(const std::string& onnx_path, unsigned int num_
    : BasicOrtHandler(onnx_path, num_threads) {
 }

+ONNXOCRDetector::ONNXOCRDetector(const std::string& onnx_path,
+                                 const OrtHandlerOptions& options,
+                                 unsigned int num_threads)
+    : BasicOrtHandler(onnx_path, options, num_threads) {
+}
+
 Ort::Value ONNXOCRDetector::transform(const cv::Mat& mat) {
    // Not used directly - detection uses custom Preprocess + manual tensor creation
    // Provided to satisfy BasicOrtHandler pure virtual
@@ -308,5 +315,41 @@ std::vector<cv::Point2f> ONNXOCRDetector::UnclipPolygon(const std::array<cv::Poi
    return result;
 }

+void ONNXOCRDetector::Warmup() {
+    std::lock_guard<std::mutex> lock(_mutex);
+    if (_warmedUp || !ort_session) return;
+
+    // 320x320 covers the typical license-plate ROI after LPD crop +
+    // multiple-of-32 rounding. cuDNN caches the algorithm for this
+    // shape so the first real inference doesn't pay the picker cost.
+    constexpr int kWarmupSide = 320;
+    try {
+        cv::Mat dummy(kWarmupSide, kWarmupSide, CV_8UC3, cv::Scalar(128, 128, 128));
+        cv::Mat dummyF;
+        dummy.convertTo(dummyF, CV_32FC3);
+        auto inputData = NormalizeAndPermute(dummyF);
+
+        std::array<int64_t, 4> inputShape = { 1, 3, kWarmupSide, kWarmupSide };
+        Ort::Value inputTensor = Ort::Value::CreateTensor<float>(
+            *memory_info_handler, inputData.data(), inputData.size(),
+            inputShape.data(), inputShape.size());
+
+        auto t0 = std::chrono::high_resolution_clock::now();
+        (void)ort_session->Run(
+            Ort::RunOptions{ nullptr },
+            input_node_names.data(), &inputTensor, 1,
+            output_node_names.data(), num_outputs);
+        auto t1 = std::chrono::high_resolution_clock::now();
+        double ms = std::chrono::duration<double, std::milli>(t1 - t0).count();
+        std::cout << "[ONNXOCRDetector] Warmup [1,3,"
+                  << kWarmupSide << "," << kWarmupSide << "] "
+                  << ms << " ms" << std::endl;
+    }
+    catch (const Ort::Exception& e) {
+        std::cerr << "[ONNXOCRDetector] Warmup failed: " << e.what() << std::endl;
+    }
+    _warmedUp = true;
+}
+
 } // namespace onnxocr
 } // namespace ANSCENTER
--- a/modules/ANSOCR/ANSONNXOCR/ONNXOCRDetector.h
+++ b/modules/ANSOCR/ANSONNXOCR/ONNXOCRDetector.h
@@ -11,6 +11,9 @@ namespace onnxocr {
 class ONNXOCRDetector : public BasicOrtHandler {
 public:
    explicit ONNXOCRDetector(const std::string& onnx_path, unsigned int num_threads = 1);
+    explicit ONNXOCRDetector(const std::string& onnx_path,
+                             const OrtHandlerOptions& options,
+                             unsigned int num_threads = 1);
    ~ONNXOCRDetector() override = default;

    // Run text detection on an image
@@ -21,7 +24,12 @@ public:
                                float unclipRatio = kDetUnclipRatio,
                                bool useDilation  = false);

+    // Pre-warm cuDNN/TRT at a canonical 320x320 input so the first real
+    // call doesn't pay the algorithm-selection tax. Idempotent.
+    void Warmup();
+
 private:
+    bool _warmedUp = false;
    Ort::Value transform(const cv::Mat& mat) override;
    Ort::Value transformBatch(const std::vector<cv::Mat>& images) override;

--- a/modules/ANSOCR/ANSONNXOCR/ONNXOCRRecognizer.cpp
+++ b/modules/ANSOCR/ANSONNXOCR/ONNXOCRRecognizer.cpp
@@ -7,6 +7,7 @@
 #include <cmath>
 #include <cfloat>
 #include <cstring>
+#include <chrono>

 namespace ANSCENTER {
 namespace onnxocr {
@@ -15,6 +16,12 @@ ONNXOCRRecognizer::ONNXOCRRecognizer(const std::string& onnx_path, unsigned int
    : BasicOrtHandler(onnx_path, num_threads) {
 }

+ONNXOCRRecognizer::ONNXOCRRecognizer(const std::string& onnx_path,
+                                     const OrtHandlerOptions& options,
+                                     unsigned int num_threads)
+    : BasicOrtHandler(onnx_path, options, num_threads) {
+}
+
 bool ONNXOCRRecognizer::LoadDictionary(const std::string& dictPath) {
    keys_ = LoadDict(dictPath);
    if (keys_.size() < 2) {
@@ -46,6 +53,54 @@ Ort::Value ONNXOCRRecognizer::transformBatch(const std::vector<cv::Mat>& images)
    return Ort::Value(nullptr);
 }

+// ----------------------------------------------------------------------------
+// Width buckets — every recognizer input is padded up to one of these widths
+// before reaching ORT. This bounds the number of distinct input widths cuDNN
+// ever sees to four (per batch size), so its HEURISTIC algorithm cache hits on
+// later calls instead of re-tuning per plate. Buckets cover the realistic range:
+//   320 px  → short Latin/Japanese plates (most common)
+//   480 px  → wider Latin plates with two rows of text
+//   640 px  → long single-row plates / multi-line stacked text
+//   960 px  → safety upper bound (== kRecImgMaxW)
+// ----------------------------------------------------------------------------
+static constexpr int kRecBucketWidths[] = { 320, 480, 640, 960 };
+static constexpr int kRecNumBuckets = sizeof(kRecBucketWidths) / sizeof(kRecBucketWidths[0]);
+
+int ONNXOCRRecognizer::RoundUpToBucket(int resizedW) const {
+    const int capped = std::min(resizedW, imgMaxW_);
+    for (int b = 0; b < kRecNumBuckets; ++b) {
+        if (kRecBucketWidths[b] >= capped) return kRecBucketWidths[b];
+    }
+    return imgMaxW_;
+}
+
+// Resize + normalize a single crop into a CHW float vector at width
+// `bucketW`, padding with zeros on the right when needed. The returned
+// vector has exactly 3*imgH*bucketW elements.
+static std::vector<float> PreprocessCropToBucket(const cv::Mat& crop,
+                                                 int imgH, int bucketW) {
+    cv::Mat resized = ResizeRecImage(crop, imgH, bucketW);
+    int resizedW = resized.cols;
+    resized.convertTo(resized, CV_32FC3);
+    auto normalizedData = NormalizeAndPermuteCls(resized);
+
+    if (resizedW == bucketW) {
+        return normalizedData;
+    }
+
+    // Zero-pad on the right (CHW layout)
+    std::vector<float> padded(3 * imgH * bucketW, 0.0f);
+    for (int c = 0; c < 3; c++) {
+        for (int y = 0; y < imgH; y++) {
+            std::memcpy(
+                &padded[c * imgH * bucketW + y * bucketW],
+                &normalizedData[c * imgH * resizedW + y * resizedW],
+                resizedW * sizeof(float));
+        }
+    }
+    return padded;
+}
+
 TextLine ONNXOCRRecognizer::Recognize(const cv::Mat& croppedImage) {
    std::lock_guard<std::mutex> lock(_mutex);

@@ -54,52 +109,27 @@ TextLine ONNXOCRRecognizer::Recognize(const cv::Mat& croppedImage) {
    }

    try {
-        // Preprocess: resize to fixed height, proportional width
+        // Step 1: aspect-preserving resize to height=imgH_, width capped
+        // at imgMaxW_. Then round resized width up to the next bucket.
        cv::Mat resized = ResizeRecImage(croppedImage, imgH_, imgMaxW_);
-        int resizedW = resized.cols;
+        const int bucketW = RoundUpToBucket(resized.cols);

-        resized.convertTo(resized, CV_32FC3);
-        // Recognition uses (pixel/255 - 0.5) / 0.5 normalization (same as classifier)
-        auto normalizedData = NormalizeAndPermuteCls(resized);
+        std::vector<float> inputData = PreprocessCropToBucket(croppedImage, imgH_, bucketW);

-        // Pad to at least kRecImgW width (matching official PaddleOCR behavior)
-        // Official PaddleOCR: padding_im = np.zeros((C, H, W)), then copies normalized
-        // image into left portion. Padding value = 0.0 in normalized space.
-        int imgW = std::max(resizedW, kRecImgW);
-
-        std::vector<float> inputData;
-        if (imgW > resizedW) {
-            // Zero-pad on the right (CHW layout)
-            inputData.resize(3 * imgH_ * imgW, 0.0f);
-            for (int c = 0; c < 3; c++) {
-                for (int y = 0; y < imgH_; y++) {
-                    std::memcpy(
-                        &inputData[c * imgH_ * imgW + y * imgW],
-                        &normalizedData[c * imgH_ * resizedW + y * resizedW],
-                        resizedW * sizeof(float));
-                }
-            }
-        } else {
-            inputData = std::move(normalizedData);
-        }
-
-        // Create input tensor with (possibly padded) width
-        std::array<int64_t, 4> inputShape = { 1, 3, imgH_, imgW };
+        std::array<int64_t, 4> inputShape = { 1, 3, imgH_, bucketW };
        Ort::Value inputTensor = Ort::Value::CreateTensor<float>(
            *memory_info_handler, inputData.data(), inputData.size(),
            inputShape.data(), inputShape.size());

-        // Run inference
        auto outputTensors = ort_session->Run(
            Ort::RunOptions{ nullptr },
            input_node_names.data(), &inputTensor, 1,
            output_node_names.data(), num_outputs);

-        // Get output
        float* outputData = outputTensors[0].GetTensorMutableData<float>();
        auto outputShape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape();

-        int seqLen = static_cast<int>(outputShape[1]);
+        int seqLen     = static_cast<int>(outputShape[1]);
        int numClasses = static_cast<int>(outputShape[2]);

        return CTCDecode(outputData, seqLen, numClasses);
@@ -110,18 +140,162 @@ TextLine ONNXOCRRecognizer::Recognize(const cv::Mat& croppedImage) {
    }
 }

-std::vector<TextLine> ONNXOCRRecognizer::RecognizeBatch(const std::vector<cv::Mat>& croppedImages) {
-    std::vector<TextLine> results;
-    results.reserve(croppedImages.size());
+void ONNXOCRRecognizer::RunBatchAtWidth(const std::vector<cv::Mat>& crops,
+                                        const std::vector<size_t>& origIndices,
+                                        int bucketW,
+                                        std::vector<TextLine>& out) {
+    if (crops.empty()) return;

-    // Process one at a time (dynamic width per image)
-    for (size_t i = 0; i < croppedImages.size(); i++) {
-        results.push_back(Recognize(croppedImages[i]));
+    try {
+        const size_t batchN = crops.size();
+        const size_t perImage = static_cast<size_t>(3) * imgH_ * bucketW;
+
+        // Stack N preprocessed crops into one [N,3,H,W] buffer
+        std::vector<float> batchInput(batchN * perImage, 0.0f);
+        for (size_t i = 0; i < batchN; ++i) {
+            auto img = PreprocessCropToBucket(crops[i], imgH_, bucketW);
+            std::memcpy(&batchInput[i * perImage], img.data(),
+                        perImage * sizeof(float));
+        }
+
+        std::array<int64_t, 4> inputShape = {
+            static_cast<int64_t>(batchN), 3,
+            static_cast<int64_t>(imgH_),
+            static_cast<int64_t>(bucketW)
+        };
+        Ort::Value inputTensor = Ort::Value::CreateTensor<float>(
+            *memory_info_handler, batchInput.data(), batchInput.size(),
+            inputShape.data(), inputShape.size());
+
+        auto outputTensors = ort_session->Run(
+            Ort::RunOptions{ nullptr },
+            input_node_names.data(), &inputTensor, 1,
+            output_node_names.data(), num_outputs);
+
+        float* outputData = outputTensors[0].GetTensorMutableData<float>();
+        auto outputShape  = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape();
+
+        // Expected output: [N, seqLen, numClasses]
+        if (outputShape.size() < 3) {
+            std::cerr << "[ONNXOCRRecognizer] Unexpected batch output rank: "
+                      << outputShape.size() << std::endl;
+            return;
+        }
+        const int outBatch   = static_cast<int>(outputShape[0]);
+        const int seqLen     = static_cast<int>(outputShape[1]);
+        const int numClasses = static_cast<int>(outputShape[2]);
+        const size_t perRow  = static_cast<size_t>(seqLen) * numClasses;
+
+        for (int i = 0; i < outBatch && i < static_cast<int>(batchN); ++i) {
+            TextLine tl = CTCDecode(outputData + i * perRow, seqLen, numClasses);
+            out[origIndices[i]] = std::move(tl);
+        }
+    }
+    catch (const Ort::Exception& e) {
+        // ORT will throw if the model doesn't support a batch dimension > 1.
+        // Fall back to per-image inference for this group.
+        std::cerr << "[ONNXOCRRecognizer] Batch inference failed at bucketW="
+                  << bucketW << " (" << e.what()
+                  << ") — falling back to single-image path." << std::endl;
+        for (size_t i = 0; i < crops.size(); ++i) {
+            // Direct call (we already hold _mutex via the public RecognizeBatch
+            // wrapper). Replicate the single-image preprocessing here to avoid
+            // re-entering Recognize() and double-locking the mutex.
+            try {
+                cv::Mat resized = ResizeRecImage(crops[i], imgH_, imgMaxW_);
+                int singleBucket = RoundUpToBucket(resized.cols);
+                auto inputData = PreprocessCropToBucket(crops[i], imgH_, singleBucket);
+                std::array<int64_t, 4> inputShape = { 1, 3, imgH_, singleBucket };
+                Ort::Value inputTensor = Ort::Value::CreateTensor<float>(
+                    *memory_info_handler, inputData.data(), inputData.size(),
+                    inputShape.data(), inputShape.size());
+                auto outputTensors = ort_session->Run(
+                    Ort::RunOptions{ nullptr },
+                    input_node_names.data(), &inputTensor, 1,
+                    output_node_names.data(), num_outputs);
+                float* outData = outputTensors[0].GetTensorMutableData<float>();
+                auto outShape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape();
+                int seqLen = static_cast<int>(outShape[1]);
+                int numClasses = static_cast<int>(outShape[2]);
+                out[origIndices[i]] = CTCDecode(outData, seqLen, numClasses);
+            } catch (const Ort::Exception& e2) {
+                std::cerr << "[ONNXOCRRecognizer] Single-image fallback also failed: "
+                          << e2.what() << std::endl;
+                out[origIndices[i]] = {};
+            }
+        }
+    }
+}
+
+std::vector<TextLine> ONNXOCRRecognizer::RecognizeBatch(const std::vector<cv::Mat>& croppedImages) {
+    std::lock_guard<std::mutex> lock(_mutex);
+
+    std::vector<TextLine> results(croppedImages.size());
+    if (!ort_session || croppedImages.empty() || keys_.empty()) {
+        return results;
+    }
+
+    // Group crops by their target bucket width
+    std::vector<std::vector<cv::Mat>> groupCrops(kRecNumBuckets);
+    std::vector<std::vector<size_t>>  groupIdx(kRecNumBuckets);
+
+    for (size_t i = 0; i < croppedImages.size(); ++i) {
+        if (croppedImages[i].empty()) continue;
+        cv::Mat resized = ResizeRecImage(croppedImages[i], imgH_, imgMaxW_);
+        const int bw = RoundUpToBucket(resized.cols);
+        // Find bucket index
+        int bucketIdx = kRecNumBuckets - 1;
+        for (int b = 0; b < kRecNumBuckets; ++b) {
+            if (kRecBucketWidths[b] == bw) { bucketIdx = b; break; }
+        }
+        groupCrops[bucketIdx].push_back(croppedImages[i]);
+        groupIdx[bucketIdx].push_back(i);
+    }
+
+    // Run one batched inference per non-empty bucket
+    for (int b = 0; b < kRecNumBuckets; ++b) {
+        if (groupCrops[b].empty()) continue;
+        RunBatchAtWidth(groupCrops[b], groupIdx[b], kRecBucketWidths[b], results);
    }

    return results;
 }

+void ONNXOCRRecognizer::Warmup() {
+    std::lock_guard<std::mutex> lock(_mutex);
+    if (_warmedUp || !ort_session) return;
+
+    // Dummy 3-channel image, mid-grey, large enough to resize to imgH_
+    cv::Mat dummy(imgH_ * 2, kRecBucketWidths[kRecNumBuckets - 1] * 2,
+                  CV_8UC3, cv::Scalar(128, 128, 128));
+
+    for (int b = 0; b < kRecNumBuckets; ++b) {
+        const int bucketW = kRecBucketWidths[b];
+        try {
+            auto inputData = PreprocessCropToBucket(dummy, imgH_, bucketW);
+            std::array<int64_t, 4> inputShape = { 1, 3, imgH_, bucketW };
+            Ort::Value inputTensor = Ort::Value::CreateTensor<float>(
+                *memory_info_handler, inputData.data(), inputData.size(),
+                inputShape.data(), inputShape.size());
+
+            auto t0 = std::chrono::high_resolution_clock::now();
+            (void)ort_session->Run(
+                Ort::RunOptions{ nullptr },
+                input_node_names.data(), &inputTensor, 1,
+                output_node_names.data(), num_outputs);
+            auto t1 = std::chrono::high_resolution_clock::now();
+            double ms = std::chrono::duration<double, std::milli>(t1 - t0).count();
+            std::cout << "[ONNXOCRRecognizer] Warmup bucketW=" << bucketW
+                      << "  " << ms << " ms" << std::endl;
+        }
+        catch (const Ort::Exception& e) {
+            std::cerr << "[ONNXOCRRecognizer] Warmup failed at bucketW="
+                      << bucketW << ": " << e.what() << std::endl;
+        }
+    }
+    _warmedUp = true;
+}
+
 TextLine ONNXOCRRecognizer::CTCDecode(const float* outputData, int seqLen, int numClasses) {
    TextLine result;
    std::string text;
--- a/modules/ANSOCR/ANSONNXOCR/ONNXOCRRecognizer.h
+++ b/modules/ANSOCR/ANSONNXOCR/ONNXOCRRecognizer.h
@@ -12,6 +12,9 @@ namespace onnxocr {
 class ONNXOCRRecognizer : public BasicOrtHandler {
 public:
    explicit ONNXOCRRecognizer(const std::string& onnx_path, unsigned int num_threads = 1);
+    explicit ONNXOCRRecognizer(const std::string& onnx_path,
+                               const OrtHandlerOptions& options,
+                               unsigned int num_threads = 1);
    ~ONNXOCRRecognizer() override = default;

    // Load character dictionary (must be called before Recognize)
@@ -20,13 +23,31 @@ public:
    // Recognize text from a single cropped text image
    TextLine Recognize(const cv::Mat& croppedImage);

-    // Batch recognition for multiple cropped images
+    // Batch recognition for multiple cropped images.
+    // Crops are grouped into a small set of fixed width buckets and
+    // submitted to ORT as [N,3,imgH_,bucketW] tensors so cuDNN sees
+    // shape-stable inputs and can reuse algorithms across calls.
    std::vector<TextLine> RecognizeBatch(const std::vector<cv::Mat>& croppedImages);

+    // Pre-warm cuDNN/TRT for every bucket width by running dummy
+    // inferences. Idempotent — no-op if already warmed up.
+    void Warmup();
+
 private:
    Ort::Value transform(const cv::Mat& mat) override;
    Ort::Value transformBatch(const std::vector<cv::Mat>& images) override;

+    // Round resizedW up to the next bucket width (capped at imgMaxW_).
+    // Used by both Recognize() and RecognizeBatch() so cuDNN only ever
+    // sees a small finite set of input shapes.
+    int RoundUpToBucket(int resizedW) const;
+
+    // Run a single [N,3,imgH_,bucketW] inference and CTC-decode each row.
+    void RunBatchAtWidth(const std::vector<cv::Mat>& crops,
+                         const std::vector<size_t>& origIndices,
+                         int bucketW,
+                         std::vector<TextLine>& out);
+
    // CTC greedy decode
    TextLine CTCDecode(const float* outputData, int seqLen, int numClasses);

@@ -34,6 +55,7 @@ private:
    int imgH_    = kRecImgH;
    int imgMaxW_ = kRecImgMaxW;
    std::mutex _mutex;
+    bool _warmedUp = false;
 };

 } // namespace onnxocr
--- a/modules/ANSOCR/ANSONNXOCR/ONNXOCRTypes.h
+++ b/modules/ANSOCR/ANSONNXOCR/ONNXOCRTypes.h
@@ -88,11 +88,22 @@ inline std::vector<std::string> LoadDict(const std::string& dictPath) {
    return keys;
 }

-// Compute resize dimensions for detection model (multiples of 32)
+// Compute resize dimensions for detection model.
 // limit_type='max': scale down if max side > maxSideLen (PP-OCRv5 server default)
 // maxSideLimit: safety cap on final max dimension (default 4000)
+//
+// Each dimension is rounded UP to a multiple of kDetSizeBucket (96). The
+// coarse granularity is deliberate: cuDNN HEURISTIC has to re-select
+// convolution algorithms every time it sees a new input shape, and that
+// selection costs ~100 ms per shape. With multiples of 32, a typical ALPR
+// run produces 30+ unique detector shapes; with multiples of 96 that drops
+// to 5–10, which cuDNN can cache and reuse for the rest of the video.
+// 96 is divisible by the DBNet down-stride of 32, so feature-map sizes
+// stay integer.
 inline cv::Size ComputeDetResizeShape(int srcH, int srcW, int maxSideLen,
                                       int maxSideLimit = kDetMaxSideLimit) {
+    constexpr int kDetSizeBucket = 96;
+
    float ratio = 1.0f;
    int maxSide = std::max(srcH, srcW);
    if (maxSide > maxSideLen) {
@@ -108,8 +119,12 @@ inline cv::Size ComputeDetResizeShape(int srcH, int srcW, int maxSideLen,
        newW = static_cast<int>(newW * clampRatio);
    }

-    newH = std::max(32, static_cast<int>(std::round(newH / 32.0) * 32));
-    newW = std::max(32, static_cast<int>(std::round(newW / 32.0) * 32));
+    auto roundUpToBucket = [](int x) {  // round x UP to a multiple of the bucket, never below one bucket
+        const int bucket = kDetSizeBucket;  // by-value read: no odr-use of the local, so no capture is needed
+        return std::max(bucket, ((x + bucket - 1) / bucket) * bucket);
+    };
+    newH = roundUpToBucket(newH);
+    newW = roundUpToBucket(newW);
    return cv::Size(newW, newH);
 }

--- a/modules/ANSOCR/ANSONNXOCR/PaddleOCRV5Engine.cpp
+++ b/modules/ANSOCR/ANSONNXOCR/PaddleOCRV5Engine.cpp
@@ -11,13 +11,75 @@ namespace onnxocr {
 bool PaddleOCRV5Engine::Initialize(const std::string& detModelPath,
                                    const std::string& clsModelPath,
                                    const std::string& recModelPath,
-                                    const std::string& dictPath) {
+                                    const std::string& dictPath,
+                                    bool preferTensorRT) {
    std::lock_guard<std::recursive_mutex> lock(_mutex);
    ModelLoadingGuard mlg(_modelLoading);

+    // High-perf options.  The OCR sub-models split into two groups:
+    //
+    //   1. Detector — its input shape varies continuously with every
+    //      plate-ROI aspect ratio.  TRT EP is a poor fit because it
+    //      builds a fresh engine for each unique shape (minutes each).
+    //      We keep it on CUDA EP with a conservative cuDNN workspace
+    //      and let cuDNN HEURISTIC handle the per-shape algo selection.
+    //
+    //   2. Classifier + Recognizer — fixed-bucket shapes (cls is
+    //      [1,3,80,160], rec is [1,3,48,{320,480,640,960}]).  These
+    //      benefit massively from TRT EP because the engine is built
+    //      once per shape and reused forever.
+    OrtHandlerOptions detectorOpts;
+    // Detector uses CUDA EP with *conservative* cuDNN workspace.
+    // Empirical: on VRAM-constrained GPUs (LPD TRT engine + rec TRT
+    // engine + ORT arena in play) the max-workspace mode causes cuDNN
+    // to pick Winograd/implicit-precomp-GEMM variants that silently
+    // fall back to slow NO-WORKSPACE algorithms when the big workspace
+    // can't be allocated. With max-workspace disabled, cuDNN picks
+    // algorithms that are known to fit and runs ~10x faster in practice.
+    detectorOpts.useMaxCudnnWorkspace = false;
+    detectorOpts.preferTensorRT       = false;   // never TRT for the detector
+
+    // Classifier (fixed [1,3,80,160]): TRT with no profile is fine.
+    OrtHandlerOptions classifierOpts;
+    classifierOpts.useMaxCudnnWorkspace = true;
+    classifierOpts.preferTensorRT       = preferTensorRT;
+    classifierOpts.trtFP16              = true;
+
+    // Recognizer: needs a DYNAMIC profile so one TRT engine covers every
+    // (batch, bucket_width) pair we generate at runtime. Without this,
+    // each new shape triggers a ~80s engine rebuild mid-stream when a
+    // new plate appears or the plate count changes.
+    //
+    // Profile range:
+    //   batch  : 1 .. 16       (16 plates worth of crops is generous)
+    //   H      : 48 (fixed)
+    //   W      : 320 .. 960    (covers all 4 recognizer buckets)
+    //
+    // Query the actual input name from the .onnx file instead of
+    // hardcoding — PaddleOCR usually exports it as "x" but the name can
+    // vary across model versions.
+    OrtHandlerOptions recognizerOpts;
+    recognizerOpts.useMaxCudnnWorkspace = true;
+    recognizerOpts.preferTensorRT       = preferTensorRT;
+    recognizerOpts.trtFP16              = true;
+    if (preferTensorRT) {
+        std::string recInputName = BasicOrtHandler::QueryModelInputName(recModelPath);
+        if (recInputName.empty()) {
+            std::cerr << "[PaddleOCRV5Engine] Could not query recognizer "
+                         "input name — defaulting to 'x'" << std::endl;
+            recInputName = "x";
+        }
+        std::cout << "[PaddleOCRV5Engine] Recognizer input name: '"
+                  << recInputName << "' — building TRT dynamic profile "
+                  << "[batch=1..16, W=320..960]" << std::endl;
+        recognizerOpts.trtProfileMinShapes = recInputName + ":1x3x48x320";
+        recognizerOpts.trtProfileOptShapes = recInputName + ":4x3x48x480";
+        recognizerOpts.trtProfileMaxShapes = recInputName + ":16x3x48x960";
+    }
+
    try {
        // Initialize detector (also triggers EPLoader init in BasicOrtHandler)
-        detector_ = std::make_unique<ONNXOCRDetector>(detModelPath);
+        detector_ = std::make_unique<ONNXOCRDetector>(detModelPath, detectorOpts);
        std::cout << "[PaddleOCRV5Engine] Detector initialized: " << detModelPath << std::endl;

        // Ensure this DLL's copy of Ort::Global<void>::api_ is initialized.
@@ -29,7 +91,7 @@ bool PaddleOCRV5Engine::Initialize(const std::string& detModelPath,

        // Initialize classifier (optional)
        if (!clsModelPath.empty()) {
-            classifier_ = std::make_unique<ONNXOCRClassifier>(clsModelPath);
+            classifier_ = std::make_unique<ONNXOCRClassifier>(clsModelPath, classifierOpts);
            std::cout << "[PaddleOCRV5Engine] Classifier initialized: " << clsModelPath << std::endl;
        }
        else {
@@ -38,13 +100,26 @@ bool PaddleOCRV5Engine::Initialize(const std::string& detModelPath,
        }

        // Initialize recognizer
-        recognizer_ = std::make_unique<ONNXOCRRecognizer>(recModelPath);
+        recognizer_ = std::make_unique<ONNXOCRRecognizer>(recModelPath, recognizerOpts);
        if (!recognizer_->LoadDictionary(dictPath)) {
            std::cerr << "[PaddleOCRV5Engine] Failed to load dictionary" << std::endl;
            return false;
        }
        std::cout << "[PaddleOCRV5Engine] Recognizer initialized: " << recModelPath << std::endl;

+        // Pre-warm classifier (fixed [1,3,80,160]) and recognizer (4
+        // bucket widths) so the first frame doesn't pay the cuDNN/TRT
+        // algorithm-selection tax. The detector is intentionally NOT
+        // warmed up: its input shape varies continuously with each
+        // plate-ROI aspect ratio, so a warmup at any single canonical
+        // shape would cost minutes (TRT) or be useless (CUDA cache miss
+        // on the real frame anyway). Real frames will pay the per-shape
+        // cuDNN HEURISTIC cost on first use.
+        std::cout << "[PaddleOCRV5Engine] Warming up OCR pipeline..." << std::endl;
+        if (classifier_) classifier_->Warmup();
+        if (recognizer_) recognizer_->Warmup();
+        std::cout << "[PaddleOCRV5Engine] Warmup complete" << std::endl;
+
        _initialized = true;
        std::cout << "[PaddleOCRV5Engine] Pipeline initialized successfully" << std::endl;
        return true;
@@ -140,5 +215,18 @@ TextLine PaddleOCRV5Engine::recognizeOnly(const cv::Mat& croppedImage) {
    return recognizer_->Recognize(croppedImage);
 }

+std::vector<TextLine> PaddleOCRV5Engine::recognizeMany(const std::vector<cv::Mat>& croppedImages) {
+    if (_modelLoading.load()) return std::vector<TextLine>(croppedImages.size());
+    {
+        auto lk = TryLockWithTimeout("PaddleOCRV5Engine::recognizeMany");
+        if (!lk.owns_lock()) return std::vector<TextLine>(croppedImages.size());
+        if (!_initialized || !recognizer_ || croppedImages.empty()) {
+            return std::vector<TextLine>(croppedImages.size());
+        }
+    }
+    // Delegates to the bucketed, batched path in ONNXOCRRecognizer.
+    return recognizer_->RecognizeBatch(croppedImages);
+}
+
 } // namespace onnxocr
 } // namespace ANSCENTER
--- a/modules/ANSOCR/ANSONNXOCR/PaddleOCRV5Engine.h
+++ b/modules/ANSOCR/ANSONNXOCR/PaddleOCRV5Engine.h
@@ -25,10 +25,13 @@ public:

    // Initialize the OCR pipeline
    // clsModelPath can be empty to skip classification
+    // preferTensorRT: try TensorRT EP first for the classifier and
+    //                 recognizer; the detector always stays on CUDA EP
    bool Initialize(const std::string& detModelPath,
                    const std::string& clsModelPath,
                    const std::string& recModelPath,
-                    const std::string& dictPath);
+                    const std::string& dictPath,
+                    bool preferTensorRT = false);

    // Run full OCR pipeline on an image
    // Returns results matching PaddleOCR::OCRPredictResult format
@@ -37,6 +40,14 @@ public:
    // Run recognizer only on a pre-cropped text image (no detection step)
    TextLine recognizeOnly(const cv::Mat& croppedImage);

+    // Run recognizer only on a batch of pre-cropped text images using
+    // batched ORT inference. Skips the detector entirely — the
+    // caller is expected to supply crops that are already roughly
+    // axis-aligned single-line text (e.g. ALPR plate ROIs, optionally
+    // pre-split into rows). Crops are grouped by bucket width, so a
+    // single call to this function typically issues 1–2 ORT Runs total.
+    std::vector<TextLine> recognizeMany(const std::vector<cv::Mat>& croppedImages);
+
    // Configuration setters (matching OCRModelConfig parameters)
    void SetDetMaxSideLen(int val)          { _maxSideLen = val; }
    void SetDetDbThresh(float val)          { _detDbThresh = val; }