Two-part fix

Fix 1 — Chunk oversized bucket groups (the correctness fix)
ONNXOCRRecognizer::RecognizeBatch now slices each bucket group into chunks of at most kRecMaxBatch crops before submitting to TRT. A frame with 30 crops in bucket 320 produces two back-to-back batched calls (24 + 6), both within the profile, both on the fast path.

Fix 2 — Raise the profile max from 16 to 24 (the performance fix)
The old profile max was 16; real-world scenes routinely reach 24 crops per frame. Raising the profile max to 24 means the common 12-plate scene (24 crops) fits in a single batched call with no chunking needed. Scenes with more than 24 crops now use chunking, but that's rare.
This commit is contained in:
2026-04-15 07:27:55 +10:00
parent 5706615ed5
commit 7778f8c214
3 changed files with 44 additions and 6 deletions

View File

@@ -252,10 +252,29 @@ std::vector<TextLine> ONNXOCRRecognizer::RecognizeBatch(const std::vector<cv::Ma
groupIdx[bucketIdx].push_back(i); groupIdx[bucketIdx].push_back(i);
} }
// Run one batched inference per non-empty bucket // Run batched inference per non-empty bucket, slicing each bucket
// group into chunks of at most kRecMaxBatch crops so we never exceed
// the TRT dynamic profile's max-batch dimension. On a busy scene with
// (say) 30 plates all falling in bucket 320, we issue two back-to-back
// batched calls of 24 + 6 instead of one oversized call that would
// throw "does not satisfy any optimization profiles" and fall off
// the fast path to the per-image fallback.
for (int b = 0; b < kRecNumBuckets; ++b) { for (int b = 0; b < kRecNumBuckets; ++b) {
if (groupCrops[b].empty()) continue; const auto& bucketCrops = groupCrops[b];
RunBatchAtWidth(groupCrops[b], groupIdx[b], kRecBucketWidths[b], results); const auto& bucketIndices = groupIdx[b];
if (bucketCrops.empty()) continue;
const int bucketW = kRecBucketWidths[b];
const size_t total = bucketCrops.size();
for (size_t start = 0; start < total; start += kRecMaxBatch) {
const size_t end = std::min(start + static_cast<size_t>(kRecMaxBatch), total);
std::vector<cv::Mat> chunkCrops(bucketCrops.begin() + start,
bucketCrops.begin() + end);
std::vector<size_t> chunkIdx(bucketIndices.begin() + start,
bucketIndices.begin() + end);
RunBatchAtWidth(chunkCrops, chunkIdx, bucketW, results);
}
} }
return results; return results;

View File

@@ -48,6 +48,20 @@ constexpr int kRecImgW = 320; // Default rec width (PP-OCRv5 rec_image_shap
constexpr int kRecImgMaxW = 960; // Allow wide recognition input for long text lines constexpr int kRecImgMaxW = 960; // Allow wide recognition input for long text lines
constexpr int kRecBatchSize = 6; constexpr int kRecBatchSize = 6;
// Maximum crops submitted to the recognizer in a single ORT Run call.
// Two things must stay in sync with this value:
// 1. The TRT dynamic profile in PaddleOCRV5Engine::BuildNvidiaOcrOptions —
// the profile's max-batch dimension is set from kRecMaxBatch so TRT
// builds a single engine that handles everything up to this size.
// 2. The bucket-chunking loop in ONNXOCRRecognizer::RecognizeBatch —
// bucket groups larger than this get sliced into multiple Run() calls
// so we never exceed the profile and fall off the fast batched path.
//
// Raising it increases peak runtime VRAM (the TRT execution context
// allocates worst-case activation buffers), so keep it as low as is
// reasonable for your expected plate count per frame.
constexpr int kRecMaxBatch = 24;
// A detected text box: 4 corner points (top-left, top-right, bottom-right, bottom-left) // A detected text box: 4 corner points (top-left, top-right, bottom-right, bottom-left)
struct TextBox { struct TextBox {
std::array<cv::Point2f, 4> points; std::array<cv::Point2f, 4> points;

View File

@@ -60,7 +60,11 @@ static PerModelOcrOptions BuildNvidiaOcrOptions(
opts.classifierOpts.preferTensorRT = preferTensorRT; opts.classifierOpts.preferTensorRT = preferTensorRT;
opts.classifierOpts.trtFP16 = true; opts.classifierOpts.trtFP16 = true;
// Recognizer: TRT EP with dynamic shape profile. // Recognizer: TRT EP with dynamic shape profile. The max-batch
// dimension is kRecMaxBatch (defined in ONNXOCRTypes.h) — the same
// constant that ONNXOCRRecognizer::RecognizeBatch uses to chunk
// oversized bucket groups. Keeping them in lockstep ensures the
// recognizer never submits a shape that falls outside the TRT profile.
opts.recognizerOpts.useMaxCudnnWorkspace = true; opts.recognizerOpts.useMaxCudnnWorkspace = true;
opts.recognizerOpts.preferTensorRT = preferTensorRT; opts.recognizerOpts.preferTensorRT = preferTensorRT;
opts.recognizerOpts.trtFP16 = true; opts.recognizerOpts.trtFP16 = true;
@@ -71,12 +75,13 @@ static PerModelOcrOptions BuildNvidiaOcrOptions(
"input name — defaulting to 'x'" << std::endl; "input name — defaulting to 'x'" << std::endl;
recInputName = "x"; recInputName = "x";
} }
const std::string maxB = std::to_string(kRecMaxBatch);
std::cout << "[PaddleOCRV5Engine] Recognizer input name: '" std::cout << "[PaddleOCRV5Engine] Recognizer input name: '"
<< recInputName << "' — building TRT dynamic profile " << recInputName << "' — building TRT dynamic profile "
<< "[batch=1..16, W=320..960]" << std::endl; << "[batch=1.." << maxB << ", W=320..960]" << std::endl;
opts.recognizerOpts.trtProfileMinShapes = recInputName + ":1x3x48x320"; opts.recognizerOpts.trtProfileMinShapes = recInputName + ":1x3x48x320";
opts.recognizerOpts.trtProfileOptShapes = recInputName + ":4x3x48x480"; opts.recognizerOpts.trtProfileOptShapes = recInputName + ":4x3x48x480";
opts.recognizerOpts.trtProfileMaxShapes = recInputName + ":16x3x48x960"; opts.recognizerOpts.trtProfileMaxShapes = recInputName + ":" + maxB + "x3x48x960";
} }
return opts; return opts;
} }