From 7778f8c214b6756f1367790e336777c0cf720c69 Mon Sep 17 00:00:00 2001
From: Tuan Nghia Nguyen <nghia.nguyen@anscenter.com>
Date: Wed, 15 Apr 2026 07:27:55 +1000
Subject: [PATCH] =?UTF-8?q?Two-part=20fix=20Fix=201=20=E2=80=94=20Chunk=20?=
 =?UTF-8?q?oversized=20bucket=20groups=20(the=20correctness=20fix)=20ONNXO?=
 =?UTF-8?q?CRRecognizer::RecognizeBatch=20now=20slices=20each=20bucket=20g?=
 =?UTF-8?q?roup=20into=20chunks=20of=20=E2=89=A4=20kRecMaxBatch=20before?=
 =?UTF-8?q?=20submitting=20to=20TRT.=20A=20frame=20with=2030=20crops=20in?=
 =?UTF-8?q?=20bucket=20320=20produces=20two=20back-to-back=20batched=20cal?=
 =?UTF-8?q?ls=20(24=20+=206),=20both=20within=20the=20profile,=20both=20on?=
 =?UTF-8?q?=20the=20fast=20path.?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix 2 — Raise the profile max from 16 to 24 (the performance fix)
The old profile max was 16, but real scenes routinely produce 24 crops. Raising the profile max to 24 means the common 12-plate scene (24 crops) fits in a single batched call with no chunking needed. Scenes with more than 24 crops now go through the chunking path, but that is rare.
---
 .../ANSOCR/ANSONNXOCR/ONNXOCRRecognizer.cpp   | 25 ++++++++++++++++---
 modules/ANSOCR/ANSONNXOCR/ONNXOCRTypes.h      | 14 +++++++++++
 .../ANSOCR/ANSONNXOCR/PaddleOCRV5Engine.cpp   | 11 +++++---
 3 files changed, 44 insertions(+), 6 deletions(-)
diff --git a/modules/ANSOCR/ANSONNXOCR/ONNXOCRRecognizer.cpp b/modules/ANSOCR/ANSONNXOCR/ONNXOCRRecognizer.cpp
index 190a30a..52879bf 100644
--- a/modules/ANSOCR/ANSONNXOCR/ONNXOCRRecognizer.cpp
+++ b/modules/ANSOCR/ANSONNXOCR/ONNXOCRRecognizer.cpp
@@ -252,10 +252,29 @@ std::vector<TextLine> ONNXOCRRecognizer::RecognizeBatch(const std::vector<cv::Ma
         groupIdx[bucketIdx].push_back(i);
     }
 
-    // Run one batched inference per non-empty bucket
+    // Run batched inference per non-empty bucket, slicing each bucket
+    // group into chunks of at most kRecMaxBatch crops so we never exceed
+    // the TRT dynamic profile's max-batch dimension. On a busy scene with
+    // (say) 30 plates all falling in bucket 320, we issue two back-to-back
+    // batched calls of 24 + 6 instead of one oversized call that would
+    // throw "does not satisfy any optimization profiles" and fall off
+    // the fast path to the per-image fallback.
     for (int b = 0; b < kRecNumBuckets; ++b) {
-        if (groupCrops[b].empty()) continue;
-        RunBatchAtWidth(groupCrops[b], groupIdx[b], kRecBucketWidths[b], results);
+        const auto& bucketCrops = groupCrops[b];
+        const auto& bucketIndices = groupIdx[b];
+        if (bucketCrops.empty()) continue;
+
+        const int bucketW = kRecBucketWidths[b];
+        const size_t total = bucketCrops.size();
+
+        for (size_t start = 0; start < total; start += kRecMaxBatch) {
+            const size_t end = std::min(start + static_cast<size_t>(kRecMaxBatch), total);
+            std::vector<cv::Mat> chunkCrops(bucketCrops.begin() + start,
+                                            bucketCrops.begin() + end);
+            std::vector<size_t>  chunkIdx(bucketIndices.begin() + start,
+                                          bucketIndices.begin() + end);
+            RunBatchAtWidth(chunkCrops, chunkIdx, bucketW, results);
+        }
     }
 
     return results;
diff --git a/modules/ANSOCR/ANSONNXOCR/ONNXOCRTypes.h b/modules/ANSOCR/ANSONNXOCR/ONNXOCRTypes.h
index 5f07f2c..c12e02d 100644
--- a/modules/ANSOCR/ANSONNXOCR/ONNXOCRTypes.h
+++ b/modules/ANSOCR/ANSONNXOCR/ONNXOCRTypes.h
@@ -48,6 +48,20 @@ constexpr int kRecImgW    = 320;   // Default rec width (PP-OCRv5 rec_image_shap
 constexpr int kRecImgMaxW = 960;   // Allow wide recognition input for long text lines
 constexpr int kRecBatchSize = 6;
 
+// Maximum crops submitted to the recognizer in a single ORT Run call.
+// Two things must stay in sync with this value:
+//   1. The TRT dynamic profile in PaddleOCRV5Engine::BuildNvidiaOcrOptions —
+//      the profile's max-batch dimension is set from kRecMaxBatch so TRT
+//      builds a single engine that handles everything up to this size.
+//   2. The bucket-chunking loop in ONNXOCRRecognizer::RecognizeBatch —
+//      bucket groups larger than this get sliced into multiple Run() calls
+//      so we never exceed the profile and fall off the fast batched path.
+//
+// Raising it increases peak runtime VRAM (the TRT execution context
+// allocates worst-case activation buffers), so keep it as low as is
+// reasonable for your expected plate count per frame.
+constexpr int kRecMaxBatch = 24;
+
 // A detected text box: 4 corner points (top-left, top-right, bottom-right, bottom-left)
 struct TextBox {
     std::array<cv::Point2f, 4> points;
diff --git a/modules/ANSOCR/ANSONNXOCR/PaddleOCRV5Engine.cpp b/modules/ANSOCR/ANSONNXOCR/PaddleOCRV5Engine.cpp
index 71406d4..04c082b 100644
--- a/modules/ANSOCR/ANSONNXOCR/PaddleOCRV5Engine.cpp
+++ b/modules/ANSOCR/ANSONNXOCR/PaddleOCRV5Engine.cpp
@@ -60,7 +60,11 @@ static PerModelOcrOptions BuildNvidiaOcrOptions(
     opts.classifierOpts.preferTensorRT       = preferTensorRT;
     opts.classifierOpts.trtFP16              = true;
 
-    // Recognizer: TRT EP with dynamic shape profile.
+    // Recognizer: TRT EP with dynamic shape profile. The max-batch
+    // dimension is kRecMaxBatch (defined in ONNXOCRTypes.h) — the same
+    // constant that ONNXOCRRecognizer::RecognizeBatch uses to chunk
+    // oversized bucket groups. Keeping them in lockstep ensures the
+    // recognizer never submits a shape that falls outside the TRT profile.
     opts.recognizerOpts.useMaxCudnnWorkspace = true;
     opts.recognizerOpts.preferTensorRT       = preferTensorRT;
     opts.recognizerOpts.trtFP16              = true;
@@ -71,12 +75,13 @@ static PerModelOcrOptions BuildNvidiaOcrOptions(
                          "input name — defaulting to 'x'" << std::endl;
             recInputName = "x";
         }
+        const std::string maxB = std::to_string(kRecMaxBatch);
         std::cout << "[PaddleOCRV5Engine] Recognizer input name: '"
                   << recInputName << "' — building TRT dynamic profile "
-                  << "[batch=1..16, W=320..960]" << std::endl;
+                  << "[batch=1.." << maxB << ", W=320..960]" << std::endl;
         opts.recognizerOpts.trtProfileMinShapes = recInputName + ":1x3x48x320";
         opts.recognizerOpts.trtProfileOptShapes = recInputName + ":4x3x48x480";
-        opts.recognizerOpts.trtProfileMaxShapes = recInputName + ":16x3x48x960";
+        opts.recognizerOpts.trtProfileMaxShapes = recInputName + ":" + maxB + "x3x48x960";
     }
     return opts;
 }