From 7778f8c214b6756f1367790e336777c0cf720c69 Mon Sep 17 00:00:00 2001 From: Tuan Nghia Nguyen Date: Wed, 15 Apr 2026 07:27:55 +1000 Subject: [PATCH] =?UTF-8?q?Two-part=20fix=20Fix=201=20=E2=80=94=20Chunk=20?= =?UTF-8?q?oversized=20bucket=20groups=20(the=20correctness=20fix)=20ONNXO?= =?UTF-8?q?CRRecognizer::RecognizeBatch=20now=20slices=20each=20bucket=20g?= =?UTF-8?q?roup=20into=20chunks=20of=20=E2=89=A4=20kRecMaxBatch=20before?= =?UTF-8?q?=20submitting=20to=20TRT.=20A=20frame=20with=2030=20crops=20in?= =?UTF-8?q?=20bucket=20320=20produces=20two=20back-to-back=20batched=20cal?= =?UTF-8?q?ls=20(24=20+=206),=20both=20within=20the=20profile,=20both=20on?= =?UTF-8?q?=20the=20fast=20path.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix 2 — Raise the profile max from 16 to 24 (the performance fix): The old profile max was 16, but real scenes routinely reach 24 crops. Raising the profile max to 24 means the common 12-plate scene (24 crops) fits in a single batched call with no chunking needed. Scenes with more than 24 crops are still split into chunks, but they are rare. 
--- .../ANSOCR/ANSONNXOCR/ONNXOCRRecognizer.cpp | 25 ++++++++++++++++--- modules/ANSOCR/ANSONNXOCR/ONNXOCRTypes.h | 14 +++++++++++ .../ANSOCR/ANSONNXOCR/PaddleOCRV5Engine.cpp | 11 +++++--- 3 files changed, 44 insertions(+), 6 deletions(-) diff --git a/modules/ANSOCR/ANSONNXOCR/ONNXOCRRecognizer.cpp b/modules/ANSOCR/ANSONNXOCR/ONNXOCRRecognizer.cpp index 190a30a..52879bf 100644 --- a/modules/ANSOCR/ANSONNXOCR/ONNXOCRRecognizer.cpp +++ b/modules/ANSOCR/ANSONNXOCR/ONNXOCRRecognizer.cpp @@ -252,10 +252,29 @@ std::vector ONNXOCRRecognizer::RecognizeBatch(const std::vector(kRecMaxBatch), total); + std::vector chunkCrops(bucketCrops.begin() + start, + bucketCrops.begin() + end); + std::vector chunkIdx(bucketIndices.begin() + start, + bucketIndices.begin() + end); + RunBatchAtWidth(chunkCrops, chunkIdx, bucketW, results); + } } return results; diff --git a/modules/ANSOCR/ANSONNXOCR/ONNXOCRTypes.h b/modules/ANSOCR/ANSONNXOCR/ONNXOCRTypes.h index 5f07f2c..c12e02d 100644 --- a/modules/ANSOCR/ANSONNXOCR/ONNXOCRTypes.h +++ b/modules/ANSOCR/ANSONNXOCR/ONNXOCRTypes.h @@ -48,6 +48,20 @@ constexpr int kRecImgW = 320; // Default rec width (PP-OCRv5 rec_image_shap constexpr int kRecImgMaxW = 960; // Allow wide recognition input for long text lines constexpr int kRecBatchSize = 6; +// Maximum crops submitted to the recognizer in a single ORT Run call. +// Two things must stay in sync with this value: +// 1. The TRT dynamic profile in PaddleOCRV5Engine::BuildNvidiaOcrOptions — +// the profile's max-batch dimension is set from kRecMaxBatch so TRT +// builds a single engine that handles everything up to this size. +// 2. The bucket-chunking loop in ONNXOCRRecognizer::RecognizeBatch — +// bucket groups larger than this get sliced into multiple Run() calls +// so we never exceed the profile and fall off the fast batched path. 
+// +// Raising it increases peak runtime VRAM (the TRT execution context +// allocates worst-case activation buffers), so keep it as low as is +// reasonable for your expected plate count per frame. +constexpr int kRecMaxBatch = 24; + // A detected text box: 4 corner points (top-left, top-right, bottom-right, bottom-left) struct TextBox { std::array points; diff --git a/modules/ANSOCR/ANSONNXOCR/PaddleOCRV5Engine.cpp b/modules/ANSOCR/ANSONNXOCR/PaddleOCRV5Engine.cpp index 71406d4..04c082b 100644 --- a/modules/ANSOCR/ANSONNXOCR/PaddleOCRV5Engine.cpp +++ b/modules/ANSOCR/ANSONNXOCR/PaddleOCRV5Engine.cpp @@ -60,7 +60,11 @@ static PerModelOcrOptions BuildNvidiaOcrOptions( opts.classifierOpts.preferTensorRT = preferTensorRT; opts.classifierOpts.trtFP16 = true; - // Recognizer: TRT EP with dynamic shape profile. + // Recognizer: TRT EP with dynamic shape profile. The max-batch + // dimension is kRecMaxBatch (defined in ONNXOCRTypes.h) — the same + // constant that ONNXOCRRecognizer::RecognizeBatch uses to chunk + // oversized bucket groups. Keeping them in lockstep ensures the + // recognizer never submits a shape that falls outside the TRT profile. opts.recognizerOpts.useMaxCudnnWorkspace = true; opts.recognizerOpts.preferTensorRT = preferTensorRT; opts.recognizerOpts.trtFP16 = true; @@ -71,12 +75,13 @@ static PerModelOcrOptions BuildNvidiaOcrOptions( "input name — defaulting to 'x'" << std::endl; recInputName = "x"; } + const std::string maxB = std::to_string(kRecMaxBatch); std::cout << "[PaddleOCRV5Engine] Recognizer input name: '" << recInputName << "' — building TRT dynamic profile " - << "[batch=1..16, W=320..960]" << std::endl; + << "[batch=1.." 
<< maxB << ", W=320..960]" << std::endl; opts.recognizerOpts.trtProfileMinShapes = recInputName + ":1x3x48x320"; opts.recognizerOpts.trtProfileOptShapes = recInputName + ":4x3x48x480"; - opts.recognizerOpts.trtProfileMaxShapes = recInputName + ":16x3x48x960"; + opts.recognizerOpts.trtProfileMaxShapes = recInputName + ":" + maxB + "x3x48x960"; } return opts; }