Two-part fix

Fix 1 — Chunk oversized bucket groups (the correctness fix)
ONNXOCRRecognizer::RecognizeBatch now slices each bucket group into chunks of at most kRecMaxBatch crops before submitting to TRT. A frame with 30 crops in bucket 320 produces two back-to-back batched calls (24 + 6), both within the profile, both on the fast path.

Fix 2 — Raise the profile max from 16 to 24 (the performance fix)
The old profile max was 16; real-world scenes routinely reach 24 crops per frame. Raising the profile max to 24 means the common 12-plate scene (24 crops) fits in a single batched call with no chunking needed. Scenes with more than 24 crops now use chunking, but that's rare.
This commit is contained in:
2026-04-15 07:27:55 +10:00
parent 5706615ed5
commit 7778f8c214
3 changed files with 44 additions and 6 deletions

View File

@@ -252,10 +252,29 @@ std::vector<TextLine> ONNXOCRRecognizer::RecognizeBatch(const std::vector<cv::Ma
groupIdx[bucketIdx].push_back(i); groupIdx[bucketIdx].push_back(i);
} }
// Run one batched inference per non-empty bucket // Run batched inference per non-empty bucket, slicing each bucket
// group into chunks of at most kRecMaxBatch crops so we never exceed
// the TRT dynamic profile's max-batch dimension. On a busy scene with
// (say) 30 plates all falling in bucket 320, we issue two back-to-back
// batched calls of 24 + 6 instead of one oversized call that would
// throw "does not satisfy any optimization profiles" and fall off
// the fast path to the per-image fallback.
for (int b = 0; b < kRecNumBuckets; ++b) { for (int b = 0; b < kRecNumBuckets; ++b) {
if (groupCrops[b].empty()) continue; const auto& bucketCrops = groupCrops[b];
RunBatchAtWidth(groupCrops[b], groupIdx[b], kRecBucketWidths[b], results); const auto& bucketIndices = groupIdx[b];
if (bucketCrops.empty()) continue;
const int bucketW = kRecBucketWidths[b];
const size_t total = bucketCrops.size();
for (size_t start = 0; start < total; start += kRecMaxBatch) {
const size_t end = std::min(start + static_cast<size_t>(kRecMaxBatch), total);
std::vector<cv::Mat> chunkCrops(bucketCrops.begin() + start,
bucketCrops.begin() + end);
std::vector<size_t> chunkIdx(bucketIndices.begin() + start,
bucketIndices.begin() + end);
RunBatchAtWidth(chunkCrops, chunkIdx, bucketW, results);
}
} }
return results; return results;

View File

@@ -48,6 +48,20 @@ constexpr int kRecImgW = 320; // Default rec width (PP-OCRv5 rec_image_shap
constexpr int kRecImgMaxW = 960; // Allow wide recognition input for long text lines constexpr int kRecImgMaxW = 960; // Allow wide recognition input for long text lines
constexpr int kRecBatchSize = 6; constexpr int kRecBatchSize = 6;
// Maximum crops submitted to the recognizer in a single ORT Run call.
// Two things must stay in sync with this value:
// 1. The TRT dynamic profile in PaddleOCRV5Engine::BuildNvidiaOcrOptions —
// the profile's max-batch dimension is set from kRecMaxBatch so TRT
// builds a single engine that handles everything up to this size.
// 2. The bucket-chunking loop in ONNXOCRRecognizer::RecognizeBatch —
// bucket groups larger than this get sliced into multiple Run() calls
// so we never exceed the profile and fall off the fast batched path.
//
// Raising it increases peak runtime VRAM (the TRT execution context
// allocates worst-case activation buffers), so keep it as low as is
// reasonable for your expected plate count per frame.
constexpr int kRecMaxBatch = 24;
// A detected text box: 4 corner points (top-left, top-right, bottom-right, bottom-left) // A detected text box: 4 corner points (top-left, top-right, bottom-right, bottom-left)
struct TextBox { struct TextBox {
std::array<cv::Point2f, 4> points; std::array<cv::Point2f, 4> points;

View File

@@ -60,7 +60,11 @@ static PerModelOcrOptions BuildNvidiaOcrOptions(
opts.classifierOpts.preferTensorRT = preferTensorRT; opts.classifierOpts.preferTensorRT = preferTensorRT;
opts.classifierOpts.trtFP16 = true; opts.classifierOpts.trtFP16 = true;
// Recognizer: TRT EP with dynamic shape profile. // Recognizer: TRT EP with dynamic shape profile. The max-batch
// dimension is kRecMaxBatch (defined in ONNXOCRTypes.h) — the same
// constant that ONNXOCRRecognizer::RecognizeBatch uses to chunk
// oversized bucket groups. Keeping them in lockstep ensures the
// recognizer never submits a shape that falls outside the TRT profile.
opts.recognizerOpts.useMaxCudnnWorkspace = true; opts.recognizerOpts.useMaxCudnnWorkspace = true;
opts.recognizerOpts.preferTensorRT = preferTensorRT; opts.recognizerOpts.preferTensorRT = preferTensorRT;
opts.recognizerOpts.trtFP16 = true; opts.recognizerOpts.trtFP16 = true;
@@ -71,12 +75,13 @@ static PerModelOcrOptions BuildNvidiaOcrOptions(
"input name — defaulting to 'x'" << std::endl; "input name — defaulting to 'x'" << std::endl;
recInputName = "x"; recInputName = "x";
} }
const std::string maxB = std::to_string(kRecMaxBatch);
std::cout << "[PaddleOCRV5Engine] Recognizer input name: '" std::cout << "[PaddleOCRV5Engine] Recognizer input name: '"
<< recInputName << "' — building TRT dynamic profile " << recInputName << "' — building TRT dynamic profile "
<< "[batch=1..16, W=320..960]" << std::endl; << "[batch=1.." << maxB << ", W=320..960]" << std::endl;
opts.recognizerOpts.trtProfileMinShapes = recInputName + ":1x3x48x320"; opts.recognizerOpts.trtProfileMinShapes = recInputName + ":1x3x48x320";
opts.recognizerOpts.trtProfileOptShapes = recInputName + ":4x3x48x480"; opts.recognizerOpts.trtProfileOptShapes = recInputName + ":4x3x48x480";
opts.recognizerOpts.trtProfileMaxShapes = recInputName + ":16x3x48x960"; opts.recognizerOpts.trtProfileMaxShapes = recInputName + ":" + maxB + "x3x48x960";
} }
return opts; return opts;
} }