Two-part fix
Fix 1 — Chunk oversized bucket groups (the correctness fix): ONNXOCRRecognizer::RecognizeBatch now slices each bucket group into chunks of at most kRecMaxBatch crops before submitting them to TRT. A frame with 30 crops in bucket 320 produces two back-to-back batched calls (24 + 6), both within the profile and both on the fast path.
Fix 2 — Raise the profile max from 16 to 24 (the performance fix): The old profile max was 16, but your real scenes routinely hit 24 crops. Raising the profile max to 24 means the common 12-plate scene (24 crops) fits in a single batched call with no chunking needed. Scenes with more than 24 crops now use chunking, but that is rare.
This commit is contained in:
@@ -252,10 +252,29 @@ std::vector<TextLine> ONNXOCRRecognizer::RecognizeBatch(const std::vector<cv::Ma
|
|||||||
groupIdx[bucketIdx].push_back(i);
|
groupIdx[bucketIdx].push_back(i);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Run one batched inference per non-empty bucket
|
// Run batched inference per non-empty bucket, slicing each bucket
|
||||||
|
// group into chunks of at most kRecMaxBatch crops so we never exceed
|
||||||
|
// the TRT dynamic profile's max-batch dimension. On a busy scene with
|
||||||
|
// (say) 30 plates all falling in bucket 320, we issue two back-to-back
|
||||||
|
// batched calls of 24 + 6 instead of one oversized call that would
|
||||||
|
// throw "does not satisfy any optimization profiles" and fall off
|
||||||
|
// the fast path to the per-image fallback.
|
||||||
for (int b = 0; b < kRecNumBuckets; ++b) {
|
for (int b = 0; b < kRecNumBuckets; ++b) {
|
||||||
if (groupCrops[b].empty()) continue;
|
const auto& bucketCrops = groupCrops[b];
|
||||||
RunBatchAtWidth(groupCrops[b], groupIdx[b], kRecBucketWidths[b], results);
|
const auto& bucketIndices = groupIdx[b];
|
||||||
|
if (bucketCrops.empty()) continue;
|
||||||
|
|
||||||
|
const int bucketW = kRecBucketWidths[b];
|
||||||
|
const size_t total = bucketCrops.size();
|
||||||
|
|
||||||
|
for (size_t start = 0; start < total; start += kRecMaxBatch) {
|
||||||
|
const size_t end = std::min(start + static_cast<size_t>(kRecMaxBatch), total);
|
||||||
|
std::vector<cv::Mat> chunkCrops(bucketCrops.begin() + start,
|
||||||
|
bucketCrops.begin() + end);
|
||||||
|
std::vector<size_t> chunkIdx(bucketIndices.begin() + start,
|
||||||
|
bucketIndices.begin() + end);
|
||||||
|
RunBatchAtWidth(chunkCrops, chunkIdx, bucketW, results);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return results;
|
return results;
|
||||||
|
|||||||
@@ -48,6 +48,20 @@ constexpr int kRecImgW = 320; // Default rec width (PP-OCRv5 rec_image_shap
|
|||||||
constexpr int kRecImgMaxW = 960; // Allow wide recognition input for long text lines
|
constexpr int kRecImgMaxW = 960; // Allow wide recognition input for long text lines
|
||||||
constexpr int kRecBatchSize = 6;
|
constexpr int kRecBatchSize = 6;
|
||||||
|
|
||||||
|
// Maximum crops submitted to the recognizer in a single ORT Run call.
|
||||||
|
// Two things must stay in sync with this value:
|
||||||
|
// 1. The TRT dynamic profile in PaddleOCRV5Engine::BuildNvidiaOcrOptions —
|
||||||
|
// the profile's max-batch dimension is set from kRecMaxBatch so TRT
|
||||||
|
// builds a single engine that handles everything up to this size.
|
||||||
|
// 2. The bucket-chunking loop in ONNXOCRRecognizer::RecognizeBatch —
|
||||||
|
// bucket groups larger than this get sliced into multiple Run() calls
|
||||||
|
// so we never exceed the profile and fall off the fast batched path.
|
||||||
|
//
|
||||||
|
// Raising it increases peak runtime VRAM (the TRT execution context
|
||||||
|
// allocates worst-case activation buffers), so keep it as low as is
|
||||||
|
// reasonable for your expected plate count per frame.
|
||||||
|
constexpr int kRecMaxBatch = 24;
|
||||||
|
|
||||||
// A detected text box: 4 corner points (top-left, top-right, bottom-right, bottom-left)
|
// A detected text box: 4 corner points (top-left, top-right, bottom-right, bottom-left)
|
||||||
struct TextBox {
|
struct TextBox {
|
||||||
std::array<cv::Point2f, 4> points;
|
std::array<cv::Point2f, 4> points;
|
||||||
|
|||||||
@@ -60,7 +60,11 @@ static PerModelOcrOptions BuildNvidiaOcrOptions(
|
|||||||
opts.classifierOpts.preferTensorRT = preferTensorRT;
|
opts.classifierOpts.preferTensorRT = preferTensorRT;
|
||||||
opts.classifierOpts.trtFP16 = true;
|
opts.classifierOpts.trtFP16 = true;
|
||||||
|
|
||||||
// Recognizer: TRT EP with dynamic shape profile.
|
// Recognizer: TRT EP with dynamic shape profile. The max-batch
|
||||||
|
// dimension is kRecMaxBatch (defined in ONNXOCRTypes.h) — the same
|
||||||
|
// constant that ONNXOCRRecognizer::RecognizeBatch uses to chunk
|
||||||
|
// oversized bucket groups. Keeping them in lockstep ensures the
|
||||||
|
// recognizer never submits a shape that falls outside the TRT profile.
|
||||||
opts.recognizerOpts.useMaxCudnnWorkspace = true;
|
opts.recognizerOpts.useMaxCudnnWorkspace = true;
|
||||||
opts.recognizerOpts.preferTensorRT = preferTensorRT;
|
opts.recognizerOpts.preferTensorRT = preferTensorRT;
|
||||||
opts.recognizerOpts.trtFP16 = true;
|
opts.recognizerOpts.trtFP16 = true;
|
||||||
@@ -71,12 +75,13 @@ static PerModelOcrOptions BuildNvidiaOcrOptions(
|
|||||||
"input name — defaulting to 'x'" << std::endl;
|
"input name — defaulting to 'x'" << std::endl;
|
||||||
recInputName = "x";
|
recInputName = "x";
|
||||||
}
|
}
|
||||||
|
const std::string maxB = std::to_string(kRecMaxBatch);
|
||||||
std::cout << "[PaddleOCRV5Engine] Recognizer input name: '"
|
std::cout << "[PaddleOCRV5Engine] Recognizer input name: '"
|
||||||
<< recInputName << "' — building TRT dynamic profile "
|
<< recInputName << "' — building TRT dynamic profile "
|
||||||
<< "[batch=1..16, W=320..960]" << std::endl;
|
<< "[batch=1.." << maxB << ", W=320..960]" << std::endl;
|
||||||
opts.recognizerOpts.trtProfileMinShapes = recInputName + ":1x3x48x320";
|
opts.recognizerOpts.trtProfileMinShapes = recInputName + ":1x3x48x320";
|
||||||
opts.recognizerOpts.trtProfileOptShapes = recInputName + ":4x3x48x480";
|
opts.recognizerOpts.trtProfileOptShapes = recInputName + ":4x3x48x480";
|
||||||
opts.recognizerOpts.trtProfileMaxShapes = recInputName + ":16x3x48x960";
|
opts.recognizerOpts.trtProfileMaxShapes = recInputName + ":" + maxB + "x3x48x960";
|
||||||
}
|
}
|
||||||
return opts;
|
return opts;
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user