OCR recognizer: prewarm all (batch, bucket-width) input shapes and pad real batches up to the batch-size ladder

This commit is contained in:
2026-04-28 17:12:27 +10:00
parent 4bab357c72
commit 234f2c68a2
4 changed files with 121 additions and 34 deletions

View File

@@ -1,4 +1,5 @@
#include "ONNXOCRRecognizer.h"
#include "ANSLicense.h" // ANS_DBG
#include <opencv2/imgproc.hpp>
#include <iostream>
@@ -74,6 +75,17 @@ int ONNXOCRRecognizer::RoundUpToBucket(int resizedW) const {
return imgMaxW_;
}
// Map a real crop count onto the batch-size ladder: return the smallest
// rung of kRecBatchLadder that can hold `n` crops, falling back to
// kRecMaxBatch when `n` exceeds every rung. The caller zero-fills the
// (rung - n) padded slots in the input tensor and discards the matching
// output rows after Run(). See kRecBatchLadder in ONNXOCRTypes.h for why
// the shape set is kept bounded.
static int RoundUpToBatchLadder(size_t n) {
    const int needed = static_cast<int>(n);
    for (const int rung : kRecBatchLadder) {
        if (rung >= needed) {
            return rung;
        }
    }
    return kRecMaxBatch;
}
// Resize + normalize a single crop into a CHW float vector at width
// `bucketW`, padding with zeros on the right when needed. The returned
// vector has exactly 3*imgH_*bucketW elements.
@@ -147,19 +159,29 @@ void ONNXOCRRecognizer::RunBatchAtWidth(const std::vector<cv::Mat>& crops,
if (crops.empty()) return;
try {
const size_t batchN = crops.size();
// Real number of crops vs. padded batch size submitted to ORT.
// Padding to the ladder keeps the recognizer's input-shape set
// bounded (≤24 distinct shapes) so OpenVINO/DML don't recompile
// kernels mid-stream — see kRecBatchLadder in ONNXOCRTypes.h.
// The padded slots are zeros; their decoded output is discarded.
const size_t realN = crops.size();
const size_t paddedN = static_cast<size_t>(RoundUpToBatchLadder(realN));
const size_t perImage = static_cast<size_t>(3) * imgH_ * bucketW;
ANS_DBG("ANSONNXRec_run",
"RunBatchAtWidth: bucketW=%d realN=%zu paddedN=%zu",
bucketW, realN, paddedN);
// Stack N preprocessed crops into one [N,3,H,W] buffer
std::vector<float> batchInput(batchN * perImage, 0.0f);
for (size_t i = 0; i < batchN; ++i) {
// Stack realN preprocessed crops into a [paddedN,3,H,W] buffer.
// The remaining (paddedN - realN) slots stay zero-initialized.
std::vector<float> batchInput(paddedN * perImage, 0.0f);
for (size_t i = 0; i < realN; ++i) {
auto img = PreprocessCropToBucket(crops[i], imgH_, bucketW);
std::memcpy(&batchInput[i * perImage], img.data(),
perImage * sizeof(float));
}
std::array<int64_t, 4> inputShape = {
static_cast<int64_t>(batchN), 3,
static_cast<int64_t>(paddedN), 3,
static_cast<int64_t>(imgH_),
static_cast<int64_t>(bucketW)
};
@@ -175,7 +197,8 @@ void ONNXOCRRecognizer::RunBatchAtWidth(const std::vector<cv::Mat>& crops,
float* outputData = outputTensors[0].GetTensorMutableData<float>();
auto outputShape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape();
// Expected output: [N, seqLen, numClasses]
// Expected output: [paddedN, seqLen, numClasses]. We only decode
// the first realN rows; rows [realN..paddedN) are zero-pad noise.
if (outputShape.size() < 3) {
std::cerr << "[ONNXOCRRecognizer] Unexpected batch output rank: "
<< outputShape.size() << std::endl;
@@ -186,7 +209,7 @@ void ONNXOCRRecognizer::RunBatchAtWidth(const std::vector<cv::Mat>& crops,
const int numClasses = static_cast<int>(outputShape[2]);
const size_t perRow = static_cast<size_t>(seqLen) * numClasses;
for (int i = 0; i < outBatch && i < static_cast<int>(batchN); ++i) {
for (size_t i = 0; i < realN && static_cast<int>(i) < outBatch; ++i) {
TextLine tl = CTCDecode(outputData + i * perRow, seqLen, numClasses);
out[origIndices[i]] = std::move(tl);
}
@@ -288,30 +311,63 @@ void ONNXOCRRecognizer::Warmup() {
cv::Mat dummy(imgH_ * 2, kRecBucketWidths[kRecNumBuckets - 1] * 2,
CV_8UC3, cv::Scalar(128, 128, 128));
for (int b = 0; b < kRecNumBuckets; ++b) {
const int bucketW = kRecBucketWidths[b];
try {
auto inputData = PreprocessCropToBucket(dummy, imgH_, bucketW);
std::array<int64_t, 4> inputShape = { 1, 3, imgH_, bucketW };
Ort::Value inputTensor = Ort::Value::CreateTensor<float>(
*memory_info_handler, inputData.data(), inputData.size(),
inputShape.data(), inputShape.size());
// Pre-compile every (batchN, bucketW) shape we expect to hit on the
// hot path. Each unique shape costs ~1.7 s to compile on OpenVINO and
// ~150 ms on DirectML; without this prewarm, that compile cost shows
// up as a multi-hundred-ms frame spike the first time a given batch
// count + plate-width combination occurs in the video stream.
//
// We deliberately warm only kRecWarmupBatchLadder ({1,2,4,8}) instead
// of the full kRecBatchLadder ({1,2,4,8,16,24}) — covers >95% of ALPR
// frames while keeping engine-init time bounded. Frames with 9+
// plates pay a one-time spike on first occurrence of batch=16/24.
auto totalT0 = std::chrono::high_resolution_clock::now();
int warmedShapes = 0;
for (int bs = 0; bs < kRecNumWarmupBatchSizes; ++bs) {
const int batchN = kRecWarmupBatchLadder[bs];
for (int b = 0; b < kRecNumBuckets; ++b) {
const int bucketW = kRecBucketWidths[b];
try {
const size_t perImage = static_cast<size_t>(3) * imgH_ * bucketW;
std::vector<float> batchInput(static_cast<size_t>(batchN) * perImage, 0.0f);
auto perCrop = PreprocessCropToBucket(dummy, imgH_, bucketW);
for (int i = 0; i < batchN; ++i) {
std::memcpy(&batchInput[static_cast<size_t>(i) * perImage],
perCrop.data(),
perImage * sizeof(float));
}
std::array<int64_t, 4> inputShape = {
static_cast<int64_t>(batchN), 3,
static_cast<int64_t>(imgH_),
static_cast<int64_t>(bucketW)
};
Ort::Value inputTensor = Ort::Value::CreateTensor<float>(
*memory_info_handler, batchInput.data(), batchInput.size(),
inputShape.data(), inputShape.size());
auto t0 = std::chrono::high_resolution_clock::now();
(void)ort_session->Run(
Ort::RunOptions{ nullptr },
input_node_names.data(), &inputTensor, 1,
output_node_names.data(), num_outputs);
auto t1 = std::chrono::high_resolution_clock::now();
double ms = std::chrono::duration<double, std::milli>(t1 - t0).count();
std::cout << "[ONNXOCRRecognizer] Warmup bucketW=" << bucketW
<< " " << ms << " ms" << std::endl;
}
catch (const Ort::Exception& e) {
std::cerr << "[ONNXOCRRecognizer] Warmup failed at bucketW="
<< bucketW << ": " << e.what() << std::endl;
auto t0 = std::chrono::high_resolution_clock::now();
(void)ort_session->Run(
Ort::RunOptions{ nullptr },
input_node_names.data(), &inputTensor, 1,
output_node_names.data(), num_outputs);
auto t1 = std::chrono::high_resolution_clock::now();
double ms = std::chrono::duration<double, std::milli>(t1 - t0).count();
std::cout << "[ONNXOCRRecognizer] Warmup batch=" << batchN
<< " bucketW=" << bucketW
<< " " << ms << " ms" << std::endl;
++warmedShapes;
}
catch (const Ort::Exception& e) {
std::cerr << "[ONNXOCRRecognizer] Warmup failed at batch="
<< batchN << " bucketW=" << bucketW << ": "
<< e.what() << std::endl;
}
}
}
auto totalT1 = std::chrono::high_resolution_clock::now();
double totalMs = std::chrono::duration<double, std::milli>(totalT1 - totalT0).count();
std::cout << "[ONNXOCRRecognizer] Warmup complete: " << warmedShapes
<< " shapes in " << totalMs << " ms total" << std::endl;
_warmedUp = true;
}