bucket prewarm + batch padding
This commit is contained in:
@@ -1,4 +1,5 @@
|
|||||||
#include "ONNXOCRRecognizer.h"
|
#include "ONNXOCRRecognizer.h"
|
||||||
|
#include "ANSLicense.h" // ANS_DBG
|
||||||
|
|
||||||
#include <opencv2/imgproc.hpp>
|
#include <opencv2/imgproc.hpp>
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
@@ -74,6 +75,17 @@ int ONNXOCRRecognizer::RoundUpToBucket(int resizedW) const {
|
|||||||
return imgMaxW_;
|
return imgMaxW_;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Round real batch size UP to the next entry in kRecBatchLadder, clamped to
|
||||||
|
// kRecMaxBatch. Padded slots in the input tensor are zero-filled and the
|
||||||
|
// corresponding output rows are discarded after Run(). See the comment on
|
||||||
|
// kRecBatchLadder in ONNXOCRTypes.h for the rationale.
|
||||||
|
static int RoundUpToBatchLadder(size_t n) {
|
||||||
|
for (int b : kRecBatchLadder) {
|
||||||
|
if (b >= static_cast<int>(n)) return b;
|
||||||
|
}
|
||||||
|
return kRecMaxBatch;
|
||||||
|
}
|
||||||
|
|
||||||
// Resize + normalize a single crop into a CHW float vector at width
|
// Resize + normalize a single crop into a CHW float vector at width
|
||||||
// `bucketW`, padding with zeros on the right when needed. The returned
|
// `bucketW`, padding with zeros on the right when needed. The returned
|
||||||
// vector has exactly 3*imgH_*bucketW elements.
|
// vector has exactly 3*imgH_*bucketW elements.
|
||||||
@@ -147,19 +159,29 @@ void ONNXOCRRecognizer::RunBatchAtWidth(const std::vector<cv::Mat>& crops,
|
|||||||
if (crops.empty()) return;
|
if (crops.empty()) return;
|
||||||
|
|
||||||
try {
|
try {
|
||||||
const size_t batchN = crops.size();
|
// Real number of crops vs. padded batch size submitted to ORT.
|
||||||
|
// Padding to the ladder keeps the recognizer's input-shape set
|
||||||
|
// bounded (≤24 distinct shapes) so OpenVINO/DML don't recompile
|
||||||
|
// kernels mid-stream — see kRecBatchLadder in ONNXOCRTypes.h.
|
||||||
|
// The padded slots are zeros; their decoded output is discarded.
|
||||||
|
const size_t realN = crops.size();
|
||||||
|
const size_t paddedN = static_cast<size_t>(RoundUpToBatchLadder(realN));
|
||||||
const size_t perImage = static_cast<size_t>(3) * imgH_ * bucketW;
|
const size_t perImage = static_cast<size_t>(3) * imgH_ * bucketW;
|
||||||
|
ANS_DBG("ANSONNXRec_run",
|
||||||
|
"RunBatchAtWidth: bucketW=%d realN=%zu paddedN=%zu",
|
||||||
|
bucketW, realN, paddedN);
|
||||||
|
|
||||||
// Stack N preprocessed crops into one [N,3,H,W] buffer
|
// Stack realN preprocessed crops into a [paddedN,3,H,W] buffer.
|
||||||
std::vector<float> batchInput(batchN * perImage, 0.0f);
|
// The remaining (paddedN - realN) slots stay zero-initialized.
|
||||||
for (size_t i = 0; i < batchN; ++i) {
|
std::vector<float> batchInput(paddedN * perImage, 0.0f);
|
||||||
|
for (size_t i = 0; i < realN; ++i) {
|
||||||
auto img = PreprocessCropToBucket(crops[i], imgH_, bucketW);
|
auto img = PreprocessCropToBucket(crops[i], imgH_, bucketW);
|
||||||
std::memcpy(&batchInput[i * perImage], img.data(),
|
std::memcpy(&batchInput[i * perImage], img.data(),
|
||||||
perImage * sizeof(float));
|
perImage * sizeof(float));
|
||||||
}
|
}
|
||||||
|
|
||||||
std::array<int64_t, 4> inputShape = {
|
std::array<int64_t, 4> inputShape = {
|
||||||
static_cast<int64_t>(batchN), 3,
|
static_cast<int64_t>(paddedN), 3,
|
||||||
static_cast<int64_t>(imgH_),
|
static_cast<int64_t>(imgH_),
|
||||||
static_cast<int64_t>(bucketW)
|
static_cast<int64_t>(bucketW)
|
||||||
};
|
};
|
||||||
@@ -175,7 +197,8 @@ void ONNXOCRRecognizer::RunBatchAtWidth(const std::vector<cv::Mat>& crops,
|
|||||||
float* outputData = outputTensors[0].GetTensorMutableData<float>();
|
float* outputData = outputTensors[0].GetTensorMutableData<float>();
|
||||||
auto outputShape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape();
|
auto outputShape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape();
|
||||||
|
|
||||||
// Expected output: [N, seqLen, numClasses]
|
// Expected output: [paddedN, seqLen, numClasses]. We only decode
|
||||||
|
// the first realN rows; rows [realN..paddedN) are zero-pad noise.
|
||||||
if (outputShape.size() < 3) {
|
if (outputShape.size() < 3) {
|
||||||
std::cerr << "[ONNXOCRRecognizer] Unexpected batch output rank: "
|
std::cerr << "[ONNXOCRRecognizer] Unexpected batch output rank: "
|
||||||
<< outputShape.size() << std::endl;
|
<< outputShape.size() << std::endl;
|
||||||
@@ -186,7 +209,7 @@ void ONNXOCRRecognizer::RunBatchAtWidth(const std::vector<cv::Mat>& crops,
|
|||||||
const int numClasses = static_cast<int>(outputShape[2]);
|
const int numClasses = static_cast<int>(outputShape[2]);
|
||||||
const size_t perRow = static_cast<size_t>(seqLen) * numClasses;
|
const size_t perRow = static_cast<size_t>(seqLen) * numClasses;
|
||||||
|
|
||||||
for (int i = 0; i < outBatch && i < static_cast<int>(batchN); ++i) {
|
for (size_t i = 0; i < realN && static_cast<int>(i) < outBatch; ++i) {
|
||||||
TextLine tl = CTCDecode(outputData + i * perRow, seqLen, numClasses);
|
TextLine tl = CTCDecode(outputData + i * perRow, seqLen, numClasses);
|
||||||
out[origIndices[i]] = std::move(tl);
|
out[origIndices[i]] = std::move(tl);
|
||||||
}
|
}
|
||||||
@@ -288,13 +311,38 @@ void ONNXOCRRecognizer::Warmup() {
|
|||||||
cv::Mat dummy(imgH_ * 2, kRecBucketWidths[kRecNumBuckets - 1] * 2,
|
cv::Mat dummy(imgH_ * 2, kRecBucketWidths[kRecNumBuckets - 1] * 2,
|
||||||
CV_8UC3, cv::Scalar(128, 128, 128));
|
CV_8UC3, cv::Scalar(128, 128, 128));
|
||||||
|
|
||||||
|
// Pre-compile every (batchN, bucketW) shape we expect to hit on the
|
||||||
|
// hot path. Each unique shape costs ~1.7 s to compile on OpenVINO and
|
||||||
|
// ~150 ms on DirectML; without this prewarm, that compile cost shows
|
||||||
|
// up as a multi-hundred-ms frame spike the first time a given batch
|
||||||
|
// count + plate-width combination occurs in the video stream.
|
||||||
|
//
|
||||||
|
// We deliberately warm only kRecWarmupBatchLadder ({1,2,4,8}) instead
|
||||||
|
// of the full kRecBatchLadder ({1,2,4,8,16,24}) — covers >95% of ALPR
|
||||||
|
// frames while keeping engine-init time bounded. Frames with 9+
|
||||||
|
// plates pay a one-time spike on first occurrence of batch=16/24.
|
||||||
|
auto totalT0 = std::chrono::high_resolution_clock::now();
|
||||||
|
int warmedShapes = 0;
|
||||||
|
for (int bs = 0; bs < kRecNumWarmupBatchSizes; ++bs) {
|
||||||
|
const int batchN = kRecWarmupBatchLadder[bs];
|
||||||
for (int b = 0; b < kRecNumBuckets; ++b) {
|
for (int b = 0; b < kRecNumBuckets; ++b) {
|
||||||
const int bucketW = kRecBucketWidths[b];
|
const int bucketW = kRecBucketWidths[b];
|
||||||
try {
|
try {
|
||||||
auto inputData = PreprocessCropToBucket(dummy, imgH_, bucketW);
|
const size_t perImage = static_cast<size_t>(3) * imgH_ * bucketW;
|
||||||
std::array<int64_t, 4> inputShape = { 1, 3, imgH_, bucketW };
|
std::vector<float> batchInput(static_cast<size_t>(batchN) * perImage, 0.0f);
|
||||||
|
auto perCrop = PreprocessCropToBucket(dummy, imgH_, bucketW);
|
||||||
|
for (int i = 0; i < batchN; ++i) {
|
||||||
|
std::memcpy(&batchInput[static_cast<size_t>(i) * perImage],
|
||||||
|
perCrop.data(),
|
||||||
|
perImage * sizeof(float));
|
||||||
|
}
|
||||||
|
std::array<int64_t, 4> inputShape = {
|
||||||
|
static_cast<int64_t>(batchN), 3,
|
||||||
|
static_cast<int64_t>(imgH_),
|
||||||
|
static_cast<int64_t>(bucketW)
|
||||||
|
};
|
||||||
Ort::Value inputTensor = Ort::Value::CreateTensor<float>(
|
Ort::Value inputTensor = Ort::Value::CreateTensor<float>(
|
||||||
*memory_info_handler, inputData.data(), inputData.size(),
|
*memory_info_handler, batchInput.data(), batchInput.size(),
|
||||||
inputShape.data(), inputShape.size());
|
inputShape.data(), inputShape.size());
|
||||||
|
|
||||||
auto t0 = std::chrono::high_resolution_clock::now();
|
auto t0 = std::chrono::high_resolution_clock::now();
|
||||||
@@ -304,14 +352,22 @@ void ONNXOCRRecognizer::Warmup() {
|
|||||||
output_node_names.data(), num_outputs);
|
output_node_names.data(), num_outputs);
|
||||||
auto t1 = std::chrono::high_resolution_clock::now();
|
auto t1 = std::chrono::high_resolution_clock::now();
|
||||||
double ms = std::chrono::duration<double, std::milli>(t1 - t0).count();
|
double ms = std::chrono::duration<double, std::milli>(t1 - t0).count();
|
||||||
std::cout << "[ONNXOCRRecognizer] Warmup bucketW=" << bucketW
|
std::cout << "[ONNXOCRRecognizer] Warmup batch=" << batchN
|
||||||
|
<< " bucketW=" << bucketW
|
||||||
<< " " << ms << " ms" << std::endl;
|
<< " " << ms << " ms" << std::endl;
|
||||||
|
++warmedShapes;
|
||||||
}
|
}
|
||||||
catch (const Ort::Exception& e) {
|
catch (const Ort::Exception& e) {
|
||||||
std::cerr << "[ONNXOCRRecognizer] Warmup failed at bucketW="
|
std::cerr << "[ONNXOCRRecognizer] Warmup failed at batch="
|
||||||
<< bucketW << ": " << e.what() << std::endl;
|
<< batchN << " bucketW=" << bucketW << ": "
|
||||||
|
<< e.what() << std::endl;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
auto totalT1 = std::chrono::high_resolution_clock::now();
|
||||||
|
double totalMs = std::chrono::duration<double, std::milli>(totalT1 - totalT0).count();
|
||||||
|
std::cout << "[ONNXOCRRecognizer] Warmup complete: " << warmedShapes
|
||||||
|
<< " shapes in " << totalMs << " ms total" << std::endl;
|
||||||
_warmedUp = true;
|
_warmedUp = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -62,6 +62,37 @@ constexpr int kRecBatchSize = 6;
|
|||||||
// reasonable for your expected plate count per frame.
|
// reasonable for your expected plate count per frame.
|
||||||
constexpr int kRecMaxBatch = 24;
|
constexpr int kRecMaxBatch = 24;
|
||||||
|
|
||||||
|
// Runtime batch-padding ladder. Real batch sizes get rounded UP to one of
|
||||||
|
// these values before the ORT Run() call, with zero-padded slots in the
|
||||||
|
// input tensor and the padded outputs discarded after CTC decode. The goal
|
||||||
|
// is to bound the number of distinct (batch, bucketW) input shapes the
|
||||||
|
// recognizer ever sees: with this ladder × kRecBucketWidths there are at
|
||||||
|
// most 24 unique shapes total, small enough for OpenVINO/DirectML to keep
|
||||||
|
// in their per-shape kernel cache instead of recompiling mid-stream.
|
||||||
|
//
|
||||||
|
// Without this padding, OpenVINO recompiles the recognizer for every new
|
||||||
|
// (N, W) combination — measured at ~1.7 s per recompile on Intel Iris Xe,
|
||||||
|
// which is exactly what produced the 1500–1800 ms frame spikes seen in the
|
||||||
|
// OPENVINO_GPU video tests.
|
||||||
|
//
|
||||||
|
// The last entry MUST equal kRecMaxBatch so the whole ladder stays inside
|
||||||
|
// the TRT dynamic profile [batch=1..kRecMaxBatch].
|
||||||
|
constexpr int kRecBatchLadder[] = { 1, 2, 4, 8, 16, 24 };
|
||||||
|
constexpr int kRecNumBatchSizes = sizeof(kRecBatchLadder) / sizeof(kRecBatchLadder[0]);
|
||||||
|
static_assert(kRecBatchLadder[kRecNumBatchSizes - 1] == kRecMaxBatch,
|
||||||
|
"Last kRecBatchLadder entry must equal kRecMaxBatch so that "
|
||||||
|
"padded shapes stay inside the TRT profile.");
|
||||||
|
|
||||||
|
// Warmup batch ladder — a subset of kRecBatchLadder used at engine init to
|
||||||
|
// pre-compile the most common runtime shapes. Kept smaller than the full
|
||||||
|
// ladder because each warmup Run on OpenVINO costs ~1.7 s of kernel-compile
|
||||||
|
// time, and {1,2,4,8} covers the realistic plate-count distribution for
|
||||||
|
// ALPR (>95% of frames have ≤8 detected plates). Frames with 9+ plates
|
||||||
|
// will pay a one-time recompile spike on first occurrence of batch=16/24.
|
||||||
|
constexpr int kRecWarmupBatchLadder[] = { 1, 2, 4, 8 };
|
||||||
|
constexpr int kRecNumWarmupBatchSizes =
|
||||||
|
sizeof(kRecWarmupBatchLadder) / sizeof(kRecWarmupBatchLadder[0]);
|
||||||
|
|
||||||
// A detected text box: 4 corner points (top-left, top-right, bottom-right, bottom-left)
|
// A detected text box: 4 corner points (top-left, top-right, bottom-right, bottom-left)
|
||||||
struct TextBox {
|
struct TextBox {
|
||||||
std::array<cv::Point2f, 4> points;
|
std::array<cv::Point2f, 4> points;
|
||||||
|
|||||||
@@ -3936,8 +3936,8 @@ int ALPR_OCR_VideoTest() {
|
|||||||
|
|
||||||
ANSCENTER::ANSALPR* infHandle = nullptr;
|
ANSCENTER::ANSALPR* infHandle = nullptr;
|
||||||
std::string licenseKey = "";
|
std::string licenseKey = "";
|
||||||
std::string modelFilePath = "C:\\Projects\\ANSVIS\\Models\\ANS_GenericALPR_v2.0.zip";
|
std::string modelFilePath = "C:\\ProgramData\\ANSCENTER\\ANSVIS Server\\ANSALPR\\ANS_GenericALPR_v2.0.zip";
|
||||||
std::string videoFilePath = "E:\\Programs\\DemoAssets\\Videos\\ALRP\\PMH\\Day\\day.mp4";
|
std::string videoFilePath = "C:\\ProgramData\\ANSCENTER\\Shared\\HM1.mp4";
|
||||||
|
|
||||||
int engineType = 2; // ANSALPR_OCR
|
int engineType = 2; // ANSALPR_OCR
|
||||||
double detectionThreshold = 0.3;
|
double detectionThreshold = 0.3;
|
||||||
@@ -4099,12 +4099,12 @@ int main()
|
|||||||
//}
|
//}
|
||||||
//ANSLPR_SingleTask_Test();
|
//ANSLPR_SingleTask_Test();
|
||||||
//ANSLPR_CPU_StressTest();
|
//ANSLPR_CPU_StressTest();
|
||||||
ANSLPR_MultiGPU_StressTest();
|
// ANSLPR_MultiGPU_StressTest();
|
||||||
//ANSLPR_MultiGPU_StressTest_SimulatedCam();
|
//ANSLPR_MultiGPU_StressTest_SimulatedCam();
|
||||||
// ANSLPR_MultiGPU_StressTest_FilePlayer();
|
// ANSLPR_MultiGPU_StressTest_FilePlayer();
|
||||||
//ANSLPR_OD_CPU_VideoTest();
|
//ANSLPR_OD_CPU_VideoTest();
|
||||||
//ALPR_OCR_Test();
|
//ALPR_OCR_Test();
|
||||||
//ALPR_OCR_VideoTest();
|
ALPR_OCR_VideoTest();
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -503,9 +503,9 @@ int main()
|
|||||||
SetConsoleOutputCP(CP_UTF8);
|
SetConsoleOutputCP(CP_UTF8);
|
||||||
SetConsoleCP(CP_UTF8);
|
SetConsoleCP(CP_UTF8);
|
||||||
#endif
|
#endif
|
||||||
TestOCRv5mage();
|
//TestOCRv5mage();
|
||||||
|
|
||||||
//ANSOCR_VideoTest();
|
ANSOCR_VideoTest();
|
||||||
// TestOCRImage();
|
// TestOCRImage();
|
||||||
/* for (int i = 0; i < 20; i++) {
|
/* for (int i = 0; i < 20; i++) {
|
||||||
TestOCRImage();
|
TestOCRImage();
|
||||||
|
|||||||
Reference in New Issue
Block a user