diff --git a/modules/ANSOCR/ANSONNXOCR/ONNXOCRRecognizer.cpp b/modules/ANSOCR/ANSONNXOCR/ONNXOCRRecognizer.cpp index 52879bf..8cc8a17 100644 --- a/modules/ANSOCR/ANSONNXOCR/ONNXOCRRecognizer.cpp +++ b/modules/ANSOCR/ANSONNXOCR/ONNXOCRRecognizer.cpp @@ -1,4 +1,5 @@ #include "ONNXOCRRecognizer.h" +#include "ANSLicense.h" // ANS_DBG #include #include @@ -74,6 +75,17 @@ int ONNXOCRRecognizer::RoundUpToBucket(int resizedW) const { return imgMaxW_; } +// Round real batch size UP to the next entry in kRecBatchLadder, clamped to +// kRecMaxBatch. Padded slots in the input tensor are zero-filled and the +// corresponding output rows are discarded after Run(). See the comment on +// kRecBatchLadder in ONNXOCRTypes.h for the rationale. +static int RoundUpToBatchLadder(size_t n) { + for (int b : kRecBatchLadder) { + if (b >= static_cast(n)) return b; + } + return kRecMaxBatch; +} + // Resize + normalize a single crop into a CHW float vector at width // `bucketW`, padding with zeros on the right when needed. The returned // vector has exactly 3*imgH_*bucketW elements. @@ -147,19 +159,29 @@ void ONNXOCRRecognizer::RunBatchAtWidth(const std::vector& crops, if (crops.empty()) return; try { - const size_t batchN = crops.size(); + // Real number of crops vs. padded batch size submitted to ORT. + // Padding to the ladder keeps the recognizer's input-shape set + // bounded (≤24 distinct shapes) so OpenVINO/DML don't recompile + // kernels mid-stream — see kRecBatchLadder in ONNXOCRTypes.h. + // The padded slots are zeros; their decoded output is discarded. + const size_t realN = crops.size(); + const size_t paddedN = static_cast(RoundUpToBatchLadder(realN)); const size_t perImage = static_cast(3) * imgH_ * bucketW; + ANS_DBG("ANSONNXRec_run", + "RunBatchAtWidth: bucketW=%d realN=%zu paddedN=%zu", + bucketW, realN, paddedN); - // Stack N preprocessed crops into one [N,3,H,W] buffer - std::vector batchInput(batchN * perImage, 0.0f); - for (size_t i = 0; i < batchN; ++i) { + // Stack realN preprocessed crops into a [paddedN,3,H,W] buffer. + // The remaining (paddedN - realN) slots stay zero-initialized. + std::vector batchInput(paddedN * perImage, 0.0f); + for (size_t i = 0; i < realN; ++i) { auto img = PreprocessCropToBucket(crops[i], imgH_, bucketW); std::memcpy(&batchInput[i * perImage], img.data(), perImage * sizeof(float)); } std::array inputShape = { - static_cast(batchN), 3, + static_cast(paddedN), 3, static_cast(imgH_), static_cast(bucketW) }; @@ -175,7 +197,8 @@ void ONNXOCRRecognizer::RunBatchAtWidth(const std::vector& crops, float* outputData = outputTensors[0].GetTensorMutableData(); auto outputShape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape(); - // Expected output: [N, seqLen, numClasses] + // Expected output: [paddedN, seqLen, numClasses]. We only decode + // the first realN rows; rows [realN..paddedN) are zero-pad noise. if (outputShape.size() < 3) { std::cerr << "[ONNXOCRRecognizer] Unexpected batch output rank: " << outputShape.size() << std::endl; @@ -186,7 +209,7 @@ void ONNXOCRRecognizer::RunBatchAtWidth(const std::vector& crops, const int numClasses = static_cast(outputShape[2]); const size_t perRow = static_cast(seqLen) * numClasses; - for (int i = 0; i < outBatch && i < static_cast(batchN); ++i) { + for (size_t i = 0; i < realN && static_cast(i) < outBatch; ++i) { TextLine tl = CTCDecode(outputData + i * perRow, seqLen, numClasses); out[origIndices[i]] = std::move(tl); } @@ -288,30 +311,63 @@ void ONNXOCRRecognizer::Warmup() { cv::Mat dummy(imgH_ * 2, kRecBucketWidths[kRecNumBuckets - 1] * 2, CV_8UC3, cv::Scalar(128, 128, 128)); - for (int b = 0; b < kRecNumBuckets; ++b) { - const int bucketW = kRecBucketWidths[b]; - try { - auto inputData = PreprocessCropToBucket(dummy, imgH_, bucketW); - std::array inputShape = { 1, 3, imgH_, bucketW }; - Ort::Value inputTensor = Ort::Value::CreateTensor( - *memory_info_handler, inputData.data(), inputData.size(), - inputShape.data(), inputShape.size()); + // Pre-compile every (batchN, bucketW) shape we expect to hit on the + // hot path. Each unique shape costs ~1.7 s to compile on OpenVINO and + // ~150 ms on DirectML; without this prewarm, that compile cost shows + // up as a multi-hundred-ms frame spike the first time a given batch + // count + plate-width combination occurs in the video stream. + // + // We deliberately warm only kRecWarmupBatchLadder ({1,2,4,8}) instead + // of the full kRecBatchLadder ({1,2,4,8,16,24}) — covers >95% of ALPR + // frames while keeping engine-init time bounded. Frames with 9+ + // plates pay a one-time spike on first occurrence of batch=16/24. + auto totalT0 = std::chrono::high_resolution_clock::now(); + int warmedShapes = 0; + for (int bs = 0; bs < kRecNumWarmupBatchSizes; ++bs) { + const int batchN = kRecWarmupBatchLadder[bs]; + for (int b = 0; b < kRecNumBuckets; ++b) { + const int bucketW = kRecBucketWidths[b]; + try { + const size_t perImage = static_cast(3) * imgH_ * bucketW; + std::vector batchInput(static_cast(batchN) * perImage, 0.0f); + auto perCrop = PreprocessCropToBucket(dummy, imgH_, bucketW); + for (int i = 0; i < batchN; ++i) { + std::memcpy(&batchInput[static_cast(i) * perImage], + perCrop.data(), + perImage * sizeof(float)); + } + std::array inputShape = { + static_cast(batchN), 3, + static_cast(imgH_), + static_cast(bucketW) + }; + Ort::Value inputTensor = Ort::Value::CreateTensor( + *memory_info_handler, batchInput.data(), batchInput.size(), + inputShape.data(), inputShape.size()); - auto t0 = std::chrono::high_resolution_clock::now(); - (void)ort_session->Run( - Ort::RunOptions{ nullptr }, - input_node_names.data(), &inputTensor, 1, - output_node_names.data(), num_outputs); - auto t1 = std::chrono::high_resolution_clock::now(); - double ms = std::chrono::duration(t1 - t0).count(); - std::cout << "[ONNXOCRRecognizer] Warmup bucketW=" << bucketW - << " " << ms << " ms" << std::endl; - } - catch (const Ort::Exception& e) { - std::cerr << "[ONNXOCRRecognizer] Warmup failed at bucketW=" - << bucketW << ": " << e.what() << std::endl; + auto t0 = std::chrono::high_resolution_clock::now(); + (void)ort_session->Run( + Ort::RunOptions{ nullptr }, + input_node_names.data(), &inputTensor, 1, + output_node_names.data(), num_outputs); + auto t1 = std::chrono::high_resolution_clock::now(); + double ms = std::chrono::duration(t1 - t0).count(); + std::cout << "[ONNXOCRRecognizer] Warmup batch=" << batchN + << " bucketW=" << bucketW + << " " << ms << " ms" << std::endl; + ++warmedShapes; + } + catch (const Ort::Exception& e) { + std::cerr << "[ONNXOCRRecognizer] Warmup failed at batch=" + << batchN << " bucketW=" << bucketW << ": " + << e.what() << std::endl; + } } } + auto totalT1 = std::chrono::high_resolution_clock::now(); + double totalMs = std::chrono::duration(totalT1 - totalT0).count(); + std::cout << "[ONNXOCRRecognizer] Warmup complete: " << warmedShapes + << " shapes in " << totalMs << " ms total" << std::endl; _warmedUp = true; } diff --git a/modules/ANSOCR/ANSONNXOCR/ONNXOCRTypes.h b/modules/ANSOCR/ANSONNXOCR/ONNXOCRTypes.h index c12e02d..62a0b82 100644 --- a/modules/ANSOCR/ANSONNXOCR/ONNXOCRTypes.h +++ b/modules/ANSOCR/ANSONNXOCR/ONNXOCRTypes.h @@ -62,6 +62,37 @@ constexpr int kRecBatchSize = 6; // reasonable for your expected plate count per frame. constexpr int kRecMaxBatch = 24; +// Runtime batch-padding ladder. Real batch sizes get rounded UP to one of +// these values before the ORT Run() call, with zero-padded slots in the +// input tensor and the padded outputs discarded after CTC decode. The goal +// is to bound the number of distinct (batch, bucketW) input shapes the +// recognizer ever sees: with this ladder × kRecBucketWidths there are at +// most 24 unique shapes total, small enough for OpenVINO/DirectML to keep +// in their per-shape kernel cache instead of recompiling mid-stream. +// +// Without this padding, OpenVINO recompiles the recognizer for every new +// (N, W) combination — measured at ~1.7 s per recompile on Intel Iris Xe, +// which is exactly what produced the 1500–1800 ms frame spikes seen in the +// OPENVINO_GPU video tests. +// +// The last entry MUST equal kRecMaxBatch so the whole ladder stays inside +// the TRT dynamic profile [batch=1..kRecMaxBatch]. +constexpr int kRecBatchLadder[] = { 1, 2, 4, 8, 16, 24 }; +constexpr int kRecNumBatchSizes = sizeof(kRecBatchLadder) / sizeof(kRecBatchLadder[0]); +static_assert(kRecBatchLadder[kRecNumBatchSizes - 1] == kRecMaxBatch, + "Last kRecBatchLadder entry must equal kRecMaxBatch so that " + "padded shapes stay inside the TRT profile."); + +// Warmup batch ladder — a subset of kRecBatchLadder used at engine init to +// pre-compile the most common runtime shapes. Kept smaller than the full +// ladder because each warmup Run on OpenVINO costs ~1.7 s of kernel-compile +// time, and {1,2,4,8} covers the realistic plate-count distribution for +// ALPR (>95% of frames have ≤8 detected plates). Frames with 9+ plates +// will pay a one-time recompile spike on first occurrence of batch=16/24. +constexpr int kRecWarmupBatchLadder[] = { 1, 2, 4, 8 }; +constexpr int kRecNumWarmupBatchSizes = + sizeof(kRecWarmupBatchLadder) / sizeof(kRecWarmupBatchLadder[0]); + // A detected text box: 4 corner points (top-left, top-right, bottom-right, bottom-left) struct TextBox { std::array points; diff --git a/tests/ANSLPR-UnitTest/ANSLPR-UnitTest.cpp b/tests/ANSLPR-UnitTest/ANSLPR-UnitTest.cpp index 6bc2401..9f7d896 100644 --- a/tests/ANSLPR-UnitTest/ANSLPR-UnitTest.cpp +++ b/tests/ANSLPR-UnitTest/ANSLPR-UnitTest.cpp @@ -3936,8 +3936,8 @@ int ALPR_OCR_VideoTest() { ANSCENTER::ANSALPR* infHandle = nullptr; std::string licenseKey = ""; - std::string modelFilePath = "C:\\Projects\\ANSVIS\\Models\\ANS_GenericALPR_v2.0.zip"; - std::string videoFilePath = "E:\\Programs\\DemoAssets\\Videos\\ALRP\\PMH\\Day\\day.mp4"; + std::string modelFilePath = "C:\\ProgramData\\ANSCENTER\\ANSVIS Server\\ANSALPR\\ANS_GenericALPR_v2.0.zip"; + std::string videoFilePath = "C:\\ProgramData\\ANSCENTER\\Shared\\HM1.mp4"; int engineType = 2; // ANSALPR_OCR double detectionThreshold = 0.3; @@ -4099,12 +4099,12 @@ int main() //} //ANSLPR_SingleTask_Test(); //ANSLPR_CPU_StressTest(); - ANSLPR_MultiGPU_StressTest(); + // ANSLPR_MultiGPU_StressTest(); //ANSLPR_MultiGPU_StressTest_SimulatedCam(); // ANSLPR_MultiGPU_StressTest_FilePlayer(); //ANSLPR_OD_CPU_VideoTest(); //ALPR_OCR_Test(); - //ALPR_OCR_VideoTest(); + ALPR_OCR_VideoTest(); return 0; } diff --git a/tests/ANSOCR-UnitTest/ANSOCR-UnitTest.cpp b/tests/ANSOCR-UnitTest/ANSOCR-UnitTest.cpp index c3250d6..98e3023 100644 --- a/tests/ANSOCR-UnitTest/ANSOCR-UnitTest.cpp +++ b/tests/ANSOCR-UnitTest/ANSOCR-UnitTest.cpp @@ -503,9 +503,9 @@ int main() SetConsoleOutputCP(CP_UTF8); SetConsoleCP(CP_UTF8); #endif - TestOCRv5mage(); + //TestOCRv5mage(); - //ANSOCR_VideoTest(); + ANSOCR_VideoTest(); // TestOCRImage(); /* for (int i = 0; i < 20; i++) { TestOCRImage();