diff --git a/.claude/settings.local.json b/.claude/settings.local.json index 32e701d..0767e3d 100644 --- a/.claude/settings.local.json +++ b/.claude/settings.local.json @@ -134,7 +134,16 @@ "Bash(python /tmp/apply_fd_guards.py)", "Bash(python /tmp/apply_fd_precheck.py)", "Bash(ls /c/Projects/CLionProjects/ANSCORE/modules/ANSFR/*.cpp /c/Projects/CLionProjects/ANSCORE/modules/ANSFR/*.h)", - "Bash(grep -rn \"lock_guard.*_mutex\\\\|lock_guard.*mutex\" /c/Projects/CLionProjects/ANSCORE/modules/ANSFR/*.cpp)" + "Bash(grep -rn \"lock_guard.*_mutex\\\\|lock_guard.*mutex\" /c/Projects/CLionProjects/ANSCORE/modules/ANSFR/*.cpp)", + "Bash(grep -l \"TensorRT\\\\|tensorrt_provider_factory\\\\|OrtTensorRTProviderOptionsV2\\\\|CreateTensorRTProviderOptions\\\\|UpdateTensorRTProviderOptions\" \"C:/ANSLibs/onnxruntime/include/\"*.h)", + "Bash(cmake --build . --target ONNXEngine ANSOCR ANSLPR ANSLPR-UnitTest --config Release -- -j 4)", + "Bash(cmake --build . --target help)", + "Bash(cmake --build . --target ANSLPR-UnitTest)", + "Bash(cmd.exe /c \"call \\\\\"C:\\\\\\\\Program Files\\\\\\\\Microsoft Visual Studio\\\\\\\\2022\\\\\\\\Community\\\\\\\\VC\\\\\\\\Auxiliary\\\\\\\\Build\\\\\\\\vcvars64.bat\\\\\" >nul 2>&1 && cmake --build . --target ANSLPR-UnitTest 2>&1\")", + "Bash(cmd.exe //c \"cd /d C:\\\\\\\\Projects\\\\\\\\CLionProjects\\\\\\\\ANSCORE\\\\\\\\cmake-build-release && call \\\\\"C:\\\\\\\\Program Files\\\\\\\\Microsoft Visual Studio\\\\\\\\2022\\\\\\\\Community\\\\\\\\VC\\\\\\\\Auxiliary\\\\\\\\Build\\\\\\\\vcvars64.bat\\\\\" && cmake --build . 
--target ANSLPR-UnitTest\")", + "Bash(cmd.exe //c \"C:\\\\\\\\Projects\\\\\\\\CLionProjects\\\\\\\\ANSCORE\\\\\\\\cmake-build-release\\\\\\\\__build_check.bat\")", + "Bash(cmd.exe //c \"tasklist\")", + "Bash(cmd.exe //c \"taskkill /F /PID 45704\")" ] } } diff --git a/engines/ONNXEngine/ONNXEngine.cpp b/engines/ONNXEngine/ONNXEngine.cpp index d5549e3..7a2881b 100644 --- a/engines/ONNXEngine/ONNXEngine.cpp +++ b/engines/ONNXEngine/ONNXEngine.cpp @@ -6,6 +6,8 @@ #include #include #include +#include +#include namespace ANSCENTER { @@ -13,6 +15,40 @@ namespace ANSCENTER { // BasicOrtHandler — constructors // ==================================================================== + std::string BasicOrtHandler::QueryModelInputName(const std::string& onnxPath) + { + try { + // Make sure the Ort API pointer is initialised in THIS DLL. + if (Ort::Global::api_ == nullptr) { + Ort::InitApi(static_cast(EPLoader::GetOrtApiRaw())); + } + + Ort::Env env(ORT_LOGGING_LEVEL_ERROR, "QueryModelInputName"); + Ort::SessionOptions opts; + opts.SetIntraOpNumThreads(1); + opts.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_DISABLE_ALL); + // Intentionally NOT attaching CUDA/TRT EP — CPU is fastest + // for a no-inference metadata read. 
+ + std::wstring wpath(onnxPath.begin(), onnxPath.end()); + Ort::Session session(env, wpath.c_str(), opts); + + Ort::AllocatorWithDefaultOptions alloc; + auto inName = session.GetInputNameAllocated(0, alloc); + return std::string(inName.get()); + } + catch (const Ort::Exception& e) { + std::cerr << "[QueryModelInputName] ORT exception: " + << e.what() << " (path=" << onnxPath << ")" << std::endl; + return ""; + } + catch (const std::exception& e) { + std::cerr << "[QueryModelInputName] std exception: " + << e.what() << " (path=" << onnxPath << ")" << std::endl; + return ""; + } + } + BasicOrtHandler::BasicOrtHandler(const std::string& _onnx_path, unsigned int _num_threads) : log_id(_onnx_path.data()), @@ -36,6 +72,33 @@ namespace ANSCENTER { initialize_handler(); } + BasicOrtHandler::BasicOrtHandler(const std::string& _onnx_path, + EngineType engineType, + const OrtHandlerOptions& options, + unsigned int _num_threads) + : log_id(_onnx_path.data()), + num_threads(_num_threads), + m_engineType(engineType), + m_handlerOptions(options), + onnx_path_w(_onnx_path.begin(), _onnx_path.end()) + { + onnx_path = onnx_path_w.c_str(); + initialize_handler(); + } + + BasicOrtHandler::BasicOrtHandler(const std::string& _onnx_path, + const OrtHandlerOptions& options, + unsigned int _num_threads) + : log_id(_onnx_path.data()), + num_threads(_num_threads), + m_engineType(static_cast(-1)), // EPLoader auto-detect + m_handlerOptions(options), + onnx_path_w(_onnx_path.begin(), _onnx_path.end()) + { + onnx_path = onnx_path_w.c_str(); + initialize_handler(); + } + BasicOrtHandler::~BasicOrtHandler() { if (ort_session) { @@ -66,11 +129,15 @@ namespace ANSCENTER { // - arena_extend_strategy = 1 (kSameAsRequested) to avoid // pre-allocating huge GPU memory blocks that may exceed VRAM // - cudnn_conv_algo_search = HEURISTIC for faster session init - // - cudnn_conv_use_max_workspace = 0 — use minimal cuDNN workspace - // to prevent CUDNN_BACKEND_API_FAILED when TRT engines already - // 
occupy most VRAM on the same GPU + // - cudnn_conv_use_max_workspace defaults to "0" to prevent + // CUDNN_BACKEND_API_FAILED when TRT engines already occupy + // most VRAM on the same GPU. OCR sub-models that need fast + // convs opt into "1" via OrtHandlerOptions::useMaxCudnnWorkspace // - gpu_mem_limit — cap ONNX Runtime's GPU memory arena to 2 GB // so it doesn't compete with TensorRT for the remaining VRAM + const char* maxWorkspace = + m_handlerOptions.useMaxCudnnWorkspace ? "1" : "0"; + const char* keys[] = { "device_id", "arena_extend_strategy", @@ -82,7 +149,7 @@ namespace ANSCENTER { "0", "1", // kSameAsRequested "HEURISTIC", // avoid exhaustive algo search on large model - "0", // minimal cuDNN workspace (prevents OOM) + maxWorkspace, // "1" for OCR (perf), "0" elsewhere (safety) "2147483648" // 2 GB arena limit }; Ort::GetApi().UpdateCUDAProviderOptions( @@ -92,7 +159,8 @@ namespace ANSCENTER { Ort::GetApi().ReleaseCUDAProviderOptions(cuda_options); std::cout << "[ORT] CUDA EP attached (arena=SameAsRequested, " - "cudnn=HEURISTIC, maxWorkspace=0, memLimit=2GB)." << std::endl; + "cudnn=HEURISTIC, maxWorkspace=" << maxWorkspace + << ", memLimit=2GB)." << std::endl; return true; } catch (const Ort::Exception& e) { @@ -100,6 +168,113 @@ namespace ANSCENTER { return false; } } + + bool BasicOrtHandler::TryAppendTensorRT(Ort::SessionOptions& session_options) + { + try { + OrtTensorRTProviderOptionsV2* trt_options = nullptr; + Ort::GetApi().CreateTensorRTProviderOptions(&trt_options); + + // Cache built engines on disk so subsequent runs skip the + // multi-minute build. Engines are keyed on (model hash, GPU + // arch, shape profile) so changing any of those triggers + // a rebuild automatically. 
+ std::string cacheDir = m_handlerOptions.trtEngineCacheDir; + if (cacheDir.empty()) { + // %TEMP%\ANSCENTER\TRTEngineCache + const char* tmp = std::getenv("TEMP"); + if (!tmp) tmp = std::getenv("TMP"); + if (!tmp) tmp = "."; + std::filesystem::path p(tmp); + p /= "ANSCENTER"; + p /= "TRTEngineCache"; + std::error_code ec; + std::filesystem::create_directories(p, ec); + cacheDir = p.string(); + } + + // Builder options tuned for *fast first-run*: + // - opt_level 1: builds in seconds, ~5–10 % runtime cost vs 3 + // - workspace 1 GB: leaves room for CUDA EP arena and the + // LPD's own TRT engine on the same GPU + // - timing cache: persists kernel timings between runs so + // builds at new shapes get progressively faster + // - profile shapes (if set): build ONE dynamic-shape + // engine that handles all (batch, width) combos instead + // of rebuilding per unique input. Critical for variable + // batch workloads — without this, TRT EP rebuilds every + // time runtime sees a new shape pair, causing 60-90 s + // hangs mid-stream. + std::filesystem::path timingCachePath = + std::filesystem::path(cacheDir) / "timing.cache"; + std::string timingCacheStr = timingCachePath.string(); + + const bool haveProfile = !m_handlerOptions.trtProfileMinShapes.empty() + && !m_handlerOptions.trtProfileOptShapes.empty() + && !m_handlerOptions.trtProfileMaxShapes.empty(); + + // Build the key/value arrays. We always set the first 8 keys; + // the profile shapes are appended only when provided. + std::vector keys = { + "device_id", + "trt_fp16_enable", + "trt_engine_cache_enable", + "trt_engine_cache_path", + "trt_max_workspace_size", + "trt_builder_optimization_level", + "trt_timing_cache_enable", + "trt_timing_cache_path" + }; + std::vector values = { + "0", + m_handlerOptions.trtFP16 ? 
"1" : "0", + "1", + cacheDir.c_str(), + "1073741824", // 1 GB build workspace + "1", // fast build (was "3") + "1", + cacheDir.c_str() + }; + + if (haveProfile) { + keys.push_back("trt_profile_min_shapes"); + values.push_back(m_handlerOptions.trtProfileMinShapes.c_str()); + keys.push_back("trt_profile_opt_shapes"); + values.push_back(m_handlerOptions.trtProfileOptShapes.c_str()); + keys.push_back("trt_profile_max_shapes"); + values.push_back(m_handlerOptions.trtProfileMaxShapes.c_str()); + } + + Ort::GetApi().UpdateTensorRTProviderOptions( + trt_options, keys.data(), values.data(), keys.size()); + + session_options.AppendExecutionProvider_TensorRT_V2(*trt_options); + Ort::GetApi().ReleaseTensorRTProviderOptions(trt_options); + + std::cout << "[ORT] TensorRT EP attached (fp16=" + << (m_handlerOptions.trtFP16 ? "1" : "0") + << ", cache=" << cacheDir + << ", profile=" << (haveProfile ? "dynamic" : "static") + << ")." << std::endl; + if (haveProfile) { + std::cout << "[ORT] profile min: " + << m_handlerOptions.trtProfileMinShapes << std::endl + << "[ORT] profile opt: " + << m_handlerOptions.trtProfileOptShapes << std::endl + << "[ORT] profile max: " + << m_handlerOptions.trtProfileMaxShapes << std::endl; + } + return true; + } + catch (const Ort::Exception& e) { + std::cerr << "[ORT] TensorRT EP failed: " << e.what() << std::endl; + return false; + } + catch (const std::exception& e) { + std::cerr << "[ORT] TensorRT EP failed (std): " << e.what() << std::endl; + return false; + } + } bool BasicOrtHandler::TryAppendDirectML(Ort::SessionOptions& session_options) { try { @@ -267,9 +442,28 @@ namespace ANSCENTER { { // -------------------------------------------------------- case EngineType::NVIDIA_GPU: + // Try TensorRT EP first when explicitly requested. Falls + // through to CUDA EP if TRT is missing or option creation + // fails. Both EPs may be attached at once — ORT picks TRT + // for nodes it supports and CUDA for the rest. 
+ if (m_handlerOptions.preferTensorRT + && hasProvider("TensorrtExecutionProvider")) { + ANS_DBG("OrtHandler", "Trying TensorRT EP..."); + if (TryAppendTensorRT(session_options)) { + epAttached = true; + } + else { + std::cerr << "[ORT] TensorRT EP attach failed — " + "falling back to CUDA EP." << std::endl; + } + } + ANS_DBG("OrtHandler", "Trying CUDA EP..."); - if (hasProvider("CUDAExecutionProvider")) - epAttached = TryAppendCUDA(session_options); + if (hasProvider("CUDAExecutionProvider")) { + if (TryAppendCUDA(session_options)) { + epAttached = true; + } + } if (!epAttached) { std::cerr << "[ORT] CUDA EP unavailable — falling back to CPU." << std::endl; diff --git a/engines/ONNXEngine/ONNXEngine.h b/engines/ONNXEngine/ONNXEngine.h index ed957b5..44f7418 100644 --- a/engines/ONNXEngine/ONNXEngine.h +++ b/engines/ONNXEngine/ONNXEngine.h @@ -248,6 +248,46 @@ namespace ANSCENTER { return std::string(ort_session->GetOutputNameAllocated(index, allocator).get()); } + // ==================================================================== + // High-perf options for OCR sub-models that need TRT EP and full + // cuDNN workspace. Default-constructed = identical to the legacy + // behavior (CUDA EP only, minimal cuDNN workspace). + // ==================================================================== + struct OrtHandlerOptions { + // Try to attach TensorRT EP before CUDA EP (NVIDIA only). + // Falls back to CUDA EP automatically if TRT EP creation or session + // creation fails. Engines are cached on disk for fast reload. + bool preferTensorRT = false; + + // Use the largest cuDNN conv workspace. cuDNN can then pick fast + // algorithms (Winograd, implicit-precomp-GEMM with big workspaces). + // Defaults off because some deployments share VRAM with TRT engines + // and need the minimal-workspace mode to avoid OOM. + bool useMaxCudnnWorkspace = false; + + // Where to cache built TRT engines. Empty → default + // %TEMP%/ANSCENTER/TRTEngineCache. 
Only used when preferTensorRT. + std::string trtEngineCacheDir; + + // FP16 builds for TRT EP. Recommended for inference; ignored if + // preferTensorRT is false. + bool trtFP16 = true; + + // Dynamic-shape profile for TRT EP. When set, TRT builds ONE + // engine that handles every input shape in the [min..max] range + // instead of rebuilding per unique shape. Critical for models + // that see many (batch_size, spatial) combinations at runtime. + // + // Format: "input_name:d0xd1xd2xd3[,input2:...]" + // e.g. "x:1x3x48x320" for batch=1, C=3, H=48, W=320 + // + // All three fields must be set together. An empty min implies + // no profile (fall back to static-shape-per-unique-input mode). + std::string trtProfileMinShapes; + std::string trtProfileOptShapes; + std::string trtProfileMaxShapes; + }; + // ==================================================================== // BasicOrtHandler // ==================================================================== @@ -280,6 +320,9 @@ namespace ANSCENTER { const unsigned int num_threads; EngineType m_engineType; + // Per-session high-perf options. Default = legacy behavior. + OrtHandlerOptions m_handlerOptions; + protected: // Default: hardware auto-detection via ANSLicenseHelper through EPLoader explicit BasicOrtHandler(const std::string& _onnx_path, @@ -290,6 +333,19 @@ namespace ANSCENTER { EngineType engineType, unsigned int _num_threads = 1); + // Engine override + per-session high-perf options (TRT EP, max + // cuDNN workspace, etc.). Used by OCR sub-models that need + // shape-stable, high-throughput inference. + explicit BasicOrtHandler(const std::string& _onnx_path, + EngineType engineType, + const OrtHandlerOptions& options, + unsigned int _num_threads = 1); + + // Auto-detect engine via EPLoader, but with high-perf options. 
+ explicit BasicOrtHandler(const std::string& _onnx_path, + const OrtHandlerOptions& options, + unsigned int _num_threads = 1); + virtual ~BasicOrtHandler(); BasicOrtHandler(const BasicOrtHandler&) = delete; @@ -298,6 +354,13 @@ namespace ANSCENTER { // Resolved EP type (after EPLoader fallback). Subclasses use this // to branch on actual EP at inference time. EngineType getEngineType() const { return m_engineType; } + + // Spin up a tiny CPU-only ORT session just long enough to read + // the name of the model's first input, then tear it down. Used + // by callers that need to build TRT profile-shape strings + // (which require the input name) BEFORE the real session is + // created. Returns an empty string on failure. + static std::string QueryModelInputName(const std::string& onnxPath); private: void initialize_handler(); protected: @@ -306,6 +369,7 @@ namespace ANSCENTER { // EP-specific session option builders bool TryAppendCUDA(Ort::SessionOptions& opts); + bool TryAppendTensorRT(Ort::SessionOptions& opts); bool TryAppendDirectML(Ort::SessionOptions& opts); bool TryAppendOpenVINO(Ort::SessionOptions& opts); }; diff --git a/modules/ANSLPR/ANSLPR_OCR.cpp b/modules/ANSLPR/ANSLPR_OCR.cpp index f7f0cfb..a2f8383 100644 --- a/modules/ANSLPR/ANSLPR_OCR.cpp +++ b/modules/ANSLPR/ANSLPR_OCR.cpp @@ -363,10 +363,14 @@ namespace ANSCENTER ocrModelConfig.ocrLanguage = ocrLang; ocrModelConfig.useDetector = true; ocrModelConfig.useRecognizer = true; - ocrModelConfig.useCLS = true; + // Skip the angle classifier for ALPR. License-plate boxes + // from the YOLO detector are already axis-aligned, so the + // 180° classifier is dead weight (one extra ORT call per + // plate for no recall gain). 
+ ocrModelConfig.useCLS = false; ocrModelConfig.useLayout = false; ocrModelConfig.useTable = false; - ocrModelConfig.useTensorRT = false; + ocrModelConfig.useTensorRT = true; ocrModelConfig.enableMKLDNN = false; ocrModelConfig.useDilation = true; ocrModelConfig.useAngleCLS = false; @@ -375,7 +379,7 @@ namespace ANSCENTER ocrModelConfig.detectionBoxThreshold = 0.3; ocrModelConfig.detectionDBUnclipRatio = 1.2; ocrModelConfig.clsThreshold = 0.9; - ocrModelConfig.limitSideLen = 2560; + ocrModelConfig.limitSideLen = 480; // Pass the original ALPR model zip path — ANSOCRBase::Initialize // will extract it to the same folder (already done, so extraction @@ -638,41 +642,104 @@ namespace ANSCENTER return {}; } - std::vector output; - output.reserve(lprOutput.size()); + // Step 2: Collect crops from every valid plate. Wide plates + // (aspect >= 2.0) are treated as a single text line; narrow + // plates (2-row layouts like Japanese) are split horizontally + // at H/2 into top and bottom rows. All crops go through a + // single batched recognizer call, bypassing the OCR text-line + // detector entirely — for ALPR the LP YOLO box already bounds + // the text region precisely. 
+ struct PlateInfo { + size_t origIndex; // into lprOutput + std::vector cropIndices; // into allCrops + cv::Mat plateROI; // full (unsplit) ROI, kept for colour + }; + std::vector allCrops; + std::vector plateInfos; + allCrops.reserve(lprOutput.size() * 2); + plateInfos.reserve(lprOutput.size()); - for (auto& lprObject : lprOutput) { - const cv::Rect& box = lprObject.box; + for (size_t i = 0; i < lprOutput.size(); ++i) { + const cv::Rect& box = lprOutput[i].box; // Calculate safe cropped region const int x1 = std::max(0, box.x); const int y1 = std::max(0, box.y); - const int width = std::min(frameWidth - x1, box.width); + const int width = std::min(frameWidth - x1, box.width); const int height = std::min(frameHeight - y1, box.height); if (width <= 0 || height <= 0) continue; - cv::Rect lprPos(x1, y1, width, height); - cv::Mat plateROI = frame(lprPos); + cv::Mat plateROI = frame(cv::Rect(x1, y1, width, height)); - // Step 2: Run OCR on the detected plate - std::string ocrText = RunOCROnPlate(plateROI, cameraId); + PlateInfo info; + info.origIndex = i; + info.plateROI = plateROI; - if (ocrText.empty()) continue; + const float aspect = static_cast(width) / + std::max(1, height); + // 2-row heuristic: aspect < 2.0 → split top/bottom. + // Threshold tuned to catch Japanese square plates + // (~1.5–1.9) while leaving wide EU/VN plates (3.0+) + // untouched. + if (aspect < 2.0f && height >= 24) { + const int halfH = height / 2; + info.cropIndices.push_back(allCrops.size()); + allCrops.push_back(plateROI(cv::Rect(0, 0, width, halfH))); + info.cropIndices.push_back(allCrops.size()); + allCrops.push_back(plateROI(cv::Rect(0, halfH, width, height - halfH))); + } + else { + info.cropIndices.push_back(allCrops.size()); + allCrops.push_back(plateROI); + } + + plateInfos.push_back(std::move(info)); + } + + if (allCrops.empty()) { + return {}; + } + + // Step 3: Single batched recognizer call for every crop. 
+ // ONNXOCRRecognizer groups crops by bucket width and issues + // one ORT Run per bucket — typically 1–2 GPU calls for an + // entire frame regardless of plate count. + auto ocrResults = _ocrEngine->RecognizeTextBatch(allCrops); + + // Step 4: Assemble per-plate output + std::vector output; + output.reserve(plateInfos.size()); + + for (const auto& info : plateInfos) { + std::string combinedText; + for (size_t cropIdx : info.cropIndices) { + if (cropIdx >= ocrResults.size()) continue; + const std::string& lineText = ocrResults[cropIdx].first; + if (lineText.empty()) continue; + if (!combinedText.empty()) combinedText += " "; + combinedText += lineText; + } + if (combinedText.empty()) continue; + + Object lprObject = lprOutput[info.origIndex]; lprObject.cameraId = cameraId; - // Use ALPRChecker for text stabilization if enabled + // Cross-frame stabilization (unchanged) if (_enableALPRChecker) { - lprObject.className = alprChecker.checkPlateByTrackId(cameraId, ocrText, lprObject.trackId); - } else { - lprObject.className = ocrText; + lprObject.className = alprChecker.checkPlateByTrackId( + cameraId, combinedText, lprObject.trackId); + } + else { + lprObject.className = combinedText; } if (lprObject.className.empty()) continue; - // Step 3: Colour detection (optional) - std::string colour = DetectLPColourCached(plateROI, cameraId, lprObject.className); + // Optional colour detection on the full plate ROI + std::string colour = DetectLPColourCached( + info.plateROI, cameraId, lprObject.className); if (!colour.empty()) { lprObject.extraInfo = "color:" + colour; } diff --git a/modules/ANSOCR/ANSOCRBase.h b/modules/ANSOCR/ANSOCRBase.h index 185f7fe..66d06e4 100644 --- a/modules/ANSOCR/ANSOCRBase.h +++ b/modules/ANSOCR/ANSOCRBase.h @@ -159,6 +159,18 @@ namespace ANSCENTER { // Returns recognized text and confidence. Default returns empty. 
virtual std::pair RecognizeText(const cv::Mat& croppedImage) { return {"", 0.0f}; } + // Batch recognition — skips the text-line detector entirely and + // runs the whole batch through a single ORT call when possible. + // Default implementation falls back to per-image RecognizeText + // so existing subclasses keep working without changes. + virtual std::vector> RecognizeTextBatch( + const std::vector& croppedImages) { + std::vector> out; + out.reserve(croppedImages.size()); + for (const auto& m : croppedImages) out.push_back(RecognizeText(m)); + return out; + } + // ALPR configuration methods void SetOCRMode(OCRMode mode); OCRMode GetOCRMode() const; diff --git a/modules/ANSOCR/ANSONNXOCR/ONNXOCRClassifier.cpp b/modules/ANSOCR/ANSONNXOCR/ONNXOCRClassifier.cpp index 47e8008..8babdea 100644 --- a/modules/ANSOCR/ANSONNXOCR/ONNXOCRClassifier.cpp +++ b/modules/ANSOCR/ANSONNXOCR/ONNXOCRClassifier.cpp @@ -4,6 +4,7 @@ #include #include #include +#include namespace ANSCENTER { namespace onnxocr { @@ -12,6 +13,12 @@ ONNXOCRClassifier::ONNXOCRClassifier(const std::string& onnx_path, unsigned int : BasicOrtHandler(onnx_path, num_threads) { } +ONNXOCRClassifier::ONNXOCRClassifier(const std::string& onnx_path, + const OrtHandlerOptions& options, + unsigned int num_threads) + : BasicOrtHandler(onnx_path, options, num_threads) { +} + Ort::Value ONNXOCRClassifier::transform(const cv::Mat& mat) { cv::Mat resized; // Direct resize to 80x160 (PP-LCNet_x1_0_textline_ori) @@ -103,5 +110,38 @@ void ONNXOCRClassifier::Classify(std::vector& img_list, } } +void ONNXOCRClassifier::Warmup() { + std::lock_guard lock(_mutex); + if (_warmedUp || !ort_session) return; + + try { + cv::Mat dummy(kClsImageH * 2, kClsImageW * 2, CV_8UC3, cv::Scalar(128, 128, 128)); + cv::Mat resized; + cv::resize(dummy, resized, cv::Size(kClsImageW, kClsImageH)); + resized.convertTo(resized, CV_32FC3); + auto inputData = NormalizeAndPermute(resized); + + std::array inputShape = { 1, 3, kClsImageH, kClsImageW }; + 
Ort::Value inputTensor = Ort::Value::CreateTensor( + *memory_info_handler, inputData.data(), inputData.size(), + inputShape.data(), inputShape.size()); + + auto t0 = std::chrono::high_resolution_clock::now(); + (void)ort_session->Run( + Ort::RunOptions{ nullptr }, + input_node_names.data(), &inputTensor, 1, + output_node_names.data(), num_outputs); + auto t1 = std::chrono::high_resolution_clock::now(); + double ms = std::chrono::duration(t1 - t0).count(); + std::cout << "[ONNXOCRClassifier] Warmup [1,3," + << kClsImageH << "," << kClsImageW << "] " + << ms << " ms" << std::endl; + } + catch (const Ort::Exception& e) { + std::cerr << "[ONNXOCRClassifier] Warmup failed: " << e.what() << std::endl; + } + _warmedUp = true; +} + } // namespace onnxocr } // namespace ANSCENTER diff --git a/modules/ANSOCR/ANSONNXOCR/ONNXOCRClassifier.h b/modules/ANSOCR/ANSONNXOCR/ONNXOCRClassifier.h index 9bd5535..1147f43 100644 --- a/modules/ANSOCR/ANSONNXOCR/ONNXOCRClassifier.h +++ b/modules/ANSOCR/ANSONNXOCR/ONNXOCRClassifier.h @@ -11,6 +11,9 @@ namespace onnxocr { class ONNXOCRClassifier : public BasicOrtHandler { public: explicit ONNXOCRClassifier(const std::string& onnx_path, unsigned int num_threads = 1); + explicit ONNXOCRClassifier(const std::string& onnx_path, + const OrtHandlerOptions& options, + unsigned int num_threads = 1); ~ONNXOCRClassifier() override = default; // Classify text orientation for a list of cropped images @@ -21,7 +24,12 @@ public: std::vector& cls_scores, float cls_thresh = kClsThresh); + // Pre-warm cuDNN/TRT for the classifier's fixed [1,3,80,160] shape. + // Idempotent — no-op after the first call. 
+ void Warmup(); + private: + bool _warmedUp = false; Ort::Value transform(const cv::Mat& mat) override; Ort::Value transformBatch(const std::vector& images) override; diff --git a/modules/ANSOCR/ANSONNXOCR/ONNXOCRDetector.cpp b/modules/ANSOCR/ANSONNXOCR/ONNXOCRDetector.cpp index 616cecf..a8a5c5b 100644 --- a/modules/ANSOCR/ANSONNXOCR/ONNXOCRDetector.cpp +++ b/modules/ANSOCR/ANSONNXOCR/ONNXOCRDetector.cpp @@ -7,6 +7,7 @@ #include #include #include +#include namespace ANSCENTER { namespace onnxocr { @@ -15,6 +16,12 @@ ONNXOCRDetector::ONNXOCRDetector(const std::string& onnx_path, unsigned int num_ : BasicOrtHandler(onnx_path, num_threads) { } +ONNXOCRDetector::ONNXOCRDetector(const std::string& onnx_path, + const OrtHandlerOptions& options, + unsigned int num_threads) + : BasicOrtHandler(onnx_path, options, num_threads) { +} + Ort::Value ONNXOCRDetector::transform(const cv::Mat& mat) { // Not used directly - detection uses custom Preprocess + manual tensor creation // Provided to satisfy BasicOrtHandler pure virtual @@ -308,5 +315,41 @@ std::vector ONNXOCRDetector::UnclipPolygon(const std::array lock(_mutex); + if (_warmedUp || !ort_session) return; + + // 320x320 covers the typical license-plate ROI after LPD crop + + // multiple-of-32 rounding. cuDNN caches the algorithm for this + // shape so the first real inference doesn't pay the picker cost. 
+ constexpr int kWarmupSide = 320; + try { + cv::Mat dummy(kWarmupSide, kWarmupSide, CV_8UC3, cv::Scalar(128, 128, 128)); + cv::Mat dummyF; + dummy.convertTo(dummyF, CV_32FC3); + auto inputData = NormalizeAndPermute(dummyF); + + std::array inputShape = { 1, 3, kWarmupSide, kWarmupSide }; + Ort::Value inputTensor = Ort::Value::CreateTensor( + *memory_info_handler, inputData.data(), inputData.size(), + inputShape.data(), inputShape.size()); + + auto t0 = std::chrono::high_resolution_clock::now(); + (void)ort_session->Run( + Ort::RunOptions{ nullptr }, + input_node_names.data(), &inputTensor, 1, + output_node_names.data(), num_outputs); + auto t1 = std::chrono::high_resolution_clock::now(); + double ms = std::chrono::duration(t1 - t0).count(); + std::cout << "[ONNXOCRDetector] Warmup [1,3," + << kWarmupSide << "," << kWarmupSide << "] " + << ms << " ms" << std::endl; + } + catch (const Ort::Exception& e) { + std::cerr << "[ONNXOCRDetector] Warmup failed: " << e.what() << std::endl; + } + _warmedUp = true; +} + } // namespace onnxocr } // namespace ANSCENTER diff --git a/modules/ANSOCR/ANSONNXOCR/ONNXOCRDetector.h b/modules/ANSOCR/ANSONNXOCR/ONNXOCRDetector.h index 02d5a5b..d8620c5 100644 --- a/modules/ANSOCR/ANSONNXOCR/ONNXOCRDetector.h +++ b/modules/ANSOCR/ANSONNXOCR/ONNXOCRDetector.h @@ -11,6 +11,9 @@ namespace onnxocr { class ONNXOCRDetector : public BasicOrtHandler { public: explicit ONNXOCRDetector(const std::string& onnx_path, unsigned int num_threads = 1); + explicit ONNXOCRDetector(const std::string& onnx_path, + const OrtHandlerOptions& options, + unsigned int num_threads = 1); ~ONNXOCRDetector() override = default; // Run text detection on an image @@ -21,7 +24,12 @@ public: float unclipRatio = kDetUnclipRatio, bool useDilation = false); + // Pre-warm cuDNN/TRT at a canonical 320x320 input so the first real + // call doesn't pay the algorithm-selection tax. Idempotent. 
+ void Warmup(); + private: + bool _warmedUp = false; Ort::Value transform(const cv::Mat& mat) override; Ort::Value transformBatch(const std::vector& images) override; diff --git a/modules/ANSOCR/ANSONNXOCR/ONNXOCRRecognizer.cpp b/modules/ANSOCR/ANSONNXOCR/ONNXOCRRecognizer.cpp index 0fe881b..190a30a 100644 --- a/modules/ANSOCR/ANSONNXOCR/ONNXOCRRecognizer.cpp +++ b/modules/ANSOCR/ANSONNXOCR/ONNXOCRRecognizer.cpp @@ -7,6 +7,7 @@ #include #include #include +#include namespace ANSCENTER { namespace onnxocr { @@ -15,6 +16,12 @@ ONNXOCRRecognizer::ONNXOCRRecognizer(const std::string& onnx_path, unsigned int : BasicOrtHandler(onnx_path, num_threads) { } +ONNXOCRRecognizer::ONNXOCRRecognizer(const std::string& onnx_path, + const OrtHandlerOptions& options, + unsigned int num_threads) + : BasicOrtHandler(onnx_path, options, num_threads) { +} + bool ONNXOCRRecognizer::LoadDictionary(const std::string& dictPath) { keys_ = LoadDict(dictPath); if (keys_.size() < 2) { @@ -46,6 +53,54 @@ Ort::Value ONNXOCRRecognizer::transformBatch(const std::vector& images) return Ort::Value(nullptr); } +// ---------------------------------------------------------------------------- +// Width buckets — every recognizer input is padded up to one of these widths +// before reaching ORT. This bounds the number of distinct shapes cuDNN ever +// sees to four, so its HEURISTIC algorithm cache hits on every subsequent +// call instead of re-tuning per plate. 
Buckets cover the realistic range: +// 320 px → short Latin/Japanese plates (most common) +// 480 px → wider Latin plates with two rows of text +// 640 px → long single-row plates / multi-line stacked text +// 960 px → safety upper bound (== kRecImgMaxW) +// ---------------------------------------------------------------------------- +static constexpr int kRecBucketWidths[] = { 320, 480, 640, 960 }; +static constexpr int kRecNumBuckets = sizeof(kRecBucketWidths) / sizeof(kRecBucketWidths[0]); + +int ONNXOCRRecognizer::RoundUpToBucket(int resizedW) const { + const int capped = std::min(resizedW, imgMaxW_); + for (int b = 0; b < kRecNumBuckets; ++b) { + if (kRecBucketWidths[b] >= capped) return kRecBucketWidths[b]; + } + return imgMaxW_; +} + +// Resize + normalize a single crop into a CHW float vector at width +// `bucketW`, padding with zeros on the right when needed. The returned +// vector has exactly 3*imgH_*bucketW elements. +static std::vector PreprocessCropToBucket(const cv::Mat& crop, + int imgH, int bucketW) { + cv::Mat resized = ResizeRecImage(crop, imgH, bucketW); + int resizedW = resized.cols; + resized.convertTo(resized, CV_32FC3); + auto normalizedData = NormalizeAndPermuteCls(resized); + + if (resizedW == bucketW) { + return normalizedData; + } + + // Zero-pad on the right (CHW layout) + std::vector padded(3 * imgH * bucketW, 0.0f); + for (int c = 0; c < 3; c++) { + for (int y = 0; y < imgH; y++) { + std::memcpy( + &padded[c * imgH * bucketW + y * bucketW], + &normalizedData[c * imgH * resizedW + y * resizedW], + resizedW * sizeof(float)); + } + } + return padded; +} + TextLine ONNXOCRRecognizer::Recognize(const cv::Mat& croppedImage) { std::lock_guard lock(_mutex); @@ -54,52 +109,27 @@ TextLine ONNXOCRRecognizer::Recognize(const cv::Mat& croppedImage) { } try { - // Preprocess: resize to fixed height, proportional width + // Step 1: aspect-preserving resize to height=imgH_, width capped + // at imgMaxW_. 
Then round resized width up to the next bucket. cv::Mat resized = ResizeRecImage(croppedImage, imgH_, imgMaxW_); - int resizedW = resized.cols; + const int bucketW = RoundUpToBucket(resized.cols); - resized.convertTo(resized, CV_32FC3); - // Recognition uses (pixel/255 - 0.5) / 0.5 normalization (same as classifier) - auto normalizedData = NormalizeAndPermuteCls(resized); + std::vector inputData = PreprocessCropToBucket(croppedImage, imgH_, bucketW); - // Pad to at least kRecImgW width (matching official PaddleOCR behavior) - // Official PaddleOCR: padding_im = np.zeros((C, H, W)), then copies normalized - // image into left portion. Padding value = 0.0 in normalized space. - int imgW = std::max(resizedW, kRecImgW); - - std::vector inputData; - if (imgW > resizedW) { - // Zero-pad on the right (CHW layout) - inputData.resize(3 * imgH_ * imgW, 0.0f); - for (int c = 0; c < 3; c++) { - for (int y = 0; y < imgH_; y++) { - std::memcpy( - &inputData[c * imgH_ * imgW + y * imgW], - &normalizedData[c * imgH_ * resizedW + y * resizedW], - resizedW * sizeof(float)); - } - } - } else { - inputData = std::move(normalizedData); - } - - // Create input tensor with (possibly padded) width - std::array inputShape = { 1, 3, imgH_, imgW }; + std::array inputShape = { 1, 3, imgH_, bucketW }; Ort::Value inputTensor = Ort::Value::CreateTensor( *memory_info_handler, inputData.data(), inputData.size(), inputShape.data(), inputShape.size()); - // Run inference auto outputTensors = ort_session->Run( Ort::RunOptions{ nullptr }, input_node_names.data(), &inputTensor, 1, output_node_names.data(), num_outputs); - // Get output float* outputData = outputTensors[0].GetTensorMutableData(); auto outputShape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape(); - int seqLen = static_cast(outputShape[1]); + int seqLen = static_cast(outputShape[1]); int numClasses = static_cast(outputShape[2]); return CTCDecode(outputData, seqLen, numClasses); @@ -110,18 +140,162 @@ TextLine 
ONNXOCRRecognizer::Recognize(const cv::Mat& croppedImage) { } } -std::vector ONNXOCRRecognizer::RecognizeBatch(const std::vector& croppedImages) { - std::vector results; - results.reserve(croppedImages.size()); +void ONNXOCRRecognizer::RunBatchAtWidth(const std::vector& crops, + const std::vector& origIndices, + int bucketW, + std::vector& out) { + if (crops.empty()) return; - // Process one at a time (dynamic width per image) - for (size_t i = 0; i < croppedImages.size(); i++) { - results.push_back(Recognize(croppedImages[i])); + try { + const size_t batchN = crops.size(); + const size_t perImage = static_cast(3) * imgH_ * bucketW; + + // Stack N preprocessed crops into one [N,3,H,W] buffer + std::vector batchInput(batchN * perImage, 0.0f); + for (size_t i = 0; i < batchN; ++i) { + auto img = PreprocessCropToBucket(crops[i], imgH_, bucketW); + std::memcpy(&batchInput[i * perImage], img.data(), + perImage * sizeof(float)); + } + + std::array inputShape = { + static_cast(batchN), 3, + static_cast(imgH_), + static_cast(bucketW) + }; + Ort::Value inputTensor = Ort::Value::CreateTensor( + *memory_info_handler, batchInput.data(), batchInput.size(), + inputShape.data(), inputShape.size()); + + auto outputTensors = ort_session->Run( + Ort::RunOptions{ nullptr }, + input_node_names.data(), &inputTensor, 1, + output_node_names.data(), num_outputs); + + float* outputData = outputTensors[0].GetTensorMutableData(); + auto outputShape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape(); + + // Expected output: [N, seqLen, numClasses] + if (outputShape.size() < 3) { + std::cerr << "[ONNXOCRRecognizer] Unexpected batch output rank: " + << outputShape.size() << std::endl; + return; + } + const int outBatch = static_cast(outputShape[0]); + const int seqLen = static_cast(outputShape[1]); + const int numClasses = static_cast(outputShape[2]); + const size_t perRow = static_cast(seqLen) * numClasses; + + for (int i = 0; i < outBatch && i < static_cast(batchN); ++i) { + TextLine 
tl = CTCDecode(outputData + i * perRow, seqLen, numClasses); + out[origIndices[i]] = std::move(tl); + } + } + catch (const Ort::Exception& e) { + // ORT will throw if the model doesn't support a batch dimension > 1. + // Fall back to per-image inference for this group. + std::cerr << "[ONNXOCRRecognizer] Batch inference failed at bucketW=" + << bucketW << " (" << e.what() + << ") — falling back to single-image path." << std::endl; + for (size_t i = 0; i < crops.size(); ++i) { + // Direct call (we already hold _mutex via the public RecognizeBatch + // wrapper). Replicate the single-image preprocessing here to avoid + // re-entering Recognize() and double-locking the mutex. + try { + cv::Mat resized = ResizeRecImage(crops[i], imgH_, imgMaxW_); + int singleBucket = RoundUpToBucket(resized.cols); + auto inputData = PreprocessCropToBucket(crops[i], imgH_, singleBucket); + std::array inputShape = { 1, 3, imgH_, singleBucket }; + Ort::Value inputTensor = Ort::Value::CreateTensor( + *memory_info_handler, inputData.data(), inputData.size(), + inputShape.data(), inputShape.size()); + auto outputTensors = ort_session->Run( + Ort::RunOptions{ nullptr }, + input_node_names.data(), &inputTensor, 1, + output_node_names.data(), num_outputs); + float* outData = outputTensors[0].GetTensorMutableData(); + auto outShape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape(); + int seqLen = static_cast(outShape[1]); + int numClasses = static_cast(outShape[2]); + out[origIndices[i]] = CTCDecode(outData, seqLen, numClasses); + } catch (const Ort::Exception& e2) { + std::cerr << "[ONNXOCRRecognizer] Single-image fallback also failed: " + << e2.what() << std::endl; + out[origIndices[i]] = {}; + } + } + } +} + +std::vector ONNXOCRRecognizer::RecognizeBatch(const std::vector& croppedImages) { + std::lock_guard lock(_mutex); + + std::vector results(croppedImages.size()); + if (!ort_session || croppedImages.empty() || keys_.empty()) { + return results; + } + + // Group crops by their target 
bucket width + std::vector> groupCrops(kRecNumBuckets); + std::vector> groupIdx(kRecNumBuckets); + + for (size_t i = 0; i < croppedImages.size(); ++i) { + if (croppedImages[i].empty()) continue; + cv::Mat resized = ResizeRecImage(croppedImages[i], imgH_, imgMaxW_); + const int bw = RoundUpToBucket(resized.cols); + // Find bucket index + int bucketIdx = kRecNumBuckets - 1; + for (int b = 0; b < kRecNumBuckets; ++b) { + if (kRecBucketWidths[b] == bw) { bucketIdx = b; break; } + } + groupCrops[bucketIdx].push_back(croppedImages[i]); + groupIdx[bucketIdx].push_back(i); + } + + // Run one batched inference per non-empty bucket + for (int b = 0; b < kRecNumBuckets; ++b) { + if (groupCrops[b].empty()) continue; + RunBatchAtWidth(groupCrops[b], groupIdx[b], kRecBucketWidths[b], results); } return results; } +void ONNXOCRRecognizer::Warmup() { + std::lock_guard lock(_mutex); + if (_warmedUp || !ort_session) return; + + // Dummy 3-channel image, mid-grey, large enough to resize to imgH_ + cv::Mat dummy(imgH_ * 2, kRecBucketWidths[kRecNumBuckets - 1] * 2, + CV_8UC3, cv::Scalar(128, 128, 128)); + + for (int b = 0; b < kRecNumBuckets; ++b) { + const int bucketW = kRecBucketWidths[b]; + try { + auto inputData = PreprocessCropToBucket(dummy, imgH_, bucketW); + std::array inputShape = { 1, 3, imgH_, bucketW }; + Ort::Value inputTensor = Ort::Value::CreateTensor( + *memory_info_handler, inputData.data(), inputData.size(), + inputShape.data(), inputShape.size()); + + auto t0 = std::chrono::high_resolution_clock::now(); + (void)ort_session->Run( + Ort::RunOptions{ nullptr }, + input_node_names.data(), &inputTensor, 1, + output_node_names.data(), num_outputs); + auto t1 = std::chrono::high_resolution_clock::now(); + double ms = std::chrono::duration(t1 - t0).count(); + std::cout << "[ONNXOCRRecognizer] Warmup bucketW=" << bucketW + << " " << ms << " ms" << std::endl; + } + catch (const Ort::Exception& e) { + std::cerr << "[ONNXOCRRecognizer] Warmup failed at bucketW=" + << bucketW << 
": " << e.what() << std::endl; + } + } + _warmedUp = true; +} + TextLine ONNXOCRRecognizer::CTCDecode(const float* outputData, int seqLen, int numClasses) { TextLine result; std::string text; diff --git a/modules/ANSOCR/ANSONNXOCR/ONNXOCRRecognizer.h b/modules/ANSOCR/ANSONNXOCR/ONNXOCRRecognizer.h index a8292f2..5b3159a 100644 --- a/modules/ANSOCR/ANSONNXOCR/ONNXOCRRecognizer.h +++ b/modules/ANSOCR/ANSONNXOCR/ONNXOCRRecognizer.h @@ -12,6 +12,9 @@ namespace onnxocr { class ONNXOCRRecognizer : public BasicOrtHandler { public: explicit ONNXOCRRecognizer(const std::string& onnx_path, unsigned int num_threads = 1); + explicit ONNXOCRRecognizer(const std::string& onnx_path, + const OrtHandlerOptions& options, + unsigned int num_threads = 1); ~ONNXOCRRecognizer() override = default; // Load character dictionary (must be called before Recognize) @@ -20,13 +23,31 @@ public: // Recognize text from a single cropped text image TextLine Recognize(const cv::Mat& croppedImage); - // Batch recognition for multiple cropped images + // Batch recognition for multiple cropped images. + // Crops are grouped into a small set of fixed width buckets and + // submitted to ORT as [N,3,imgH_,bucketW] tensors so cuDNN sees + // shape-stable inputs and can reuse algorithms across calls. std::vector RecognizeBatch(const std::vector& croppedImages); + // Pre-warm cuDNN/TRT for every bucket width by running dummy + // inferences. Idempotent — no-op if already warmed up. + void Warmup(); + private: Ort::Value transform(const cv::Mat& mat) override; Ort::Value transformBatch(const std::vector& images) override; + // Round resizedW up to the next bucket width (capped at imgMaxW_). + // Used by both Recognize() and RecognizeBatch() so cuDNN only ever + // sees a small finite set of input shapes. + int RoundUpToBucket(int resizedW) const; + + // Run a single [N,3,imgH_,bucketW] inference and CTC-decode each row. 
+ void RunBatchAtWidth(const std::vector& crops, + const std::vector& origIndices, + int bucketW, + std::vector& out); + // CTC greedy decode TextLine CTCDecode(const float* outputData, int seqLen, int numClasses); @@ -34,6 +55,7 @@ private: int imgH_ = kRecImgH; int imgMaxW_ = kRecImgMaxW; std::mutex _mutex; + bool _warmedUp = false; }; } // namespace onnxocr diff --git a/modules/ANSOCR/ANSONNXOCR/ONNXOCRTypes.h b/modules/ANSOCR/ANSONNXOCR/ONNXOCRTypes.h index fe4856b..5f07f2c 100644 --- a/modules/ANSOCR/ANSONNXOCR/ONNXOCRTypes.h +++ b/modules/ANSOCR/ANSONNXOCR/ONNXOCRTypes.h @@ -88,11 +88,22 @@ inline std::vector LoadDict(const std::string& dictPath) { return keys; } -// Compute resize dimensions for detection model (multiples of 32) +// Compute resize dimensions for detection model. // limit_type='max': scale down if max side > maxSideLen (PP-OCRv5 server default) // maxSideLimit: safety cap on final max dimension (default 4000) +// +// Each dimension is rounded UP to a multiple of kDetSizeBucket (96). The +// coarse granularity is deliberate: cuDNN HEURISTIC has to re-select +// convolution algorithms every time it sees a new input shape, and that +// selection costs ~100 ms per shape. With multiples of 32, a typical ALPR +// run produces 30+ unique detector shapes; with multiples of 96 that drops +// to 5–10, which cuDNN can cache and reuse for the rest of the video. +// 96 is divisible by the DBNet down-stride of 32, so feature-map sizes +// stay integer. 
inline cv::Size ComputeDetResizeShape(int srcH, int srcW, int maxSideLen, int maxSideLimit = kDetMaxSideLimit) { + constexpr int kDetSizeBucket = 96; + float ratio = 1.0f; int maxSide = std::max(srcH, srcW); if (maxSide > maxSideLen) { @@ -108,8 +119,12 @@ inline cv::Size ComputeDetResizeShape(int srcH, int srcW, int maxSideLen, newW = static_cast(newW * clampRatio); } - newH = std::max(32, static_cast(std::round(newH / 32.0) * 32)); - newW = std::max(32, static_cast(std::round(newW / 32.0) * 32)); + auto roundUpToBucket = [](int x) { + return std::max(kDetSizeBucket, + ((x + kDetSizeBucket - 1) / kDetSizeBucket) * kDetSizeBucket); + }; + newH = roundUpToBucket(newH); + newW = roundUpToBucket(newW); return cv::Size(newW, newH); } diff --git a/modules/ANSOCR/ANSONNXOCR/PaddleOCRV5Engine.cpp b/modules/ANSOCR/ANSONNXOCR/PaddleOCRV5Engine.cpp index 9216154..79fb541 100644 --- a/modules/ANSOCR/ANSONNXOCR/PaddleOCRV5Engine.cpp +++ b/modules/ANSOCR/ANSONNXOCR/PaddleOCRV5Engine.cpp @@ -11,13 +11,75 @@ namespace onnxocr { bool PaddleOCRV5Engine::Initialize(const std::string& detModelPath, const std::string& clsModelPath, const std::string& recModelPath, - const std::string& dictPath) { + const std::string& dictPath, + bool preferTensorRT) { std::lock_guard lock(_mutex); ModelLoadingGuard mlg(_modelLoading); + // High-perf options. The OCR sub-models split into two groups: + // + // 1. Detector — its input shape varies continuously with every + // plate-ROI aspect ratio. TRT EP is a poor fit because it + // builds a fresh engine for each unique shape (minutes each). + // We keep it on CUDA EP with a conservative cuDNN workspace and + // let cuDNN HEURISTIC handle the per-shape algo selection. + // + // 2. Classifier + Recognizer — fixed-bucket shapes (cls is + // [1,3,80,160], rec is [1,3,48,{320,480,640,960}]). These + // benefit massively from TRT EP because the engine is built + // once per shape and reused forever.
+ OrtHandlerOptions detectorOpts; + // Detector uses CUDA EP with *conservative* cuDNN workspace. + // Empirical: on VRAM-constrained GPUs (LPD TRT engine + rec TRT + // engine + ORT arena in play) the max-workspace mode causes cuDNN + // to pick Winograd/implicit-precomp-GEMM variants that silently + // fall back to slow NO-WORKSPACE algorithms when the big workspace + // can't be allocated. With "0" cuDNN picks algorithms that are + // known to fit and runs ~10x faster in practice. + detectorOpts.useMaxCudnnWorkspace = false; + detectorOpts.preferTensorRT = false; // never TRT for the detector + + // Classifier (fixed [1,3,80,160]): TRT with no profile is fine. + OrtHandlerOptions classifierOpts; + classifierOpts.useMaxCudnnWorkspace = true; + classifierOpts.preferTensorRT = preferTensorRT; + classifierOpts.trtFP16 = true; + + // Recognizer: needs a DYNAMIC profile so one TRT engine covers every + // (batch, bucket_width) pair we generate at runtime. Without this, + // each new shape triggers a ~80s engine rebuild mid-stream when a + // new plate appears or the plate count changes. + // + // Profile range: + // batch : 1 .. 16 (16 plates worth of crops is generous) + // H : 48 (fixed) + // W : 320 .. 960 (covers all 4 recognizer buckets) + // + // Query the actual input name from the .onnx file instead of + // hardcoding — PaddleOCR usually exports it as "x" but the name can + // vary across model versions. 
+ OrtHandlerOptions recognizerOpts; + recognizerOpts.useMaxCudnnWorkspace = true; + recognizerOpts.preferTensorRT = preferTensorRT; + recognizerOpts.trtFP16 = true; + if (preferTensorRT) { + std::string recInputName = BasicOrtHandler::QueryModelInputName(recModelPath); + if (recInputName.empty()) { + std::cerr << "[PaddleOCRV5Engine] Could not query recognizer " + "input name — defaulting to 'x'" << std::endl; + recInputName = "x"; + } + std::cout << "[PaddleOCRV5Engine] Recognizer input name: '" + << recInputName << "' — building TRT dynamic profile " + << "[batch=1..16, W=320..960]" << std::endl; + recognizerOpts.trtProfileMinShapes = recInputName + ":1x3x48x320"; + recognizerOpts.trtProfileOptShapes = recInputName + ":4x3x48x480"; + recognizerOpts.trtProfileMaxShapes = recInputName + ":16x3x48x960"; + } + try { // Initialize detector (also triggers EPLoader init in BasicOrtHandler) - detector_ = std::make_unique(detModelPath); + detector_ = std::make_unique(detModelPath, detectorOpts); std::cout << "[PaddleOCRV5Engine] Detector initialized: " << detModelPath << std::endl; // Ensure this DLL's copy of Ort::Global::api_ is initialized. 
@@ -29,7 +91,7 @@ bool PaddleOCRV5Engine::Initialize(const std::string& detModelPath, // Initialize classifier (optional) if (!clsModelPath.empty()) { - classifier_ = std::make_unique(clsModelPath); + classifier_ = std::make_unique(clsModelPath, classifierOpts); std::cout << "[PaddleOCRV5Engine] Classifier initialized: " << clsModelPath << std::endl; } else { @@ -38,13 +100,26 @@ bool PaddleOCRV5Engine::Initialize(const std::string& detModelPath, } // Initialize recognizer - recognizer_ = std::make_unique(recModelPath); + recognizer_ = std::make_unique(recModelPath, recognizerOpts); if (!recognizer_->LoadDictionary(dictPath)) { std::cerr << "[PaddleOCRV5Engine] Failed to load dictionary" << std::endl; return false; } std::cout << "[PaddleOCRV5Engine] Recognizer initialized: " << recModelPath << std::endl; + // Pre-warm classifier (fixed [1,3,80,160]) and recognizer (4 + // bucket widths) so the first frame doesn't pay the cuDNN/TRT + // algorithm-selection tax. The detector is intentionally NOT + // warmed up: its input shape varies continuously with each + // plate-ROI aspect ratio, so a warmup at any single canonical + // shape would cost minutes (TRT) or be useless (CUDA cache miss + // on the real frame anyway). Real frames will pay the per-shape + // cuDNN HEURISTIC cost on first use. + std::cout << "[PaddleOCRV5Engine] Warming up OCR pipeline..." 
<< std::endl; + if (classifier_) classifier_->Warmup(); + if (recognizer_) recognizer_->Warmup(); + std::cout << "[PaddleOCRV5Engine] Warmup complete" << std::endl; + _initialized = true; std::cout << "[PaddleOCRV5Engine] Pipeline initialized successfully" << std::endl; return true; @@ -140,5 +215,18 @@ TextLine PaddleOCRV5Engine::recognizeOnly(const cv::Mat& croppedImage) { return recognizer_->Recognize(croppedImage); } +std::vector PaddleOCRV5Engine::recognizeMany(const std::vector& croppedImages) { + if (_modelLoading.load()) return std::vector(croppedImages.size()); + { + auto lk = TryLockWithTimeout("PaddleOCRV5Engine::recognizeMany"); + if (!lk.owns_lock()) return std::vector(croppedImages.size()); + if (!_initialized || !recognizer_ || croppedImages.empty()) { + return std::vector(croppedImages.size()); + } + } + // Delegates to the bucketed, batched path in ONNXOCRRecognizer. + return recognizer_->RecognizeBatch(croppedImages); +} + } // namespace onnxocr } // namespace ANSCENTER diff --git a/modules/ANSOCR/ANSONNXOCR/PaddleOCRV5Engine.h b/modules/ANSOCR/ANSONNXOCR/PaddleOCRV5Engine.h index a7ec21f..6ac906e 100644 --- a/modules/ANSOCR/ANSONNXOCR/PaddleOCRV5Engine.h +++ b/modules/ANSOCR/ANSONNXOCR/PaddleOCRV5Engine.h @@ -25,10 +25,13 @@ public: // Initialize the OCR pipeline // clsModelPath can be empty to skip classification + // preferTensorRT: try TensorRT EP first for the classifier and + // recognizer (the detector always stays on the CUDA EP) bool Initialize(const std::string& detModelPath, const std::string& clsModelPath, const std::string& recModelPath, - const std::string& dictPath); + const std::string& dictPath, + bool preferTensorRT = false); // Run full OCR pipeline on an image // Returns results matching PaddleOCR::OCRPredictResult format @@ -37,6 +40,14 @@ public: // Run recognizer only on a pre-cropped text image (no detection step) TextLine recognizeOnly(const cv::Mat& croppedImage); + // Run recognizer only on a batch of pre-cropped text
images in a + // single batched ORT inference. Skips the detector entirely — the + // caller is expected to supply crops that are already roughly + // axis-aligned single-line text (e.g. ALPR plate ROIs, optionally + // pre-split into rows). Crops are grouped by bucket width, so a + // single call to this function typically issues 1–2 ORT Runs total. + std::vector recognizeMany(const std::vector& croppedImages); + // Configuration setters (matching OCRModelConfig parameters) void SetDetMaxSideLen(int val) { _maxSideLen = val; } void SetDetDbThresh(float val) { _detDbThresh = val; } diff --git a/modules/ANSOCR/ANSOnnxOCR.cpp b/modules/ANSOCR/ANSOnnxOCR.cpp index 6ed1031..763aed6 100644 --- a/modules/ANSOCR/ANSOnnxOCR.cpp +++ b/modules/ANSOCR/ANSOnnxOCR.cpp @@ -50,7 +50,8 @@ bool ANSONNXOCR::Initialize(const std::string& licenseKey, OCRModelConfig modelC _modelConfig.detectionModelFile, clsModelPath, _modelConfig.recognizerModelFile, - _modelConfig.recogizerCharDictionaryPath); + _modelConfig.recogizerCharDictionaryPath, + _modelConfig.useTensorRT); return _isInitialized; } @@ -391,4 +392,16 @@ std::pair ANSONNXOCR::RecognizeText(const cv::Mat& croppedIm return {result.text, result.score}; } +std::vector> ANSONNXOCR::RecognizeTextBatch( + const std::vector& croppedImages) { + std::vector> out(croppedImages.size(), {"", 0.0f}); + if (!_isInitialized || !_engine || croppedImages.empty()) return out; + + auto lines = _engine->recognizeMany(croppedImages); + for (size_t i = 0; i < lines.size() && i < out.size(); ++i) { + out[i] = { lines[i].text, lines[i].score }; + } + return out; +} + } // namespace ANSCENTER diff --git a/modules/ANSOCR/ANSOnnxOCR.h b/modules/ANSOCR/ANSOnnxOCR.h index b6b1f17..902ff3a 100644 --- a/modules/ANSOCR/ANSOnnxOCR.h +++ b/modules/ANSOCR/ANSOnnxOCR.h @@ -24,6 +24,8 @@ namespace ANSCENTER { std::vector RunInference(const cv::Mat& input, const std::vector& Bbox, const std::string& cameraId) override; std::pair RecognizeText(const cv::Mat& 
croppedImage) override; + std::vector> RecognizeTextBatch( + const std::vector& croppedImages) override; ~ANSONNXOCR(); bool Destroy() override; diff --git a/tests/ANSCV-UnitTest/ANSCV-UnitTest.cpp b/tests/ANSCV-UnitTest/ANSCV-UnitTest.cpp index e0f0bbb..d91791b 100644 --- a/tests/ANSCV-UnitTest/ANSCV-UnitTest.cpp +++ b/tests/ANSCV-UnitTest/ANSCV-UnitTest.cpp @@ -1367,8 +1367,8 @@ int TestGetImage() { } int GenerateVideo() { std::string imageFolder = "E:\\Programs\\DemoAssets\\ImageSeries\\20260413_152604.321"; - std::string outputVideoPath = "E:\\Programs\\DemoAssets\\ImageSeries\\output1.mp4"; - int conversionResult = ANSCV_ImagesToMP4_S(imageFolder.c_str(), outputVideoPath.c_str(), 0, 5); + std::string outputVideoPath = "E:\\Programs\\DemoAssets\\ImageSeries\\output3.mp4"; + int conversionResult = ANSCV_ImagesToMP4_S(imageFolder.c_str(), outputVideoPath.c_str(), 0,20); if (!conversionResult) { std::cerr << "Failed to convert images to MP4." << std::endl; return -1; diff --git a/tests/ANSLPR-UnitTest/ANSLPR-UnitTest.cpp b/tests/ANSLPR-UnitTest/ANSLPR-UnitTest.cpp index cee072b..fd4715f 100644 --- a/tests/ANSLPR-UnitTest/ANSLPR-UnitTest.cpp +++ b/tests/ANSLPR-UnitTest/ANSLPR-UnitTest.cpp @@ -3805,6 +3805,149 @@ int ALPR_OCR_Test() { return 0; } +int ALPR_OCR_VideoTest() { + std::cout << "=== ALPR_OCR_VideoTest: ANSALPR_OCR engine on video ===" << std::endl; + std::filesystem::path currentPath = std::filesystem::current_path(); + std::cout << "Current working directory: " << currentPath << std::endl; + + ANSCENTER::ANSALPR* infHandle = nullptr; + std::string licenseKey = ""; + std::string modelFilePath = "C:\\Projects\\ANSVIS\\Models\\ANS_GenericALPR_v2.0.zip"; + std::string videoFilePath = "E:\\Programs\\DemoAssets\\Videos\\ALRP\\PMH\\Day\\day.mp4"; + + int engineType = 2; // ANSALPR_OCR + double detectionThreshold = 0.3; + double ocrThreshold = 0.5; + double colourThreshold = 0.0; + + // Step 1: Create handle + int createResult = CreateANSALPRHandle(&infHandle, 
licenseKey.c_str(), + modelFilePath.c_str(), "", engineType, detectionThreshold, ocrThreshold, colourThreshold); + std::cout << "CreateANSALPRHandle result: " << createResult << std::endl; + if (!createResult || !infHandle) { + std::cerr << "Failed to create ANSALPR_OCR handle" << std::endl; + return -1; + } + + // Step 2: Set country (JAPAN = 5 — adjust to match the dataset if needed) + ANSALPR_SetCountry(&infHandle, 5); + std::cout << "Country set to JAPAN" << std::endl; + + // Step 3: Load engine + auto engineStart = std::chrono::high_resolution_clock::now(); + int loadResult = LoadANSALPREngineHandle(&infHandle); + auto engineEnd = std::chrono::high_resolution_clock::now(); + double engineMs = std::chrono::duration(engineEnd - engineStart).count(); + std::cout << "LoadANSALPREngineHandle result: " << loadResult << " (" << engineMs << " ms)" << std::endl; + if (!loadResult) { + std::cerr << "Failed to load ANSALPR_OCR engine" << std::endl; + ReleaseANSALPRHandle(&infHandle); + return -2; + } + + // Step 4: Open video + cv::VideoCapture capture(videoFilePath); + if (!capture.isOpened()) { + std::cerr << "Could not open video file: " << videoFilePath << std::endl; + ReleaseANSALPRHandle(&infHandle); + return -3; + } + + boost::property_tree::ptree pt; + int frameIdx = 0; + + while (true) { + cv::Mat frame; + if (!capture.read(frame)) { + std::cout << "\nEnd of video stream.\n"; + break; + } + ++frameIdx; + + int width = frame.cols; + int height = frame.rows; + + // Convert to raw BGR bytes for ANSALPR_RunInferenceBinary + unsigned int bufferLength = static_cast(frame.total() * frame.elemSize()); + unsigned char* imageBytes = new unsigned char[bufferLength]; + std::memcpy(imageBytes, frame.data, bufferLength); + + auto t0 = std::chrono::high_resolution_clock::now(); + std::string detectionResult = ANSALPR_RunInferenceBinary(&infHandle, imageBytes, width, height); + auto t1 = std::chrono::high_resolution_clock::now(); + double inferMs = std::chrono::duration(t1 - 
t0).count(); + delete[] imageBytes; + + printf("Frame %d: %.2f ms (%.1f FPS)\n", frameIdx, inferMs, + inferMs > 0.0 ? (1000.0 / inferMs) : 0.0); + + // Draw detections + if (!detectionResult.empty()) { + try { + pt.clear(); + std::stringstream ss(detectionResult); + boost::property_tree::read_json(ss, pt); + BOOST_FOREACH(const boost::property_tree::ptree::value_type& child, pt.get_child("results")) { + const boost::property_tree::ptree& res = child.second; + const auto class_name_raw = GetData(res, "class_name"); + const std::string class_name = DecodeUnicodeEscapes(class_name_raw); + const auto x = GetData(res, "x"); + const auto y = GetData(res, "y"); + const auto w = GetData(res, "width"); + const auto h = GetData(res, "height"); + + cv::rectangle(frame, cv::Rect(x, y, w, h), cv::Scalar(0, 255, 0), 2); + + std::string extraInfo = GetOptionalValue(res, "extra_info", ""); + if (!class_name.empty()) { + std::cout << " Plate: " << class_name; + if (!extraInfo.empty()) std::cout << " (" << extraInfo << ")"; + std::cout << std::endl; + } + +#ifdef WIN32 + { + int textH = (int)(1.5 * 30); + int ty = y - 5 - textH; + if (ty < 0) ty = y + 3; + putTextUnicode(frame, class_name, cv::Point(x, ty), + 1.5, cv::Scalar(0, 0, 255), 3); + } +#else + cv::putText(frame, class_name, cv::Point(x, y - 5), + cv::FONT_HERSHEY_SIMPLEX, 1.0, cv::Scalar(0, 0, 255), 2, cv::LINE_AA); +#endif + } + } + catch (const std::exception& e) { + std::cerr << "JSON parse error: " << e.what() << std::endl; + } + } + + // Display (fit to 1920x1080) + cv::Mat display; + double scale = std::min(1920.0 / frame.cols, 1080.0 / frame.rows); + if (scale < 1.0) { + cv::resize(frame, display, cv::Size(), scale, scale); + } else { + display = frame; + } + cv::namedWindow("ALPR_OCR_VideoTest", cv::WINDOW_AUTOSIZE); + cv::imshow("ALPR_OCR_VideoTest", display); + if (cv::waitKey(1) == 27) { // ESC to exit + std::cout << "ESC pressed — stopping.\n"; + break; + } + } + + capture.release(); + cv::destroyAllWindows(); 
+ ReleaseANSALPRHandle(&infHandle); + + std::cout << "=== ALPR_OCR_VideoTest complete ===" << std::endl; + return 0; +} + int main() { #ifdef WIN32 @@ -3825,7 +3968,8 @@ int main() //ANSLPR_MultiGPU_StressTest_SimulatedCam(); // ANSLPR_MultiGPU_StressTest_FilePlayer(); //ANSLPR_OD_CPU_VideoTest(); - ALPR_OCR_Test(); + //ALPR_OCR_Test(); + ALPR_OCR_VideoTest(); return 0; }