Improve ALPR_OCR performance

This commit is contained in:
2026-04-14 20:30:21 +10:00
parent 3349b45ade
commit f9a0af8949
18 changed files with 991 additions and 77 deletions

View File

@@ -134,7 +134,16 @@
"Bash(python /tmp/apply_fd_guards.py)", "Bash(python /tmp/apply_fd_guards.py)",
"Bash(python /tmp/apply_fd_precheck.py)", "Bash(python /tmp/apply_fd_precheck.py)",
"Bash(ls /c/Projects/CLionProjects/ANSCORE/modules/ANSFR/*.cpp /c/Projects/CLionProjects/ANSCORE/modules/ANSFR/*.h)", "Bash(ls /c/Projects/CLionProjects/ANSCORE/modules/ANSFR/*.cpp /c/Projects/CLionProjects/ANSCORE/modules/ANSFR/*.h)",
"Bash(grep -rn \"lock_guard.*_mutex\\\\|lock_guard.*mutex\" /c/Projects/CLionProjects/ANSCORE/modules/ANSFR/*.cpp)" "Bash(grep -rn \"lock_guard.*_mutex\\\\|lock_guard.*mutex\" /c/Projects/CLionProjects/ANSCORE/modules/ANSFR/*.cpp)",
"Bash(grep -l \"TensorRT\\\\|tensorrt_provider_factory\\\\|OrtTensorRTProviderOptionsV2\\\\|CreateTensorRTProviderOptions\\\\|UpdateTensorRTProviderOptions\" \"C:/ANSLibs/onnxruntime/include/\"*.h)",
"Bash(cmake --build . --target ONNXEngine ANSOCR ANSLPR ANSLPR-UnitTest --config Release -- -j 4)",
"Bash(cmake --build . --target help)",
"Bash(cmake --build . --target ANSLPR-UnitTest)",
"Bash(cmd.exe /c \"call \\\\\"C:\\\\\\\\Program Files\\\\\\\\Microsoft Visual Studio\\\\\\\\2022\\\\\\\\Community\\\\\\\\VC\\\\\\\\Auxiliary\\\\\\\\Build\\\\\\\\vcvars64.bat\\\\\" >nul 2>&1 && cmake --build . --target ANSLPR-UnitTest 2>&1\")",
"Bash(cmd.exe //c \"cd /d C:\\\\\\\\Projects\\\\\\\\CLionProjects\\\\\\\\ANSCORE\\\\\\\\cmake-build-release && call \\\\\"C:\\\\\\\\Program Files\\\\\\\\Microsoft Visual Studio\\\\\\\\2022\\\\\\\\Community\\\\\\\\VC\\\\\\\\Auxiliary\\\\\\\\Build\\\\\\\\vcvars64.bat\\\\\" && cmake --build . --target ANSLPR-UnitTest\")",
"Bash(cmd.exe //c \"C:\\\\\\\\Projects\\\\\\\\CLionProjects\\\\\\\\ANSCORE\\\\\\\\cmake-build-release\\\\\\\\__build_check.bat\")",
"Bash(cmd.exe //c \"tasklist\")",
"Bash(cmd.exe //c \"taskkill /F /PID 45704\")"
] ]
} }
} }

View File

@@ -6,6 +6,8 @@
#include <limits> #include <limits>
#include <filesystem> #include <filesystem>
#include <fstream> #include <fstream>
#include <cstdlib>
#include <system_error>
namespace ANSCENTER { namespace ANSCENTER {
@@ -13,6 +15,40 @@ namespace ANSCENTER {
// BasicOrtHandler — constructors // BasicOrtHandler — constructors
// ==================================================================== // ====================================================================
// Spin up a tiny CPU-only ORT session just long enough to read the
// model's first input name, then tear everything down when the locals
// go out of scope. Returns "" on any failure (logged to stderr).
// Used by callers that must build TRT profile-shape strings (which
// need the input name) before the real session exists.
std::string BasicOrtHandler::QueryModelInputName(const std::string& onnxPath)
{
try {
// Make sure the Ort API pointer is initialised in THIS DLL.
if (Ort::Global<void>::api_ == nullptr) {
Ort::InitApi(static_cast<const OrtApi*>(EPLoader::GetOrtApiRaw()));
}
Ort::Env env(ORT_LOGGING_LEVEL_ERROR, "QueryModelInputName");
Ort::SessionOptions opts;
// Single thread + no graph optimization: we never run inference,
// so session-creation cost is all that matters here.
opts.SetIntraOpNumThreads(1);
opts.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_DISABLE_ALL);
// Intentionally NOT attaching CUDA/TRT EP — CPU is fastest
// for a no-inference metadata read.
// NOTE(review): byte-widening narrow→wide conversion is only correct
// for ASCII paths; a UTF-8 model path would be mangled. Confirm model
// paths are ASCII, or switch to a proper conversion.
std::wstring wpath(onnxPath.begin(), onnxPath.end());
Ort::Session session(env, wpath.c_str(), opts);
Ort::AllocatorWithDefaultOptions alloc;
auto inName = session.GetInputNameAllocated(0, alloc);
return std::string(inName.get());
}
catch (const Ort::Exception& e) {
std::cerr << "[QueryModelInputName] ORT exception: "
<< e.what() << " (path=" << onnxPath << ")" << std::endl;
return "";
}
catch (const std::exception& e) {
std::cerr << "[QueryModelInputName] std exception: "
<< e.what() << " (path=" << onnxPath << ")" << std::endl;
return "";
}
}
BasicOrtHandler::BasicOrtHandler(const std::string& _onnx_path, BasicOrtHandler::BasicOrtHandler(const std::string& _onnx_path,
unsigned int _num_threads) unsigned int _num_threads)
: log_id(_onnx_path.data()), : log_id(_onnx_path.data()),
@@ -36,6 +72,33 @@ namespace ANSCENTER {
initialize_handler(); initialize_handler();
} }
// Ctor with explicit engine override plus per-session high-perf options
// (TRT EP preference, cuDNN workspace, shape profiles).
// NOTE(review): log_id stores _onnx_path.data() — a pointer into the
// caller's string buffer. Confirm the caller's string outlives this
// handler (same pattern as the legacy ctor).
BasicOrtHandler::BasicOrtHandler(const std::string& _onnx_path,
EngineType engineType,
const OrtHandlerOptions& options,
unsigned int _num_threads)
: log_id(_onnx_path.data()),
num_threads(_num_threads),
m_engineType(engineType),
m_handlerOptions(options),
// Byte-widening narrow→wide copy — ASCII paths only (see
// QueryModelInputName for the same caveat).
onnx_path_w(_onnx_path.begin(), _onnx_path.end())
{
onnx_path = onnx_path_w.c_str();
initialize_handler();
}
// Ctor with per-session high-perf options but no engine override:
// m_engineType is set to the sentinel -1, which initialize_handler()
// resolves via EPLoader auto-detection (per the inline comment below).
BasicOrtHandler::BasicOrtHandler(const std::string& _onnx_path,
const OrtHandlerOptions& options,
unsigned int _num_threads)
: log_id(_onnx_path.data()),
num_threads(_num_threads),
m_engineType(static_cast<EngineType>(-1)), // EPLoader auto-detect
m_handlerOptions(options),
onnx_path_w(_onnx_path.begin(), _onnx_path.end())
{
onnx_path = onnx_path_w.c_str();
initialize_handler();
}
BasicOrtHandler::~BasicOrtHandler() BasicOrtHandler::~BasicOrtHandler()
{ {
if (ort_session) { if (ort_session) {
@@ -66,11 +129,15 @@ namespace ANSCENTER {
// - arena_extend_strategy = 1 (kSameAsRequested) to avoid // - arena_extend_strategy = 1 (kSameAsRequested) to avoid
// pre-allocating huge GPU memory blocks that may exceed VRAM // pre-allocating huge GPU memory blocks that may exceed VRAM
// - cudnn_conv_algo_search = HEURISTIC for faster session init // - cudnn_conv_algo_search = HEURISTIC for faster session init
// - cudnn_conv_use_max_workspace = 0 — use minimal cuDNN workspace // - cudnn_conv_use_max_workspace defaults to "0" to prevent
// to prevent CUDNN_BACKEND_API_FAILED when TRT engines already // CUDNN_BACKEND_API_FAILED when TRT engines already occupy
// occupy most VRAM on the same GPU // most VRAM on the same GPU. OCR sub-models that need fast
// convs opt into "1" via OrtHandlerOptions::useMaxCudnnWorkspace
// - gpu_mem_limit — cap ONNX Runtime's GPU memory arena to 2 GB // - gpu_mem_limit — cap ONNX Runtime's GPU memory arena to 2 GB
// so it doesn't compete with TensorRT for the remaining VRAM // so it doesn't compete with TensorRT for the remaining VRAM
const char* maxWorkspace =
m_handlerOptions.useMaxCudnnWorkspace ? "1" : "0";
const char* keys[] = { const char* keys[] = {
"device_id", "device_id",
"arena_extend_strategy", "arena_extend_strategy",
@@ -82,7 +149,7 @@ namespace ANSCENTER {
"0", "0",
"1", // kSameAsRequested "1", // kSameAsRequested
"HEURISTIC", // avoid exhaustive algo search on large model "HEURISTIC", // avoid exhaustive algo search on large model
"0", // minimal cuDNN workspace (prevents OOM) maxWorkspace, // "1" for OCR (perf), "0" elsewhere (safety)
"2147483648" // 2 GB arena limit "2147483648" // 2 GB arena limit
}; };
Ort::GetApi().UpdateCUDAProviderOptions( Ort::GetApi().UpdateCUDAProviderOptions(
@@ -92,7 +159,8 @@ namespace ANSCENTER {
Ort::GetApi().ReleaseCUDAProviderOptions(cuda_options); Ort::GetApi().ReleaseCUDAProviderOptions(cuda_options);
std::cout << "[ORT] CUDA EP attached (arena=SameAsRequested, " std::cout << "[ORT] CUDA EP attached (arena=SameAsRequested, "
"cudnn=HEURISTIC, maxWorkspace=0, memLimit=2GB)." << std::endl; "cudnn=HEURISTIC, maxWorkspace=" << maxWorkspace
<< ", memLimit=2GB)." << std::endl;
return true; return true;
} }
catch (const Ort::Exception& e) { catch (const Ort::Exception& e) {
@@ -100,6 +168,113 @@ namespace ANSCENTER {
return false; return false;
} }
} }
// Attach the TensorRT execution provider to `session_options`.
// Returns true on success; on any failure it logs and returns false so
// the caller can fall back to the CUDA EP (or CPU).
bool BasicOrtHandler::TryAppendTensorRT(Ort::SessionOptions& session_options)
{
// Declared outside the try block so the catch handlers can release it.
// (Previously the options struct leaked if Update/Append threw.)
OrtTensorRTProviderOptionsV2* trt_options = nullptr;
try {
Ort::GetApi().CreateTensorRTProviderOptions(&trt_options);
// Cache built engines on disk so subsequent runs skip the
// multi-minute build. Engines are keyed on (model hash, GPU
// arch, shape profile) so changing any of those triggers
// a rebuild automatically.
std::string cacheDir = m_handlerOptions.trtEngineCacheDir;
if (cacheDir.empty()) {
// Default: %TEMP%\ANSCENTER\TRTEngineCache (created best-effort).
const char* tmp = std::getenv("TEMP");
if (!tmp) tmp = std::getenv("TMP");
if (!tmp) tmp = ".";
std::filesystem::path p(tmp);
p /= "ANSCENTER";
p /= "TRTEngineCache";
std::error_code ec;
std::filesystem::create_directories(p, ec); // ec deliberately ignored
cacheDir = p.string();
}
// Builder options tuned for *fast first-run*:
// - opt_level 1: builds in seconds, ~5-10 % runtime cost vs 3
// - workspace 1 GB: leaves room for CUDA EP arena and the
// LPD's own TRT engine on the same GPU
// - timing cache: persists kernel timings between runs so
// builds at new shapes get progressively faster. Per the TRT EP
// docs, trt_timing_cache_path takes a DIRECTORY and ORT derives
// the file name itself — the previous revision also built an
// unused "timing.cache" file path (dead code, removed).
// - profile shapes (if set): build ONE dynamic-shape
// engine that handles all (batch, width) combos instead
// of rebuilding per unique input. Critical for variable
// batch workloads — without this, TRT EP rebuilds every
// time runtime sees a new shape pair, causing 60-90 s
// hangs mid-stream.
const bool haveProfile = !m_handlerOptions.trtProfileMinShapes.empty()
&& !m_handlerOptions.trtProfileOptShapes.empty()
&& !m_handlerOptions.trtProfileMaxShapes.empty();
// Build the key/value arrays. The first 8 keys are always set;
// the profile shapes are appended only when provided.
std::vector<const char*> keys = {
"device_id",
"trt_fp16_enable",
"trt_engine_cache_enable",
"trt_engine_cache_path",
"trt_max_workspace_size",
"trt_builder_optimization_level",
"trt_timing_cache_enable",
"trt_timing_cache_path"
};
std::vector<const char*> values = {
"0",
m_handlerOptions.trtFP16 ? "1" : "0",
"1",
cacheDir.c_str(),   // engine-cache directory
"1073741824",       // 1 GB build workspace
"1",                // fast build (was "3")
"1",
cacheDir.c_str()    // timing-cache directory (file name chosen by ORT)
};
if (haveProfile) {
keys.push_back("trt_profile_min_shapes");
values.push_back(m_handlerOptions.trtProfileMinShapes.c_str());
keys.push_back("trt_profile_opt_shapes");
values.push_back(m_handlerOptions.trtProfileOptShapes.c_str());
keys.push_back("trt_profile_max_shapes");
values.push_back(m_handlerOptions.trtProfileMaxShapes.c_str());
}
// NOTE: `values` holds raw pointers into cacheDir and the option
// strings — all must stay alive until this call returns.
Ort::GetApi().UpdateTensorRTProviderOptions(
trt_options, keys.data(), values.data(), keys.size());
session_options.AppendExecutionProvider_TensorRT_V2(*trt_options);
Ort::GetApi().ReleaseTensorRTProviderOptions(trt_options);
trt_options = nullptr; // released — guard against double-free in catch
std::cout << "[ORT] TensorRT EP attached (fp16="
<< (m_handlerOptions.trtFP16 ? "1" : "0")
<< ", cache=" << cacheDir
<< ", profile=" << (haveProfile ? "dynamic" : "static")
<< ")." << std::endl;
if (haveProfile) {
std::cout << "[ORT] profile min: "
<< m_handlerOptions.trtProfileMinShapes << std::endl
<< "[ORT] profile opt: "
<< m_handlerOptions.trtProfileOptShapes << std::endl
<< "[ORT] profile max: "
<< m_handlerOptions.trtProfileMaxShapes << std::endl;
}
return true;
}
catch (const Ort::Exception& e) {
if (trt_options) Ort::GetApi().ReleaseTensorRTProviderOptions(trt_options);
std::cerr << "[ORT] TensorRT EP failed: " << e.what() << std::endl;
return false;
}
catch (const std::exception& e) {
if (trt_options) Ort::GetApi().ReleaseTensorRTProviderOptions(trt_options);
std::cerr << "[ORT] TensorRT EP failed (std): " << e.what() << std::endl;
return false;
}
}
bool BasicOrtHandler::TryAppendDirectML(Ort::SessionOptions& session_options) bool BasicOrtHandler::TryAppendDirectML(Ort::SessionOptions& session_options)
{ {
try { try {
@@ -267,9 +442,28 @@ namespace ANSCENTER {
{ {
// -------------------------------------------------------- // --------------------------------------------------------
case EngineType::NVIDIA_GPU: case EngineType::NVIDIA_GPU:
// Try TensorRT EP first when explicitly requested. Falls
// through to CUDA EP if TRT is missing or option creation
// fails. Both EPs may be attached at once — ORT picks TRT
// for nodes it supports and CUDA for the rest.
if (m_handlerOptions.preferTensorRT
&& hasProvider("TensorrtExecutionProvider")) {
ANS_DBG("OrtHandler", "Trying TensorRT EP...");
if (TryAppendTensorRT(session_options)) {
epAttached = true;
}
else {
std::cerr << "[ORT] TensorRT EP attach failed — "
"falling back to CUDA EP." << std::endl;
}
}
ANS_DBG("OrtHandler", "Trying CUDA EP..."); ANS_DBG("OrtHandler", "Trying CUDA EP...");
if (hasProvider("CUDAExecutionProvider")) if (hasProvider("CUDAExecutionProvider")) {
epAttached = TryAppendCUDA(session_options); if (TryAppendCUDA(session_options)) {
epAttached = true;
}
}
if (!epAttached) { if (!epAttached) {
std::cerr << "[ORT] CUDA EP unavailable — falling back to CPU." std::cerr << "[ORT] CUDA EP unavailable — falling back to CPU."
<< std::endl; << std::endl;

View File

@@ -248,6 +248,46 @@ namespace ANSCENTER {
return std::string(ort_session->GetOutputNameAllocated(index, allocator).get()); return std::string(ort_session->GetOutputNameAllocated(index, allocator).get());
} }
// ====================================================================
// High-perf options for OCR sub-models that need TRT EP and full
// cuDNN workspace. Default-constructed = identical to the legacy
// behavior (CUDA EP only, minimal cuDNN workspace).
// Plain aggregate — passed by const& into the BasicOrtHandler ctors
// and stored per handler instance.
// ====================================================================
struct OrtHandlerOptions {
// Try to attach TensorRT EP before CUDA EP (NVIDIA only).
// Falls back to CUDA EP automatically if TRT EP creation or session
// creation fails. Engines are cached on disk for fast reload.
bool preferTensorRT = false;
// Use the largest cuDNN conv workspace. cuDNN can then pick fast
// algorithms (Winograd, implicit-precomp-GEMM with big workspaces).
// Defaults off because some deployments share VRAM with TRT engines
// and need the minimal-workspace mode to avoid OOM.
bool useMaxCudnnWorkspace = false;
// Where to cache built TRT engines. Empty → default
// %TEMP%/ANSCENTER/TRTEngineCache. Only used when preferTensorRT.
std::string trtEngineCacheDir;
// FP16 builds for TRT EP. Recommended for inference; ignored if
// preferTensorRT is false. NOTE(review): FP16 can shift scores
// slightly — confirm OCR accuracy on a validation set when enabling.
bool trtFP16 = true;
// Dynamic-shape profile for TRT EP. When set, TRT builds ONE
// engine that handles every input shape in the [min..max] range
// instead of rebuilding per unique shape. Critical for models
// that see many (batch_size, spatial) combinations at runtime.
//
// Format: "input_name:d0xd1xd2xd3[,input2:...]"
// e.g. "x:1x3x48x320" for batch=1, C=3, H=48, W=320
//
// All three fields must be set together. An empty min implies
// no profile (fall back to static-shape-per-unique-input mode).
std::string trtProfileMinShapes;
std::string trtProfileOptShapes;
std::string trtProfileMaxShapes;
};
// ==================================================================== // ====================================================================
// BasicOrtHandler // BasicOrtHandler
// ==================================================================== // ====================================================================
@@ -280,6 +320,9 @@ namespace ANSCENTER {
const unsigned int num_threads; const unsigned int num_threads;
EngineType m_engineType; EngineType m_engineType;
// Per-session high-perf options. Default = legacy behavior.
OrtHandlerOptions m_handlerOptions;
protected: protected:
// Default: hardware auto-detection via ANSLicenseHelper through EPLoader // Default: hardware auto-detection via ANSLicenseHelper through EPLoader
explicit BasicOrtHandler(const std::string& _onnx_path, explicit BasicOrtHandler(const std::string& _onnx_path,
@@ -290,6 +333,19 @@ namespace ANSCENTER {
EngineType engineType, EngineType engineType,
unsigned int _num_threads = 1); unsigned int _num_threads = 1);
// Engine override + per-session high-perf options (TRT EP, max
// cuDNN workspace, etc.). Used by OCR sub-models that need
// shape-stable, high-throughput inference.
explicit BasicOrtHandler(const std::string& _onnx_path,
EngineType engineType,
const OrtHandlerOptions& options,
unsigned int _num_threads = 1);
// Auto-detect engine via EPLoader, but with high-perf options.
explicit BasicOrtHandler(const std::string& _onnx_path,
const OrtHandlerOptions& options,
unsigned int _num_threads = 1);
virtual ~BasicOrtHandler(); virtual ~BasicOrtHandler();
BasicOrtHandler(const BasicOrtHandler&) = delete; BasicOrtHandler(const BasicOrtHandler&) = delete;
@@ -298,6 +354,13 @@ namespace ANSCENTER {
// Resolved EP type (after EPLoader fallback). Subclasses use this // Resolved EP type (after EPLoader fallback). Subclasses use this
// to branch on actual EP at inference time. // to branch on actual EP at inference time.
EngineType getEngineType() const { return m_engineType; } EngineType getEngineType() const { return m_engineType; }
// Spin up a tiny CPU-only ORT session just long enough to read
// the name of the model's first input, then tear it down. Used
// by callers that need to build TRT profile-shape strings
// (which require the input name) BEFORE the real session is
// created. Returns an empty string on failure.
static std::string QueryModelInputName(const std::string& onnxPath);
private: private:
void initialize_handler(); void initialize_handler();
protected: protected:
@@ -306,6 +369,7 @@ namespace ANSCENTER {
// EP-specific session option builders // EP-specific session option builders
bool TryAppendCUDA(Ort::SessionOptions& opts); bool TryAppendCUDA(Ort::SessionOptions& opts);
bool TryAppendTensorRT(Ort::SessionOptions& opts);
bool TryAppendDirectML(Ort::SessionOptions& opts); bool TryAppendDirectML(Ort::SessionOptions& opts);
bool TryAppendOpenVINO(Ort::SessionOptions& opts); bool TryAppendOpenVINO(Ort::SessionOptions& opts);
}; };

View File

@@ -363,10 +363,14 @@ namespace ANSCENTER
ocrModelConfig.ocrLanguage = ocrLang; ocrModelConfig.ocrLanguage = ocrLang;
ocrModelConfig.useDetector = true; ocrModelConfig.useDetector = true;
ocrModelConfig.useRecognizer = true; ocrModelConfig.useRecognizer = true;
ocrModelConfig.useCLS = true; // Skip the angle classifier for ALPR. License-plate boxes
// from the YOLO detector are already axis-aligned, so the
// 180° classifier is dead weight (one extra ORT call per
// plate for no recall gain).
ocrModelConfig.useCLS = false;
ocrModelConfig.useLayout = false; ocrModelConfig.useLayout = false;
ocrModelConfig.useTable = false; ocrModelConfig.useTable = false;
ocrModelConfig.useTensorRT = false; ocrModelConfig.useTensorRT = true;
ocrModelConfig.enableMKLDNN = false; ocrModelConfig.enableMKLDNN = false;
ocrModelConfig.useDilation = true; ocrModelConfig.useDilation = true;
ocrModelConfig.useAngleCLS = false; ocrModelConfig.useAngleCLS = false;
@@ -375,7 +379,7 @@ namespace ANSCENTER
ocrModelConfig.detectionBoxThreshold = 0.3; ocrModelConfig.detectionBoxThreshold = 0.3;
ocrModelConfig.detectionDBUnclipRatio = 1.2; ocrModelConfig.detectionDBUnclipRatio = 1.2;
ocrModelConfig.clsThreshold = 0.9; ocrModelConfig.clsThreshold = 0.9;
ocrModelConfig.limitSideLen = 2560; ocrModelConfig.limitSideLen = 480;
// Pass the original ALPR model zip path — ANSOCRBase::Initialize // Pass the original ALPR model zip path — ANSOCRBase::Initialize
// will extract it to the same folder (already done, so extraction // will extract it to the same folder (already done, so extraction
@@ -638,11 +642,25 @@ namespace ANSCENTER
return {}; return {};
} }
std::vector<Object> output; // Step 2: Collect crops from every valid plate. Wide plates
output.reserve(lprOutput.size()); // (aspect >= 2.0) are treated as a single text line; narrow
// plates (2-row layouts like Japanese) are split horizontally
// at H/2 into top and bottom rows. All crops go through a
// single batched recognizer call, bypassing the OCR text-line
// detector entirely — for ALPR the LP YOLO box already bounds
// the text region precisely.
struct PlateInfo {
size_t origIndex; // into lprOutput
std::vector<size_t> cropIndices; // into allCrops
cv::Mat plateROI; // full (unsplit) ROI, kept for colour
};
std::vector<cv::Mat> allCrops;
std::vector<PlateInfo> plateInfos;
allCrops.reserve(lprOutput.size() * 2);
plateInfos.reserve(lprOutput.size());
for (auto& lprObject : lprOutput) { for (size_t i = 0; i < lprOutput.size(); ++i) {
const cv::Rect& box = lprObject.box; const cv::Rect& box = lprOutput[i].box;
// Calculate safe cropped region // Calculate safe cropped region
const int x1 = std::max(0, box.x); const int x1 = std::max(0, box.x);
@@ -652,27 +670,76 @@ namespace ANSCENTER
if (width <= 0 || height <= 0) continue; if (width <= 0 || height <= 0) continue;
cv::Rect lprPos(x1, y1, width, height); cv::Mat plateROI = frame(cv::Rect(x1, y1, width, height));
cv::Mat plateROI = frame(lprPos);
// Step 2: Run OCR on the detected plate PlateInfo info;
std::string ocrText = RunOCROnPlate(plateROI, cameraId); info.origIndex = i;
info.plateROI = plateROI;
if (ocrText.empty()) continue; const float aspect = static_cast<float>(width) /
std::max(1, height);
// 2-row heuristic: aspect < 2.0 → split top/bottom.
// Threshold tuned to catch Japanese square plates
// (~1.5-1.9) while leaving wide EU/VN plates (3.0+)
// untouched.
if (aspect < 2.0f && height >= 24) {
const int halfH = height / 2;
info.cropIndices.push_back(allCrops.size());
allCrops.push_back(plateROI(cv::Rect(0, 0, width, halfH)));
info.cropIndices.push_back(allCrops.size());
allCrops.push_back(plateROI(cv::Rect(0, halfH, width, height - halfH)));
}
else {
info.cropIndices.push_back(allCrops.size());
allCrops.push_back(plateROI);
}
plateInfos.push_back(std::move(info));
}
if (allCrops.empty()) {
return {};
}
// Step 3: Single batched recognizer call for every crop.
// ONNXOCRRecognizer groups crops by bucket width and issues
// one ORT Run per bucket — typically 12 GPU calls for an
// entire frame regardless of plate count.
auto ocrResults = _ocrEngine->RecognizeTextBatch(allCrops);
// Step 4: Assemble per-plate output
std::vector<Object> output;
output.reserve(plateInfos.size());
for (const auto& info : plateInfos) {
std::string combinedText;
for (size_t cropIdx : info.cropIndices) {
if (cropIdx >= ocrResults.size()) continue;
const std::string& lineText = ocrResults[cropIdx].first;
if (lineText.empty()) continue;
if (!combinedText.empty()) combinedText += " ";
combinedText += lineText;
}
if (combinedText.empty()) continue;
Object lprObject = lprOutput[info.origIndex];
lprObject.cameraId = cameraId; lprObject.cameraId = cameraId;
// Use ALPRChecker for text stabilization if enabled // Cross-frame stabilization (unchanged)
if (_enableALPRChecker) { if (_enableALPRChecker) {
lprObject.className = alprChecker.checkPlateByTrackId(cameraId, ocrText, lprObject.trackId); lprObject.className = alprChecker.checkPlateByTrackId(
} else { cameraId, combinedText, lprObject.trackId);
lprObject.className = ocrText; }
else {
lprObject.className = combinedText;
} }
if (lprObject.className.empty()) continue; if (lprObject.className.empty()) continue;
// Step 3: Colour detection (optional) // Optional colour detection on the full plate ROI
std::string colour = DetectLPColourCached(plateROI, cameraId, lprObject.className); std::string colour = DetectLPColourCached(
info.plateROI, cameraId, lprObject.className);
if (!colour.empty()) { if (!colour.empty()) {
lprObject.extraInfo = "color:" + colour; lprObject.extraInfo = "color:" + colour;
} }

View File

@@ -159,6 +159,18 @@ namespace ANSCENTER {
// Returns recognized text and confidence. Default returns empty. // Returns recognized text and confidence. Default returns empty.
virtual std::pair<std::string, float> RecognizeText(const cv::Mat& croppedImage) { return {"", 0.0f}; } virtual std::pair<std::string, float> RecognizeText(const cv::Mat& croppedImage) { return {"", 0.0f}; }
// Recognize a whole batch of pre-cropped images at once. Optimized
// subclasses override this to bypass the text-line detector and issue
// a single batched ORT call; this base-class fallback simply delegates
// to RecognizeText() once per crop, so existing subclasses keep
// working without changes.
virtual std::vector<std::pair<std::string, float>> RecognizeTextBatch(
const std::vector<cv::Mat>& croppedImages) {
std::vector<std::pair<std::string, float>> results;
results.reserve(croppedImages.size());
for (std::size_t idx = 0; idx < croppedImages.size(); ++idx) {
results.emplace_back(RecognizeText(croppedImages[idx]));
}
return results;
}
// ALPR configuration methods // ALPR configuration methods
void SetOCRMode(OCRMode mode); void SetOCRMode(OCRMode mode);
OCRMode GetOCRMode() const; OCRMode GetOCRMode() const;

View File

@@ -4,6 +4,7 @@
#include <iostream> #include <iostream>
#include <algorithm> #include <algorithm>
#include <cmath> #include <cmath>
#include <chrono>
namespace ANSCENTER { namespace ANSCENTER {
namespace onnxocr { namespace onnxocr {
@@ -12,6 +13,12 @@ ONNXOCRClassifier::ONNXOCRClassifier(const std::string& onnx_path, unsigned int
: BasicOrtHandler(onnx_path, num_threads) { : BasicOrtHandler(onnx_path, num_threads) {
} }
// Same as the legacy ctor, but forwards per-session high-perf options
// (TRT EP preference, cuDNN workspace, shape profiles) to the base
// handler, which auto-detects the engine via EPLoader.
ONNXOCRClassifier::ONNXOCRClassifier(const std::string& onnx_path,
const OrtHandlerOptions& options,
unsigned int num_threads)
: BasicOrtHandler(onnx_path, options, num_threads) {
}
Ort::Value ONNXOCRClassifier::transform(const cv::Mat& mat) { Ort::Value ONNXOCRClassifier::transform(const cv::Mat& mat) {
cv::Mat resized; cv::Mat resized;
// Direct resize to 80x160 (PP-LCNet_x1_0_textline_ori) // Direct resize to 80x160 (PP-LCNet_x1_0_textline_ori)
@@ -103,5 +110,38 @@ void ONNXOCRClassifier::Classify(std::vector<cv::Mat>& img_list,
} }
} }
// Pre-warm the classifier's fixed [1,3,kClsImageH,kClsImageW] shape so
// the first real inference doesn't pay the cuDNN/TRT algorithm-selection
// cost. Serializes on _mutex; one-shot: _warmedUp is set even when the
// warm-up Run throws, so a broken session is not retried on every call.
void ONNXOCRClassifier::Warmup() {
std::lock_guard<std::mutex> lock(_mutex);
if (_warmedUp || !ort_session) return;
try {
// Mid-gray dummy at 2x target size, resized down to the fixed input.
cv::Mat dummy(kClsImageH * 2, kClsImageW * 2, CV_8UC3, cv::Scalar(128, 128, 128));
cv::Mat resized;
cv::resize(dummy, resized, cv::Size(kClsImageW, kClsImageH));
resized.convertTo(resized, CV_32FC3);
// NCHW tensor [1,3,H,W]; NormalizeAndPermute presumably emits CHW
// floats of exactly that size — confirm if the input shape changes.
auto inputData = NormalizeAndPermute(resized);
std::array<int64_t, 4> inputShape = { 1, 3, kClsImageH, kClsImageW };
Ort::Value inputTensor = Ort::Value::CreateTensor<float>(
*memory_info_handler, inputData.data(), inputData.size(),
inputShape.data(), inputShape.size());
auto t0 = std::chrono::high_resolution_clock::now();
// Output discarded — only the warm-up side effect matters.
(void)ort_session->Run(
Ort::RunOptions{ nullptr },
input_node_names.data(), &inputTensor, 1,
output_node_names.data(), num_outputs);
auto t1 = std::chrono::high_resolution_clock::now();
double ms = std::chrono::duration<double, std::milli>(t1 - t0).count();
std::cout << "[ONNXOCRClassifier] Warmup [1,3,"
<< kClsImageH << "," << kClsImageW << "] "
<< ms << " ms" << std::endl;
}
catch (const Ort::Exception& e) {
// NOTE(review): only Ort::Exception is caught — a cv::Exception from
// resize/convertTo would propagate and leave _warmedUp false; confirm
// that is acceptable for callers.
std::cerr << "[ONNXOCRClassifier] Warmup failed: " << e.what() << std::endl;
}
_warmedUp = true;
}
} // namespace onnxocr } // namespace onnxocr
} // namespace ANSCENTER } // namespace ANSCENTER

View File

@@ -11,6 +11,9 @@ namespace onnxocr {
class ONNXOCRClassifier : public BasicOrtHandler { class ONNXOCRClassifier : public BasicOrtHandler {
public: public:
explicit ONNXOCRClassifier(const std::string& onnx_path, unsigned int num_threads = 1); explicit ONNXOCRClassifier(const std::string& onnx_path, unsigned int num_threads = 1);
explicit ONNXOCRClassifier(const std::string& onnx_path,
const OrtHandlerOptions& options,
unsigned int num_threads = 1);
~ONNXOCRClassifier() override = default; ~ONNXOCRClassifier() override = default;
// Classify text orientation for a list of cropped images // Classify text orientation for a list of cropped images
@@ -21,7 +24,12 @@ public:
std::vector<float>& cls_scores, std::vector<float>& cls_scores,
float cls_thresh = kClsThresh); float cls_thresh = kClsThresh);
// Pre-warm cuDNN/TRT for the classifier's fixed [1,3,80,160] shape.
// Idempotent — no-op after the first call.
void Warmup();
private: private:
bool _warmedUp = false;
Ort::Value transform(const cv::Mat& mat) override; Ort::Value transform(const cv::Mat& mat) override;
Ort::Value transformBatch(const std::vector<cv::Mat>& images) override; Ort::Value transformBatch(const std::vector<cv::Mat>& images) override;

View File

@@ -7,6 +7,7 @@
#include <iostream> #include <iostream>
#include <algorithm> #include <algorithm>
#include <cmath> #include <cmath>
#include <chrono>
namespace ANSCENTER { namespace ANSCENTER {
namespace onnxocr { namespace onnxocr {
@@ -15,6 +16,12 @@ ONNXOCRDetector::ONNXOCRDetector(const std::string& onnx_path, unsigned int num_
: BasicOrtHandler(onnx_path, num_threads) { : BasicOrtHandler(onnx_path, num_threads) {
} }
// Same as the legacy ctor, but forwards per-session high-perf options
// (TRT EP preference, cuDNN workspace, shape profiles) to the base
// handler, which auto-detects the engine via EPLoader.
ONNXOCRDetector::ONNXOCRDetector(const std::string& onnx_path,
const OrtHandlerOptions& options,
unsigned int num_threads)
: BasicOrtHandler(onnx_path, options, num_threads) {
}
Ort::Value ONNXOCRDetector::transform(const cv::Mat& mat) { Ort::Value ONNXOCRDetector::transform(const cv::Mat& mat) {
// Not used directly - detection uses custom Preprocess + manual tensor creation // Not used directly - detection uses custom Preprocess + manual tensor creation
// Provided to satisfy BasicOrtHandler pure virtual // Provided to satisfy BasicOrtHandler pure virtual
@@ -308,5 +315,41 @@ std::vector<cv::Point2f> ONNXOCRDetector::UnclipPolygon(const std::array<cv::Poi
return result; return result;
} }
// Pre-warm the detector at a canonical 320x320 input so the first real
// call doesn't pay the cuDNN/TRT algorithm-selection cost. Serializes
// on _mutex; one-shot: _warmedUp is set even when the warm-up Run
// throws, so a broken session is not retried on every call.
void ONNXOCRDetector::Warmup() {
std::lock_guard<std::mutex> lock(_mutex);
if (_warmedUp || !ort_session) return;
// 320x320 covers the typical license-plate ROI after LPD crop +
// multiple-of-32 rounding. cuDNN caches the algorithm for this
// shape so the first real inference doesn't pay the picker cost.
constexpr int kWarmupSide = 320;
try {
// Mid-gray dummy frame converted to float for preprocessing.
cv::Mat dummy(kWarmupSide, kWarmupSide, CV_8UC3, cv::Scalar(128, 128, 128));
cv::Mat dummyF;
dummy.convertTo(dummyF, CV_32FC3);
// NCHW tensor [1,3,320,320]; NormalizeAndPermute presumably emits
// CHW floats of exactly that size — confirm if the side changes.
auto inputData = NormalizeAndPermute(dummyF);
std::array<int64_t, 4> inputShape = { 1, 3, kWarmupSide, kWarmupSide };
Ort::Value inputTensor = Ort::Value::CreateTensor<float>(
*memory_info_handler, inputData.data(), inputData.size(),
inputShape.data(), inputShape.size());
auto t0 = std::chrono::high_resolution_clock::now();
// Output discarded — only the warm-up side effect matters.
(void)ort_session->Run(
Ort::RunOptions{ nullptr },
input_node_names.data(), &inputTensor, 1,
output_node_names.data(), num_outputs);
auto t1 = std::chrono::high_resolution_clock::now();
double ms = std::chrono::duration<double, std::milli>(t1 - t0).count();
std::cout << "[ONNXOCRDetector] Warmup [1,3,"
<< kWarmupSide << "," << kWarmupSide << "] "
<< ms << " ms" << std::endl;
}
catch (const Ort::Exception& e) {
// NOTE(review): only Ort::Exception is caught — a cv::Exception
// would propagate and leave _warmedUp false; confirm intent.
std::cerr << "[ONNXOCRDetector] Warmup failed: " << e.what() << std::endl;
}
_warmedUp = true;
}
} // namespace onnxocr } // namespace onnxocr
} // namespace ANSCENTER } // namespace ANSCENTER

View File

@@ -11,6 +11,9 @@ namespace onnxocr {
class ONNXOCRDetector : public BasicOrtHandler { class ONNXOCRDetector : public BasicOrtHandler {
public: public:
explicit ONNXOCRDetector(const std::string& onnx_path, unsigned int num_threads = 1); explicit ONNXOCRDetector(const std::string& onnx_path, unsigned int num_threads = 1);
explicit ONNXOCRDetector(const std::string& onnx_path,
const OrtHandlerOptions& options,
unsigned int num_threads = 1);
~ONNXOCRDetector() override = default; ~ONNXOCRDetector() override = default;
// Run text detection on an image // Run text detection on an image
@@ -21,7 +24,12 @@ public:
float unclipRatio = kDetUnclipRatio, float unclipRatio = kDetUnclipRatio,
bool useDilation = false); bool useDilation = false);
// Pre-warm cuDNN/TRT at a canonical 320x320 input so the first real
// call doesn't pay the algorithm-selection tax. Idempotent.
void Warmup();
private: private:
bool _warmedUp = false;
Ort::Value transform(const cv::Mat& mat) override; Ort::Value transform(const cv::Mat& mat) override;
Ort::Value transformBatch(const std::vector<cv::Mat>& images) override; Ort::Value transformBatch(const std::vector<cv::Mat>& images) override;

View File

@@ -7,6 +7,7 @@
#include <cmath> #include <cmath>
#include <cfloat> #include <cfloat>
#include <cstring> #include <cstring>
#include <chrono>
namespace ANSCENTER { namespace ANSCENTER {
namespace onnxocr { namespace onnxocr {
@@ -15,6 +16,12 @@ ONNXOCRRecognizer::ONNXOCRRecognizer(const std::string& onnx_path, unsigned int
: BasicOrtHandler(onnx_path, num_threads) { : BasicOrtHandler(onnx_path, num_threads) {
} }
// Same as the legacy ctor, but forwards per-session high-perf options
// (TRT EP preference, cuDNN workspace, shape profiles) to the base
// handler, which auto-detects the engine via EPLoader.
ONNXOCRRecognizer::ONNXOCRRecognizer(const std::string& onnx_path,
const OrtHandlerOptions& options,
unsigned int num_threads)
: BasicOrtHandler(onnx_path, options, num_threads) {
}
bool ONNXOCRRecognizer::LoadDictionary(const std::string& dictPath) { bool ONNXOCRRecognizer::LoadDictionary(const std::string& dictPath) {
keys_ = LoadDict(dictPath); keys_ = LoadDict(dictPath);
if (keys_.size() < 2) { if (keys_.size() < 2) {
@@ -46,6 +53,54 @@ Ort::Value ONNXOCRRecognizer::transformBatch(const std::vector<cv::Mat>& images)
return Ort::Value(nullptr); return Ort::Value(nullptr);
} }
// ----------------------------------------------------------------------------
// Width buckets — every recognizer input is padded up to one of these widths
// before reaching ORT. This bounds the number of distinct shapes cuDNN ever
// sees to four, so its HEURISTIC algorithm cache hits on every subsequent
// call instead of re-tuning per plate. Buckets cover the realistic range:
// 320 px → short Latin/Japanese plates (most common)
// 480 px → wider Latin plates with two rows of text
// 640 px → long single-row plates / multi-line stacked text
// 960 px → safety upper bound (== kRecImgMaxW)
// ----------------------------------------------------------------------------
static constexpr int kRecBucketWidths[] = { 320, 480, 640, 960 };
static constexpr int kRecNumBuckets = sizeof(kRecBucketWidths) / sizeof(kRecBucketWidths[0]);
// Map a resized crop width onto the smallest bucket that can hold it.
// The result never exceeds imgMaxW_: previously, a recognizer configured
// with imgMaxW_ below a bucket boundary (e.g. 400) could be handed a
// wider bucket (480) than its own maximum, defeating the cap. Behavior
// is identical when imgMaxW_ >= 960 (the expected configuration).
int ONNXOCRRecognizer::RoundUpToBucket(int resizedW) const {
const int capped = std::min(resizedW, imgMaxW_);
for (int b = 0; b < kRecNumBuckets; ++b) {
if (kRecBucketWidths[b] >= capped) {
return std::min(kRecBucketWidths[b], imgMaxW_);
}
}
// capped > every bucket ⇒ imgMaxW_ itself is the widest legal width.
return imgMaxW_;
}
// Resize + normalize a single crop into a CHW float vector at width
// `bucketW`, padding with zeros on the right when needed. The returned
// vector has exactly 3*imgH_*bucketW elements.
static std::vector<float> PreprocessCropToBucket(const cv::Mat& crop,
int imgH, int bucketW) {
cv::Mat resized = ResizeRecImage(crop, imgH, bucketW);
int resizedW = resized.cols;
resized.convertTo(resized, CV_32FC3);
auto normalizedData = NormalizeAndPermuteCls(resized);
if (resizedW == bucketW) {
return normalizedData;
}
// Zero-pad on the right (CHW layout)
std::vector<float> padded(3 * imgH * bucketW, 0.0f);
for (int c = 0; c < 3; c++) {
for (int y = 0; y < imgH; y++) {
std::memcpy(
&padded[c * imgH * bucketW + y * bucketW],
&normalizedData[c * imgH * resizedW + y * resizedW],
resizedW * sizeof(float));
}
}
return padded;
}
TextLine ONNXOCRRecognizer::Recognize(const cv::Mat& croppedImage) { TextLine ONNXOCRRecognizer::Recognize(const cv::Mat& croppedImage) {
std::lock_guard<std::mutex> lock(_mutex); std::lock_guard<std::mutex> lock(_mutex);
@@ -54,48 +109,23 @@ TextLine ONNXOCRRecognizer::Recognize(const cv::Mat& croppedImage) {
} }
try { try {
// Preprocess: resize to fixed height, proportional width // Step 1: aspect-preserving resize to height=imgH_, width capped
// at imgMaxW_. Then round resized width up to the next bucket.
cv::Mat resized = ResizeRecImage(croppedImage, imgH_, imgMaxW_); cv::Mat resized = ResizeRecImage(croppedImage, imgH_, imgMaxW_);
int resizedW = resized.cols; const int bucketW = RoundUpToBucket(resized.cols);
resized.convertTo(resized, CV_32FC3); std::vector<float> inputData = PreprocessCropToBucket(croppedImage, imgH_, bucketW);
// Recognition uses (pixel/255 - 0.5) / 0.5 normalization (same as classifier)
auto normalizedData = NormalizeAndPermuteCls(resized);
// Pad to at least kRecImgW width (matching official PaddleOCR behavior) std::array<int64_t, 4> inputShape = { 1, 3, imgH_, bucketW };
// Official PaddleOCR: padding_im = np.zeros((C, H, W)), then copies normalized
// image into left portion. Padding value = 0.0 in normalized space.
int imgW = std::max(resizedW, kRecImgW);
std::vector<float> inputData;
if (imgW > resizedW) {
// Zero-pad on the right (CHW layout)
inputData.resize(3 * imgH_ * imgW, 0.0f);
for (int c = 0; c < 3; c++) {
for (int y = 0; y < imgH_; y++) {
std::memcpy(
&inputData[c * imgH_ * imgW + y * imgW],
&normalizedData[c * imgH_ * resizedW + y * resizedW],
resizedW * sizeof(float));
}
}
} else {
inputData = std::move(normalizedData);
}
// Create input tensor with (possibly padded) width
std::array<int64_t, 4> inputShape = { 1, 3, imgH_, imgW };
Ort::Value inputTensor = Ort::Value::CreateTensor<float>( Ort::Value inputTensor = Ort::Value::CreateTensor<float>(
*memory_info_handler, inputData.data(), inputData.size(), *memory_info_handler, inputData.data(), inputData.size(),
inputShape.data(), inputShape.size()); inputShape.data(), inputShape.size());
// Run inference
auto outputTensors = ort_session->Run( auto outputTensors = ort_session->Run(
Ort::RunOptions{ nullptr }, Ort::RunOptions{ nullptr },
input_node_names.data(), &inputTensor, 1, input_node_names.data(), &inputTensor, 1,
output_node_names.data(), num_outputs); output_node_names.data(), num_outputs);
// Get output
float* outputData = outputTensors[0].GetTensorMutableData<float>(); float* outputData = outputTensors[0].GetTensorMutableData<float>();
auto outputShape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape(); auto outputShape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape();
@@ -110,18 +140,162 @@ TextLine ONNXOCRRecognizer::Recognize(const cv::Mat& croppedImage) {
} }
} }
std::vector<TextLine> ONNXOCRRecognizer::RecognizeBatch(const std::vector<cv::Mat>& croppedImages) { void ONNXOCRRecognizer::RunBatchAtWidth(const std::vector<cv::Mat>& crops,
std::vector<TextLine> results; const std::vector<size_t>& origIndices,
results.reserve(croppedImages.size()); int bucketW,
std::vector<TextLine>& out) {
if (crops.empty()) return;
// Process one at a time (dynamic width per image) try {
for (size_t i = 0; i < croppedImages.size(); i++) { const size_t batchN = crops.size();
results.push_back(Recognize(croppedImages[i])); const size_t perImage = static_cast<size_t>(3) * imgH_ * bucketW;
// Stack N preprocessed crops into one [N,3,H,W] buffer
std::vector<float> batchInput(batchN * perImage, 0.0f);
for (size_t i = 0; i < batchN; ++i) {
auto img = PreprocessCropToBucket(crops[i], imgH_, bucketW);
std::memcpy(&batchInput[i * perImage], img.data(),
perImage * sizeof(float));
}
std::array<int64_t, 4> inputShape = {
static_cast<int64_t>(batchN), 3,
static_cast<int64_t>(imgH_),
static_cast<int64_t>(bucketW)
};
Ort::Value inputTensor = Ort::Value::CreateTensor<float>(
*memory_info_handler, batchInput.data(), batchInput.size(),
inputShape.data(), inputShape.size());
auto outputTensors = ort_session->Run(
Ort::RunOptions{ nullptr },
input_node_names.data(), &inputTensor, 1,
output_node_names.data(), num_outputs);
float* outputData = outputTensors[0].GetTensorMutableData<float>();
auto outputShape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape();
// Expected output: [N, seqLen, numClasses]
if (outputShape.size() < 3) {
std::cerr << "[ONNXOCRRecognizer] Unexpected batch output rank: "
<< outputShape.size() << std::endl;
return;
}
const int outBatch = static_cast<int>(outputShape[0]);
const int seqLen = static_cast<int>(outputShape[1]);
const int numClasses = static_cast<int>(outputShape[2]);
const size_t perRow = static_cast<size_t>(seqLen) * numClasses;
for (int i = 0; i < outBatch && i < static_cast<int>(batchN); ++i) {
TextLine tl = CTCDecode(outputData + i * perRow, seqLen, numClasses);
out[origIndices[i]] = std::move(tl);
}
}
catch (const Ort::Exception& e) {
// ORT will throw if the model doesn't support a batch dimension > 1.
// Fall back to per-image inference for this group.
std::cerr << "[ONNXOCRRecognizer] Batch inference failed at bucketW="
<< bucketW << " (" << e.what()
<< ") — falling back to single-image path." << std::endl;
for (size_t i = 0; i < crops.size(); ++i) {
// Direct call (we already hold _mutex via the public RecognizeBatch
// wrapper). Replicate the single-image preprocessing here to avoid
// re-entering Recognize() and double-locking the mutex.
try {
cv::Mat resized = ResizeRecImage(crops[i], imgH_, imgMaxW_);
int singleBucket = RoundUpToBucket(resized.cols);
auto inputData = PreprocessCropToBucket(crops[i], imgH_, singleBucket);
std::array<int64_t, 4> inputShape = { 1, 3, imgH_, singleBucket };
Ort::Value inputTensor = Ort::Value::CreateTensor<float>(
*memory_info_handler, inputData.data(), inputData.size(),
inputShape.data(), inputShape.size());
auto outputTensors = ort_session->Run(
Ort::RunOptions{ nullptr },
input_node_names.data(), &inputTensor, 1,
output_node_names.data(), num_outputs);
float* outData = outputTensors[0].GetTensorMutableData<float>();
auto outShape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape();
int seqLen = static_cast<int>(outShape[1]);
int numClasses = static_cast<int>(outShape[2]);
out[origIndices[i]] = CTCDecode(outData, seqLen, numClasses);
} catch (const Ort::Exception& e2) {
std::cerr << "[ONNXOCRRecognizer] Single-image fallback also failed: "
<< e2.what() << std::endl;
out[origIndices[i]] = {};
}
}
}
}
std::vector<TextLine> ONNXOCRRecognizer::RecognizeBatch(const std::vector<cv::Mat>& croppedImages) {
std::lock_guard<std::mutex> lock(_mutex);
std::vector<TextLine> results(croppedImages.size());
if (!ort_session || croppedImages.empty() || keys_.empty()) {
return results;
}
// Group crops by their target bucket width
std::vector<std::vector<cv::Mat>> groupCrops(kRecNumBuckets);
std::vector<std::vector<size_t>> groupIdx(kRecNumBuckets);
for (size_t i = 0; i < croppedImages.size(); ++i) {
if (croppedImages[i].empty()) continue;
cv::Mat resized = ResizeRecImage(croppedImages[i], imgH_, imgMaxW_);
const int bw = RoundUpToBucket(resized.cols);
// Find bucket index
int bucketIdx = kRecNumBuckets - 1;
for (int b = 0; b < kRecNumBuckets; ++b) {
if (kRecBucketWidths[b] == bw) { bucketIdx = b; break; }
}
groupCrops[bucketIdx].push_back(croppedImages[i]);
groupIdx[bucketIdx].push_back(i);
}
// Run one batched inference per non-empty bucket
for (int b = 0; b < kRecNumBuckets; ++b) {
if (groupCrops[b].empty()) continue;
RunBatchAtWidth(groupCrops[b], groupIdx[b], kRecBucketWidths[b], results);
} }
return results; return results;
} }
void ONNXOCRRecognizer::Warmup() {
std::lock_guard<std::mutex> lock(_mutex);
if (_warmedUp || !ort_session) return;
// Dummy 3-channel image, mid-grey, large enough to resize to imgH_
cv::Mat dummy(imgH_ * 2, kRecBucketWidths[kRecNumBuckets - 1] * 2,
CV_8UC3, cv::Scalar(128, 128, 128));
for (int b = 0; b < kRecNumBuckets; ++b) {
const int bucketW = kRecBucketWidths[b];
try {
auto inputData = PreprocessCropToBucket(dummy, imgH_, bucketW);
std::array<int64_t, 4> inputShape = { 1, 3, imgH_, bucketW };
Ort::Value inputTensor = Ort::Value::CreateTensor<float>(
*memory_info_handler, inputData.data(), inputData.size(),
inputShape.data(), inputShape.size());
auto t0 = std::chrono::high_resolution_clock::now();
(void)ort_session->Run(
Ort::RunOptions{ nullptr },
input_node_names.data(), &inputTensor, 1,
output_node_names.data(), num_outputs);
auto t1 = std::chrono::high_resolution_clock::now();
double ms = std::chrono::duration<double, std::milli>(t1 - t0).count();
std::cout << "[ONNXOCRRecognizer] Warmup bucketW=" << bucketW
<< " " << ms << " ms" << std::endl;
}
catch (const Ort::Exception& e) {
std::cerr << "[ONNXOCRRecognizer] Warmup failed at bucketW="
<< bucketW << ": " << e.what() << std::endl;
}
}
_warmedUp = true;
}
TextLine ONNXOCRRecognizer::CTCDecode(const float* outputData, int seqLen, int numClasses) { TextLine ONNXOCRRecognizer::CTCDecode(const float* outputData, int seqLen, int numClasses) {
TextLine result; TextLine result;
std::string text; std::string text;

View File

@@ -12,6 +12,9 @@ namespace onnxocr {
class ONNXOCRRecognizer : public BasicOrtHandler { class ONNXOCRRecognizer : public BasicOrtHandler {
public: public:
explicit ONNXOCRRecognizer(const std::string& onnx_path, unsigned int num_threads = 1); explicit ONNXOCRRecognizer(const std::string& onnx_path, unsigned int num_threads = 1);
explicit ONNXOCRRecognizer(const std::string& onnx_path,
const OrtHandlerOptions& options,
unsigned int num_threads = 1);
~ONNXOCRRecognizer() override = default; ~ONNXOCRRecognizer() override = default;
// Load character dictionary (must be called before Recognize) // Load character dictionary (must be called before Recognize)
@@ -20,13 +23,31 @@ public:
// Recognize text from a single cropped text image // Recognize text from a single cropped text image
TextLine Recognize(const cv::Mat& croppedImage); TextLine Recognize(const cv::Mat& croppedImage);
// Batch recognition for multiple cropped images // Batch recognition for multiple cropped images.
// Crops are grouped into a small set of fixed width buckets and
// submitted to ORT as [N,3,imgH_,bucketW] tensors so cuDNN sees
// shape-stable inputs and can reuse algorithms across calls.
std::vector<TextLine> RecognizeBatch(const std::vector<cv::Mat>& croppedImages); std::vector<TextLine> RecognizeBatch(const std::vector<cv::Mat>& croppedImages);
// Pre-warm cuDNN/TRT for every bucket width by running dummy
// inferences. Idempotent — no-op if already warmed up.
void Warmup();
private: private:
Ort::Value transform(const cv::Mat& mat) override; Ort::Value transform(const cv::Mat& mat) override;
Ort::Value transformBatch(const std::vector<cv::Mat>& images) override; Ort::Value transformBatch(const std::vector<cv::Mat>& images) override;
// Round resizedW up to the next bucket width (capped at imgMaxW_).
// Used by both Recognize() and RecognizeBatch() so cuDNN only ever
// sees a small finite set of input shapes.
int RoundUpToBucket(int resizedW) const;
// Run a single [N,3,imgH_,bucketW] inference and CTC-decode each row.
void RunBatchAtWidth(const std::vector<cv::Mat>& crops,
const std::vector<size_t>& origIndices,
int bucketW,
std::vector<TextLine>& out);
// CTC greedy decode // CTC greedy decode
TextLine CTCDecode(const float* outputData, int seqLen, int numClasses); TextLine CTCDecode(const float* outputData, int seqLen, int numClasses);
@@ -34,6 +55,7 @@ private:
int imgH_ = kRecImgH; int imgH_ = kRecImgH;
int imgMaxW_ = kRecImgMaxW; int imgMaxW_ = kRecImgMaxW;
std::mutex _mutex; std::mutex _mutex;
bool _warmedUp = false;
}; };
} // namespace onnxocr } // namespace onnxocr

View File

@@ -88,11 +88,22 @@ inline std::vector<std::string> LoadDict(const std::string& dictPath) {
return keys; return keys;
} }
// Compute resize dimensions for detection model (multiples of 32) // Compute resize dimensions for detection model.
// limit_type='max': scale down if max side > maxSideLen (PP-OCRv5 server default) // limit_type='max': scale down if max side > maxSideLen (PP-OCRv5 server default)
// maxSideLimit: safety cap on final max dimension (default 4000) // maxSideLimit: safety cap on final max dimension (default 4000)
//
// Each dimension is rounded UP to a multiple of kDetSizeBucket (96). The
// coarse granularity is deliberate: cuDNN HEURISTIC has to re-select
// convolution algorithms every time it sees a new input shape, and that
// selection costs ~100 ms per shape. With multiples of 32, a typical ALPR
// run produces 30+ unique detector shapes; with multiples of 96 that drops
// to 510, which cuDNN can cache and reuse for the rest of the video.
// 96 is divisible by the DBNet down-stride of 32, so feature-map sizes
// stay integer.
inline cv::Size ComputeDetResizeShape(int srcH, int srcW, int maxSideLen, inline cv::Size ComputeDetResizeShape(int srcH, int srcW, int maxSideLen,
int maxSideLimit = kDetMaxSideLimit) { int maxSideLimit = kDetMaxSideLimit) {
constexpr int kDetSizeBucket = 96;
float ratio = 1.0f; float ratio = 1.0f;
int maxSide = std::max(srcH, srcW); int maxSide = std::max(srcH, srcW);
if (maxSide > maxSideLen) { if (maxSide > maxSideLen) {
@@ -108,8 +119,12 @@ inline cv::Size ComputeDetResizeShape(int srcH, int srcW, int maxSideLen,
newW = static_cast<int>(newW * clampRatio); newW = static_cast<int>(newW * clampRatio);
} }
newH = std::max(32, static_cast<int>(std::round(newH / 32.0) * 32)); auto roundUpToBucket = [](int x) {
newW = std::max(32, static_cast<int>(std::round(newW / 32.0) * 32)); return std::max(kDetSizeBucket,
((x + kDetSizeBucket - 1) / kDetSizeBucket) * kDetSizeBucket);
};
newH = roundUpToBucket(newH);
newW = roundUpToBucket(newW);
return cv::Size(newW, newH); return cv::Size(newW, newH);
} }

View File

@@ -11,13 +11,75 @@ namespace onnxocr {
bool PaddleOCRV5Engine::Initialize(const std::string& detModelPath, bool PaddleOCRV5Engine::Initialize(const std::string& detModelPath,
const std::string& clsModelPath, const std::string& clsModelPath,
const std::string& recModelPath, const std::string& recModelPath,
const std::string& dictPath) { const std::string& dictPath,
bool preferTensorRT) {
std::lock_guard<std::recursive_mutex> lock(_mutex); std::lock_guard<std::recursive_mutex> lock(_mutex);
ModelLoadingGuard mlg(_modelLoading); ModelLoadingGuard mlg(_modelLoading);
// High-perf options. The OCR sub-models split into two groups:
//
// 1. Detector — its input shape varies continuously with every
// plate-ROI aspect ratio. TRT EP is a poor fit because it
// builds a fresh engine for each unique shape (minutes each).
// We keep it on CUDA EP with the largest cuDNN workspace and
// let cuDNN HEURISTIC handle the per-shape algo selection.
//
// 2. Classifier + Recognizer — fixed-bucket shapes (cls is
// [1,3,80,160], rec is [1,3,48,{320,480,640,960}]). These
// benefit massively from TRT EP because the engine is built
// once per shape and reused forever.
OrtHandlerOptions detectorOpts;
// Detector uses CUDA EP with *conservative* cuDNN workspace.
// Empirical: on VRAM-constrained GPUs (LPD TRT engine + rec TRT
// engine + ORT arena in play) the max-workspace mode causes cuDNN
// to pick Winograd/implicit-precomp-GEMM variants that silently
// fall back to slow NO-WORKSPACE algorithms when the big workspace
// can't be allocated. With "0" cuDNN picks algorithms that are
// known to fit and runs ~10x faster in practice.
detectorOpts.useMaxCudnnWorkspace = false;
detectorOpts.preferTensorRT = false; // never TRT for the detector
// Classifier (fixed [1,3,80,160]): TRT with no profile is fine.
OrtHandlerOptions classifierOpts;
classifierOpts.useMaxCudnnWorkspace = true;
classifierOpts.preferTensorRT = preferTensorRT;
classifierOpts.trtFP16 = true;
// Recognizer: needs a DYNAMIC profile so one TRT engine covers every
// (batch, bucket_width) pair we generate at runtime. Without this,
// each new shape triggers a ~80s engine rebuild mid-stream when a
// new plate appears or the plate count changes.
//
// Profile range:
// batch : 1 .. 16 (16 plates worth of crops is generous)
// H : 48 (fixed)
// W : 320 .. 960 (covers all 4 recognizer buckets)
//
// Query the actual input name from the .onnx file instead of
// hardcoding — PaddleOCR usually exports it as "x" but the name can
// vary across model versions.
OrtHandlerOptions recognizerOpts;
recognizerOpts.useMaxCudnnWorkspace = true;
recognizerOpts.preferTensorRT = preferTensorRT;
recognizerOpts.trtFP16 = true;
if (preferTensorRT) {
std::string recInputName = BasicOrtHandler::QueryModelInputName(recModelPath);
if (recInputName.empty()) {
std::cerr << "[PaddleOCRV5Engine] Could not query recognizer "
"input name — defaulting to 'x'" << std::endl;
recInputName = "x";
}
std::cout << "[PaddleOCRV5Engine] Recognizer input name: '"
<< recInputName << "' — building TRT dynamic profile "
<< "[batch=1..16, W=320..960]" << std::endl;
recognizerOpts.trtProfileMinShapes = recInputName + ":1x3x48x320";
recognizerOpts.trtProfileOptShapes = recInputName + ":4x3x48x480";
recognizerOpts.trtProfileMaxShapes = recInputName + ":16x3x48x960";
}
try { try {
// Initialize detector (also triggers EPLoader init in BasicOrtHandler) // Initialize detector (also triggers EPLoader init in BasicOrtHandler)
detector_ = std::make_unique<ONNXOCRDetector>(detModelPath); detector_ = std::make_unique<ONNXOCRDetector>(detModelPath, detectorOpts);
std::cout << "[PaddleOCRV5Engine] Detector initialized: " << detModelPath << std::endl; std::cout << "[PaddleOCRV5Engine] Detector initialized: " << detModelPath << std::endl;
// Ensure this DLL's copy of Ort::Global<void>::api_ is initialized. // Ensure this DLL's copy of Ort::Global<void>::api_ is initialized.
@@ -29,7 +91,7 @@ bool PaddleOCRV5Engine::Initialize(const std::string& detModelPath,
// Initialize classifier (optional) // Initialize classifier (optional)
if (!clsModelPath.empty()) { if (!clsModelPath.empty()) {
classifier_ = std::make_unique<ONNXOCRClassifier>(clsModelPath); classifier_ = std::make_unique<ONNXOCRClassifier>(clsModelPath, classifierOpts);
std::cout << "[PaddleOCRV5Engine] Classifier initialized: " << clsModelPath << std::endl; std::cout << "[PaddleOCRV5Engine] Classifier initialized: " << clsModelPath << std::endl;
} }
else { else {
@@ -38,13 +100,26 @@ bool PaddleOCRV5Engine::Initialize(const std::string& detModelPath,
} }
// Initialize recognizer // Initialize recognizer
recognizer_ = std::make_unique<ONNXOCRRecognizer>(recModelPath); recognizer_ = std::make_unique<ONNXOCRRecognizer>(recModelPath, recognizerOpts);
if (!recognizer_->LoadDictionary(dictPath)) { if (!recognizer_->LoadDictionary(dictPath)) {
std::cerr << "[PaddleOCRV5Engine] Failed to load dictionary" << std::endl; std::cerr << "[PaddleOCRV5Engine] Failed to load dictionary" << std::endl;
return false; return false;
} }
std::cout << "[PaddleOCRV5Engine] Recognizer initialized: " << recModelPath << std::endl; std::cout << "[PaddleOCRV5Engine] Recognizer initialized: " << recModelPath << std::endl;
// Pre-warm classifier (fixed [1,3,80,160]) and recognizer (4
// bucket widths) so the first frame doesn't pay the cuDNN/TRT
// algorithm-selection tax. The detector is intentionally NOT
// warmed up: its input shape varies continuously with each
// plate-ROI aspect ratio, so a warmup at any single canonical
// shape would cost minutes (TRT) or be useless (CUDA cache miss
// on the real frame anyway). Real frames will pay the per-shape
// cuDNN HEURISTIC cost on first use.
std::cout << "[PaddleOCRV5Engine] Warming up OCR pipeline..." << std::endl;
if (classifier_) classifier_->Warmup();
if (recognizer_) recognizer_->Warmup();
std::cout << "[PaddleOCRV5Engine] Warmup complete" << std::endl;
_initialized = true; _initialized = true;
std::cout << "[PaddleOCRV5Engine] Pipeline initialized successfully" << std::endl; std::cout << "[PaddleOCRV5Engine] Pipeline initialized successfully" << std::endl;
return true; return true;
@@ -140,5 +215,18 @@ TextLine PaddleOCRV5Engine::recognizeOnly(const cv::Mat& croppedImage) {
return recognizer_->Recognize(croppedImage); return recognizer_->Recognize(croppedImage);
} }
std::vector<TextLine> PaddleOCRV5Engine::recognizeMany(const std::vector<cv::Mat>& croppedImages) {
if (_modelLoading.load()) return std::vector<TextLine>(croppedImages.size());
{
auto lk = TryLockWithTimeout("PaddleOCRV5Engine::recognizeMany");
if (!lk.owns_lock()) return std::vector<TextLine>(croppedImages.size());
if (!_initialized || !recognizer_ || croppedImages.empty()) {
return std::vector<TextLine>(croppedImages.size());
}
}
// Delegates to the bucketed, batched path in ONNXOCRRecognizer.
return recognizer_->RecognizeBatch(croppedImages);
}
} // namespace onnxocr } // namespace onnxocr
} // namespace ANSCENTER } // namespace ANSCENTER

View File

@@ -25,10 +25,13 @@ public:
// Initialize the OCR pipeline // Initialize the OCR pipeline
// clsModelPath can be empty to skip classification // clsModelPath can be empty to skip classification
// preferTensorRT: try TensorRT EP first for the three sub-models
// (cuDNN-friendly cuDNN max-workspace mode either way)
bool Initialize(const std::string& detModelPath, bool Initialize(const std::string& detModelPath,
const std::string& clsModelPath, const std::string& clsModelPath,
const std::string& recModelPath, const std::string& recModelPath,
const std::string& dictPath); const std::string& dictPath,
bool preferTensorRT = false);
// Run full OCR pipeline on an image // Run full OCR pipeline on an image
// Returns results matching PaddleOCR::OCRPredictResult format // Returns results matching PaddleOCR::OCRPredictResult format
@@ -37,6 +40,14 @@ public:
// Run recognizer only on a pre-cropped text image (no detection step) // Run recognizer only on a pre-cropped text image (no detection step)
TextLine recognizeOnly(const cv::Mat& croppedImage); TextLine recognizeOnly(const cv::Mat& croppedImage);
// Run recognizer only on a batch of pre-cropped text images in a
// single batched ORT inference. Skips the detector entirely — the
// caller is expected to supply crops that are already roughly
// axis-aligned single-line text (e.g. ALPR plate ROIs, optionally
// pre-split into rows). Crops are grouped by bucket width, so a
// single call to this function typically issues 12 ORT Runs total.
std::vector<TextLine> recognizeMany(const std::vector<cv::Mat>& croppedImages);
// Configuration setters (matching OCRModelConfig parameters) // Configuration setters (matching OCRModelConfig parameters)
void SetDetMaxSideLen(int val) { _maxSideLen = val; } void SetDetMaxSideLen(int val) { _maxSideLen = val; }
void SetDetDbThresh(float val) { _detDbThresh = val; } void SetDetDbThresh(float val) { _detDbThresh = val; }

View File

@@ -50,7 +50,8 @@ bool ANSONNXOCR::Initialize(const std::string& licenseKey, OCRModelConfig modelC
_modelConfig.detectionModelFile, _modelConfig.detectionModelFile,
clsModelPath, clsModelPath,
_modelConfig.recognizerModelFile, _modelConfig.recognizerModelFile,
_modelConfig.recogizerCharDictionaryPath); _modelConfig.recogizerCharDictionaryPath,
_modelConfig.useTensorRT);
return _isInitialized; return _isInitialized;
} }
@@ -391,4 +392,16 @@ std::pair<std::string, float> ANSONNXOCR::RecognizeText(const cv::Mat& croppedIm
return {result.text, result.score}; return {result.text, result.score};
} }
std::vector<std::pair<std::string, float>> ANSONNXOCR::RecognizeTextBatch(
const std::vector<cv::Mat>& croppedImages) {
std::vector<std::pair<std::string, float>> out(croppedImages.size(), {"", 0.0f});
if (!_isInitialized || !_engine || croppedImages.empty()) return out;
auto lines = _engine->recognizeMany(croppedImages);
for (size_t i = 0; i < lines.size() && i < out.size(); ++i) {
out[i] = { lines[i].text, lines[i].score };
}
return out;
}
} // namespace ANSCENTER } // namespace ANSCENTER

View File

@@ -24,6 +24,8 @@ namespace ANSCENTER {
std::vector<ANSCENTER::OCRObject> RunInference(const cv::Mat& input, const std::vector<cv::Rect>& Bbox, const std::string& cameraId) override; std::vector<ANSCENTER::OCRObject> RunInference(const cv::Mat& input, const std::vector<cv::Rect>& Bbox, const std::string& cameraId) override;
std::pair<std::string, float> RecognizeText(const cv::Mat& croppedImage) override; std::pair<std::string, float> RecognizeText(const cv::Mat& croppedImage) override;
std::vector<std::pair<std::string, float>> RecognizeTextBatch(
const std::vector<cv::Mat>& croppedImages) override;
~ANSONNXOCR(); ~ANSONNXOCR();
bool Destroy() override; bool Destroy() override;

View File

@@ -1367,8 +1367,8 @@ int TestGetImage() {
} }
int GenerateVideo() { int GenerateVideo() {
std::string imageFolder = "E:\\Programs\\DemoAssets\\ImageSeries\\20260413_152604.321"; std::string imageFolder = "E:\\Programs\\DemoAssets\\ImageSeries\\20260413_152604.321";
std::string outputVideoPath = "E:\\Programs\\DemoAssets\\ImageSeries\\output1.mp4"; std::string outputVideoPath = "E:\\Programs\\DemoAssets\\ImageSeries\\output3.mp4";
int conversionResult = ANSCV_ImagesToMP4_S(imageFolder.c_str(), outputVideoPath.c_str(), 0, 5); int conversionResult = ANSCV_ImagesToMP4_S(imageFolder.c_str(), outputVideoPath.c_str(), 0,20);
if (!conversionResult) { if (!conversionResult) {
std::cerr << "Failed to convert images to MP4." << std::endl; std::cerr << "Failed to convert images to MP4." << std::endl;
return -1; return -1;

View File

@@ -3805,6 +3805,149 @@ int ALPR_OCR_Test() {
return 0; return 0;
} }
int ALPR_OCR_VideoTest() {
std::cout << "=== ALPR_OCR_VideoTest: ANSALPR_OCR engine on video ===" << std::endl;
std::filesystem::path currentPath = std::filesystem::current_path();
std::cout << "Current working directory: " << currentPath << std::endl;
ANSCENTER::ANSALPR* infHandle = nullptr;
std::string licenseKey = "";
std::string modelFilePath = "C:\\Projects\\ANSVIS\\Models\\ANS_GenericALPR_v2.0.zip";
std::string videoFilePath = "E:\\Programs\\DemoAssets\\Videos\\ALRP\\PMH\\Day\\day.mp4";
int engineType = 2; // ANSALPR_OCR
double detectionThreshold = 0.3;
double ocrThreshold = 0.5;
double colourThreshold = 0.0;
// Step 1: Create handle
int createResult = CreateANSALPRHandle(&infHandle, licenseKey.c_str(),
modelFilePath.c_str(), "", engineType, detectionThreshold, ocrThreshold, colourThreshold);
std::cout << "CreateANSALPRHandle result: " << createResult << std::endl;
if (!createResult || !infHandle) {
std::cerr << "Failed to create ANSALPR_OCR handle" << std::endl;
return -1;
}
// Step 2: Set country (JAPAN = 5 — adjust to match the dataset if needed)
ANSALPR_SetCountry(&infHandle, 5);
std::cout << "Country set to JAPAN" << std::endl;
// Step 3: Load engine
auto engineStart = std::chrono::high_resolution_clock::now();
int loadResult = LoadANSALPREngineHandle(&infHandle);
auto engineEnd = std::chrono::high_resolution_clock::now();
double engineMs = std::chrono::duration<double, std::milli>(engineEnd - engineStart).count();
std::cout << "LoadANSALPREngineHandle result: " << loadResult << " (" << engineMs << " ms)" << std::endl;
if (!loadResult) {
std::cerr << "Failed to load ANSALPR_OCR engine" << std::endl;
ReleaseANSALPRHandle(&infHandle);
return -2;
}
// Step 4: Open video
cv::VideoCapture capture(videoFilePath);
if (!capture.isOpened()) {
std::cerr << "Could not open video file: " << videoFilePath << std::endl;
ReleaseANSALPRHandle(&infHandle);
return -3;
}
boost::property_tree::ptree pt;
int frameIdx = 0;
while (true) {
cv::Mat frame;
if (!capture.read(frame)) {
std::cout << "\nEnd of video stream.\n";
break;
}
++frameIdx;
int width = frame.cols;
int height = frame.rows;
// Convert to raw BGR bytes for ANSALPR_RunInferenceBinary
unsigned int bufferLength = static_cast<unsigned int>(frame.total() * frame.elemSize());
unsigned char* imageBytes = new unsigned char[bufferLength];
std::memcpy(imageBytes, frame.data, bufferLength);
auto t0 = std::chrono::high_resolution_clock::now();
std::string detectionResult = ANSALPR_RunInferenceBinary(&infHandle, imageBytes, width, height);
auto t1 = std::chrono::high_resolution_clock::now();
double inferMs = std::chrono::duration<double, std::milli>(t1 - t0).count();
delete[] imageBytes;
printf("Frame %d: %.2f ms (%.1f FPS)\n", frameIdx, inferMs,
inferMs > 0.0 ? (1000.0 / inferMs) : 0.0);
// Draw detections
if (!detectionResult.empty()) {
try {
pt.clear();
std::stringstream ss(detectionResult);
boost::property_tree::read_json(ss, pt);
BOOST_FOREACH(const boost::property_tree::ptree::value_type& child, pt.get_child("results")) {
const boost::property_tree::ptree& res = child.second;
const auto class_name_raw = GetData<std::string>(res, "class_name");
const std::string class_name = DecodeUnicodeEscapes(class_name_raw);
const auto x = GetData<int>(res, "x");
const auto y = GetData<int>(res, "y");
const auto w = GetData<int>(res, "width");
const auto h = GetData<int>(res, "height");
cv::rectangle(frame, cv::Rect(x, y, w, h), cv::Scalar(0, 255, 0), 2);
std::string extraInfo = GetOptionalValue<std::string>(res, "extra_info", "");
if (!class_name.empty()) {
std::cout << " Plate: " << class_name;
if (!extraInfo.empty()) std::cout << " (" << extraInfo << ")";
std::cout << std::endl;
}
#ifdef WIN32
{
int textH = (int)(1.5 * 30);
int ty = y - 5 - textH;
if (ty < 0) ty = y + 3;
putTextUnicode(frame, class_name, cv::Point(x, ty),
1.5, cv::Scalar(0, 0, 255), 3);
}
#else
cv::putText(frame, class_name, cv::Point(x, y - 5),
cv::FONT_HERSHEY_SIMPLEX, 1.0, cv::Scalar(0, 0, 255), 2, cv::LINE_AA);
#endif
}
}
catch (const std::exception& e) {
std::cerr << "JSON parse error: " << e.what() << std::endl;
}
}
// Display (fit to 1920x1080)
cv::Mat display;
double scale = std::min(1920.0 / frame.cols, 1080.0 / frame.rows);
if (scale < 1.0) {
cv::resize(frame, display, cv::Size(), scale, scale);
} else {
display = frame;
}
cv::namedWindow("ALPR_OCR_VideoTest", cv::WINDOW_AUTOSIZE);
cv::imshow("ALPR_OCR_VideoTest", display);
if (cv::waitKey(1) == 27) { // ESC to exit
std::cout << "ESC pressed — stopping.\n";
break;
}
}
capture.release();
cv::destroyAllWindows();
ReleaseANSALPRHandle(&infHandle);
std::cout << "=== ALPR_OCR_VideoTest complete ===" << std::endl;
return 0;
}
int main() int main()
{ {
#ifdef WIN32 #ifdef WIN32
@@ -3825,7 +3968,8 @@ int main()
//ANSLPR_MultiGPU_StressTest_SimulatedCam(); //ANSLPR_MultiGPU_StressTest_SimulatedCam();
// ANSLPR_MultiGPU_StressTest_FilePlayer(); // ANSLPR_MultiGPU_StressTest_FilePlayer();
//ANSLPR_OD_CPU_VideoTest(); //ANSLPR_OD_CPU_VideoTest();
ALPR_OCR_Test(); //ALPR_OCR_Test();
ALPR_OCR_VideoTest();
return 0; return 0;
} }