diff --git a/.claude/settings.local.json b/.claude/settings.local.json
index 32e701d..0767e3d 100644
--- a/.claude/settings.local.json
+++ b/.claude/settings.local.json
@@ -134,7 +134,16 @@
       "Bash(python /tmp/apply_fd_guards.py)",
       "Bash(python /tmp/apply_fd_precheck.py)",
       "Bash(ls /c/Projects/CLionProjects/ANSCORE/modules/ANSFR/*.cpp /c/Projects/CLionProjects/ANSCORE/modules/ANSFR/*.h)",
-      "Bash(grep -rn \"lock_guard.*_mutex\\\\|lock_guard.*mutex\" /c/Projects/CLionProjects/ANSCORE/modules/ANSFR/*.cpp)"
+      "Bash(grep -rn \"lock_guard.*_mutex\\\\|lock_guard.*mutex\" /c/Projects/CLionProjects/ANSCORE/modules/ANSFR/*.cpp)",
+      "Bash(grep -l \"TensorRT\\\\|tensorrt_provider_factory\\\\|OrtTensorRTProviderOptionsV2\\\\|CreateTensorRTProviderOptions\\\\|UpdateTensorRTProviderOptions\" \"C:/ANSLibs/onnxruntime/include/\"*.h)",
+      "Bash(cmake --build . --target ONNXEngine ANSOCR ANSLPR ANSLPR-UnitTest --config Release -- -j 4)",
+      "Bash(cmake --build . --target help)",
+      "Bash(cmake --build . --target ANSLPR-UnitTest)",
+      "Bash(cmd.exe /c \"call \\\\\"C:\\\\\\\\Program Files\\\\\\\\Microsoft Visual Studio\\\\\\\\2022\\\\\\\\Community\\\\\\\\VC\\\\\\\\Auxiliary\\\\\\\\Build\\\\\\\\vcvars64.bat\\\\\" >nul 2>&1 && cmake --build . --target ANSLPR-UnitTest 2>&1\")",
+      "Bash(cmd.exe //c \"cd /d C:\\\\\\\\Projects\\\\\\\\CLionProjects\\\\\\\\ANSCORE\\\\\\\\cmake-build-release && call \\\\\"C:\\\\\\\\Program Files\\\\\\\\Microsoft Visual Studio\\\\\\\\2022\\\\\\\\Community\\\\\\\\VC\\\\\\\\Auxiliary\\\\\\\\Build\\\\\\\\vcvars64.bat\\\\\" && cmake --build . --target ANSLPR-UnitTest\")",
+      "Bash(cmd.exe //c \"C:\\\\\\\\Projects\\\\\\\\CLionProjects\\\\\\\\ANSCORE\\\\\\\\cmake-build-release\\\\\\\\__build_check.bat\")",
+      "Bash(cmd.exe //c \"tasklist\")",
+      "Bash(cmd.exe //c \"taskkill /F /PID 45704\")"
     ]
   }
 }
diff --git a/engines/ONNXEngine/ONNXEngine.cpp b/engines/ONNXEngine/ONNXEngine.cpp
index d5549e3..7a2881b 100644
--- a/engines/ONNXEngine/ONNXEngine.cpp
+++ b/engines/ONNXEngine/ONNXEngine.cpp
@@ -6,6 +6,8 @@
 #include <limits>
 #include <filesystem>
 #include <fstream>
+#include <cstdlib>
+#include <system_error>
 
 namespace ANSCENTER {
 
@@ -13,6 +15,40 @@ namespace ANSCENTER {
     // BasicOrtHandler — constructors
     // ====================================================================
 
+    std::string BasicOrtHandler::QueryModelInputName(const std::string& onnxPath)
+    {
+        // Reads the first input's name via a throwaway CPU-only session;
+        // every failure is logged to stderr and reported as "".
+        try {
+            // The Ort C++ wrappers need the API table bound in this module.
+            if (!Ort::Global<void>::api_)
+                Ort::InitApi(static_cast<const OrtApi*>(EPLoader::GetOrtApiRaw()));
+
+            Ort::Env probeEnv(ORT_LOGGING_LEVEL_ERROR, "QueryModelInputName");
+            Ort::SessionOptions probeOpts;
+            // No EP is appended and graph optimization is disabled: only
+            // metadata is read, no inference runs on this session.
+            probeOpts.SetIntraOpNumThreads(1);
+            probeOpts.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_DISABLE_ALL);
+
+            const std::wstring widePath(onnxPath.begin(), onnxPath.end());
+            Ort::Session probe(probeEnv, widePath.c_str(), probeOpts);
+
+            Ort::AllocatorWithDefaultOptions allocator;
+            return std::string(probe.GetInputNameAllocated(0, allocator).get());
+        }
+        catch (const Ort::Exception& ortErr) {
+            std::cerr << "[QueryModelInputName] ORT exception: "
+                      << ortErr.what() << " (path=" << onnxPath << ")" << std::endl;
+            return "";
+        }
+        catch (const std::exception& stdErr) {
+            std::cerr << "[QueryModelInputName] std exception: "
+                      << stdErr.what() << " (path=" << onnxPath << ")" << std::endl;
+            return "";
+        }
+    }
+
     BasicOrtHandler::BasicOrtHandler(const std::string& _onnx_path,
         unsigned int _num_threads)
         : log_id(_onnx_path.data()),
@@ -36,6 +72,33 @@ namespace ANSCENTER {
         initialize_handler();
     }
 
+    BasicOrtHandler::BasicOrtHandler(const std::string& _onnx_path,  // explicit-engine + options overload
+        EngineType engineType,              // stored as-is in m_engineType
+        const OrtHandlerOptions& options,   // copied into m_handlerOptions
+        unsigned int _num_threads)
+        : log_id(_onnx_path.data()),        // NOTE(review): aliases the caller's buffer if log_id is a raw pointer — confirm lifetime
+        num_threads(_num_threads),
+        m_engineType(engineType),
+        m_handlerOptions(options),
+        onnx_path_w(_onnx_path.begin(), _onnx_path.end())  // element-wise widening; no encoding conversion is performed
+    {
+        onnx_path = onnx_path_w.c_str();    // taken from the member copy constructed above
+        initialize_handler();
+    }
+
+    BasicOrtHandler::BasicOrtHandler(const std::string& _onnx_path,  // options overload without an engine argument
+        const OrtHandlerOptions& options,   // copied into m_handlerOptions
+        unsigned int _num_threads)
+        : log_id(_onnx_path.data()),        // NOTE(review): aliases the caller's buffer if log_id is a raw pointer — confirm lifetime
+        num_threads(_num_threads),
+        m_engineType(static_cast<EngineType>(-1)),  // EPLoader auto-detect
+        m_handlerOptions(options),
+        onnx_path_w(_onnx_path.begin(), _onnx_path.end())  // element-wise widening; no encoding conversion is performed
+    {
+        onnx_path = onnx_path_w.c_str();    // taken from the member copy constructed above
+        initialize_handler();
+    }
+
     BasicOrtHandler::~BasicOrtHandler()
     {
         if (ort_session) {
@@ -66,11 +129,15 @@ namespace ANSCENTER {
             //  - arena_extend_strategy = 1 (kSameAsRequested) to avoid
             //    pre-allocating huge GPU memory blocks that may exceed VRAM
             //  - cudnn_conv_algo_search = HEURISTIC for faster session init
-            //  - cudnn_conv_use_max_workspace = 0 — use minimal cuDNN workspace
-            //    to prevent CUDNN_BACKEND_API_FAILED when TRT engines already
-            //    occupy most VRAM on the same GPU
+            //  - cudnn_conv_use_max_workspace defaults to "0" to prevent
+            //    CUDNN_BACKEND_API_FAILED when TRT engines already occupy
+            //    most VRAM on the same GPU. OCR sub-models that need fast
+            //    convs opt into "1" via OrtHandlerOptions::useMaxCudnnWorkspace
             //  - gpu_mem_limit — cap ONNX Runtime's GPU memory arena to 2 GB
             //    so it doesn't compete with TensorRT for the remaining VRAM
+            const char* maxWorkspace =
+                m_handlerOptions.useMaxCudnnWorkspace ? "1" : "0";
+
             const char* keys[] = {
                 "device_id",
                 "arena_extend_strategy",
@@ -82,7 +149,7 @@ namespace ANSCENTER {
                 "0",
                 "1",              // kSameAsRequested
                 "HEURISTIC",      // avoid exhaustive algo search on large model
-                "0",              // minimal cuDNN workspace (prevents OOM)
+                maxWorkspace,     // "1" for OCR (perf), "0" elsewhere (safety)
                 "2147483648"      // 2 GB arena limit
             };
             Ort::GetApi().UpdateCUDAProviderOptions(
@@ -92,7 +159,8 @@ namespace ANSCENTER {
             Ort::GetApi().ReleaseCUDAProviderOptions(cuda_options);
 
             std::cout << "[ORT] CUDA EP attached (arena=SameAsRequested, "
-                         "cudnn=HEURISTIC, maxWorkspace=0, memLimit=2GB)." << std::endl;
+                         "cudnn=HEURISTIC, maxWorkspace=" << maxWorkspace
+                      << ", memLimit=2GB)." << std::endl;
             return true;
         }
         catch (const Ort::Exception& e) {
@@ -100,6 +168,113 @@ namespace ANSCENTER {
             return false;
         }
     }
+    // Attach the TensorRT EP configured from m_handlerOptions; returns false (after logging) on failure.
+    bool BasicOrtHandler::TryAppendTensorRT(Ort::SessionOptions& session_options)
+    {
+        OrtTensorRTProviderOptionsV2* trt_options = nullptr;  // released on success and in both handlers
+        try {
+            Ort::ThrowOnError(Ort::GetApi().CreateTensorRTProviderOptions(&trt_options));
+
+            // Cache built engines on disk so subsequent runs skip the
+            // multi-minute build. Engines are keyed on (model hash, GPU
+            // arch, shape profile) so changing any of those triggers
+            // a rebuild automatically.
+            std::string cacheDir = m_handlerOptions.trtEngineCacheDir;
+            if (cacheDir.empty()) {
+                // %TEMP%\ANSCENTER\TRTEngineCache
+                const char* tmp = std::getenv("TEMP");
+                if (!tmp) tmp = std::getenv("TMP");
+                if (!tmp) tmp = ".";
+                std::filesystem::path p(tmp);
+                p /= "ANSCENTER";
+                p /= "TRTEngineCache";
+                std::error_code ec;
+                std::filesystem::create_directories(p, ec);
+                cacheDir = p.string();
+            }
+
+            // Builder options tuned for *fast first-run*:
+            //   - opt_level 1: builds in seconds, ~5–10 % runtime cost vs 3
+            //   - workspace 1 GB: leaves room for CUDA EP arena and the
+            //     LPD's own TRT engine on the same GPU
+            //   - timing cache: persists kernel timings between runs so
+            //     builds at new shapes get progressively faster
+            //   - profile shapes (if set): build ONE dynamic-shape
+            //     engine that handles all (batch, width) combos instead
+            //     of rebuilding per unique input. Critical for variable
+            //     batch workloads — without this, TRT EP rebuilds every
+            //     time runtime sees a new shape pair, causing 60-90 s
+            //     hangs mid-stream.
+
+            const bool haveProfile = !m_handlerOptions.trtProfileMinShapes.empty()
+                                  && !m_handlerOptions.trtProfileOptShapes.empty()
+                                  && !m_handlerOptions.trtProfileMaxShapes.empty();
+
+            // Build the key/value arrays. We always set the first 8 keys;
+            // the profile shapes are appended only when provided.
+            std::vector<const char*> keys = {
+                "device_id",
+                "trt_fp16_enable",
+                "trt_engine_cache_enable",
+                "trt_engine_cache_path",
+                "trt_max_workspace_size",
+                "trt_builder_optimization_level",
+                "trt_timing_cache_enable",
+                "trt_timing_cache_path"
+            };
+            std::vector<const char*> values = {
+                "0",
+                m_handlerOptions.trtFP16 ? "1" : "0",
+                "1",
+                cacheDir.c_str(),
+                "1073741824",     // 1 GB build workspace
+                "1",              // fast build (was "3")
+                "1",
+                cacheDir.c_str()  // timing cache shares the engine-cache directory
+            };
+
+            if (haveProfile) {
+                keys.push_back("trt_profile_min_shapes");
+                values.push_back(m_handlerOptions.trtProfileMinShapes.c_str());
+                keys.push_back("trt_profile_opt_shapes");
+                values.push_back(m_handlerOptions.trtProfileOptShapes.c_str());
+                keys.push_back("trt_profile_max_shapes");
+                values.push_back(m_handlerOptions.trtProfileMaxShapes.c_str());
+            }
+
+            Ort::ThrowOnError(Ort::GetApi().UpdateTensorRTProviderOptions(
+                trt_options, keys.data(), values.data(), keys.size()));
+
+            session_options.AppendExecutionProvider_TensorRT_V2(*trt_options);
+            Ort::GetApi().ReleaseTensorRTProviderOptions(trt_options);
+            trt_options = nullptr;  // prevents a second release if the logging below throws
+
+            std::cout << "[ORT] TensorRT EP attached (fp16="
+                      << (m_handlerOptions.trtFP16 ? "1" : "0")
+                      << ", cache=" << cacheDir
+                      << ", profile=" << (haveProfile ? "dynamic" : "static")
+                      << ")." << std::endl;
+            if (haveProfile) {
+                std::cout << "[ORT]   profile min: "
+                          << m_handlerOptions.trtProfileMinShapes << std::endl
+                          << "[ORT]   profile opt: "
+                          << m_handlerOptions.trtProfileOptShapes << std::endl
+                          << "[ORT]   profile max: "
+                          << m_handlerOptions.trtProfileMaxShapes << std::endl;
+            }
+            return true;
+        }
+        catch (const Ort::Exception& e) {
+            if (trt_options) Ort::GetApi().ReleaseTensorRTProviderOptions(trt_options);
+            std::cerr << "[ORT] TensorRT EP failed: " << e.what() << std::endl;
+            return false;
+        }
+        catch (const std::exception& e) {
+            if (trt_options) Ort::GetApi().ReleaseTensorRTProviderOptions(trt_options);
+            std::cerr << "[ORT] TensorRT EP failed (std): " << e.what() << std::endl;
+            return false;
+        }
+    }
     bool BasicOrtHandler::TryAppendDirectML(Ort::SessionOptions& session_options)
     {
         try {
@@ -267,9 +442,28 @@ namespace ANSCENTER {
         {
             // --------------------------------------------------------
         case EngineType::NVIDIA_GPU:
+            // Try TensorRT EP first when explicitly requested. Falls
+            // through to CUDA EP if TRT is missing or option creation
+            // fails. Both EPs may be attached at once — ORT picks TRT
+            // for nodes it supports and CUDA for the rest.
+            if (m_handlerOptions.preferTensorRT
+                && hasProvider("TensorrtExecutionProvider")) {
+                ANS_DBG("OrtHandler", "Trying TensorRT EP...");
+                if (TryAppendTensorRT(session_options)) {
+                    epAttached = true;
+                }
+                else {
+                    std::cerr << "[ORT] TensorRT EP attach failed — "
+                                 "falling back to CUDA EP." << std::endl;
+                }
+            }
+
             ANS_DBG("OrtHandler", "Trying CUDA EP...");
-            if (hasProvider("CUDAExecutionProvider"))
-                epAttached = TryAppendCUDA(session_options);
+            if (hasProvider("CUDAExecutionProvider")) {
+                if (TryAppendCUDA(session_options)) {
+                    epAttached = true;
+                }
+            }
             if (!epAttached) {
                 std::cerr << "[ORT] CUDA EP unavailable — falling back to CPU."
                 << std::endl;
diff --git a/engines/ONNXEngine/ONNXEngine.h b/engines/ONNXEngine/ONNXEngine.h
index ed957b5..44f7418 100644
--- a/engines/ONNXEngine/ONNXEngine.h
+++ b/engines/ONNXEngine/ONNXEngine.h
@@ -248,6 +248,46 @@ namespace ANSCENTER {
         return std::string(ort_session->GetOutputNameAllocated(index, allocator).get());
     }
 
+    // ====================================================================
+    // OrtHandlerOptions — per-session tuning knobs for BasicOrtHandler.
+    // A default-constructed instance keeps the legacy setup: no TensorRT
+    // EP attempt and the minimal ("0") cuDNN workspace on the CUDA EP.
+    // ====================================================================
+    struct OrtHandlerOptions {
+        // NVIDIA path only: when true and the TensorRT provider is listed,
+        // the TensorRT EP is appended first. The CUDA EP is still attempted
+        // afterwards, so a failed TensorRT attach can fall back to CUDA.
+        bool preferTensorRT{false};
+
+        // true  -> CUDA EP max-workspace option is set to "1" so cuDNN may
+        //          choose algorithms that need a large scratch buffer.
+        // false -> "0"; kept as the default for deployments whose VRAM is
+        //          shared with TRT engines and would otherwise risk OOM.
+        bool useMaxCudnnWorkspace{false};
+
+        // Directory for the TensorRT engine and timing caches. Leave empty
+        // to use <TEMP or TMP>/ANSCENTER/TRTEngineCache ("." if neither is set).
+        std::string trtEngineCacheDir;
+
+        // Forwarded as trt_fp16_enable. Has no effect unless the TensorRT
+        // EP is actually attempted (see preferTensorRT).
+        bool trtFP16{true};
+
+        // Optional dynamic-shape profile for the TensorRT EP. The three
+        // strings are forwarded as trt_profile_min/opt/max_shapes, and
+        // only when ALL of them are non-empty; otherwise no profile is
+        // passed and the EP runs without an explicit shape range.
+        //
+        // Format: "input_name:d0xd1xd2xd3[,input2:...]"
+        //   e.g. "x:1x3x48x320"  for batch=1, C=3, H=48, W=320
+        //
+        // QueryModelInputName() can supply the input name needed to
+        // build these strings before the real session exists.
+        std::string trtProfileMinShapes;
+        std::string trtProfileOptShapes;
+        std::string trtProfileMaxShapes;
+    };
+
     // ====================================================================
     // BasicOrtHandler
     // ====================================================================
@@ -280,6 +320,9 @@ namespace ANSCENTER {
         const unsigned int num_threads;
         EngineType m_engineType;
 
+        // Per-session high-perf options. Default = legacy behavior.
+        OrtHandlerOptions m_handlerOptions;
+
     protected:
         // Default: hardware auto-detection via ANSLicenseHelper through EPLoader
         explicit BasicOrtHandler(const std::string& _onnx_path,
@@ -290,6 +333,19 @@ namespace ANSCENTER {
             EngineType engineType,
             unsigned int _num_threads = 1);
 
+        // Engine override + per-session high-perf options (TRT EP, max
+        // cuDNN workspace, etc.).  Used by OCR sub-models that need
+        // shape-stable, high-throughput inference.
+        explicit BasicOrtHandler(const std::string& _onnx_path,
+            EngineType engineType,
+            const OrtHandlerOptions& options,
+            unsigned int _num_threads = 1);
+
+        // Auto-detect engine via EPLoader, but with high-perf options.
+        explicit BasicOrtHandler(const std::string& _onnx_path,
+            const OrtHandlerOptions& options,
+            unsigned int _num_threads = 1);
+
         virtual ~BasicOrtHandler();
 
         BasicOrtHandler(const BasicOrtHandler&) = delete;
@@ -298,6 +354,13 @@ namespace ANSCENTER {
         // Resolved EP type (after EPLoader fallback). Subclasses use this
         // to branch on actual EP at inference time.
         EngineType getEngineType() const { return m_engineType; }
+
+        // Spin up a tiny CPU-only ORT session just long enough to read
+        // the name of the model's first input, then tear it down. Used
+        // by callers that need to build TRT profile-shape strings
+        // (which require the input name) BEFORE the real session is
+        // created. Returns an empty string on failure.
+        static std::string QueryModelInputName(const std::string& onnxPath);
     private:
         void initialize_handler();
     protected:
@@ -306,6 +369,7 @@ namespace ANSCENTER {
 
         // EP-specific session option builders
         bool TryAppendCUDA(Ort::SessionOptions& opts);
+        bool TryAppendTensorRT(Ort::SessionOptions& opts);
         bool TryAppendDirectML(Ort::SessionOptions& opts);
         bool TryAppendOpenVINO(Ort::SessionOptions& opts);
     };
diff --git a/modules/ANSLPR/ANSLPR_OCR.cpp b/modules/ANSLPR/ANSLPR_OCR.cpp
index f7f0cfb..a2f8383 100644
--- a/modules/ANSLPR/ANSLPR_OCR.cpp
+++ b/modules/ANSLPR/ANSLPR_OCR.cpp
@@ -363,10 +363,14 @@ namespace ANSCENTER
 			ocrModelConfig.ocrLanguage = ocrLang;
 			ocrModelConfig.useDetector = true;
 			ocrModelConfig.useRecognizer = true;
-			ocrModelConfig.useCLS = true;
+			// Skip the angle classifier for ALPR. License-plate boxes
+			// from the YOLO detector are already axis-aligned, so the
+			// 180° classifier is dead weight (one extra ORT call per
+			// plate for no recall gain).
+			ocrModelConfig.useCLS = false;
 			ocrModelConfig.useLayout = false;
 			ocrModelConfig.useTable = false;
-			ocrModelConfig.useTensorRT = false;
+			ocrModelConfig.useTensorRT = true;
 			ocrModelConfig.enableMKLDNN = false;
 			ocrModelConfig.useDilation = true;
 			ocrModelConfig.useAngleCLS = false;
@@ -375,7 +379,7 @@ namespace ANSCENTER
 			ocrModelConfig.detectionBoxThreshold = 0.3;
 			ocrModelConfig.detectionDBUnclipRatio = 1.2;
 			ocrModelConfig.clsThreshold = 0.9;
-			ocrModelConfig.limitSideLen = 2560;
+			ocrModelConfig.limitSideLen = 480;
 
 			// Pass the original ALPR model zip path — ANSOCRBase::Initialize
 			// will extract it to the same folder (already done, so extraction
@@ -638,41 +642,104 @@ namespace ANSCENTER
 				return {};
 			}
 
-			std::vector<Object> output;
-			output.reserve(lprOutput.size());
+			// Step 2: Collect crops from every valid plate. Wide plates
+			// (aspect >= 2.0) are treated as a single text line; narrow
+			// plates (2-row layouts like Japanese) are split horizontally
+			// at H/2 into top and bottom rows. All crops go through a
+			// single batched recognizer call, bypassing the OCR text-line
+			// detector entirely — for ALPR the LP YOLO box already bounds
+			// the text region precisely.
+			struct PlateInfo {
+				size_t origIndex;                 // into lprOutput
+				std::vector<size_t> cropIndices;  // into allCrops
+				cv::Mat plateROI;                 // full (unsplit) ROI, kept for colour
+			};
+			std::vector<cv::Mat>   allCrops;
+			std::vector<PlateInfo> plateInfos;
+			allCrops.reserve(lprOutput.size() * 2);
+			plateInfos.reserve(lprOutput.size());
 
-			for (auto& lprObject : lprOutput) {
-				const cv::Rect& box = lprObject.box;
+			for (size_t i = 0; i < lprOutput.size(); ++i) {
+				const cv::Rect& box = lprOutput[i].box;
 
 				// Calculate safe cropped region
 				const int x1 = std::max(0, box.x);
 				const int y1 = std::max(0, box.y);
-				const int width = std::min(frameWidth - x1, box.width);
+				const int width  = std::min(frameWidth  - x1, box.width);
 				const int height = std::min(frameHeight - y1, box.height);
 
 				if (width <= 0 || height <= 0) continue;
 
-				cv::Rect lprPos(x1, y1, width, height);
-				cv::Mat plateROI = frame(lprPos);
+				cv::Mat plateROI = frame(cv::Rect(x1, y1, width, height));
 
-				// Step 2: Run OCR on the detected plate
-				std::string ocrText = RunOCROnPlate(plateROI, cameraId);
+				PlateInfo info;
+				info.origIndex = i;
+				info.plateROI  = plateROI;
 
-				if (ocrText.empty()) continue;
+				const float aspect = static_cast<float>(width) /
+				                     std::max(1, height);
 
+				// 2-row heuristic: aspect < 2.0 → split top/bottom.
+				// Threshold tuned to catch Japanese square plates
+				// (~1.5–1.9) while leaving wide EU/VN plates (3.0+)
+				// untouched.
+				if (aspect < 2.0f && height >= 24) {
+					const int halfH = height / 2;
+					info.cropIndices.push_back(allCrops.size());
+					allCrops.push_back(plateROI(cv::Rect(0, 0, width, halfH)));
+					info.cropIndices.push_back(allCrops.size());
+					allCrops.push_back(plateROI(cv::Rect(0, halfH, width, height - halfH)));
+				}
+				else {
+					info.cropIndices.push_back(allCrops.size());
+					allCrops.push_back(plateROI);
+				}
+
+				plateInfos.push_back(std::move(info));
+			}
+
+			if (allCrops.empty()) {
+				return {};
+			}
+
+			// Step 3: Single batched recognizer call for every crop.
+			// ONNXOCRRecognizer groups crops by bucket width and issues
+			// one ORT Run per bucket — typically 1–2 GPU calls for an
+			// entire frame regardless of plate count.
+			auto ocrResults = _ocrEngine->RecognizeTextBatch(allCrops);
+
+			// Step 4: Assemble per-plate output
+			std::vector<Object> output;
+			output.reserve(plateInfos.size());
+
+			for (const auto& info : plateInfos) {
+				std::string combinedText;
+				for (size_t cropIdx : info.cropIndices) {
+					if (cropIdx >= ocrResults.size()) continue;
+					const std::string& lineText = ocrResults[cropIdx].first;
+					if (lineText.empty()) continue;
+					if (!combinedText.empty()) combinedText += " ";
+					combinedText += lineText;
+				}
+				if (combinedText.empty()) continue;
+
+				Object lprObject = lprOutput[info.origIndex];
 				lprObject.cameraId = cameraId;
 
-				// Use ALPRChecker for text stabilization if enabled
+				// Cross-frame stabilization (unchanged)
 				if (_enableALPRChecker) {
-					lprObject.className = alprChecker.checkPlateByTrackId(cameraId, ocrText, lprObject.trackId);
-				} else {
-					lprObject.className = ocrText;
+					lprObject.className = alprChecker.checkPlateByTrackId(
+						cameraId, combinedText, lprObject.trackId);
+				}
+				else {
+					lprObject.className = combinedText;
 				}
 
 				if (lprObject.className.empty()) continue;
 
-				// Step 3: Colour detection (optional)
-				std::string colour = DetectLPColourCached(plateROI, cameraId, lprObject.className);
+				// Optional colour detection on the full plate ROI
+				std::string colour = DetectLPColourCached(
+					info.plateROI, cameraId, lprObject.className);
 				if (!colour.empty()) {
 					lprObject.extraInfo = "color:" + colour;
 				}
diff --git a/modules/ANSOCR/ANSOCRBase.h b/modules/ANSOCR/ANSOCRBase.h
index 185f7fe..66d06e4 100644
--- a/modules/ANSOCR/ANSOCRBase.h
+++ b/modules/ANSOCR/ANSOCRBase.h
@@ -159,6 +159,18 @@ namespace ANSCENTER {
 		// Returns recognized text and confidence. Default returns empty.
 		virtual std::pair<std::string, float> RecognizeText(const cv::Mat& croppedImage) { return {"", 0.0f}; }
 
+		// Recognize a batch of pre-cropped text-line images without running
+		// the text-line detector. This base version simply calls
+		// RecognizeText once per image, in order, so existing subclasses
+		// work unchanged; overrides may batch the work more efficiently.
+		virtual std::vector<std::pair<std::string, float>> RecognizeTextBatch(
+			const std::vector<cv::Mat>& croppedImages) {
+			std::vector<std::pair<std::string, float>> results;
+			results.reserve(croppedImages.size());
+			for (const cv::Mat& crop : croppedImages) results.emplace_back(RecognizeText(crop));
+			return results;
+		}
+
 		// ALPR configuration methods
 		void SetOCRMode(OCRMode mode);
 		OCRMode GetOCRMode() const;
diff --git a/modules/ANSOCR/ANSONNXOCR/ONNXOCRClassifier.cpp b/modules/ANSOCR/ANSONNXOCR/ONNXOCRClassifier.cpp
index 47e8008..8babdea 100644
--- a/modules/ANSOCR/ANSONNXOCR/ONNXOCRClassifier.cpp
+++ b/modules/ANSOCR/ANSONNXOCR/ONNXOCRClassifier.cpp
@@ -4,6 +4,7 @@
 #include <iostream>
 #include <algorithm>
 #include <cmath>
+#include <chrono>
 
 namespace ANSCENTER {
 namespace onnxocr {
@@ -12,6 +13,12 @@ ONNXOCRClassifier::ONNXOCRClassifier(const std::string& onnx_path, unsigned int
     : BasicOrtHandler(onnx_path, num_threads) {
 }
 
+ONNXOCRClassifier::ONNXOCRClassifier(const std::string& onnx_path,
+                                     const OrtHandlerOptions& options,
+                                     unsigned int num_threads)
+    : BasicOrtHandler(onnx_path, options, num_threads) {  // forwards to the base overload that takes options but no engine type
+}
+
 Ort::Value ONNXOCRClassifier::transform(const cv::Mat& mat) {
     cv::Mat resized;
     // Direct resize to 80x160 (PP-LCNet_x1_0_textline_ori)
@@ -103,5 +110,38 @@ void ONNXOCRClassifier::Classify(std::vector<cv::Mat>& img_list,
     }
 }
 
+// One dummy [1,3,kClsImageH,kClsImageW] run under _mutex; std exceptions are logged, not propagated.
+void ONNXOCRClassifier::Warmup() {
+    std::lock_guard<std::mutex> lock(_mutex);
+    if (_warmedUp || !ort_session) return;
+
+    try {
+        cv::Mat dummy(kClsImageH * 2, kClsImageW * 2, CV_8UC3, cv::Scalar(128, 128, 128));
+        cv::Mat resized;
+        cv::resize(dummy, resized, cv::Size(kClsImageW, kClsImageH));
+        resized.convertTo(resized, CV_32FC3);
+        auto inputData = NormalizeAndPermute(resized);
+
+        std::array<int64_t, 4> inputShape = { 1, 3, kClsImageH, kClsImageW };
+        Ort::Value inputTensor = Ort::Value::CreateTensor<float>(
+            *memory_info_handler, inputData.data(), inputData.size(),
+            inputShape.data(), inputShape.size());
+
+        auto t0 = std::chrono::high_resolution_clock::now();
+        (void)ort_session->Run(
+            Ort::RunOptions{ nullptr },
+            input_node_names.data(), &inputTensor, 1,
+            output_node_names.data(), num_outputs);
+        auto t1 = std::chrono::high_resolution_clock::now();
+        double ms = std::chrono::duration<double, std::milli>(t1 - t0).count();
+        std::cout << "[ONNXOCRClassifier] Warmup [1,3," << kClsImageH << ","
+                  << kClsImageW << "] " << ms << " ms" << std::endl;
+    }
+    catch (const std::exception& e) {  // base of Ort::Exception and cv::Exception, so preprocessing errors are caught too
+        std::cerr << "[ONNXOCRClassifier] Warmup failed: " << e.what() << std::endl;
+    }
+    _warmedUp = true;  // set even after a failed attempt, so warmup is tried only once
+}
+
 } // namespace onnxocr
 } // namespace ANSCENTER
diff --git a/modules/ANSOCR/ANSONNXOCR/ONNXOCRClassifier.h b/modules/ANSOCR/ANSONNXOCR/ONNXOCRClassifier.h
index 9bd5535..1147f43 100644
--- a/modules/ANSOCR/ANSONNXOCR/ONNXOCRClassifier.h
+++ b/modules/ANSOCR/ANSONNXOCR/ONNXOCRClassifier.h
@@ -11,6 +11,9 @@ namespace onnxocr {
 class ONNXOCRClassifier : public BasicOrtHandler {
 public:
     explicit ONNXOCRClassifier(const std::string& onnx_path, unsigned int num_threads = 1);
+    explicit ONNXOCRClassifier(const std::string& onnx_path,
+                               const OrtHandlerOptions& options,
+                               unsigned int num_threads = 1);
     ~ONNXOCRClassifier() override = default;
 
     // Classify text orientation for a list of cropped images
@@ -21,7 +24,12 @@ public:
                   std::vector<float>& cls_scores,
                   float cls_thresh = kClsThresh);
 
+    // Pre-warm cuDNN/TRT for the classifier's fixed [1,3,80,160] shape.
+    // Idempotent — no-op after the first call.
+    void Warmup();
+
 private:
+    bool _warmedUp = false;
     Ort::Value transform(const cv::Mat& mat) override;
     Ort::Value transformBatch(const std::vector<cv::Mat>& images) override;
 
diff --git a/modules/ANSOCR/ANSONNXOCR/ONNXOCRDetector.cpp b/modules/ANSOCR/ANSONNXOCR/ONNXOCRDetector.cpp
index 616cecf..a8a5c5b 100644
--- a/modules/ANSOCR/ANSONNXOCR/ONNXOCRDetector.cpp
+++ b/modules/ANSOCR/ANSONNXOCR/ONNXOCRDetector.cpp
@@ -7,6 +7,7 @@
 #include <iostream>
 #include <algorithm>
 #include <cmath>
+#include <chrono>
 
 namespace ANSCENTER {
 namespace onnxocr {
@@ -15,6 +16,12 @@ ONNXOCRDetector::ONNXOCRDetector(const std::string& onnx_path, unsigned int num_
     : BasicOrtHandler(onnx_path, num_threads) {
 }
 
+ONNXOCRDetector::ONNXOCRDetector(const std::string& onnx_path,
+                                 const OrtHandlerOptions& options,
+                                 unsigned int num_threads)
+    : BasicOrtHandler(onnx_path, options, num_threads) {  // forwards to the base overload that takes options but no engine type
+}
+
 Ort::Value ONNXOCRDetector::transform(const cv::Mat& mat) {
     // Not used directly - detection uses custom Preprocess + manual tensor creation
     // Provided to satisfy BasicOrtHandler pure virtual
@@ -308,5 +315,41 @@ std::vector<cv::Point2f> ONNXOCRDetector::UnclipPolygon(const std::array<cv::Poi
     return result;
 }
 
+// One dummy 320x320 run under _mutex; std exceptions are logged, not propagated.
+void ONNXOCRDetector::Warmup() {
+    std::lock_guard<std::mutex> lock(_mutex);
+    if (_warmedUp || !ort_session) return;
+
+    // 320x320 covers the typical license-plate ROI after LPD crop +
+    // multiple-of-32 rounding. cuDNN caches the algorithm for this
+    // shape so the first real inference doesn't pay the picker cost.
+    constexpr int kWarmupSide = 320;
+    try {
+        cv::Mat dummy(kWarmupSide, kWarmupSide, CV_8UC3, cv::Scalar(128, 128, 128));
+        cv::Mat dummyF;
+        dummy.convertTo(dummyF, CV_32FC3);
+        auto inputData = NormalizeAndPermute(dummyF);
+
+        std::array<int64_t, 4> inputShape = { 1, 3, kWarmupSide, kWarmupSide };
+        Ort::Value inputTensor = Ort::Value::CreateTensor<float>(
+            *memory_info_handler, inputData.data(), inputData.size(),
+            inputShape.data(), inputShape.size());
+
+        auto t0 = std::chrono::high_resolution_clock::now();
+        (void)ort_session->Run(
+            Ort::RunOptions{ nullptr },
+            input_node_names.data(), &inputTensor, 1,
+            output_node_names.data(), num_outputs);
+        auto t1 = std::chrono::high_resolution_clock::now();
+        double ms = std::chrono::duration<double, std::milli>(t1 - t0).count();
+        std::cout << "[ONNXOCRDetector] Warmup [1,3," << kWarmupSide << ","
+                  << kWarmupSide << "] " << ms << " ms" << std::endl;
+    }
+    catch (const std::exception& e) {  // base of Ort::Exception and cv::Exception, so preprocessing errors are caught too
+        std::cerr << "[ONNXOCRDetector] Warmup failed: " << e.what() << std::endl;
+    }
+    _warmedUp = true;  // set even after a failed attempt, so warmup is tried only once
+}
+
 } // namespace onnxocr
 } // namespace ANSCENTER
diff --git a/modules/ANSOCR/ANSONNXOCR/ONNXOCRDetector.h b/modules/ANSOCR/ANSONNXOCR/ONNXOCRDetector.h
index 02d5a5b..d8620c5 100644
--- a/modules/ANSOCR/ANSONNXOCR/ONNXOCRDetector.h
+++ b/modules/ANSOCR/ANSONNXOCR/ONNXOCRDetector.h
@@ -11,6 +11,9 @@ namespace onnxocr {
 class ONNXOCRDetector : public BasicOrtHandler {
 public:
     explicit ONNXOCRDetector(const std::string& onnx_path, unsigned int num_threads = 1);
+    explicit ONNXOCRDetector(const std::string& onnx_path,
+                             const OrtHandlerOptions& options,
+                             unsigned int num_threads = 1);
     ~ONNXOCRDetector() override = default;
 
     // Run text detection on an image
@@ -21,7 +24,12 @@ public:
                                 float unclipRatio = kDetUnclipRatio,
                                 bool useDilation  = false);
 
+    // Pre-warm cuDNN/TRT at a canonical 320x320 input so the first real
+    // call doesn't pay the algorithm-selection tax. Idempotent.
+    void Warmup();
+
 private:
+    bool _warmedUp = false;
     Ort::Value transform(const cv::Mat& mat) override;
     Ort::Value transformBatch(const std::vector<cv::Mat>& images) override;
 
diff --git a/modules/ANSOCR/ANSONNXOCR/ONNXOCRRecognizer.cpp b/modules/ANSOCR/ANSONNXOCR/ONNXOCRRecognizer.cpp
index 0fe881b..190a30a 100644
--- a/modules/ANSOCR/ANSONNXOCR/ONNXOCRRecognizer.cpp
+++ b/modules/ANSOCR/ANSONNXOCR/ONNXOCRRecognizer.cpp
@@ -7,6 +7,7 @@
 #include <cmath>
 #include <cfloat>
 #include <cstring>
+#include <chrono>
 
 namespace ANSCENTER {
 namespace onnxocr {
@@ -15,6 +16,12 @@ ONNXOCRRecognizer::ONNXOCRRecognizer(const std::string& onnx_path, unsigned int
     : BasicOrtHandler(onnx_path, num_threads) {
 }
 
+ONNXOCRRecognizer::ONNXOCRRecognizer(const std::string& onnx_path,
+                                     const OrtHandlerOptions& options,
+                                     unsigned int num_threads)
+    : BasicOrtHandler(onnx_path, options, num_threads) {  // all three arguments go straight to the base handler
+}
+
 bool ONNXOCRRecognizer::LoadDictionary(const std::string& dictPath) {
     keys_ = LoadDict(dictPath);
     if (keys_.size() < 2) {
@@ -46,6 +53,54 @@ Ort::Value ONNXOCRRecognizer::transformBatch(const std::vector<cv::Mat>& images)
     return Ort::Value(nullptr);
 }
 
+// ----------------------------------------------------------------------------
+// Width buckets — every recognizer input is padded up to one of these widths
+// before reaching ORT. This bounds the number of distinct input widths cuDNN
+// ever sees to four (per batch size), so its HEURISTIC algorithm cache can hit
+// on later calls instead of re-tuning per plate. Buckets cover the realistic range:
+//   320 px  → short Latin/Japanese plates (most common)
+//   480 px  → wider Latin plates with two rows of text
+//   640 px  → long single-row plates / multi-line stacked text
+//   960 px  → safety upper bound (== kRecImgMaxW)
+// ----------------------------------------------------------------------------
+static constexpr int kRecBucketWidths[] = { 320, 480, 640, 960 };  // ascending; RoundUpToBucket scans in this order
+static constexpr int kRecNumBuckets = sizeof(kRecBucketWidths) / sizeof(kRecBucketWidths[0]);  // element count of the table above
+
+int ONNXOCRRecognizer::RoundUpToBucket(int resizedW) const {
+    const int target = std::min(resizedW, imgMaxW_);
+    for (const int width : kRecBucketWidths) {
+        if (width >= target) return width;  // first bucket wide enough
+    }
+    return imgMaxW_;  // no bucket is wide enough for the clamped width
+}
+
+// Turns one crop into a normalized CHW float buffer whose row width is
+// exactly `bucketW`: the crop is resized by ResizeRecImage, normalized, and
+// any columns to the right of the resized image are left at 0.0f.
+static std::vector<float> PreprocessCropToBucket(const cv::Mat& crop,
+                                                 int imgH, int bucketW) {
+    cv::Mat scaled = ResizeRecImage(crop, imgH, bucketW);
+    const int scaledW = scaled.cols;
+    scaled.convertTo(scaled, CV_32FC3);
+    auto chw = NormalizeAndPermuteCls(scaled);
+
+    if (scaledW != bucketW) {
+        // CHW is 3*imgH consecutive rows, so the channel/row loops collapse
+        // into a single pass that copies each row into its wider slot.
+        std::vector<float> widened(3 * imgH * bucketW, 0.0f);
+        const int rowCount = 3 * imgH;
+        for (int row = 0; row < rowCount; ++row) {
+            std::memcpy(
+                &widened[row * bucketW],
+                &chw[row * scaledW],
+                scaledW * sizeof(float));
+        }
+        return widened;
+    }
+    // Already exactly bucketW wide: no padding needed.
+    return chw;
+}
+
 TextLine ONNXOCRRecognizer::Recognize(const cv::Mat& croppedImage) {
     std::lock_guard<std::mutex> lock(_mutex);
 
@@ -54,52 +109,27 @@ TextLine ONNXOCRRecognizer::Recognize(const cv::Mat& croppedImage) {
     }
 
     try {
-        // Preprocess: resize to fixed height, proportional width
+        // Step 1: aspect-preserving resize to height=imgH_, width capped
+        // at imgMaxW_. Then round resized width up to the next bucket.
         cv::Mat resized = ResizeRecImage(croppedImage, imgH_, imgMaxW_);
-        int resizedW = resized.cols;
+        const int bucketW = RoundUpToBucket(resized.cols);
 
-        resized.convertTo(resized, CV_32FC3);
-        // Recognition uses (pixel/255 - 0.5) / 0.5 normalization (same as classifier)
-        auto normalizedData = NormalizeAndPermuteCls(resized);
+        std::vector<float> inputData = PreprocessCropToBucket(croppedImage, imgH_, bucketW);
 
-        // Pad to at least kRecImgW width (matching official PaddleOCR behavior)
-        // Official PaddleOCR: padding_im = np.zeros((C, H, W)), then copies normalized
-        // image into left portion. Padding value = 0.0 in normalized space.
-        int imgW = std::max(resizedW, kRecImgW);
-
-        std::vector<float> inputData;
-        if (imgW > resizedW) {
-            // Zero-pad on the right (CHW layout)
-            inputData.resize(3 * imgH_ * imgW, 0.0f);
-            for (int c = 0; c < 3; c++) {
-                for (int y = 0; y < imgH_; y++) {
-                    std::memcpy(
-                        &inputData[c * imgH_ * imgW + y * imgW],
-                        &normalizedData[c * imgH_ * resizedW + y * resizedW],
-                        resizedW * sizeof(float));
-                }
-            }
-        } else {
-            inputData = std::move(normalizedData);
-        }
-
-        // Create input tensor with (possibly padded) width
-        std::array<int64_t, 4> inputShape = { 1, 3, imgH_, imgW };
+        std::array<int64_t, 4> inputShape = { 1, 3, imgH_, bucketW };
         Ort::Value inputTensor = Ort::Value::CreateTensor<float>(
             *memory_info_handler, inputData.data(), inputData.size(),
             inputShape.data(), inputShape.size());
 
-        // Run inference
         auto outputTensors = ort_session->Run(
             Ort::RunOptions{ nullptr },
             input_node_names.data(), &inputTensor, 1,
             output_node_names.data(), num_outputs);
 
-        // Get output
         float* outputData = outputTensors[0].GetTensorMutableData<float>();
         auto outputShape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape();
 
-        int seqLen = static_cast<int>(outputShape[1]);
+        int seqLen     = static_cast<int>(outputShape[1]);
         int numClasses = static_cast<int>(outputShape[2]);
 
         return CTCDecode(outputData, seqLen, numClasses);
@@ -110,18 +140,162 @@ TextLine ONNXOCRRecognizer::Recognize(const cv::Mat& croppedImage) {
     }
 }
 
-std::vector<TextLine> ONNXOCRRecognizer::RecognizeBatch(const std::vector<cv::Mat>& croppedImages) {
-    std::vector<TextLine> results;
-    results.reserve(croppedImages.size());
+// Runs every crop in `crops` as one [N,3,imgH_,bucketW] inference and stores
+// each decoded line at out[origIndices[i]]; per-crop fallback on ORT failure.
+void ONNXOCRRecognizer::RunBatchAtWidth(const std::vector<cv::Mat>& crops,
+                                        const std::vector<size_t>& origIndices,
+                                        int bucketW,
+                                        std::vector<TextLine>& out) {
+    if (crops.empty()) return;
 
-    // Process one at a time (dynamic width per image)
-    for (size_t i = 0; i < croppedImages.size(); i++) {
-        results.push_back(Recognize(croppedImages[i]));
+    try {
+        const size_t batchN = crops.size();
+        const size_t perImage = static_cast<size_t>(3) * imgH_ * bucketW;
+
+        // Stack N preprocessed crops into one [N,3,H,W] buffer
+        std::vector<float> batchInput(batchN * perImage, 0.0f);
+        for (size_t i = 0; i < batchN; ++i) {
+            auto img = PreprocessCropToBucket(crops[i], imgH_, bucketW);
+            std::memcpy(&batchInput[i * perImage], img.data(),
+                        perImage * sizeof(float));
+        }
+
+        std::array<int64_t, 4> inputShape = {
+            static_cast<int64_t>(batchN), 3,
+            static_cast<int64_t>(imgH_), static_cast<int64_t>(bucketW)
+        };
+        Ort::Value inputTensor = Ort::Value::CreateTensor<float>(
+            *memory_info_handler, batchInput.data(), batchInput.size(),
+            inputShape.data(), inputShape.size());
+
+        auto outputTensors = ort_session->Run(
+            Ort::RunOptions{ nullptr },
+            input_node_names.data(), &inputTensor, 1,
+            output_node_names.data(), num_outputs);
+
+        float* outputData = outputTensors[0].GetTensorMutableData<float>();
+        auto outputShape  = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape();
+
+        // Expected output: [N, seqLen, numClasses]
+        if (outputShape.size() < 3) {
+            std::cerr << "[ONNXOCRRecognizer] Unexpected batch output rank: "
+                      << outputShape.size() << std::endl;
+            return;
+        }
+        const int outBatch   = static_cast<int>(outputShape[0]);
+        const int seqLen     = static_cast<int>(outputShape[1]);
+        const int numClasses = static_cast<int>(outputShape[2]);
+        const size_t perRow  = static_cast<size_t>(seqLen) * numClasses;
+
+        for (int i = 0; i < outBatch && i < static_cast<int>(batchN); ++i) {
+            TextLine tl = CTCDecode(outputData + i * perRow, seqLen, numClasses);
+            out[origIndices[i]] = std::move(tl);
+        }
+    }
+    catch (const Ort::Exception& e) {
+        // ORT will throw if the model doesn't support a batch dimension > 1.
+        // Fall back to per-image inference for this group.
+        std::cerr << "[ONNXOCRRecognizer] Batch inference failed at bucketW="
+                  << bucketW << " (" << e.what()
+                  << ") — falling back to single-image path." << std::endl;
+        for (size_t i = 0; i < crops.size(); ++i) {
+            // We already hold _mutex (taken in RecognizeBatch), so the single-image
+            // steps are repeated inline instead of re-entering Recognize().
+            try {
+                cv::Mat resized = ResizeRecImage(crops[i], imgH_, imgMaxW_);
+                int singleBucket = RoundUpToBucket(resized.cols);
+                auto inputData = PreprocessCropToBucket(crops[i], imgH_, singleBucket);
+                std::array<int64_t, 4> inputShape = { 1, 3, imgH_, singleBucket };
+                Ort::Value inputTensor = Ort::Value::CreateTensor<float>(
+                    *memory_info_handler, inputData.data(), inputData.size(),
+                    inputShape.data(), inputShape.size());
+                auto outputTensors = ort_session->Run(
+                    Ort::RunOptions{ nullptr },
+                    input_node_names.data(), &inputTensor, 1,
+                    output_node_names.data(), num_outputs);
+                float* outData = outputTensors[0].GetTensorMutableData<float>();
+                auto outShape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape();
+                if (outShape.size() < 3) { out[origIndices[i]] = {}; continue; }  // same rank guard as the batched path
+                out[origIndices[i]] = CTCDecode(outData, static_cast<int>(outShape[1]),
+                                                static_cast<int>(outShape[2]));
+            } catch (const Ort::Exception& e2) {
+                std::cerr << "[ONNXOCRRecognizer] Single-image fallback also failed: "
+                          << e2.what() << std::endl;
+                out[origIndices[i]] = {};
+            }
+        }
+    }
+}
+
+std::vector<TextLine> ONNXOCRRecognizer::RecognizeBatch(const std::vector<cv::Mat>& croppedImages) {
+    std::lock_guard<std::mutex> lock(_mutex);  // held until the function returns
+
+    std::vector<TextLine> results(croppedImages.size());  // one default TextLine per input
+    if (!ort_session || croppedImages.empty() || keys_.empty()) {
+        return results;
+    }
+
+    // Group crops by target bucket width, remembering each crop's input index
+    std::vector<std::vector<cv::Mat>> groupCrops(kRecNumBuckets);
+    std::vector<std::vector<size_t>>  groupIdx(kRecNumBuckets);
+
+    for (size_t i = 0; i < croppedImages.size(); ++i) {
+        if (croppedImages[i].empty()) continue;  // empty crops keep their default TextLine
+        cv::Mat resized = ResizeRecImage(croppedImages[i], imgH_, imgMaxW_);
+        const int bw = RoundUpToBucket(resized.cols);
+        // Map the width to its bucket index; unmatched widths use the last bucket
+        int bucketIdx = kRecNumBuckets - 1;
+        for (int b = 0; b < kRecNumBuckets; ++b) {
+            if (kRecBucketWidths[b] == bw) { bucketIdx = b; break; }
+        }
+        groupCrops[bucketIdx].push_back(croppedImages[i]);
+        groupIdx[bucketIdx].push_back(i);
+    }
+
+    // Run one batched inference per non-empty bucket; results land at the saved indices
+    for (int b = 0; b < kRecNumBuckets; ++b) {
+        if (groupCrops[b].empty()) continue;
+        RunBatchAtWidth(groupCrops[b], groupIdx[b], kRecBucketWidths[b], results);
     }
 
     return results;
 }
 
+void ONNXOCRRecognizer::Warmup() {
+    std::lock_guard<std::mutex> guard(_mutex);
+    if (!ort_session || _warmedUp) return;
+
+    // Mid-grey 3-channel canvas, twice imgH_ tall and twice the widest bucket wide.
+    const int widestBucket = kRecBucketWidths[kRecNumBuckets - 1];
+    const cv::Mat canvas(imgH_ * 2, widestBucket * 2, CV_8UC3, cv::Scalar(128, 128, 128));
+
+    for (const int bucketW : kRecBucketWidths) {
+        // A failure at one width is logged and the remaining widths still run.
+        try {
+            auto chw = PreprocessCropToBucket(canvas, imgH_, bucketW);
+            std::array<int64_t, 4> dims = { 1, 3, imgH_, bucketW };
+            Ort::Value tensor = Ort::Value::CreateTensor<float>(
+                *memory_info_handler, chw.data(), chw.size(),
+                dims.data(), dims.size());
+
+            const auto started = std::chrono::high_resolution_clock::now();
+            (void)ort_session->Run(
+                Ort::RunOptions{ nullptr },
+                input_node_names.data(), &tensor, 1,
+                output_node_names.data(), num_outputs);
+            const std::chrono::duration<double, std::milli> elapsed =
+                std::chrono::high_resolution_clock::now() - started;
+            std::cout << "[ONNXOCRRecognizer] Warmup bucketW=" << bucketW
+                      << "  " << elapsed.count() << " ms" << std::endl;
+        }
+        catch (const Ort::Exception& e) {
+            std::cerr << "[ONNXOCRRecognizer] Warmup failed at bucketW="
+                      << bucketW << ": " << e.what() << std::endl;
+        }
+    }
+    _warmedUp = true;  // set regardless of per-bucket failures
+}
+
 TextLine ONNXOCRRecognizer::CTCDecode(const float* outputData, int seqLen, int numClasses) {
     TextLine result;
     std::string text;
diff --git a/modules/ANSOCR/ANSONNXOCR/ONNXOCRRecognizer.h b/modules/ANSOCR/ANSONNXOCR/ONNXOCRRecognizer.h
index a8292f2..5b3159a 100644
--- a/modules/ANSOCR/ANSONNXOCR/ONNXOCRRecognizer.h
+++ b/modules/ANSOCR/ANSONNXOCR/ONNXOCRRecognizer.h
@@ -12,6 +12,9 @@ namespace onnxocr {
 class ONNXOCRRecognizer : public BasicOrtHandler {
 public:
     explicit ONNXOCRRecognizer(const std::string& onnx_path, unsigned int num_threads = 1);
+    explicit ONNXOCRRecognizer(const std::string& onnx_path,
+                               const OrtHandlerOptions& options,
+                               unsigned int num_threads = 1);
     ~ONNXOCRRecognizer() override = default;
 
     // Load character dictionary (must be called before Recognize)
@@ -20,13 +23,31 @@ public:
     // Recognize text from a single cropped text image
     TextLine Recognize(const cv::Mat& croppedImage);
 
-    // Batch recognition for multiple cropped images
+    // Batch recognition for multiple cropped images.
+    // Crops are grouped into a small set of fixed width buckets and
+    // submitted to ORT as [N,3,imgH_,bucketW] tensors so cuDNN sees
+    // shape-stable inputs and can reuse algorithms across calls.
     std::vector<TextLine> RecognizeBatch(const std::vector<cv::Mat>& croppedImages);
 
+    // Pre-warm cuDNN/TRT for every bucket width by running dummy
+    // inferences. Idempotent — no-op if already warmed up.
+    void Warmup();
+
 private:
     Ort::Value transform(const cv::Mat& mat) override;
     Ort::Value transformBatch(const std::vector<cv::Mat>& images) override;
 
+    // Round resizedW up to the next bucket width (capped at imgMaxW_).
+    // Used by both Recognize() and RecognizeBatch() so cuDNN only ever
+    // sees a small finite set of input shapes.
+    int RoundUpToBucket(int resizedW) const;
+
+    // Run a single [N,3,imgH_,bucketW] inference and CTC-decode each row.
+    void RunBatchAtWidth(const std::vector<cv::Mat>& crops,
+                         const std::vector<size_t>& origIndices,
+                         int bucketW,
+                         std::vector<TextLine>& out);
+
     // CTC greedy decode
     TextLine CTCDecode(const float* outputData, int seqLen, int numClasses);
 
@@ -34,6 +55,7 @@ private:
     int imgH_    = kRecImgH;
     int imgMaxW_ = kRecImgMaxW;
     std::mutex _mutex;
+    bool _warmedUp = false;
 };
 
 } // namespace onnxocr
diff --git a/modules/ANSOCR/ANSONNXOCR/ONNXOCRTypes.h b/modules/ANSOCR/ANSONNXOCR/ONNXOCRTypes.h
index fe4856b..5f07f2c 100644
--- a/modules/ANSOCR/ANSONNXOCR/ONNXOCRTypes.h
+++ b/modules/ANSOCR/ANSONNXOCR/ONNXOCRTypes.h
@@ -88,11 +88,22 @@ inline std::vector<std::string> LoadDict(const std::string& dictPath) {
     return keys;
 }
 
-// Compute resize dimensions for detection model (multiples of 32)
+// Compute resize dimensions for detection model.
 // limit_type='max': scale down if max side > maxSideLen (PP-OCRv5 server default)
 // maxSideLimit: safety cap on final max dimension (default 4000)
+//
+// Each dimension is rounded UP to a multiple of kDetSizeBucket (96). The
+// coarse granularity is deliberate: cuDNN HEURISTIC has to re-select
+// convolution algorithms every time it sees a new input shape, and that
+// selection costs ~100 ms per shape. With multiples of 32, a typical ALPR
+// run produces 30+ unique detector shapes; with multiples of 96 that drops
+// to 5–10, which cuDNN can cache and reuse for the rest of the video.
+// 96 is divisible by the DBNet down-stride of 32, so feature-map sizes
+// stay integer.
 inline cv::Size ComputeDetResizeShape(int srcH, int srcW, int maxSideLen,
                                        int maxSideLimit = kDetMaxSideLimit) {
+    constexpr int kDetSizeBucket = 96;
+
     float ratio = 1.0f;
     int maxSide = std::max(srcH, srcW);
     if (maxSide > maxSideLen) {
@@ -108,8 +119,12 @@ inline cv::Size ComputeDetResizeShape(int srcH, int srcW, int maxSideLen,
         newW = static_cast<int>(newW * clampRatio);
     }
 
-    newH = std::max(32, static_cast<int>(std::round(newH / 32.0) * 32));
-    newW = std::max(32, static_cast<int>(std::round(newW / 32.0) * 32));
+    auto roundUpToBucket = [](int x) {
+        return std::max(kDetSizeBucket,
+                        ((x + kDetSizeBucket - 1) / kDetSizeBucket) * kDetSizeBucket);
+    };
+    newH = roundUpToBucket(newH);
+    newW = roundUpToBucket(newW);
     return cv::Size(newW, newH);
 }
 
diff --git a/modules/ANSOCR/ANSONNXOCR/PaddleOCRV5Engine.cpp b/modules/ANSOCR/ANSONNXOCR/PaddleOCRV5Engine.cpp
index 9216154..79fb541 100644
--- a/modules/ANSOCR/ANSONNXOCR/PaddleOCRV5Engine.cpp
+++ b/modules/ANSOCR/ANSONNXOCR/PaddleOCRV5Engine.cpp
@@ -11,13 +11,75 @@ namespace onnxocr {
 bool PaddleOCRV5Engine::Initialize(const std::string& detModelPath,
                                     const std::string& clsModelPath,
                                     const std::string& recModelPath,
-                                    const std::string& dictPath) {
+                                    const std::string& dictPath,
+                                    bool preferTensorRT) {
     std::lock_guard<std::recursive_mutex> lock(_mutex);
     ModelLoadingGuard mlg(_modelLoading);
 
+    // High-perf options.  The OCR sub-models split into two groups:
+    //
+    //   1. Detector — its input shape varies continuously with every
+    //      plate-ROI aspect ratio.  TRT EP is a poor fit because it
+    //      builds a fresh engine for each unique shape (minutes each).
+    //      We keep it on CUDA EP with a conservative cuDNN workspace
+    //      (see below) and let cuDNN HEURISTIC handle per-shape algo selection.
+    //
+    //   2. Classifier + Recognizer — fixed-bucket shapes (cls is
+    //      [1,3,80,160], rec is [1,3,48,{320,480,640,960}]).  These
+    //      benefit massively from TRT EP because the engine is built
+    //      once per shape and reused forever.
+    OrtHandlerOptions detectorOpts;
+    // Detector uses CUDA EP with *conservative* cuDNN workspace.
+    // Empirical: on VRAM-constrained GPUs (LPD TRT engine + rec TRT
+    // engine + ORT arena in play) the max-workspace mode causes cuDNN
+    // to pick Winograd/implicit-precomp-GEMM variants that silently
+    // fall back to slow NO-WORKSPACE algorithms when the big workspace
+    // can't be allocated. With "0" cuDNN picks algorithms that are
+    // known to fit and runs ~10x faster in practice.
+    detectorOpts.useMaxCudnnWorkspace = false;
+    detectorOpts.preferTensorRT       = false;   // never TRT for the detector
+
+    // Classifier (fixed [1,3,80,160]): TRT with no profile is fine.
+    OrtHandlerOptions classifierOpts;
+    classifierOpts.useMaxCudnnWorkspace = true;
+    classifierOpts.preferTensorRT       = preferTensorRT;
+    classifierOpts.trtFP16              = true;
+
+    // Recognizer: needs a DYNAMIC profile so one TRT engine covers every
+    // (batch, bucket_width) pair we generate at runtime. Without this,
+    // each new shape triggers a ~80s engine rebuild mid-stream when a
+    // new plate appears or the plate count changes.
+    //
+    // Profile range:
+    //   batch  : 1 .. 16       (16 plates worth of crops is generous)
+    //   H      : 48 (fixed)
+    //   W      : 320 .. 960    (covers all 4 recognizer buckets)
+    //
+    // Query the actual input name from the .onnx file instead of
+    // hardcoding — PaddleOCR usually exports it as "x" but the name can
+    // vary across model versions.
+    OrtHandlerOptions recognizerOpts;
+    recognizerOpts.useMaxCudnnWorkspace = true;
+    recognizerOpts.preferTensorRT       = preferTensorRT;
+    recognizerOpts.trtFP16              = true;
+    if (preferTensorRT) {
+        std::string recInputName = BasicOrtHandler::QueryModelInputName(recModelPath);
+        if (recInputName.empty()) {
+            std::cerr << "[PaddleOCRV5Engine] Could not query recognizer "
+                         "input name — defaulting to 'x'" << std::endl;
+            recInputName = "x";
+        }
+        std::cout << "[PaddleOCRV5Engine] Recognizer input name: '"
+                  << recInputName << "' — building TRT dynamic profile "
+                  << "[batch=1..16, W=320..960]" << std::endl;
+        recognizerOpts.trtProfileMinShapes = recInputName + ":1x3x48x320";
+        recognizerOpts.trtProfileOptShapes = recInputName + ":4x3x48x480";
+        recognizerOpts.trtProfileMaxShapes = recInputName + ":16x3x48x960";
+    }
+
     try {
         // Initialize detector (also triggers EPLoader init in BasicOrtHandler)
-        detector_ = std::make_unique<ONNXOCRDetector>(detModelPath);
+        detector_ = std::make_unique<ONNXOCRDetector>(detModelPath, detectorOpts);
         std::cout << "[PaddleOCRV5Engine] Detector initialized: " << detModelPath << std::endl;
 
         // Ensure this DLL's copy of Ort::Global<void>::api_ is initialized.
@@ -29,7 +91,7 @@ bool PaddleOCRV5Engine::Initialize(const std::string& detModelPath,
 
         // Initialize classifier (optional)
         if (!clsModelPath.empty()) {
-            classifier_ = std::make_unique<ONNXOCRClassifier>(clsModelPath);
+            classifier_ = std::make_unique<ONNXOCRClassifier>(clsModelPath, classifierOpts);
             std::cout << "[PaddleOCRV5Engine] Classifier initialized: " << clsModelPath << std::endl;
         }
         else {
@@ -38,13 +100,26 @@ bool PaddleOCRV5Engine::Initialize(const std::string& detModelPath,
         }
 
         // Initialize recognizer
-        recognizer_ = std::make_unique<ONNXOCRRecognizer>(recModelPath);
+        recognizer_ = std::make_unique<ONNXOCRRecognizer>(recModelPath, recognizerOpts);
         if (!recognizer_->LoadDictionary(dictPath)) {
             std::cerr << "[PaddleOCRV5Engine] Failed to load dictionary" << std::endl;
             return false;
         }
         std::cout << "[PaddleOCRV5Engine] Recognizer initialized: " << recModelPath << std::endl;
 
+        // Pre-warm classifier (fixed [1,3,80,160]) and recognizer (4
+        // bucket widths) so the first frame doesn't pay the cuDNN/TRT
+        // algorithm-selection tax. The detector is intentionally NOT
+        // warmed up: its input shape varies continuously with each
+        // plate-ROI aspect ratio, so a warmup at any single canonical
+        // shape would cost minutes (TRT) or be useless (CUDA cache miss
+        // on the real frame anyway). Real frames will pay the per-shape
+        // cuDNN HEURISTIC cost on first use.
+        std::cout << "[PaddleOCRV5Engine] Warming up OCR pipeline..." << std::endl;
+        if (classifier_) classifier_->Warmup();
+        if (recognizer_) recognizer_->Warmup();
+        std::cout << "[PaddleOCRV5Engine] Warmup complete" << std::endl;
+
         _initialized = true;
         std::cout << "[PaddleOCRV5Engine] Pipeline initialized successfully" << std::endl;
         return true;
@@ -140,5 +215,18 @@ TextLine PaddleOCRV5Engine::recognizeOnly(const cv::Mat& croppedImage) {
     return recognizer_->Recognize(croppedImage);
 }
 
+std::vector<TextLine> PaddleOCRV5Engine::recognizeMany(const std::vector<cv::Mat>& croppedImages) {
+    // One default-constructed TextLine per input is the "no result" answer.
+    const auto noResults = [&croppedImages] { return std::vector<TextLine>(croppedImages.size()); };
+    if (_modelLoading.load()) return noResults();
+    {
+        auto guard = TryLockWithTimeout("PaddleOCRV5Engine::recognizeMany");
+        const bool ready = guard.owns_lock() && _initialized && recognizer_ && !croppedImages.empty();
+        if (!ready) return noResults();
+    }
+    // `guard` is out of scope here; RecognizeBatch takes the recognizer's own mutex.
+    return recognizer_->RecognizeBatch(croppedImages);
+}
+
 } // namespace onnxocr
 } // namespace ANSCENTER
diff --git a/modules/ANSOCR/ANSONNXOCR/PaddleOCRV5Engine.h b/modules/ANSOCR/ANSONNXOCR/PaddleOCRV5Engine.h
index a7ec21f..6ac906e 100644
--- a/modules/ANSOCR/ANSONNXOCR/PaddleOCRV5Engine.h
+++ b/modules/ANSOCR/ANSONNXOCR/PaddleOCRV5Engine.h
@@ -25,10 +25,13 @@ public:
 
     // Initialize the OCR pipeline
     // clsModelPath can be empty to skip classification
+    // preferTensorRT: try TensorRT EP first for the classifier and
+    //                 recognizer; the detector always stays on CUDA EP
     bool Initialize(const std::string& detModelPath,
                     const std::string& clsModelPath,
                     const std::string& recModelPath,
-                    const std::string& dictPath);
+                    const std::string& dictPath,
+                    bool preferTensorRT = false);
 
     // Run full OCR pipeline on an image
     // Returns results matching PaddleOCR::OCRPredictResult format
@@ -37,6 +40,14 @@ public:
     // Run recognizer only on a pre-cropped text image (no detection step)
     TextLine recognizeOnly(const cv::Mat& croppedImage);
 
+    // Run recognizer only on a batch of pre-cropped text images using
+    // batched ORT inference. Skips the detector entirely — the
+    // caller is expected to supply crops that are already roughly
+    // axis-aligned single-line text (e.g. ALPR plate ROIs, optionally
+    // pre-split into rows). Crops are grouped by bucket width and each
+    // non-empty bucket is one ORT Run, so a call typically issues 1–2 Runs.
+    std::vector<TextLine> recognizeMany(const std::vector<cv::Mat>& croppedImages);
+
     // Configuration setters (matching OCRModelConfig parameters)
     void SetDetMaxSideLen(int val)          { _maxSideLen = val; }
     void SetDetDbThresh(float val)          { _detDbThresh = val; }
diff --git a/modules/ANSOCR/ANSOnnxOCR.cpp b/modules/ANSOCR/ANSOnnxOCR.cpp
index 6ed1031..763aed6 100644
--- a/modules/ANSOCR/ANSOnnxOCR.cpp
+++ b/modules/ANSOCR/ANSOnnxOCR.cpp
@@ -50,7 +50,8 @@ bool ANSONNXOCR::Initialize(const std::string& licenseKey, OCRModelConfig modelC
 				_modelConfig.detectionModelFile,
 				clsModelPath,
 				_modelConfig.recognizerModelFile,
-				_modelConfig.recogizerCharDictionaryPath);
+				_modelConfig.recogizerCharDictionaryPath,
+				_modelConfig.useTensorRT);
 
 			return _isInitialized;
 		}
@@ -391,4 +392,16 @@ std::pair<std::string, float> ANSONNXOCR::RecognizeText(const cv::Mat& croppedIm
 	return {result.text, result.score};
 }
 
+std::vector<std::pair<std::string, float>> ANSONNXOCR::RecognizeTextBatch(
+	const std::vector<cv::Mat>& croppedImages) {
+	// Pre-fill with ("", 0) so the result always has one entry per input crop.
+	std::vector<std::pair<std::string, float>> results(croppedImages.size(), {"", 0.0f});
+	if (!(_isInitialized && _engine && !croppedImages.empty())) return results;
+	const auto recognized = _engine->recognizeMany(croppedImages);
+	for (size_t idx = 0; idx < recognized.size() && idx < results.size(); ++idx) {
+		results[idx] = { recognized[idx].text, recognized[idx].score };
+	}
+	return results;
+}
+
 } // namespace ANSCENTER
diff --git a/modules/ANSOCR/ANSOnnxOCR.h b/modules/ANSOCR/ANSOnnxOCR.h
index b6b1f17..902ff3a 100644
--- a/modules/ANSOCR/ANSOnnxOCR.h
+++ b/modules/ANSOCR/ANSOnnxOCR.h
@@ -24,6 +24,8 @@ namespace ANSCENTER {
 		std::vector<ANSCENTER::OCRObject> RunInference(const cv::Mat& input, const std::vector<cv::Rect>& Bbox, const std::string& cameraId) override;
 
 		std::pair<std::string, float> RecognizeText(const cv::Mat& croppedImage) override;
+		std::vector<std::pair<std::string, float>> RecognizeTextBatch(
+			const std::vector<cv::Mat>& croppedImages) override;
 		~ANSONNXOCR();
 		bool Destroy() override;
 
diff --git a/tests/ANSCV-UnitTest/ANSCV-UnitTest.cpp b/tests/ANSCV-UnitTest/ANSCV-UnitTest.cpp
index e0f0bbb..d91791b 100644
--- a/tests/ANSCV-UnitTest/ANSCV-UnitTest.cpp
+++ b/tests/ANSCV-UnitTest/ANSCV-UnitTest.cpp
@@ -1367,8 +1367,8 @@ int TestGetImage() {
 }
 int GenerateVideo() {
    std::string imageFolder = "E:\\Programs\\DemoAssets\\ImageSeries\\20260413_152604.321";
-   std::string outputVideoPath = "E:\\Programs\\DemoAssets\\ImageSeries\\output1.mp4";
-   int conversionResult = ANSCV_ImagesToMP4_S(imageFolder.c_str(), outputVideoPath.c_str(), 0, 5);
+   std::string outputVideoPath = "E:\\Programs\\DemoAssets\\ImageSeries\\output3.mp4";
+   int conversionResult = ANSCV_ImagesToMP4_S(imageFolder.c_str(), outputVideoPath.c_str(), 0,20);
    if (!conversionResult) {
        std::cerr << "Failed to convert images to MP4." << std::endl;
       return -1;
diff --git a/tests/ANSLPR-UnitTest/ANSLPR-UnitTest.cpp b/tests/ANSLPR-UnitTest/ANSLPR-UnitTest.cpp
index cee072b..fd4715f 100644
--- a/tests/ANSLPR-UnitTest/ANSLPR-UnitTest.cpp
+++ b/tests/ANSLPR-UnitTest/ANSLPR-UnitTest.cpp
@@ -3805,6 +3805,149 @@ int ALPR_OCR_Test() {
     return 0;
 }
 
+// Runs the ANSALPR_OCR engine over every frame of a hard-coded video file, prints
+// per-frame inference time and recognised plates, and shows the annotated frames
+// until the stream ends or ESC is pressed. Returns 0 on success, or -1 / -2 / -3
+// when handle creation, engine load or opening the video fails, respectively.
+int ALPR_OCR_VideoTest() {
+    std::cout << "=== ALPR_OCR_VideoTest: ANSALPR_OCR engine on video ===" << std::endl;
+    std::filesystem::path currentPath = std::filesystem::current_path();
+    std::cout << "Current working directory: " << currentPath << std::endl;
+
+    ANSCENTER::ANSALPR* infHandle = nullptr;
+    std::string licenseKey = "";
+    std::string modelFilePath = "C:\\Projects\\ANSVIS\\Models\\ANS_GenericALPR_v2.0.zip";
+    std::string videoFilePath = "E:\\Programs\\DemoAssets\\Videos\\ALRP\\PMH\\Day\\day.mp4";
+
+    int engineType = 2; // ANSALPR_OCR
+    double detectionThreshold = 0.3;
+    double ocrThreshold = 0.5;
+    double colourThreshold = 0.0;
+
+    // Step 1: Create handle
+    int createResult = CreateANSALPRHandle(&infHandle, licenseKey.c_str(),
+        modelFilePath.c_str(), "", engineType, detectionThreshold, ocrThreshold, colourThreshold);
+    std::cout << "CreateANSALPRHandle result: " << createResult << std::endl;
+    if (!createResult || !infHandle) {
+        std::cerr << "Failed to create ANSALPR_OCR handle" << std::endl;
+        return -1;
+    }
+
+    // Step 2: Set country (JAPAN = 5 — adjust to match the dataset if needed)
+    ANSALPR_SetCountry(&infHandle, 5);
+    std::cout << "Country set to JAPAN" << std::endl;
+
+    // Step 3: Load engine
+    auto engineStart = std::chrono::high_resolution_clock::now();
+    int loadResult = LoadANSALPREngineHandle(&infHandle);
+    auto engineEnd = std::chrono::high_resolution_clock::now();
+    double engineMs = std::chrono::duration<double, std::milli>(engineEnd - engineStart).count();
+    std::cout << "LoadANSALPREngineHandle result: " << loadResult << " (" << engineMs << " ms)" << std::endl;
+    if (!loadResult) {
+        std::cerr << "Failed to load ANSALPR_OCR engine" << std::endl;
+        ReleaseANSALPRHandle(&infHandle);
+        return -2;
+    }
+
+    // Step 4: Open video
+    cv::VideoCapture capture(videoFilePath);
+    if (!capture.isOpened()) {
+        std::cerr << "Could not open video file: " << videoFilePath << std::endl;
+        ReleaseANSALPRHandle(&infHandle);
+        return -3;
+    }
+
+    // Create the display window once, up front, rather than on every frame.
+    cv::namedWindow("ALPR_OCR_VideoTest", cv::WINDOW_AUTOSIZE);
+    int frameIdx = 0;
+
+    while (true) {
+        cv::Mat frame;
+        if (!capture.read(frame)) {
+            std::cout << "\nEnd of video stream.\n";
+            break;
+        }
+        ++frameIdx;
+
+        // Give the engine its own tightly packed copy of the frame: clone() always
+        // yields a continuous buffer and releases it automatically on any exit path.
+        const cv::Mat inferInput = frame.clone();
+
+        auto t0 = std::chrono::high_resolution_clock::now();
+        std::string detectionResult = ANSALPR_RunInferenceBinary(&infHandle, inferInput.data,
+            inferInput.cols, inferInput.rows);
+        auto t1 = std::chrono::high_resolution_clock::now();
+        double inferMs = std::chrono::duration<double, std::milli>(t1 - t0).count();
+
+        printf("Frame %d: %.2f ms (%.1f FPS)\n", frameIdx, inferMs,
+               inferMs > 0.0 ? (1000.0 / inferMs) : 0.0);
+
+        // Draw detections
+        if (!detectionResult.empty()) {
+            try {
+                boost::property_tree::ptree pt;
+                std::stringstream ss(detectionResult);
+                boost::property_tree::read_json(ss, pt);
+                BOOST_FOREACH(const boost::property_tree::ptree::value_type& child, pt.get_child("results")) {
+                    const boost::property_tree::ptree& res = child.second;
+                    const auto class_name_raw = GetData<std::string>(res, "class_name");
+                    const std::string class_name = DecodeUnicodeEscapes(class_name_raw);
+                    const auto x = GetData<int>(res, "x");
+                    const auto y = GetData<int>(res, "y");
+                    const auto w = GetData<int>(res, "width");
+                    const auto h = GetData<int>(res, "height");
+
+                    cv::rectangle(frame, cv::Rect(x, y, w, h), cv::Scalar(0, 255, 0), 2);
+
+                    std::string extraInfo = GetOptionalValue<std::string>(res, "extra_info", "");
+                    if (!class_name.empty()) {
+                        std::cout << "  Plate: " << class_name;
+                        if (!extraInfo.empty()) std::cout << "  (" << extraInfo << ")";
+                        std::cout << std::endl;
+                    }
+
+#ifdef WIN32
+                    {
+                        int textH = (int)(1.5 * 30);
+                        int ty = y - 5 - textH;
+                        if (ty < 0) ty = y + 3;
+                        putTextUnicode(frame, class_name, cv::Point(x, ty),
+                            1.5, cv::Scalar(0, 0, 255), 3);
+                    }
+#else
+                    cv::putText(frame, class_name, cv::Point(x, y - 5),
+                        cv::FONT_HERSHEY_SIMPLEX, 1.0, cv::Scalar(0, 0, 255), 2, cv::LINE_AA);
+#endif
+                }
+            }
+            catch (const std::exception& e) {
+                std::cerr << "JSON parse error: " << e.what() << std::endl;
+            }
+        }
+
+        // Display (fit to 1920x1080)
+        cv::Mat display;
+        double scale = std::min(1920.0 / frame.cols, 1080.0 / frame.rows);
+        if (scale < 1.0) {
+            cv::resize(frame, display, cv::Size(), scale, scale);
+        } else {
+            display = frame;
+        }
+        cv::imshow("ALPR_OCR_VideoTest", display);
+        if (cv::waitKey(1) == 27) { // ESC to exit
+            std::cout << "ESC pressed — stopping.\n";
+            break;
+        }
+    }
+
+    capture.release();
+    cv::destroyAllWindows();
+    ReleaseANSALPRHandle(&infHandle);
+
+    std::cout << "=== ALPR_OCR_VideoTest complete ===" << std::endl;
+    return 0;
+}
+
 int main()
 {
 #ifdef WIN32
@@ -3825,7 +3968,8 @@ int main()
     //ANSLPR_MultiGPU_StressTest_SimulatedCam();
    // ANSLPR_MultiGPU_StressTest_FilePlayer();
     //ANSLPR_OD_CPU_VideoTest();
-    ALPR_OCR_Test();
+    //ALPR_OCR_Test();
+    ALPR_OCR_VideoTest();
     return 0;
 
 }