diff --git a/engines/ONNXEngine/ONNXEngine.h b/engines/ONNXEngine/ONNXEngine.h index 44f7418..6e6cb27 100644 --- a/engines/ONNXEngine/ONNXEngine.h +++ b/engines/ONNXEngine/ONNXEngine.h @@ -253,23 +253,50 @@ namespace ANSCENTER { // cuDNN workspace. Default-constructed = identical to the legacy // behavior (CUDA EP only, minimal cuDNN workspace). // ==================================================================== + // ==================================================================== + // OrtHandlerOptions + // + // Per-session knobs for the ORT execution providers. Options are + // grouped by target backend. A field set for one backend is silently + // ignored by every other backend — e.g. `trtProfileMinShapes` only + // affects TensorRT EP (NVIDIA); DirectML and OpenVINO don't read it. + // + // When adding a new backend optimization: + // - put the new field in the correct backend section below + // - NEVER reuse an NVIDIA field for AMD/Intel tuning + // - update the matching Build*OcrOptions() helper in + // PaddleOCRV5Engine.cpp to populate it + // + // The NVIDIA section is considered locked — it's been tuned end-to-end + // for the ANSALPR pipeline and should not change unless fixing a + // specific NVIDIA-observable regression. + // ==================================================================== struct OrtHandlerOptions { - // Try to attach TensorRT EP before CUDA EP (NVIDIA only). - // Falls back to CUDA EP automatically if TRT EP creation or session - // creation fails. Engines are cached on disk for fast reload. + // ---------------------------------------------------------------- + // NVIDIA (CUDA EP + TensorRT EP) — LOCKED + // + // These fields only have effect when the resolved execution + // provider is CUDA EP or TensorRT EP. DirectML (AMD), OpenVINO + // (Intel), and CPU EP silently ignore every field below. Do not + // repurpose them for other backends. 
+ // ---------------------------------------------------------------- + + // Try to attach TensorRT EP before CUDA EP. Falls back to CUDA EP + // automatically if TRT EP creation or session creation fails. + // Engines are cached on disk for fast reload. bool preferTensorRT = false; - // Use the largest cuDNN conv workspace. cuDNN can then pick fast + // Use the largest cuDNN conv workspace. cuDNN can then pick fast // algorithms (Winograd, implicit-precomp-GEMM with big workspaces). // Defaults off because some deployments share VRAM with TRT engines // and need the minimal-workspace mode to avoid OOM. bool useMaxCudnnWorkspace = false; - // Where to cache built TRT engines. Empty → default - // %TEMP%/ANSCENTER/TRTEngineCache. Only used when preferTensorRT. + // Where to cache built TRT engines. Empty → default + // %TEMP%/ANSCENTER/TRTEngineCache. Only used when preferTensorRT. std::string trtEngineCacheDir; - // FP16 builds for TRT EP. Recommended for inference; ignored if + // FP16 builds for TRT EP. Recommended for inference; ignored if // preferTensorRT is false. bool trtFP16 = true; @@ -286,6 +313,28 @@ namespace ANSCENTER { std::string trtProfileMinShapes; std::string trtProfileOptShapes; std::string trtProfileMaxShapes; + + // ---------------------------------------------------------------- + // Intel (OpenVINO EP) — OPEN FOR OPTIMIZATION + // + // Currently unused. Future Intel-specific tuning (cache_dir for + // kernel cache, explicit device selection, INT8 routing, etc.) + // should add fields here and wire them through the OpenVINO + // branch of initialize_handler(). Do NOT put Intel logic inside + // TryAppendCUDA or TryAppendTensorRT. + // ---------------------------------------------------------------- + // (Intel fields go here — none yet) + + // ---------------------------------------------------------------- + // AMD (DirectML EP / MIGraphX EP) — OPEN FOR OPTIMIZATION + // + // Currently unused. 
Future AMD-specific tuning (graph optimization + // gate for RDNA3+, MIGraphX cache dir on Linux, etc.) should add + // fields here and wire them through the DirectML branch of + // initialize_handler(). Do NOT put AMD logic inside TryAppendCUDA + // or TryAppendTensorRT. + // ---------------------------------------------------------------- + // (AMD fields go here — none yet) }; // ==================================================================== diff --git a/modules/ANSLPR/ANSLPR_OCR.cpp b/modules/ANSLPR/ANSLPR_OCR.cpp index a2f8383..2a73222 100644 --- a/modules/ANSLPR/ANSLPR_OCR.cpp +++ b/modules/ANSLPR/ANSLPR_OCR.cpp @@ -547,6 +547,181 @@ namespace ANSCENTER return colour; } + // ── Full-frame vs pipeline auto-detection ──────────────────────────── + // Mirror of ANSALPR_OD::shouldUseALPRChecker. The auto-detection logic + // watches whether consecutive frames from a given camera have the exact + // same (width, height). Pre-cropped pipeline inputs vary by a few + // pixels per crop, so the exact-match check fails and we return false. + // Real video frames keep identical dimensions across frames, so after a few + // consistent frames we flip into FULL-FRAME mode and start running the + // ALPRChecker voting + ensureUniquePlateText dedup. + bool ANSALPR_OCR::shouldUseALPRChecker(const cv::Size& imageSize, + const std::string& cameraId) { + // Force disabled via SetALPRCheckerEnabled(false) → never use. + if (!_enableALPRChecker) return false; + + // Narrow images (width < MIN_FULLFRAME_WIDTH) are treated as pipeline crops — skip auto-detection. 
+ if (imageSize.width < ImageSizeTracker::MIN_FULLFRAME_WIDTH) return false; + + auto& tracker = _imageSizeTrackers[cameraId]; + bool wasFullFrame = tracker.detectedFullFrame; + if (imageSize == tracker.lastSize) { + tracker.consistentCount++; + if (tracker.consistentCount >= ImageSizeTracker::CONFIRM_THRESHOLD) { + tracker.detectedFullFrame = true; + } + } else { + tracker.lastSize = imageSize; + tracker.consistentCount = 1; + tracker.detectedFullFrame = false; + } + if (tracker.detectedFullFrame != wasFullFrame) { + ANS_DBG("ALPR_OCR_Checker", + "cam=%s mode auto-detected: %s (img=%dx%d consistent=%d)", + cameraId.c_str(), + tracker.detectedFullFrame ? "FULL-FRAME (tracker ON)" : "PIPELINE (tracker OFF)", + imageSize.width, imageSize.height, tracker.consistentCount); + } + return tracker.detectedFullFrame; + } + + // ── Spatial plate dedup with accumulated scoring ───────────────────── + // Mirror of ANSALPR_OD::ensureUniquePlateText. When more than one + // detection in the same frame ends up with the same plate text (e.g. + // tracker occlusion or two cars in a single frame reading the same + // string), we resolve the ambiguity by accumulating confidence per + // spatial location across frames. The location with the higher running + // score keeps the plate text; the loser has its className cleared and + // is dropped from the output. + void ANSALPR_OCR::ensureUniquePlateText(std::vector& results, + const std::string& cameraId) { + std::lock_guard plateLock(_plateIdentitiesMutex); + auto& identities = _plateIdentities[cameraId]; + + // Auto-detect mode by detection count. + // 0-1 detections → pipeline/single-crop mode → no dedup needed. + // 2+ detections → full-frame mode → apply accumulated scoring. 
+ if (results.size() <= 1) { + // Still age out stale spatial identities from previous full-frame calls + if (!identities.empty()) { + constexpr int MAX_UNSEEN_FRAMES = 30; + for (auto& id : identities) id.framesSinceLastSeen++; + for (auto it = identities.begin(); it != identities.end(); ) { + if (it->framesSinceLastSeen > MAX_UNSEEN_FRAMES) { + it = identities.erase(it); + } else { + ++it; + } + } + } + return; + } + + // Helper: IoU between two rects. + auto computeIoU = [](const cv::Rect& a, const cv::Rect& b) -> float { + int x1 = std::max(a.x, b.x); + int y1 = std::max(a.y, b.y); + int x2 = std::min(a.x + a.width, b.x + b.width); + int y2 = std::min(a.y + a.height, b.y + b.height); + if (x2 <= x1 || y2 <= y1) return 0.0f; + float intersection = static_cast((x2 - x1) * (y2 - y1)); + float unionArea = static_cast(a.area() + b.area()) - intersection; + return (unionArea > 0.0f) ? intersection / unionArea : 0.0f; + }; + + // Helper: find a same-text identity whose stored center, framed at the query box's size, overlaps the box. 
+ auto findSpatialMatch = [&](const cv::Rect& box, + const std::string& plateText) -> SpatialPlateIdentity* { + for (auto& id : identities) { + if (id.plateText == plateText) { + cv::Rect storedRect( + static_cast(id.center.x - box.width * 0.5f), + static_cast(id.center.y - box.height * 0.5f), + box.width, box.height); + if (computeIoU(box, storedRect) > PLATE_SPATIAL_MATCH_THRESHOLD) { + return &id; + } + } + } + return nullptr; + }; + + // Step 1: Build map of plateText → candidate indices + std::unordered_map> plateCandidates; + for (size_t i = 0; i < results.size(); ++i) { + if (results[i].className.empty()) continue; + plateCandidates[results[i].className].push_back(i); + } + + // Step 2: Resolve duplicates using spatial accumulated scores + for (auto& [plateText, indices] : plateCandidates) { + if (indices.size() <= 1) continue; + + size_t winner = indices[0]; + float bestScore = 0.0f; + + for (size_t idx : indices) { + float score = results[idx].confidence; + auto* match = findSpatialMatch(results[idx].box, plateText); + if (match) { + score = match->accumulatedScore + results[idx].confidence; + } + if (score > bestScore) { + bestScore = score; + winner = idx; + } + } + + for (size_t idx : indices) { + if (idx != winner) { + results[idx].className.clear(); + } + } + } + + // Step 3: Update spatial identities — surviving detections accumulate, unseen identities decay + constexpr float DECAY_FACTOR = 0.8f; + constexpr float MIN_SCORE = 0.1f; + constexpr int MAX_UNSEEN_FRAMES = 30; + + for (auto& id : identities) id.framesSinceLastSeen++; + + for (auto& r : results) { + if (r.className.empty()) continue; + + cv::Point2f center( + r.box.x + r.box.width * 0.5f, + r.box.y + r.box.height * 0.5f); + + auto* match = findSpatialMatch(r.box, r.className); + if (match) { + match->accumulatedScore += r.confidence; + match->center = center; + match->framesSinceLastSeen = 0; + } else { + identities.push_back({ center, r.className, r.confidence, 0 }); + } + } + + // Decay unseen 
identities and 
remove stale ones + for (auto it = identities.begin(); it != identities.end(); ) { + if (it->framesSinceLastSeen > 0) { + it->accumulatedScore *= DECAY_FACTOR; + } + if (it->accumulatedScore < MIN_SCORE || it->framesSinceLastSeen > MAX_UNSEEN_FRAMES) { + it = identities.erase(it); + } else { + ++it; + } + } + + // Step 4: Remove entries with cleared plate text + results.erase( + std::remove_if(results.begin(), results.end(), + [](const Object& o) { return o.className.empty(); }), + results.end()); + } + // ── OCR on a single plate ROI ──────────────────────────────────────── // Returns the plate text via the out-parameter and populates alprExtraInfo // with the structured ALPR JSON (zone parts) when ALPR mode is active. @@ -712,6 +887,13 @@ namespace ANSCENTER std::vector output; output.reserve(plateInfos.size()); + // Decide once per frame whether the tracker-based correction + // layer should run. We auto-detect full-frame vs pipeline mode + // by watching for same-sized consecutive frames, exactly + // the same way ANSALPR_OD does it. + const bool useChecker = shouldUseALPRChecker( + cv::Size(frameWidth, frameHeight), cameraId); + for (const auto& info : plateInfos) { std::string combinedText; for (size_t cropIdx : info.cropIndices) { @@ -726,8 +908,9 @@ namespace ANSCENTER Object lprObject = lprOutput[info.origIndex]; lprObject.cameraId = cameraId; - // Cross-frame stabilization (unchanged) - if (_enableALPRChecker) { + // Cross-frame stabilization: per-track majority vote in + // full-frame mode, raw OCR text in pipeline mode. + if (useChecker) { lprObject.className = alprChecker.checkPlateByTrackId( cameraId, combinedText, lprObject.trackId); } @@ -747,6 +930,14 @@ namespace ANSCENTER output.push_back(std::move(lprObject)); } + // Spatial dedup: if two detections in the same frame ended up + // with the same plate text, keep only the one whose spatial + // history has the higher accumulated confidence. 
Skip this in + // pipeline mode because there's only ever one plate per call. + if (useChecker) { + ensureUniquePlateText(output, cameraId); + } + return output; } catch (const cv::Exception& e) { diff --git a/modules/ANSLPR/ANSLPR_OCR.h b/modules/ANSLPR/ANSLPR_OCR.h index fa53597..f1e8f49 100644 --- a/modules/ANSLPR/ANSLPR_OCR.h +++ b/modules/ANSLPR/ANSLPR_OCR.h @@ -6,6 +6,7 @@ #include #include #include +#include #include #include @@ -45,6 +46,66 @@ namespace ANSCENTER ALPRChecker alprChecker; + // ---------------------------------------------------------------- + // Full-frame vs pipeline auto-detection (ported from ANSALPR_OD) + // + // When the caller feeds ANSLPR_OCR pre-cropped vehicle ROIs (each + // frame is a different small image), the tracker can't work — the + // LP detector sees a totally new image every call so trackIds mean + // nothing. In that "pipeline" mode we must skip the ALPRChecker + // voting layer entirely and return raw OCR results. + // + // When the caller feeds full-frame video (same resolution every + // frame, plates moving through the scene), the tracker works + // normally and we run plate text through ALPRChecker majority + // voting + spatial dedup to stabilise readings. + // + // Mode is auto-detected by watching whether consecutive frames + // share the exact same (width, height) for at least + // CONFIRM_THRESHOLD frames. Pipeline crops vary by a few pixels; + // full-frame video keeps a constant frame size. NOTE(review): _imageSizeTrackers is not mutex-guarded, unlike _plateIdentities — confirm callers serialize access. 
+ // ---------------------------------------------------------------- + struct ImageSizeTracker { + cv::Size lastSize{ 0, 0 }; + int consistentCount = 0; + bool detectedFullFrame = false; + static constexpr int CONFIRM_THRESHOLD = 5; + static constexpr int MIN_FULLFRAME_WIDTH = 1000; + }; + std::unordered_map _imageSizeTrackers; + + [[nodiscard]] bool shouldUseALPRChecker(const cv::Size& imageSize, + const std::string& cameraId); + + // ---------------------------------------------------------------- + // Spatial plate identity persistence (ported from ANSALPR_OD) + // + // Prevents the same plate string from appearing on two different + // vehicles in the same frame. The LP tracker may briefly assign + // the same trackId to two different plates when vehicles pass + // each other, or two different trackIds to the same plate when + // occlusion breaks a track. In either case, OCR can produce the + // same text for two spatial locations for a frame or two — which + // looks like "plate flicker" in the UI. + // + // ensureUniquePlateText() resolves the ambiguity by accumulating + // confidence per spatial location. When two detections share a + // plate text, the one whose spatial history has the higher score + // wins; the other has its className cleared and is dropped from results. 
+ // ---------------------------------------------------------------- + struct SpatialPlateIdentity { + cv::Point2f center; // plate center in frame coords + std::string plateText; + float accumulatedScore = 0.0f; + int framesSinceLastSeen = 0; + }; + std::mutex _plateIdentitiesMutex; + std::unordered_map> _plateIdentities; + static constexpr float PLATE_SPATIAL_MATCH_THRESHOLD = 0.3f; // IoU threshold + + void ensureUniquePlateText(std::vector& results, + const std::string& cameraId); + // --- Original model zip path (reused for ANSONNXOCR initialization) --- std::string _modelZipFilePath; diff --git a/modules/ANSOCR/ANSONNXOCR/PaddleOCRV5Engine.cpp b/modules/ANSOCR/ANSONNXOCR/PaddleOCRV5Engine.cpp index 79fb541..71406d4 100644 --- a/modules/ANSOCR/ANSONNXOCR/PaddleOCRV5Engine.cpp +++ b/modules/ANSOCR/ANSONNXOCR/PaddleOCRV5Engine.cpp @@ -8,60 +8,62 @@ namespace ANSCENTER { namespace onnxocr { -bool PaddleOCRV5Engine::Initialize(const std::string& detModelPath, - const std::string& clsModelPath, - const std::string& recModelPath, - const std::string& dictPath, - bool preferTensorRT) { - std::lock_guard lock(_mutex); - ModelLoadingGuard mlg(_modelLoading); +// ============================================================================ +// Per-backend OCR option builders +// +// Each backend (NVIDIA / AMD / Intel / CPU) has its own helper that returns +// a fully-populated set of OrtHandlerOptions for the detector, classifier, +// and recognizer sub-models. PaddleOCRV5Engine::Initialize dispatches to the +// correct helper based on the engine type that EPLoader resolved at startup. +// +// Adding a new backend optimization is a strictly contained change: touch +// only that backend's builder. The others — especially NVIDIA, which is +// hand-tuned and should not regress — stay untouched. +// ============================================================================ - // High-perf options. The OCR sub-models split into two groups: - // - // 1. 
Detector — its input shape varies continuously with every - // plate-ROI aspect ratio. TRT EP is a poor fit because it - // builds a fresh engine for each unique shape (minutes each). - // We keep it on CUDA EP with the largest cuDNN workspace and - // let cuDNN HEURISTIC handle the per-shape algo selection. - // - // 2. Classifier + Recognizer — fixed-bucket shapes (cls is - // [1,3,80,160], rec is [1,3,48,{320,480,640,960}]). These - // benefit massively from TRT EP because the engine is built - // once per shape and reused forever. +namespace { + +struct PerModelOcrOptions { OrtHandlerOptions detectorOpts; - // Detector uses CUDA EP with *conservative* cuDNN workspace. - // Empirical: on VRAM-constrained GPUs (LPD TRT engine + rec TRT - // engine + ORT arena in play) the max-workspace mode causes cuDNN - // to pick Winograd/implicit-precomp-GEMM variants that silently - // fall back to slow NO-WORKSPACE algorithms when the big workspace - // can't be allocated. With "0" cuDNN picks algorithms that are - // known to fit and runs ~10x faster in practice. - detectorOpts.useMaxCudnnWorkspace = false; - detectorOpts.preferTensorRT = false; // never TRT for the detector - - // Classifier (fixed [1,3,80,160]): TRT with no profile is fine. OrtHandlerOptions classifierOpts; - classifierOpts.useMaxCudnnWorkspace = true; - classifierOpts.preferTensorRT = preferTensorRT; - classifierOpts.trtFP16 = true; - - // Recognizer: needs a DYNAMIC profile so one TRT engine covers every - // (batch, bucket_width) pair we generate at runtime. Without this, - // each new shape triggers a ~80s engine rebuild mid-stream when a - // new plate appears or the plate count changes. - // - // Profile range: - // batch : 1 .. 16 (16 plates worth of crops is generous) - // H : 48 (fixed) - // W : 320 .. 
960 (covers all 4 recognizer buckets) - // - // Query the actual input name from the .onnx file instead of - // hardcoding — PaddleOCR usually exports it as "x" but the name can - // vary across model versions. OrtHandlerOptions recognizerOpts; - recognizerOpts.useMaxCudnnWorkspace = true; - recognizerOpts.preferTensorRT = preferTensorRT; - recognizerOpts.trtFP16 = true; +}; + +// ---------------------------------------------------------------------------- +// NVIDIA — LOCKED. Do NOT modify this helper unless fixing a specific +// NVIDIA-observable regression. +// +// The OCR sub-models split into two groups: +// 1. Detector — variable input shape per plate-ROI aspect. TRT EP is a +// poor fit (one engine build per unique shape, minutes each). Runs on +// CUDA EP with *conservative* cuDNN workspace: empirical measurements +// showed that max-workspace mode forces cuDNN to pick Winograd/ +// implicit-precomp-GEMM variants that silently fall back to slow +// NO-WORKSPACE algorithms when the big workspace can't be allocated +// under VRAM pressure (LPD TRT engine + rec TRT engine + ORT arena). +// 2. Classifier + Recognizer — TRT EP. Classifier has fixed shape so no +// profile is needed. Recognizer gets a dynamic profile +// [batch=1..16, W=320..960] so a single pre-built engine handles every +// runtime shape without mid-stream rebuilds (fixes 60–90 s hangs). +// ---------------------------------------------------------------------------- +static PerModelOcrOptions BuildNvidiaOcrOptions( + const std::string& recModelPath, + bool preferTensorRT) { + PerModelOcrOptions opts; + + // Detector: CUDA EP, conservative workspace, never TRT. + opts.detectorOpts.useMaxCudnnWorkspace = false; + opts.detectorOpts.preferTensorRT = false; + + // Classifier: TRT EP, no profile (fixed [1,3,80,160]). 
+ opts.classifierOpts.useMaxCudnnWorkspace = true; + opts.classifierOpts.preferTensorRT = preferTensorRT; + opts.classifierOpts.trtFP16 = true; + + // Recognizer: TRT EP with dynamic shape profile. + opts.recognizerOpts.useMaxCudnnWorkspace = true; + opts.recognizerOpts.preferTensorRT = preferTensorRT; + opts.recognizerOpts.trtFP16 = true; if (preferTensorRT) { std::string recInputName = BasicOrtHandler::QueryModelInputName(recModelPath); if (recInputName.empty()) { @@ -72,10 +74,80 @@ bool PaddleOCRV5Engine::Initialize(const std::string& detModelPath, std::cout << "[PaddleOCRV5Engine] Recognizer input name: '" << recInputName << "' — building TRT dynamic profile " << "[batch=1..16, W=320..960]" << std::endl; - recognizerOpts.trtProfileMinShapes = recInputName + ":1x3x48x320"; - recognizerOpts.trtProfileOptShapes = recInputName + ":4x3x48x480"; - recognizerOpts.trtProfileMaxShapes = recInputName + ":16x3x48x960"; + opts.recognizerOpts.trtProfileMinShapes = recInputName + ":1x3x48x320"; + opts.recognizerOpts.trtProfileOptShapes = recInputName + ":4x3x48x480"; + opts.recognizerOpts.trtProfileMaxShapes = recInputName + ":16x3x48x960"; } + return opts; +} + +// ---------------------------------------------------------------------------- +// Intel (OpenVINO EP) — placeholder. +// +// Returns default-constructed options: no backend-specific tuning applied +// yet. When adding Intel optimizations (OpenVINO cache_dir, explicit device +// selection, INT8 paths, etc.), add the corresponding fields to the Intel +// section of OrtHandlerOptions and populate them here. +// ---------------------------------------------------------------------------- +static PerModelOcrOptions BuildIntelOcrOptions() { + return PerModelOcrOptions{}; // defaults everywhere +} + +// ---------------------------------------------------------------------------- +// AMD (DirectML EP / MIGraphX EP) — placeholder. +// +// Returns default-constructed options: no backend-specific tuning applied +// yet. 
When adding AMD optimizations (graph opt gate for RDNA3+ desktop +// cards, MIGraphX cache on Linux, etc.), add the corresponding fields to +// the AMD section of OrtHandlerOptions and populate them here. +// ---------------------------------------------------------------------------- +static PerModelOcrOptions BuildAmdOcrOptions() { + return PerModelOcrOptions{}; // defaults everywhere +} + +// ---------------------------------------------------------------------------- +// CPU / unknown hardware — no tuning. +// ---------------------------------------------------------------------------- +static PerModelOcrOptions BuildDefaultOcrOptions() { + return PerModelOcrOptions{}; // defaults everywhere +} + +// Dispatch entry point used by Initialize(); unrecognized backends get the defaults. +static PerModelOcrOptions BuildOcrOptionsForBackend( + const std::string& recModelPath, + bool preferTensorRT) { + const EngineType backend = EPLoader::Current().type; + switch (backend) { + case EngineType::NVIDIA_GPU: + return BuildNvidiaOcrOptions(recModelPath, preferTensorRT); + case EngineType::AMD_GPU: + return BuildAmdOcrOptions(); + case EngineType::OPENVINO_GPU: + return BuildIntelOcrOptions(); + case EngineType::CPU: + default: + return BuildDefaultOcrOptions(); + } +} + +} // namespace (anonymous) + +bool PaddleOCRV5Engine::Initialize(const std::string& detModelPath, + const std::string& clsModelPath, + const std::string& recModelPath, + const std::string& dictPath, + bool preferTensorRT) { + std::lock_guard lock(_mutex); + ModelLoadingGuard mlg(_modelLoading); + + // Dispatch to the correct per-backend option builder. The NVIDIA path + // is fully locked-in; AMD/Intel/CPU paths currently return defaults + // and are the place to add future backend-specific tuning. 
+ const PerModelOcrOptions opts = + BuildOcrOptionsForBackend(recModelPath, preferTensorRT); + const OrtHandlerOptions& detectorOpts = opts.detectorOpts; + const OrtHandlerOptions& classifierOpts = opts.classifierOpts; + const OrtHandlerOptions& recognizerOpts = opts.recognizerOpts; try { // Initialize detector (also triggers EPLoader init in BasicOrtHandler)