Support tracker to improve ALPR_OCR

2026-04-14 21:18:10 +10:00
parent f9a0af8949
commit 5706615ed5
4 changed files with 435 additions and 62 deletions
--- a/modules/ANSLPR/ANSLPR_OCR.cpp
+++ b/modules/ANSLPR/ANSLPR_OCR.cpp
@@ -547,6 +547,181 @@ namespace ANSCENTER
 		return colour;
 	}

+	// ── Full-frame vs pipeline auto-detection ────────────────────────────
+	// Mirror of ANSALPR_OD::shouldUseALPRChecker. Returns true once the
+	// frames seen for `cameraId` have had the exact same (width, height)
+	// CONFIRM_THRESHOLD times in a row; any size change resets the count
+	// and returns false. Pre-cropped pipeline inputs are expected to vary
+	// by a few pixels per crop, so they should not reach the threshold.
+	// Fixed-resolution video does, which flips us into FULL-FRAME mode:
+	// ALPRChecker voting + ensureUniquePlateText dedup.
+	bool ANSALPR_OCR::shouldUseALPRChecker(const cv::Size& imageSize,
+	                                        const std::string& cameraId) {
+		// Force disabled via SetALPRCheckerEnabled(false) → never use.
+		if (!_enableALPRChecker) return false;
+
+		// Narrow images are treated as pipeline crops; tracker state is left untouched.
+		if (imageSize.width < ImageSizeTracker::MIN_FULLFRAME_WIDTH) return false;
+
+		auto& tracker = _imageSizeTrackers[cameraId];  // default entry created on first use; NOTE(review): no lock taken here — confirm callers serialize
+		bool wasFullFrame = tracker.detectedFullFrame;  // kept so the log below fires only on a mode change
+		if (imageSize == tracker.lastSize) {
+			tracker.consistentCount++;
+			if (tracker.consistentCount >= ImageSizeTracker::CONFIRM_THRESHOLD) {
+				tracker.detectedFullFrame = true;
+			}
+		} else {
+			tracker.lastSize          = imageSize;
+			tracker.consistentCount   = 1;
+			tracker.detectedFullFrame = false;
+		}
+		if (tracker.detectedFullFrame != wasFullFrame) {
+			ANS_DBG("ALPR_OCR_Checker",
+			        "cam=%s mode auto-detected: %s (img=%dx%d consistent=%d)",
+			        cameraId.c_str(),
+			        tracker.detectedFullFrame ? "FULL-FRAME (tracker ON)" : "PIPELINE (tracker OFF)",
+			        imageSize.width, imageSize.height, tracker.consistentCount);
+		}
+		return tracker.detectedFullFrame;
+	}
+
+	// ── Spatial plate dedup with accumulated scoring ─────────────────────
+	// Mirror of ANSALPR_OD::ensureUniquePlateText. When more than one
+	// detection in `results` carries the same className (plate text), only
+	// one may keep it. Each candidate is scored as its own confidence plus
+	// the accumulatedScore of the stored identity for `cameraId` that has
+	// the same text and overlaps its box; the highest score wins. Losers
+	// have className cleared and are erased from `results`. Stored
+	// identities are updated under _plateIdentitiesMutex on every call.
+	void ANSALPR_OCR::ensureUniquePlateText(std::vector<Object>& results,
+	                                        const std::string& cameraId) {
+		std::lock_guard<std::mutex> plateLock(_plateIdentitiesMutex);
+		auto& identities = _plateIdentities[cameraId];
+
+		// Auto-detect mode by detection count.
+		//   0–1 detections → nothing to dedup; only age stored identities.
+		//   2+ detections  → full-frame mode → apply accumulated scoring.
+		if (results.size() <= 1) {
+			// Age all identities (the lone detection does not refresh any) and drop stale ones.
+			if (!identities.empty()) {
+				constexpr int MAX_UNSEEN_FRAMES = 30;
+				for (auto& id : identities) id.framesSinceLastSeen++;
+				for (auto it = identities.begin(); it != identities.end(); ) {
+					if (it->framesSinceLastSeen > MAX_UNSEEN_FRAMES) {
+						it = identities.erase(it);
+					} else {
+						++it;
+					}
+				}
+			}
+			return;
+		}
+
+		// Helper: intersection-over-union of two rects; 0 when they do not overlap.
+		auto computeIoU = [](const cv::Rect& a, const cv::Rect& b) -> float {
+			int x1 = std::max(a.x, b.x);
+			int y1 = std::max(a.y, b.y);
+			int x2 = std::min(a.x + a.width,  b.x + b.width);
+			int y2 = std::min(a.y + a.height, b.y + b.height);
+			if (x2 <= x1 || y2 <= y1) return 0.0f;
+			float intersection = static_cast<float>((x2 - x1) * (y2 - y1));
+			float unionArea = static_cast<float>(a.area() + b.area()) - intersection;
+			return (unionArea > 0.0f) ? intersection / unionArea : 0.0f;
+		};
+
+		// Helper: first same-text identity near `box` (stored center + box's own size → IoU test).
+		auto findSpatialMatch = [&](const cv::Rect& box,
+		                            const std::string& plateText) -> SpatialPlateIdentity* {
+			for (auto& id : identities) {
+				if (id.plateText == plateText) {
+					cv::Rect storedRect(
+						static_cast<int>(id.center.x - box.width  * 0.5f),
+						static_cast<int>(id.center.y - box.height * 0.5f),
+						box.width, box.height);
+					if (computeIoU(box, storedRect) > PLATE_SPATIAL_MATCH_THRESHOLD) {
+						return &id;
+					}
+				}
+			}
+			return nullptr;
+		};
+
+		// Step 1: Build map of plateText → candidate indices (empty texts are skipped)
+		std::unordered_map<std::string, std::vector<size_t>> plateCandidates;
+		for (size_t i = 0; i < results.size(); ++i) {
+			if (results[i].className.empty()) continue;
+			plateCandidates[results[i].className].push_back(i);
+		}
+
+		// Step 2: Resolve duplicates by accumulated score (ties keep the earliest candidate)
+		for (auto& [plateText, indices] : plateCandidates) {
+			if (indices.size() <= 1) continue;
+
+			size_t winner = indices[0];
+			float  bestScore = 0.0f;
+
+			for (size_t idx : indices) {
+				float score = results[idx].confidence;
+				auto* match = findSpatialMatch(results[idx].box, plateText);
+				if (match) {
+					score = match->accumulatedScore + results[idx].confidence;
+				}
+				if (score > bestScore) {
+					bestScore = score;
+					winner    = idx;
+				}
+			}
+
+			for (size_t idx : indices) {
+				if (idx != winner) {
+					results[idx].className.clear();
+				}
+			}
+		}
+
+		// Step 3: Update identities — surviving detections accumulate, unmatched ones decay
+		constexpr float DECAY_FACTOR     = 0.8f;
+		constexpr float MIN_SCORE        = 0.1f;
+		constexpr int   MAX_UNSEEN_FRAMES = 30;
+
+		for (auto& id : identities) id.framesSinceLastSeen++;
+
+		for (auto& r : results) {
+			if (r.className.empty()) continue;
+
+			cv::Point2f center(
+				r.box.x + r.box.width  * 0.5f,
+				r.box.y + r.box.height * 0.5f);
+
+			auto* match = findSpatialMatch(r.box, r.className);
+			if (match) {
+				match->accumulatedScore += r.confidence;
+				match->center            = center;
+				match->framesSinceLastSeen = 0;
+			} else {
+				identities.push_back({ center, r.className, r.confidence, 0 });  // center, plateText, accumulatedScore, framesSinceLastSeen
+			}
+		}
+
+		// Decay identities not matched this frame; drop low-score or long-unseen ones
+		for (auto it = identities.begin(); it != identities.end(); ) {
+			if (it->framesSinceLastSeen > 0) {
+				it->accumulatedScore *= DECAY_FACTOR;
+			}
+			if (it->accumulatedScore < MIN_SCORE || it->framesSinceLastSeen > MAX_UNSEEN_FRAMES) {
+				it = identities.erase(it);
+			} else {
+				++it;
+			}
+		}
+
+		// Step 4: Remove every entry whose className is empty (Step 2 losers included)
+		results.erase(
+			std::remove_if(results.begin(), results.end(),
+				[](const Object& o) { return o.className.empty(); }),
+			results.end());
+	}
+
 	// ── OCR on a single plate ROI ────────────────────────────────────────
 	// Returns the plate text via the out-parameter and populates alprExtraInfo
 	// with the structured ALPR JSON (zone parts) when ALPR mode is active.
@@ -712,6 +887,13 @@ namespace ANSCENTER
 			std::vector<Object> output;
 			output.reserve(plateInfos.size());

+			// Decide once per frame whether the tracker-based correction
+			// layer should run. We auto-detect full-frame vs pipeline mode
+			// by watching for consecutive frames with identical dimensions,
+			// the same way ANSALPR_OD does it.
+			const bool useChecker = shouldUseALPRChecker(
+				cv::Size(frameWidth, frameHeight), cameraId);
+
 			for (const auto& info : plateInfos) {
 				std::string combinedText;
 				for (size_t cropIdx : info.cropIndices) {
@@ -726,8 +908,9 @@ namespace ANSCENTER
 				Object lprObject = lprOutput[info.origIndex];
 				lprObject.cameraId = cameraId;

-				// Cross-frame stabilization (unchanged)
-				if (_enableALPRChecker) {
+				// Cross-frame stabilization: per-track majority vote in
+				// full-frame mode, raw OCR text in pipeline mode.
+				if (useChecker) {
 					lprObject.className = alprChecker.checkPlateByTrackId(
 						cameraId, combinedText, lprObject.trackId);
 				}
@@ -747,6 +930,14 @@ namespace ANSCENTER
 				output.push_back(std::move(lprObject));
 			}

+			// Spatial dedup: if two detections in the same frame ended up
+			// with the same plate text, keep only the one whose spatial
+			// history has the higher accumulated confidence. Skip this in
+			// pipeline mode because there's only ever one plate per call.
+			if (useChecker) {
+				ensureUniquePlateText(output, cameraId);
+			}
+
 			return output;
 		}
 		catch (const cv::Exception& e) {
--- a/modules/ANSLPR/ANSLPR_OCR.h
+++ b/modules/ANSLPR/ANSLPR_OCR.h
@@ -6,6 +6,7 @@
 #include <map>
 #include <string>
 #include <mutex>
+#include <unordered_map>
 #include <utility>
 #include <vector>

@@ -45,6 +46,66 @@ namespace ANSCENTER

        ALPRChecker     alprChecker;

+        // ----------------------------------------------------------------
+        //  Full-frame vs pipeline auto-detection (ported from ANSALPR_OD)
+        //
+        //  When the caller feeds ANSLPR_OCR pre-cropped vehicle ROIs (each
+        //  frame is a different small image), the tracker can't work — the
+        //  LP detector sees a totally new image every call so trackIds mean
+        //  nothing. In that "pipeline" mode we must skip the ALPRChecker
+        //  voting layer entirely and return raw OCR results.
+        //
+        //  When the caller feeds full-frame video (same resolution every
+        //  frame, plates moving through the scene), the tracker works
+        //  normally and we run plate text through ALPRChecker majority
+        //  voting + spatial dedup to stabilise readings.
+        //
+        //  Mode is auto-detected by watching whether consecutive frames
+        //  share the exact same (width, height) for at least
+        //  CONFIRM_THRESHOLD frames. Pipeline crops vary by a few pixels;
+        //  full-frame video keeps one fixed resolution.
+        // ----------------------------------------------------------------
+        struct ImageSizeTracker {
+            cv::Size lastSize{ 0, 0 };  // size of the most recent frame that was compared
+            int      consistentCount = 0;  // consecutive frames seen with lastSize
+            bool     detectedFullFrame = false;  // true once consistentCount reaches the threshold
+            static constexpr int CONFIRM_THRESHOLD   = 5;  // same-size frames needed to confirm full-frame
+            static constexpr int MIN_FULLFRAME_WIDTH = 1000;  // frames narrower than this are never full-frame
+        };
+        std::unordered_map<std::string, ImageSizeTracker> _imageSizeTrackers;
+
+        [[nodiscard]] bool shouldUseALPRChecker(const cv::Size& imageSize,
+                                                const std::string& cameraId);
+
+        // ----------------------------------------------------------------
+        //  Spatial plate identity persistence (ported from ANSALPR_OD)
+        //
+        //  Prevents the same plate string from appearing on two different
+        //  vehicles in the same frame. The LP tracker may briefly assign
+        //  the same trackId to two different plates when vehicles pass
+        //  each other, or two different trackIds to the same plate when
+        //  occlusion breaks a track. In either case, OCR can produce the
+        //  same text for two spatial locations for a frame or two — which
+        //  looks like "plate flicker" in the UI.
+        //
+        //  ensureUniquePlateText() resolves the ambiguity by accumulating
+        //  confidence per spatial location. When two detections share a
+        //  plate text, the one whose spatial history has the higher score
+        //  wins and the other is removed from the results.
+        // ----------------------------------------------------------------
+        struct SpatialPlateIdentity {
+            cv::Point2f center;                   // plate center in frame coords
+            std::string plateText;  // plate text recorded for this location
+            float       accumulatedScore    = 0.0f;  // summed confidences; multiplied by a decay factor while unseen
+            int         framesSinceLastSeen = 0;  // reset to 0 whenever a detection matches this identity
+        };
+        std::mutex _plateIdentitiesMutex;
+        std::unordered_map<std::string, std::vector<SpatialPlateIdentity>> _plateIdentities;
+        static constexpr float PLATE_SPATIAL_MATCH_THRESHOLD = 0.3f; // IoU threshold
+
+        void ensureUniquePlateText(std::vector<Object>& results,
+                                   const std::string& cameraId);
+
        // --- Original model zip path (reused for ANSONNXOCR initialization) ---
        std::string     _modelZipFilePath;

--- a/modules/ANSOCR/ANSONNXOCR/PaddleOCRV5Engine.cpp
+++ b/modules/ANSOCR/ANSONNXOCR/PaddleOCRV5Engine.cpp
@@ -8,60 +8,62 @@
 namespace ANSCENTER {
 namespace onnxocr {

-bool PaddleOCRV5Engine::Initialize(const std::string& detModelPath,
-                                    const std::string& clsModelPath,
-                                    const std::string& recModelPath,
-                                    const std::string& dictPath,
-                                    bool preferTensorRT) {
-    std::lock_guard<std::recursive_mutex> lock(_mutex);
-    ModelLoadingGuard mlg(_modelLoading);
+// ============================================================================
+//  Per-backend OCR option builders
+//
+//  Each backend (NVIDIA / AMD / Intel / CPU) has its own helper that returns
+//  a fully-populated set of OrtHandlerOptions for the detector, classifier,
+//  and recognizer sub-models. PaddleOCRV5Engine::Initialize dispatches to the
+//  correct helper based on the engine type that EPLoader resolved at startup.
+//
+//  Adding a new backend optimization is a strictly contained change: touch
+//  only that backend's builder. The others — especially NVIDIA, which is
+//  hand-tuned and should not regress — stay untouched.
+// ============================================================================

-    // High-perf options.  The OCR sub-models split into two groups:
-    //
-    //   1. Detector — its input shape varies continuously with every
-    //      plate-ROI aspect ratio.  TRT EP is a poor fit because it
-    //      builds a fresh engine for each unique shape (minutes each).
-    //      We keep it on CUDA EP with the largest cuDNN workspace and
-    //      let cuDNN HEURISTIC handle the per-shape algo selection.
-    //
-    //   2. Classifier + Recognizer — fixed-bucket shapes (cls is
-    //      [1,3,80,160], rec is [1,3,48,{320,480,640,960}]).  These
-    //      benefit massively from TRT EP because the engine is built
-    //      once per shape and reused forever.
+namespace {
+
+struct PerModelOcrOptions {
    OrtHandlerOptions detectorOpts;
-    // Detector uses CUDA EP with *conservative* cuDNN workspace.
-    // Empirical: on VRAM-constrained GPUs (LPD TRT engine + rec TRT
-    // engine + ORT arena in play) the max-workspace mode causes cuDNN
-    // to pick Winograd/implicit-precomp-GEMM variants that silently
-    // fall back to slow NO-WORKSPACE algorithms when the big workspace
-    // can't be allocated. With "0" cuDNN picks algorithms that are
-    // known to fit and runs ~10x faster in practice.
-    detectorOpts.useMaxCudnnWorkspace = false;
-    detectorOpts.preferTensorRT       = false;   // never TRT for the detector
-
-    // Classifier (fixed [1,3,80,160]): TRT with no profile is fine.
    OrtHandlerOptions classifierOpts;
-    classifierOpts.useMaxCudnnWorkspace = true;
-    classifierOpts.preferTensorRT       = preferTensorRT;
-    classifierOpts.trtFP16              = true;
-
-    // Recognizer: needs a DYNAMIC profile so one TRT engine covers every
-    // (batch, bucket_width) pair we generate at runtime. Without this,
-    // each new shape triggers a ~80s engine rebuild mid-stream when a
-    // new plate appears or the plate count changes.
-    //
-    // Profile range:
-    //   batch  : 1 .. 16       (16 plates worth of crops is generous)
-    //   H      : 48 (fixed)
-    //   W      : 320 .. 960    (covers all 4 recognizer buckets)
-    //
-    // Query the actual input name from the .onnx file instead of
-    // hardcoding — PaddleOCR usually exports it as "x" but the name can
-    // vary across model versions.
    OrtHandlerOptions recognizerOpts;
-    recognizerOpts.useMaxCudnnWorkspace = true;
-    recognizerOpts.preferTensorRT       = preferTensorRT;
-    recognizerOpts.trtFP16              = true;
+};
+
+// ----------------------------------------------------------------------------
+//  NVIDIA — LOCKED. Do NOT modify this helper unless fixing a specific
+//  NVIDIA-observable regression.
+//
+//  The OCR sub-models split into two groups:
+//    1. Detector — variable input shape per plate-ROI aspect. TRT EP is a
+//       poor fit (one engine build per unique shape, minutes each). Runs on
+//       CUDA EP with *conservative* cuDNN workspace: empirical measurements
+//       showed that max-workspace mode forces cuDNN to pick Winograd/
+//       implicit-precomp-GEMM variants that silently fall back to slow
+//       NO-WORKSPACE algorithms when the big workspace can't be allocated
+//       under VRAM pressure (LPD TRT engine + rec TRT engine + ORT arena).
+//    2. Classifier + Recognizer — TRT EP. Classifier has fixed shape so no
+//       profile is needed. Recognizer gets a dynamic profile
+//       [batch=1..16, W=320..960] so a single pre-built engine handles every
+//       runtime shape without mid-stream rebuilds (fixes 60–90 s hangs).
+// ----------------------------------------------------------------------------
+static PerModelOcrOptions BuildNvidiaOcrOptions(
+        const std::string& recModelPath,
+        bool preferTensorRT) {
+    PerModelOcrOptions opts;
+
+    // Detector: CUDA EP, conservative workspace, never TRT.
+    opts.detectorOpts.useMaxCudnnWorkspace = false;
+    opts.detectorOpts.preferTensorRT       = false;
+
+    // Classifier: TRT EP, no profile (fixed [1,3,80,160]).
+    opts.classifierOpts.useMaxCudnnWorkspace = true;
+    opts.classifierOpts.preferTensorRT       = preferTensorRT;
+    opts.classifierOpts.trtFP16              = true;
+
+    // Recognizer: TRT EP with dynamic shape profile.
+    opts.recognizerOpts.useMaxCudnnWorkspace = true;
+    opts.recognizerOpts.preferTensorRT       = preferTensorRT;
+    opts.recognizerOpts.trtFP16              = true;
    if (preferTensorRT) {
        std::string recInputName = BasicOrtHandler::QueryModelInputName(recModelPath);
        if (recInputName.empty()) {
@@ -72,10 +74,80 @@ bool PaddleOCRV5Engine::Initialize(const std::string& detModelPath,
        std::cout << "[PaddleOCRV5Engine] Recognizer input name: '"
                  << recInputName << "' — building TRT dynamic profile "
                  << "[batch=1..16, W=320..960]" << std::endl;
-        recognizerOpts.trtProfileMinShapes = recInputName + ":1x3x48x320";
-        recognizerOpts.trtProfileOptShapes = recInputName + ":4x3x48x480";
-        recognizerOpts.trtProfileMaxShapes = recInputName + ":16x3x48x960";
+        opts.recognizerOpts.trtProfileMinShapes = recInputName + ":1x3x48x320";
+        opts.recognizerOpts.trtProfileOptShapes = recInputName + ":4x3x48x480";
+        opts.recognizerOpts.trtProfileMaxShapes = recInputName + ":16x3x48x960";
    }
+    return opts;
+}
+
+// ----------------------------------------------------------------------------
+//  Intel (OpenVINO EP) — placeholder.
+//
+//  Returns default-constructed options: no backend-specific tuning applied
+//  yet. When adding Intel optimizations (OpenVINO cache_dir, explicit device
+//  selection, INT8 paths, etc.), add the corresponding fields to the Intel
+//  section of OrtHandlerOptions and populate them here.
+// ----------------------------------------------------------------------------
+static PerModelOcrOptions BuildIntelOcrOptions() {
+    return PerModelOcrOptions{};  // detector, classifier and recognizer options all value-initialized
+}
+
+// ----------------------------------------------------------------------------
+//  AMD (DirectML EP / MIGraphX EP) — placeholder.
+//
+//  Returns default-constructed options: no backend-specific tuning applied
+//  yet. When adding AMD optimizations (graph opt gate for RDNA3+ desktop
+//  cards, MIGraphX cache on Linux, etc.), add the corresponding fields to
+//  the AMD section of OrtHandlerOptions and populate them here.
+// ----------------------------------------------------------------------------
+static PerModelOcrOptions BuildAmdOcrOptions() {
+    return PerModelOcrOptions{};  // detector, classifier and recognizer options all value-initialized
+}
+
+// ----------------------------------------------------------------------------
+//  CPU / unknown hardware — no tuning; also the fallback for unlisted engine types.
+// ----------------------------------------------------------------------------
+static PerModelOcrOptions BuildDefaultOcrOptions() {
+    return PerModelOcrOptions{};  // detector, classifier and recognizer options all value-initialized
+}
+
+// Dispatch entry point used by Initialize(): picks a builder from EPLoader::Current().type.
+static PerModelOcrOptions BuildOcrOptionsForBackend(
+        const std::string& recModelPath,
+        bool preferTensorRT) {
+    const EngineType backend = EPLoader::Current().type;
+    switch (backend) {
+        case EngineType::NVIDIA_GPU:
+            return BuildNvidiaOcrOptions(recModelPath, preferTensorRT);  // only builder that uses the arguments
+        case EngineType::AMD_GPU:
+            return BuildAmdOcrOptions();
+        case EngineType::OPENVINO_GPU:
+            return BuildIntelOcrOptions();
+        case EngineType::CPU:
+        default:  // any EngineType value not listed above also gets defaults
+            return BuildDefaultOcrOptions();
+    }
+}
+
+} // namespace (anonymous)
+
+bool PaddleOCRV5Engine::Initialize(const std::string& detModelPath,
+                                    const std::string& clsModelPath,
+                                    const std::string& recModelPath,
+                                    const std::string& dictPath,
+                                    bool preferTensorRT) {
+    std::lock_guard<std::recursive_mutex> lock(_mutex);
+    ModelLoadingGuard mlg(_modelLoading);
+
+    // Dispatch to the correct per-backend option builder. The NVIDIA path
+    // is fully locked-in; AMD/Intel/CPU paths currently return defaults
+    // and are the place to add future backend-specific tuning.
+    const PerModelOcrOptions opts =
+        BuildOcrOptionsForBackend(recModelPath, preferTensorRT);
+    const OrtHandlerOptions& detectorOpts   = opts.detectorOpts;
+    const OrtHandlerOptions& classifierOpts = opts.classifierOpts;
+    const OrtHandlerOptions& recognizerOpts = opts.recognizerOpts;

    try {
        // Initialize detector (also triggers EPLoader init in BasicOrtHandler)