Refactor project structure

2026-03-28 19:56:39 +11:00
parent 1d267378b2
commit 8a2e721058
511 changed files with 59 additions and 48 deletions
--- a/modules/ANSODEngine/NV12PreprocessHelper.h
+++ b/modules/ANSODEngine/NV12PreprocessHelper.h
@@ -0,0 +1,226 @@
+#ifndef NV12_PREPROCESS_HELPER_H
+#define NV12_PREPROCESS_HELPER_H
+#pragma once
+
+// NV12PreprocessHelper — shared NV12/CUDA zero-copy preprocessing utility.
+//
+// Encapsulates all NV12 fast-path logic previously in ANSRTYOLO:
+//   - Warmup gating (skip NV12 for first N inferences)
+//   - Registry lookup + lock lifecycle
+//   - GPU matching (decode GPU vs inference GPU, cross-GPU BGR fallback)
+//   - Pinned buffer management (reusable cudaHostAlloc)
+//   - CUDA zero-copy path (wrap NVDEC device ptrs as GpuMat)
+//   - CPU NV12 upload path (SEH-protected memcpy + pinned upload)
+//   - CUDA health circuit breaker
+//   - BGR full-res path (pixFmt=1000)
+//   - One-shot diagnostic logging flags
+//
+// Composition, not inheritance — engines add it as a private member.
+// CUDA coupling stays explicit and opt-in (ONNX/OpenVINO engines unaffected).
+
+#include <functional>
+#include <cuda_runtime.h>
+#include <opencv2/core/cuda.hpp>
+#ifdef _WIN32
+#define WIN32_LEAN_AND_MEAN
+#ifndef NOMINMAX
+#define NOMINMAX
+#endif
+#include <windows.h>       // GetModuleHandleA, GetProcAddress for cross-DLL TLS
+#endif
+
+// ANSENGINE_API is defined by ANSEngineCommon.h (included before this header
+// via ANSRTYOLO.h / ANSYOLOV12RTOD.h / ANSYOLOV10RTOD.h).
+// Fallback for standalone inclusion. __declspec is only understood by
+// Windows toolchains, so other compilers get a default-visibility attribute
+// (GCC/Clang) or an empty expansion instead of a hard compile error.
+#ifndef ANSENGINE_API
+#if defined(_WIN32)
+#define ANSENGINE_API __declspec(dllexport)
+#elif defined(__GNUC__)
+#define ANSENGINE_API __attribute__((visibility("default")))
+#else
+#define ANSENGINE_API
+#endif
+#endif
+
+// Forward declarations — avoid pulling in heavy headers
+struct GpuFrameData;
+
+// Thread-local: current GpuFrameData* for the active inference call.
+// Set by RunInferenceComplete_LV (dllmain.cpp) before calling engine->RunInference().
+// Read by tryNV12() and direct engine lookups instead of registry lookup by datastart.
+//
+// CROSS-DLL SHARING: The canonical thread_local lives in ANSODEngine.dll,
+// exported via ANSODEngine_GetTlsGpuFrame().  All DLLs (ANSLPR, ANSFR,
+// ANSOCR) resolve to that SAME slot via GetProcAddress at first call.
+// This ensures that when ANSLPR sets the frame pointer, ANSODEngine's
+// tryNV12() sees it — enabling NV12 zero-copy for ALPR, FR, and OCR.
+//
+// Previous bug: the old `inline` + `thread_local` created separate TLS
+// instances per DLL on MSVC, silently disabling NV12 for all wrapper DLLs.
+inline GpuFrameData*& tl_currentGpuFrame() {
+    // Signature of the accessor exported by ANSODEngine.dll; it returns the
+    // address of the canonical per-thread GpuFrameData* slot.
+    using SlotAccessor = GpuFrameData** (*)();
+
+    // The lookup runs once, when this function-local static is initialised;
+    // the outcome (including "not found") is reused for every later call.
+    static SlotAccessor s_accessor = []() -> SlotAccessor {
+        SlotAccessor resolved = nullptr;
+#ifdef _WIN32
+        if (HMODULE engine = GetModuleHandleA("ANSODEngine.dll")) {
+            // A missing export yields nullptr from GetProcAddress, which
+            // leaves `resolved` null and selects the fallback below.
+            resolved = reinterpret_cast<SlotAccessor>(
+                GetProcAddress(engine, "ANSODEngine_GetTlsGpuFrame"));
+        }
+#endif
+        return resolved;
+    }();
+
+    if (!s_accessor) {
+        // No exported accessor available (e.g. unit tests that do not load
+        // ANSODEngine.dll): fall back to a thread-local slot defined here.
+        thread_local GpuFrameData* t_fallbackSlot = nullptr;
+        return t_fallbackSlot;
+    }
+
+    return *s_accessor();
+}
+
+namespace ANSCENTER {
+
+    // Forward declare SPDLogger (defined in ANSLicense.h, ANSCENTER namespace)
+    class SPDLogger;
+
+    // Engine-specific CUDA kernel launcher, passed to tryNV12() as a callback.
+    // A default YOLO launcher is provided as a static method of
+    // NV12PreprocessHelper.
+    //
+    // Arguments, in call order:
+    //   gpuY, gpuUV     source planes (presumably the NV12 luma and chroma
+    //                   planes — confirm against the launcher implementations)
+    //   gpuOut          output GpuMat filled by the launcher
+    //   srcW, srcH      source dimensions
+    //   inputW, inputH  target (model input) dimensions
+    //   stream          CUDA stream handed to the launcher
+    using NV12KernelLauncher = std::function<void(
+        const cv::cuda::GpuMat& gpuY,
+        const cv::cuda::GpuMat& gpuUV,
+        cv::cuda::GpuMat& gpuOut,
+        int srcW,
+        int srcH,
+        int inputW,
+        int inputH,
+        cudaStream_t stream)>;
+
+    // Outcome of tryNV12(): reports what happened and how the caller should
+    // proceed. All scalar fields start at the neutral defaults shown below.
+    struct NV12Result {
+        // true = gpuRGB is valid model input
+        bool succeeded{false};
+        // Model-input-sized, letterboxed RGB image
+        cv::cuda::GpuMat gpuRGB;
+        // Display-resolution width (for coordinate mapping)
+        float metaWidth{0.f};
+        // Display-resolution height
+        float metaHeight{0.f};
+        // Letterbox scale
+        float ratio{1.f};
+        // true = pixFmt=1000 (BGR full-res) path
+        bool useBgrFullRes{false};
+        // Valid only when useBgrFullRes is set
+        cv::Mat bgrFullResImg;
+        // display/fullRes scale along X
+        float bgrFullResScaleX{1.f};
+        // display/fullRes scale along Y
+        float bgrFullResScaleY{1.f};
+    };
+
+    class ANSENGINE_API NV12PreprocessHelper {
+    public:
+        NV12PreprocessHelper() = default;
+        ~NV12PreprocessHelper();
+
+        // Copying is disabled because the helper owns its pinned buffer.
+        NV12PreprocessHelper(NV12PreprocessHelper const&) = delete;
+        NV12PreprocessHelper& operator=(NV12PreprocessHelper const&) = delete;
+
+        // ── Main entry point ─────────────────────────────────────────────
+        // Attempts the NV12 fast path for `inputImage`. The returned
+        // NV12Result tells the caller whether it succeeded and which
+        // alternative (if any) to use. `launcher` is the engine-specific
+        // kernel callback; `tag` is a caller-supplied label passed along
+        // with `logger`.
+        NV12Result tryNV12(
+            const cv::Mat& inputImage, int inferenceGpu,
+            int inputW, int inputH,
+            const NV12KernelLauncher& launcher,
+            SPDLogger& logger, const char* tag);
+
+        // Advance the inference counter; call once after every inference
+        // (NV12 or BGR). The counter stops growing at
+        // NV12_WARMUP_THRESHOLD + 1.
+        void tickInference() {
+            if (m_inferenceCount > NV12_WARMUP_THRESHOLD) {
+                return;  // already saturated
+            }
+            m_inferenceCount += 1;
+        }
+
+        // ── CUDA context health ──────────────────────────────────────────
+        bool isCudaContextHealthy(SPDLogger& logger, const char* tag);
+
+        // ── Launcher factories ───────────────────────────────────────────
+        // Default YOLO kernel launcher.
+        static NV12KernelLauncher defaultYOLOLauncher();
+
+        // Classification launcher: direct resize, no letterbox.
+        static NV12KernelLauncher classificationLauncher();
+
+        // SCRFD launcher: center-padded letterbox (dw, dh), as opposed to
+        // YOLO's right-bottom padding.
+        static NV12KernelLauncher scrfdCenterLetterboxLauncher(int padLeft, int padTop);
+
+        // ── NV12→BGR fused resize (for OCR detection) ────────────────────
+        // One kernel: NV12 Y+UV → bilinear resize → BGR output (CV_8UC3),
+        // with no intermediate allocations.
+        static void nv12ToBGRResize(
+            const uint8_t* devY, int yPitch,
+            const uint8_t* devUV, int uvPitch,
+            uint8_t* bgrOut, int outPitch,
+            int outW, int outH,
+            int srcW, int srcH,
+            cudaStream_t stream = nullptr);
+
+        // ── NV12 affine warp for face alignment ─────────────────────────
+        // Reads the full-res NV12 frame, applies the inverse affine
+        // transform and produces an aligned BGR face (112x112). Used by the
+        // SCRFD face pipeline.
+        struct NV12AffineResult {
+            bool succeeded{false};
+            // Small face crop on the CPU (e.g. 112x112 BGR)
+            cv::Mat alignedFaceBGR;
+            // Same face on the GPU (avoids a re-upload in the recognizer)
+            cv::cuda::GpuMat gpuAlignedFace;
+        };
+
+        NV12AffineResult tryNV12AffineWarp(
+            const cv::Mat& inputImage, int inferenceGpu,
+            const cv::Mat& affineMatrix,
+            int outW, int outH,
+            float scaleX, float scaleY,
+            SPDLogger& logger, const char* tag);
+
+        // ── NV12 rectangular crop → BGR ──────────────────────────────────
+        // Cuts a rectangle out of the full-res NV12 frame held in GPU VRAM
+        // and returns it as a small BGR cv::Mat on the CPU. The LPR pipeline
+        // uses it for high-res plate crops without converting the whole 4K
+        // frame. `bbox` is in display-res coordinates; scaleX/scaleY map it
+        // to the full-res NV12 frame.
+        struct NV12CropResult {
+            bool succeeded{false};
+            // Cropped BGR on the CPU (full-res quality)
+            cv::Mat bgrCrop;
+        };
+
+        NV12CropResult tryNV12CropToBGR(
+            const cv::Mat& inputImage, int inferenceGpu,
+            const cv::Rect& bbox, int padding,
+            float scaleX, float scaleY,
+            SPDLogger& logger, const char* tag);
+
+        // ── Direct-to-buffer CHW output (SAM3 image encoder) ─────────────
+        // NV12→RGB conversion plus resize written straight into a
+        // pre-allocated TRT GPU buffer in CHW planar layout; no intermediate
+        // GpuMat is allocated.
+        struct NV12DirectResult {
+            bool succeeded{false};
+            float metaWidth{0.f};
+            float metaHeight{0.f};
+        };
+
+        // `isFloat32` selects the buffer element type:
+        // false = uint8 CHW, true = float32 CHW.
+        NV12DirectResult tryNV12DirectToBuffer(
+            const cv::Mat& inputImage, int inferenceGpu,
+            void* dstGpuBuffer, int inputW, int inputH,
+            bool isFloat32,
+            cudaStream_t stream,
+            SPDLogger& logger, const char* tag);
+
+        // ── Cleanup ──────────────────────────────────────────────────────
+        void destroy();
+
+    private:
+        void ensurePinnedBuffer(size_t bytes, SPDLogger& logger, const char* tag);
+
+        // Warmup gating: inference counter and its threshold.
+        int m_inferenceCount{0};
+        static constexpr int NV12_WARMUP_THRESHOLD = 30;
+
+        // Pinned (page-locked) host buffer for fast H2D uploads.
+        void*  m_pinnedBuf{nullptr};
+        size_t m_pinnedBufSize{0};
+
+        // CUDA context health circuit breaker state.
+        int  m_cudaFailStreak{0};
+        static constexpr int CUDA_FAIL_LIMIT = 10;
+        bool m_cudaContextDead{false};
+
+        // One-shot diagnostic logging flags (all start cleared).
+        bool m_gpuMismatchLogged{false};
+        bool m_bgrFullResLogged{false};
+        bool m_nv12NullLogged{false};
+        bool m_nv12DimLogged{false};
+        bool m_nv12DeadLogged{false};
+        bool m_nv12PathLogged{false};
+        bool m_nv12PinnedLogged{false};
+        bool m_nv12ActiveLogged{false};
+        bool m_nv12MetaLogged{false};
+        bool m_nv12CropLogged{false};
+    };
+
+} // namespace ANSCENTER
+#endif