// ANSCORE/ANSODEngine/NV12PreprocessHelper.h

#ifndef NV12_PREPROCESS_HELPER_H
#define NV12_PREPROCESS_HELPER_H
#pragma once

// NV12PreprocessHelper — shared NV12/CUDA zero-copy preprocessing utility.
//
// Encapsulates all NV12 fast-path logic previously in ANSRTYOLO:
//   - Warmup gating (skip NV12 for first N inferences)
//   - Registry lookup + lock lifecycle
//   - GPU matching (decode GPU vs inference GPU, cross-GPU BGR fallback)
//   - Pinned buffer management (reusable cudaHostAlloc)
//   - CUDA zero-copy path (wrap NVDEC device ptrs as GpuMat)
//   - CPU NV12 upload path (SEH-protected memcpy + pinned upload)
//   - CUDA health circuit breaker
//   - BGR full-res path (pixFmt=1000)
//   - One-shot diagnostic logging flags
//
// Composition, not inheritance — engines add it as a private member.
// CUDA coupling stays explicit and opt-in (ONNX/OpenVINO engines unaffected).

#include <functional>
#include <cuda_runtime.h>
#include <opencv2/core/cuda.hpp>
#ifdef _WIN32
#define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
#define NOMINMAX
#endif
#include <windows.h>       // GetModuleHandleA, GetProcAddress for cross-DLL TLS
#endif

// ANSENGINE_API is defined by ANSEngineCommon.h (included before this header
// via ANSRTYOLO.h / ANSYOLOV12RTOD.h / ANSYOLOV10RTOD.h).
// Guard for standalone inclusion.
#ifndef ANSENGINE_API
#define ANSENGINE_API __declspec(dllexport)
#endif

// Forward declarations — avoid pulling in heavy headers
struct GpuFrameData;

// Thread-local: current GpuFrameData* for the active inference call.
// Set by RunInferenceComplete_LV (dllmain.cpp) before calling engine->RunInference().
// Read by tryNV12() and direct engine lookups instead of registry lookup by datastart.
//
// CROSS-DLL SHARING: The canonical thread_local lives in ANSODEngine.dll,
// exported via ANSODEngine_GetTlsGpuFrame().  All DLLs (ANSLPR, ANSFR,
// ANSOCR) resolve to that SAME slot via GetProcAddress at first call.
// This ensures that when ANSLPR sets the frame pointer, ANSODEngine's
// tryNV12() sees it — enabling NV12 zero-copy for ALPR, FR, and OCR.
//
// Previous bug: the old `inline` + `thread_local` created separate TLS
// instances per DLL on MSVC, silently disabling NV12 for all wrapper DLLs.
//
// Returns: a mutable reference to the calling thread's frame-pointer slot.
// The reference is always valid; the pointer it holds may be nullptr.
//
// NOTE: the lookup below runs exactly once per copy of this function's static
// state. If ANSODEngine.dll is not yet loaded (or the export is missing) at
// that first call, the failure is cached and every later call uses the local
// fallback slot, even if the DLL is loaded afterwards.
inline GpuFrameData*& tl_currentGpuFrame() {
    // Signature of the accessor exported by ANSODEngine.dll: it hands back the
    // address of the canonical thread-local slot.
    using TlsFn = GpuFrameData** (*)();

    // One-time resolution (function-local static initialiser), cached forever.
    static const TlsFn s_fn = []() -> TlsFn {
#ifdef _WIN32
        if (HMODULE h = GetModuleHandleA("ANSODEngine.dll")) {
            auto f = reinterpret_cast<TlsFn>(
                GetProcAddress(h, "ANSODEngine_GetTlsGpuFrame"));
            if (f) return f;
        }
#endif
        return nullptr;
    }();

    if (s_fn) {
        // Guard against the accessor handing back a null slot address:
        // dereferencing it to form the returned reference would be undefined
        // behaviour. In that case fall through to the local slot instead.
        if (GpuFrameData** slot = s_fn())
            return *slot;
    }

    // Fallback: local TLS for unit tests that don't load ANSODEngine.dll.
    thread_local GpuFrameData* local = nullptr;
    return local;
}

namespace ANSCENTER {

    // Forward declare SPDLogger (defined in ANSLicense.h, ANSCENTER namespace)
    class SPDLogger;

    // Engine-supplied callback that launches the CUDA kernel for one frame.
    // Ready-made launchers (e.g. the default YOLO one) are exposed as static
    // factory methods on NV12PreprocessHelper.
    using NV12KernelLauncher = std::function<void(
        const cv::cuda::GpuMat& gpuY,
        const cv::cuda::GpuMat& gpuUV,
        cv::cuda::GpuMat& gpuOut,
        int srcW,
        int srcH,
        int inputW,
        int inputH,
        cudaStream_t stream)>;

    // Outcome of NV12PreprocessHelper::tryNV12(): reports whether the NV12
    // fast path produced model input and, if not, which fallback data (if any)
    // the caller should use.
    struct NV12Result {
        // True when gpuRGB holds valid model input.
        bool succeeded{false};
        // Model-input-sized, letterboxed RGB image.
        cv::cuda::GpuMat gpuRGB;
        // Display-resolution width, used for coordinate mapping.
        float metaWidth{0.f};
        // Display-resolution height.
        float metaHeight{0.f};
        // Letterbox scale.
        float ratio{1.f};
        // True when the pixFmt=1000 (BGR full-res) path was taken.
        bool useBgrFullRes{false};
        // Only meaningful when useBgrFullRes is true.
        cv::Mat bgrFullResImg;
        // display/fullRes scale along X.
        float bgrFullResScaleX{1.f};
        // display/fullRes scale along Y.
        float bgrFullResScaleY{1.f};
    };

    // Shared NV12/CUDA preprocessing helper. Engines hold one instance as a
    // member; all non-inline members are defined out of line.
    class ANSENGINE_API NV12PreprocessHelper {
    public:
        NV12PreprocessHelper() = default;
        ~NV12PreprocessHelper();

        // Copying is disabled because the instance owns a pinned host buffer.
        NV12PreprocessHelper(const NV12PreprocessHelper&) = delete;
        NV12PreprocessHelper& operator=(const NV12PreprocessHelper&) = delete;

        // ---- Main entry point -------------------------------------------
        // Attempts the NV12 fast path for `inputImage`; the returned
        // NV12Result tells the caller what happened.
        NV12Result tryNV12(
            const cv::Mat& inputImage,
            int inferenceGpu,
            int inputW, int inputH,
            const NV12KernelLauncher& launcher,
            SPDLogger& logger,
            const char* tag);

        // Advances the inference counter; call once after every inference,
        // whether it used NV12 or BGR. The counter stops growing once it
        // reaches NV12_WARMUP_THRESHOLD + 1.
        void tickInference() {
            const bool belowCap = (m_inferenceCount <= NV12_WARMUP_THRESHOLD);
            if (belowCap) {
                m_inferenceCount += 1;
            }
        }

        // ---- CUDA context health ----------------------------------------
        bool isCudaContextHealthy(SPDLogger& logger, const char* tag);

        // ---- Kernel launcher factories ----------------------------------
        // Default launcher for YOLO-style models.
        static NV12KernelLauncher defaultYOLOLauncher();

        // Classification launcher: direct resize, no letterbox.
        static NV12KernelLauncher classificationLauncher();

        // SCRFD launcher: center-padded letterbox using (padLeft, padTop),
        // as opposed to YOLO's right-bottom padding.
        static NV12KernelLauncher scrfdCenterLetterboxLauncher(int padLeft, int padTop);

        // ---- NV12 -> BGR fused resize (OCR detection) -------------------
        // One kernel: NV12 Y+UV planes -> bilinear resize -> BGR output.
        // No intermediate allocations; the output is CV_8UC3.
        static void nv12ToBGRResize(
            const uint8_t* devY,  int yPitch,
            const uint8_t* devUV, int uvPitch,
            uint8_t* bgrOut,      int outPitch,
            int outW, int outH,
            int srcW, int srcH,
            cudaStream_t stream = nullptr);

        // ---- NV12 affine warp (face alignment) --------------------------
        // Reads the full-res NV12 frame, applies an inverse affine
        // transform and yields the aligned face in BGR (112x112). Used by
        // the SCRFD face pipeline.
        struct NV12AffineResult {
            bool succeeded{false};
            // Small face crop on the CPU (e.g. 112x112 BGR).
            cv::Mat alignedFaceBGR;
            // Same face kept on the GPU so the recognizer need not re-upload.
            cv::cuda::GpuMat gpuAlignedFace;
        };

        NV12AffineResult tryNV12AffineWarp(
            const cv::Mat& inputImage,
            int inferenceGpu,
            const cv::Mat& affineMatrix,
            int outW, int outH,
            float scaleX, float scaleY,
            SPDLogger& logger,
            const char* tag);

        // ---- NV12 rectangular crop -> BGR -------------------------------
        // Cuts a rectangle out of the full-res NV12 frame held in GPU VRAM
        // and returns it as a small BGR cv::Mat on the CPU. The LPR
        // pipeline uses this for high-res plate crops without converting
        // the whole 4K frame. `bbox` is in display-res coordinates;
        // scaleX/scaleY map it to the full-res NV12 frame.
        struct NV12CropResult {
            bool succeeded{false};
            // Cropped BGR on the CPU (full-res quality).
            cv::Mat bgrCrop;
        };

        NV12CropResult tryNV12CropToBGR(
            const cv::Mat& inputImage,
            int inferenceGpu,
            const cv::Rect& bbox,
            int padding,
            float scaleX, float scaleY,
            SPDLogger& logger,
            const char* tag);

        // ---- Direct-to-buffer CHW output (SAM3 image encoder) -----------
        // NV12 -> RGB plus resize written straight into a pre-allocated TRT
        // GPU buffer in CHW planar layout; no intermediate GpuMat.
        struct NV12DirectResult {
            bool succeeded{false};
            float metaWidth{0.f};
            float metaHeight{0.f};
        };

        // `isFloat32`: false selects uint8 CHW output, true float32 CHW.
        NV12DirectResult tryNV12DirectToBuffer(
            const cv::Mat& inputImage,
            int inferenceGpu,
            void* dstGpuBuffer,
            int inputW, int inputH,
            bool isFloat32,
            cudaStream_t stream,
            SPDLogger& logger,
            const char* tag);

        // ---- Cleanup ----------------------------------------------------
        void destroy();

    private:
        void ensurePinnedBuffer(size_t bytes, SPDLogger& logger, const char* tag);

        // Warmup gating state.
        int m_inferenceCount{0};
        static constexpr int NV12_WARMUP_THRESHOLD = 30;

        // Pinned (page-locked) host buffer used for fast H2D uploads.
        void*  m_pinnedBuf{nullptr};
        size_t m_pinnedBufSize{0};

        // CUDA context health circuit breaker state.
        int  m_cudaFailStreak{0};
        static constexpr int CUDA_FAIL_LIMIT = 10;
        bool m_cudaContextDead{false};

        // One-shot flags for diagnostic logging.
        bool m_gpuMismatchLogged{false};
        bool m_bgrFullResLogged{false};
        bool m_nv12NullLogged{false};
        bool m_nv12DimLogged{false};
        bool m_nv12DeadLogged{false};
        bool m_nv12PathLogged{false};
        bool m_nv12PinnedLogged{false};
        bool m_nv12ActiveLogged{false};
        bool m_nv12MetaLogged{false};
        bool m_nv12CropLogged{false};
    };

} // namespace ANSCENTER
#endif