#ifndef NV12_PREPROCESS_HELPER_H #define NV12_PREPROCESS_HELPER_H #pragma once // NV12PreprocessHelper — shared NV12/CUDA zero-copy preprocessing utility. // // Encapsulates all NV12 fast-path logic previously in ANSRTYOLO: // - Warmup gating (skip NV12 for first N inferences) // - Registry lookup + lock lifecycle // - GPU matching (decode GPU vs inference GPU, cross-GPU BGR fallback) // - Pinned buffer management (reusable cudaHostAlloc) // - CUDA zero-copy path (wrap NVDEC device ptrs as GpuMat) // - CPU NV12 upload path (SEH-protected memcpy + pinned upload) // - CUDA health circuit breaker // - BGR full-res path (pixFmt=1000) // - One-shot diagnostic logging flags // // Composition, not inheritance — engines add it as a private member. // CUDA coupling stays explicit and opt-in (ONNX/OpenVINO engines unaffected). #include #include #include #ifdef _WIN32 #define WIN32_LEAN_AND_MEAN #ifndef NOMINMAX #define NOMINMAX #endif #include // GetModuleHandleA, GetProcAddress for cross-DLL TLS #endif // ANSENGINE_API is defined by ANSEngineCommon.h (included before this header // via ANSRTYOLO.h / ANSYOLOV12RTOD.h / ANSYOLOV10RTOD.h). // Guard for standalone inclusion. #ifndef ANSENGINE_API #define ANSENGINE_API __declspec(dllexport) #endif // Forward declarations — avoid pulling in heavy headers struct GpuFrameData; // Thread-local: current GpuFrameData* for the active inference call. // Set by RunInferenceComplete_LV (dllmain.cpp) before calling engine->RunInference(). // Read by tryNV12() and direct engine lookups instead of registry lookup by datastart. // // CROSS-DLL SHARING: The canonical thread_local lives in ANSODEngine.dll, // exported via ANSODEngine_GetTlsGpuFrame(). All DLLs (ANSLPR, ANSFR, // ANSOCR) resolve to that SAME slot via GetProcAddress at first call. // This ensures that when ANSLPR sets the frame pointer, ANSODEngine's // tryNV12() sees it — enabling NV12 zero-copy for ALPR, FR, and OCR. 
// // Previous bug: the old `inline` + `thread_local` created separate TLS // instances per DLL on MSVC, silently disabling NV12 for all wrapper DLLs. inline GpuFrameData*& tl_currentGpuFrame() { // Resolve the canonical TLS slot exported by ANSODEngine.dll. // GetProcAddress is called once (static lambda), cached forever. using TlsFn = GpuFrameData** (*)(); static TlsFn s_fn = []() -> TlsFn { #ifdef _WIN32 HMODULE h = GetModuleHandleA("ANSODEngine.dll"); if (h) { auto f = reinterpret_cast( GetProcAddress(h, "ANSODEngine_GetTlsGpuFrame")); if (f) return f; } #endif return nullptr; }(); if (s_fn) return *s_fn(); // Fallback: local TLS for unit tests that don't load ANSODEngine.dll. thread_local GpuFrameData* local = nullptr; return local; } namespace ANSCENTER { // Forward declare SPDLogger (defined in ANSLicense.h, ANSCENTER namespace) class SPDLogger; // Callback type: engine-specific CUDA kernel launcher. // Default YOLO launcher provided as static method. using NV12KernelLauncher = std::function; // Result of tryNV12() — tells the caller what happened and what to do. 
struct NV12Result {
    bool succeeded = false;          // true = gpuRGB is valid model input
    cv::cuda::GpuMat gpuRGB;         // model-input-sized, letterboxed RGB
    float metaWidth = 0.f;           // display-res width (for coord mapping)
    float metaHeight = 0.f;          // display-res height
    float ratio = 1.f;               // letterbox scale
    bool useBgrFullRes = false;      // true = pixFmt=1000 path
    cv::Mat bgrFullResImg;           // valid only when useBgrFullRes
    float bgrFullResScaleX = 1.f;    // display/fullRes X scale
    float bgrFullResScaleY = 1.f;    // display/fullRes Y scale
};

// Shared NV12/CUDA preprocessing helper (declaration only; see the .cpp/.cu
// implementation for the actual fast-path logic described at the top of this
// header). Owns a reusable pinned host buffer, hence non-copyable.
class ANSENGINE_API NV12PreprocessHelper {
public:
    NV12PreprocessHelper() = default;
    ~NV12PreprocessHelper();

    // Non-copyable (pinned buffer ownership)
    NV12PreprocessHelper(const NV12PreprocessHelper&) = delete;
    NV12PreprocessHelper& operator=(const NV12PreprocessHelper&) = delete;

    // ── Main entry point ─────────────────────────────────────────────
    // Attempts the NV12 fast path for the current frame; on failure the
    // caller falls back to its normal BGR preprocessing. `tag` labels log
    // lines with the calling engine's name.
    NV12Result tryNV12(const cv::Mat& inputImage,
                       int inferenceGpu,
                       int inputW, int inputH,
                       const NV12KernelLauncher& launcher,
                       SPDLogger& logger,
                       const char* tag);

    // Increment inference counter. Call after each inference (NV12 or BGR).
    // The guard caps the counter at NV12_WARMUP_THRESHOLD + 1, so it can
    // never overflow on long-running processes.
    void tickInference() {
        if (m_inferenceCount < NV12_WARMUP_THRESHOLD + 1) ++m_inferenceCount;
    }

    // ── CUDA context health ──────────────────────────────────────────
    bool isCudaContextHealthy(SPDLogger& logger, const char* tag);

    // ── Default YOLO kernel launcher ─────────────────────────────────
    static NV12KernelLauncher defaultYOLOLauncher();

    // ── Classification launcher (direct resize, no letterbox) ────────
    static NV12KernelLauncher classificationLauncher();

    // ── SCRFD center-padded letterbox launcher ───────────────────────
    // SCRFD uses center-padding (dw, dh) unlike YOLO's right-bottom padding.
    static NV12KernelLauncher scrfdCenterLetterboxLauncher(int padLeft, int padTop);

    // ── NV12→BGR fused resize (for OCR detection) ────────────────────
    // Single kernel: NV12 Y+UV → bilinear resize → BGR GpuMat output.
    // No intermediate allocations. Output is CV_8UC3.
    // stream = nullptr selects the default CUDA stream.
    static void nv12ToBGRResize(
        const uint8_t* devY, int yPitch,
        const uint8_t* devUV, int uvPitch,
        uint8_t* bgrOut, int outPitch,
        int outW, int outH,
        int srcW, int srcH,
        cudaStream_t stream = nullptr);

    // ── NV12 affine warp for face alignment ─────────────────────────
    // Reads full-res NV12, applies inverse affine transform, outputs
    // aligned face in BGR (112x112). Used by SCRFD face pipeline.
    struct NV12AffineResult {
        bool succeeded = false;
        cv::Mat alignedFaceBGR;          // small face crop on CPU (e.g. 112x112 BGR)
        cv::cuda::GpuMat gpuAlignedFace; // same face on GPU (avoids re-upload in recognizer)
    };
    NV12AffineResult tryNV12AffineWarp(
        const cv::Mat& inputImage,
        int inferenceGpu,
        const cv::Mat& affineMatrix,
        int outW, int outH,
        float scaleX, float scaleY,
        SPDLogger& logger,
        const char* tag);

    // ── NV12 rectangular crop → BGR ──────────────────────────────────
    // Crops a rectangular region from the full-res NV12 frame in GPU VRAM
    // and returns a small BGR cv::Mat on CPU. Used by LPR pipeline to get
    // high-res plate crops without converting the entire 4K frame.
    // bbox is in display-res coords; scaleX/scaleY map to full-res NV12.
    struct NV12CropResult {
        bool succeeded = false;
        cv::Mat bgrCrop; // cropped BGR on CPU (full-res quality)
    };
    NV12CropResult tryNV12CropToBGR(
        const cv::Mat& inputImage,
        int inferenceGpu,
        const cv::Rect& bbox,
        int padding,
        float scaleX, float scaleY,
        SPDLogger& logger,
        const char* tag);

    // ── Direct-to-buffer CHW output (SAM3 image encoder) ─────────────
    // Writes NV12→RGB + resize directly into a pre-allocated TRT GPU
    // buffer in CHW planar format. No intermediate GpuMat allocation.
    struct NV12DirectResult {
        bool succeeded = false;
        float metaWidth = 0.f;
        float metaHeight = 0.f;
    };
    NV12DirectResult tryNV12DirectToBuffer(
        const cv::Mat& inputImage,
        int inferenceGpu,
        void* dstGpuBuffer,
        int inputW, int inputH,
        bool isFloat32,      // false=uint8 CHW, true=float32 CHW
        cudaStream_t stream,
        SPDLogger& logger,
        const char* tag);

    // ── Cleanup ──────────────────────────────────────────────────────
    void destroy();

private:
    // Grows (reallocates) the pinned host buffer if smaller than `bytes`.
    // NOTE(review): behavior inferred from the name and the members below —
    // confirm against the implementation.
    void ensurePinnedBuffer(size_t bytes, SPDLogger& logger, const char* tag);

    // Warmup gating
    int m_inferenceCount = 0;
    static constexpr int NV12_WARMUP_THRESHOLD = 30;

    // Pinned (page-locked) host buffer for fast H2D uploads
    void* m_pinnedBuf = nullptr;
    size_t m_pinnedBufSize = 0;

    // CUDA context health circuit breaker
    int m_cudaFailStreak = 0;
    static constexpr int CUDA_FAIL_LIMIT = 10;
    bool m_cudaContextDead = false;

    // One-shot diagnostic logging flags
    bool m_gpuMismatchLogged = false;
    bool m_bgrFullResLogged = false;
    bool m_nv12NullLogged = false;
    bool m_nv12DimLogged = false;
    bool m_nv12DeadLogged = false;
    bool m_nv12PathLogged = false;
    bool m_nv12PinnedLogged = false;
    bool m_nv12ActiveLogged = false;
    bool m_nv12MetaLogged = false;
    bool m_nv12CropLogged = false;
};

} // namespace ANSCENTER

#endif