227 lines
9.9 KiB
C++
227 lines
9.9 KiB
C++
#ifndef NV12_PREPROCESS_HELPER_H
|
|
#define NV12_PREPROCESS_HELPER_H
|
|
#pragma once
|
|
|
|
// NV12PreprocessHelper — shared NV12/CUDA zero-copy preprocessing utility.
//
// Encapsulates all NV12 fast-path logic previously in ANSRTYOLO:
//   - Warmup gating (skip NV12 for first N inferences)
//   - Registry lookup + lock lifecycle
//   - GPU matching (decode GPU vs inference GPU, cross-GPU BGR fallback)
//   - Pinned buffer management (reusable cudaHostAlloc)
//   - CUDA zero-copy path (wrap NVDEC device ptrs as GpuMat)
//   - CPU NV12 upload path (SEH-protected memcpy + pinned upload)
//   - CUDA health circuit breaker
//   - BGR full-res path (pixFmt=1000)
//   - One-shot diagnostic logging flags
//
// Composition, not inheritance — engines add it as a private member.
// CUDA coupling stays explicit and opt-in (ONNX/OpenVINO engines unaffected).
|
|
|
|
#include <functional>
|
|
#include <cuda_runtime.h>
|
|
#include <opencv2/core/cuda.hpp>
|
|
#ifdef _WIN32
|
|
#define WIN32_LEAN_AND_MEAN
|
|
#ifndef NOMINMAX
|
|
#define NOMINMAX
|
|
#endif
|
|
#include <windows.h> // GetModuleHandleA, GetProcAddress for cross-DLL TLS
|
|
#endif
|
|
|
|
// ANSENGINE_API is normally defined by ANSEngineCommon.h, which is included
// before this header via ANSRTYOLO.h / ANSYOLOV12RTOD.h / ANSYOLOV10RTOD.h.
// The fallback below only takes effect when this header is included standalone.
//
// The fallback is selected per platform: `__declspec(dllexport)` is only
// understood by Windows toolchains, so other compilers get the GCC/Clang
// default-visibility attribute, and anything else gets an empty macro.
// On Windows the expansion is the same as before.
#ifndef ANSENGINE_API
#  if defined(_WIN32)
#    define ANSENGINE_API __declspec(dllexport)
#  elif defined(__GNUC__) || defined(__clang__)
#    define ANSENGINE_API __attribute__((visibility("default")))
#  else
#    define ANSENGINE_API
#  endif
#endif
|
|
|
|
// Forward declarations — avoid pulling in heavy headers
|
|
struct GpuFrameData;
|
|
|
|
// Thread-local: current GpuFrameData* for the active inference call.
|
|
// Set by RunInferenceComplete_LV (dllmain.cpp) before calling engine->RunInference().
|
|
// Read by tryNV12() and direct engine lookups instead of registry lookup by datastart.
|
|
//
|
|
// CROSS-DLL SHARING: The canonical thread_local lives in ANSODEngine.dll,
|
|
// exported via ANSODEngine_GetTlsGpuFrame(). All DLLs (ANSLPR, ANSFR,
|
|
// ANSOCR) resolve to that SAME slot via GetProcAddress at first call.
|
|
// This ensures that when ANSLPR sets the frame pointer, ANSODEngine's
|
|
// tryNV12() sees it — enabling NV12 zero-copy for ALPR, FR, and OCR.
|
|
//
|
|
// Previous bug: the old `inline` + `thread_local` created separate TLS
|
|
// instances per DLL on MSVC, silently disabling NV12 for all wrapper DLLs.
|
|
inline GpuFrameData*& tl_currentGpuFrame() {
|
|
// Resolve the canonical TLS slot exported by ANSODEngine.dll.
|
|
// GetProcAddress is called once (static lambda), cached forever.
|
|
using TlsFn = GpuFrameData** (*)();
|
|
static TlsFn s_fn = []() -> TlsFn {
|
|
#ifdef _WIN32
|
|
HMODULE h = GetModuleHandleA("ANSODEngine.dll");
|
|
if (h) {
|
|
auto f = reinterpret_cast<TlsFn>(
|
|
GetProcAddress(h, "ANSODEngine_GetTlsGpuFrame"));
|
|
if (f) return f;
|
|
}
|
|
#endif
|
|
return nullptr;
|
|
}();
|
|
|
|
if (s_fn) return *s_fn();
|
|
|
|
// Fallback: local TLS for unit tests that don't load ANSODEngine.dll.
|
|
thread_local GpuFrameData* local = nullptr;
|
|
return local;
|
|
}
|
|
|
|
namespace ANSCENTER {
|
|
|
|
// Forward declare SPDLogger (defined in ANSLicense.h, ANSCENTER namespace)
|
|
class SPDLogger;
|
|
|
|
// Callback type: engine-specific CUDA kernel launcher.
|
|
// Default YOLO launcher provided as static method.
|
|
using NV12KernelLauncher = std::function<void(
|
|
const cv::cuda::GpuMat& gpuY, const cv::cuda::GpuMat& gpuUV,
|
|
cv::cuda::GpuMat& gpuOut, int srcW, int srcH,
|
|
int inputW, int inputH, cudaStream_t stream)>;
|
|
|
|
// Result of tryNV12() — tells the caller what happened and what to do.
|
|
struct NV12Result {
|
|
bool succeeded = false; // true = gpuRGB is valid model input
|
|
cv::cuda::GpuMat gpuRGB; // model-input-sized, letterboxed RGB
|
|
float metaWidth = 0.f; // display-res width (for coord mapping)
|
|
float metaHeight = 0.f; // display-res height
|
|
float ratio = 1.f; // letterbox scale
|
|
bool useBgrFullRes = false; // true = pixFmt=1000 path
|
|
cv::Mat bgrFullResImg; // valid only when useBgrFullRes
|
|
float bgrFullResScaleX = 1.f; // display/fullRes X scale
|
|
float bgrFullResScaleY = 1.f; // display/fullRes Y scale
|
|
};
|
|
|
|
class ANSENGINE_API NV12PreprocessHelper {
|
|
public:
|
|
NV12PreprocessHelper() = default;
|
|
~NV12PreprocessHelper();
|
|
|
|
// Non-copyable (pinned buffer ownership)
|
|
NV12PreprocessHelper(const NV12PreprocessHelper&) = delete;
|
|
NV12PreprocessHelper& operator=(const NV12PreprocessHelper&) = delete;
|
|
|
|
// ── Main entry point ─────────────────────────────────────────────
|
|
NV12Result tryNV12(const cv::Mat& inputImage, int inferenceGpu,
|
|
int inputW, int inputH,
|
|
const NV12KernelLauncher& launcher,
|
|
SPDLogger& logger, const char* tag);
|
|
|
|
// Increment inference counter. Call after each inference (NV12 or BGR).
|
|
void tickInference() {
|
|
if (m_inferenceCount < NV12_WARMUP_THRESHOLD + 1)
|
|
++m_inferenceCount;
|
|
}
|
|
|
|
// ── CUDA context health ──────────────────────────────────────────
|
|
bool isCudaContextHealthy(SPDLogger& logger, const char* tag);
|
|
|
|
// ── Default YOLO kernel launcher ─────────────────────────────────
|
|
static NV12KernelLauncher defaultYOLOLauncher();
|
|
|
|
// ── Classification launcher (direct resize, no letterbox) ────────
|
|
static NV12KernelLauncher classificationLauncher();
|
|
|
|
// ── SCRFD center-padded letterbox launcher ───────────────────────
|
|
// SCRFD uses center-padding (dw, dh) unlike YOLO's right-bottom padding.
|
|
static NV12KernelLauncher scrfdCenterLetterboxLauncher(int padLeft, int padTop);
|
|
|
|
// ── NV12→BGR fused resize (for OCR detection) ────────────────────
|
|
// Single kernel: NV12 Y+UV → bilinear resize → BGR GpuMat output.
|
|
// No intermediate allocations. Output is CV_8UC3.
|
|
static void nv12ToBGRResize(
|
|
const uint8_t* devY, int yPitch,
|
|
const uint8_t* devUV, int uvPitch,
|
|
uint8_t* bgrOut, int outPitch,
|
|
int outW, int outH, int srcW, int srcH,
|
|
cudaStream_t stream = nullptr);
|
|
|
|
// ── NV12 affine warp for face alignment ─────────────────────────
|
|
// Reads full-res NV12, applies inverse affine transform, outputs
|
|
// aligned face in BGR (112x112). Used by SCRFD face pipeline.
|
|
struct NV12AffineResult {
|
|
bool succeeded = false;
|
|
cv::Mat alignedFaceBGR; // small face crop on CPU (e.g. 112x112 BGR)
|
|
cv::cuda::GpuMat gpuAlignedFace; // same face on GPU (avoids re-upload in recognizer)
|
|
};
|
|
|
|
NV12AffineResult tryNV12AffineWarp(
|
|
const cv::Mat& inputImage, int inferenceGpu,
|
|
const cv::Mat& affineMatrix, int outW, int outH,
|
|
float scaleX, float scaleY,
|
|
SPDLogger& logger, const char* tag);
|
|
|
|
// ── NV12 rectangular crop → BGR ──────────────────────────────────
|
|
// Crops a rectangular region from the full-res NV12 frame in GPU VRAM
|
|
// and returns a small BGR cv::Mat on CPU. Used by LPR pipeline to get
|
|
// high-res plate crops without converting the entire 4K frame.
|
|
// bbox is in display-res coords; scaleX/scaleY map to full-res NV12.
|
|
struct NV12CropResult {
|
|
bool succeeded = false;
|
|
cv::Mat bgrCrop; // cropped BGR on CPU (full-res quality)
|
|
};
|
|
|
|
NV12CropResult tryNV12CropToBGR(
|
|
const cv::Mat& inputImage, int inferenceGpu,
|
|
const cv::Rect& bbox, int padding,
|
|
float scaleX, float scaleY,
|
|
SPDLogger& logger, const char* tag);
|
|
|
|
// ── Direct-to-buffer CHW output (SAM3 image encoder) ─────────────
|
|
// Writes NV12→RGB + resize directly into a pre-allocated TRT GPU
|
|
// buffer in CHW planar format. No intermediate GpuMat allocation.
|
|
struct NV12DirectResult {
|
|
bool succeeded = false;
|
|
float metaWidth = 0.f;
|
|
float metaHeight = 0.f;
|
|
};
|
|
|
|
NV12DirectResult tryNV12DirectToBuffer(
|
|
const cv::Mat& inputImage, int inferenceGpu,
|
|
void* dstGpuBuffer, int inputW, int inputH,
|
|
bool isFloat32, // false=uint8 CHW, true=float32 CHW
|
|
cudaStream_t stream,
|
|
SPDLogger& logger, const char* tag);
|
|
|
|
// ── Cleanup ──────────────────────────────────────────────────────
|
|
void destroy();
|
|
|
|
private:
|
|
void ensurePinnedBuffer(size_t bytes, SPDLogger& logger, const char* tag);
|
|
|
|
// Warmup gating
|
|
int m_inferenceCount = 0;
|
|
static constexpr int NV12_WARMUP_THRESHOLD = 30;
|
|
|
|
// Pinned (page-locked) host buffer for fast H2D uploads
|
|
void* m_pinnedBuf = nullptr;
|
|
size_t m_pinnedBufSize = 0;
|
|
|
|
// CUDA context health circuit breaker
|
|
int m_cudaFailStreak = 0;
|
|
static constexpr int CUDA_FAIL_LIMIT = 10;
|
|
bool m_cudaContextDead = false;
|
|
|
|
// One-shot diagnostic logging flags
|
|
bool m_gpuMismatchLogged = false;
|
|
bool m_bgrFullResLogged = false;
|
|
bool m_nv12NullLogged = false;
|
|
bool m_nv12DimLogged = false;
|
|
bool m_nv12DeadLogged = false;
|
|
bool m_nv12PathLogged = false;
|
|
bool m_nv12PinnedLogged = false;
|
|
bool m_nv12ActiveLogged = false;
|
|
bool m_nv12MetaLogged = false;
|
|
bool m_nv12CropLogged = false;
|
|
};
|
|
|
|
} // namespace ANSCENTER
|
|
#endif
|