Refactor project structure
This commit is contained in:
226
modules/ANSODEngine/NV12PreprocessHelper.h
Normal file
226
modules/ANSODEngine/NV12PreprocessHelper.h
Normal file
@@ -0,0 +1,226 @@
|
||||
#ifndef NV12_PREPROCESS_HELPER_H
|
||||
#define NV12_PREPROCESS_HELPER_H
|
||||
#pragma once
|
||||
|
||||
// NV12PreprocessHelper — shared NV12/CUDA zero-copy preprocessing utility.
|
||||
//
|
||||
// Encapsulates all NV12 fast-path logic previously in ANSRTYOLO:
|
||||
// - Warmup gating (skip NV12 for first N inferences)
|
||||
// - Registry lookup + lock lifecycle
|
||||
// - GPU matching (decode GPU vs inference GPU, cross-GPU BGR fallback)
|
||||
// - Pinned buffer management (reusable cudaHostAlloc)
|
||||
// - CUDA zero-copy path (wrap NVDEC device ptrs as GpuMat)
|
||||
// - CPU NV12 upload path (SEH-protected memcpy + pinned upload)
|
||||
// - CUDA health circuit breaker
|
||||
// - BGR full-res path (pixFmt=1000)
|
||||
// - One-shot diagnostic logging flags
|
||||
//
|
||||
// Composition, not inheritance — engines add it as a private member.
|
||||
// CUDA coupling stays explicit and opt-in (ONNX/OpenVINO engines unaffected).
|
||||
|
||||
#include <cstddef>      // size_t
#include <cstdint>      // uint8_t
#include <functional>

#include <cuda_runtime.h>
#include <opencv2/core/cuda.hpp>

#ifdef _WIN32
#ifndef WIN32_LEAN_AND_MEAN
#define WIN32_LEAN_AND_MEAN
#endif
#ifndef NOMINMAX
#define NOMINMAX
#endif
#include <windows.h>   // GetModuleHandleA, GetProcAddress for cross-DLL TLS
#endif
|
||||
|
||||
// ANSENGINE_API is defined by ANSEngineCommon.h (included before this header
// via ANSRTYOLO.h / ANSYOLOV12RTOD.h / ANSYOLOV10RTOD.h).
//
// Fallback for standalone inclusion. The export decoration is chosen per
// toolchain so the header also compiles where __declspec does not exist:
//   - Windows / Cygwin : __declspec(dllexport)   (same as the previous fallback)
//   - GCC / Clang      : default symbol visibility
//   - anything else    : expands to nothing
// NOTE(review): this fallback always *exports*; a consumer that includes this
// header standalone would normally want dllimport — confirm that
// ANSEngineCommon.h is what consumers rely on for that.
#ifndef ANSENGINE_API
#  if defined(_WIN32) || defined(__CYGWIN__)
#    define ANSENGINE_API __declspec(dllexport)
#  elif defined(__GNUC__) || defined(__clang__)
#    define ANSENGINE_API __attribute__((visibility("default")))
#  else
#    define ANSENGINE_API
#  endif
#endif
|
||||
|
||||
// Forward declaration only — keeps heavy headers out of this file.
struct GpuFrameData;

// Per-thread "current frame" slot: the GpuFrameData* belonging to the
// inference call that is active on this thread.
//
// Writer: RunInferenceComplete_LV (dllmain.cpp), right before it calls
//         engine->RunInference().
// Reader: tryNV12() and direct engine lookups, which use this pointer instead
//         of a registry lookup keyed by datastart.
//
// CROSS-DLL SHARING
// The one canonical thread_local lives inside ANSODEngine.dll and is exposed
// through the exported function ANSODEngine_GetTlsGpuFrame(). Every other DLL
// (ANSLPR, ANSFR, ANSOCR) binds to that SAME slot by resolving the export with
// GetProcAddress on first use. As a result, a frame pointer stored by ANSLPR
// is visible to ANSODEngine's tryNV12(), which is what enables NV12 zero-copy
// for ALPR, FR and OCR.
//
// History: an earlier version relied on `inline` + `thread_local` directly.
// On MSVC that produced one TLS instance per DLL, which silently disabled
// NV12 for every wrapper DLL.
//
// Returns a reference to the slot so callers can both read and assign it.
inline GpuFrameData*& tl_currentGpuFrame() {
    using SlotAccessor = GpuFrameData** (*)();

    // Function-local static: the lookup below runs once and its result
    // (including a null result) is cached for all later calls.
    static const SlotAccessor accessor = []() -> SlotAccessor {
        SlotAccessor resolved = nullptr;
#ifdef _WIN32
        // Only binds if ANSODEngine.dll is already loaded in this process.
        if (HMODULE engine = GetModuleHandleA("ANSODEngine.dll")) {
            resolved = reinterpret_cast<SlotAccessor>(
                GetProcAddress(engine, "ANSODEngine_GetTlsGpuFrame"));
        }
#endif
        return resolved;
    }();

    if (accessor == nullptr) {
        // No exported slot available (e.g. unit tests that never load
        // ANSODEngine.dll): fall back to a TLS variable local to this module.
        thread_local GpuFrameData* fallbackSlot = nullptr;
        return fallbackSlot;
    }

    return *accessor();
}
|
||||
|
||||
namespace ANSCENTER {
|
||||
|
||||
// Forward declare SPDLogger (defined in ANSLicense.h, ANSCENTER namespace)
|
||||
class SPDLogger;
|
||||
|
||||
// Callback type: engine-specific CUDA kernel launcher.
|
||||
// Default YOLO launcher provided as static method.
|
||||
using NV12KernelLauncher = std::function<void(
|
||||
const cv::cuda::GpuMat& gpuY, const cv::cuda::GpuMat& gpuUV,
|
||||
cv::cuda::GpuMat& gpuOut, int srcW, int srcH,
|
||||
int inputW, int inputH, cudaStream_t stream)>;
|
||||
|
||||
// Result of tryNV12() — tells the caller what happened and what to do.
|
||||
struct NV12Result {
|
||||
bool succeeded = false; // true = gpuRGB is valid model input
|
||||
cv::cuda::GpuMat gpuRGB; // model-input-sized, letterboxed RGB
|
||||
float metaWidth = 0.f; // display-res width (for coord mapping)
|
||||
float metaHeight = 0.f; // display-res height
|
||||
float ratio = 1.f; // letterbox scale
|
||||
bool useBgrFullRes = false; // true = pixFmt=1000 path
|
||||
cv::Mat bgrFullResImg; // valid only when useBgrFullRes
|
||||
float bgrFullResScaleX = 1.f; // display/fullRes X scale
|
||||
float bgrFullResScaleY = 1.f; // display/fullRes Y scale
|
||||
};
|
||||
|
||||
class ANSENGINE_API NV12PreprocessHelper {
|
||||
public:
|
||||
NV12PreprocessHelper() = default;
|
||||
~NV12PreprocessHelper();
|
||||
|
||||
// Non-copyable (pinned buffer ownership)
|
||||
NV12PreprocessHelper(const NV12PreprocessHelper&) = delete;
|
||||
NV12PreprocessHelper& operator=(const NV12PreprocessHelper&) = delete;
|
||||
|
||||
// ── Main entry point ─────────────────────────────────────────────
|
||||
NV12Result tryNV12(const cv::Mat& inputImage, int inferenceGpu,
|
||||
int inputW, int inputH,
|
||||
const NV12KernelLauncher& launcher,
|
||||
SPDLogger& logger, const char* tag);
|
||||
|
||||
// Increment inference counter. Call after each inference (NV12 or BGR).
|
||||
void tickInference() {
|
||||
if (m_inferenceCount < NV12_WARMUP_THRESHOLD + 1)
|
||||
++m_inferenceCount;
|
||||
}
|
||||
|
||||
// ── CUDA context health ──────────────────────────────────────────
|
||||
bool isCudaContextHealthy(SPDLogger& logger, const char* tag);
|
||||
|
||||
// ── Default YOLO kernel launcher ─────────────────────────────────
|
||||
static NV12KernelLauncher defaultYOLOLauncher();
|
||||
|
||||
// ── Classification launcher (direct resize, no letterbox) ────────
|
||||
static NV12KernelLauncher classificationLauncher();
|
||||
|
||||
// ── SCRFD center-padded letterbox launcher ───────────────────────
|
||||
// SCRFD uses center-padding (dw, dh) unlike YOLO's right-bottom padding.
|
||||
static NV12KernelLauncher scrfdCenterLetterboxLauncher(int padLeft, int padTop);
|
||||
|
||||
// ── NV12→BGR fused resize (for OCR detection) ────────────────────
|
||||
// Single kernel: NV12 Y+UV → bilinear resize → BGR GpuMat output.
|
||||
// No intermediate allocations. Output is CV_8UC3.
|
||||
static void nv12ToBGRResize(
|
||||
const uint8_t* devY, int yPitch,
|
||||
const uint8_t* devUV, int uvPitch,
|
||||
uint8_t* bgrOut, int outPitch,
|
||||
int outW, int outH, int srcW, int srcH,
|
||||
cudaStream_t stream = nullptr);
|
||||
|
||||
// ── NV12 affine warp for face alignment ─────────────────────────
|
||||
// Reads full-res NV12, applies inverse affine transform, outputs
|
||||
// aligned face in BGR (112x112). Used by SCRFD face pipeline.
|
||||
struct NV12AffineResult {
|
||||
bool succeeded = false;
|
||||
cv::Mat alignedFaceBGR; // small face crop on CPU (e.g. 112x112 BGR)
|
||||
cv::cuda::GpuMat gpuAlignedFace; // same face on GPU (avoids re-upload in recognizer)
|
||||
};
|
||||
|
||||
NV12AffineResult tryNV12AffineWarp(
|
||||
const cv::Mat& inputImage, int inferenceGpu,
|
||||
const cv::Mat& affineMatrix, int outW, int outH,
|
||||
float scaleX, float scaleY,
|
||||
SPDLogger& logger, const char* tag);
|
||||
|
||||
// ── NV12 rectangular crop → BGR ──────────────────────────────────
|
||||
// Crops a rectangular region from the full-res NV12 frame in GPU VRAM
|
||||
// and returns a small BGR cv::Mat on CPU. Used by LPR pipeline to get
|
||||
// high-res plate crops without converting the entire 4K frame.
|
||||
// bbox is in display-res coords; scaleX/scaleY map to full-res NV12.
|
||||
struct NV12CropResult {
|
||||
bool succeeded = false;
|
||||
cv::Mat bgrCrop; // cropped BGR on CPU (full-res quality)
|
||||
};
|
||||
|
||||
NV12CropResult tryNV12CropToBGR(
|
||||
const cv::Mat& inputImage, int inferenceGpu,
|
||||
const cv::Rect& bbox, int padding,
|
||||
float scaleX, float scaleY,
|
||||
SPDLogger& logger, const char* tag);
|
||||
|
||||
// ── Direct-to-buffer CHW output (SAM3 image encoder) ─────────────
|
||||
// Writes NV12→RGB + resize directly into a pre-allocated TRT GPU
|
||||
// buffer in CHW planar format. No intermediate GpuMat allocation.
|
||||
struct NV12DirectResult {
|
||||
bool succeeded = false;
|
||||
float metaWidth = 0.f;
|
||||
float metaHeight = 0.f;
|
||||
};
|
||||
|
||||
NV12DirectResult tryNV12DirectToBuffer(
|
||||
const cv::Mat& inputImage, int inferenceGpu,
|
||||
void* dstGpuBuffer, int inputW, int inputH,
|
||||
bool isFloat32, // false=uint8 CHW, true=float32 CHW
|
||||
cudaStream_t stream,
|
||||
SPDLogger& logger, const char* tag);
|
||||
|
||||
// ── Cleanup ──────────────────────────────────────────────────────
|
||||
void destroy();
|
||||
|
||||
private:
|
||||
void ensurePinnedBuffer(size_t bytes, SPDLogger& logger, const char* tag);
|
||||
|
||||
// Warmup gating
|
||||
int m_inferenceCount = 0;
|
||||
static constexpr int NV12_WARMUP_THRESHOLD = 30;
|
||||
|
||||
// Pinned (page-locked) host buffer for fast H2D uploads
|
||||
void* m_pinnedBuf = nullptr;
|
||||
size_t m_pinnedBufSize = 0;
|
||||
|
||||
// CUDA context health circuit breaker
|
||||
int m_cudaFailStreak = 0;
|
||||
static constexpr int CUDA_FAIL_LIMIT = 10;
|
||||
bool m_cudaContextDead = false;
|
||||
|
||||
// One-shot diagnostic logging flags
|
||||
bool m_gpuMismatchLogged = false;
|
||||
bool m_bgrFullResLogged = false;
|
||||
bool m_nv12NullLogged = false;
|
||||
bool m_nv12DimLogged = false;
|
||||
bool m_nv12DeadLogged = false;
|
||||
bool m_nv12PathLogged = false;
|
||||
bool m_nv12PinnedLogged = false;
|
||||
bool m_nv12ActiveLogged = false;
|
||||
bool m_nv12MetaLogged = false;
|
||||
bool m_nv12CropLogged = false;
|
||||
};
|
||||
|
||||
} // namespace ANSCENTER
|
||||
#endif
|
||||
Reference in New Issue
Block a user