Refactor project structure

This commit is contained in:
2026-03-28 19:56:39 +11:00
parent 1d267378b2
commit 8a2e721058
511 changed files with 59 additions and 48 deletions

View File

@@ -0,0 +1,226 @@
#ifndef NV12_PREPROCESS_HELPER_H
#define NV12_PREPROCESS_HELPER_H
#pragma once
// NV12PreprocessHelper — shared NV12/CUDA zero-copy preprocessing utility.
//
// Encapsulates all NV12 fast-path logic previously in ANSRTYOLO:
// - Warmup gating (skip NV12 for first N inferences)
// - Registry lookup + lock lifecycle
// - GPU matching (decode GPU vs inference GPU, cross-GPU BGR fallback)
// - Pinned buffer management (reusable cudaHostAlloc)
// - CUDA zero-copy path (wrap NVDEC device ptrs as GpuMat)
// - CPU NV12 upload path (SEH-protected memcpy + pinned upload)
// - CUDA health circuit breaker
// - BGR full-res path (pixFmt=1000)
// - One-shot diagnostic logging flags
//
// Composition, not inheritance — engines add it as a private member.
// CUDA coupling stays explicit and opt-in (ONNX/OpenVINO engines unaffected).
#include <cstdint>     // uint8_t (used by nv12ToBGRResize)
#include <functional>
#include <cuda_runtime.h>
#include <opencv2/core/cuda.hpp>
#ifdef _WIN32
#ifndef WIN32_LEAN_AND_MEAN
#define WIN32_LEAN_AND_MEAN
#endif
#ifndef NOMINMAX
#define NOMINMAX
#endif
#include <windows.h> // GetModuleHandleA, GetProcAddress for cross-DLL TLS
#endif
// ANSENGINE_API is normally defined by ANSEngineCommon.h (included before this
// header via ANSRTYOLO.h / ANSYOLOV12RTOD.h / ANSYOLOV10RTOD.h).
// The fallback below only applies to standalone inclusion.
//
// __declspec(dllexport) is a Windows-only spelling; the rest of this header is
// written to compile without _WIN32 (see the TLS fallback in
// tl_currentGpuFrame), so the fallback must not emit it unconditionally.
#ifndef ANSENGINE_API
#if defined(_WIN32)
#define ANSENGINE_API __declspec(dllexport)
#elif defined(__GNUC__) || defined(__clang__)
#define ANSENGINE_API __attribute__((visibility("default")))
#else
#define ANSENGINE_API
#endif
#endif
// Forward declaration only — keeps the heavy frame headers out of this file.
struct GpuFrameData;

// tl_currentGpuFrame() — per-thread "current frame" slot for the inference
// call in flight.
//
// Writer: RunInferenceComplete_LV (dllmain.cpp), just before it calls
//         engine->RunInference().
// Reader: tryNV12() and direct engine lookups, which use this slot instead of
//         a registry lookup keyed by datastart.
//
// Sharing across DLLs: the one authoritative thread_local is owned by
// ANSODEngine.dll and reached through its exported accessor
// ANSODEngine_GetTlsGpuFrame(). Every DLL that includes this header (ANSLPR,
// ANSFR, ANSOCR) looks that accessor up with GetProcAddress on its first call
// and therefore lands on the SAME slot. A frame pointer stored by ANSLPR is
// thus visible to ANSODEngine's tryNV12(), which is what enables the NV12
// zero-copy path for ALPR, FR and OCR.
//
// History: an earlier `inline` + `thread_local` implementation produced one
// TLS instance per DLL under MSVC, which silently turned NV12 off for every
// wrapper DLL.
//
// Returns a mutable reference to the calling thread's slot.
inline GpuFrameData*& tl_currentGpuFrame() {
    using SlotAccessor = GpuFrameData** (*)();

    // Function-local static: the lookup runs exactly once and its outcome
    // (including "not found") is cached for the lifetime of the process.
    static const SlotAccessor accessor = []() -> SlotAccessor {
        SlotAccessor resolved = nullptr;
#ifdef _WIN32
        if (HMODULE engine = GetModuleHandleA("ANSODEngine.dll")) {
            resolved = reinterpret_cast<SlotAccessor>(
                GetProcAddress(engine, "ANSODEngine_GetTlsGpuFrame"));
        }
#endif
        return resolved;
    }();

    if (!accessor) {
        // ANSODEngine.dll is not available (e.g. unit tests that never load
        // it): fall back to a thread_local owned by this translation unit.
        thread_local GpuFrameData* fallbackSlot = nullptr;
        return fallbackSlot;
    }
    return *accessor();
}
namespace ANSCENTER {
// Forward declare SPDLogger (defined in ANSLicense.h, ANSCENTER namespace)
class SPDLogger;
// NV12KernelLauncher — callable supplied by each engine to launch its own
// CUDA preprocessing kernel. A default YOLO launcher is available as a static
// method on NV12PreprocessHelper.
//
// NOTE(review): judging by the parameter names, gpuY/gpuUV are the two NV12
// planes, gpuOut receives the converted image, srcW/srcH describe the source
// frame and inputW/inputH the model input — confirm against the launcher
// implementations.
using NV12KernelLauncher = std::function<void(
    const cv::cuda::GpuMat& gpuY,
    const cv::cuda::GpuMat& gpuUV,
    cv::cuda::GpuMat& gpuOut,
    int srcW,
    int srcH,
    int inputW,
    int inputH,
    cudaStream_t stream)>;
// NV12Result — outcome of tryNV12(): reports which path was taken and carries
// the data the caller needs to continue.
struct NV12Result {
    // True when gpuRGB holds valid model input.
    bool succeeded = false;
    // Letterboxed RGB image at model-input size.
    cv::cuda::GpuMat gpuRGB;
    // Display-resolution width, used for coordinate mapping.
    float metaWidth = 0.0f;
    // Display-resolution height.
    float metaHeight = 0.0f;
    // Letterbox scale factor.
    float ratio = 1.0f;
    // True when the pixFmt=1000 (full-resolution BGR) path was selected.
    bool useBgrFullRes = false;
    // Full-resolution BGR image; meaningful only if useBgrFullRes is true.
    cv::Mat bgrFullResImg;
    // display / fullRes scale along X.
    float bgrFullResScaleX = 1.0f;
    // display / fullRes scale along Y.
    float bgrFullResScaleY = 1.0f;
};
// NV12PreprocessHelper — reusable NV12/CUDA preprocessing state that an
// inference engine holds as a member. Owns a pinned host buffer, a warmup
// counter, CUDA-health bookkeeping and a set of log-once flags.
class ANSENGINE_API NV12PreprocessHelper {
public:
    NV12PreprocessHelper() = default;
    ~NV12PreprocessHelper();

    // Copying is disabled because the helper owns the pinned buffer.
    NV12PreprocessHelper(const NV12PreprocessHelper&) = delete;
    NV12PreprocessHelper& operator=(const NV12PreprocessHelper&) = delete;

    // ── Primary entry point ──────────────────────────────────────────
    // Attempts the NV12 fast path for inputImage and reports the outcome in
    // the returned NV12Result. `launcher` is the engine-specific kernel
    // launcher; `tag` is a caller-chosen label passed along with `logger`.
    NV12Result tryNV12(const cv::Mat& inputImage,
                       int inferenceGpu,
                       int inputW,
                       int inputH,
                       const NV12KernelLauncher& launcher,
                       SPDLogger& logger,
                       const char* tag);

    // Advances the inference counter; call once after every inference,
    // whether it ran through NV12 or BGR. The counter stops growing once it
    // reaches NV12_WARMUP_THRESHOLD + 1, so it cannot overflow.
    void tickInference() {
        constexpr int kCeiling = NV12_WARMUP_THRESHOLD + 1;
        if (m_inferenceCount >= kCeiling)
            return;
        ++m_inferenceCount;
    }

    // ── CUDA context health ──────────────────────────────────────────
    bool isCudaContextHealthy(SPDLogger& logger, const char* tag);

    // ── Kernel launcher factories ────────────────────────────────────
    // Default launcher for YOLO engines.
    static NV12KernelLauncher defaultYOLOLauncher();

    // Launcher for classification models: direct resize, no letterbox.
    static NV12KernelLauncher classificationLauncher();

    // Launcher for SCRFD, which pads around the center (dw, dh) instead of
    // YOLO's right/bottom padding.
    static NV12KernelLauncher scrfdCenterLetterboxLauncher(int padLeft, int padTop);

    // ── Fused NV12→BGR resize (OCR detection) ────────────────────────
    // One kernel: NV12 Y+UV → bilinear resize → BGR output (CV_8UC3),
    // with no intermediate allocations.
    static void nv12ToBGRResize(const uint8_t* devY, int yPitch,
                                const uint8_t* devUV, int uvPitch,
                                uint8_t* bgrOut, int outPitch,
                                int outW, int outH,
                                int srcW, int srcH,
                                cudaStream_t stream = nullptr);

    // ── NV12 affine warp (face alignment) ────────────────────────────
    // Reads the full-res NV12 frame, applies the inverse affine transform
    // and produces an aligned BGR face (112x112). Used by the SCRFD face
    // pipeline.
    struct NV12AffineResult {
        bool succeeded = false;
        // Small aligned face crop on the CPU (e.g. 112x112 BGR).
        cv::Mat alignedFaceBGR;
        // The same face on the GPU, so the recognizer need not re-upload it.
        cv::cuda::GpuMat gpuAlignedFace;
    };
    NV12AffineResult tryNV12AffineWarp(const cv::Mat& inputImage,
                                       int inferenceGpu,
                                       const cv::Mat& affineMatrix,
                                       int outW, int outH,
                                       float scaleX, float scaleY,
                                       SPDLogger& logger,
                                       const char* tag);

    // ── NV12 rectangular crop → BGR ──────────────────────────────────
    // Cuts a rectangle out of the full-res NV12 frame held in GPU VRAM and
    // hands back a small BGR cv::Mat on the CPU. The LPR pipeline uses it to
    // obtain high-res plate crops without converting the whole 4K frame.
    // `bbox` is expressed in display-res coordinates; scaleX/scaleY map it
    // onto the full-res NV12 frame.
    struct NV12CropResult {
        bool succeeded = false;
        // Cropped BGR image on the CPU, at full-res quality.
        cv::Mat bgrCrop;
    };
    NV12CropResult tryNV12CropToBGR(const cv::Mat& inputImage,
                                    int inferenceGpu,
                                    const cv::Rect& bbox,
                                    int padding,
                                    float scaleX, float scaleY,
                                    SPDLogger& logger,
                                    const char* tag);

    // ── Direct CHW output into a caller buffer (SAM3 image encoder) ──
    // Performs NV12→RGB plus resize straight into a pre-allocated TRT GPU
    // buffer laid out as planar CHW, with no intermediate GpuMat.
    struct NV12DirectResult {
        bool succeeded = false;
        float metaWidth = 0.0f;
        float metaHeight = 0.0f;
    };
    NV12DirectResult tryNV12DirectToBuffer(const cv::Mat& inputImage,
                                           int inferenceGpu,
                                           void* dstGpuBuffer,
                                           int inputW, int inputH,
                                           bool isFloat32, // true = float32 CHW, false = uint8 CHW
                                           cudaStream_t stream,
                                           SPDLogger& logger,
                                           const char* tag);

    // ── Cleanup ──────────────────────────────────────────────────────
    void destroy();

private:
    void ensurePinnedBuffer(size_t bytes, SPDLogger& logger, const char* tag);

    // Warmup gating: number of inferences seen so far (saturating, see
    // tickInference) and the count during which NV12 is skipped.
    int m_inferenceCount = 0;
    static constexpr int NV12_WARMUP_THRESHOLD = 30;

    // Page-locked host buffer reused for fast host-to-device uploads.
    void* m_pinnedBuf = nullptr;
    size_t m_pinnedBufSize = 0;

    // CUDA context health circuit breaker.
    // NOTE(review): presumably trips once m_cudaFailStreak reaches
    // CUDA_FAIL_LIMIT — confirm in the implementation file.
    int m_cudaFailStreak = 0;
    static constexpr int CUDA_FAIL_LIMIT = 10;
    bool m_cudaContextDead = false;

    // Log-once flags: each diagnostic message is emitted a single time.
    bool m_gpuMismatchLogged = false;
    bool m_bgrFullResLogged = false;
    bool m_nv12NullLogged = false;
    bool m_nv12DimLogged = false;
    bool m_nv12DeadLogged = false;
    bool m_nv12PathLogged = false;
    bool m_nv12PinnedLogged = false;
    bool m_nv12ActiveLogged = false;
    bool m_nv12MetaLogged = false;
    bool m_nv12CropLogged = false;
};
} // namespace ANSCENTER
#endif