// File: ANSCORE/modules/ANSODEngine/NV12PreprocessHelper.cpp
// (1018 lines, 41 KiB, C++)
#include "NV12PreprocessHelper.h"
#include "ANSGpuFrameRegistry.h"
#include "ANSEngineCommon.h"
#include <opencv2/cudaimgproc.hpp>
#include <opencv2/cudawarping.hpp>
#include <opencv2/core/cuda_stream_accessor.hpp>
#include <algorithm>
#include <cstring>
#include <thread>
#ifdef _WIN32
#define WIN32_LEAN_AND_MEAN
#define NOMINMAX
#include <windows.h>
// SEH-protected memcpy: returns false on access violation instead of crashing.
// Used for NV12 plane data which may originate from D3D11VA staging textures.
// Copy `bytes` from src to dst under SEH protection.
// Returns true on success, false if the copy raised a hardware exception
// (access violation / in-page error), e.g. when a D3D11VA staging mapping
// was revoked mid-copy. NOTE: SEH (__try/__except) is legal here because
// this function contains no C++ objects requiring unwinding.
static bool safeMemcpy(void* dst, const void* src, size_t bytes) {
    __try {
        std::memcpy(dst, src, bytes);
        return true;
    }
    __except (EXCEPTION_EXECUTE_HANDLER) {
        // Swallow the fault and report failure instead of crashing.
        return false;
    }
}
// Check if a memory range is readable (non-crashing probe via VirtualQuery)
// Check whether the entire range [ptr, ptr+bytes) is committed and readable,
// without touching the memory (non-crashing probe via VirtualQuery).
//
// FIX: the previous version inspected only the region containing `ptr`,
// so a range spanning multiple VirtualQuery regions whose tail was
// unreadable would incorrectly pass. Walk region by region until the whole
// requested range is covered.
static bool isMemoryReadable(const void* ptr, size_t bytes) {
    if (!ptr || bytes == 0) return false;
    const unsigned char* cur = static_cast<const unsigned char*>(ptr);
    const unsigned char* const end = cur + bytes;
    while (cur < end) {
        MEMORY_BASIC_INFORMATION mbi{};
        if (VirtualQuery(cur, &mbi, sizeof(mbi)) == 0) return false;
        if (mbi.State != MEM_COMMIT) return false;
        // PAGE_GUARD / PAGE_NOACCESS pages fault on first touch.
        if (mbi.Protect & (PAGE_NOACCESS | PAGE_GUARD)) return false;
        const DWORD readFlags = PAGE_READONLY | PAGE_READWRITE |
                                PAGE_EXECUTE_READ | PAGE_EXECUTE_READWRITE;
        if ((mbi.Protect & readFlags) == 0) return false;
        // Advance to the first byte past this region.
        cur = static_cast<const unsigned char*>(mbi.BaseAddress) + mbi.RegionSize;
    }
    return true;
}
#else
// Non-Windows build: no SEH available, so the copy is performed directly.
// A fault here crashes as usual; callers rely on upstream validation.
static bool safeMemcpy(void* dst, const void* src, size_t bytes) {
    std::memcpy(dst, src, bytes);
    return true;
}
// Non-Windows build: no cheap non-crashing probe — optimistically assume readable.
static bool isMemoryReadable(const void*, size_t) { return true; }
#endif
// NV12→RGB CUDA kernel (nv12_to_rgb.cu)
extern "C" void launchNV12ToRGB(
const uint8_t* yPlane, int yPitch,
const uint8_t* uvPlane, int uvPitch,
uint8_t* rgbOut, int outPitch,
int width, int height,
cudaStream_t stream);
extern "C" void launchNV12ToRGBLetterbox(
const uint8_t* yPlane, int yPitch,
const uint8_t* uvPlane, int uvPitch,
uint8_t* rgbOut, int outPitch,
int outW, int outH,
int srcW, int srcH,
int unpadW, int unpadH,
float invScale,
cudaStream_t stream);
extern "C" void launchNV12ToRGBCenterLetterbox(
const uint8_t* yPlane, int yPitch,
const uint8_t* uvPlane, int uvPitch,
uint8_t* rgbOut, int outPitch,
int outW, int outH,
int srcW, int srcH,
int unpadW, int unpadH,
float invScale,
int padLeft, int padTop,
cudaStream_t stream);
extern "C" void launchNV12AffineWarpToBGR(
const uint8_t* yPlane, int yPitch,
const uint8_t* uvPlane, int uvPitch,
uint8_t* bgrOut, int outPitch,
int outW, int outH,
int srcW, int srcH,
float m00, float m01, float m02,
float m10, float m11, float m12,
cudaStream_t stream);
extern "C" void launchNV12ToRGBResize(
const uint8_t* yPlane, int yPitch,
const uint8_t* uvPlane, int uvPitch,
uint8_t* rgbOut, int outPitch,
int outW, int outH,
int srcW, int srcH,
cudaStream_t stream);
extern "C" void launchNV12ToRGBResizeCHW_uint8(
const uint8_t* yPlane, int yPitch,
const uint8_t* uvPlane, int uvPitch,
uint8_t* chwOut,
int outW, int outH,
int srcW, int srcH,
cudaStream_t stream);
extern "C" void launchNV12ToRGBResizeCHW_float(
const uint8_t* yPlane, int yPitch,
const uint8_t* uvPlane, int uvPitch,
float* chwOut,
int outW, int outH,
int srcW, int srcH,
cudaStream_t stream);
extern "C" void launchNV12ToBGRResize(
const uint8_t* yPlane, int yPitch,
const uint8_t* uvPlane, int uvPitch,
uint8_t* bgrOut, int outPitch,
int outW, int outH,
int srcW, int srcH,
cudaStream_t stream);
extern "C" void launchNV12CropToBGR(
const uint8_t* yPlane, int yPitch,
const uint8_t* uvPlane, int uvPitch,
uint8_t* bgrOut, int outPitch,
int outW, int outH,
int cropX, int cropY,
int srcW, int srcH,
cudaStream_t stream);
namespace ANSCENTER {
// Releases the pinned host staging buffer via destroy() (idempotent).
NV12PreprocessHelper::~NV12PreprocessHelper() {
    destroy();
}
// Free the pinned host staging buffer. Safe to call repeatedly; leaves the
// helper in the "no buffer" state (null pointer, zero size).
void NV12PreprocessHelper::destroy() {
    if (m_pinnedBuf == nullptr)
        return;
    cudaFreeHost(m_pinnedBuf);
    m_pinnedBuf = nullptr;
    m_pinnedBufSize = 0;
}
// Grow (never shrink) the pinned host buffer to at least `bytes`.
// On allocation failure the buffer is left null (size 0) and a warning is
// logged once per instance; callers must check m_pinnedBuf before use.
void NV12PreprocessHelper::ensurePinnedBuffer(size_t bytes, SPDLogger& logger, const char* tag) {
    if (bytes <= m_pinnedBufSize)
        return; // current buffer is already large enough

    // Drop the old, too-small allocation first.
    if (m_pinnedBuf) {
        cudaFreeHost(m_pinnedBuf);
        m_pinnedBuf = nullptr;
        m_pinnedBufSize = 0;
    }

    const cudaError_t err = cudaHostAlloc(&m_pinnedBuf, bytes, cudaHostAllocPortable);
    if (err == cudaSuccess) {
        m_pinnedBufSize = bytes;
        return;
    }

    // Allocation failed — log once, stay in the "no buffer" state.
    if (!m_nv12PinnedLogged) {
        m_nv12PinnedLogged = true;
        logger.LogWarn(std::string(tag) + "::NV12Helper",
                       "cudaHostAlloc failed (" + std::string(cudaGetErrorString(err)) +
                       ") for " + std::to_string(bytes) + " bytes",
                       __FILE__, __LINE__);
    }
    m_pinnedBuf = nullptr;
    m_pinnedBufSize = 0;
}
// Lightweight, non-blocking health probe of the CUDA context.
// Returns true while the context responds normally. After CUDA_FAIL_LIMIT
// consecutive failures, latches m_cudaContextDead and permanently disables
// GPU preprocessing for this engine instance (logged once as fatal).
bool NV12PreprocessHelper::isCudaContextHealthy(SPDLogger& logger, const char* tag) {
    if (m_cudaContextDead) return false;
    // Consume (and clear) any sticky error left by earlier async work so it
    // can be included in the diagnostic below.
    cudaError_t pending = cudaGetLastError();
    // Probe the legacy default stream; cheap and non-blocking.
    cudaError_t err = cudaStreamQuery(nullptr);
    if (err == cudaSuccess || err == cudaErrorNotReady) {
        // cudaErrorNotReady just means work is still in flight — healthy.
        m_cudaFailStreak = 0;
        return true;
    }
    ++m_cudaFailStreak;
    if (m_cudaFailStreak >= CUDA_FAIL_LIMIT && !m_cudaContextDead) {
        m_cudaContextDead = true;
        logger.LogFatal(std::string(tag) + "::NV12Helper",
                        "CUDA context permanently corrupted after " +
                        std::to_string(m_cudaFailStreak) +
                        " consecutive failures (last error: " +
                        std::string(cudaGetErrorString(err)) +
                        ", pending: " + std::string(cudaGetErrorString(pending)) +
                        "). GPU preprocessing DISABLED for this engine instance. "
                        "This is typically caused by NVDEC/CUVID surface pool exhaustion "
                        "(Error mapping a picture with CUVID: 205). "
                        "Reduce the number of concurrent HW-decoded 4K streams or "
                        "restart the application to recover.",
                        __FILE__, __LINE__);
    }
    return false;
}
// ====================================================================
// tryNV12 — attempt NV12 fast-path preprocessing
// ====================================================================
// Attempt the NV12 GPU fast path for the current frame.
//
// Reads the thread-local GpuFrameData published by the decoder thread
// (tl_currentGpuFrame) and, when usable, converts/letterboxes the NV12 frame
// into an (inputW x inputH) RGB GpuMat via the supplied kernel launcher.
// pixelFormat 1000 is a private "BGR full-res" marker from ANSVideoPlayer;
// pixelFormat 23 corresponds to AV_PIX_FMT_NV12.
//
// Returns NV12Result with succeeded=false on any unmet precondition — the
// caller then falls back to the ordinary BGR/CPU preprocessing path.
// Frame data lifetime is managed by the registry refcount, not by locks.
NV12Result NV12PreprocessHelper::tryNV12(
    const cv::Mat& inputImage, int inferenceGpu,
    int inputW, int inputH,
    const NV12KernelLauncher& launcher,
    SPDLogger& logger, const char* tag) {
    NV12Result result;
    // Skip during warmup phase
    if (m_inferenceCount < NV12_WARMUP_THRESHOLD) {
        return result;
    }
    // Get NV12 frame data from thread-local (set by RunInferenceComplete_LV)
    GpuFrameData* gpuData = tl_currentGpuFrame();
    if (!gpuData || !gpuData->yPlane) {
        return result; // no NV12 data — caller uses BGR
    }
    // ── BGR full-res path (pixFmt=1000 from ANSVideoPlayer) ──────────
    if (gpuData->pixelFormat == 1000) {
        // yPlane holds a packed BGR frame here; yLinesize is its row stride.
        cv::Mat fullResImg(gpuData->height, gpuData->width, CV_8UC3,
                           gpuData->yPlane, gpuData->yLinesize);
        if (!m_bgrFullResLogged) {
            m_bgrFullResLogged = true;
            logger.LogInfo(std::string(tag) + "::NV12Helper",
                           "BGR full-res ACTIVE: " + std::to_string(gpuData->width) + "x" +
                           std::to_string(gpuData->height) + " -> inference (display=" +
                           std::to_string(inputImage.cols) + "x" + std::to_string(inputImage.rows) + ")",
                           __FILE__, __LINE__);
        }
        result.useBgrFullRes = true;
        result.bgrFullResImg = fullResImg.clone(); // clone under lock
        // Record display→full-res scale factors when resolutions differ.
        if (inputImage.cols > 0 && inputImage.rows > 0 &&
            (fullResImg.cols != inputImage.cols || fullResImg.rows != inputImage.rows)) {
            result.bgrFullResScaleX = static_cast<float>(inputImage.cols) / static_cast<float>(fullResImg.cols);
            result.bgrFullResScaleY = static_cast<float>(inputImage.rows) / static_cast<float>(fullResImg.rows);
        }
        // (No registry lock — data kept alive by refcount)
        return result;
    }
    // ── NV12 path (pixFmt=23 = AV_PIX_FMT_NV12) ────────────────────
    if (gpuData->pixelFormat != 23) {
        return result; // unknown format — fall back to BGR
    }
    if (!gpuData->uvPlane) {
        if (!m_nv12NullLogged) {
            m_nv12NullLogged = true;
            logger.LogWarn(std::string(tag) + "::NV12Helper",
                           "Early exit: null uvPlane",
                           __FILE__, __LINE__);
        }
        return result;
    }
    const int frameW = gpuData->width;
    const int frameH = gpuData->height;
    if (frameW <= 0 || frameH <= 0) {
        if (!m_nv12DimLogged) {
            m_nv12DimLogged = true;
            logger.LogWarn(std::string(tag) + "::NV12Helper",
                           "Early exit: bad dimensions — w=" + std::to_string(frameW) +
                           " h=" + std::to_string(frameH),
                           __FILE__, __LINE__);
        }
        return result;
    }
    if (m_cudaContextDead) {
        if (!m_nv12DeadLogged) {
            m_nv12DeadLogged = true;
            logger.LogWarn(std::string(tag) + "::NV12Helper",
                           "Early exit: CUDA context dead",
                           __FILE__, __LINE__);
        }
        return result;
    }
    // Zero-copy is only valid when the planes are CUDA device pointers on
    // the inference GPU (gpuIndex < 0 means "unknown/same device").
    const bool isCudaDevice = gpuData->isCudaDevicePtr;
    const bool gpuMatch = !isCudaDevice ||
                          gpuData->gpuIndex < 0 ||
                          gpuData->gpuIndex == inferenceGpu;
    const bool useZeroCopy = isCudaDevice && gpuMatch;
    // --- Debug: log pointer state before reading ---
#if defined(ANSCORE_GPU_DEBUG) && ANSCORE_GPU_DEBUG
    {
        char _nv12_dbg[512];
        snprintf(_nv12_dbg, sizeof(_nv12_dbg),
                 "[NV12Helper] tryNV12: gpuData=%p yPlane=%p uvPlane=%p isCuda=%d "
                 "gpuIdx=%d infGpu=%d gpuMatch=%d zeroCopy=%d "
                 "gpuCacheY=%p gpuCacheUV=%p gpuCacheValid=%d refcount=%d %dx%d\n",
                 (void*)gpuData, (void*)gpuData->yPlane, (void*)gpuData->uvPlane,
                 (int)isCudaDevice, gpuData->gpuIndex, inferenceGpu,
                 (int)gpuMatch, (int)useZeroCopy,
                 gpuData->gpuCacheY, gpuData->gpuCacheUV,
                 (int)gpuData->gpuCacheValid,
                 gpuData->refcount.load(),
                 frameW, frameH);
#ifdef _WIN32
        OutputDebugStringA(_nv12_dbg);
#endif
        fprintf(stderr, "%s", _nv12_dbg);
    }
#endif
    // Effective plane pointers — for zero-copy, use CUDA device ptrs;
    // for CPU upload, use the CPU snapshot buffers.
    uint8_t* effYPlane;
    uint8_t* effUvPlane;
    int effYLinesize;
    int effUvLinesize;
    if (useZeroCopy) {
        // Same GPU: wrap owned CUDA device pointers directly
        effYPlane = gpuData->yPlane;
        effUvPlane = gpuData->uvPlane;
        effYLinesize = gpuData->yLinesize;
        effUvLinesize = gpuData->uvLinesize;
    } else if (isCudaDevice && !gpuMatch) {
        // Cross-GPU: can't use CUDA device ptrs from another GPU context.
        // Fall through to CPU upload using the CPU snapshot buffers.
        if (gpuData->cpuYPlane && gpuData->cpuUvPlane) {
            effYPlane = gpuData->cpuYPlane;
            effUvPlane = gpuData->cpuUvPlane;
            effYLinesize = gpuData->cpuYLinesize;
            effUvLinesize = gpuData->cpuUvLinesize;
            if (!m_gpuMismatchLogged) {
                m_gpuMismatchLogged = true;
                logger.LogInfo(std::string(tag) + "::NV12Helper",
                               "GPU mismatch (decode GPU " + std::to_string(gpuData->gpuIndex) +
                               " vs inference GPU " + std::to_string(inferenceGpu) +
                               ") — using CPU NV12 upload fallback",
                               __FILE__, __LINE__);
            }
        } else {
            // No CPU fallback available — fall back to BGR
            if (!m_gpuMismatchLogged) {
                m_gpuMismatchLogged = true;
                logger.LogInfo(std::string(tag) + "::NV12Helper",
                               "GPU mismatch (decode GPU " + std::to_string(gpuData->gpuIndex) +
                               " vs inference GPU " + std::to_string(inferenceGpu) +
                               ") — no CPU fallback, using BGR",
                               __FILE__, __LINE__);
            }
            return result;
        }
    } else {
        // Not CUDA device ptrs: use whatever yPlane/uvPlane point to (CPU data)
        effYPlane = gpuData->yPlane;
        effUvPlane = gpuData->uvPlane;
        effYLinesize = gpuData->yLinesize;
        effUvLinesize = gpuData->uvLinesize;
    }
    // Log path selection once — both spdlog and DebugView
    if (!m_nv12PathLogged) {
        m_nv12PathLogged = true;
        const char* pathName = useZeroCopy ? "CUDA_ZERO_COPY" : "CPU_NV12_UPLOAD";
        auto msg = std::string("Path: ") + pathName +
                   " | isCuda=" + std::to_string(isCudaDevice) +
                   " gpuMatch=" + std::to_string(gpuMatch) +
                   " decodeGpu=" + std::to_string(gpuData->gpuIndex) +
                   " infGpu=" + std::to_string(inferenceGpu) +
                   " frame=" + std::to_string(frameW) + "x" + std::to_string(frameH);
        logger.LogInfo(std::string(tag) + "::NV12Helper", msg, __FILE__, __LINE__);
#ifdef _WIN32
        auto dbg = std::string("[NV12 ACTIVE] ") + tag + " " + msg + "\n";
        OutputDebugStringA(dbg.c_str());
#endif
    }
    cv::cuda::Stream stream;
    cv::cuda::GpuMat gpuY, gpuUV;
    if (useZeroCopy) {
        // CUDA zero-copy: wrap pool buffer device pointers directly.
        // UV plane is half height; wrapped as single-channel with full width.
        gpuY = cv::cuda::GpuMat(frameH, frameW, CV_8UC1,
                                effYPlane, static_cast<size_t>(effYLinesize));
        gpuUV = cv::cuda::GpuMat(frameH / 2, frameW, CV_8UC1,
                                 effUvPlane, static_cast<size_t>(effUvLinesize));
    } else {
        // CPU path: memcpy + upload
        const size_t ySize = static_cast<size_t>(frameW) * frameH;
        const size_t uvSize = static_cast<size_t>(frameW) * frameH / 2;
        const size_t nv12Size = ySize + uvSize;
        ensurePinnedBuffer(nv12Size, logger, tag);
        if (!m_pinnedBuf) {
            return result;
        }
        // Validate NV12 plane pointers before memcpy (probe capped at 4 KiB).
        const size_t yBufNeeded = (effYLinesize == frameW)
                                      ? ySize
                                      : static_cast<size_t>(effYLinesize) * frameH;
        const size_t uvBufNeeded = (effUvLinesize == frameW)
                                       ? uvSize
                                       : static_cast<size_t>(effUvLinesize) * (frameH / 2);
        if (!isMemoryReadable(effYPlane, std::min(yBufNeeded, (size_t)4096)) ||
            !isMemoryReadable(effUvPlane, std::min(uvBufNeeded, (size_t)4096))) {
            logger.LogWarn(std::string(tag) + "::NV12Helper",
                           "NV12 plane pointers not readable — falling back to BGR",
                           __FILE__, __LINE__);
            return result;
        }
        // Copy Y then UV into the pinned buffer, de-striding row by row
        // when the source linesize exceeds the frame width.
        uint8_t* dst = static_cast<uint8_t*>(m_pinnedBuf);
        bool cpyOk = true;
        if (effYLinesize == frameW) {
            cpyOk = safeMemcpy(dst, effYPlane, ySize);
        } else {
            for (int row = 0; row < frameH && cpyOk; row++)
                cpyOk = safeMemcpy(dst + row * frameW,
                                   effYPlane + row * effYLinesize, frameW);
        }
        if (cpyOk) {
            uint8_t* uvDst = dst + ySize;
            if (effUvLinesize == frameW) {
                cpyOk = safeMemcpy(uvDst, effUvPlane, uvSize);
            } else {
                for (int row = 0; row < frameH / 2 && cpyOk; row++)
                    cpyOk = safeMemcpy(uvDst + row * frameW,
                                       effUvPlane + row * effUvLinesize, frameW);
            }
        }
        if (!cpyOk) {
            logger.LogWarn(std::string(tag) + "::NV12Helper",
                           "Access violation during NV12 memcpy — falling back to BGR",
                           __FILE__, __LINE__);
            return result;
        }
        // NV12 data safely in pinned memory — release registry lock
        // (No registry lock to release — data kept alive by refcount)
        cv::Mat pinnedY(frameH, frameW, CV_8UC1, m_pinnedBuf);
        cv::Mat pinnedUV(frameH / 2, frameW, CV_8UC1,
                         static_cast<uint8_t*>(m_pinnedBuf) + ySize);
        gpuY.upload(pinnedY, stream);
        gpuUV.upload(pinnedUV, stream);
    }
    // Use display dimensions for coordinate mapping
    const float metaW = static_cast<float>(inputImage.cols > 0 ? inputImage.cols : frameW);
    const float metaH = static_cast<float>(inputImage.rows > 0 ? inputImage.rows : frameH);
    if (metaH <= 0 || metaW <= 0) {
        if (!m_nv12MetaLogged) {
            m_nv12MetaLogged = true;
            logger.LogWarn(std::string(tag) + "::NV12Helper",
                           "Early exit: metadata dims invalid",
                           __FILE__, __LINE__);
        }
        return result;
    }
    result.metaWidth = metaW;
    result.metaHeight = metaH;
    // ratio = inverse of the letterbox scale factor (used by callers to map
    // model-space boxes back to display coordinates).
    result.ratio = 1.f / std::min(
        static_cast<float>(inputW) / metaW,
        static_cast<float>(inputH) / metaH);
    // Launch engine-specific kernel
    cv::cuda::GpuMat gpuResized;
    gpuResized.create(inputH, inputW, CV_8UC3);
    cudaStream_t rawStream = cv::cuda::StreamAccessor::getStream(stream);
#if defined(ANSCORE_GPU_DEBUG) && ANSCORE_GPU_DEBUG
    {
        char _nv12_dbg2[256];
        snprintf(_nv12_dbg2, sizeof(_nv12_dbg2),
                 "[NV12Helper] KERNEL LAUNCH: gpuY=%p(%dx%d) gpuUV=%p(%dx%d) -> %dx%d zeroCopy=%d\n",
                 (void*)gpuY.data, gpuY.cols, gpuY.rows,
                 (void*)gpuUV.data, gpuUV.cols, gpuUV.rows,
                 inputW, inputH, (int)useZeroCopy);
#ifdef _WIN32
        OutputDebugStringA(_nv12_dbg2);
#endif
        fprintf(stderr, "%s", _nv12_dbg2);
    }
#endif
    launcher(gpuY, gpuUV, gpuResized, frameW, frameH, inputW, inputH, rawStream);
    stream.waitForCompletion();
    // (No registry lock to release — data kept alive by refcount)
    // Log NV12 fast-path usage once per instance
    if (!m_nv12ActiveLogged) {
        m_nv12ActiveLogged = true;
        const char* mode = useZeroCopy ? "CUDA zero-copy" : "CPU upload";
        logger.LogInfo(std::string(tag) + "::NV12Helper",
                       std::string(mode) + " ACTIVE: " +
                       std::to_string(frameW) + "x" + std::to_string(frameH) +
                       " NV12 -> " + std::to_string(inputW) + "x" + std::to_string(inputH) +
                       " display=" + std::to_string(static_cast<int>(metaW)) + "x" +
                       std::to_string(static_cast<int>(metaH)),
                       __FILE__, __LINE__);
    }
    result.succeeded = true;
    result.gpuRGB = std::move(gpuResized);
    return result;
}
// ====================================================================
// Default YOLO kernel launcher
//
// For detection/seg/pose/OBB: fused NV12→RGB + letterbox in a single
// kernel at model input resolution (e.g. 640x640).
// For classification: separate NV12→RGB at full res, then cuda::resize.
// (Classification detection is not done here — the caller's engine
// should set up the appropriate launcher if needed. This default
// always uses the fused letterbox path, which works for all YOLO
// detection/seg/pose/OBB tasks.)
// ====================================================================
// Default launcher for YOLO detection/seg/pose/OBB models: fused NV12→RGB
// plus top-left-anchored letterbox in a single kernel at model resolution.
// When source already matches the model input, the resize is skipped.
NV12KernelLauncher NV12PreprocessHelper::defaultYOLOLauncher() {
    return [](const cv::cuda::GpuMat& gpuY, const cv::cuda::GpuMat& gpuUV,
              cv::cuda::GpuMat& gpuOut, int srcW, int srcH,
              int inputW, int inputH, cudaStream_t stream) {
        const bool sameSize = (srcW == inputW) && (srcH == inputH);
        if (sameSize) {
            // 1:1 — plain NV12→RGB conversion, no scaling needed.
            launchNV12ToRGB(
                gpuY.ptr<uint8_t>(), static_cast<int>(gpuY.step),
                gpuUV.ptr<uint8_t>(), static_cast<int>(gpuUV.step),
                gpuOut.ptr<uint8_t>(), static_cast<int>(gpuOut.step),
                srcW, srcH, stream);
            return;
        }
        // Aspect-preserving scale that fits the source inside the model input.
        const float scale = std::min(static_cast<float>(inputW) / srcW,
                                     static_cast<float>(inputH) / srcH);
        const int scaledW = static_cast<int>(scale * srcW);
        const int scaledH = static_cast<int>(scale * srcH);
        launchNV12ToRGBLetterbox(
            gpuY.ptr<uint8_t>(), static_cast<int>(gpuY.step),
            gpuUV.ptr<uint8_t>(), static_cast<int>(gpuUV.step),
            gpuOut.ptr<uint8_t>(), static_cast<int>(gpuOut.step),
            inputW, inputH,
            srcW, srcH,
            scaledW, scaledH,
            1.0f / scale, stream);
    };
}
// ====================================================================
// Classification launcher (direct resize, no letterbox)
// ====================================================================
// Launcher for classification models: fused NV12→RGB with a direct
// stretch-to-fill resize (no letterbox padding). Skips the resize when
// the source already matches the model input resolution.
NV12KernelLauncher NV12PreprocessHelper::classificationLauncher() {
    return [](const cv::cuda::GpuMat& gpuY, const cv::cuda::GpuMat& gpuUV,
              cv::cuda::GpuMat& gpuOut, int srcW, int srcH,
              int inputW, int inputH, cudaStream_t stream) {
        const int yPitch   = static_cast<int>(gpuY.step);
        const int uvPitch  = static_cast<int>(gpuUV.step);
        const int outPitch = static_cast<int>(gpuOut.step);
        if (srcW == inputW && srcH == inputH) {
            // Source already at model resolution — plain conversion.
            launchNV12ToRGB(gpuY.ptr<uint8_t>(), yPitch,
                            gpuUV.ptr<uint8_t>(), uvPitch,
                            gpuOut.ptr<uint8_t>(), outPitch,
                            srcW, srcH, stream);
        } else {
            // Stretch to fill the model input (aspect ratio not preserved).
            launchNV12ToRGBResize(gpuY.ptr<uint8_t>(), yPitch,
                                  gpuUV.ptr<uint8_t>(), uvPitch,
                                  gpuOut.ptr<uint8_t>(), outPitch,
                                  inputW, inputH,
                                  srcW, srcH, stream);
        }
    };
}
// ====================================================================
// NV12→BGR fused resize (for OCR detection)
// ====================================================================
// Thin forwarding wrapper around the fused NV12→BGR resize kernel
// (used by OCR detection). All plane/output pointers and pitches are
// forwarded unchanged; the kernel is launched on `stream` and the caller
// is responsible for synchronization.
void NV12PreprocessHelper::nv12ToBGRResize(
    const uint8_t* devY, int yPitch,
    const uint8_t* devUV, int uvPitch,
    uint8_t* bgrOut, int outPitch,
    int outW, int outH, int srcW, int srcH,
    cudaStream_t stream) {
    launchNV12ToBGRResize(devY, yPitch, devUV, uvPitch,
                          bgrOut, outPitch, outW, outH, srcW, srcH, stream);
}
// SCRFD center-padded letterbox launcher
// ====================================================================
// SCRFD launcher: aspect-preserving letterbox whose padding offsets
// (padLeft/padTop) are chosen by the caller, allowing the scaled image to
// be centered instead of anchored at the top-left corner.
NV12KernelLauncher NV12PreprocessHelper::scrfdCenterLetterboxLauncher(int padLeft, int padTop) {
    return [padLeft, padTop](const cv::cuda::GpuMat& gpuY, const cv::cuda::GpuMat& gpuUV,
                             cv::cuda::GpuMat& gpuOut, int srcW, int srcH,
                             int inputW, int inputH, cudaStream_t stream) {
        // Uniform scale that fits the source inside the model input.
        const float scale = std::min(static_cast<float>(inputW) / srcW,
                                     static_cast<float>(inputH) / srcH);
        const int scaledW = static_cast<int>(scale * srcW);
        const int scaledH = static_cast<int>(scale * srcH);
        launchNV12ToRGBCenterLetterbox(
            gpuY.ptr<uint8_t>(), static_cast<int>(gpuY.step),
            gpuUV.ptr<uint8_t>(), static_cast<int>(gpuUV.step),
            gpuOut.ptr<uint8_t>(), static_cast<int>(gpuOut.step),
            inputW, inputH,
            srcW, srcH,
            scaledW, scaledH,
            1.0f / scale,
            padLeft, padTop,
            stream);
    };
}
// ====================================================================
// NV12 affine warp for face alignment
//
// Reads full-res NV12 from the registry, applies inverse affine
// transform via CUDA kernel, outputs a small aligned face in BGR.
// The affine matrix is computed on CPU (from detected landmarks to
// ArcFace template), then scaled to map to full-res NV12 source.
// ====================================================================
// Apply an affine warp directly on the full-res NV12 frame (face alignment).
//
// `affineMatrix` maps source landmarks → output template in display-space;
// it is inverted here, then scaled by (scaleX, scaleY) to address the
// full-res NV12 frame. Output is a (outW x outH) BGR face chip, returned
// both as a GpuMat (for the recognizer) and as a downloaded cv::Mat.
// Cross-GPU frames are rejected (caller falls back to CPU warpAffine).
// NOTE(review): unlike tryNV12(), the CPU-upload branch here does not probe
// plane readability with isMemoryReadable before copying — confirm whether
// that asymmetry is intentional.
NV12PreprocessHelper::NV12AffineResult NV12PreprocessHelper::tryNV12AffineWarp(
    const cv::Mat& inputImage, int inferenceGpu,
    const cv::Mat& affineMatrix, int outW, int outH,
    float scaleX, float scaleY,
    SPDLogger& logger, const char* tag) {
    NV12AffineResult result;
    // Skip during warmup
    if (m_inferenceCount < NV12_WARMUP_THRESHOLD) {
        return result;
    }
    if (affineMatrix.empty()) {
        return result;
    }
    // Get NV12 frame data from thread-local (set by RunInferenceComplete_LV)
    GpuFrameData* gpuData = tl_currentGpuFrame();
    if (!gpuData || !gpuData->yPlane || !gpuData->uvPlane || gpuData->pixelFormat != 23) {
        return result;
    }
    const int frameW = gpuData->width;
    const int frameH = gpuData->height;
    if (frameW <= 0 || frameH <= 0 || m_cudaContextDead) {
        return result;
    }
    const bool isCudaDevice = gpuData->isCudaDevicePtr;
    const bool gpuMatch = !isCudaDevice ||
                          gpuData->gpuIndex < 0 ||
                          gpuData->gpuIndex == inferenceGpu;
    if (isCudaDevice && !gpuMatch) {
        return result; // cross-GPU — fall back to CPU warpAffine
    }
    const bool useZeroCopy = isCudaDevice && gpuMatch;
    uint8_t* effYPlane = gpuData->yPlane;
    uint8_t* effUvPlane = gpuData->uvPlane;
    int effYLinesize = gpuData->yLinesize;
    int effUvLinesize = gpuData->uvLinesize;
    // Compute inverse affine matrix and scale for full-res NV12
    cv::Mat M_inv;
    cv::invertAffineTransform(affineMatrix, M_inv);
    M_inv.convertTo(M_inv, CV_64F);
    // Scale inverse affine to map from output chip → full-res NV12 source.
    // Row 0 maps to X (multiply by scaleX), Row 1 maps to Y (multiply by scaleY)
    double* row0 = M_inv.ptr<double>(0);
    double* row1 = M_inv.ptr<double>(1);
    row0[0] *= scaleX; row0[1] *= scaleX; row0[2] *= scaleX;
    row1[0] *= scaleY; row1[1] *= scaleY; row1[2] *= scaleY;
    float m00 = static_cast<float>(row0[0]);
    float m01 = static_cast<float>(row0[1]);
    float m02 = static_cast<float>(row0[2]);
    float m10 = static_cast<float>(row1[0]);
    float m11 = static_cast<float>(row1[1]);
    float m12 = static_cast<float>(row1[2]);
    cv::cuda::Stream stream;
    cv::cuda::GpuMat gpuY, gpuUV;
    if (useZeroCopy) {
        // Wrap the device-resident planes directly (no copy).
        gpuY = cv::cuda::GpuMat(frameH, frameW, CV_8UC1,
                                effYPlane, static_cast<size_t>(effYLinesize));
        gpuUV = cv::cuda::GpuMat(frameH / 2, frameW, CV_8UC1,
                                 effUvPlane, static_cast<size_t>(effUvLinesize));
    } else {
        // CPU path: memcpy + upload
        const size_t ySize = static_cast<size_t>(frameW) * frameH;
        const size_t uvSize = static_cast<size_t>(frameW) * frameH / 2;
        const size_t nv12Size = ySize + uvSize;
        ensurePinnedBuffer(nv12Size, logger, tag);
        if (!m_pinnedBuf) {
            return result;
        }
        // Copy Y then UV, de-striding rows when linesize != width.
        uint8_t* dst = static_cast<uint8_t*>(m_pinnedBuf);
        bool cpyOk = true;
        if (effYLinesize == frameW) {
            cpyOk = safeMemcpy(dst, effYPlane, ySize);
        } else {
            for (int row = 0; row < frameH && cpyOk; row++)
                cpyOk = safeMemcpy(dst + row * frameW,
                                   effYPlane + row * effYLinesize, frameW);
        }
        if (cpyOk) {
            uint8_t* uvDst = dst + ySize;
            if (effUvLinesize == frameW) {
                cpyOk = safeMemcpy(uvDst, effUvPlane, uvSize);
            } else {
                for (int row = 0; row < frameH / 2 && cpyOk; row++)
                    cpyOk = safeMemcpy(uvDst + row * frameW,
                                       effUvPlane + row * effUvLinesize, frameW);
            }
        }
        if (!cpyOk) {
            return result;
        }
        // (No registry lock to release — data kept alive by refcount)
        cv::Mat pinnedY(frameH, frameW, CV_8UC1, m_pinnedBuf);
        cv::Mat pinnedUV(frameH / 2, frameW, CV_8UC1,
                         static_cast<uint8_t*>(m_pinnedBuf) + ySize);
        gpuY.upload(pinnedY, stream);
        gpuUV.upload(pinnedUV, stream);
    }
    // Launch affine warp kernel (outputs a BGR chip of outW x outH)
    cv::cuda::GpuMat gpuFace(outH, outW, CV_8UC3);
    cudaStream_t rawStream = cv::cuda::StreamAccessor::getStream(stream);
    launchNV12AffineWarpToBGR(
        gpuY.ptr<uint8_t>(), static_cast<int>(gpuY.step),
        gpuUV.ptr<uint8_t>(), static_cast<int>(gpuUV.step),
        gpuFace.ptr<uint8_t>(), static_cast<int>(gpuFace.step),
        outW, outH,
        frameW, frameH,
        m00, m01, m02, m10, m11, m12,
        rawStream);
    stream.waitForCompletion();
    // (No registry lock to release — data kept alive by refcount)
    // Keep GPU copy for recognizer (avoids re-upload), download CPU copy for mask
    result.gpuAlignedFace = gpuFace;
    gpuFace.download(result.alignedFaceBGR);
    result.succeeded = true;
    return result;
}
// ── NV12 rectangular crop → BGR ────────────────────────────────────
// ── NV12 rectangular crop → BGR ────────────────────────────────────
// Crops a padded bbox (given in display coordinates, scaled by
// scaleX/scaleY to full-res NV12 coordinates) out of the GPU-resident
// NV12 frame and converts it to a BGR cv::Mat on the CPU.
// Fails soft (succeeded=false) so callers can fall back to CPU cropping.
NV12PreprocessHelper::NV12CropResult NV12PreprocessHelper::tryNV12CropToBGR(
    const cv::Mat& inputImage, int inferenceGpu,
    const cv::Rect& bbox, int padding,
    float scaleX, float scaleY,
    SPDLogger& logger, const char* tag)
{
    NV12CropResult result;
    // Warmup gating — matches tryNV12(): skip until the pipeline is stable.
    if (m_inferenceCount < NV12_WARMUP_THRESHOLD)
        return result;
    // Never touch the GPU once the context is known-dead.
    if (m_cudaContextDead)
        return result;
    // NV12 frame published by the decoder for this thread (refcounted).
    auto* gpuData = tl_currentGpuFrame();
    if (!gpuData || gpuData->pixelFormat != 23)
        return result; // no NV12 data
    if (!gpuData->isCudaDevicePtr || !gpuData->yPlane || !gpuData->uvPlane)
        return result; // NV12 not on GPU
    const int frameW = gpuData->width;
    const int frameH = gpuData->height;
    // FIX: reject degenerate frame dimensions (the other NV12 paths do this).
    if (frameW <= 0 || frameH <= 0)
        return result;
    // FIX: device pointers from a different GPU are not valid in this
    // context — mirror the gpuMatch logic of tryNV12() and fall back to the
    // CPU crop path on a decode/inference GPU mismatch.
    if (gpuData->gpuIndex >= 0 && gpuData->gpuIndex != inferenceGpu)
        return result;
    // Scale bbox from display-res to full-res NV12 coords with padding.
    const int fx1 = std::max(0, static_cast<int>((bbox.x - padding) * scaleX));
    const int fy1 = std::max(0, static_cast<int>((bbox.y - padding) * scaleY));
    const int fx2 = std::min(frameW, static_cast<int>((bbox.x + bbox.width + padding) * scaleX));
    const int fy2 = std::min(frameH, static_cast<int>((bbox.y + bbox.height + padding) * scaleY));
    const int cropW = fx2 - fx1;
    const int cropH = fy2 - fy1;
    if (cropW < 5 || cropH < 5)
        return result; // too small (or inverted) to be useful
    // Ensure this thread's CUDA context targets the inference GPU.
    // (Intentionally left set afterwards — subsequent inference work on this
    // thread expects the inference GPU to remain current.)
    int currentDev = 0;
    cudaGetDevice(&currentDev);
    if (inferenceGpu >= 0 && currentDev != inferenceGpu)
        cudaSetDevice(inferenceGpu);
    // Wrap the existing GPU NV12 planes (no copy; lifetime held by refcount).
    cv::cuda::GpuMat gpuY(frameH, frameW, CV_8UC1,
                          gpuData->yPlane, gpuData->yLinesize);
    cv::cuda::GpuMat gpuUV(frameH / 2, frameW / 2, CV_8UC2,
                           gpuData->uvPlane, gpuData->uvLinesize);
    // Allocate output on GPU and launch the fused crop+convert kernel.
    cv::cuda::GpuMat gpuCrop(cropH, cropW, CV_8UC3);
    cv::cuda::Stream stream;
    cudaStream_t rawStream = cv::cuda::StreamAccessor::getStream(stream);
    launchNV12CropToBGR(
        gpuY.ptr<uint8_t>(), static_cast<int>(gpuY.step),
        gpuUV.ptr<uint8_t>(), static_cast<int>(gpuUV.step),
        gpuCrop.ptr<uint8_t>(), static_cast<int>(gpuCrop.step),
        cropW, cropH,
        fx1, fy1,
        frameW, frameH,
        rawStream);
    stream.waitForCompletion();
    // (No registry lock to release — data kept alive by refcount)
    // Download the small crop to CPU.
    gpuCrop.download(result.bgrCrop);
    result.succeeded = true;
    // One-shot diagnostic log.
    if (!m_nv12CropLogged) {
        m_nv12CropLogged = true;
        logger.LogInfo(tag,
            "NV12 GPU crop ACTIVE: " + std::to_string(cropW) + "x" + std::to_string(cropH) +
            " BGR cropped from " + std::to_string(frameW) + "x" + std::to_string(frameH) +
            " NV12 (display=" + std::to_string(inputImage.cols) + "x" + std::to_string(inputImage.rows) +
            " scaleX=" + std::to_string(scaleX) + " scaleY=" + std::to_string(scaleY) + ")",
            __FILE__, __LINE__);
    }
    return result;
}
// ====================================================================
// tryNV12DirectToBuffer — write NV12→RGB CHW directly into TRT buffer
//
// For engines that manage their own GPU buffers (e.g. SAM3 with raw
// TRT enqueueV3). Same registry lookup / GPU matching / safety logic
// as tryNV12(), but writes CHW planar output into a pre-allocated
// void* GPU buffer instead of allocating an HWC GpuMat.
// ====================================================================
// Write NV12→RGB CHW planar output directly into a caller-owned GPU buffer
// (e.g. a TensorRT input binding for enqueueV3). Same thread-local frame
// lookup / GPU matching / safety logic as tryNV12(), but no intermediate
// HWC GpuMat is allocated. `isFloat32` selects the float32 vs uint8 kernel.
//
// FIXES vs previous revision:
//  * Sleep(0) was Windows-only and unguarded, breaking non-Windows builds —
//    replaced with the portable std::this_thread::yield().
//  * The terminal cudaStreamQuery() status was ignored; a failed kernel now
//    clears the sticky error and reports failure so the caller falls back
//    to CPU preprocessing instead of consuming a garbage TRT input.
NV12PreprocessHelper::NV12DirectResult NV12PreprocessHelper::tryNV12DirectToBuffer(
    const cv::Mat& inputImage, int inferenceGpu,
    void* dstGpuBuffer, int inputW, int inputH,
    bool isFloat32,
    cudaStream_t stream,
    SPDLogger& logger, const char* tag) {
    NV12DirectResult result;
    // Skip during warmup phase
    if (m_inferenceCount < NV12_WARMUP_THRESHOLD) {
        return result;
    }
    if (!dstGpuBuffer) return result;
    // Get NV12 frame data from thread-local (set by RunInferenceComplete_LV)
    GpuFrameData* gpuData = tl_currentGpuFrame();
    if (!gpuData || !gpuData->yPlane) {
        return result; // no NV12 data — caller uses CPU path
    }
    // BGR full-res path not applicable for direct-to-buffer (caller handles its own resize)
    if (gpuData->pixelFormat == 1000) {
        return result;
    }
    // NV12 path only (pixFmt 23 = AV_PIX_FMT_NV12)
    if (gpuData->pixelFormat != 23) {
        return result;
    }
    if (!gpuData->uvPlane) {
        return result;
    }
    const int frameW = gpuData->width;
    const int frameH = gpuData->height;
    if (frameW <= 0 || frameH <= 0 || m_cudaContextDead) {
        return result;
    }
    const bool isCudaDevice = gpuData->isCudaDevicePtr;
    const bool gpuMatch = !isCudaDevice ||
                          gpuData->gpuIndex < 0 ||
                          gpuData->gpuIndex == inferenceGpu;
    const bool useZeroCopy = isCudaDevice && gpuMatch;
    if (isCudaDevice && !gpuMatch) {
        return result; // cross-GPU — fall back to CPU path
    }
    uint8_t* effYPlane = gpuData->yPlane;
    uint8_t* effUvPlane = gpuData->uvPlane;
    int effYLinesize = gpuData->yLinesize;
    int effUvLinesize = gpuData->uvLinesize;
    // Log path selection once
    if (!m_nv12PathLogged) {
        m_nv12PathLogged = true;
        const char* pathName = useZeroCopy ? "CUDA_ZERO_COPY" : "CPU_NV12_UPLOAD";
        logger.LogInfo(std::string(tag) + "::NV12Helper",
                       std::string("Path: ") + pathName +
                       " | isCuda=" + std::to_string(isCudaDevice) +
                       " gpuMatch=" + std::to_string(gpuMatch) +
                       " frame=" + std::to_string(frameW) + "x" + std::to_string(frameH) +
                       " (direct CHW " + (isFloat32 ? "float32" : "uint8") + ")",
                       __FILE__, __LINE__);
    }
    cv::cuda::GpuMat gpuY, gpuUV;
    if (useZeroCopy) {
        // Wrap device-resident planes directly (no copy).
        gpuY = cv::cuda::GpuMat(frameH, frameW, CV_8UC1,
                                effYPlane, static_cast<size_t>(effYLinesize));
        gpuUV = cv::cuda::GpuMat(frameH / 2, frameW, CV_8UC1,
                                 effUvPlane, static_cast<size_t>(effUvLinesize));
    } else {
        // CPU path: memcpy NV12 to pinned buffer, upload on the caller's stream
        const size_t ySize = static_cast<size_t>(frameW) * frameH;
        const size_t uvSize = static_cast<size_t>(frameW) * frameH / 2;
        const size_t nv12Size = ySize + uvSize;
        ensurePinnedBuffer(nv12Size, logger, tag);
        if (!m_pinnedBuf) return result;
        // Probe plane readability before touching the data (capped at 4 KiB).
        if (!isMemoryReadable(effYPlane, std::min(static_cast<size_t>(effYLinesize) * frameH, (size_t)4096)) ||
            !isMemoryReadable(effUvPlane, std::min(static_cast<size_t>(effUvLinesize) * (frameH / 2), (size_t)4096))) {
            return result;
        }
        // Copy Y then UV, de-striding rows when linesize != width.
        uint8_t* dst = static_cast<uint8_t*>(m_pinnedBuf);
        bool cpyOk = true;
        if (effYLinesize == frameW) {
            cpyOk = safeMemcpy(dst, effYPlane, ySize);
        } else {
            for (int row = 0; row < frameH && cpyOk; row++)
                cpyOk = safeMemcpy(dst + row * frameW,
                                   effYPlane + row * effYLinesize, frameW);
        }
        if (cpyOk) {
            uint8_t* uvDst = dst + ySize;
            if (effUvLinesize == frameW) {
                cpyOk = safeMemcpy(uvDst, effUvPlane, uvSize);
            } else {
                for (int row = 0; row < frameH / 2 && cpyOk; row++)
                    cpyOk = safeMemcpy(uvDst + row * frameW,
                                       effUvPlane + row * effUvLinesize, frameW);
            }
        }
        if (!cpyOk) return result;
        // (No registry lock to release — data kept alive by refcount)
        cv::cuda::Stream cvStream = cv::cuda::StreamAccessor::wrapStream(stream);
        cv::Mat pinnedY(frameH, frameW, CV_8UC1, m_pinnedBuf);
        cv::Mat pinnedUV(frameH / 2, frameW, CV_8UC1,
                         static_cast<uint8_t*>(m_pinnedBuf) + ySize);
        gpuY.upload(pinnedY, cvStream);
        gpuUV.upload(pinnedUV, cvStream);
    }
    // Launch CHW kernel directly into the caller's GPU buffer
    if (isFloat32) {
        launchNV12ToRGBResizeCHW_float(
            gpuY.ptr<uint8_t>(), static_cast<int>(gpuY.step),
            gpuUV.ptr<uint8_t>(), static_cast<int>(gpuUV.step),
            static_cast<float*>(dstGpuBuffer),
            inputW, inputH, frameW, frameH, stream);
    } else {
        launchNV12ToRGBResizeCHW_uint8(
            gpuY.ptr<uint8_t>(), static_cast<int>(gpuY.step),
            gpuUV.ptr<uint8_t>(), static_cast<int>(gpuUV.step),
            static_cast<uint8_t*>(dstGpuBuffer),
            inputW, inputH, frameW, frameH, stream);
    }
    // Use polling sync instead of cudaStreamSynchronize to avoid
    // holding nvcuda64 SRW lock continuously (WDDM deadlock prevention).
    {
        cudaError_t err = cudaStreamQuery(stream);
        while (err == cudaErrorNotReady) {
            std::this_thread::yield(); // portable (was Windows-only Sleep(0))
            err = cudaStreamQuery(stream);
        }
        if (err != cudaSuccess) {
            // Kernel/stream failure — clear the sticky error and report
            // failure so the caller falls back to CPU preprocessing.
            cudaGetLastError();
            return result;
        }
    }
    // (No registry lock to release — data kept alive by refcount)
    // Log activation once
    if (!m_nv12ActiveLogged) {
        m_nv12ActiveLogged = true;
        const char* mode = useZeroCopy ? "CUDA zero-copy" : "CPU upload";
        logger.LogInfo(std::string(tag) + "::NV12Helper",
                       std::string(mode) + " ACTIVE: " +
                       std::to_string(frameW) + "x" + std::to_string(frameH) +
                       " NV12 -> " + std::to_string(inputW) + "x" + std::to_string(inputH) +
                       " CHW " + (isFloat32 ? "float32" : "uint8"),
                       __FILE__, __LINE__);
    }
    result.succeeded = true;
    result.metaWidth = static_cast<float>(inputImage.cols > 0 ? inputImage.cols : frameW);
    result.metaHeight = static_cast<float>(inputImage.rows > 0 ? inputImage.rows : frameH);
    return result;
}
} // namespace ANSCENTER