1044 lines
43 KiB
C++
1044 lines
43 KiB
C++
#include "NV12PreprocessHelper.h"
|
|
#include "ANSGpuFrameRegistry.h"
|
|
#include "ANSEngineCommon.h"
|
|
#include <opencv2/cudaimgproc.hpp>
|
|
#include <opencv2/cudawarping.hpp>
|
|
#include <opencv2/core/cuda_stream_accessor.hpp>
|
|
#include <cstring>
|
|
|
|
#ifdef _WIN32
#define WIN32_LEAN_AND_MEAN
#define NOMINMAX
#include <windows.h>
// SEH-protected memcpy: returns false on access violation instead of crashing.
// Used for NV12 plane data which may originate from D3D11VA staging textures.
// NOTE: body must stay free of C++ objects needing unwinding — __try/__except
// cannot share a frame with C++ destructors under MSVC.
static bool safeMemcpy(void* dst, const void* src, size_t bytes) {
    __try {
        std::memcpy(dst, src, bytes);
        return true;
    }
    __except (EXCEPTION_EXECUTE_HANDLER) {
        return false;
    }
}
// Check if a memory range is readable (non-crashing probe via VirtualQuery).
//
// Fix: the previous version inspected only the VM region containing `ptr`,
// so a range that spanned multiple regions (e.g. a committed page followed
// by a guard or reserved page) was reported readable. Walk every region
// covering [ptr, ptr + bytes) and require all of them to be committed and
// readable.
static bool isMemoryReadable(const void* ptr, size_t bytes) {
    if (!ptr || bytes == 0) return false;
    const DWORD readFlags = PAGE_READONLY | PAGE_READWRITE | PAGE_EXECUTE_READ | PAGE_EXECUTE_READWRITE;
    const BYTE* cur = static_cast<const BYTE*>(ptr);
    const BYTE* const end = cur + bytes;
    while (cur < end) {
        MEMORY_BASIC_INFORMATION mbi{};
        if (VirtualQuery(cur, &mbi, sizeof(mbi)) == 0) return false;
        if (mbi.State != MEM_COMMIT) return false;
        if (mbi.Protect & (PAGE_NOACCESS | PAGE_GUARD)) return false;
        if ((mbi.Protect & readFlags) == 0) return false;
        // Advance to the first byte past this VM region.
        cur = static_cast<const BYTE*>(mbi.BaseAddress) + mbi.RegionSize;
    }
    return true;
}
#else
// POSIX fallback: plain memcpy — a truly bad pointer will fault as usual.
static bool safeMemcpy(void* dst, const void* src, size_t bytes) {
    std::memcpy(dst, src, bytes);
    return true;
}
// POSIX fallback: no cheap non-faulting probe exists; reject only the
// trivially invalid cases (null pointer / empty range) so the contract
// matches the Windows implementation for those inputs.
static bool isMemoryReadable(const void* ptr, size_t bytes) {
    return ptr != nullptr && bytes != 0;
}
#endif
|
|
|
|
// NV12→RGB CUDA kernel (nv12_to_rgb.cu)
// Host-side prototypes for kernels implemented in nv12_to_rgb.cu.
// All pitches are in bytes; each launch is queued on the supplied stream.
// (Kernel bodies are not visible here — descriptions below follow how the
// helpers in this file use them.)

// Plain NV12→RGB color conversion at source resolution (no resize).
extern "C" void launchNV12ToRGB(
    const uint8_t* yPlane, int yPitch,
    const uint8_t* uvPlane, int uvPitch,
    uint8_t* rgbOut, int outPitch,
    int width, int height,
    cudaStream_t stream);

// Fused NV12→RGB + letterbox resize: content scaled to unpadW x unpadH
// inside an outW x outH canvas; invScale = 1/scale for coordinate mapping.
extern "C" void launchNV12ToRGBLetterbox(
    const uint8_t* yPlane, int yPitch,
    const uint8_t* uvPlane, int uvPitch,
    uint8_t* rgbOut, int outPitch,
    int outW, int outH,
    int srcW, int srcH,
    int unpadW, int unpadH,
    float invScale,
    cudaStream_t stream);

// As launchNV12ToRGBLetterbox, but with explicit padLeft/padTop offsets
// so the scaled content can be centered (used by the SCRFD launcher).
extern "C" void launchNV12ToRGBCenterLetterbox(
    const uint8_t* yPlane, int yPitch,
    const uint8_t* uvPlane, int uvPitch,
    uint8_t* rgbOut, int outPitch,
    int outW, int outH,
    int srcW, int srcH,
    int unpadW, int unpadH,
    float invScale,
    int padLeft, int padTop,
    cudaStream_t stream);

// NV12→BGR with a 2x3 inverse affine transform (m00..m12) applied per
// output pixel — used for face alignment in tryNV12AffineWarp().
extern "C" void launchNV12AffineWarpToBGR(
    const uint8_t* yPlane, int yPitch,
    const uint8_t* uvPlane, int uvPitch,
    uint8_t* bgrOut, int outPitch,
    int outW, int outH,
    int srcW, int srcH,
    float m00, float m01, float m02,
    float m10, float m11, float m12,
    cudaStream_t stream);

// Fused NV12→RGB + direct stretch resize to outW x outH (no letterbox;
// aspect ratio is not preserved) — used by the classification launcher.
extern "C" void launchNV12ToRGBResize(
    const uint8_t* yPlane, int yPitch,
    const uint8_t* uvPlane, int uvPitch,
    uint8_t* rgbOut, int outPitch,
    int outW, int outH,
    int srcW, int srcH,
    cudaStream_t stream);

// Stretch resize variant writing planar CHW uint8 (packed, no out pitch).
extern "C" void launchNV12ToRGBResizeCHW_uint8(
    const uint8_t* yPlane, int yPitch,
    const uint8_t* uvPlane, int uvPitch,
    uint8_t* chwOut,
    int outW, int outH,
    int srcW, int srcH,
    cudaStream_t stream);

// Stretch resize variant writing planar CHW float32 (packed, no out pitch).
extern "C" void launchNV12ToRGBResizeCHW_float(
    const uint8_t* yPlane, int yPitch,
    const uint8_t* uvPlane, int uvPitch,
    float* chwOut,
    int outW, int outH,
    int srcW, int srcH,
    cudaStream_t stream);

// Fused NV12→BGR + direct stretch resize (interleaved HWC output) —
// used by nv12ToBGRResize() for OCR detection.
extern "C" void launchNV12ToBGRResize(
    const uint8_t* yPlane, int yPitch,
    const uint8_t* uvPlane, int uvPitch,
    uint8_t* bgrOut, int outPitch,
    int outW, int outH,
    int srcW, int srcH,
    cudaStream_t stream);

// NV12→BGR crop: converts the outW x outH window anchored at
// (cropX, cropY) in the source frame — used by tryNV12CropToBGR().
extern "C" void launchNV12CropToBGR(
    const uint8_t* yPlane, int yPitch,
    const uint8_t* uvPlane, int uvPitch,
    uint8_t* bgrOut, int outPitch,
    int outW, int outH,
    int cropX, int cropY,
    int srcW, int srcH,
    cudaStream_t stream);
|
|
|
|
namespace ANSCENTER {
|
|
|
|
// Releases the pinned host staging buffer (via destroy()) when the
// helper is torn down.
NV12PreprocessHelper::~NV12PreprocessHelper() {
    destroy();
}
|
|
|
|
void NV12PreprocessHelper::destroy() {
|
|
if (m_pinnedBuf) {
|
|
cudaFreeHost(m_pinnedBuf);
|
|
m_pinnedBuf = nullptr;
|
|
m_pinnedBufSize = 0;
|
|
}
|
|
}
|
|
|
|
void NV12PreprocessHelper::ensurePinnedBuffer(size_t bytes, SPDLogger& logger, const char* tag) {
|
|
if (bytes <= m_pinnedBufSize) return;
|
|
if (m_pinnedBuf) {
|
|
cudaFreeHost(m_pinnedBuf);
|
|
m_pinnedBuf = nullptr;
|
|
m_pinnedBufSize = 0;
|
|
}
|
|
cudaError_t err = cudaHostAlloc(&m_pinnedBuf, bytes, cudaHostAllocPortable);
|
|
if (err != cudaSuccess) {
|
|
if (!m_nv12PinnedLogged) {
|
|
m_nv12PinnedLogged = true;
|
|
logger.LogWarn(std::string(tag) + "::NV12Helper",
|
|
"cudaHostAlloc failed (" + std::string(cudaGetErrorString(err)) +
|
|
") for " + std::to_string(bytes) + " bytes",
|
|
__FILE__, __LINE__);
|
|
}
|
|
m_pinnedBuf = nullptr;
|
|
m_pinnedBufSize = 0;
|
|
return;
|
|
}
|
|
m_pinnedBufSize = bytes;
|
|
}
|
|
|
|
// Probes CUDA health by querying the default stream. Returns true while
// the context is usable. After CUDA_FAIL_LIMIT consecutive failures the
// context is declared permanently dead (m_cudaContextDead) and a fatal
// diagnostic is logged once; from then on this always returns false and
// GPU preprocessing stays disabled for this engine instance.
bool NV12PreprocessHelper::isCudaContextHealthy(SPDLogger& logger, const char* tag) {
    if (m_cudaContextDead) return false;

    // Read-and-clear any sticky error from earlier async work so it does
    // not mask the stream query below; value is kept for the log message.
    cudaError_t pending = cudaGetLastError();
    cudaError_t err = cudaStreamQuery(nullptr);
    if (err == cudaSuccess || err == cudaErrorNotReady) {
        // cudaErrorNotReady just means work is still in flight — healthy.
        m_cudaFailStreak = 0;
        return true;
    }

    // Query failed: count it, and declare the context dead once the
    // streak reaches the limit.
    ++m_cudaFailStreak;
    if (m_cudaFailStreak >= CUDA_FAIL_LIMIT && !m_cudaContextDead) {
        m_cudaContextDead = true;
        logger.LogFatal(std::string(tag) + "::NV12Helper",
                        "CUDA context permanently corrupted after " +
                        std::to_string(m_cudaFailStreak) +
                        " consecutive failures (last error: " +
                        std::string(cudaGetErrorString(err)) +
                        ", pending: " + std::string(cudaGetErrorString(pending)) +
                        "). GPU preprocessing DISABLED for this engine instance. "
                        "This is typically caused by NVDEC/CUVID surface pool exhaustion "
                        "(Error mapping a picture with CUVID: 205). "
                        "Reduce the number of concurrent HW-decoded 4K streams or "
                        "restart the application to recover.",
                        __FILE__, __LINE__);
    }
    return false;
}
|
|
|
|
// ====================================================================
|
|
// tryNV12 — attempt NV12 fast-path preprocessing
|
|
// ====================================================================
|
|
// Attempts to preprocess the current frame straight from decoder output
// (NV12 pixFmt=23, or full-res CPU BGR pixFmt=1000) instead of the
// caller-supplied display-resolution BGR Mat.
//
// Parameters:
//   inputImage   — display-resolution BGR image; only its dimensions are
//                  read here (they define the metadata coordinate space)
//   inferenceGpu — CUDA device index the inference engine runs on
//   inputW/H     — model input resolution
//   launcher     — engine-specific kernel wrapper (letterbox/resize/...)
//
// Returns NV12Result. result.succeeded == false (and !useBgrFullRes)
// means "no fast path available — use the normal BGR pipeline"; it is
// never a hard error for the caller.
NV12Result NV12PreprocessHelper::tryNV12(
    const cv::Mat& inputImage, int inferenceGpu,
    int inputW, int inputH,
    const NV12KernelLauncher& launcher,
    SPDLogger& logger, const char* tag) {
    NV12Result result;

    // Skip during warmup phase
    if (m_inferenceCount < NV12_WARMUP_THRESHOLD) {
        return result;
    }

    // Get NV12 frame data from thread-local (set by RunInferenceComplete_LV)
    GpuFrameData* gpuData = tl_currentGpuFrame();

    if (!gpuData || !gpuData->yPlane) {
        return result; // no NV12 data — caller uses BGR
    }

    // ── BGR full-res path (pixFmt=1000 from ANSVideoPlayer) ──────────
    if (gpuData->pixelFormat == 1000) {
        // For this format yPlane holds a packed CPU BGR image, not an
        // NV12 Y plane; yLinesize is its row stride in bytes.
        cv::Mat fullResImg(gpuData->height, gpuData->width, CV_8UC3,
                           gpuData->yPlane, gpuData->yLinesize);
        if (!m_bgrFullResLogged) {  // one-shot log
            m_bgrFullResLogged = true;
            logger.LogInfo(std::string(tag) + "::NV12Helper",
                           "BGR full-res ACTIVE: " + std::to_string(gpuData->width) + "x" +
                           std::to_string(gpuData->height) + " -> inference (display=" +
                           std::to_string(inputImage.cols) + "x" + std::to_string(inputImage.rows) + ")",
                           __FILE__, __LINE__);
        }
        result.useBgrFullRes = true;
        result.bgrFullResImg = fullResImg.clone(); // clone under lock
        // Record display→full-res scale factors when the two differ so
        // detections can be mapped back to display coordinates.
        if (inputImage.cols > 0 && inputImage.rows > 0 &&
            (fullResImg.cols != inputImage.cols || fullResImg.rows != inputImage.rows)) {
            result.bgrFullResScaleX = static_cast<float>(inputImage.cols) / static_cast<float>(fullResImg.cols);
            result.bgrFullResScaleY = static_cast<float>(inputImage.rows) / static_cast<float>(fullResImg.rows);
        }
        // (No registry lock — data kept alive by refcount)
        return result;
    }

    // ── NV12 path (pixFmt=23 = AV_PIX_FMT_NV12) ────────────────────
    if (gpuData->pixelFormat != 23) {
        return result; // unknown format — fall back to BGR
    }

    if (!gpuData->uvPlane) {
        if (!m_nv12NullLogged) {  // one-shot log
            m_nv12NullLogged = true;
            logger.LogWarn(std::string(tag) + "::NV12Helper",
                           "Early exit: null uvPlane",
                           __FILE__, __LINE__);
        }
        return result;
    }

    const int frameW = gpuData->width;
    const int frameH = gpuData->height;

    if (frameW <= 0 || frameH <= 0) {
        if (!m_nv12DimLogged) {  // one-shot log
            m_nv12DimLogged = true;
            logger.LogWarn(std::string(tag) + "::NV12Helper",
                           "Early exit: bad dimensions — w=" + std::to_string(frameW) +
                           " h=" + std::to_string(frameH),
                           __FILE__, __LINE__);
        }
        return result;
    }

    if (m_cudaContextDead) {
        if (!m_nv12DeadLogged) {  // one-shot log
            m_nv12DeadLogged = true;
            logger.LogWarn(std::string(tag) + "::NV12Helper",
                           "Early exit: CUDA context dead",
                           __FILE__, __LINE__);
        }
        return result;
    }

    // Ensure async D2D copy (NVDEC → pool buffer) has completed before
    // reading yPlane/uvPlane. The copy was queued in gpu_frame_attach_cuda()
    // on a non-blocking stream. By the time inference runs (~50-200ms later),
    // the copy (~0.3ms) has long finished, so this sync returns immediately.
    if (gpuData->d2dCopyStream) {
        cudaStreamSynchronize(static_cast<cudaStream_t>(gpuData->d2dCopyStream));
        gpuData->d2dCopyStream = nullptr; // Only sync once per frame
    }

    // Zero-copy is possible only when the planes are CUDA device pointers
    // on the inference GPU (or the decode GPU index is unknown, < 0).
    const bool isCudaDevice = gpuData->isCudaDevicePtr;
    const bool gpuMatch = !isCudaDevice ||
                          gpuData->gpuIndex < 0 ||
                          gpuData->gpuIndex == inferenceGpu;
    const bool useZeroCopy = isCudaDevice && gpuMatch;

    // --- Debug: log pointer state before reading ---
#if defined(ANSCORE_GPU_DEBUG) && ANSCORE_GPU_DEBUG
    {
        char _nv12_dbg[512];
        snprintf(_nv12_dbg, sizeof(_nv12_dbg),
                 "[NV12Helper] tryNV12: gpuData=%p yPlane=%p uvPlane=%p isCuda=%d "
                 "gpuIdx=%d infGpu=%d gpuMatch=%d zeroCopy=%d "
                 "gpuCacheY=%p gpuCacheUV=%p gpuCacheValid=%d refcount=%d %dx%d\n",
                 (void*)gpuData, (void*)gpuData->yPlane, (void*)gpuData->uvPlane,
                 (int)isCudaDevice, gpuData->gpuIndex, inferenceGpu,
                 (int)gpuMatch, (int)useZeroCopy,
                 gpuData->gpuCacheY, gpuData->gpuCacheUV,
                 (int)gpuData->gpuCacheValid,
                 gpuData->refcount.load(),
                 frameW, frameH);
#ifdef _WIN32
        OutputDebugStringA(_nv12_dbg);
#endif
        fprintf(stderr, "%s", _nv12_dbg);
    }
#endif

    // Effective plane pointers — for zero-copy, use CUDA device ptrs;
    // for CPU upload, use the CPU snapshot buffers.
    uint8_t* effYPlane;
    uint8_t* effUvPlane;
    int effYLinesize;
    int effUvLinesize;

    if (useZeroCopy) {
        // Same GPU: wrap owned CUDA device pointers directly
        effYPlane = gpuData->yPlane;
        effUvPlane = gpuData->uvPlane;
        effYLinesize = gpuData->yLinesize;
        effUvLinesize = gpuData->uvLinesize;
    } else if (isCudaDevice && !gpuMatch) {
        // Cross-GPU: can't use CUDA device ptrs from another GPU context.
        // Fall through to CPU upload using the CPU snapshot buffers.
        if (gpuData->cpuYPlane && gpuData->cpuUvPlane) {
            effYPlane = gpuData->cpuYPlane;
            effUvPlane = gpuData->cpuUvPlane;
            effYLinesize = gpuData->cpuYLinesize;
            effUvLinesize = gpuData->cpuUvLinesize;
            if (!m_gpuMismatchLogged) {  // one-shot log
                m_gpuMismatchLogged = true;
                logger.LogInfo(std::string(tag) + "::NV12Helper",
                               "GPU mismatch (decode GPU " + std::to_string(gpuData->gpuIndex) +
                               " vs inference GPU " + std::to_string(inferenceGpu) +
                               ") — using CPU NV12 upload fallback",
                               __FILE__, __LINE__);
            }
        } else {
            // No CPU fallback available — fall back to BGR
            if (!m_gpuMismatchLogged) {  // one-shot log
                m_gpuMismatchLogged = true;
                logger.LogInfo(std::string(tag) + "::NV12Helper",
                               "GPU mismatch (decode GPU " + std::to_string(gpuData->gpuIndex) +
                               " vs inference GPU " + std::to_string(inferenceGpu) +
                               ") — no CPU fallback, using BGR",
                               __FILE__, __LINE__);
            }
            return result;
        }
    } else {
        // Not CUDA device ptrs: use whatever yPlane/uvPlane point to (CPU data)
        effYPlane = gpuData->yPlane;
        effUvPlane = gpuData->uvPlane;
        effYLinesize = gpuData->yLinesize;
        effUvLinesize = gpuData->uvLinesize;
    }

    // Log path selection once — both spdlog and DebugView
    if (!m_nv12PathLogged) {
        m_nv12PathLogged = true;
        const char* pathName = useZeroCopy ? "CUDA_ZERO_COPY" : "CPU_NV12_UPLOAD";
        auto msg = std::string("Path: ") + pathName +
                   " | isCuda=" + std::to_string(isCudaDevice) +
                   " gpuMatch=" + std::to_string(gpuMatch) +
                   " decodeGpu=" + std::to_string(gpuData->gpuIndex) +
                   " infGpu=" + std::to_string(inferenceGpu) +
                   " frame=" + std::to_string(frameW) + "x" + std::to_string(frameH);
        logger.LogInfo(std::string(tag) + "::NV12Helper", msg, __FILE__, __LINE__);
#ifdef _WIN32
        auto dbg = std::string("[NV12 ACTIVE] ") + tag + " " + msg + "\n";
        OutputDebugStringA(dbg.c_str());
#endif
    }

    cv::cuda::Stream stream;
    cv::cuda::GpuMat gpuY, gpuUV;

    if (useZeroCopy) {
        // Wrap the device planes in place (no copy). The NV12 UV plane is
        // half-height and frameW bytes wide (interleaved U/V).
        gpuY = cv::cuda::GpuMat(frameH, frameW, CV_8UC1,
                                effYPlane, static_cast<size_t>(effYLinesize));
        gpuUV = cv::cuda::GpuMat(frameH / 2, frameW, CV_8UC1,
                                 effUvPlane, static_cast<size_t>(effUvLinesize));
    } else {
        // CPU path: memcpy + upload
        const size_t ySize = static_cast<size_t>(frameW) * frameH;
        const size_t uvSize = static_cast<size_t>(frameW) * frameH / 2;
        const size_t nv12Size = ySize + uvSize;
        ensurePinnedBuffer(nv12Size, logger, tag);
        if (!m_pinnedBuf) {
            return result;  // pinned allocation failed — fall back to BGR
        }

        // Validate NV12 plane pointers before memcpy
        const size_t yBufNeeded = (effYLinesize == frameW)
                                      ? ySize
                                      : static_cast<size_t>(effYLinesize) * frameH;
        const size_t uvBufNeeded = (effUvLinesize == frameW)
                                       ? uvSize
                                       : static_cast<size_t>(effUvLinesize) * (frameH / 2);

        // Probe only the first 4 KiB of each plane — cheap, and catches
        // stale/garbage pointers without walking the whole buffer.
        if (!isMemoryReadable(effYPlane, std::min(yBufNeeded, (size_t)4096)) ||
            !isMemoryReadable(effUvPlane, std::min(uvBufNeeded, (size_t)4096))) {
            logger.LogWarn(std::string(tag) + "::NV12Helper",
                           "NV12 plane pointers not readable — falling back to BGR",
                           __FILE__, __LINE__);
            return result;
        }

        uint8_t* dst = static_cast<uint8_t*>(m_pinnedBuf);
        bool cpyOk = true;
        // Y plane: one-shot copy when rows are tightly packed, otherwise
        // row-by-row to strip the stride padding.
        if (effYLinesize == frameW) {
            cpyOk = safeMemcpy(dst, effYPlane, ySize);
        } else {
            for (int row = 0; row < frameH && cpyOk; row++)
                cpyOk = safeMemcpy(dst + row * frameW,
                                   effYPlane + row * effYLinesize, frameW);
        }
        if (cpyOk) {
            // Interleaved UV plane follows Y in the pinned buffer.
            uint8_t* uvDst = dst + ySize;
            if (effUvLinesize == frameW) {
                cpyOk = safeMemcpy(uvDst, effUvPlane, uvSize);
            } else {
                for (int row = 0; row < frameH / 2 && cpyOk; row++)
                    cpyOk = safeMemcpy(uvDst + row * frameW,
                                       effUvPlane + row * effUvLinesize, frameW);
            }
        }

        if (!cpyOk) {
            logger.LogWarn(std::string(tag) + "::NV12Helper",
                           "Access violation during NV12 memcpy — falling back to BGR",
                           __FILE__, __LINE__);
            return result;
        }

        // NV12 data safely in pinned memory — release registry lock
        // (No registry lock to release — data kept alive by refcount)

        cv::Mat pinnedY(frameH, frameW, CV_8UC1, m_pinnedBuf);
        cv::Mat pinnedUV(frameH / 2, frameW, CV_8UC1,
                         static_cast<uint8_t*>(m_pinnedBuf) + ySize);
        gpuY.upload(pinnedY, stream);
        gpuUV.upload(pinnedUV, stream);
    }

    // Use display dimensions for coordinate mapping
    const float metaW = static_cast<float>(inputImage.cols > 0 ? inputImage.cols : frameW);
    const float metaH = static_cast<float>(inputImage.rows > 0 ? inputImage.rows : frameH);

    if (metaH <= 0 || metaW <= 0) {
        if (!m_nv12MetaLogged) {  // one-shot log
            m_nv12MetaLogged = true;
            logger.LogWarn(std::string(tag) + "::NV12Helper",
                           "Early exit: metadata dims invalid",
                           __FILE__, __LINE__);
        }
        return result;
    }

    result.metaWidth = metaW;
    result.metaHeight = metaH;
    // Letterbox ratio expressed in the display coordinate space.
    result.ratio = 1.f / std::min(
                             static_cast<float>(inputW) / metaW,
                             static_cast<float>(inputH) / metaH);

    // Launch engine-specific kernel
    cv::cuda::GpuMat gpuResized;
    gpuResized.create(inputH, inputW, CV_8UC3);

    cudaStream_t rawStream = cv::cuda::StreamAccessor::getStream(stream);
#if defined(ANSCORE_GPU_DEBUG) && ANSCORE_GPU_DEBUG
    {
        char _nv12_dbg2[256];
        snprintf(_nv12_dbg2, sizeof(_nv12_dbg2),
                 "[NV12Helper] KERNEL LAUNCH: gpuY=%p(%dx%d) gpuUV=%p(%dx%d) -> %dx%d zeroCopy=%d\n",
                 (void*)gpuY.data, gpuY.cols, gpuY.rows,
                 (void*)gpuUV.data, gpuUV.cols, gpuUV.rows,
                 inputW, inputH, (int)useZeroCopy);
#ifdef _WIN32
        OutputDebugStringA(_nv12_dbg2);
#endif
        fprintf(stderr, "%s", _nv12_dbg2);
    }
#endif
    launcher(gpuY, gpuUV, gpuResized, frameW, frameH, inputW, inputH, rawStream);

    // Block until the (possibly queued) upload and conversion finish so
    // the caller may consume gpuRGB immediately.
    stream.waitForCompletion();

    // (No registry lock to release — data kept alive by refcount)

    // Log NV12 fast-path usage once per instance
    if (!m_nv12ActiveLogged) {
        m_nv12ActiveLogged = true;
        const char* mode = useZeroCopy ? "CUDA zero-copy" : "CPU upload";
        logger.LogInfo(std::string(tag) + "::NV12Helper",
                       std::string(mode) + " ACTIVE: " +
                       std::to_string(frameW) + "x" + std::to_string(frameH) +
                       " NV12 -> " + std::to_string(inputW) + "x" + std::to_string(inputH) +
                       " display=" + std::to_string(static_cast<int>(metaW)) + "x" +
                       std::to_string(static_cast<int>(metaH)),
                       __FILE__, __LINE__);
    }

    result.succeeded = true;
    result.gpuRGB = std::move(gpuResized);
    return result;
}
|
|
|
|
// ====================================================================
|
|
// Default YOLO kernel launcher
|
|
//
|
|
// For detection/seg/pose/OBB: fused NV12→RGB + letterbox in a single
|
|
// kernel at model input resolution (e.g. 640x640).
|
|
// For classification: separate NV12→RGB at full res, then cuda::resize.
|
|
// (Classification detection is not done here — the caller's engine
|
|
// should set up the appropriate launcher if needed. This default
|
|
// always uses the fused letterbox path, which works for all YOLO
|
|
// detection/seg/pose/OBB tasks.)
|
|
// ====================================================================
|
|
NV12KernelLauncher NV12PreprocessHelper::defaultYOLOLauncher() {
|
|
return [](const cv::cuda::GpuMat& gpuY, const cv::cuda::GpuMat& gpuUV,
|
|
cv::cuda::GpuMat& gpuOut, int srcW, int srcH,
|
|
int inputW, int inputH, cudaStream_t stream) {
|
|
if (srcW == inputW && srcH == inputH) {
|
|
// Source matches model input — direct NV12→RGB, no resize needed
|
|
launchNV12ToRGB(
|
|
gpuY.ptr<uint8_t>(), static_cast<int>(gpuY.step),
|
|
gpuUV.ptr<uint8_t>(), static_cast<int>(gpuUV.step),
|
|
gpuOut.ptr<uint8_t>(), static_cast<int>(gpuOut.step),
|
|
srcW, srcH, stream);
|
|
} else {
|
|
// Fused NV12→RGB + letterbox resize
|
|
float r = std::min(static_cast<float>(inputW) / srcW,
|
|
static_cast<float>(inputH) / srcH);
|
|
int unpadW = static_cast<int>(r * srcW);
|
|
int unpadH = static_cast<int>(r * srcH);
|
|
float invScale = 1.0f / r;
|
|
|
|
launchNV12ToRGBLetterbox(
|
|
gpuY.ptr<uint8_t>(), static_cast<int>(gpuY.step),
|
|
gpuUV.ptr<uint8_t>(), static_cast<int>(gpuUV.step),
|
|
gpuOut.ptr<uint8_t>(), static_cast<int>(gpuOut.step),
|
|
inputW, inputH,
|
|
srcW, srcH,
|
|
unpadW, unpadH,
|
|
invScale, stream);
|
|
}
|
|
};
|
|
}
|
|
|
|
// ====================================================================
|
|
// Classification launcher (direct resize, no letterbox)
|
|
// ====================================================================
|
|
NV12KernelLauncher NV12PreprocessHelper::classificationLauncher() {
|
|
return [](const cv::cuda::GpuMat& gpuY, const cv::cuda::GpuMat& gpuUV,
|
|
cv::cuda::GpuMat& gpuOut, int srcW, int srcH,
|
|
int inputW, int inputH, cudaStream_t stream) {
|
|
if (srcW == inputW && srcH == inputH) {
|
|
// Source matches model input — direct NV12→RGB, no resize needed
|
|
launchNV12ToRGB(
|
|
gpuY.ptr<uint8_t>(), static_cast<int>(gpuY.step),
|
|
gpuUV.ptr<uint8_t>(), static_cast<int>(gpuUV.step),
|
|
gpuOut.ptr<uint8_t>(), static_cast<int>(gpuOut.step),
|
|
srcW, srcH, stream);
|
|
} else {
|
|
// Fused NV12→RGB + direct resize (stretch to fill, no padding)
|
|
launchNV12ToRGBResize(
|
|
gpuY.ptr<uint8_t>(), static_cast<int>(gpuY.step),
|
|
gpuUV.ptr<uint8_t>(), static_cast<int>(gpuUV.step),
|
|
gpuOut.ptr<uint8_t>(), static_cast<int>(gpuOut.step),
|
|
inputW, inputH,
|
|
srcW, srcH, stream);
|
|
}
|
|
};
|
|
}
|
|
|
|
// ====================================================================
|
|
// NV12→BGR fused resize (for OCR detection)
|
|
// ====================================================================
|
|
void NV12PreprocessHelper::nv12ToBGRResize(
|
|
const uint8_t* devY, int yPitch,
|
|
const uint8_t* devUV, int uvPitch,
|
|
uint8_t* bgrOut, int outPitch,
|
|
int outW, int outH, int srcW, int srcH,
|
|
cudaStream_t stream) {
|
|
launchNV12ToBGRResize(devY, yPitch, devUV, uvPitch,
|
|
bgrOut, outPitch, outW, outH, srcW, srcH, stream);
|
|
}
|
|
|
|
// SCRFD center-padded letterbox launcher
|
|
// ====================================================================
|
|
NV12KernelLauncher NV12PreprocessHelper::scrfdCenterLetterboxLauncher(int padLeft, int padTop) {
|
|
return [padLeft, padTop](const cv::cuda::GpuMat& gpuY, const cv::cuda::GpuMat& gpuUV,
|
|
cv::cuda::GpuMat& gpuOut, int srcW, int srcH,
|
|
int inputW, int inputH, cudaStream_t stream) {
|
|
float r = std::min(static_cast<float>(inputW) / srcW,
|
|
static_cast<float>(inputH) / srcH);
|
|
int unpadW = static_cast<int>(r * srcW);
|
|
int unpadH = static_cast<int>(r * srcH);
|
|
float invScale = 1.0f / r;
|
|
|
|
launchNV12ToRGBCenterLetterbox(
|
|
gpuY.ptr<uint8_t>(), static_cast<int>(gpuY.step),
|
|
gpuUV.ptr<uint8_t>(), static_cast<int>(gpuUV.step),
|
|
gpuOut.ptr<uint8_t>(), static_cast<int>(gpuOut.step),
|
|
inputW, inputH,
|
|
srcW, srcH,
|
|
unpadW, unpadH,
|
|
invScale,
|
|
padLeft, padTop,
|
|
stream);
|
|
};
|
|
}
|
|
|
|
// ====================================================================
|
|
// NV12 affine warp for face alignment
|
|
//
|
|
// Reads full-res NV12 from the registry, applies inverse affine
|
|
// transform via CUDA kernel, outputs a small aligned face in BGR.
|
|
// The affine matrix is computed on CPU (from detected landmarks to
|
|
// ArcFace template), then scaled to map to full-res NV12 source.
|
|
// ====================================================================
|
|
// Warps an aligned face crop directly out of the full-res NV12 frame.
//
// Parameters:
//   inputImage   — display-res BGR image; currently unused in this
//                  function body (kept for signature parity with tryNV12)
//   inferenceGpu — CUDA device index used by the inference engine
//   affineMatrix — 2x3 forward affine (landmarks → template) computed on
//                  CPU in display coordinates
//   outW/outH    — aligned output size (e.g. 112x112 for ArcFace)
//   scaleX/Y     — display→full-res NV12 scale factors applied to the
//                  inverted matrix
//
// Returns NV12AffineResult; succeeded == false means "fall back to CPU
// warpAffine" and is never a hard error for the caller.
NV12PreprocessHelper::NV12AffineResult NV12PreprocessHelper::tryNV12AffineWarp(
    const cv::Mat& inputImage, int inferenceGpu,
    const cv::Mat& affineMatrix, int outW, int outH,
    float scaleX, float scaleY,
    SPDLogger& logger, const char* tag) {
    NV12AffineResult result;

    // Skip during warmup
    if (m_inferenceCount < NV12_WARMUP_THRESHOLD) {
        return result;
    }

    if (affineMatrix.empty()) {
        return result;
    }

    // Get NV12 frame data from thread-local (set by RunInferenceComplete_LV)
    GpuFrameData* gpuData = tl_currentGpuFrame();

    // Only the NV12 format (pixFmt=23) is handled here.
    if (!gpuData || !gpuData->yPlane || !gpuData->uvPlane || gpuData->pixelFormat != 23) {
        return result;
    }

    const int frameW = gpuData->width;
    const int frameH = gpuData->height;
    if (frameW <= 0 || frameH <= 0 || m_cudaContextDead) {
        return result;
    }

    // Ensure async D2D copy has completed before reading NV12 buffers
    if (gpuData->d2dCopyStream) {
        cudaStreamSynchronize(static_cast<cudaStream_t>(gpuData->d2dCopyStream));
        gpuData->d2dCopyStream = nullptr;
    }

    const bool isCudaDevice = gpuData->isCudaDevicePtr;
    const bool gpuMatch = !isCudaDevice ||
                          gpuData->gpuIndex < 0 ||
                          gpuData->gpuIndex == inferenceGpu;

    if (isCudaDevice && !gpuMatch) {
        return result; // cross-GPU — fall back to CPU warpAffine
    }

    const bool useZeroCopy = isCudaDevice && gpuMatch;
    uint8_t* effYPlane = gpuData->yPlane;
    uint8_t* effUvPlane = gpuData->uvPlane;
    int effYLinesize = gpuData->yLinesize;
    int effUvLinesize = gpuData->uvLinesize;

    // Compute inverse affine matrix and scale for full-res NV12
    cv::Mat M_inv;
    cv::invertAffineTransform(affineMatrix, M_inv);
    M_inv.convertTo(M_inv, CV_64F);

    // Scale inverse affine to map from 112x112 output → full-res NV12 source
    // Row 0 maps to X (multiply by scaleX), Row 1 maps to Y (multiply by scaleY)
    double* row0 = M_inv.ptr<double>(0);
    double* row1 = M_inv.ptr<double>(1);
    row0[0] *= scaleX; row0[1] *= scaleX; row0[2] *= scaleX;
    row1[0] *= scaleY; row1[1] *= scaleY; row1[2] *= scaleY;

    // Flatten the scaled 2x3 matrix into the kernel's scalar arguments.
    float m00 = static_cast<float>(row0[0]);
    float m01 = static_cast<float>(row0[1]);
    float m02 = static_cast<float>(row0[2]);
    float m10 = static_cast<float>(row1[0]);
    float m11 = static_cast<float>(row1[1]);
    float m12 = static_cast<float>(row1[2]);

    cv::cuda::Stream stream;
    cv::cuda::GpuMat gpuY, gpuUV;

    if (useZeroCopy) {
        // Wrap the device planes in place (no copy).
        gpuY = cv::cuda::GpuMat(frameH, frameW, CV_8UC1,
                                effYPlane, static_cast<size_t>(effYLinesize));
        gpuUV = cv::cuda::GpuMat(frameH / 2, frameW, CV_8UC1,
                                 effUvPlane, static_cast<size_t>(effUvLinesize));
    } else {
        // CPU path: memcpy + upload
        // NOTE(review): unlike tryNV12(), this path does not probe the plane
        // pointers with isMemoryReadable() before copying — confirm whether
        // that guard was intentionally omitted here.
        const size_t ySize = static_cast<size_t>(frameW) * frameH;
        const size_t uvSize = static_cast<size_t>(frameW) * frameH / 2;
        const size_t nv12Size = ySize + uvSize;
        ensurePinnedBuffer(nv12Size, logger, tag);
        if (!m_pinnedBuf) {
            return result;  // pinned allocation failed — fall back
        }

        uint8_t* dst = static_cast<uint8_t*>(m_pinnedBuf);
        bool cpyOk = true;
        // Y plane: single copy when tightly packed, else strip the stride.
        if (effYLinesize == frameW) {
            cpyOk = safeMemcpy(dst, effYPlane, ySize);
        } else {
            for (int row = 0; row < frameH && cpyOk; row++)
                cpyOk = safeMemcpy(dst + row * frameW,
                                   effYPlane + row * effYLinesize, frameW);
        }
        if (cpyOk) {
            // Interleaved UV plane follows Y in the pinned buffer.
            uint8_t* uvDst = dst + ySize;
            if (effUvLinesize == frameW) {
                cpyOk = safeMemcpy(uvDst, effUvPlane, uvSize);
            } else {
                for (int row = 0; row < frameH / 2 && cpyOk; row++)
                    cpyOk = safeMemcpy(uvDst + row * frameW,
                                       effUvPlane + row * effUvLinesize, frameW);
            }
        }

        if (!cpyOk) {
            return result;  // access violation during copy — fall back
        }

        // (No registry lock to release — data kept alive by refcount)

        cv::Mat pinnedY(frameH, frameW, CV_8UC1, m_pinnedBuf);
        cv::Mat pinnedUV(frameH / 2, frameW, CV_8UC1,
                         static_cast<uint8_t*>(m_pinnedBuf) + ySize);
        gpuY.upload(pinnedY, stream);
        gpuUV.upload(pinnedUV, stream);
    }

    // Launch affine warp kernel (outputs BGR 112x112)
    cv::cuda::GpuMat gpuFace(outH, outW, CV_8UC3);
    cudaStream_t rawStream = cv::cuda::StreamAccessor::getStream(stream);

    launchNV12AffineWarpToBGR(
        gpuY.ptr<uint8_t>(), static_cast<int>(gpuY.step),
        gpuUV.ptr<uint8_t>(), static_cast<int>(gpuUV.step),
        gpuFace.ptr<uint8_t>(), static_cast<int>(gpuFace.step),
        outW, outH,
        frameW, frameH,
        m00, m01, m02, m10, m11, m12,
        rawStream);

    // Block until the warp (and any pending upload) completes.
    stream.waitForCompletion();

    // (No registry lock to release — data kept alive by refcount)

    // Keep GPU copy for recognizer (avoids re-upload), download CPU copy for mask
    result.gpuAlignedFace = gpuFace;
    gpuFace.download(result.alignedFaceBGR);
    result.succeeded = true;
    return result;
}
|
|
|
|
// ── NV12 rectangular crop → BGR ────────────────────────────────────
|
|
// Crops a padded, display-space bbox out of the full-res NV12 frame and
// returns it as a CPU BGR Mat (used e.g. for secondary models on a
// detected region).
//
// Parameters:
//   inputImage   — display-res BGR image (dims used only for logging)
//   inferenceGpu — CUDA device the crop kernel should run on
//   bbox/padding — region in display coordinates, expanded by `padding`
//   scaleX/Y     — display→full-res NV12 scale factors for the bbox
//
// Returns NV12CropResult; succeeded == false means "no GPU crop — caller
// falls back to its normal path".
NV12PreprocessHelper::NV12CropResult NV12PreprocessHelper::tryNV12CropToBGR(
    const cv::Mat& inputImage, int inferenceGpu,
    const cv::Rect& bbox, int padding,
    float scaleX, float scaleY,
    SPDLogger& logger, const char* tag)
{
    NV12CropResult result;

    // Warmup gating
    if (m_inferenceCount < NV12_WARMUP_THRESHOLD)
        return result;

    // CUDA health check
    if (m_cudaContextDead)
        return result;

    // Get NV12 frame data from thread-local (set by RunInferenceComplete_LV)
    auto* gpuData = tl_currentGpuFrame();
    if (!gpuData || gpuData->pixelFormat != 23)
        return result; // no NV12 data

    if (!gpuData->isCudaDevicePtr || !gpuData->yPlane || !gpuData->uvPlane)
        return result; // NV12 not on GPU

    // Ensure async D2D copy has completed before reading NV12 buffers
    if (gpuData->d2dCopyStream) {
        cudaStreamSynchronize(static_cast<cudaStream_t>(gpuData->d2dCopyStream));
        gpuData->d2dCopyStream = nullptr;
    }

    const int frameW = gpuData->width;
    const int frameH = gpuData->height;

    // Scale bbox from display-res to full-res NV12 coords with padding,
    // clamped to the frame bounds.
    const int fx1 = std::max(0, static_cast<int>((bbox.x - padding) * scaleX));
    const int fy1 = std::max(0, static_cast<int>((bbox.y - padding) * scaleY));
    const int fx2 = std::min(frameW, static_cast<int>((bbox.x + bbox.width + padding) * scaleX));
    const int fy2 = std::min(frameH, static_cast<int>((bbox.y + bbox.height + padding) * scaleY));
    const int cropW = fx2 - fx1;
    const int cropH = fy2 - fy1;

    // Degenerate crop — not worth a kernel launch.
    if (cropW < 5 || cropH < 5)
        return result;

    // GPU device matching
    // NOTE(review): unlike tryNV12(), there is no decode-GPU vs inference-GPU
    // guard here — the current device is switched to inferenceGpu and the
    // decode frame's device pointers are used as-is; confirm frames always
    // decode on inferenceGpu. Also the previous current device is not
    // restored before returning — verify callers don't rely on it.
    int currentDev = 0;
    cudaGetDevice(&currentDev);
    if (inferenceGpu >= 0 && currentDev != inferenceGpu)
        cudaSetDevice(inferenceGpu);

    // Wrap existing GPU NV12 planes
    cv::cuda::GpuMat gpuY(frameH, frameW, CV_8UC1,
                          gpuData->yPlane, gpuData->yLinesize);
    // UV plane wrapped as half-res CV_8UC2 here (tryNV12 uses full-width
    // CV_8UC1); the underlying byte layout and step are identical.
    cv::cuda::GpuMat gpuUV(frameH / 2, frameW / 2, CV_8UC2,
                           gpuData->uvPlane, gpuData->uvLinesize);

    // Allocate output on GPU
    cv::cuda::GpuMat gpuCrop(cropH, cropW, CV_8UC3);

    // Launch crop kernel
    cv::cuda::Stream stream;
    cudaStream_t rawStream = cv::cuda::StreamAccessor::getStream(stream);

    launchNV12CropToBGR(
        gpuY.ptr<uint8_t>(), static_cast<int>(gpuY.step),
        gpuUV.ptr<uint8_t>(), static_cast<int>(gpuUV.step),
        gpuCrop.ptr<uint8_t>(), static_cast<int>(gpuCrop.step),
        cropW, cropH,
        fx1, fy1,
        frameW, frameH,
        rawStream);

    // Block until the crop kernel finishes before downloading.
    stream.waitForCompletion();

    // (No registry lock to release — data kept alive by refcount)

    // Download small crop to CPU
    gpuCrop.download(result.bgrCrop);
    result.succeeded = true;

    // One-shot diagnostic log
    if (!m_nv12CropLogged) {
        m_nv12CropLogged = true;
        logger.LogInfo(tag,
            "NV12 GPU crop ACTIVE: " + std::to_string(cropW) + "x" + std::to_string(cropH) +
            " BGR cropped from " + std::to_string(frameW) + "x" + std::to_string(frameH) +
            " NV12 (display=" + std::to_string(inputImage.cols) + "x" + std::to_string(inputImage.rows) +
            " scaleX=" + std::to_string(scaleX) + " scaleY=" + std::to_string(scaleY) + ")",
            __FILE__, __LINE__);
    }

    return result;
}
|
|
|
|
// ====================================================================
|
|
// tryNV12DirectToBuffer — write NV12→RGB CHW directly into TRT buffer
|
|
//
|
|
// For engines that manage their own GPU buffers (e.g. SAM3 with raw
|
|
// TRT enqueueV3). Same registry lookup / GPU matching / safety logic
|
|
// as tryNV12(), but writes CHW planar output into a pre-allocated
|
|
// void* GPU buffer instead of allocating an HWC GpuMat.
|
|
// ====================================================================
|
|
NV12PreprocessHelper::NV12DirectResult NV12PreprocessHelper::tryNV12DirectToBuffer(
|
|
const cv::Mat& inputImage, int inferenceGpu,
|
|
void* dstGpuBuffer, int inputW, int inputH,
|
|
bool isFloat32,
|
|
cudaStream_t stream,
|
|
SPDLogger& logger, const char* tag) {
|
|
NV12DirectResult result;
|
|
|
|
// Skip during warmup phase
|
|
if (m_inferenceCount < NV12_WARMUP_THRESHOLD) {
|
|
return result;
|
|
}
|
|
|
|
if (!dstGpuBuffer) return result;
|
|
|
|
// Get NV12 frame data from thread-local (set by RunInferenceComplete_LV)
|
|
GpuFrameData* gpuData = tl_currentGpuFrame();
|
|
|
|
if (!gpuData || !gpuData->yPlane) {
|
|
return result; // no NV12 data — caller uses CPU path
|
|
}
|
|
|
|
// BGR full-res path not applicable for direct-to-buffer (SAM3 handles its own resize)
|
|
if (gpuData->pixelFormat == 1000) {
|
|
return result;
|
|
}
|
|
|
|
// NV12 path only
|
|
if (gpuData->pixelFormat != 23) {
|
|
return result;
|
|
}
|
|
|
|
if (!gpuData->uvPlane) {
|
|
return result;
|
|
}
|
|
|
|
const int frameW = gpuData->width;
|
|
const int frameH = gpuData->height;
|
|
|
|
if (frameW <= 0 || frameH <= 0 || m_cudaContextDead) {
|
|
return result;
|
|
}
|
|
|
|
// Ensure async D2D copy has completed before reading NV12 buffers
|
|
if (gpuData->d2dCopyStream) {
|
|
cudaStreamSynchronize(static_cast<cudaStream_t>(gpuData->d2dCopyStream));
|
|
gpuData->d2dCopyStream = nullptr;
|
|
}
|
|
|
|
const bool isCudaDevice = gpuData->isCudaDevicePtr;
|
|
const bool gpuMatch = !isCudaDevice ||
|
|
gpuData->gpuIndex < 0 ||
|
|
gpuData->gpuIndex == inferenceGpu;
|
|
const bool useZeroCopy = isCudaDevice && gpuMatch;
|
|
|
|
if (isCudaDevice && !gpuMatch) {
|
|
return result; // cross-GPU — fall back to CPU path
|
|
}
|
|
|
|
uint8_t* effYPlane = gpuData->yPlane;
|
|
uint8_t* effUvPlane = gpuData->uvPlane;
|
|
int effYLinesize = gpuData->yLinesize;
|
|
int effUvLinesize = gpuData->uvLinesize;
|
|
|
|
// Log path selection once
|
|
if (!m_nv12PathLogged) {
|
|
m_nv12PathLogged = true;
|
|
const char* pathName = useZeroCopy ? "CUDA_ZERO_COPY" : "CPU_NV12_UPLOAD";
|
|
logger.LogInfo(std::string(tag) + "::NV12Helper",
|
|
std::string("Path: ") + pathName +
|
|
" | isCuda=" + std::to_string(isCudaDevice) +
|
|
" gpuMatch=" + std::to_string(gpuMatch) +
|
|
" frame=" + std::to_string(frameW) + "x" + std::to_string(frameH) +
|
|
" (direct CHW " + (isFloat32 ? "float32" : "uint8") + ")",
|
|
__FILE__, __LINE__);
|
|
}
|
|
|
|
cv::cuda::GpuMat gpuY, gpuUV;
|
|
|
|
if (useZeroCopy) {
|
|
gpuY = cv::cuda::GpuMat(frameH, frameW, CV_8UC1,
|
|
effYPlane, static_cast<size_t>(effYLinesize));
|
|
gpuUV = cv::cuda::GpuMat(frameH / 2, frameW, CV_8UC1,
|
|
effUvPlane, static_cast<size_t>(effUvLinesize));
|
|
} else {
|
|
// CPU path: memcpy NV12 to pinned buffer, upload
|
|
const size_t ySize = static_cast<size_t>(frameW) * frameH;
|
|
const size_t uvSize = static_cast<size_t>(frameW) * frameH / 2;
|
|
const size_t nv12Size = ySize + uvSize;
|
|
ensurePinnedBuffer(nv12Size, logger, tag);
|
|
if (!m_pinnedBuf) return result;
|
|
|
|
if (!isMemoryReadable(effYPlane, std::min(static_cast<size_t>(effYLinesize) * frameH, (size_t)4096)) ||
|
|
!isMemoryReadable(effUvPlane, std::min(static_cast<size_t>(effUvLinesize) * (frameH / 2), (size_t)4096))) {
|
|
return result;
|
|
}
|
|
|
|
uint8_t* dst = static_cast<uint8_t*>(m_pinnedBuf);
|
|
bool cpyOk = true;
|
|
if (effYLinesize == frameW) {
|
|
cpyOk = safeMemcpy(dst, effYPlane, ySize);
|
|
} else {
|
|
for (int row = 0; row < frameH && cpyOk; row++)
|
|
cpyOk = safeMemcpy(dst + row * frameW,
|
|
effYPlane + row * effYLinesize, frameW);
|
|
}
|
|
if (cpyOk) {
|
|
uint8_t* uvDst = dst + ySize;
|
|
if (effUvLinesize == frameW) {
|
|
cpyOk = safeMemcpy(uvDst, effUvPlane, uvSize);
|
|
} else {
|
|
for (int row = 0; row < frameH / 2 && cpyOk; row++)
|
|
cpyOk = safeMemcpy(uvDst + row * frameW,
|
|
effUvPlane + row * effUvLinesize, frameW);
|
|
}
|
|
}
|
|
if (!cpyOk) return result;
|
|
|
|
// (No registry lock to release — data kept alive by refcount)
|
|
|
|
cv::cuda::Stream cvStream = cv::cuda::StreamAccessor::wrapStream(stream);
|
|
cv::Mat pinnedY(frameH, frameW, CV_8UC1, m_pinnedBuf);
|
|
cv::Mat pinnedUV(frameH / 2, frameW, CV_8UC1,
|
|
static_cast<uint8_t*>(m_pinnedBuf) + ySize);
|
|
gpuY.upload(pinnedY, cvStream);
|
|
gpuUV.upload(pinnedUV, cvStream);
|
|
}
|
|
|
|
// Launch CHW kernel directly into TRT buffer
|
|
if (isFloat32) {
|
|
launchNV12ToRGBResizeCHW_float(
|
|
gpuY.ptr<uint8_t>(), static_cast<int>(gpuY.step),
|
|
gpuUV.ptr<uint8_t>(), static_cast<int>(gpuUV.step),
|
|
static_cast<float*>(dstGpuBuffer),
|
|
inputW, inputH, frameW, frameH, stream);
|
|
} else {
|
|
launchNV12ToRGBResizeCHW_uint8(
|
|
gpuY.ptr<uint8_t>(), static_cast<int>(gpuY.step),
|
|
gpuUV.ptr<uint8_t>(), static_cast<int>(gpuUV.step),
|
|
static_cast<uint8_t*>(dstGpuBuffer),
|
|
inputW, inputH, frameW, frameH, stream);
|
|
}
|
|
|
|
// Use polling sync instead of cudaStreamSynchronize to avoid
|
|
// holding nvcuda64 SRW lock continuously (WDDM deadlock prevention).
|
|
{
|
|
cudaError_t err = cudaStreamQuery(stream);
|
|
while (err == cudaErrorNotReady) {
|
|
Sleep(0);
|
|
err = cudaStreamQuery(stream);
|
|
}
|
|
}
|
|
|
|
// (No registry lock to release — data kept alive by refcount)
|
|
|
|
// Log activation once
|
|
if (!m_nv12ActiveLogged) {
|
|
m_nv12ActiveLogged = true;
|
|
const char* mode = useZeroCopy ? "CUDA zero-copy" : "CPU upload";
|
|
logger.LogInfo(std::string(tag) + "::NV12Helper",
|
|
std::string(mode) + " ACTIVE: " +
|
|
std::to_string(frameW) + "x" + std::to_string(frameH) +
|
|
" NV12 -> " + std::to_string(inputW) + "x" + std::to_string(inputH) +
|
|
" CHW " + (isFloat32 ? "float32" : "uint8"),
|
|
__FILE__, __LINE__);
|
|
}
|
|
|
|
result.succeeded = true;
|
|
result.metaWidth = static_cast<float>(inputImage.cols > 0 ? inputImage.cols : frameW);
|
|
result.metaHeight = static_cast<float>(inputImage.rows > 0 ? inputImage.rows : frameH);
|
|
return result;
|
|
}
|
|
|
|
} // namespace ANSCENTER
|