#include "NV12PreprocessHelper.h"
#include "ANSGpuFrameRegistry.h"
#include "ANSEngineCommon.h"

// NOTE(review): the original #include targets were lost (extraction stripped
// the <...> part). Restored to the headers this translation unit demonstrably
// uses: CUDA runtime (cudaHostAlloc/cudaStreamQuery/cudaStream_t), OpenCV CUDA
// (cv::cuda::GpuMat/Stream/StreamAccessor), std::memcpy, std::min,
// std::to_string, snprintf, std::this_thread::yield. Verify against the build.
#include <cuda_runtime.h>
#include <opencv2/core/cuda.hpp>
#include <opencv2/core/cuda_stream_accessor.hpp>
#include <algorithm>
#include <cstdio>
#include <cstring>
#include <string>
#include <thread>

#ifdef _WIN32
#define WIN32_LEAN_AND_MEAN
#define NOMINMAX
#include <windows.h>

// SEH-protected memcpy: returns false on access violation instead of crashing.
// Used for NV12 plane data which may originate from D3D11VA staging textures.
static bool safeMemcpy(void* dst, const void* src, size_t bytes) {
    __try {
        std::memcpy(dst, src, bytes);
        return true;
    } __except (EXCEPTION_EXECUTE_HANDLER) {
        return false;
    }
}

// Check if a memory range is readable (non-crashing probe via VirtualQuery).
// Only probes the page(s) containing `ptr`; a positive result does not
// guarantee the whole [ptr, ptr+bytes) range is readable — callers still use
// safeMemcpy() as the last line of defense.
static bool isMemoryReadable(const void* ptr, size_t bytes) {
    if (!ptr || bytes == 0) return false;
    MEMORY_BASIC_INFORMATION mbi{};
    if (VirtualQuery(ptr, &mbi, sizeof(mbi)) == 0) return false;
    if (mbi.State != MEM_COMMIT) return false;
    if (mbi.Protect & (PAGE_NOACCESS | PAGE_GUARD)) return false;
    const DWORD readFlags = PAGE_READONLY | PAGE_READWRITE |
                            PAGE_EXECUTE_READ | PAGE_EXECUTE_READWRITE;
    return (mbi.Protect & readFlags) != 0;
}
#else
// Non-Windows: no SEH available; plain memcpy (assumed safe on these targets).
static bool safeMemcpy(void* dst, const void* src, size_t bytes) {
    std::memcpy(dst, src, bytes);
    return true;
}
static bool isMemoryReadable(const void*, size_t) { return true; }
#endif

// ====================================================================
// NV12→RGB CUDA kernels (implemented in nv12_to_rgb.cu)
// All pointers are CUDA device pointers; pitches are in bytes.
// ====================================================================
extern "C" void launchNV12ToRGB(
    const uint8_t* yPlane, int yPitch,
    const uint8_t* uvPlane, int uvPitch,
    uint8_t* rgbOut, int outPitch,
    int width, int height, cudaStream_t stream);

extern "C" void launchNV12ToRGBLetterbox(
    const uint8_t* yPlane, int yPitch,
    const uint8_t* uvPlane, int uvPitch,
    uint8_t* rgbOut, int outPitch,
    int outW, int outH, int srcW, int srcH,
    int unpadW, int unpadH, float invScale, cudaStream_t stream);

extern "C" void launchNV12ToRGBCenterLetterbox(
    const uint8_t* yPlane, int yPitch,
    const uint8_t* uvPlane, int uvPitch,
    uint8_t* rgbOut, int outPitch,
    int outW, int outH, int srcW, int srcH,
    int unpadW, int unpadH, float invScale,
    int padLeft, int padTop, cudaStream_t stream);

extern "C" void launchNV12AffineWarpToBGR(
    const uint8_t* yPlane, int yPitch,
    const uint8_t* uvPlane, int uvPitch,
    uint8_t* bgrOut, int outPitch,
    int outW, int outH, int srcW, int srcH,
    float m00, float m01, float m02,
    float m10, float m11, float m12, cudaStream_t stream);

extern "C" void launchNV12ToRGBResize(
    const uint8_t* yPlane, int yPitch,
    const uint8_t* uvPlane, int uvPitch,
    uint8_t* rgbOut, int outPitch,
    int outW, int outH, int srcW, int srcH, cudaStream_t stream);

extern "C" void launchNV12ToRGBResizeCHW_uint8(
    const uint8_t* yPlane, int yPitch,
    const uint8_t* uvPlane, int uvPitch,
    uint8_t* chwOut,
    int outW, int outH, int srcW, int srcH, cudaStream_t stream);

extern "C" void launchNV12ToRGBResizeCHW_float(
    const uint8_t* yPlane, int yPitch,
    const uint8_t* uvPlane, int uvPitch,
    float* chwOut,
    int outW, int outH, int srcW, int srcH, cudaStream_t stream);

extern "C" void launchNV12ToBGRResize(
    const uint8_t* yPlane, int yPitch,
    const uint8_t* uvPlane, int uvPitch,
    uint8_t* bgrOut, int outPitch,
    int outW, int outH, int srcW, int srcH, cudaStream_t stream);

extern "C" void launchNV12CropToBGR(
    const uint8_t* yPlane, int yPitch,
    const uint8_t* uvPlane, int uvPitch,
    uint8_t* bgrOut, int outPitch,
    int outW, int outH, int cropX, int cropY,
    int srcW, int srcH, cudaStream_t stream);

namespace ANSCENTER {

NV12PreprocessHelper::~NV12PreprocessHelper() {
    destroy();
}

// Release the pinned host staging buffer (idempotent).
void NV12PreprocessHelper::destroy() {
    if (m_pinnedBuf) {
        cudaFreeHost(m_pinnedBuf);
        m_pinnedBuf = nullptr;
        m_pinnedBufSize = 0;
    }
}

// Grow the pinned host staging buffer to at least `bytes`. On allocation
// failure the buffer is left null (callers check m_pinnedBuf) and a warning
// is logged once per instance. Never shrinks.
void NV12PreprocessHelper::ensurePinnedBuffer(size_t bytes, SPDLogger& logger,
                                              const char* tag) {
    if (bytes <= m_pinnedBufSize) return;
    if (m_pinnedBuf) {
        cudaFreeHost(m_pinnedBuf);
        m_pinnedBuf = nullptr;
        m_pinnedBufSize = 0;
    }
    cudaError_t err = cudaHostAlloc(&m_pinnedBuf, bytes, cudaHostAllocPortable);
    if (err != cudaSuccess) {
        if (!m_nv12PinnedLogged) {
            m_nv12PinnedLogged = true;
            logger.LogWarn(std::string(tag) + "::NV12Helper",
                "cudaHostAlloc failed (" +
                std::string(cudaGetErrorString(err)) + ") for " +
                std::to_string(bytes) + " bytes",
                __FILE__, __LINE__);
        }
        m_pinnedBuf = nullptr;
        m_pinnedBufSize = 0;
        return;
    }
    m_pinnedBufSize = bytes;
}

// Probe the CUDA context via a default-stream query. After CUDA_FAIL_LIMIT
// consecutive failures the context is declared permanently dead and GPU
// preprocessing is disabled for this instance. Also drains any pending
// sticky error so it can be included in the fatal log.
bool NV12PreprocessHelper::isCudaContextHealthy(SPDLogger& logger,
                                                const char* tag) {
    if (m_cudaContextDead) return false;
    cudaError_t pending = cudaGetLastError();  // read-and-clear sticky error
    cudaError_t err = cudaStreamQuery(nullptr);
    if (err == cudaSuccess || err == cudaErrorNotReady) {
        m_cudaFailStreak = 0;
        return true;
    }
    ++m_cudaFailStreak;
    if (m_cudaFailStreak >= CUDA_FAIL_LIMIT && !m_cudaContextDead) {
        m_cudaContextDead = true;
        logger.LogFatal(std::string(tag) + "::NV12Helper",
            "CUDA context permanently corrupted after " +
            std::to_string(m_cudaFailStreak) +
            " consecutive failures (last error: " +
            std::string(cudaGetErrorString(err)) + ", pending: " +
            std::string(cudaGetErrorString(pending)) +
            "). GPU preprocessing DISABLED for this engine instance. "
            "This is typically caused by NVDEC/CUVID surface pool exhaustion "
            "(Error mapping a picture with CUVID: 205). "
            "Reduce the number of concurrent HW-decoded 4K streams or "
            "restart the application to recover.",
            __FILE__, __LINE__);
    }
    return false;
}

// ====================================================================
// tryNV12 — attempt NV12 fast-path preprocessing
//
// Reads the thread-local GpuFrameData (set by RunInferenceComplete_LV) and,
// when usable NV12 (or full-res BGR) data is present, runs the supplied
// kernel launcher to produce the model input on the GPU. Returns a
// default-constructed NV12Result (succeeded=false) whenever the caller
// should fall back to the plain BGR path.
// ====================================================================
NV12Result NV12PreprocessHelper::tryNV12(
    const cv::Mat& inputImage, int inferenceGpu, int inputW, int inputH,
    const NV12KernelLauncher& launcher, SPDLogger& logger, const char* tag) {
    NV12Result result;

    // Skip during warmup phase
    if (m_inferenceCount < NV12_WARMUP_THRESHOLD) {
        return result;
    }

    // Get NV12 frame data from thread-local (set by RunInferenceComplete_LV)
    GpuFrameData* gpuData = tl_currentGpuFrame();
    if (!gpuData || !gpuData->yPlane) {
        return result;  // no NV12 data — caller uses BGR
    }

    // ── BGR full-res path (pixFmt=1000 from ANSVideoPlayer) ──────────
    if (gpuData->pixelFormat == 1000) {
        cv::Mat fullResImg(gpuData->height, gpuData->width, CV_8UC3,
                           gpuData->yPlane, gpuData->yLinesize);
        if (!m_bgrFullResLogged) {
            m_bgrFullResLogged = true;
            logger.LogInfo(std::string(tag) + "::NV12Helper",
                "BGR full-res ACTIVE: " + std::to_string(gpuData->width) +
                "x" + std::to_string(gpuData->height) +
                " -> inference (display=" + std::to_string(inputImage.cols) +
                "x" + std::to_string(inputImage.rows) + ")",
                __FILE__, __LINE__);
        }
        result.useBgrFullRes = true;
        result.bgrFullResImg = fullResImg.clone();  // clone under lock
        if (inputImage.cols > 0 && inputImage.rows > 0 &&
            (fullResImg.cols != inputImage.cols ||
             fullResImg.rows != inputImage.rows)) {
            result.bgrFullResScaleX = static_cast<float>(inputImage.cols) /
                                      static_cast<float>(fullResImg.cols);
            result.bgrFullResScaleY = static_cast<float>(inputImage.rows) /
                                      static_cast<float>(fullResImg.rows);
        }
        // (No registry lock — data kept alive by refcount)
        return result;
    }

    // ── NV12 path (pixFmt=23 = AV_PIX_FMT_NV12) ────────────────────
    if (gpuData->pixelFormat != 23) {
        return result;  // unknown format — fall back to BGR
    }
    if (!gpuData->uvPlane) {
        if (!m_nv12NullLogged) {
            m_nv12NullLogged = true;
            logger.LogWarn(std::string(tag) + "::NV12Helper",
                "Early exit: null uvPlane", __FILE__, __LINE__);
        }
        return result;
    }

    const int frameW = gpuData->width;
    const int frameH = gpuData->height;
    if (frameW <= 0 || frameH <= 0) {
        if (!m_nv12DimLogged) {
            m_nv12DimLogged = true;
            logger.LogWarn(std::string(tag) + "::NV12Helper",
                "Early exit: bad dimensions — w=" + std::to_string(frameW) +
                " h=" + std::to_string(frameH), __FILE__, __LINE__);
        }
        return result;
    }
    if (m_cudaContextDead) {
        if (!m_nv12DeadLogged) {
            m_nv12DeadLogged = true;
            logger.LogWarn(std::string(tag) + "::NV12Helper",
                "Early exit: CUDA context dead", __FILE__, __LINE__);
        }
        return result;
    }

    // Ensure async D2D copy (NVDEC → pool buffer) has completed before
    // reading yPlane/uvPlane. The copy was queued in gpu_frame_attach_cuda()
    // on a non-blocking stream. By the time inference runs (~50-200ms later),
    // the copy (~0.3ms) has long finished, so this sync returns immediately.
    if (gpuData->d2dCopyStream) {
        cudaStreamSynchronize(static_cast<cudaStream_t>(gpuData->d2dCopyStream));
        gpuData->d2dCopyStream = nullptr;  // Only sync once per frame
    }

    const bool isCudaDevice = gpuData->isCudaDevicePtr;
    const bool gpuMatch = !isCudaDevice || gpuData->gpuIndex < 0 ||
                          gpuData->gpuIndex == inferenceGpu;
    const bool useZeroCopy = isCudaDevice && gpuMatch;

    // --- Debug: log pointer state before reading ---
#if defined(ANSCORE_GPU_DEBUG) && ANSCORE_GPU_DEBUG
    {
        char _nv12_dbg[512];
        snprintf(_nv12_dbg, sizeof(_nv12_dbg),
                 "[NV12Helper] tryNV12: gpuData=%p yPlane=%p uvPlane=%p isCuda=%d "
                 "gpuIdx=%d infGpu=%d gpuMatch=%d zeroCopy=%d "
                 "gpuCacheY=%p gpuCacheUV=%p gpuCacheValid=%d refcount=%d %dx%d\n",
                 (void*)gpuData, (void*)gpuData->yPlane, (void*)gpuData->uvPlane,
                 (int)isCudaDevice, gpuData->gpuIndex, inferenceGpu,
                 (int)gpuMatch, (int)useZeroCopy,
                 gpuData->gpuCacheY, gpuData->gpuCacheUV,
                 (int)gpuData->gpuCacheValid, gpuData->refcount.load(),
                 frameW, frameH);
#ifdef _WIN32
        OutputDebugStringA(_nv12_dbg);
#endif
        fprintf(stderr, "%s", _nv12_dbg);
    }
#endif

    // Effective plane pointers — for zero-copy, use CUDA device ptrs;
    // for CPU upload, use the CPU snapshot buffers.
    uint8_t* effYPlane;
    uint8_t* effUvPlane;
    int effYLinesize;
    int effUvLinesize;
    if (useZeroCopy) {
        // Same GPU: wrap owned CUDA device pointers directly
        effYPlane = gpuData->yPlane;
        effUvPlane = gpuData->uvPlane;
        effYLinesize = gpuData->yLinesize;
        effUvLinesize = gpuData->uvLinesize;
    } else if (isCudaDevice && !gpuMatch) {
        // Cross-GPU: can't use CUDA device ptrs from another GPU context.
        // Fall through to CPU upload using the CPU snapshot buffers.
        if (gpuData->cpuYPlane && gpuData->cpuUvPlane) {
            effYPlane = gpuData->cpuYPlane;
            effUvPlane = gpuData->cpuUvPlane;
            effYLinesize = gpuData->cpuYLinesize;
            effUvLinesize = gpuData->cpuUvLinesize;
            if (!m_gpuMismatchLogged) {
                m_gpuMismatchLogged = true;
                logger.LogInfo(std::string(tag) + "::NV12Helper",
                    "GPU mismatch (decode GPU " +
                    std::to_string(gpuData->gpuIndex) +
                    " vs inference GPU " + std::to_string(inferenceGpu) +
                    ") — using CPU NV12 upload fallback",
                    __FILE__, __LINE__);
            }
        } else {
            // No CPU fallback available — fall back to BGR
            if (!m_gpuMismatchLogged) {
                m_gpuMismatchLogged = true;
                logger.LogInfo(std::string(tag) + "::NV12Helper",
                    "GPU mismatch (decode GPU " +
                    std::to_string(gpuData->gpuIndex) +
                    " vs inference GPU " + std::to_string(inferenceGpu) +
                    ") — no CPU fallback, using BGR",
                    __FILE__, __LINE__);
            }
            return result;
        }
    } else {
        // Not CUDA device ptrs: use whatever yPlane/uvPlane point to (CPU data)
        effYPlane = gpuData->yPlane;
        effUvPlane = gpuData->uvPlane;
        effYLinesize = gpuData->yLinesize;
        effUvLinesize = gpuData->uvLinesize;
    }

    // Log path selection once — both spdlog and DebugView
    if (!m_nv12PathLogged) {
        m_nv12PathLogged = true;
        const char* pathName = useZeroCopy ? "CUDA_ZERO_COPY" : "CPU_NV12_UPLOAD";
        auto msg = std::string("Path: ") + pathName +
                   " | isCuda=" + std::to_string(isCudaDevice) +
                   " gpuMatch=" + std::to_string(gpuMatch) +
                   " decodeGpu=" + std::to_string(gpuData->gpuIndex) +
                   " infGpu=" + std::to_string(inferenceGpu) +
                   " frame=" + std::to_string(frameW) + "x" +
                   std::to_string(frameH);
        logger.LogInfo(std::string(tag) + "::NV12Helper", msg,
                       __FILE__, __LINE__);
#ifdef _WIN32
        auto dbg = std::string("[NV12 ACTIVE] ") + tag + " " + msg + "\n";
        OutputDebugStringA(dbg.c_str());
#endif
    }

    cv::cuda::Stream stream;
    cv::cuda::GpuMat gpuY, gpuUV;
    if (useZeroCopy) {
        // Wrap the device planes in-place (no copy). UV plane is interleaved
        // CbCr at half height, same row width in bytes as Y.
        gpuY = cv::cuda::GpuMat(frameH, frameW, CV_8UC1, effYPlane,
                                static_cast<size_t>(effYLinesize));
        gpuUV = cv::cuda::GpuMat(frameH / 2, frameW, CV_8UC1, effUvPlane,
                                 static_cast<size_t>(effUvLinesize));
    } else {
        // CPU path: memcpy + upload
        const size_t ySize = static_cast<size_t>(frameW) * frameH;
        const size_t uvSize = static_cast<size_t>(frameW) * frameH / 2;
        const size_t nv12Size = ySize + uvSize;
        ensurePinnedBuffer(nv12Size, logger, tag);
        if (!m_pinnedBuf) {
            return result;
        }

        // Validate NV12 plane pointers before memcpy
        const size_t yBufNeeded = (effYLinesize == frameW)
            ? ySize : static_cast<size_t>(effYLinesize) * frameH;
        const size_t uvBufNeeded = (effUvLinesize == frameW)
            ? uvSize : static_cast<size_t>(effUvLinesize) * (frameH / 2);
        if (!isMemoryReadable(effYPlane, std::min(yBufNeeded, (size_t)4096)) ||
            !isMemoryReadable(effUvPlane, std::min(uvBufNeeded, (size_t)4096))) {
            logger.LogWarn(std::string(tag) + "::NV12Helper",
                "NV12 plane pointers not readable — falling back to BGR",
                __FILE__, __LINE__);
            return result;
        }

        // Pack Y then UV tightly (stride frameW) into the pinned buffer,
        // copying row-by-row when the source linesize is padded.
        uint8_t* dst = static_cast<uint8_t*>(m_pinnedBuf);
        bool cpyOk = true;
        if (effYLinesize == frameW) {
            cpyOk = safeMemcpy(dst, effYPlane, ySize);
        } else {
            for (int row = 0; row < frameH && cpyOk; row++)
                cpyOk = safeMemcpy(dst + row * frameW,
                                   effYPlane + row * effYLinesize, frameW);
        }
        if (cpyOk) {
            uint8_t* uvDst = dst + ySize;
            if (effUvLinesize == frameW) {
                cpyOk = safeMemcpy(uvDst, effUvPlane, uvSize);
            } else {
                for (int row = 0; row < frameH / 2 && cpyOk; row++)
                    cpyOk = safeMemcpy(uvDst + row * frameW,
                                       effUvPlane + row * effUvLinesize, frameW);
            }
        }
        if (!cpyOk) {
            logger.LogWarn(std::string(tag) + "::NV12Helper",
                "Access violation during NV12 memcpy — falling back to BGR",
                __FILE__, __LINE__);
            return result;
        }

        // NV12 data safely in pinned memory — release registry lock
        // (No registry lock to release — data kept alive by refcount)
        cv::Mat pinnedY(frameH, frameW, CV_8UC1, m_pinnedBuf);
        cv::Mat pinnedUV(frameH / 2, frameW, CV_8UC1,
                         static_cast<uint8_t*>(m_pinnedBuf) + ySize);
        gpuY.upload(pinnedY, stream);
        gpuUV.upload(pinnedUV, stream);
    }

    // Use display dimensions for coordinate mapping
    const float metaW = static_cast<float>(
        inputImage.cols > 0 ? inputImage.cols : frameW);
    const float metaH = static_cast<float>(
        inputImage.rows > 0 ? inputImage.rows : frameH);
    if (metaH <= 0 || metaW <= 0) {
        if (!m_nv12MetaLogged) {
            m_nv12MetaLogged = true;
            logger.LogWarn(std::string(tag) + "::NV12Helper",
                "Early exit: metadata dims invalid", __FILE__, __LINE__);
        }
        return result;
    }
    result.metaWidth = metaW;
    result.metaHeight = metaH;
    result.ratio = 1.f / std::min(static_cast<float>(inputW) / metaW,
                                  static_cast<float>(inputH) / metaH);

    // Launch engine-specific kernel
    cv::cuda::GpuMat gpuResized;
    gpuResized.create(inputH, inputW, CV_8UC3);
    cudaStream_t rawStream = cv::cuda::StreamAccessor::getStream(stream);

#if defined(ANSCORE_GPU_DEBUG) && ANSCORE_GPU_DEBUG
    {
        char _nv12_dbg2[256];
        snprintf(_nv12_dbg2, sizeof(_nv12_dbg2),
                 "[NV12Helper] KERNEL LAUNCH: gpuY=%p(%dx%d) gpuUV=%p(%dx%d) -> %dx%d zeroCopy=%d\n",
                 (void*)gpuY.data, gpuY.cols, gpuY.rows,
                 (void*)gpuUV.data, gpuUV.cols, gpuUV.rows,
                 inputW, inputH, (int)useZeroCopy);
#ifdef _WIN32
        OutputDebugStringA(_nv12_dbg2);
#endif
        fprintf(stderr, "%s", _nv12_dbg2);
    }
#endif

    launcher(gpuY, gpuUV, gpuResized, frameW, frameH, inputW, inputH, rawStream);
    stream.waitForCompletion();
    // (No registry lock to release — data kept alive by refcount)

    // Log NV12 fast-path usage once per instance
    if (!m_nv12ActiveLogged) {
        m_nv12ActiveLogged = true;
        const char* mode = useZeroCopy ? "CUDA zero-copy" : "CPU upload";
        logger.LogInfo(std::string(tag) + "::NV12Helper",
            std::string(mode) + " ACTIVE: " + std::to_string(frameW) + "x" +
            std::to_string(frameH) + " NV12 -> " + std::to_string(inputW) +
            "x" + std::to_string(inputH) + " display=" +
            std::to_string(static_cast<int>(metaW)) + "x" +
            std::to_string(static_cast<int>(metaH)),
            __FILE__, __LINE__);
    }

    result.succeeded = true;
    result.gpuRGB = std::move(gpuResized);
    return result;
}

// ====================================================================
// Default YOLO kernel launcher
//
// For detection/seg/pose/OBB: fused NV12→RGB + letterbox in a single
// kernel at model input resolution (e.g. 640x640).
// For classification: separate NV12→RGB at full res, then cuda::resize.
// (Classification detection is not done here — the caller's engine
// should set up the appropriate launcher if needed. This default
// always uses the fused letterbox path, which works for all YOLO
// detection/seg/pose/OBB tasks.)
// ====================================================================
NV12KernelLauncher NV12PreprocessHelper::defaultYOLOLauncher() {
    return [](const cv::cuda::GpuMat& gpuY, const cv::cuda::GpuMat& gpuUV,
              cv::cuda::GpuMat& gpuOut, int srcW, int srcH,
              int inputW, int inputH, cudaStream_t stream) {
        if (srcW == inputW && srcH == inputH) {
            // Source matches model input — direct NV12→RGB, no resize needed
            launchNV12ToRGB(
                gpuY.ptr(), static_cast<int>(gpuY.step),
                gpuUV.ptr(), static_cast<int>(gpuUV.step),
                gpuOut.ptr(), static_cast<int>(gpuOut.step),
                srcW, srcH, stream);
        } else {
            // Fused NV12→RGB + letterbox resize (top-left anchored pad)
            float r = std::min(static_cast<float>(inputW) / srcW,
                               static_cast<float>(inputH) / srcH);
            int unpadW = static_cast<int>(r * srcW);
            int unpadH = static_cast<int>(r * srcH);
            float invScale = 1.0f / r;
            launchNV12ToRGBLetterbox(
                gpuY.ptr(), static_cast<int>(gpuY.step),
                gpuUV.ptr(), static_cast<int>(gpuUV.step),
                gpuOut.ptr(), static_cast<int>(gpuOut.step),
                inputW, inputH, srcW, srcH,
                unpadW, unpadH, invScale, stream);
        }
    };
}

// ====================================================================
// Classification launcher (direct resize, no letterbox)
// ====================================================================
NV12KernelLauncher NV12PreprocessHelper::classificationLauncher() {
    return [](const cv::cuda::GpuMat& gpuY, const cv::cuda::GpuMat& gpuUV,
              cv::cuda::GpuMat& gpuOut, int srcW, int srcH,
              int inputW, int inputH, cudaStream_t stream) {
        if (srcW == inputW && srcH == inputH) {
            // Source matches model input — direct NV12→RGB, no resize needed
            launchNV12ToRGB(
                gpuY.ptr(), static_cast<int>(gpuY.step),
                gpuUV.ptr(), static_cast<int>(gpuUV.step),
                gpuOut.ptr(), static_cast<int>(gpuOut.step),
                srcW, srcH, stream);
        } else {
            // Fused NV12→RGB + direct resize (stretch to fill, no padding)
            launchNV12ToRGBResize(
                gpuY.ptr(), static_cast<int>(gpuY.step),
                gpuUV.ptr(), static_cast<int>(gpuUV.step),
                gpuOut.ptr(), static_cast<int>(gpuOut.step),
                inputW, inputH, srcW, srcH, stream);
        }
    };
}

// ====================================================================
// NV12→BGR fused resize (for OCR detection)
// Thin pass-through to the CUDA kernel; all pointers are device pointers.
// ====================================================================
void NV12PreprocessHelper::nv12ToBGRResize(
    const uint8_t* devY, int yPitch, const uint8_t* devUV, int uvPitch,
    uint8_t* bgrOut, int outPitch, int outW, int outH,
    int srcW, int srcH, cudaStream_t stream) {
    launchNV12ToBGRResize(devY, yPitch, devUV, uvPitch,
                          bgrOut, outPitch, outW, outH, srcW, srcH, stream);
}

// ====================================================================
// SCRFD center-padded letterbox launcher
// ====================================================================
NV12KernelLauncher NV12PreprocessHelper::scrfdCenterLetterboxLauncher(
    int padLeft, int padTop) {
    return [padLeft, padTop](const cv::cuda::GpuMat& gpuY,
                             const cv::cuda::GpuMat& gpuUV,
                             cv::cuda::GpuMat& gpuOut, int srcW, int srcH,
                             int inputW, int inputH, cudaStream_t stream) {
        float r = std::min(static_cast<float>(inputW) / srcW,
                           static_cast<float>(inputH) / srcH);
        int unpadW = static_cast<int>(r * srcW);
        int unpadH = static_cast<int>(r * srcH);
        float invScale = 1.0f / r;
        launchNV12ToRGBCenterLetterbox(
            gpuY.ptr(), static_cast<int>(gpuY.step),
            gpuUV.ptr(), static_cast<int>(gpuUV.step),
            gpuOut.ptr(), static_cast<int>(gpuOut.step),
            inputW, inputH, srcW, srcH,
            unpadW, unpadH, invScale, padLeft, padTop, stream);
    };
}

// ====================================================================
// NV12 affine warp for face alignment
//
// Reads full-res NV12 from the registry, applies inverse affine
// transform via CUDA kernel, outputs a small aligned face in BGR.
// The affine matrix is computed on CPU (from detected landmarks to
// ArcFace template), then scaled to map to full-res NV12 source.
// ====================================================================
NV12PreprocessHelper::NV12AffineResult NV12PreprocessHelper::tryNV12AffineWarp(
    const cv::Mat& inputImage, int inferenceGpu, const cv::Mat& affineMatrix,
    int outW, int outH, float scaleX, float scaleY,
    SPDLogger& logger, const char* tag) {
    NV12AffineResult result;

    // Skip during warmup
    if (m_inferenceCount < NV12_WARMUP_THRESHOLD) {
        return result;
    }
    if (affineMatrix.empty()) {
        return result;
    }

    // Get NV12 frame data from thread-local (set by RunInferenceComplete_LV)
    GpuFrameData* gpuData = tl_currentGpuFrame();
    if (!gpuData || !gpuData->yPlane || !gpuData->uvPlane ||
        gpuData->pixelFormat != 23) {
        return result;
    }
    const int frameW = gpuData->width;
    const int frameH = gpuData->height;
    if (frameW <= 0 || frameH <= 0 || m_cudaContextDead) {
        return result;
    }

    // Ensure async D2D copy has completed before reading NV12 buffers
    if (gpuData->d2dCopyStream) {
        cudaStreamSynchronize(static_cast<cudaStream_t>(gpuData->d2dCopyStream));
        gpuData->d2dCopyStream = nullptr;
    }

    const bool isCudaDevice = gpuData->isCudaDevicePtr;
    const bool gpuMatch = !isCudaDevice || gpuData->gpuIndex < 0 ||
                          gpuData->gpuIndex == inferenceGpu;
    if (isCudaDevice && !gpuMatch) {
        return result;  // cross-GPU — fall back to CPU warpAffine
    }
    const bool useZeroCopy = isCudaDevice && gpuMatch;

    uint8_t* effYPlane = gpuData->yPlane;
    uint8_t* effUvPlane = gpuData->uvPlane;
    int effYLinesize = gpuData->yLinesize;
    int effUvLinesize = gpuData->uvLinesize;

    // Compute inverse affine matrix and scale for full-res NV12
    cv::Mat M_inv;
    cv::invertAffineTransform(affineMatrix, M_inv);
    M_inv.convertTo(M_inv, CV_64F);

    // Scale inverse affine to map from 112x112 output → full-res NV12 source
    // Row 0 maps to X (multiply by scaleX), Row 1 maps to Y (multiply by scaleY)
    double* row0 = M_inv.ptr<double>(0);
    double* row1 = M_inv.ptr<double>(1);
    row0[0] *= scaleX; row0[1] *= scaleX; row0[2] *= scaleX;
    row1[0] *= scaleY; row1[1] *= scaleY; row1[2] *= scaleY;
    float m00 = static_cast<float>(row0[0]);
    float m01 = static_cast<float>(row0[1]);
    float m02 = static_cast<float>(row0[2]);
    float m10 = static_cast<float>(row1[0]);
    float m11 = static_cast<float>(row1[1]);
    float m12 = static_cast<float>(row1[2]);

    cv::cuda::Stream stream;
    cv::cuda::GpuMat gpuY, gpuUV;
    if (useZeroCopy) {
        gpuY = cv::cuda::GpuMat(frameH, frameW, CV_8UC1, effYPlane,
                                static_cast<size_t>(effYLinesize));
        gpuUV = cv::cuda::GpuMat(frameH / 2, frameW, CV_8UC1, effUvPlane,
                                 static_cast<size_t>(effUvLinesize));
    } else {
        // CPU path: memcpy + upload
        const size_t ySize = static_cast<size_t>(frameW) * frameH;
        const size_t uvSize = static_cast<size_t>(frameW) * frameH / 2;
        const size_t nv12Size = ySize + uvSize;
        ensurePinnedBuffer(nv12Size, logger, tag);
        if (!m_pinnedBuf) {
            return result;
        }
        uint8_t* dst = static_cast<uint8_t*>(m_pinnedBuf);
        bool cpyOk = true;
        if (effYLinesize == frameW) {
            cpyOk = safeMemcpy(dst, effYPlane, ySize);
        } else {
            for (int row = 0; row < frameH && cpyOk; row++)
                cpyOk = safeMemcpy(dst + row * frameW,
                                   effYPlane + row * effYLinesize, frameW);
        }
        if (cpyOk) {
            uint8_t* uvDst = dst + ySize;
            if (effUvLinesize == frameW) {
                cpyOk = safeMemcpy(uvDst, effUvPlane, uvSize);
            } else {
                for (int row = 0; row < frameH / 2 && cpyOk; row++)
                    cpyOk = safeMemcpy(uvDst + row * frameW,
                                       effUvPlane + row * effUvLinesize, frameW);
            }
        }
        if (!cpyOk) {
            return result;
        }
        // (No registry lock to release — data kept alive by refcount)
        cv::Mat pinnedY(frameH, frameW, CV_8UC1, m_pinnedBuf);
        cv::Mat pinnedUV(frameH / 2, frameW, CV_8UC1,
                         static_cast<uint8_t*>(m_pinnedBuf) + ySize);
        gpuY.upload(pinnedY, stream);
        gpuUV.upload(pinnedUV, stream);
    }

    // Launch affine warp kernel (outputs BGR 112x112)
    cv::cuda::GpuMat gpuFace(outH, outW, CV_8UC3);
    cudaStream_t rawStream = cv::cuda::StreamAccessor::getStream(stream);
    launchNV12AffineWarpToBGR(
        gpuY.ptr(), static_cast<int>(gpuY.step),
        gpuUV.ptr(), static_cast<int>(gpuUV.step),
        gpuFace.ptr(), static_cast<int>(gpuFace.step),
        outW, outH, frameW, frameH,
        m00, m01, m02, m10, m11, m12, rawStream);
    stream.waitForCompletion();
    // (No registry lock to release — data kept alive by refcount)

    // Keep GPU copy for recognizer (avoids re-upload), download CPU copy for mask
    result.gpuAlignedFace = gpuFace;
    gpuFace.download(result.alignedFaceBGR);
    result.succeeded = true;
    return result;
}

// ── NV12 rectangular crop → BGR ────────────────────────────────────
// Crops a padded, display→full-res-scaled bbox directly from the GPU NV12
// planes and downloads the small BGR crop. GPU-resident NV12 only (no CPU
// fallback here — caller falls back to CPU cropping on failure).
NV12PreprocessHelper::NV12CropResult NV12PreprocessHelper::tryNV12CropToBGR(
    const cv::Mat& inputImage, int inferenceGpu, const cv::Rect& bbox,
    int padding, float scaleX, float scaleY,
    SPDLogger& logger, const char* tag) {
    NV12CropResult result;

    // Warmup gating
    if (m_inferenceCount < NV12_WARMUP_THRESHOLD) return result;
    // CUDA health check
    if (m_cudaContextDead) return result;

    // Get NV12 frame data from thread-local (set by RunInferenceComplete_LV)
    auto* gpuData = tl_currentGpuFrame();
    if (!gpuData || gpuData->pixelFormat != 23) return result;  // no NV12 data
    if (!gpuData->isCudaDevicePtr || !gpuData->yPlane || !gpuData->uvPlane)
        return result;  // NV12 not on GPU

    // Ensure async D2D copy has completed before reading NV12 buffers
    if (gpuData->d2dCopyStream) {
        cudaStreamSynchronize(static_cast<cudaStream_t>(gpuData->d2dCopyStream));
        gpuData->d2dCopyStream = nullptr;
    }

    const int frameW = gpuData->width;
    const int frameH = gpuData->height;

    // Scale bbox from display-res to full-res NV12 coords with padding
    const int fx1 = std::max(0, static_cast<int>((bbox.x - padding) * scaleX));
    const int fy1 = std::max(0, static_cast<int>((bbox.y - padding) * scaleY));
    const int fx2 = std::min(frameW,
        static_cast<int>((bbox.x + bbox.width + padding) * scaleX));
    const int fy2 = std::min(frameH,
        static_cast<int>((bbox.y + bbox.height + padding) * scaleY));
    const int cropW = fx2 - fx1;
    const int cropH = fy2 - fy1;
    if (cropW < 5 || cropH < 5) return result;

    // GPU device matching
    // NOTE(review): the previous device is not restored before returning —
    // confirm callers expect the device to remain set to inferenceGpu.
    int currentDev = 0;
    cudaGetDevice(&currentDev);
    if (inferenceGpu >= 0 && currentDev != inferenceGpu)
        cudaSetDevice(inferenceGpu);

    // Wrap existing GPU NV12 planes (UV as half-res 2-channel CbCr)
    cv::cuda::GpuMat gpuY(frameH, frameW, CV_8UC1,
                          gpuData->yPlane, gpuData->yLinesize);
    cv::cuda::GpuMat gpuUV(frameH / 2, frameW / 2, CV_8UC2,
                           gpuData->uvPlane, gpuData->uvLinesize);

    // Allocate output on GPU
    cv::cuda::GpuMat gpuCrop(cropH, cropW, CV_8UC3);

    // Launch crop kernel
    cv::cuda::Stream stream;
    cudaStream_t rawStream = cv::cuda::StreamAccessor::getStream(stream);
    launchNV12CropToBGR(
        gpuY.ptr(), static_cast<int>(gpuY.step),
        gpuUV.ptr(), static_cast<int>(gpuUV.step),
        gpuCrop.ptr(), static_cast<int>(gpuCrop.step),
        cropW, cropH, fx1, fy1, frameW, frameH, rawStream);
    stream.waitForCompletion();
    // (No registry lock to release — data kept alive by refcount)

    // Download small crop to CPU
    gpuCrop.download(result.bgrCrop);
    result.succeeded = true;

    // One-shot diagnostic log
    if (!m_nv12CropLogged) {
        m_nv12CropLogged = true;
        logger.LogInfo(tag,
            "NV12 GPU crop ACTIVE: " + std::to_string(cropW) + "x" +
            std::to_string(cropH) + " BGR cropped from " +
            std::to_string(frameW) + "x" + std::to_string(frameH) +
            " NV12 (display=" + std::to_string(inputImage.cols) + "x" +
            std::to_string(inputImage.rows) +
            " scaleX=" + std::to_string(scaleX) +
            " scaleY=" + std::to_string(scaleY) + ")",
            __FILE__, __LINE__);
    }
    return result;
}

// ====================================================================
// tryNV12DirectToBuffer — write NV12→RGB CHW directly into TRT buffer
//
// For engines that manage their own GPU buffers (e.g. SAM3 with raw
// TRT enqueueV3). Same registry lookup / GPU matching / safety logic
// as tryNV12(), but writes CHW planar output into a pre-allocated
// void* GPU buffer instead of allocating an HWC GpuMat.
// ====================================================================
NV12PreprocessHelper::NV12DirectResult NV12PreprocessHelper::tryNV12DirectToBuffer(
    const cv::Mat& inputImage, int inferenceGpu, void* dstGpuBuffer,
    int inputW, int inputH, bool isFloat32, cudaStream_t stream,
    SPDLogger& logger, const char* tag) {
    NV12DirectResult result;

    // Skip during warmup phase
    if (m_inferenceCount < NV12_WARMUP_THRESHOLD) {
        return result;
    }
    if (!dstGpuBuffer) return result;

    // Get NV12 frame data from thread-local (set by RunInferenceComplete_LV)
    GpuFrameData* gpuData = tl_currentGpuFrame();
    if (!gpuData || !gpuData->yPlane) {
        return result;  // no NV12 data — caller uses CPU path
    }
    // BGR full-res path not applicable for direct-to-buffer (SAM3 handles its own resize)
    if (gpuData->pixelFormat == 1000) {
        return result;
    }
    // NV12 path only
    if (gpuData->pixelFormat != 23) {
        return result;
    }
    if (!gpuData->uvPlane) {
        return result;
    }

    const int frameW = gpuData->width;
    const int frameH = gpuData->height;
    if (frameW <= 0 || frameH <= 0 || m_cudaContextDead) {
        return result;
    }

    // Ensure async D2D copy has completed before reading NV12 buffers
    if (gpuData->d2dCopyStream) {
        cudaStreamSynchronize(static_cast<cudaStream_t>(gpuData->d2dCopyStream));
        gpuData->d2dCopyStream = nullptr;
    }

    const bool isCudaDevice = gpuData->isCudaDevicePtr;
    const bool gpuMatch = !isCudaDevice || gpuData->gpuIndex < 0 ||
                          gpuData->gpuIndex == inferenceGpu;
    const bool useZeroCopy = isCudaDevice && gpuMatch;
    if (isCudaDevice && !gpuMatch) {
        return result;  // cross-GPU — fall back to CPU path
    }

    uint8_t* effYPlane = gpuData->yPlane;
    uint8_t* effUvPlane = gpuData->uvPlane;
    int effYLinesize = gpuData->yLinesize;
    int effUvLinesize = gpuData->uvLinesize;

    // Log path selection once
    if (!m_nv12PathLogged) {
        m_nv12PathLogged = true;
        const char* pathName = useZeroCopy ? "CUDA_ZERO_COPY" : "CPU_NV12_UPLOAD";
        logger.LogInfo(std::string(tag) + "::NV12Helper",
            std::string("Path: ") + pathName +
            " | isCuda=" + std::to_string(isCudaDevice) +
            " gpuMatch=" + std::to_string(gpuMatch) +
            " frame=" + std::to_string(frameW) + "x" + std::to_string(frameH) +
            " (direct CHW " + (isFloat32 ? "float32" : "uint8") + ")",
            __FILE__, __LINE__);
    }

    cv::cuda::GpuMat gpuY, gpuUV;
    if (useZeroCopy) {
        gpuY = cv::cuda::GpuMat(frameH, frameW, CV_8UC1, effYPlane,
                                static_cast<size_t>(effYLinesize));
        gpuUV = cv::cuda::GpuMat(frameH / 2, frameW, CV_8UC1, effUvPlane,
                                 static_cast<size_t>(effUvLinesize));
    } else {
        // CPU path: memcpy NV12 to pinned buffer, upload
        const size_t ySize = static_cast<size_t>(frameW) * frameH;
        const size_t uvSize = static_cast<size_t>(frameW) * frameH / 2;
        const size_t nv12Size = ySize + uvSize;
        ensurePinnedBuffer(nv12Size, logger, tag);
        if (!m_pinnedBuf) return result;
        if (!isMemoryReadable(effYPlane,
                std::min(static_cast<size_t>(effYLinesize) * frameH,
                         (size_t)4096)) ||
            !isMemoryReadable(effUvPlane,
                std::min(static_cast<size_t>(effUvLinesize) * (frameH / 2),
                         (size_t)4096))) {
            return result;
        }
        uint8_t* dst = static_cast<uint8_t*>(m_pinnedBuf);
        bool cpyOk = true;
        if (effYLinesize == frameW) {
            cpyOk = safeMemcpy(dst, effYPlane, ySize);
        } else {
            for (int row = 0; row < frameH && cpyOk; row++)
                cpyOk = safeMemcpy(dst + row * frameW,
                                   effYPlane + row * effYLinesize, frameW);
        }
        if (cpyOk) {
            uint8_t* uvDst = dst + ySize;
            if (effUvLinesize == frameW) {
                cpyOk = safeMemcpy(uvDst, effUvPlane, uvSize);
            } else {
                for (int row = 0; row < frameH / 2 && cpyOk; row++)
                    cpyOk = safeMemcpy(uvDst + row * frameW,
                                       effUvPlane + row * effUvLinesize, frameW);
            }
        }
        if (!cpyOk) return result;
        // (No registry lock to release — data kept alive by refcount)
        // Upload on the caller's stream so the CHW kernel below is ordered
        // after the uploads without an extra sync.
        cv::cuda::Stream cvStream = cv::cuda::StreamAccessor::wrapStream(stream);
        cv::Mat pinnedY(frameH, frameW, CV_8UC1, m_pinnedBuf);
        cv::Mat pinnedUV(frameH / 2, frameW, CV_8UC1,
                         static_cast<uint8_t*>(m_pinnedBuf) + ySize);
        gpuY.upload(pinnedY, cvStream);
        gpuUV.upload(pinnedUV, cvStream);
    }

    // Launch CHW kernel directly into TRT buffer
    if (isFloat32) {
        launchNV12ToRGBResizeCHW_float(
            gpuY.ptr(), static_cast<int>(gpuY.step),
            gpuUV.ptr(), static_cast<int>(gpuUV.step),
            static_cast<float*>(dstGpuBuffer),
            inputW, inputH, frameW, frameH, stream);
    } else {
        launchNV12ToRGBResizeCHW_uint8(
            gpuY.ptr(), static_cast<int>(gpuY.step),
            gpuUV.ptr(), static_cast<int>(gpuUV.step),
            static_cast<uint8_t*>(dstGpuBuffer),
            inputW, inputH, frameW, frameH, stream);
    }

    // Use polling sync instead of cudaStreamSynchronize to avoid
    // holding nvcuda64 SRW lock continuously (WDDM deadlock prevention).
    {
        cudaError_t err = cudaStreamQuery(stream);
        while (err == cudaErrorNotReady) {
#ifdef _WIN32
            Sleep(0);
#else
            std::this_thread::yield();
#endif
            err = cudaStreamQuery(stream);
        }
    }
    // (No registry lock to release — data kept alive by refcount)

    // Log activation once
    if (!m_nv12ActiveLogged) {
        m_nv12ActiveLogged = true;
        const char* mode = useZeroCopy ? "CUDA zero-copy" : "CPU upload";
        logger.LogInfo(std::string(tag) + "::NV12Helper",
            std::string(mode) + " ACTIVE: " + std::to_string(frameW) + "x" +
            std::to_string(frameH) + " NV12 -> " + std::to_string(inputW) +
            "x" + std::to_string(inputH) + " CHW " +
            (isFloat32 ? "float32" : "uint8"),
            __FILE__, __LINE__);
    }

    result.succeeded = true;
    result.metaWidth = static_cast<float>(
        inputImage.cols > 0 ? inputImage.cols : frameW);
    result.metaHeight = static_cast<float>(
        inputImage.rows > 0 ? inputImage.rows : frameH);
    return result;
}

} // namespace ANSCENTER