Refactor project structure
This commit is contained in:
970
modules/ANSODEngine/NV12PreprocessHelper.cpp
Normal file
970
modules/ANSODEngine/NV12PreprocessHelper.cpp
Normal file
@@ -0,0 +1,970 @@
|
||||
#include "NV12PreprocessHelper.h"

#include "ANSGpuFrameRegistry.h"
#include "ANSEngineCommon.h"

#include <opencv2/cudaimgproc.hpp>
#include <opencv2/cudawarping.hpp>
#include <opencv2/core/cuda_stream_accessor.hpp>

#include <algorithm>
#include <cstring>
|
||||
|
||||
#ifdef _WIN32
#define WIN32_LEAN_AND_MEAN
#define NOMINMAX
#include <windows.h>
// SEH-protected memcpy: returns false on access violation instead of crashing.
// Used for NV12 plane data which may originate from D3D11VA staging textures.
// NOTE: this function deliberately contains no C++ objects requiring
// unwinding — MSVC forbids mixing __try with such objects in one function.
static bool safeMemcpy(void* dst, const void* src, size_t bytes) {
    __try {
        std::memcpy(dst, src, bytes);
        return true;
    }
    __except (EXCEPTION_EXECUTE_HANDLER) {
        return false;
    }
}
// Check if a memory range is readable (non-crashing probe via VirtualQuery).
// Only the region containing `ptr` is inspected, so this effectively probes
// the first page(s); callers therefore pass a small prefix (e.g. 4 KB) and
// rely on safeMemcpy() to catch faults deeper in the buffer.
static bool isMemoryReadable(const void* ptr, size_t bytes) {
    if (!ptr || bytes == 0) return false;
    MEMORY_BASIC_INFORMATION mbi{};
    if (VirtualQuery(ptr, &mbi, sizeof(mbi)) == 0) return false;
    if (mbi.State != MEM_COMMIT) return false;
    // Guard pages raise an exception on first touch; treat as unreadable.
    if (mbi.Protect & (PAGE_NOACCESS | PAGE_GUARD)) return false;
    const DWORD readFlags = PAGE_READONLY | PAGE_READWRITE | PAGE_EXECUTE_READ | PAGE_EXECUTE_READWRITE;
    return (mbi.Protect & readFlags) != 0;
}
#else
// Non-Windows: no SEH available — plain memcpy, assumed to succeed.
static bool safeMemcpy(void* dst, const void* src, size_t bytes) {
    std::memcpy(dst, src, bytes);
    return true;
}
// Non-Windows: no cheap non-crashing probe — optimistically report readable.
static bool isMemoryReadable(const void*, size_t) { return true; }
#endif
|
||||
|
||||
// ── NV12→RGB CUDA kernel entry points (implemented in nv12_to_rgb.cu) ──
// All launchers run asynchronously on the supplied cudaStream_t.
// Pitches (yPitch/uvPitch/outPitch) are in bytes.

// Plain NV12→RGB colour conversion at source resolution (no resize).
extern "C" void launchNV12ToRGB(
    const uint8_t* yPlane, int yPitch,
    const uint8_t* uvPlane, int uvPitch,
    uint8_t* rgbOut, int outPitch,
    int width, int height,
    cudaStream_t stream);

// Fused NV12→RGB + top-left letterbox: the unpadW x unpadH scaled image
// is placed at (0,0) of the outW x outH canvas; invScale = 1/scale maps
// output pixels back to source coordinates.
extern "C" void launchNV12ToRGBLetterbox(
    const uint8_t* yPlane, int yPitch,
    const uint8_t* uvPlane, int uvPitch,
    uint8_t* rgbOut, int outPitch,
    int outW, int outH,
    int srcW, int srcH,
    int unpadW, int unpadH,
    float invScale,
    cudaStream_t stream);

// Letterbox variant with an explicit placement offset (padLeft, padTop)
// for centred preprocessing (e.g. SCRFD).
extern "C" void launchNV12ToRGBCenterLetterbox(
    const uint8_t* yPlane, int yPitch,
    const uint8_t* uvPlane, int uvPitch,
    uint8_t* rgbOut, int outPitch,
    int outW, int outH,
    int srcW, int srcH,
    int unpadW, int unpadH,
    float invScale,
    int padLeft, int padTop,
    cudaStream_t stream);

// Inverse-affine warp from full-res NV12 to a small BGR patch
// (face alignment); m00..m12 form the 2x3 output→source mapping.
extern "C" void launchNV12AffineWarpToBGR(
    const uint8_t* yPlane, int yPitch,
    const uint8_t* uvPlane, int uvPitch,
    uint8_t* bgrOut, int outPitch,
    int outW, int outH,
    int srcW, int srcH,
    float m00, float m01, float m02,
    float m10, float m11, float m12,
    cudaStream_t stream);

// Fused NV12→RGB + direct stretch resize (no aspect preservation).
extern "C" void launchNV12ToRGBResize(
    const uint8_t* yPlane, int yPitch,
    const uint8_t* uvPlane, int uvPitch,
    uint8_t* rgbOut, int outPitch,
    int outW, int outH,
    int srcW, int srcH,
    cudaStream_t stream);

// As launchNV12ToRGBResize but writing planar CHW uint8 output
// (contiguous, no pitch) for engines with raw tensor buffers.
extern "C" void launchNV12ToRGBResizeCHW_uint8(
    const uint8_t* yPlane, int yPitch,
    const uint8_t* uvPlane, int uvPitch,
    uint8_t* chwOut,
    int outW, int outH,
    int srcW, int srcH,
    cudaStream_t stream);

// As above but writing planar CHW float32 output.
extern "C" void launchNV12ToRGBResizeCHW_float(
    const uint8_t* yPlane, int yPitch,
    const uint8_t* uvPlane, int uvPitch,
    float* chwOut,
    int outW, int outH,
    int srcW, int srcH,
    cudaStream_t stream);

// Fused NV12→BGR + direct stretch resize (BGR channel order, e.g. OCR).
extern "C" void launchNV12ToBGRResize(
    const uint8_t* yPlane, int yPitch,
    const uint8_t* uvPlane, int uvPitch,
    uint8_t* bgrOut, int outPitch,
    int outW, int outH,
    int srcW, int srcH,
    cudaStream_t stream);

// Crop an outW x outH rectangle at (cropX, cropY) from full-res NV12 and
// convert it to BGR in one pass.
extern "C" void launchNV12CropToBGR(
    const uint8_t* yPlane, int yPitch,
    const uint8_t* uvPlane, int uvPitch,
    uint8_t* bgrOut, int outPitch,
    int outW, int outH,
    int cropX, int cropY,
    int srcW, int srcH,
    cudaStream_t stream);
|
||||
|
||||
namespace ANSCENTER {
|
||||
|
||||
// Destructor — releases the pinned host staging buffer via destroy().
NV12PreprocessHelper::~NV12PreprocessHelper() {
    destroy();
}
|
||||
|
||||
void NV12PreprocessHelper::destroy() {
|
||||
if (m_pinnedBuf) {
|
||||
cudaFreeHost(m_pinnedBuf);
|
||||
m_pinnedBuf = nullptr;
|
||||
m_pinnedBufSize = 0;
|
||||
}
|
||||
}
|
||||
|
||||
// Grow the pinned host staging buffer to at least `bytes`.
//
// Growth-only: a buffer that is already large enough is reused as-is.
// On success m_pinnedBuf/m_pinnedBufSize describe the new buffer; on
// allocation failure both are reset to null/0 and a warning is logged
// once per instance (m_nv12PinnedLogged). Callers must re-check
// m_pinnedBuf after calling.
void NV12PreprocessHelper::ensurePinnedBuffer(size_t bytes, SPDLogger& logger, const char* tag) {
    if (bytes <= m_pinnedBufSize) return;  // current buffer already big enough
    if (m_pinnedBuf) {
        cudaFreeHost(m_pinnedBuf);
        m_pinnedBuf = nullptr;
        m_pinnedBufSize = 0;
    }
    // Portable pinned memory so the buffer is usable from any CUDA context.
    cudaError_t err = cudaHostAlloc(&m_pinnedBuf, bytes, cudaHostAllocPortable);
    if (err != cudaSuccess) {
        // Clear the failure from the runtime's last-error slot so it is not
        // later misattributed to an unrelated call (isCudaContextHealthy()
        // reads cudaGetLastError() as the "pending" error).
        (void)cudaGetLastError();
        if (!m_nv12PinnedLogged) {
            m_nv12PinnedLogged = true;  // warn only once per instance
            logger.LogWarn(std::string(tag) + "::NV12Helper",
                "cudaHostAlloc failed (" + std::string(cudaGetErrorString(err)) +
                ") for " + std::to_string(bytes) + " bytes",
                __FILE__, __LINE__);
        }
        m_pinnedBuf = nullptr;
        m_pinnedBufSize = 0;
        return;
    }
    m_pinnedBufSize = bytes;
}
|
||||
|
||||
// Probe whether the CUDA context is still usable.
//
// Returns true when the default stream answers cudaSuccess or
// cudaErrorNotReady. After CUDA_FAIL_LIMIT consecutive failures the
// context is marked permanently dead (m_cudaContextDead) and GPU
// preprocessing is disabled for this instance; from then on this
// returns false immediately.
bool NV12PreprocessHelper::isCudaContextHealthy(SPDLogger& logger, const char* tag) {
    if (m_cudaContextDead) return false;

    // Deliberately read-and-clear any pending error first so the stream
    // query below reflects the context state, not an earlier sticky error.
    cudaError_t pending = cudaGetLastError();
    // Query the legacy default stream as a cheap liveness check.
    cudaError_t err = cudaStreamQuery(nullptr);
    if (err == cudaSuccess || err == cudaErrorNotReady) {
        m_cudaFailStreak = 0;  // healthy — reset the failure counter
        return true;
    }

    ++m_cudaFailStreak;
    if (m_cudaFailStreak >= CUDA_FAIL_LIMIT && !m_cudaContextDead) {
        m_cudaContextDead = true;
        logger.LogFatal(std::string(tag) + "::NV12Helper",
            "CUDA context permanently corrupted after " +
            std::to_string(m_cudaFailStreak) +
            " consecutive failures (last error: " +
            std::string(cudaGetErrorString(err)) +
            ", pending: " + std::string(cudaGetErrorString(pending)) +
            "). GPU preprocessing DISABLED for this engine instance. "
            "This is typically caused by NVDEC/CUVID surface pool exhaustion "
            "(Error mapping a picture with CUVID: 205). "
            "Reduce the number of concurrent HW-decoded 4K streams or "
            "restart the application to recover.",
            __FILE__, __LINE__);
    }
    return false;
}
|
||||
|
||||
// ====================================================================
// tryNV12 — attempt NV12 fast-path preprocessing
//
// Looks up the current thread's decoded frame and, when possible,
// converts NV12 → RGB at model input resolution entirely on the GPU via
// `launcher`. Fails soft: any early return leaves result.succeeded
// false and the caller falls back to the BGR/CPU path.
//
// Paths, in order:
//   - pixelFormat 1000: frame is already full-res BGR — clone it and
//     return scale factors relative to the display-res inputImage.
//   - pixelFormat 23 (AV_PIX_FMT_NV12), same GPU: zero-copy wrap of the
//     decoder's device pointers.
//   - NV12, different GPU with CPU snapshot: pinned-memory copy + upload.
//   - anything else: fall back to BGR.
// ====================================================================
NV12Result NV12PreprocessHelper::tryNV12(
    const cv::Mat& inputImage, int inferenceGpu,
    int inputW, int inputH,
    const NV12KernelLauncher& launcher,
    SPDLogger& logger, const char* tag) {
    NV12Result result;

    // Skip during warmup phase
    if (m_inferenceCount < NV12_WARMUP_THRESHOLD) {
        return result;
    }

    // Get NV12 frame data from thread-local (set by RunInferenceComplete_LV)
    GpuFrameData* gpuData = tl_currentGpuFrame();

    if (!gpuData || !gpuData->yPlane) {
        return result; // no NV12 data — caller uses BGR
    }

    // ── BGR full-res path (pixFmt=1000 from ANSVideoPlayer) ──────────
    if (gpuData->pixelFormat == 1000) {
        // Wrap the caller-owned BGR buffer without copying (yPlane doubles
        // as the packed BGR pointer in this mode).
        cv::Mat fullResImg(gpuData->height, gpuData->width, CV_8UC3,
                           gpuData->yPlane, gpuData->yLinesize);
        if (!m_bgrFullResLogged) {
            m_bgrFullResLogged = true;  // one-shot diagnostic
            logger.LogInfo(std::string(tag) + "::NV12Helper",
                "BGR full-res ACTIVE: " + std::to_string(gpuData->width) + "x" +
                std::to_string(gpuData->height) + " -> inference (display=" +
                std::to_string(inputImage.cols) + "x" + std::to_string(inputImage.rows) + ")",
                __FILE__, __LINE__);
        }
        result.useBgrFullRes = true;
        result.bgrFullResImg = fullResImg.clone(); // clone under lock
        // Record display/full-res scale factors only when sizes differ.
        if (inputImage.cols > 0 && inputImage.rows > 0 &&
            (fullResImg.cols != inputImage.cols || fullResImg.rows != inputImage.rows)) {
            result.bgrFullResScaleX = static_cast<float>(inputImage.cols) / static_cast<float>(fullResImg.cols);
            result.bgrFullResScaleY = static_cast<float>(inputImage.rows) / static_cast<float>(fullResImg.rows);
        }
        // (No registry lock — data kept alive by refcount)
        return result;
    }

    // ── NV12 path (pixFmt=23 = AV_PIX_FMT_NV12) ────────────────────
    if (gpuData->pixelFormat != 23) {
        return result; // unknown format — fall back to BGR
    }

    if (!gpuData->uvPlane) {
        if (!m_nv12NullLogged) {
            m_nv12NullLogged = true;
            logger.LogWarn(std::string(tag) + "::NV12Helper",
                "Early exit: null uvPlane",
                __FILE__, __LINE__);
        }
        return result;
    }

    const int frameW = gpuData->width;
    const int frameH = gpuData->height;

    if (frameW <= 0 || frameH <= 0) {
        if (!m_nv12DimLogged) {
            m_nv12DimLogged = true;
            logger.LogWarn(std::string(tag) + "::NV12Helper",
                "Early exit: bad dimensions — w=" + std::to_string(frameW) +
                " h=" + std::to_string(frameH),
                __FILE__, __LINE__);
        }
        return result;
    }

    if (m_cudaContextDead) {
        if (!m_nv12DeadLogged) {
            m_nv12DeadLogged = true;
            logger.LogWarn(std::string(tag) + "::NV12Helper",
                "Early exit: CUDA context dead",
                __FILE__, __LINE__);
        }
        return result;
    }

    // Zero-copy is only valid when the decode GPU matches the inference GPU
    // (or the decoder did not report a GPU index).
    const bool isCudaDevice = gpuData->isCudaDevicePtr;
    const bool gpuMatch = !isCudaDevice ||
        gpuData->gpuIndex < 0 ||
        gpuData->gpuIndex == inferenceGpu;
    const bool useZeroCopy = isCudaDevice && gpuMatch;

    // Effective plane pointers — for zero-copy, use CUDA device ptrs;
    // for CPU upload, use the CPU snapshot buffers.
    uint8_t* effYPlane;
    uint8_t* effUvPlane;
    int effYLinesize;
    int effUvLinesize;

    if (useZeroCopy) {
        // Same GPU: wrap NVDEC device pointers directly
        effYPlane = gpuData->yPlane;
        effUvPlane = gpuData->uvPlane;
        effYLinesize = gpuData->yLinesize;
        effUvLinesize = gpuData->uvLinesize;
    } else if (isCudaDevice && !gpuMatch) {
        // Cross-GPU: can't use CUDA device ptrs from another GPU context.
        // Fall through to CPU upload using the CPU snapshot buffers.
        if (gpuData->cpuYPlane && gpuData->cpuUvPlane) {
            effYPlane = gpuData->cpuYPlane;
            effUvPlane = gpuData->cpuUvPlane;
            effYLinesize = gpuData->cpuYLinesize;
            effUvLinesize = gpuData->cpuUvLinesize;
            if (!m_gpuMismatchLogged) {
                m_gpuMismatchLogged = true;
                logger.LogInfo(std::string(tag) + "::NV12Helper",
                    "GPU mismatch (decode GPU " + std::to_string(gpuData->gpuIndex) +
                    " vs inference GPU " + std::to_string(inferenceGpu) +
                    ") — using CPU NV12 upload fallback",
                    __FILE__, __LINE__);
            }
        } else {
            // No CPU fallback available — fall back to BGR
            if (!m_gpuMismatchLogged) {
                m_gpuMismatchLogged = true;
                logger.LogInfo(std::string(tag) + "::NV12Helper",
                    "GPU mismatch (decode GPU " + std::to_string(gpuData->gpuIndex) +
                    " vs inference GPU " + std::to_string(inferenceGpu) +
                    ") — no CPU fallback, using BGR",
                    __FILE__, __LINE__);
            }
            return result;
        }
    } else {
        // Not CUDA device ptrs: use whatever yPlane/uvPlane point to (CPU data)
        effYPlane = gpuData->yPlane;
        effUvPlane = gpuData->uvPlane;
        effYLinesize = gpuData->yLinesize;
        effUvLinesize = gpuData->uvLinesize;
    }

    // Log path selection once
    if (!m_nv12PathLogged) {
        m_nv12PathLogged = true;
        const char* pathName = useZeroCopy ? "CUDA_ZERO_COPY" : "CPU_NV12_UPLOAD";
        logger.LogInfo(std::string(tag) + "::NV12Helper",
            std::string("Path: ") + pathName +
            " | isCuda=" + std::to_string(isCudaDevice) +
            " gpuMatch=" + std::to_string(gpuMatch) +
            " decodeGpu=" + std::to_string(gpuData->gpuIndex) +
            " infGpu=" + std::to_string(inferenceGpu) +
            " frame=" + std::to_string(frameW) + "x" + std::to_string(frameH),
            __FILE__, __LINE__);
    }

    cv::cuda::Stream stream;
    cv::cuda::GpuMat gpuY, gpuUV;

    if (useZeroCopy) {
        // CUDA zero-copy: wrap NVDEC device pointers directly.
        // UV is interleaved, so it is wrapped as a full-width single-channel
        // mat of half height — the kernel interprets the byte pairs.
        gpuY = cv::cuda::GpuMat(frameH, frameW, CV_8UC1,
            effYPlane, static_cast<size_t>(effYLinesize));
        gpuUV = cv::cuda::GpuMat(frameH / 2, frameW, CV_8UC1,
            effUvPlane, static_cast<size_t>(effUvLinesize));
    } else {
        // CPU path: memcpy + upload
        const size_t ySize = static_cast<size_t>(frameW) * frameH;
        const size_t uvSize = static_cast<size_t>(frameW) * frameH / 2;
        const size_t nv12Size = ySize + uvSize;
        ensurePinnedBuffer(nv12Size, logger, tag);
        if (!m_pinnedBuf) {
            return result;  // pinned allocation failed — BGR fallback
        }

        // Validate NV12 plane pointers before memcpy
        const size_t yBufNeeded = (effYLinesize == frameW)
            ? ySize
            : static_cast<size_t>(effYLinesize) * frameH;
        const size_t uvBufNeeded = (effUvLinesize == frameW)
            ? uvSize
            : static_cast<size_t>(effUvLinesize) * (frameH / 2);

        // Probe only the first 4 KB — isMemoryReadable inspects the region
        // containing the pointer; deeper faults are caught by safeMemcpy.
        if (!isMemoryReadable(effYPlane, std::min(yBufNeeded, (size_t)4096)) ||
            !isMemoryReadable(effUvPlane, std::min(uvBufNeeded, (size_t)4096))) {
            logger.LogWarn(std::string(tag) + "::NV12Helper",
                "NV12 plane pointers not readable — falling back to BGR",
                __FILE__, __LINE__);
            return result;
        }

        // Copy Y then UV into one contiguous pinned block, tightly packed
        // (row stride frameW), row-by-row when the source has padding.
        uint8_t* dst = static_cast<uint8_t*>(m_pinnedBuf);
        bool cpyOk = true;
        if (effYLinesize == frameW) {
            cpyOk = safeMemcpy(dst, effYPlane, ySize);
        } else {
            for (int row = 0; row < frameH && cpyOk; row++)
                cpyOk = safeMemcpy(dst + row * frameW,
                    effYPlane + row * effYLinesize, frameW);
        }
        if (cpyOk) {
            uint8_t* uvDst = dst + ySize;
            if (effUvLinesize == frameW) {
                cpyOk = safeMemcpy(uvDst, effUvPlane, uvSize);
            } else {
                for (int row = 0; row < frameH / 2 && cpyOk; row++)
                    cpyOk = safeMemcpy(uvDst + row * frameW,
                        effUvPlane + row * effUvLinesize, frameW);
            }
        }

        if (!cpyOk) {
            logger.LogWarn(std::string(tag) + "::NV12Helper",
                "Access violation during NV12 memcpy — falling back to BGR",
                __FILE__, __LINE__);
            return result;
        }

        // NV12 data safely in pinned memory — release registry lock
        // (No registry lock to release — data kept alive by refcount)

        // Async upload from pinned memory on the per-call stream.
        cv::Mat pinnedY(frameH, frameW, CV_8UC1, m_pinnedBuf);
        cv::Mat pinnedUV(frameH / 2, frameW, CV_8UC1,
            static_cast<uint8_t*>(m_pinnedBuf) + ySize);
        gpuY.upload(pinnedY, stream);
        gpuUV.upload(pinnedUV, stream);
    }

    // Use display dimensions for coordinate mapping
    const float metaW = static_cast<float>(inputImage.cols > 0 ? inputImage.cols : frameW);
    const float metaH = static_cast<float>(inputImage.rows > 0 ? inputImage.rows : frameH);

    if (metaH <= 0 || metaW <= 0) {
        if (!m_nv12MetaLogged) {
            m_nv12MetaLogged = true;
            logger.LogWarn(std::string(tag) + "::NV12Helper",
                "Early exit: metadata dims invalid",
                __FILE__, __LINE__);
        }
        return result;
    }

    result.metaWidth = metaW;
    result.metaHeight = metaH;
    // ratio maps model-input coordinates back to display coordinates
    // (inverse of the letterbox scale).
    result.ratio = 1.f / std::min(
        static_cast<float>(inputW) / metaW,
        static_cast<float>(inputH) / metaH);

    // Launch engine-specific kernel
    cv::cuda::GpuMat gpuResized;
    gpuResized.create(inputH, inputW, CV_8UC3);

    cudaStream_t rawStream = cv::cuda::StreamAccessor::getStream(stream);
    launcher(gpuY, gpuUV, gpuResized, frameW, frameH, inputW, inputH, rawStream);

    // Block until the kernel (and any pending uploads) finish — the NVDEC
    // frame must not be released while the kernel may still read it.
    stream.waitForCompletion();

    // (No registry lock to release — data kept alive by refcount)

    // Log NV12 fast-path usage once per instance
    if (!m_nv12ActiveLogged) {
        m_nv12ActiveLogged = true;
        const char* mode = useZeroCopy ? "CUDA zero-copy" : "CPU upload";
        logger.LogInfo(std::string(tag) + "::NV12Helper",
            std::string(mode) + " ACTIVE: " +
            std::to_string(frameW) + "x" + std::to_string(frameH) +
            " NV12 -> " + std::to_string(inputW) + "x" + std::to_string(inputH) +
            " display=" + std::to_string(static_cast<int>(metaW)) + "x" +
            std::to_string(static_cast<int>(metaH)),
            __FILE__, __LINE__);
    }

    result.succeeded = true;
    result.gpuRGB = std::move(gpuResized);
    return result;
}
|
||||
|
||||
// ====================================================================
|
||||
// Default YOLO kernel launcher
|
||||
//
|
||||
// For detection/seg/pose/OBB: fused NV12→RGB + letterbox in a single
|
||||
// kernel at model input resolution (e.g. 640x640).
|
||||
// For classification: separate NV12→RGB at full res, then cuda::resize.
|
||||
// (Classification detection is not done here — the caller's engine
|
||||
// should set up the appropriate launcher if needed. This default
|
||||
// always uses the fused letterbox path, which works for all YOLO
|
||||
// detection/seg/pose/OBB tasks.)
|
||||
// ====================================================================
|
||||
NV12KernelLauncher NV12PreprocessHelper::defaultYOLOLauncher() {
|
||||
return [](const cv::cuda::GpuMat& gpuY, const cv::cuda::GpuMat& gpuUV,
|
||||
cv::cuda::GpuMat& gpuOut, int srcW, int srcH,
|
||||
int inputW, int inputH, cudaStream_t stream) {
|
||||
if (srcW == inputW && srcH == inputH) {
|
||||
// Source matches model input — direct NV12→RGB, no resize needed
|
||||
launchNV12ToRGB(
|
||||
gpuY.ptr<uint8_t>(), static_cast<int>(gpuY.step),
|
||||
gpuUV.ptr<uint8_t>(), static_cast<int>(gpuUV.step),
|
||||
gpuOut.ptr<uint8_t>(), static_cast<int>(gpuOut.step),
|
||||
srcW, srcH, stream);
|
||||
} else {
|
||||
// Fused NV12→RGB + letterbox resize
|
||||
float r = std::min(static_cast<float>(inputW) / srcW,
|
||||
static_cast<float>(inputH) / srcH);
|
||||
int unpadW = static_cast<int>(r * srcW);
|
||||
int unpadH = static_cast<int>(r * srcH);
|
||||
float invScale = 1.0f / r;
|
||||
|
||||
launchNV12ToRGBLetterbox(
|
||||
gpuY.ptr<uint8_t>(), static_cast<int>(gpuY.step),
|
||||
gpuUV.ptr<uint8_t>(), static_cast<int>(gpuUV.step),
|
||||
gpuOut.ptr<uint8_t>(), static_cast<int>(gpuOut.step),
|
||||
inputW, inputH,
|
||||
srcW, srcH,
|
||||
unpadW, unpadH,
|
||||
invScale, stream);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
// ====================================================================
|
||||
// Classification launcher (direct resize, no letterbox)
|
||||
// ====================================================================
|
||||
NV12KernelLauncher NV12PreprocessHelper::classificationLauncher() {
|
||||
return [](const cv::cuda::GpuMat& gpuY, const cv::cuda::GpuMat& gpuUV,
|
||||
cv::cuda::GpuMat& gpuOut, int srcW, int srcH,
|
||||
int inputW, int inputH, cudaStream_t stream) {
|
||||
if (srcW == inputW && srcH == inputH) {
|
||||
// Source matches model input — direct NV12→RGB, no resize needed
|
||||
launchNV12ToRGB(
|
||||
gpuY.ptr<uint8_t>(), static_cast<int>(gpuY.step),
|
||||
gpuUV.ptr<uint8_t>(), static_cast<int>(gpuUV.step),
|
||||
gpuOut.ptr<uint8_t>(), static_cast<int>(gpuOut.step),
|
||||
srcW, srcH, stream);
|
||||
} else {
|
||||
// Fused NV12→RGB + direct resize (stretch to fill, no padding)
|
||||
launchNV12ToRGBResize(
|
||||
gpuY.ptr<uint8_t>(), static_cast<int>(gpuY.step),
|
||||
gpuUV.ptr<uint8_t>(), static_cast<int>(gpuUV.step),
|
||||
gpuOut.ptr<uint8_t>(), static_cast<int>(gpuOut.step),
|
||||
inputW, inputH,
|
||||
srcW, srcH, stream);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
// ====================================================================
// NV12→BGR fused resize (for OCR detection)
//
// Thin forwarding wrapper around launchNV12ToBGRResize: converts an
// NV12 frame (device pointers devY/devUV, pitches in bytes) to a BGR
// image of outW x outH, asynchronously on `stream`. The caller is
// responsible for synchronizing the stream before reading bgrOut.
// ====================================================================
void NV12PreprocessHelper::nv12ToBGRResize(
    const uint8_t* devY, int yPitch,
    const uint8_t* devUV, int uvPitch,
    uint8_t* bgrOut, int outPitch,
    int outW, int outH, int srcW, int srcH,
    cudaStream_t stream) {
    launchNV12ToBGRResize(devY, yPitch, devUV, uvPitch,
        bgrOut, outPitch, outW, outH, srcW, srcH, stream);
}
|
||||
|
||||
// SCRFD center-padded letterbox launcher
|
||||
// ====================================================================
|
||||
NV12KernelLauncher NV12PreprocessHelper::scrfdCenterLetterboxLauncher(int padLeft, int padTop) {
|
||||
return [padLeft, padTop](const cv::cuda::GpuMat& gpuY, const cv::cuda::GpuMat& gpuUV,
|
||||
cv::cuda::GpuMat& gpuOut, int srcW, int srcH,
|
||||
int inputW, int inputH, cudaStream_t stream) {
|
||||
float r = std::min(static_cast<float>(inputW) / srcW,
|
||||
static_cast<float>(inputH) / srcH);
|
||||
int unpadW = static_cast<int>(r * srcW);
|
||||
int unpadH = static_cast<int>(r * srcH);
|
||||
float invScale = 1.0f / r;
|
||||
|
||||
launchNV12ToRGBCenterLetterbox(
|
||||
gpuY.ptr<uint8_t>(), static_cast<int>(gpuY.step),
|
||||
gpuUV.ptr<uint8_t>(), static_cast<int>(gpuUV.step),
|
||||
gpuOut.ptr<uint8_t>(), static_cast<int>(gpuOut.step),
|
||||
inputW, inputH,
|
||||
srcW, srcH,
|
||||
unpadW, unpadH,
|
||||
invScale,
|
||||
padLeft, padTop,
|
||||
stream);
|
||||
};
|
||||
}
|
||||
|
||||
// ====================================================================
// NV12 affine warp for face alignment
//
// Reads full-res NV12 from the registry, applies inverse affine
// transform via CUDA kernel, outputs a small aligned face in BGR.
// The affine matrix is computed on CPU (from detected landmarks to
// ArcFace template), then scaled to map to full-res NV12 source.
//
// Fails soft (result.succeeded false) during warmup, on an empty
// matrix, missing/invalid NV12 data, dead CUDA context, or cross-GPU
// frames — the caller then uses CPU warpAffine instead. scaleX/scaleY
// map display-resolution landmark coordinates to full-res NV12.
// ====================================================================
NV12PreprocessHelper::NV12AffineResult NV12PreprocessHelper::tryNV12AffineWarp(
    const cv::Mat& inputImage, int inferenceGpu,
    const cv::Mat& affineMatrix, int outW, int outH,
    float scaleX, float scaleY,
    SPDLogger& logger, const char* tag) {
    NV12AffineResult result;

    // Skip during warmup
    if (m_inferenceCount < NV12_WARMUP_THRESHOLD) {
        return result;
    }

    if (affineMatrix.empty()) {
        return result;
    }

    // Get NV12 frame data from thread-local (set by RunInferenceComplete_LV)
    GpuFrameData* gpuData = tl_currentGpuFrame();

    if (!gpuData || !gpuData->yPlane || !gpuData->uvPlane || gpuData->pixelFormat != 23) {
        return result;
    }

    const int frameW = gpuData->width;
    const int frameH = gpuData->height;
    if (frameW <= 0 || frameH <= 0 || m_cudaContextDead) {
        return result;
    }

    const bool isCudaDevice = gpuData->isCudaDevicePtr;
    const bool gpuMatch = !isCudaDevice ||
        gpuData->gpuIndex < 0 ||
        gpuData->gpuIndex == inferenceGpu;

    if (isCudaDevice && !gpuMatch) {
        return result; // cross-GPU — fall back to CPU warpAffine
    }

    const bool useZeroCopy = isCudaDevice && gpuMatch;
    uint8_t* effYPlane = gpuData->yPlane;
    uint8_t* effUvPlane = gpuData->uvPlane;
    int effYLinesize = gpuData->yLinesize;
    int effUvLinesize = gpuData->uvLinesize;

    // Compute inverse affine matrix and scale for full-res NV12.
    // The kernel maps OUTPUT pixels back to SOURCE coordinates, hence
    // the inversion before scaling.
    cv::Mat M_inv;
    cv::invertAffineTransform(affineMatrix, M_inv);
    M_inv.convertTo(M_inv, CV_64F);

    // Scale inverse affine to map from 112x112 output → full-res NV12 source
    // Row 0 maps to X (multiply by scaleX), Row 1 maps to Y (multiply by scaleY)
    double* row0 = M_inv.ptr<double>(0);
    double* row1 = M_inv.ptr<double>(1);
    row0[0] *= scaleX; row0[1] *= scaleX; row0[2] *= scaleX;
    row1[0] *= scaleY; row1[1] *= scaleY; row1[2] *= scaleY;

    // Flatten the 2x3 matrix into the kernel's scalar parameters.
    float m00 = static_cast<float>(row0[0]);
    float m01 = static_cast<float>(row0[1]);
    float m02 = static_cast<float>(row0[2]);
    float m10 = static_cast<float>(row1[0]);
    float m11 = static_cast<float>(row1[1]);
    float m12 = static_cast<float>(row1[2]);

    cv::cuda::Stream stream;
    cv::cuda::GpuMat gpuY, gpuUV;

    if (useZeroCopy) {
        // Wrap NVDEC device pointers directly (no copy).
        gpuY = cv::cuda::GpuMat(frameH, frameW, CV_8UC1,
            effYPlane, static_cast<size_t>(effYLinesize));
        gpuUV = cv::cuda::GpuMat(frameH / 2, frameW, CV_8UC1,
            effUvPlane, static_cast<size_t>(effUvLinesize));
    } else {
        // CPU path: memcpy + upload
        const size_t ySize = static_cast<size_t>(frameW) * frameH;
        const size_t uvSize = static_cast<size_t>(frameW) * frameH / 2;
        const size_t nv12Size = ySize + uvSize;
        ensurePinnedBuffer(nv12Size, logger, tag);
        if (!m_pinnedBuf) {
            return result;
        }

        // Pack Y then UV contiguously (stride frameW), row-by-row when
        // the source rows are padded.
        uint8_t* dst = static_cast<uint8_t*>(m_pinnedBuf);
        bool cpyOk = true;
        if (effYLinesize == frameW) {
            cpyOk = safeMemcpy(dst, effYPlane, ySize);
        } else {
            for (int row = 0; row < frameH && cpyOk; row++)
                cpyOk = safeMemcpy(dst + row * frameW,
                    effYPlane + row * effYLinesize, frameW);
        }
        if (cpyOk) {
            uint8_t* uvDst = dst + ySize;
            if (effUvLinesize == frameW) {
                cpyOk = safeMemcpy(uvDst, effUvPlane, uvSize);
            } else {
                for (int row = 0; row < frameH / 2 && cpyOk; row++)
                    cpyOk = safeMemcpy(uvDst + row * frameW,
                        effUvPlane + row * effUvLinesize, frameW);
            }
        }

        if (!cpyOk) {
            return result;  // access violation mid-copy — soft fail
        }

        // (No registry lock to release — data kept alive by refcount)

        cv::Mat pinnedY(frameH, frameW, CV_8UC1, m_pinnedBuf);
        cv::Mat pinnedUV(frameH / 2, frameW, CV_8UC1,
            static_cast<uint8_t*>(m_pinnedBuf) + ySize);
        gpuY.upload(pinnedY, stream);
        gpuUV.upload(pinnedUV, stream);
    }

    // Launch affine warp kernel (outputs BGR 112x112)
    cv::cuda::GpuMat gpuFace(outH, outW, CV_8UC3);
    cudaStream_t rawStream = cv::cuda::StreamAccessor::getStream(stream);

    launchNV12AffineWarpToBGR(
        gpuY.ptr<uint8_t>(), static_cast<int>(gpuY.step),
        gpuUV.ptr<uint8_t>(), static_cast<int>(gpuUV.step),
        gpuFace.ptr<uint8_t>(), static_cast<int>(gpuFace.step),
        outW, outH,
        frameW, frameH,
        m00, m01, m02, m10, m11, m12,
        rawStream);

    // Synchronize before the source frame can be released by the caller.
    stream.waitForCompletion();

    // (No registry lock to release — data kept alive by refcount)

    // Keep GPU copy for recognizer (avoids re-upload), download CPU copy for mask
    result.gpuAlignedFace = gpuFace;
    gpuFace.download(result.alignedFaceBGR);
    result.succeeded = true;
    return result;
}
|
||||
|
||||
// ── NV12 rectangular crop → BGR ────────────────────────────────────
|
||||
NV12PreprocessHelper::NV12CropResult NV12PreprocessHelper::tryNV12CropToBGR(
|
||||
const cv::Mat& inputImage, int inferenceGpu,
|
||||
const cv::Rect& bbox, int padding,
|
||||
float scaleX, float scaleY,
|
||||
SPDLogger& logger, const char* tag)
|
||||
{
|
||||
NV12CropResult result;
|
||||
|
||||
// Warmup gating
|
||||
if (m_inferenceCount < NV12_WARMUP_THRESHOLD)
|
||||
return result;
|
||||
|
||||
// CUDA health check
|
||||
if (m_cudaContextDead)
|
||||
return result;
|
||||
|
||||
// Get NV12 frame data from thread-local (set by RunInferenceComplete_LV)
|
||||
auto* gpuData = tl_currentGpuFrame();
|
||||
if (!gpuData || gpuData->pixelFormat != 23)
|
||||
return result; // no NV12 data
|
||||
|
||||
if (!gpuData->isCudaDevicePtr || !gpuData->yPlane || !gpuData->uvPlane)
|
||||
return result; // NV12 not on GPU
|
||||
|
||||
const int frameW = gpuData->width;
|
||||
const int frameH = gpuData->height;
|
||||
|
||||
// Scale bbox from display-res to full-res NV12 coords with padding
|
||||
const int fx1 = std::max(0, static_cast<int>((bbox.x - padding) * scaleX));
|
||||
const int fy1 = std::max(0, static_cast<int>((bbox.y - padding) * scaleY));
|
||||
const int fx2 = std::min(frameW, static_cast<int>((bbox.x + bbox.width + padding) * scaleX));
|
||||
const int fy2 = std::min(frameH, static_cast<int>((bbox.y + bbox.height + padding) * scaleY));
|
||||
const int cropW = fx2 - fx1;
|
||||
const int cropH = fy2 - fy1;
|
||||
|
||||
if (cropW < 5 || cropH < 5)
|
||||
return result;
|
||||
|
||||
// GPU device matching
|
||||
int currentDev = 0;
|
||||
cudaGetDevice(¤tDev);
|
||||
if (inferenceGpu >= 0 && currentDev != inferenceGpu)
|
||||
cudaSetDevice(inferenceGpu);
|
||||
|
||||
// Wrap existing GPU NV12 planes
|
||||
cv::cuda::GpuMat gpuY(frameH, frameW, CV_8UC1,
|
||||
gpuData->yPlane, gpuData->yLinesize);
|
||||
cv::cuda::GpuMat gpuUV(frameH / 2, frameW / 2, CV_8UC2,
|
||||
gpuData->uvPlane, gpuData->uvLinesize);
|
||||
|
||||
// Allocate output on GPU
|
||||
cv::cuda::GpuMat gpuCrop(cropH, cropW, CV_8UC3);
|
||||
|
||||
// Launch crop kernel
|
||||
cv::cuda::Stream stream;
|
||||
cudaStream_t rawStream = cv::cuda::StreamAccessor::getStream(stream);
|
||||
|
||||
launchNV12CropToBGR(
|
||||
gpuY.ptr<uint8_t>(), static_cast<int>(gpuY.step),
|
||||
gpuUV.ptr<uint8_t>(), static_cast<int>(gpuUV.step),
|
||||
gpuCrop.ptr<uint8_t>(), static_cast<int>(gpuCrop.step),
|
||||
cropW, cropH,
|
||||
fx1, fy1,
|
||||
frameW, frameH,
|
||||
rawStream);
|
||||
|
||||
stream.waitForCompletion();
|
||||
|
||||
// (No registry lock to release — data kept alive by refcount)
|
||||
|
||||
// Download small crop to CPU
|
||||
gpuCrop.download(result.bgrCrop);
|
||||
result.succeeded = true;
|
||||
|
||||
// One-shot diagnostic log
|
||||
if (!m_nv12CropLogged) {
|
||||
m_nv12CropLogged = true;
|
||||
logger.LogInfo(tag,
|
||||
"NV12 GPU crop ACTIVE: " + std::to_string(cropW) + "x" + std::to_string(cropH) +
|
||||
" BGR cropped from " + std::to_string(frameW) + "x" + std::to_string(frameH) +
|
||||
" NV12 (display=" + std::to_string(inputImage.cols) + "x" + std::to_string(inputImage.rows) +
|
||||
" scaleX=" + std::to_string(scaleX) + " scaleY=" + std::to_string(scaleY) + ")",
|
||||
__FILE__, __LINE__);
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
// ====================================================================
|
||||
// tryNV12DirectToBuffer — write NV12→RGB CHW directly into TRT buffer
|
||||
//
|
||||
// For engines that manage their own GPU buffers (e.g. SAM3 with raw
|
||||
// TRT enqueueV3). Same registry lookup / GPU matching / safety logic
|
||||
// as tryNV12(), but writes CHW planar output into a pre-allocated
|
||||
// void* GPU buffer instead of allocating an HWC GpuMat.
|
||||
// ====================================================================
|
||||
NV12PreprocessHelper::NV12DirectResult NV12PreprocessHelper::tryNV12DirectToBuffer(
    const cv::Mat& inputImage, int inferenceGpu,
    void* dstGpuBuffer, int inputW, int inputH,
    bool isFloat32,
    cudaStream_t stream,
    SPDLogger& logger, const char* tag) {
    NV12DirectResult result;

    // Skip during warmup phase: early frames always take the caller's CPU
    // path before the GPU fast path is enabled.
    if (m_inferenceCount < NV12_WARMUP_THRESHOLD) {
        return result;
    }

    if (!dstGpuBuffer) return result;

    // Get NV12 frame data from thread-local (set by RunInferenceComplete_LV)
    GpuFrameData* gpuData = tl_currentGpuFrame();

    if (!gpuData || !gpuData->yPlane) {
        return result; // no NV12 data — caller uses CPU path
    }

    // BGR full-res path not applicable for direct-to-buffer (SAM3 handles its own resize)
    if (gpuData->pixelFormat == 1000) {
        return result;
    }

    // NV12 path only
    if (gpuData->pixelFormat != 23) {
        return result;
    }

    if (!gpuData->uvPlane) {
        return result;
    }

    const int frameW = gpuData->width;
    const int frameH = gpuData->height;

    if (frameW <= 0 || frameH <= 0 || m_cudaContextDead) {
        return result;
    }

    // Zero-copy is only safe when the planes are CUDA device pointers on the
    // same GPU the inference engine runs on (or the producer left the GPU
    // index untagged, i.e. gpuIndex < 0).
    const bool isCudaDevice = gpuData->isCudaDevicePtr;
    const bool gpuMatch = !isCudaDevice ||
                          gpuData->gpuIndex < 0 ||
                          gpuData->gpuIndex == inferenceGpu;
    const bool useZeroCopy = isCudaDevice && gpuMatch;

    if (isCudaDevice && !gpuMatch) {
        return result; // cross-GPU — fall back to CPU path
    }

    uint8_t* effYPlane = gpuData->yPlane;
    uint8_t* effUvPlane = gpuData->uvPlane;
    int effYLinesize = gpuData->yLinesize;
    int effUvLinesize = gpuData->uvLinesize;

    // Log path selection once
    if (!m_nv12PathLogged) {
        m_nv12PathLogged = true;
        const char* pathName = useZeroCopy ? "CUDA_ZERO_COPY" : "CPU_NV12_UPLOAD";
        logger.LogInfo(std::string(tag) + "::NV12Helper",
            std::string("Path: ") + pathName +
            " | isCuda=" + std::to_string(isCudaDevice) +
            " gpuMatch=" + std::to_string(gpuMatch) +
            " frame=" + std::to_string(frameW) + "x" + std::to_string(frameH) +
            " (direct CHW " + (isFloat32 ? "float32" : "uint8") + ")",
            __FILE__, __LINE__);
    }

    cv::cuda::GpuMat gpuY, gpuUV;

    if (useZeroCopy) {
        // Wrap the decoder's device planes without copying. NV12 layout:
        // full-resolution Y plane plus interleaved UV at half vertical size.
        gpuY = cv::cuda::GpuMat(frameH, frameW, CV_8UC1,
                                effYPlane, static_cast<size_t>(effYLinesize));
        gpuUV = cv::cuda::GpuMat(frameH / 2, frameW, CV_8UC1,
                                 effUvPlane, static_cast<size_t>(effUvLinesize));
    } else {
        // CPU path: memcpy NV12 to pinned buffer, upload
        const size_t ySize = static_cast<size_t>(frameW) * frameH;
        const size_t uvSize = static_cast<size_t>(frameW) * frameH / 2;
        const size_t nv12Size = ySize + uvSize;
        ensurePinnedBuffer(nv12Size, logger, tag);
        if (!m_pinnedBuf) return result;

        // Probe only the first page of each plane (cheap, non-crashing);
        // safeMemcpy below still SEH-guards the full copy on Windows.
        if (!isMemoryReadable(effYPlane, std::min(static_cast<size_t>(effYLinesize) * frameH, (size_t)4096)) ||
            !isMemoryReadable(effUvPlane, std::min(static_cast<size_t>(effUvLinesize) * (frameH / 2), (size_t)4096))) {
            return result;
        }

        uint8_t* dst = static_cast<uint8_t*>(m_pinnedBuf);
        bool cpyOk = true;
        // Y plane: one bulk memcpy when rows are tightly packed, otherwise
        // row-by-row to strip the linesize padding.
        if (effYLinesize == frameW) {
            cpyOk = safeMemcpy(dst, effYPlane, ySize);
        } else {
            for (int row = 0; row < frameH && cpyOk; row++)
                cpyOk = safeMemcpy(dst + row * frameW,
                                   effYPlane + row * effYLinesize, frameW);
        }
        if (cpyOk) {
            uint8_t* uvDst = dst + ySize;
            if (effUvLinesize == frameW) {
                cpyOk = safeMemcpy(uvDst, effUvPlane, uvSize);
            } else {
                for (int row = 0; row < frameH / 2 && cpyOk; row++)
                    cpyOk = safeMemcpy(uvDst + row * frameW,
                                       effUvPlane + row * effUvLinesize, frameW);
            }
        }
        if (!cpyOk) return result;

        // (No registry lock to release — data kept alive by refcount)

        // Async upload from the pinned staging buffer; ordered before the
        // kernel because both use the same stream.
        cv::cuda::Stream cvStream = cv::cuda::StreamAccessor::wrapStream(stream);
        cv::Mat pinnedY(frameH, frameW, CV_8UC1, m_pinnedBuf);
        cv::Mat pinnedUV(frameH / 2, frameW, CV_8UC1,
                         static_cast<uint8_t*>(m_pinnedBuf) + ySize);
        gpuY.upload(pinnedY, cvStream);
        gpuUV.upload(pinnedUV, cvStream);
    }

    // Launch CHW kernel directly into TRT buffer
    if (isFloat32) {
        launchNV12ToRGBResizeCHW_float(
            gpuY.ptr<uint8_t>(), static_cast<int>(gpuY.step),
            gpuUV.ptr<uint8_t>(), static_cast<int>(gpuUV.step),
            static_cast<float*>(dstGpuBuffer),
            inputW, inputH, frameW, frameH, stream);
    } else {
        launchNV12ToRGBResizeCHW_uint8(
            gpuY.ptr<uint8_t>(), static_cast<int>(gpuY.step),
            gpuUV.ptr<uint8_t>(), static_cast<int>(gpuUV.step),
            static_cast<uint8_t*>(dstGpuBuffer),
            inputW, inputH, frameW, frameH, stream);
    }

    // FIX: surface both launch-configuration errors (cudaGetLastError) and
    // asynchronous execution faults (cudaStreamSynchronize return value)
    // before reporting success. Previously a failed kernel still returned
    // succeeded=true, handing TRT an unwritten/garbage input buffer.
    const cudaError_t launchErr = cudaGetLastError();
    const cudaError_t syncErr = cudaStreamSynchronize(stream);
    if (launchErr != cudaSuccess || syncErr != cudaSuccess) {
        const cudaError_t err = (launchErr != cudaSuccess) ? launchErr : syncErr;
        logger.LogInfo(std::string(tag) + "::NV12Helper",
            std::string("NV12 direct CHW path failed: ") +
            cudaGetErrorString(err) + " - falling back to CPU path",
            __FILE__, __LINE__);
        // NOTE(review): a sticky context error (e.g. illegal address) may
        // warrant setting m_cudaContextDead here — confirm against the
        // member's other write sites before doing so.
        return result; // succeeded stays false — caller uses CPU path
    }

    // (No registry lock to release — data kept alive by refcount)

    // Log activation once
    if (!m_nv12ActiveLogged) {
        m_nv12ActiveLogged = true;
        const char* mode = useZeroCopy ? "CUDA zero-copy" : "CPU upload";
        logger.LogInfo(std::string(tag) + "::NV12Helper",
            std::string(mode) + " ACTIVE: " +
            std::to_string(frameW) + "x" + std::to_string(frameH) +
            " NV12 -> " + std::to_string(inputW) + "x" + std::to_string(inputH) +
            " CHW " + (isFloat32 ? "float32" : "uint8"),
            __FILE__, __LINE__);
    }

    result.succeeded = true;
    // Metadata for coordinate mapping: prefer the display size carried by
    // inputImage, fall back to the decoded frame size when it is empty.
    result.metaWidth = static_cast<float>(inputImage.cols > 0 ? inputImage.cols : frameW);
    result.metaHeight = static_cast<float>(inputImage.rows > 0 ? inputImage.rows : frameH);
    return result;
}
|
||||
|
||||
} // namespace ANSCENTER
|
||||
Reference in New Issue
Block a user