770 lines
28 KiB
Plaintext
770 lines
28 KiB
Plaintext
// nv12_to_rgb.cu — CUDA kernels for NV12→RGB and NV12→BGR conversion.
|
||
//
|
||
// Two use cases:
|
||
// 1. launchNV12ToRGB() — inference fast path (ANSRTYOLO uploads NV12 directly)
|
||
// 2. ANSGpuNV12ToBGR() — video_player display path (replaces slow CPU cvtColorTwoPlane)
|
||
//
|
||
// cv::cuda::cvtColor does NOT support NV12, so we use custom kernels.
|
||
|
||
#include <cuda_runtime.h>
|
||
#include <cstdint>
|
||
#ifdef _WIN32
|
||
#include <windows.h> // Sleep()
|
||
#endif
|
||
#include <cstdio>
|
||
|
||
// ── Shared YUV→RGB computation ───────────────────────────────────────────
|
||
|
||
__device__ __forceinline__ void nv12_yuv_to_rgb(
|
||
float Y, float U, float V,
|
||
uint8_t& r, uint8_t& g, uint8_t& b)
|
||
{
|
||
// BT.601 conversion (same as FFmpeg swscale / OpenCV cvtColorTwoPlane)
|
||
const float R = Y + 1.402f * V;
|
||
const float G = Y - 0.344136f * U - 0.714136f * V;
|
||
const float B = Y + 1.772f * U;
|
||
r = static_cast<uint8_t>(fminf(fmaxf(R, 0.f), 255.f));
|
||
g = static_cast<uint8_t>(fminf(fmaxf(G, 0.f), 255.f));
|
||
b = static_cast<uint8_t>(fminf(fmaxf(B, 0.f), 255.f));
|
||
}
|
||
|
||
__device__ __forceinline__ void nv12_read_yuv(
|
||
const uint8_t* __restrict__ yPlane, int yPitch,
|
||
const uint8_t* __restrict__ uvPlane, int uvPitch,
|
||
int x, int y,
|
||
float& Y, float& U, float& V)
|
||
{
|
||
Y = static_cast<float>(yPlane[y * yPitch + x]);
|
||
const int uvIdx = (y >> 1) * uvPitch + (x & ~1);
|
||
U = static_cast<float>(uvPlane[uvIdx]) - 128.f;
|
||
V = static_cast<float>(uvPlane[uvIdx + 1]) - 128.f;
|
||
}
|
||
|
||
// ── NV12→RGB kernel (for inference fast path) ────────────────────────────
|
||
|
||
__global__ void nv12ToRgbKernel(
|
||
const uint8_t* __restrict__ yPlane, int yPitch,
|
||
const uint8_t* __restrict__ uvPlane, int uvPitch,
|
||
uint8_t* __restrict__ rgbOut, int outPitch,
|
||
int width, int height)
|
||
{
|
||
const int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||
const int y = blockIdx.y * blockDim.y + threadIdx.y;
|
||
if (x >= width || y >= height) return;
|
||
|
||
float Y, U, V;
|
||
nv12_read_yuv(yPlane, yPitch, uvPlane, uvPitch, x, y, Y, U, V);
|
||
|
||
uint8_t r, g, b;
|
||
nv12_yuv_to_rgb(Y, U, V, r, g, b);
|
||
|
||
const int outIdx = y * outPitch + x * 3;
|
||
rgbOut[outIdx] = r;
|
||
rgbOut[outIdx + 1] = g;
|
||
rgbOut[outIdx + 2] = b;
|
||
}
|
||
|
||
// ── NV12→BGR kernel (for LabVIEW display — matches OpenCV channel order) ─
|
||
|
||
__global__ void nv12ToBgrKernel(
|
||
const uint8_t* __restrict__ yPlane, int yPitch,
|
||
const uint8_t* __restrict__ uvPlane, int uvPitch,
|
||
uint8_t* __restrict__ bgrOut, int outPitch,
|
||
int width, int height)
|
||
{
|
||
const int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||
const int y = blockIdx.y * blockDim.y + threadIdx.y;
|
||
if (x >= width || y >= height) return;
|
||
|
||
float Y, U, V;
|
||
nv12_read_yuv(yPlane, yPitch, uvPlane, uvPitch, x, y, Y, U, V);
|
||
|
||
uint8_t r, g, b;
|
||
nv12_yuv_to_rgb(Y, U, V, r, g, b);
|
||
|
||
const int outIdx = y * outPitch + x * 3;
|
||
bgrOut[outIdx] = b; // BGR order for OpenCV
|
||
bgrOut[outIdx + 1] = g;
|
||
bgrOut[outIdx + 2] = r;
|
||
}
|
||
|
||
// ── Fused NV12→RGB + letterbox resize kernel ────────────────────────────
|
||
// Runs at OUTPUT resolution (e.g. 640×640), sampling from 4K NV12 source
|
||
// with bilinear interpolation. This avoids the 24MB 4K RGB intermediate
|
||
// and processes 20× fewer pixels than separate convert + resize.
|
||
|
||
__device__ __forceinline__ void nv12_read_yuv_bilinear(
|
||
const uint8_t* __restrict__ yPlane, int yPitch,
|
||
const uint8_t* __restrict__ uvPlane, int uvPitch,
|
||
float sx, float sy, int srcW, int srcH,
|
||
float& Y, float& U, float& V)
|
||
{
|
||
// Bilinear interpolation for Y plane
|
||
int x0 = static_cast<int>(sx);
|
||
int y0 = static_cast<int>(sy);
|
||
int x1 = min(x0 + 1, srcW - 1);
|
||
int y1 = min(y0 + 1, srcH - 1);
|
||
x0 = max(x0, 0);
|
||
y0 = max(y0, 0);
|
||
float fx = sx - static_cast<float>(x0);
|
||
float fy = sy - static_cast<float>(y0);
|
||
|
||
// Bilinear Y
|
||
float y00 = static_cast<float>(yPlane[y0 * yPitch + x0]);
|
||
float y01 = static_cast<float>(yPlane[y0 * yPitch + x1]);
|
||
float y10 = static_cast<float>(yPlane[y1 * yPitch + x0]);
|
||
float y11 = static_cast<float>(yPlane[y1 * yPitch + x1]);
|
||
Y = y00 * (1.f - fx) * (1.f - fy) + y01 * fx * (1.f - fy)
|
||
+ y10 * (1.f - fx) * fy + y11 * fx * fy;
|
||
|
||
// UV: NV12 has interleaved U/V at half resolution
|
||
// Sample at corresponding chroma position
|
||
float csx = sx * 0.5f;
|
||
float csy = sy * 0.5f;
|
||
int cx0 = static_cast<int>(csx);
|
||
int cy0 = static_cast<int>(csy);
|
||
int cx1 = min(cx0 + 1, srcW / 2 - 1);
|
||
int cy1 = min(cy0 + 1, srcH / 2 - 1);
|
||
cx0 = max(cx0, 0);
|
||
cy0 = max(cy0, 0);
|
||
float cfx = csx - static_cast<float>(cx0);
|
||
float cfy = csy - static_cast<float>(cy0);
|
||
|
||
// Bilinear U
|
||
float u00 = static_cast<float>(uvPlane[cy0 * uvPitch + cx0 * 2]);
|
||
float u01 = static_cast<float>(uvPlane[cy0 * uvPitch + cx1 * 2]);
|
||
float u10 = static_cast<float>(uvPlane[cy1 * uvPitch + cx0 * 2]);
|
||
float u11 = static_cast<float>(uvPlane[cy1 * uvPitch + cx1 * 2]);
|
||
U = (u00 * (1.f - cfx) * (1.f - cfy) + u01 * cfx * (1.f - cfy)
|
||
+ u10 * (1.f - cfx) * cfy + u11 * cfx * cfy) - 128.f;
|
||
|
||
// Bilinear V
|
||
float v00 = static_cast<float>(uvPlane[cy0 * uvPitch + cx0 * 2 + 1]);
|
||
float v01 = static_cast<float>(uvPlane[cy0 * uvPitch + cx1 * 2 + 1]);
|
||
float v10 = static_cast<float>(uvPlane[cy1 * uvPitch + cx0 * 2 + 1]);
|
||
float v11 = static_cast<float>(uvPlane[cy1 * uvPitch + cx1 * 2 + 1]);
|
||
V = (v00 * (1.f - cfx) * (1.f - cfy) + v01 * cfx * (1.f - cfy)
|
||
+ v10 * (1.f - cfx) * cfy + v11 * cfx * cfy) - 128.f;
|
||
}
|
||
|
||
__global__ void nv12ToRgbLetterboxKernel(
|
||
const uint8_t* __restrict__ yPlane, int yPitch,
|
||
const uint8_t* __restrict__ uvPlane, int uvPitch,
|
||
uint8_t* __restrict__ rgbOut, int outPitch,
|
||
int outW, int outH, // model input size (e.g. 640×640)
|
||
int srcW, int srcH, // NV12 source size (e.g. 3840×2160)
|
||
int unpadW, int unpadH, // letterboxed content size within output
|
||
float invScale) // 1/scale = srcDim/unpadDim
|
||
{
|
||
const int ox = blockIdx.x * blockDim.x + threadIdx.x;
|
||
const int oy = blockIdx.y * blockDim.y + threadIdx.y;
|
||
if (ox >= outW || oy >= outH) return;
|
||
|
||
const int outIdx = oy * outPitch + ox * 3;
|
||
|
||
// Letterbox padding: black for pixels outside content area
|
||
if (ox >= unpadW || oy >= unpadH) {
|
||
rgbOut[outIdx] = 0;
|
||
rgbOut[outIdx + 1] = 0;
|
||
rgbOut[outIdx + 2] = 0;
|
||
return;
|
||
}
|
||
|
||
// Map output pixel to source NV12 coordinates
|
||
float sx = (static_cast<float>(ox) + 0.5f) * invScale - 0.5f;
|
||
float sy = (static_cast<float>(oy) + 0.5f) * invScale - 0.5f;
|
||
sx = fmaxf(sx, 0.f);
|
||
sy = fmaxf(sy, 0.f);
|
||
|
||
float Y, U, V;
|
||
nv12_read_yuv_bilinear(yPlane, yPitch, uvPlane, uvPitch,
|
||
sx, sy, srcW, srcH, Y, U, V);
|
||
|
||
uint8_t r, g, b;
|
||
nv12_yuv_to_rgb(Y, U, V, r, g, b);
|
||
|
||
rgbOut[outIdx] = r;
|
||
rgbOut[outIdx + 1] = g;
|
||
rgbOut[outIdx + 2] = b;
|
||
}
|
||
|
||
// ── Host-callable wrappers ───────────────────────────────────────────────
|
||
|
||
extern "C" void launchNV12ToRGBLetterbox(
|
||
const uint8_t* yPlane, int yPitch,
|
||
const uint8_t* uvPlane, int uvPitch,
|
||
uint8_t* rgbOut, int outPitch,
|
||
int outW, int outH,
|
||
int srcW, int srcH,
|
||
int unpadW, int unpadH,
|
||
float invScale,
|
||
cudaStream_t stream)
|
||
{
|
||
dim3 block(32, 8);
|
||
dim3 grid((outW + block.x - 1) / block.x,
|
||
(outH + block.y - 1) / block.y);
|
||
nv12ToRgbLetterboxKernel<<<grid, block, 0, stream>>>(
|
||
yPlane, yPitch, uvPlane, uvPitch,
|
||
rgbOut, outPitch, outW, outH,
|
||
srcW, srcH, unpadW, unpadH, invScale);
|
||
}
|
||
|
||
extern "C" void launchNV12ToRGB(
|
||
const uint8_t* yPlane, int yPitch,
|
||
const uint8_t* uvPlane, int uvPitch,
|
||
uint8_t* rgbOut, int outPitch,
|
||
int width, int height,
|
||
cudaStream_t stream)
|
||
{
|
||
dim3 block(32, 8);
|
||
dim3 grid((width + block.x - 1) / block.x,
|
||
(height + block.y - 1) / block.y);
|
||
nv12ToRgbKernel<<<grid, block, 0, stream>>>(
|
||
yPlane, yPitch, uvPlane, uvPitch,
|
||
rgbOut, outPitch, width, height);
|
||
}
|
||
|
||
// ── Fused NV12→RGB + direct resize kernel (for classification) ───────────
|
||
// Like letterbox kernel but stretches to fill the entire output (no padding).
|
||
// Used for classification models that expect direct resize, not aspect-
|
||
// preserving letterbox.
|
||
|
||
__global__ void nv12ToRgbResizeKernel(
|
||
const uint8_t* __restrict__ yPlane, int yPitch,
|
||
const uint8_t* __restrict__ uvPlane, int uvPitch,
|
||
uint8_t* __restrict__ rgbOut, int outPitch,
|
||
int outW, int outH, // model input size (e.g. 224×224)
|
||
int srcW, int srcH) // NV12 source size (e.g. 2688×1520)
|
||
{
|
||
const int ox = blockIdx.x * blockDim.x + threadIdx.x;
|
||
const int oy = blockIdx.y * blockDim.y + threadIdx.y;
|
||
if (ox >= outW || oy >= outH) return;
|
||
|
||
// Map output pixel to source NV12 coordinates (stretch to fill)
|
||
float sx = (static_cast<float>(ox) + 0.5f) * static_cast<float>(srcW) / static_cast<float>(outW) - 0.5f;
|
||
float sy = (static_cast<float>(oy) + 0.5f) * static_cast<float>(srcH) / static_cast<float>(outH) - 0.5f;
|
||
sx = fmaxf(sx, 0.f);
|
||
sy = fmaxf(sy, 0.f);
|
||
|
||
float Y, U, V;
|
||
nv12_read_yuv_bilinear(yPlane, yPitch, uvPlane, uvPitch,
|
||
sx, sy, srcW, srcH, Y, U, V);
|
||
|
||
uint8_t r, g, b;
|
||
nv12_yuv_to_rgb(Y, U, V, r, g, b);
|
||
|
||
const int outIdx = oy * outPitch + ox * 3;
|
||
rgbOut[outIdx] = r;
|
||
rgbOut[outIdx + 1] = g;
|
||
rgbOut[outIdx + 2] = b;
|
||
}
|
||
|
||
extern "C" void launchNV12ToRGBResize(
|
||
const uint8_t* yPlane, int yPitch,
|
||
const uint8_t* uvPlane, int uvPitch,
|
||
uint8_t* rgbOut, int outPitch,
|
||
int outW, int outH,
|
||
int srcW, int srcH,
|
||
cudaStream_t stream)
|
||
{
|
||
dim3 block(32, 8);
|
||
dim3 grid((outW + block.x - 1) / block.x,
|
||
(outH + block.y - 1) / block.y);
|
||
nv12ToRgbResizeKernel<<<grid, block, 0, stream>>>(
|
||
yPlane, yPitch, uvPlane, uvPitch,
|
||
rgbOut, outPitch, outW, outH,
|
||
srcW, srcH);
|
||
}
|
||
|
||
// ── Fused NV12→RGB + resize + CHW planar output (for SAM3 image encoder) ─
|
||
// Writes R, G, B to separate contiguous planes: [R plane][G plane][B plane]
|
||
// at model input resolution. Eliminates cv::resize + cvtColor + split + memcpy.
|
||
|
||
__global__ void nv12ToRgbResizeCHW_uint8Kernel(
|
||
const uint8_t* __restrict__ yPlane, int yPitch,
|
||
const uint8_t* __restrict__ uvPlane, int uvPitch,
|
||
uint8_t* __restrict__ chwOut, // contiguous CHW: R plane, G plane, B plane
|
||
int outW, int outH,
|
||
int srcW, int srcH)
|
||
{
|
||
const int ox = blockIdx.x * blockDim.x + threadIdx.x;
|
||
const int oy = blockIdx.y * blockDim.y + threadIdx.y;
|
||
if (ox >= outW || oy >= outH) return;
|
||
|
||
float sx = (static_cast<float>(ox) + 0.5f) * static_cast<float>(srcW) / static_cast<float>(outW) - 0.5f;
|
||
float sy = (static_cast<float>(oy) + 0.5f) * static_cast<float>(srcH) / static_cast<float>(outH) - 0.5f;
|
||
sx = fmaxf(sx, 0.f);
|
||
sy = fmaxf(sy, 0.f);
|
||
|
||
float Y, U, V;
|
||
nv12_read_yuv_bilinear(yPlane, yPitch, uvPlane, uvPitch,
|
||
sx, sy, srcW, srcH, Y, U, V);
|
||
|
||
uint8_t r, g, b;
|
||
nv12_yuv_to_rgb(Y, U, V, r, g, b);
|
||
|
||
const int planeSize = outW * outH;
|
||
const int idx = oy * outW + ox;
|
||
chwOut[idx] = r; // R plane
|
||
chwOut[idx + planeSize] = g; // G plane
|
||
chwOut[idx + 2 * planeSize] = b; // B plane
|
||
}
|
||
|
||
__global__ void nv12ToRgbResizeCHW_floatKernel(
|
||
const uint8_t* __restrict__ yPlane, int yPitch,
|
||
const uint8_t* __restrict__ uvPlane, int uvPitch,
|
||
float* __restrict__ chwOut, // contiguous CHW float32 [0, 255]
|
||
int outW, int outH,
|
||
int srcW, int srcH)
|
||
{
|
||
const int ox = blockIdx.x * blockDim.x + threadIdx.x;
|
||
const int oy = blockIdx.y * blockDim.y + threadIdx.y;
|
||
if (ox >= outW || oy >= outH) return;
|
||
|
||
float sx = (static_cast<float>(ox) + 0.5f) * static_cast<float>(srcW) / static_cast<float>(outW) - 0.5f;
|
||
float sy = (static_cast<float>(oy) + 0.5f) * static_cast<float>(srcH) / static_cast<float>(outH) - 0.5f;
|
||
sx = fmaxf(sx, 0.f);
|
||
sy = fmaxf(sy, 0.f);
|
||
|
||
float Y, U, V;
|
||
nv12_read_yuv_bilinear(yPlane, yPitch, uvPlane, uvPitch,
|
||
sx, sy, srcW, srcH, Y, U, V);
|
||
|
||
uint8_t r, g, b;
|
||
nv12_yuv_to_rgb(Y, U, V, r, g, b);
|
||
|
||
const int planeSize = outW * outH;
|
||
const int idx = oy * outW + ox;
|
||
chwOut[idx] = static_cast<float>(r);
|
||
chwOut[idx + planeSize] = static_cast<float>(g);
|
||
chwOut[idx + 2 * planeSize] = static_cast<float>(b);
|
||
}
|
||
|
||
extern "C" void launchNV12ToRGBResizeCHW_uint8(
|
||
const uint8_t* yPlane, int yPitch,
|
||
const uint8_t* uvPlane, int uvPitch,
|
||
uint8_t* chwOut,
|
||
int outW, int outH,
|
||
int srcW, int srcH,
|
||
cudaStream_t stream)
|
||
{
|
||
dim3 block(32, 8);
|
||
dim3 grid((outW + block.x - 1) / block.x,
|
||
(outH + block.y - 1) / block.y);
|
||
nv12ToRgbResizeCHW_uint8Kernel<<<grid, block, 0, stream>>>(
|
||
yPlane, yPitch, uvPlane, uvPitch,
|
||
chwOut, outW, outH, srcW, srcH);
|
||
}
|
||
|
||
extern "C" void launchNV12ToRGBResizeCHW_float(
|
||
const uint8_t* yPlane, int yPitch,
|
||
const uint8_t* uvPlane, int uvPitch,
|
||
float* chwOut,
|
||
int outW, int outH,
|
||
int srcW, int srcH,
|
||
cudaStream_t stream)
|
||
{
|
||
dim3 block(32, 8);
|
||
dim3 grid((outW + block.x - 1) / block.x,
|
||
(outH + block.y - 1) / block.y);
|
||
nv12ToRgbResizeCHW_floatKernel<<<grid, block, 0, stream>>>(
|
||
yPlane, yPitch, uvPlane, uvPitch,
|
||
chwOut, outW, outH, srcW, srcH);
|
||
}
|
||
|
||
extern "C" void launchNV12ToBGR(
|
||
const uint8_t* yPlane, int yPitch,
|
||
const uint8_t* uvPlane, int uvPitch,
|
||
uint8_t* bgrOut, int outPitch,
|
||
int width, int height,
|
||
cudaStream_t stream)
|
||
{
|
||
dim3 block(32, 8);
|
||
dim3 grid((width + block.x - 1) / block.x,
|
||
(height + block.y - 1) / block.y);
|
||
nv12ToBgrKernel<<<grid, block, 0, stream>>>(
|
||
yPlane, yPitch, uvPlane, uvPitch,
|
||
bgrOut, outPitch, width, height);
|
||
}
|
||
|
||
// ── Fused NV12→BGR + bilinear resize kernel (for OCR detection) ──────────
|
||
// Same algorithm as nv12ToRgbResizeKernel but outputs BGR channel order.
|
||
__global__ void nv12ToBgrResizeKernel(
|
||
const uint8_t* __restrict__ yPlane, int yPitch,
|
||
const uint8_t* __restrict__ uvPlane, int uvPitch,
|
||
uint8_t* __restrict__ bgrOut, int outPitch,
|
||
int outW, int outH,
|
||
int srcW, int srcH)
|
||
{
|
||
const int ox = blockIdx.x * blockDim.x + threadIdx.x;
|
||
const int oy = blockIdx.y * blockDim.y + threadIdx.y;
|
||
if (ox >= outW || oy >= outH) return;
|
||
|
||
float sx = (static_cast<float>(ox) + 0.5f) * static_cast<float>(srcW) / static_cast<float>(outW) - 0.5f;
|
||
float sy = (static_cast<float>(oy) + 0.5f) * static_cast<float>(srcH) / static_cast<float>(outH) - 0.5f;
|
||
sx = fmaxf(sx, 0.f);
|
||
sy = fmaxf(sy, 0.f);
|
||
|
||
float Y, U, V;
|
||
nv12_read_yuv_bilinear(yPlane, yPitch, uvPlane, uvPitch,
|
||
sx, sy, srcW, srcH, Y, U, V);
|
||
|
||
uint8_t r, g, b;
|
||
nv12_yuv_to_rgb(Y, U, V, r, g, b);
|
||
|
||
const int outIdx = oy * outPitch + ox * 3;
|
||
bgrOut[outIdx] = b;
|
||
bgrOut[outIdx + 1] = g;
|
||
bgrOut[outIdx + 2] = r;
|
||
}
|
||
|
||
extern "C" void launchNV12ToBGRResize(
|
||
const uint8_t* yPlane, int yPitch,
|
||
const uint8_t* uvPlane, int uvPitch,
|
||
uint8_t* bgrOut, int outPitch,
|
||
int outW, int outH,
|
||
int srcW, int srcH,
|
||
cudaStream_t stream)
|
||
{
|
||
dim3 block(32, 8);
|
||
dim3 grid((outW + block.x - 1) / block.x,
|
||
(outH + block.y - 1) / block.y);
|
||
nv12ToBgrResizeKernel<<<grid, block, 0, stream>>>(
|
||
yPlane, yPitch, uvPlane, uvPitch,
|
||
bgrOut, outPitch, outW, outH, srcW, srcH);
|
||
}
|
||
|
||
// ── Center-padded NV12→RGB + letterbox resize kernel (for SCRFD) ─────────
|
||
// Same as nv12ToRgbLetterboxKernel but content is centered with (padLeft, padTop)
|
||
// offset instead of placed at top-left origin. Used by SCRFD face detector
|
||
// which uses center-padding (dw = pad_w/2, dh = pad_h/2).
|
||
|
||
__global__ void nv12ToRgbCenterLetterboxKernel(
|
||
const uint8_t* __restrict__ yPlane, int yPitch,
|
||
const uint8_t* __restrict__ uvPlane, int uvPitch,
|
||
uint8_t* __restrict__ rgbOut, int outPitch,
|
||
int outW, int outH,
|
||
int srcW, int srcH,
|
||
int unpadW, int unpadH,
|
||
float invScale,
|
||
int padLeft, int padTop)
|
||
{
|
||
const int ox = blockIdx.x * blockDim.x + threadIdx.x;
|
||
const int oy = blockIdx.y * blockDim.y + threadIdx.y;
|
||
if (ox >= outW || oy >= outH) return;
|
||
|
||
const int outIdx = oy * outPitch + ox * 3;
|
||
|
||
// Center-padding: black for pixels outside the content area
|
||
const int cx = ox - padLeft;
|
||
const int cy = oy - padTop;
|
||
if (cx < 0 || cx >= unpadW || cy < 0 || cy >= unpadH) {
|
||
rgbOut[outIdx] = 0;
|
||
rgbOut[outIdx + 1] = 0;
|
||
rgbOut[outIdx + 2] = 0;
|
||
return;
|
||
}
|
||
|
||
// Map content pixel to source NV12 coordinates
|
||
float sx = (static_cast<float>(cx) + 0.5f) * invScale - 0.5f;
|
||
float sy = (static_cast<float>(cy) + 0.5f) * invScale - 0.5f;
|
||
sx = fmaxf(sx, 0.f);
|
||
sy = fmaxf(sy, 0.f);
|
||
|
||
float Y, U, V;
|
||
nv12_read_yuv_bilinear(yPlane, yPitch, uvPlane, uvPitch,
|
||
sx, sy, srcW, srcH, Y, U, V);
|
||
|
||
uint8_t r, g, b;
|
||
nv12_yuv_to_rgb(Y, U, V, r, g, b);
|
||
|
||
rgbOut[outIdx] = r;
|
||
rgbOut[outIdx + 1] = g;
|
||
rgbOut[outIdx + 2] = b;
|
||
}
|
||
|
||
extern "C" void launchNV12ToRGBCenterLetterbox(
|
||
const uint8_t* yPlane, int yPitch,
|
||
const uint8_t* uvPlane, int uvPitch,
|
||
uint8_t* rgbOut, int outPitch,
|
||
int outW, int outH,
|
||
int srcW, int srcH,
|
||
int unpadW, int unpadH,
|
||
float invScale,
|
||
int padLeft, int padTop,
|
||
cudaStream_t stream)
|
||
{
|
||
dim3 block(32, 8);
|
||
dim3 grid((outW + block.x - 1) / block.x,
|
||
(outH + block.y - 1) / block.y);
|
||
nv12ToRgbCenterLetterboxKernel<<<grid, block, 0, stream>>>(
|
||
yPlane, yPitch, uvPlane, uvPitch,
|
||
rgbOut, outPitch, outW, outH,
|
||
srcW, srcH, unpadW, unpadH, invScale,
|
||
padLeft, padTop);
|
||
}
|
||
|
||
// ── NV12 affine warp → BGR kernel (for face alignment) ──────────────────
|
||
// Reads from full-resolution NV12 source, applies inverse affine transform,
|
||
// and outputs a small aligned face image (e.g. 112×112) in BGR order.
|
||
// The inverse affine maps each output pixel back to the NV12 source coords.
|
||
// Border handling: replicate edge (matches cv::BORDER_REPLICATE).
|
||
// Outputs BGR to match existing pipeline (ANSFaceRecognizer expects BGR).
|
||
|
||
__global__ void nv12AffineWarpToBgrKernel(
|
||
const uint8_t* __restrict__ yPlane, int yPitch,
|
||
const uint8_t* __restrict__ uvPlane, int uvPitch,
|
||
uint8_t* __restrict__ bgrOut, int outPitch,
|
||
int outW, int outH,
|
||
int srcW, int srcH,
|
||
float m00, float m01, float m02,
|
||
float m10, float m11, float m12)
|
||
{
|
||
const int ox = blockIdx.x * blockDim.x + threadIdx.x;
|
||
const int oy = blockIdx.y * blockDim.y + threadIdx.y;
|
||
if (ox >= outW || oy >= outH) return;
|
||
|
||
// Inverse affine: map output pixel to source NV12 coordinate
|
||
float sx = m00 * ox + m01 * oy + m02;
|
||
float sy = m10 * ox + m11 * oy + m12;
|
||
|
||
// Border replicate: clamp to source bounds
|
||
sx = fmaxf(0.f, fminf(sx, static_cast<float>(srcW - 1)));
|
||
sy = fmaxf(0.f, fminf(sy, static_cast<float>(srcH - 1)));
|
||
|
||
float Y, U, V;
|
||
nv12_read_yuv_bilinear(yPlane, yPitch, uvPlane, uvPitch,
|
||
sx, sy, srcW, srcH, Y, U, V);
|
||
|
||
uint8_t r, g, b;
|
||
nv12_yuv_to_rgb(Y, U, V, r, g, b);
|
||
|
||
// Output BGR order (matches OpenCV convention for downstream pipeline)
|
||
const int outIdx = oy * outPitch + ox * 3;
|
||
bgrOut[outIdx] = b;
|
||
bgrOut[outIdx + 1] = g;
|
||
bgrOut[outIdx + 2] = r;
|
||
}
|
||
|
||
extern "C" void launchNV12AffineWarpToBGR(
|
||
const uint8_t* yPlane, int yPitch,
|
||
const uint8_t* uvPlane, int uvPitch,
|
||
uint8_t* bgrOut, int outPitch,
|
||
int outW, int outH,
|
||
int srcW, int srcH,
|
||
float m00, float m01, float m02,
|
||
float m10, float m11, float m12,
|
||
cudaStream_t stream)
|
||
{
|
||
dim3 block(16, 8);
|
||
dim3 grid((outW + block.x - 1) / block.x,
|
||
(outH + block.y - 1) / block.y);
|
||
nv12AffineWarpToBgrKernel<<<grid, block, 0, stream>>>(
|
||
yPlane, yPitch, uvPlane, uvPitch,
|
||
bgrOut, outPitch, outW, outH,
|
||
srcW, srcH,
|
||
m00, m01, m02, m10, m11, m12);
|
||
}
|
||
|
||
// ── High-level NV12→BGR: CPU→GPU upload, kernel, GPU→CPU download ────────
|
||
// Thread-local GPU buffers avoid per-call cudaMalloc overhead.
|
||
// Exported for cross-DLL use by video_player via GetProcAddress.
|
||
|
||
struct GpuNV12Buffers {
|
||
uint8_t* d_yPlane = nullptr;
|
||
uint8_t* d_uvPlane = nullptr;
|
||
uint8_t* d_bgrOut = nullptr;
|
||
int allocWidth = 0;
|
||
int allocHeight = 0;
|
||
cudaStream_t stream = nullptr;
|
||
|
||
void ensureAllocated(int w, int h) {
|
||
if (w == allocWidth && h == allocHeight && d_yPlane) return;
|
||
free();
|
||
size_t ySize = (size_t)w * h;
|
||
size_t uvSize = (size_t)w * (h / 2);
|
||
size_t bgrSize = (size_t)w * h * 3;
|
||
cudaMalloc(&d_yPlane, ySize);
|
||
cudaMalloc(&d_uvPlane, uvSize);
|
||
cudaMalloc(&d_bgrOut, bgrSize);
|
||
cudaStreamCreate(&stream);
|
||
allocWidth = w;
|
||
allocHeight = h;
|
||
}
|
||
|
||
void free() {
|
||
if (d_yPlane) { cudaFree(d_yPlane); d_yPlane = nullptr; }
|
||
if (d_uvPlane) { cudaFree(d_uvPlane); d_uvPlane = nullptr; }
|
||
if (d_bgrOut) { cudaFree(d_bgrOut); d_bgrOut = nullptr; }
|
||
if (stream) { cudaStreamDestroy(stream); stream = nullptr; }
|
||
allocWidth = allocHeight = 0;
|
||
}
|
||
|
||
~GpuNV12Buffers() { free(); }
|
||
};
|
||
|
||
static thread_local GpuNV12Buffers t_bufs;
|
||
|
||
extern "C" __declspec(dllexport)
|
||
int ANSGpuNV12ToBGR(
|
||
const uint8_t* yPlane, int yLinesize,
|
||
const uint8_t* uvPlane, int uvLinesize,
|
||
uint8_t* bgrOut, int bgrStep,
|
||
int width, int height, int gpuIndex)
|
||
{
|
||
if (!yPlane || !uvPlane || !bgrOut || width <= 0 || height <= 0) return -1;
|
||
|
||
// Select GPU
|
||
if (gpuIndex >= 0) {
|
||
cudaError_t err = cudaSetDevice(gpuIndex);
|
||
if (err != cudaSuccess) {
|
||
fprintf(stderr, "ANSGpuNV12ToBGR: cudaSetDevice(%d) failed: %s\n",
|
||
gpuIndex, cudaGetErrorString(err));
|
||
return -2;
|
||
}
|
||
}
|
||
|
||
t_bufs.ensureAllocated(width, height);
|
||
if (!t_bufs.d_yPlane) return -3; // allocation failed
|
||
|
||
// Upload Y and UV planes to GPU
|
||
cudaMemcpy2DAsync(t_bufs.d_yPlane, width,
|
||
yPlane, yLinesize,
|
||
width, height,
|
||
cudaMemcpyHostToDevice, t_bufs.stream);
|
||
|
||
cudaMemcpy2DAsync(t_bufs.d_uvPlane, width,
|
||
uvPlane, uvLinesize,
|
||
width, height / 2,
|
||
cudaMemcpyHostToDevice, t_bufs.stream);
|
||
|
||
// Launch BGR kernel
|
||
dim3 block(32, 8);
|
||
dim3 grid((width + block.x - 1) / block.x,
|
||
(height + block.y - 1) / block.y);
|
||
nv12ToBgrKernel<<<grid, block, 0, t_bufs.stream>>>(
|
||
t_bufs.d_yPlane, width,
|
||
t_bufs.d_uvPlane, width,
|
||
t_bufs.d_bgrOut, width * 3,
|
||
width, height);
|
||
|
||
// Download BGR result back to CPU
|
||
cudaMemcpy2DAsync(bgrOut, bgrStep,
|
||
t_bufs.d_bgrOut, width * 3,
|
||
width * 3, height,
|
||
cudaMemcpyDeviceToHost, t_bufs.stream);
|
||
|
||
// Use polling sync instead of cudaStreamSynchronize to avoid
|
||
// holding nvcuda64 SRW lock continuously (WDDM deadlock prevention).
|
||
// Short Sleep(0) fast path for sub-ms kernels, then Sleep(1) to give
|
||
// cleanup operations (cuArrayDestroy, cuMemFree) a window to acquire
|
||
// the exclusive SRW lock.
|
||
{
|
||
cudaError_t qerr = cudaStreamQuery(t_bufs.stream);
|
||
if (qerr == cudaErrorNotReady) {
|
||
for (int i = 0; i < 10 && qerr == cudaErrorNotReady; ++i) {
|
||
Sleep(0);
|
||
qerr = cudaStreamQuery(t_bufs.stream);
|
||
}
|
||
while (qerr == cudaErrorNotReady) {
|
||
Sleep(1);
|
||
qerr = cudaStreamQuery(t_bufs.stream);
|
||
}
|
||
}
|
||
}
|
||
|
||
// Check for errors
|
||
cudaError_t err = cudaGetLastError();
|
||
if (err != cudaSuccess) {
|
||
fprintf(stderr, "ANSGpuNV12ToBGR: CUDA error: %s\n", cudaGetErrorString(err));
|
||
return -4;
|
||
}
|
||
|
||
return 0; // success
|
||
}
|
||
|
||
// ============================================================================
|
||
// NV12 rectangular crop → BGR kernel
|
||
// Reads a rectangular region from the full NV12 frame and outputs BGR.
|
||
// No resize — output is the same size as the crop region.
|
||
// ============================================================================
|
||
__global__ void nv12CropToBGRKernel(
|
||
const uint8_t* __restrict__ yPlane, int yPitch,
|
||
const uint8_t* __restrict__ uvPlane, int uvPitch,
|
||
uint8_t* __restrict__ bgrOut, int outPitch,
|
||
int outW, int outH,
|
||
int cropX, int cropY,
|
||
int srcW, int srcH)
|
||
{
|
||
const int ox = blockIdx.x * blockDim.x + threadIdx.x;
|
||
const int oy = blockIdx.y * blockDim.y + threadIdx.y;
|
||
if (ox >= outW || oy >= outH) return;
|
||
|
||
// Map output pixel to source coordinate
|
||
const int sx = cropX + ox;
|
||
const int sy = cropY + oy;
|
||
|
||
// Clamp to source bounds
|
||
const int cx = min(max(sx, 0), srcW - 1);
|
||
const int cy = min(max(sy, 0), srcH - 1);
|
||
|
||
// Read Y
|
||
const float Y = static_cast<float>(yPlane[cy * yPitch + cx]);
|
||
|
||
// Bilinear UV (chroma subsampled 2:1 in both dimensions, interleaved NV12)
|
||
// Chroma sample centers are at (0.5, 0.5) offset in luma coords, so map
|
||
// luma (cx,cy) → chroma continuous coords, then bilinear-interpolate.
|
||
const float csx = static_cast<float>(cx) * 0.5f;
|
||
const float csy = static_cast<float>(cy) * 0.5f;
|
||
const int cx0 = static_cast<int>(csx);
|
||
const int cy0 = static_cast<int>(csy);
|
||
const int cx1 = min(cx0 + 1, (srcW >> 1) - 1);
|
||
const int cy1 = min(cy0 + 1, (srcH >> 1) - 1);
|
||
const float cfx = csx - static_cast<float>(cx0);
|
||
const float cfy = csy - static_cast<float>(cy0);
|
||
|
||
const float u00 = static_cast<float>(uvPlane[cy0 * uvPitch + cx0 * 2]);
|
||
const float u01 = static_cast<float>(uvPlane[cy0 * uvPitch + cx1 * 2]);
|
||
const float u10 = static_cast<float>(uvPlane[cy1 * uvPitch + cx0 * 2]);
|
||
const float u11 = static_cast<float>(uvPlane[cy1 * uvPitch + cx1 * 2]);
|
||
const float U = (u00 * (1.f - cfx) * (1.f - cfy) + u01 * cfx * (1.f - cfy)
|
||
+ u10 * (1.f - cfx) * cfy + u11 * cfx * cfy) - 128.f;
|
||
|
||
const float v00 = static_cast<float>(uvPlane[cy0 * uvPitch + cx0 * 2 + 1]);
|
||
const float v01 = static_cast<float>(uvPlane[cy0 * uvPitch + cx1 * 2 + 1]);
|
||
const float v10 = static_cast<float>(uvPlane[cy1 * uvPitch + cx0 * 2 + 1]);
|
||
const float v11 = static_cast<float>(uvPlane[cy1 * uvPitch + cx1 * 2 + 1]);
|
||
const float V = (v00 * (1.f - cfx) * (1.f - cfy) + v01 * cfx * (1.f - cfy)
|
||
+ v10 * (1.f - cfx) * cfy + v11 * cfx * cfy) - 128.f;
|
||
|
||
// BT.601 YUV → RGB
|
||
uint8_t r, g, b;
|
||
nv12_yuv_to_rgb(Y, U, V, r, g, b);
|
||
|
||
// Write BGR (OpenCV channel order)
|
||
const int outIdx = oy * outPitch + ox * 3;
|
||
bgrOut[outIdx] = b;
|
||
bgrOut[outIdx + 1] = g;
|
||
bgrOut[outIdx + 2] = r;
|
||
}
|
||
|
||
extern "C" void launchNV12CropToBGR(
|
||
const uint8_t* yPlane, int yPitch,
|
||
const uint8_t* uvPlane, int uvPitch,
|
||
uint8_t* bgrOut, int outPitch,
|
||
int outW, int outH,
|
||
int cropX, int cropY,
|
||
int srcW, int srcH,
|
||
cudaStream_t stream)
|
||
{
|
||
dim3 block(32, 8);
|
||
dim3 grid((outW + block.x - 1) / block.x,
|
||
(outH + block.y - 1) / block.y);
|
||
nv12CropToBGRKernel<<<grid, block, 0, stream>>>(
|
||
yPlane, yPitch, uvPlane, uvPitch,
|
||
bgrOut, outPitch,
|
||
outW, outH,
|
||
cropX, cropY,
|
||
srcW, srcH);
|
||
}
|