// ANSCORE/modules/ANSODEngine/nv12_to_rgb.cu
// nv12_to_rgb.cu — CUDA kernels for NV12→RGB and NV12→BGR conversion.
//
// Two use cases:
// 1. launchNV12ToRGB() — inference fast path (ANSRTYOLO uploads NV12 directly)
// 2. ANSGpuNV12ToBGR() — video_player display path (replaces slow CPU cvtColorTwoPlane)
//
// cv::cuda::cvtColor does NOT support NV12, so we use custom kernels.
#include <cuda_runtime.h>
#include <cstdint>
#ifdef _WIN32
#include <windows.h> // Sleep()
#endif
#include <cstdio>
// ── Shared YUV→RGB computation ───────────────────────────────────────────
__device__ __forceinline__ void nv12_yuv_to_rgb(
    float Y, float U, float V,
    uint8_t& r, uint8_t& g, uint8_t& b)
{
    // BT.601 conversion (same matrix as FFmpeg swscale / OpenCV
    // cvtColorTwoPlane). No luma range expansion is applied: Y is used
    // as-is and U/V are expected pre-centred around zero by the caller.
    const float fr = Y + 1.402f * V;
    const float fg = Y - 0.344136f * U - 0.714136f * V;
    const float fb = Y + 1.772f * U;
    // Clamp to [0, 255] before the narrowing cast (cast truncates).
    r = static_cast<uint8_t>(fminf(fmaxf(fr, 0.f), 255.f));
    g = static_cast<uint8_t>(fminf(fmaxf(fg, 0.f), 255.f));
    b = static_cast<uint8_t>(fminf(fmaxf(fb, 0.f), 255.f));
}
// Fetch the Y/U/V triple for luma pixel (x, y) with nearest-neighbour
// chroma (no interpolation). U and V come back centred around zero.
__device__ __forceinline__ void nv12_read_yuv(
    const uint8_t* __restrict__ yPlane, int yPitch,
    const uint8_t* __restrict__ uvPlane, int uvPitch,
    int x, int y,
    float& Y, float& U, float& V)
{
    // Luma: one byte per pixel.
    Y = static_cast<float>(yPlane[y * yPitch + x]);
    // Chroma: NV12 interleaves U/V pairs at half resolution. Halve y for
    // the chroma row and clear the low bit of x to land on the U byte.
    const int uvRow = y >> 1;
    const int uvCol = x & ~1;
    const uint8_t* uv = uvPlane + uvRow * uvPitch + uvCol;
    U = static_cast<float>(uv[0]) - 128.f;
    V = static_cast<float>(uv[1]) - 128.f;
}
// ── NV12→RGB kernel (for inference fast path) ────────────────────────────
// One thread per output pixel; launch with a 2D grid covering width×height.
__global__ void nv12ToRgbKernel(
    const uint8_t* __restrict__ yPlane, int yPitch,
    const uint8_t* __restrict__ uvPlane, int uvPitch,
    uint8_t* __restrict__ rgbOut, int outPitch,
    int width, int height)
{
    const int px = blockIdx.x * blockDim.x + threadIdx.x;
    const int py = blockIdx.y * blockDim.y + threadIdx.y;
    if (px < width && py < height) {
        float Y, U, V;
        nv12_read_yuv(yPlane, yPitch, uvPlane, uvPitch, px, py, Y, U, V);
        uint8_t r, g, b;
        nv12_yuv_to_rgb(Y, U, V, r, g, b);
        uint8_t* dst = rgbOut + py * outPitch + px * 3;
        dst[0] = r;
        dst[1] = g;
        dst[2] = b;
    }
}
// ── NV12→BGR kernel (for LabVIEW display — matches OpenCV channel order) ─
// One thread per output pixel; identical to nv12ToRgbKernel apart from the
// swapped channel order expected by OpenCV consumers.
__global__ void nv12ToBgrKernel(
    const uint8_t* __restrict__ yPlane, int yPitch,
    const uint8_t* __restrict__ uvPlane, int uvPitch,
    uint8_t* __restrict__ bgrOut, int outPitch,
    int width, int height)
{
    const int px = blockIdx.x * blockDim.x + threadIdx.x;
    const int py = blockIdx.y * blockDim.y + threadIdx.y;
    if (px < width && py < height) {
        float Y, U, V;
        nv12_read_yuv(yPlane, yPitch, uvPlane, uvPitch, px, py, Y, U, V);
        uint8_t r, g, b;
        nv12_yuv_to_rgb(Y, U, V, r, g, b);
        uint8_t* dst = bgrOut + py * outPitch + px * 3;
        dst[0] = b; // BGR order for OpenCV
        dst[1] = g;
        dst[2] = r;
    }
}
// ── Fused NV12→RGB + letterbox resize kernel ────────────────────────────
// Runs at OUTPUT resolution (e.g. 640×640), sampling from 4K NV12 source
// with bilinear interpolation. This avoids the 24MB 4K RGB intermediate
// and processes 20× fewer pixels than separate convert + resize.
// Fetch bilinearly interpolated Y/U/V from an NV12 frame at the continuous
// source position (sx, sy), expressed in luma pixel coordinates.
//
// Preconditions: callers clamp sx/sy to >= 0 before calling; the upper
// bound stays in range because every caller maps from an output pixel
// centre, so the truncated coordinates never exceed srcW-1 / srcH-1.
// U and V are returned re-centred around zero (sample − 128).
__device__ __forceinline__ void nv12_read_yuv_bilinear(
    const uint8_t* __restrict__ yPlane, int yPitch,
    const uint8_t* __restrict__ uvPlane, int uvPitch,
    float sx, float sy, int srcW, int srcH,
    float& Y, float& U, float& V)
{
    // Bilinear interpolation for Y plane
    int x0 = static_cast<int>(sx);   // truncation == floor for sx >= 0
    int y0 = static_cast<int>(sy);
    int x1 = min(x0 + 1, srcW - 1);  // clamp the right/bottom neighbour
    int y1 = min(y0 + 1, srcH - 1);
    x0 = max(x0, 0);
    y0 = max(y0, 0);
    float fx = sx - static_cast<float>(x0);  // fractional weights in [0, 1)
    float fy = sy - static_cast<float>(y0);
    // Bilinear Y
    float y00 = static_cast<float>(yPlane[y0 * yPitch + x0]);
    float y01 = static_cast<float>(yPlane[y0 * yPitch + x1]);
    float y10 = static_cast<float>(yPlane[y1 * yPitch + x0]);
    float y11 = static_cast<float>(yPlane[y1 * yPitch + x1]);
    Y = y00 * (1.f - fx) * (1.f - fy) + y01 * fx * (1.f - fy)
        + y10 * (1.f - fx) * fy + y11 * fx * fy;
    // UV: NV12 has interleaved U/V at half resolution
    // Sample at corresponding chroma position
    // NOTE(review): chroma position is approximated as luma/2 without the
    // 0.25 co-siting offset — assumed visually negligible; confirm if exact
    // chroma siting ever matters.
    float csx = sx * 0.5f;
    float csy = sy * 0.5f;
    int cx0 = static_cast<int>(csx);
    int cy0 = static_cast<int>(csy);
    int cx1 = min(cx0 + 1, srcW / 2 - 1);  // chroma plane is srcW/2 × srcH/2
    int cy1 = min(cy0 + 1, srcH / 2 - 1);
    cx0 = max(cx0, 0);
    cy0 = max(cy0, 0);
    float cfx = csx - static_cast<float>(cx0);
    float cfy = csy - static_cast<float>(cy0);
    // Bilinear U (U bytes sit at even offsets within the interleaved row)
    float u00 = static_cast<float>(uvPlane[cy0 * uvPitch + cx0 * 2]);
    float u01 = static_cast<float>(uvPlane[cy0 * uvPitch + cx1 * 2]);
    float u10 = static_cast<float>(uvPlane[cy1 * uvPitch + cx0 * 2]);
    float u11 = static_cast<float>(uvPlane[cy1 * uvPitch + cx1 * 2]);
    U = (u00 * (1.f - cfx) * (1.f - cfy) + u01 * cfx * (1.f - cfy)
        + u10 * (1.f - cfx) * cfy + u11 * cfx * cfy) - 128.f;
    // Bilinear V (V bytes sit at odd offsets)
    float v00 = static_cast<float>(uvPlane[cy0 * uvPitch + cx0 * 2 + 1]);
    float v01 = static_cast<float>(uvPlane[cy0 * uvPitch + cx1 * 2 + 1]);
    float v10 = static_cast<float>(uvPlane[cy1 * uvPitch + cx0 * 2 + 1]);
    float v11 = static_cast<float>(uvPlane[cy1 * uvPitch + cx1 * 2 + 1]);
    V = (v00 * (1.f - cfx) * (1.f - cfy) + v01 * cfx * (1.f - cfy)
        + v10 * (1.f - cfx) * cfy + v11 * cfx * cfy) - 128.f;
}
// One thread per output pixel at model resolution. Pixels outside the
// letterboxed content area are written as black; content pixels are
// bilinearly sampled from the NV12 source.
__global__ void nv12ToRgbLetterboxKernel(
    const uint8_t* __restrict__ yPlane, int yPitch,
    const uint8_t* __restrict__ uvPlane, int uvPitch,
    uint8_t* __restrict__ rgbOut, int outPitch,
    int outW, int outH,     // model input size (e.g. 640×640)
    int srcW, int srcH,     // NV12 source size (e.g. 3840×2160)
    int unpadW, int unpadH, // letterboxed content size within output
    float invScale)         // 1/scale = srcDim/unpadDim
{
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    if (col >= outW || row >= outH) return;
    uint8_t* dst = rgbOut + row * outPitch + col * 3;
    if (col >= unpadW || row >= unpadH) {
        // Letterbox padding (content sits at the top-left origin).
        dst[0] = dst[1] = dst[2] = 0;
        return;
    }
    // Map the output pixel centre back to source NV12 coordinates.
    float sx = (static_cast<float>(col) + 0.5f) * invScale - 0.5f;
    float sy = (static_cast<float>(row) + 0.5f) * invScale - 0.5f;
    sx = fmaxf(sx, 0.f);
    sy = fmaxf(sy, 0.f);
    float Y, U, V;
    nv12_read_yuv_bilinear(yPlane, yPitch, uvPlane, uvPitch,
                           sx, sy, srcW, srcH, Y, U, V);
    uint8_t r, g, b;
    nv12_yuv_to_rgb(Y, U, V, r, g, b);
    dst[0] = r;
    dst[1] = g;
    dst[2] = b;
}
// ── Host-callable wrappers ───────────────────────────────────────────────
// Host wrapper: launches the fused NV12→RGB letterbox kernel on `stream`.
// Asynchronous — the caller owns synchronization.
extern "C" void launchNV12ToRGBLetterbox(
    const uint8_t* yPlane, int yPitch,
    const uint8_t* uvPlane, int uvPitch,
    uint8_t* rgbOut, int outPitch,
    int outW, int outH,
    int srcW, int srcH,
    int unpadW, int unpadH,
    float invScale,
    cudaStream_t stream)
{
    constexpr unsigned kBx = 32, kBy = 8; // 256 threads per block
    const dim3 block(kBx, kBy);
    const dim3 grid((outW + kBx - 1) / kBx, (outH + kBy - 1) / kBy);
    nv12ToRgbLetterboxKernel<<<grid, block, 0, stream>>>(
        yPlane, yPitch, uvPlane, uvPitch,
        rgbOut, outPitch, outW, outH,
        srcW, srcH, unpadW, unpadH, invScale);
}
// Host wrapper: full-resolution NV12→RGB conversion on `stream` (async).
extern "C" void launchNV12ToRGB(
    const uint8_t* yPlane, int yPitch,
    const uint8_t* uvPlane, int uvPitch,
    uint8_t* rgbOut, int outPitch,
    int width, int height,
    cudaStream_t stream)
{
    constexpr unsigned kBx = 32, kBy = 8; // 256 threads per block
    const dim3 block(kBx, kBy);
    const dim3 grid((width + kBx - 1) / kBx, (height + kBy - 1) / kBy);
    nv12ToRgbKernel<<<grid, block, 0, stream>>>(
        yPlane, yPitch, uvPlane, uvPitch,
        rgbOut, outPitch, width, height);
}
// ── Fused NV12→RGB + direct resize kernel (for classification) ───────────
// Like letterbox kernel but stretches to fill the entire output (no padding).
// Used for classification models that expect direct resize, not aspect-
// preserving letterbox.
// One thread per output pixel; stretches the source to fill the whole
// output (no aspect-ratio preservation, no padding).
__global__ void nv12ToRgbResizeKernel(
    const uint8_t* __restrict__ yPlane, int yPitch,
    const uint8_t* __restrict__ uvPlane, int uvPitch,
    uint8_t* __restrict__ rgbOut, int outPitch,
    int outW, int outH, // model input size (e.g. 224×224)
    int srcW, int srcH) // NV12 source size (e.g. 2688×1520)
{
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    if (col >= outW || row >= outH) return;
    // Map the output pixel centre to source NV12 coordinates (stretch).
    float sx = (static_cast<float>(col) + 0.5f) * static_cast<float>(srcW) / static_cast<float>(outW) - 0.5f;
    float sy = (static_cast<float>(row) + 0.5f) * static_cast<float>(srcH) / static_cast<float>(outH) - 0.5f;
    sx = fmaxf(sx, 0.f);
    sy = fmaxf(sy, 0.f);
    float Y, U, V;
    nv12_read_yuv_bilinear(yPlane, yPitch, uvPlane, uvPitch,
                           sx, sy, srcW, srcH, Y, U, V);
    uint8_t r, g, b;
    nv12_yuv_to_rgb(Y, U, V, r, g, b);
    uint8_t* dst = rgbOut + row * outPitch + col * 3;
    dst[0] = r;
    dst[1] = g;
    dst[2] = b;
}
// Host wrapper: fused NV12→RGB + direct (stretch) resize on `stream`.
extern "C" void launchNV12ToRGBResize(
    const uint8_t* yPlane, int yPitch,
    const uint8_t* uvPlane, int uvPitch,
    uint8_t* rgbOut, int outPitch,
    int outW, int outH,
    int srcW, int srcH,
    cudaStream_t stream)
{
    constexpr unsigned kBx = 32, kBy = 8; // 256 threads per block
    const dim3 block(kBx, kBy);
    const dim3 grid((outW + kBx - 1) / kBx, (outH + kBy - 1) / kBy);
    nv12ToRgbResizeKernel<<<grid, block, 0, stream>>>(
        yPlane, yPitch, uvPlane, uvPitch,
        rgbOut, outPitch, outW, outH,
        srcW, srcH);
}
// ── Fused NV12→RGB + resize + CHW planar output (for SAM3 image encoder) ─
// Writes R, G, B to separate contiguous planes: [R plane][G plane][B plane]
// at model input resolution. Eliminates cv::resize + cvtColor + split + memcpy.
// One thread per output pixel; writes planar (CHW) uint8 output:
// R plane, then G plane, then B plane, each outW×outH contiguous bytes.
__global__ void nv12ToRgbResizeCHW_uint8Kernel(
    const uint8_t* __restrict__ yPlane, int yPitch,
    const uint8_t* __restrict__ uvPlane, int uvPitch,
    uint8_t* __restrict__ chwOut, // contiguous CHW: R plane, G plane, B plane
    int outW, int outH,
    int srcW, int srcH)
{
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    if (col >= outW || row >= outH) return;
    // Map the output pixel centre to source coordinates (stretch-to-fill).
    float sx = (static_cast<float>(col) + 0.5f) * static_cast<float>(srcW) / static_cast<float>(outW) - 0.5f;
    float sy = (static_cast<float>(row) + 0.5f) * static_cast<float>(srcH) / static_cast<float>(outH) - 0.5f;
    sx = fmaxf(sx, 0.f);
    sy = fmaxf(sy, 0.f);
    float Y, U, V;
    nv12_read_yuv_bilinear(yPlane, yPitch, uvPlane, uvPitch,
                           sx, sy, srcW, srcH, Y, U, V);
    uint8_t r, g, b;
    nv12_yuv_to_rgb(Y, U, V, r, g, b);
    const int planeSize = outW * outH;
    const int pix = row * outW + col;
    chwOut[pix] = r;
    chwOut[planeSize + pix] = g;
    chwOut[2 * planeSize + pix] = b;
}
// Float32 variant of the CHW kernel: same sampling, but each channel value
// is widened to float in the [0, 255] range (no normalization applied).
__global__ void nv12ToRgbResizeCHW_floatKernel(
    const uint8_t* __restrict__ yPlane, int yPitch,
    const uint8_t* __restrict__ uvPlane, int uvPitch,
    float* __restrict__ chwOut, // contiguous CHW float32 [0, 255]
    int outW, int outH,
    int srcW, int srcH)
{
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    if (col >= outW || row >= outH) return;
    // Map the output pixel centre to source coordinates (stretch-to-fill).
    float sx = (static_cast<float>(col) + 0.5f) * static_cast<float>(srcW) / static_cast<float>(outW) - 0.5f;
    float sy = (static_cast<float>(row) + 0.5f) * static_cast<float>(srcH) / static_cast<float>(outH) - 0.5f;
    sx = fmaxf(sx, 0.f);
    sy = fmaxf(sy, 0.f);
    float Y, U, V;
    nv12_read_yuv_bilinear(yPlane, yPitch, uvPlane, uvPitch,
                           sx, sy, srcW, srcH, Y, U, V);
    uint8_t r, g, b;
    nv12_yuv_to_rgb(Y, U, V, r, g, b);
    const int planeSize = outW * outH;
    const int pix = row * outW + col;
    chwOut[pix] = static_cast<float>(r);
    chwOut[planeSize + pix] = static_cast<float>(g);
    chwOut[2 * planeSize + pix] = static_cast<float>(b);
}
// Host wrapper: fused NV12→RGB resize with planar uint8 CHW output.
extern "C" void launchNV12ToRGBResizeCHW_uint8(
    const uint8_t* yPlane, int yPitch,
    const uint8_t* uvPlane, int uvPitch,
    uint8_t* chwOut,
    int outW, int outH,
    int srcW, int srcH,
    cudaStream_t stream)
{
    constexpr unsigned kBx = 32, kBy = 8; // 256 threads per block
    const dim3 block(kBx, kBy);
    const dim3 grid((outW + kBx - 1) / kBx, (outH + kBy - 1) / kBy);
    nv12ToRgbResizeCHW_uint8Kernel<<<grid, block, 0, stream>>>(
        yPlane, yPitch, uvPlane, uvPitch,
        chwOut, outW, outH, srcW, srcH);
}
// Host wrapper: fused NV12→RGB resize with planar float32 CHW output.
extern "C" void launchNV12ToRGBResizeCHW_float(
    const uint8_t* yPlane, int yPitch,
    const uint8_t* uvPlane, int uvPitch,
    float* chwOut,
    int outW, int outH,
    int srcW, int srcH,
    cudaStream_t stream)
{
    constexpr unsigned kBx = 32, kBy = 8; // 256 threads per block
    const dim3 block(kBx, kBy);
    const dim3 grid((outW + kBx - 1) / kBx, (outH + kBy - 1) / kBy);
    nv12ToRgbResizeCHW_floatKernel<<<grid, block, 0, stream>>>(
        yPlane, yPitch, uvPlane, uvPitch,
        chwOut, outW, outH, srcW, srcH);
}
// Host wrapper: full-resolution NV12→BGR conversion on `stream` (async).
extern "C" void launchNV12ToBGR(
    const uint8_t* yPlane, int yPitch,
    const uint8_t* uvPlane, int uvPitch,
    uint8_t* bgrOut, int outPitch,
    int width, int height,
    cudaStream_t stream)
{
    constexpr unsigned kBx = 32, kBy = 8; // 256 threads per block
    const dim3 block(kBx, kBy);
    const dim3 grid((width + kBx - 1) / kBx, (height + kBy - 1) / kBy);
    nv12ToBgrKernel<<<grid, block, 0, stream>>>(
        yPlane, yPitch, uvPlane, uvPitch,
        bgrOut, outPitch, width, height);
}
// ── Fused NV12→BGR + bilinear resize kernel (for OCR detection) ──────────
// Same algorithm as nv12ToRgbResizeKernel but outputs BGR channel order.
// One thread per output pixel; same sampling as nv12ToRgbResizeKernel but
// the three channels are stored in BGR order.
__global__ void nv12ToBgrResizeKernel(
    const uint8_t* __restrict__ yPlane, int yPitch,
    const uint8_t* __restrict__ uvPlane, int uvPitch,
    uint8_t* __restrict__ bgrOut, int outPitch,
    int outW, int outH,
    int srcW, int srcH)
{
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    if (col >= outW || row >= outH) return;
    // Map the output pixel centre to source coordinates (stretch-to-fill).
    float sx = (static_cast<float>(col) + 0.5f) * static_cast<float>(srcW) / static_cast<float>(outW) - 0.5f;
    float sy = (static_cast<float>(row) + 0.5f) * static_cast<float>(srcH) / static_cast<float>(outH) - 0.5f;
    sx = fmaxf(sx, 0.f);
    sy = fmaxf(sy, 0.f);
    float Y, U, V;
    nv12_read_yuv_bilinear(yPlane, yPitch, uvPlane, uvPitch,
                           sx, sy, srcW, srcH, Y, U, V);
    uint8_t r, g, b;
    nv12_yuv_to_rgb(Y, U, V, r, g, b);
    uint8_t* dst = bgrOut + row * outPitch + col * 3;
    dst[0] = b;
    dst[1] = g;
    dst[2] = r;
}
// Host wrapper: fused NV12→BGR + direct (stretch) resize on `stream`.
extern "C" void launchNV12ToBGRResize(
    const uint8_t* yPlane, int yPitch,
    const uint8_t* uvPlane, int uvPitch,
    uint8_t* bgrOut, int outPitch,
    int outW, int outH,
    int srcW, int srcH,
    cudaStream_t stream)
{
    constexpr unsigned kBx = 32, kBy = 8; // 256 threads per block
    const dim3 block(kBx, kBy);
    const dim3 grid((outW + kBx - 1) / kBx, (outH + kBy - 1) / kBy);
    nv12ToBgrResizeKernel<<<grid, block, 0, stream>>>(
        yPlane, yPitch, uvPlane, uvPitch,
        bgrOut, outPitch, outW, outH, srcW, srcH);
}
// ── Center-padded NV12→RGB + letterbox resize kernel (for SCRFD) ─────────
// Same as nv12ToRgbLetterboxKernel but content is centered with (padLeft, padTop)
// offset instead of placed at top-left origin. Used by SCRFD face detector
// which uses center-padding (dw = pad_w/2, dh = pad_h/2).
// One thread per output pixel. Like nv12ToRgbLetterboxKernel, but the
// content rectangle is offset by (padLeft, padTop) so it sits centred
// rather than at the top-left origin.
__global__ void nv12ToRgbCenterLetterboxKernel(
    const uint8_t* __restrict__ yPlane, int yPitch,
    const uint8_t* __restrict__ uvPlane, int uvPitch,
    uint8_t* __restrict__ rgbOut, int outPitch,
    int outW, int outH,
    int srcW, int srcH,
    int unpadW, int unpadH,
    float invScale,
    int padLeft, int padTop)
{
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    if (col >= outW || row >= outH) return;
    uint8_t* dst = rgbOut + row * outPitch + col * 3;
    // Shift into content-local coordinates; anything outside is padding.
    const int cx = col - padLeft;
    const int cy = row - padTop;
    if (cx < 0 || cx >= unpadW || cy < 0 || cy >= unpadH) {
        dst[0] = dst[1] = dst[2] = 0; // black padding border
        return;
    }
    // Map the content pixel centre back to source NV12 coordinates.
    float sx = (static_cast<float>(cx) + 0.5f) * invScale - 0.5f;
    float sy = (static_cast<float>(cy) + 0.5f) * invScale - 0.5f;
    sx = fmaxf(sx, 0.f);
    sy = fmaxf(sy, 0.f);
    float Y, U, V;
    nv12_read_yuv_bilinear(yPlane, yPitch, uvPlane, uvPitch,
                           sx, sy, srcW, srcH, Y, U, V);
    uint8_t r, g, b;
    nv12_yuv_to_rgb(Y, U, V, r, g, b);
    dst[0] = r;
    dst[1] = g;
    dst[2] = b;
}
// Host wrapper: centre-padded NV12→RGB letterbox resize on `stream`.
extern "C" void launchNV12ToRGBCenterLetterbox(
    const uint8_t* yPlane, int yPitch,
    const uint8_t* uvPlane, int uvPitch,
    uint8_t* rgbOut, int outPitch,
    int outW, int outH,
    int srcW, int srcH,
    int unpadW, int unpadH,
    float invScale,
    int padLeft, int padTop,
    cudaStream_t stream)
{
    constexpr unsigned kBx = 32, kBy = 8; // 256 threads per block
    const dim3 block(kBx, kBy);
    const dim3 grid((outW + kBx - 1) / kBx, (outH + kBy - 1) / kBy);
    nv12ToRgbCenterLetterboxKernel<<<grid, block, 0, stream>>>(
        yPlane, yPitch, uvPlane, uvPitch,
        rgbOut, outPitch, outW, outH,
        srcW, srcH, unpadW, unpadH, invScale,
        padLeft, padTop);
}
// ── NV12 affine warp → BGR kernel (for face alignment) ──────────────────
// Reads from full-resolution NV12 source, applies inverse affine transform,
// and outputs a small aligned face image (e.g. 112×112) in BGR order.
// The inverse affine maps each output pixel back to the NV12 source coords.
// Border handling: replicate edge (matches cv::BORDER_REPLICATE).
// Outputs BGR to match existing pipeline (ANSFaceRecognizer expects BGR).
// One thread per output pixel. The 2×3 matrix (m00..m12) is the INVERSE
// affine transform: it maps each output pixel back into NV12 source
// coordinates. Out-of-range samples are edge-replicated.
__global__ void nv12AffineWarpToBgrKernel(
    const uint8_t* __restrict__ yPlane, int yPitch,
    const uint8_t* __restrict__ uvPlane, int uvPitch,
    uint8_t* __restrict__ bgrOut, int outPitch,
    int outW, int outH,
    int srcW, int srcH,
    float m00, float m01, float m02,
    float m10, float m11, float m12)
{
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    if (col >= outW || row >= outH) return;
    // Inverse affine: output pixel → source NV12 coordinate.
    float sx = m00 * col + m01 * row + m02;
    float sy = m10 * col + m11 * row + m12;
    // BORDER_REPLICATE: clamp the sample position to the source rectangle.
    sx = fmaxf(0.f, fminf(sx, static_cast<float>(srcW - 1)));
    sy = fmaxf(0.f, fminf(sy, static_cast<float>(srcH - 1)));
    float Y, U, V;
    nv12_read_yuv_bilinear(yPlane, yPitch, uvPlane, uvPitch,
                           sx, sy, srcW, srcH, Y, U, V);
    uint8_t r, g, b;
    nv12_yuv_to_rgb(Y, U, V, r, g, b);
    // BGR order to match the OpenCV-based downstream pipeline.
    uint8_t* dst = bgrOut + row * outPitch + col * 3;
    dst[0] = b;
    dst[1] = g;
    dst[2] = r;
}
// Host wrapper: affine-warped NV12→BGR crop (e.g. 112×112 aligned face).
// Uses a smaller 16×8 block since outputs are tiny.
extern "C" void launchNV12AffineWarpToBGR(
    const uint8_t* yPlane, int yPitch,
    const uint8_t* uvPlane, int uvPitch,
    uint8_t* bgrOut, int outPitch,
    int outW, int outH,
    int srcW, int srcH,
    float m00, float m01, float m02,
    float m10, float m11, float m12,
    cudaStream_t stream)
{
    constexpr unsigned kBx = 16, kBy = 8; // 128 threads per block
    const dim3 block(kBx, kBy);
    const dim3 grid((outW + kBx - 1) / kBx, (outH + kBy - 1) / kBy);
    nv12AffineWarpToBgrKernel<<<grid, block, 0, stream>>>(
        yPlane, yPitch, uvPlane, uvPitch,
        bgrOut, outPitch, outW, outH,
        srcW, srcH,
        m00, m01, m02, m10, m11, m12);
}
// ── High-level NV12→BGR: CPU→GPU upload, kernel, GPU→CPU download ────────
// Thread-local GPU buffers avoid per-call cudaMalloc overhead.
// Exported for cross-DLL use by video_player via GetProcAddress.
// Per-thread scratch buffers + stream for ANSGpuNV12ToBGR. Sized for one
// width×height NV12 frame and reused until the frame size changes, which
// avoids per-call cudaMalloc/cudaFree overhead.
struct GpuNV12Buffers {
    uint8_t* d_yPlane = nullptr;   // device Y plane,  width × height bytes
    uint8_t* d_uvPlane = nullptr;  // device UV plane, width × height/2 bytes
    uint8_t* d_bgrOut = nullptr;   // device BGR out,  width × height × 3 bytes
    int allocWidth = 0;
    int allocHeight = 0;
    cudaStream_t stream = nullptr;
    // (Re)allocate device buffers and stream for a w×h frame. No-op when
    // already sized for w×h. On any CUDA failure all partially acquired
    // resources are released and every pointer is left nullptr, so callers
    // detect failure by checking the pointers (allocWidth stays 0, which
    // also makes a later retry re-attempt the allocation).
    void ensureAllocated(int w, int h) {
        if (w == allocWidth && h == allocHeight && d_yPlane) return;
        free();
        const size_t ySize = (size_t)w * h;
        const size_t uvSize = (size_t)w * (h / 2); // interleaved UV, half height
        const size_t bgrSize = (size_t)w * h * 3;
        // Original code ignored these return values; a partial failure left
        // some pointers null while d_yPlane (the only one the caller checks)
        // could still be valid. Check each call and roll back on failure.
        if (cudaMalloc(&d_yPlane, ySize) != cudaSuccess ||
            cudaMalloc(&d_uvPlane, uvSize) != cudaSuccess ||
            cudaMalloc(&d_bgrOut, bgrSize) != cudaSuccess ||
            cudaStreamCreate(&stream) != cudaSuccess) {
            free(); // release whatever was acquired before the failure
            return;
        }
        allocWidth = w;
        allocHeight = h;
    }
    // Release all device resources; safe to call repeatedly.
    void free() {
        if (d_yPlane) { cudaFree(d_yPlane); d_yPlane = nullptr; }
        if (d_uvPlane) { cudaFree(d_uvPlane); d_uvPlane = nullptr; }
        if (d_bgrOut) { cudaFree(d_bgrOut); d_bgrOut = nullptr; }
        if (stream) { cudaStreamDestroy(stream); stream = nullptr; }
        allocWidth = allocHeight = 0;
    }
    // NOTE(review): runs at thread exit via the thread_local below; assumes
    // the CUDA runtime is still alive at that point — confirm for threads
    // that outlive runtime teardown.
    ~GpuNV12Buffers() { free(); }
};
// One buffer set per calling thread (video_player may convert from
// multiple decoder threads concurrently).
static thread_local GpuNV12Buffers t_bufs;
// Convert one NV12 frame (host memory) to BGR (host memory) on the GPU.
// Uploads Y/UV, runs nv12ToBgrKernel, downloads the BGR result, and waits
// for completion before returning. Exported for cross-DLL use by
// video_player via GetProcAddress.
//
// Returns 0 on success, -1 bad arguments, -2 cudaSetDevice failure,
// -3 buffer allocation failure, -4 CUDA transfer/kernel error.
extern "C" __declspec(dllexport)
int ANSGpuNV12ToBGR(
    const uint8_t* yPlane, int yLinesize,
    const uint8_t* uvPlane, int uvLinesize,
    uint8_t* bgrOut, int bgrStep,
    int width, int height, int gpuIndex)
{
    if (!yPlane || !uvPlane || !bgrOut || width <= 0 || height <= 0) return -1;
    // Select GPU (gpuIndex < 0 keeps the thread's current device).
    if (gpuIndex >= 0) {
        cudaError_t err = cudaSetDevice(gpuIndex);
        if (err != cudaSuccess) {
            fprintf(stderr, "ANSGpuNV12ToBGR: cudaSetDevice(%d) failed: %s\n",
                    gpuIndex, cudaGetErrorString(err));
            return -2;
        }
    }
    t_bufs.ensureAllocated(width, height);
    // Check every resource, not just d_yPlane — a partial allocation
    // failure could otherwise slip through to the memcpys below.
    if (!t_bufs.d_yPlane || !t_bufs.d_uvPlane || !t_bufs.d_bgrOut || !t_bufs.stream)
        return -3; // allocation failed
    // Upload Y and UV planes. Device buffers are tightly packed, so their
    // pitch is exactly `width`; the host rows may be padded (yLinesize /
    // uvLinesize are the caller's strides).
    cudaError_t cpErr = cudaMemcpy2DAsync(t_bufs.d_yPlane, width,
                                          yPlane, yLinesize,
                                          width, height,
                                          cudaMemcpyHostToDevice, t_bufs.stream);
    if (cpErr == cudaSuccess) {
        cpErr = cudaMemcpy2DAsync(t_bufs.d_uvPlane, width,
                                  uvPlane, uvLinesize,
                                  width, height / 2,
                                  cudaMemcpyHostToDevice, t_bufs.stream);
    }
    if (cpErr != cudaSuccess) {
        fprintf(stderr, "ANSGpuNV12ToBGR: upload failed: %s\n",
                cudaGetErrorString(cpErr));
        return -4;
    }
    // Launch BGR kernel (one thread per pixel).
    dim3 block(32, 8);
    dim3 grid((width + block.x - 1) / block.x,
              (height + block.y - 1) / block.y);
    nv12ToBgrKernel<<<grid, block, 0, t_bufs.stream>>>(
        t_bufs.d_yPlane, width,
        t_bufs.d_uvPlane, width,
        t_bufs.d_bgrOut, width * 3,
        width, height);
    // Download BGR result back to the caller's buffer (row stride bgrStep).
    cudaMemcpy2DAsync(bgrOut, bgrStep,
                      t_bufs.d_bgrOut, width * 3,
                      width * 3, height,
                      cudaMemcpyDeviceToHost, t_bufs.stream);
    // Use polling sync instead of cudaStreamSynchronize to avoid
    // holding nvcuda64 SRW lock continuously (WDDM deadlock prevention).
    // Short Sleep(0) fast path for sub-ms kernels, then Sleep(1) to give
    // cleanup operations (cuArrayDestroy, cuMemFree) a window to acquire
    // the exclusive SRW lock. If the stream hits an error, cudaStreamQuery
    // stops returning cudaErrorNotReady and the loop exits; that error is
    // then reported by cudaGetLastError below.
    {
        cudaError_t qerr = cudaStreamQuery(t_bufs.stream);
        if (qerr == cudaErrorNotReady) {
            for (int i = 0; i < 10 && qerr == cudaErrorNotReady; ++i) {
                Sleep(0);
                qerr = cudaStreamQuery(t_bufs.stream);
            }
            while (qerr == cudaErrorNotReady) {
                Sleep(1);
                qerr = cudaStreamQuery(t_bufs.stream);
            }
        }
    }
    // Report any launch/execution/download error from the sequence above.
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "ANSGpuNV12ToBGR: CUDA error: %s\n", cudaGetErrorString(err));
        return -4;
    }
    return 0; // success
}
// ============================================================================
// NV12 rectangular crop → BGR kernel
// Reads a rectangular region from the full NV12 frame and outputs BGR.
// No resize — output is the same size as the crop region.
// ============================================================================
// One thread per output pixel. Copies the cropX/cropY-offset rectangle of
// the NV12 frame to BGR at 1:1 scale (no resize). Crop coordinates that
// fall outside the source are clamped to the nearest edge pixel.
// Luma is taken directly (integer coords); chroma is bilinearly
// interpolated from the half-resolution interleaved UV plane.
__global__ void nv12CropToBGRKernel(
    const uint8_t* __restrict__ yPlane, int yPitch,
    const uint8_t* __restrict__ uvPlane, int uvPitch,
    uint8_t* __restrict__ bgrOut, int outPitch,
    int outW, int outH,
    int cropX, int cropY,
    int srcW, int srcH)
{
    const int ox = blockIdx.x * blockDim.x + threadIdx.x;
    const int oy = blockIdx.y * blockDim.y + threadIdx.y;
    if (ox >= outW || oy >= outH) return;
    // Map output pixel to source coordinate
    const int sx = cropX + ox;
    const int sy = cropY + oy;
    // Clamp to source bounds
    const int cx = min(max(sx, 0), srcW - 1);
    const int cy = min(max(sy, 0), srcH - 1);
    // Read Y
    const float Y = static_cast<float>(yPlane[cy * yPitch + cx]);
    // Bilinear UV (chroma subsampled 2:1 in both dimensions, interleaved NV12)
    // Chroma sample centers are at (0.5, 0.5) offset in luma coords, so map
    // luma (cx,cy) → chroma continuous coords, then bilinear-interpolate.
    const float csx = static_cast<float>(cx) * 0.5f;
    const float csy = static_cast<float>(cy) * 0.5f;
    const int cx0 = static_cast<int>(csx);  // truncation == floor (cx >= 0)
    const int cy0 = static_cast<int>(csy);
    const int cx1 = min(cx0 + 1, (srcW >> 1) - 1);  // clamp right/bottom neighbour
    const int cy1 = min(cy0 + 1, (srcH >> 1) - 1);
    const float cfx = csx - static_cast<float>(cx0);  // fractional weights
    const float cfy = csy - static_cast<float>(cy0);
    // U bytes sit at even offsets within the interleaved chroma row.
    const float u00 = static_cast<float>(uvPlane[cy0 * uvPitch + cx0 * 2]);
    const float u01 = static_cast<float>(uvPlane[cy0 * uvPitch + cx1 * 2]);
    const float u10 = static_cast<float>(uvPlane[cy1 * uvPitch + cx0 * 2]);
    const float u11 = static_cast<float>(uvPlane[cy1 * uvPitch + cx1 * 2]);
    const float U = (u00 * (1.f - cfx) * (1.f - cfy) + u01 * cfx * (1.f - cfy)
        + u10 * (1.f - cfx) * cfy + u11 * cfx * cfy) - 128.f;
    // V bytes sit at odd offsets.
    const float v00 = static_cast<float>(uvPlane[cy0 * uvPitch + cx0 * 2 + 1]);
    const float v01 = static_cast<float>(uvPlane[cy0 * uvPitch + cx1 * 2 + 1]);
    const float v10 = static_cast<float>(uvPlane[cy1 * uvPitch + cx0 * 2 + 1]);
    const float v11 = static_cast<float>(uvPlane[cy1 * uvPitch + cx1 * 2 + 1]);
    const float V = (v00 * (1.f - cfx) * (1.f - cfy) + v01 * cfx * (1.f - cfy)
        + v10 * (1.f - cfx) * cfy + v11 * cfx * cfy) - 128.f;
    // BT.601 YUV → RGB
    uint8_t r, g, b;
    nv12_yuv_to_rgb(Y, U, V, r, g, b);
    // Write BGR (OpenCV channel order)
    const int outIdx = oy * outPitch + ox * 3;
    bgrOut[outIdx] = b;
    bgrOut[outIdx + 1] = g;
    bgrOut[outIdx + 2] = r;
}
// Host wrapper: rectangular NV12 crop → BGR at 1:1 scale on `stream`.
extern "C" void launchNV12CropToBGR(
    const uint8_t* yPlane, int yPitch,
    const uint8_t* uvPlane, int uvPitch,
    uint8_t* bgrOut, int outPitch,
    int outW, int outH,
    int cropX, int cropY,
    int srcW, int srcH,
    cudaStream_t stream)
{
    constexpr unsigned kBx = 32, kBy = 8; // 256 threads per block
    const dim3 block(kBx, kBy);
    const dim3 grid((outW + kBx - 1) / kBx, (outH + kBy - 1) / kBy);
    nv12CropToBGRKernel<<<grid, block, 0, stream>>>(
        yPlane, yPitch, uvPlane, uvPitch,
        bgrOut, outPitch,
        outW, outH,
        cropX, cropY,
        srcW, srcH);
}