// Repository-viewer listing header preserved as a comment so the file compiles:
//   ANSCORE/modules/ANSODEngine/nv12_to_rgb.cu — 770 lines, 28 KiB.
//   (Viewer warning about ambiguous Unicode characters omitted; the em-dashes
//   and arrows in comments below are intentional.)
// nv12_to_rgb.cu — CUDA kernels for NV12→RGB and NV12→BGR conversion.
//
// Two use cases:
// 1. launchNV12ToRGB() — inference fast path (ANSRTYOLO uploads NV12 directly)
// 2. ANSGpuNV12ToBGR() — video_player display path (replaces slow CPU cvtColorTwoPlane)
//
// cv::cuda::cvtColor does NOT support NV12, so we use custom kernels.
#include <cuda_runtime.h>
#include <cstdint>
#ifdef _WIN32
#include <windows.h> // Sleep()
#endif
#include <cstdio>
// ── Shared YUV→RGB computation ───────────────────────────────────────────
__device__ __forceinline__ void nv12_yuv_to_rgb(
    float Y, float U, float V,
    uint8_t& r, uint8_t& g, uint8_t& b)
{
    // BT.601 YUV→RGB (same coefficients as FFmpeg swscale and OpenCV's
    // cvtColorTwoPlane). U and V are expected pre-centred (128 subtracted).
    const float fR = Y + 1.402f * V;
    const float fG = Y - 0.344136f * U - 0.714136f * V;
    const float fB = Y + 1.772f * U;
    // Saturate to [0, 255]; the uint8 conversion truncates (no rounding),
    // matching the rest of this pipeline.
    r = static_cast<uint8_t>(fminf(fmaxf(fR, 0.f), 255.f));
    g = static_cast<uint8_t>(fminf(fmaxf(fG, 0.f), 255.f));
    b = static_cast<uint8_t>(fminf(fmaxf(fB, 0.f), 255.f));
}
__device__ __forceinline__ void nv12_read_yuv(
    const uint8_t* __restrict__ yPlane, int yPitch,
    const uint8_t* __restrict__ uvPlane, int uvPitch,
    int x, int y,
    float& Y, float& U, float& V)
{
    // Fetch one luma sample and its shared interleaved chroma pair.
    // NV12 layout: full-resolution Y plane; half-resolution UV plane with
    // U,V bytes interleaved, so each 2×2 luma quad maps to the chroma pair
    // at byte offset ((y/2)*uvPitch + (x rounded down to even)).
    const uint8_t* uv = uvPlane + (y >> 1) * uvPitch + (x & ~1);
    Y = static_cast<float>(yPlane[y * yPitch + x]);
    U = static_cast<float>(uv[0]) - 128.f;
    V = static_cast<float>(uv[1]) - 128.f;
}
// ── NV12→RGB kernel (for inference fast path) ────────────────────────────
__global__ void nv12ToRgbKernel(
    const uint8_t* __restrict__ yPlane, int yPitch,
    const uint8_t* __restrict__ uvPlane, int uvPitch,
    uint8_t* __restrict__ rgbOut, int outPitch,
    int width, int height)
{
    // One thread per output pixel; 2D launch, guard against over-coverage.
    const int px = blockIdx.x * blockDim.x + threadIdx.x;
    const int py = blockIdx.y * blockDim.y + threadIdx.y;
    if (px >= width || py >= height) return;

    float Y, U, V;
    nv12_read_yuv(yPlane, yPitch, uvPlane, uvPitch, px, py, Y, U, V);

    uint8_t r, g, b;
    nv12_yuv_to_rgb(Y, U, V, r, g, b);

    // Interleaved RGB output, 3 bytes per pixel.
    uint8_t* dst = rgbOut + py * outPitch + px * 3;
    dst[0] = r;
    dst[1] = g;
    dst[2] = b;
}
// ── NV12→BGR kernel (for LabVIEW display — matches OpenCV channel order) ─
__global__ void nv12ToBgrKernel(
    const uint8_t* __restrict__ yPlane, int yPitch,
    const uint8_t* __restrict__ uvPlane, int uvPitch,
    uint8_t* __restrict__ bgrOut, int outPitch,
    int width, int height)
{
    // One thread per output pixel; identical to nv12ToRgbKernel except the
    // channel order is swapped to BGR (OpenCV convention).
    const int px = blockIdx.x * blockDim.x + threadIdx.x;
    const int py = blockIdx.y * blockDim.y + threadIdx.y;
    if (px >= width || py >= height) return;

    float Y, U, V;
    nv12_read_yuv(yPlane, yPitch, uvPlane, uvPitch, px, py, Y, U, V);

    uint8_t r, g, b;
    nv12_yuv_to_rgb(Y, U, V, r, g, b);

    uint8_t* dst = bgrOut + py * outPitch + px * 3;
    dst[0] = b;  // BGR order
    dst[1] = g;
    dst[2] = r;
}
// ── Fused NV12→RGB + letterbox resize kernel ────────────────────────────
// Runs at OUTPUT resolution (e.g. 640×640), sampling from 4K NV12 source
// with bilinear interpolation. This avoids the 24MB 4K RGB intermediate
// and processes 20× fewer pixels than separate convert + resize.
__device__ __forceinline__ void nv12_read_yuv_bilinear(
    const uint8_t* __restrict__ yPlane, int yPitch,
    const uint8_t* __restrict__ uvPlane, int uvPitch,
    float sx, float sy, int srcW, int srcH,
    float& Y, float& U, float& V)
{
    // Bilinearly sample Y, U, V from an NV12 frame at continuous source
    // coordinate (sx, sy). Outputs: Y in [0, 255]; U/V centred (128 subtracted).
    //
    // Fix: clamp the sample point into the valid texel range BEFORE the
    // integer/fraction split. The previous code only clamped x0/y0 downward
    // (after deriving x1/y1), so a coordinate at or above srcW-1 — possible
    // from caller-side rounding when mapping output pixel centres — could
    // index one texel past the last column/row. Clamping first also
    // guarantees the interpolation weights stay within [0, 1).
    sx = fminf(fmaxf(sx, 0.f), static_cast<float>(srcW - 1));
    sy = fminf(fmaxf(sy, 0.f), static_cast<float>(srcH - 1));

    // ── Luma: bilinear over the 2×2 neighbourhood ──
    const int x0 = static_cast<int>(sx);
    const int y0 = static_cast<int>(sy);
    const int x1 = min(x0 + 1, srcW - 1);
    const int y1 = min(y0 + 1, srcH - 1);
    const float fx = sx - static_cast<float>(x0);
    const float fy = sy - static_cast<float>(y0);
    const float w00 = (1.f - fx) * (1.f - fy);
    const float w01 = fx * (1.f - fy);
    const float w10 = (1.f - fx) * fy;
    const float w11 = fx * fy;
    Y = w00 * static_cast<float>(yPlane[y0 * yPitch + x0])
      + w01 * static_cast<float>(yPlane[y0 * yPitch + x1])
      + w10 * static_cast<float>(yPlane[y1 * yPitch + x0])
      + w11 * static_cast<float>(yPlane[y1 * yPitch + x1]);

    // ── Chroma: NV12 interleaved U/V at half resolution in both axes ──
    const int halfW = srcW >> 1;
    const int halfH = srcH >> 1;
    // Map to chroma coords; sx/sy are already >= 0, so only the upper
    // bound needs clamping here.
    const float csx = fminf(sx * 0.5f, static_cast<float>(halfW - 1));
    const float csy = fminf(sy * 0.5f, static_cast<float>(halfH - 1));
    const int cx0 = static_cast<int>(csx);
    const int cy0 = static_cast<int>(csy);
    const int cx1 = min(cx0 + 1, halfW - 1);
    const int cy1 = min(cy0 + 1, halfH - 1);
    const float cfx = csx - static_cast<float>(cx0);
    const float cfy = csy - static_cast<float>(cy0);
    const float cw00 = (1.f - cfx) * (1.f - cfy);
    const float cw01 = cfx * (1.f - cfy);
    const float cw10 = (1.f - cfx) * cfy;
    const float cw11 = cfx * cfy;
    // Interleaved pairs: U at even byte offsets, V at odd.
    const uint8_t* row0 = uvPlane + cy0 * uvPitch;
    const uint8_t* row1 = uvPlane + cy1 * uvPitch;
    U = (cw00 * static_cast<float>(row0[cx0 * 2])
       + cw01 * static_cast<float>(row0[cx1 * 2])
       + cw10 * static_cast<float>(row1[cx0 * 2])
       + cw11 * static_cast<float>(row1[cx1 * 2])) - 128.f;
    V = (cw00 * static_cast<float>(row0[cx0 * 2 + 1])
       + cw01 * static_cast<float>(row0[cx1 * 2 + 1])
       + cw10 * static_cast<float>(row1[cx0 * 2 + 1])
       + cw11 * static_cast<float>(row1[cx1 * 2 + 1])) - 128.f;
}
__global__ void nv12ToRgbLetterboxKernel(
    const uint8_t* __restrict__ yPlane, int yPitch,
    const uint8_t* __restrict__ uvPlane, int uvPitch,
    uint8_t* __restrict__ rgbOut, int outPitch,
    int outW, int outH,     // model input size (e.g. 640×640)
    int srcW, int srcH,     // NV12 source size (e.g. 3840×2160)
    int unpadW, int unpadH, // letterboxed content size within output
    float invScale)         // 1/scale = srcDim/unpadDim
{
    const int ox = blockIdx.x * blockDim.x + threadIdx.x;
    const int oy = blockIdx.y * blockDim.y + threadIdx.y;
    if (ox >= outW || oy >= outH) return;

    uint8_t* dst = rgbOut + oy * outPitch + ox * 3;

    // Top-left letterbox: everything right of / below the content is black.
    if (ox >= unpadW || oy >= unpadH) {
        dst[0] = 0;
        dst[1] = 0;
        dst[2] = 0;
        return;
    }

    // Pixel-centre mapping from output space back into the NV12 source.
    float sx = (static_cast<float>(ox) + 0.5f) * invScale - 0.5f;
    float sy = (static_cast<float>(oy) + 0.5f) * invScale - 0.5f;
    sx = fmaxf(sx, 0.f);
    sy = fmaxf(sy, 0.f);

    float Y, U, V;
    nv12_read_yuv_bilinear(yPlane, yPitch, uvPlane, uvPitch,
                           sx, sy, srcW, srcH, Y, U, V);

    uint8_t r, g, b;
    nv12_yuv_to_rgb(Y, U, V, r, g, b);
    dst[0] = r;
    dst[1] = g;
    dst[2] = b;
}
// ── Host-callable wrappers ───────────────────────────────────────────────
extern "C" void launchNV12ToRGBLetterbox(
    const uint8_t* yPlane, int yPitch,
    const uint8_t* uvPlane, int uvPitch,
    uint8_t* rgbOut, int outPitch,
    int outW, int outH,
    int srcW, int srcH,
    int unpadW, int unpadH,
    float invScale,
    cudaStream_t stream)
{
    // Host wrapper: fused NV12→RGB + top-left letterbox at output resolution.
    constexpr unsigned BX = 32, BY = 8;
    const dim3 block(BX, BY);
    const dim3 grid((outW + BX - 1) / BX, (outH + BY - 1) / BY);
    nv12ToRgbLetterboxKernel<<<grid, block, 0, stream>>>(
        yPlane, yPitch, uvPlane, uvPitch,
        rgbOut, outPitch, outW, outH,
        srcW, srcH, unpadW, unpadH, invScale);
}
extern "C" void launchNV12ToRGB(
    const uint8_t* yPlane, int yPitch,
    const uint8_t* uvPlane, int uvPitch,
    uint8_t* rgbOut, int outPitch,
    int width, int height,
    cudaStream_t stream)
{
    // Host wrapper: plain full-resolution NV12→RGB (inference fast path).
    constexpr unsigned BX = 32, BY = 8;
    const dim3 block(BX, BY);
    const dim3 grid((width + BX - 1) / BX, (height + BY - 1) / BY);
    nv12ToRgbKernel<<<grid, block, 0, stream>>>(
        yPlane, yPitch, uvPlane, uvPitch,
        rgbOut, outPitch, width, height);
}
// ── Fused NV12→RGB + direct resize kernel (for classification) ───────────
// Like letterbox kernel but stretches to fill the entire output (no padding).
// Used for classification models that expect direct resize, not aspect-
// preserving letterbox.
__global__ void nv12ToRgbResizeKernel(
    const uint8_t* __restrict__ yPlane, int yPitch,
    const uint8_t* __restrict__ uvPlane, int uvPitch,
    uint8_t* __restrict__ rgbOut, int outPitch,
    int outW, int outH, // model input size (e.g. 224×224)
    int srcW, int srcH) // NV12 source size (e.g. 2688×1520)
{
    const int ox = blockIdx.x * blockDim.x + threadIdx.x;
    const int oy = blockIdx.y * blockDim.y + threadIdx.y;
    if (ox >= outW || oy >= outH) return;

    // Stretch-to-fill pixel-centre mapping (no aspect preservation).
    float sx = (static_cast<float>(ox) + 0.5f) * static_cast<float>(srcW) / static_cast<float>(outW) - 0.5f;
    float sy = (static_cast<float>(oy) + 0.5f) * static_cast<float>(srcH) / static_cast<float>(outH) - 0.5f;
    sx = fmaxf(sx, 0.f);
    sy = fmaxf(sy, 0.f);

    float Y, U, V;
    nv12_read_yuv_bilinear(yPlane, yPitch, uvPlane, uvPitch,
                           sx, sy, srcW, srcH, Y, U, V);

    uint8_t r, g, b;
    nv12_yuv_to_rgb(Y, U, V, r, g, b);
    uint8_t* dst = rgbOut + oy * outPitch + ox * 3;
    dst[0] = r;
    dst[1] = g;
    dst[2] = b;
}
extern "C" void launchNV12ToRGBResize(
    const uint8_t* yPlane, int yPitch,
    const uint8_t* uvPlane, int uvPitch,
    uint8_t* rgbOut, int outPitch,
    int outW, int outH,
    int srcW, int srcH,
    cudaStream_t stream)
{
    // Host wrapper: fused NV12→RGB + direct (stretch) resize.
    const dim3 block(32, 8);
    const dim3 grid((outW + block.x - 1) / block.x,
                    (outH + block.y - 1) / block.y);
    nv12ToRgbResizeKernel<<<grid, block, 0, stream>>>(
        yPlane, yPitch, uvPlane, uvPitch,
        rgbOut, outPitch, outW, outH, srcW, srcH);
}
// ── Fused NV12→RGB + resize + CHW planar output (for SAM3 image encoder) ─
// Writes R, G, B to separate contiguous planes: [R plane][G plane][B plane]
// at model input resolution. Eliminates cv::resize + cvtColor + split + memcpy.
__global__ void nv12ToRgbResizeCHW_uint8Kernel(
    const uint8_t* __restrict__ yPlane, int yPitch,
    const uint8_t* __restrict__ uvPlane, int uvPitch,
    uint8_t* __restrict__ chwOut, // contiguous CHW: R plane, G plane, B plane
    int outW, int outH,
    int srcW, int srcH)
{
    const int ox = blockIdx.x * blockDim.x + threadIdx.x;
    const int oy = blockIdx.y * blockDim.y + threadIdx.y;
    if (ox >= outW || oy >= outH) return;

    // Stretch-to-fill pixel-centre mapping into the NV12 source.
    float sx = (static_cast<float>(ox) + 0.5f) * static_cast<float>(srcW) / static_cast<float>(outW) - 0.5f;
    float sy = (static_cast<float>(oy) + 0.5f) * static_cast<float>(srcH) / static_cast<float>(outH) - 0.5f;
    sx = fmaxf(sx, 0.f);
    sy = fmaxf(sy, 0.f);

    float Y, U, V;
    nv12_read_yuv_bilinear(yPlane, yPitch, uvPlane, uvPitch,
                           sx, sy, srcW, srcH, Y, U, V);

    uint8_t r, g, b;
    nv12_yuv_to_rgb(Y, U, V, r, g, b);

    // Planar write: one byte into each of the R, G, B planes.
    const int plane = outW * outH;
    uint8_t* dst = chwOut + oy * outW + ox;
    dst[0] = r;
    dst[plane] = g;
    dst[2 * plane] = b;
}
__global__ void nv12ToRgbResizeCHW_floatKernel(
    const uint8_t* __restrict__ yPlane, int yPitch,
    const uint8_t* __restrict__ uvPlane, int uvPitch,
    float* __restrict__ chwOut, // contiguous CHW float32, values in [0, 255]
    int outW, int outH,
    int srcW, int srcH)
{
    const int ox = blockIdx.x * blockDim.x + threadIdx.x;
    const int oy = blockIdx.y * blockDim.y + threadIdx.y;
    if (ox >= outW || oy >= outH) return;

    // Same stretch-to-fill mapping as the uint8 CHW kernel.
    float sx = (static_cast<float>(ox) + 0.5f) * static_cast<float>(srcW) / static_cast<float>(outW) - 0.5f;
    float sy = (static_cast<float>(oy) + 0.5f) * static_cast<float>(srcH) / static_cast<float>(outH) - 0.5f;
    sx = fmaxf(sx, 0.f);
    sy = fmaxf(sy, 0.f);

    float Y, U, V;
    nv12_read_yuv_bilinear(yPlane, yPitch, uvPlane, uvPitch,
                           sx, sy, srcW, srcH, Y, U, V);

    uint8_t r, g, b;
    nv12_yuv_to_rgb(Y, U, V, r, g, b);

    // Planar float write — NOT normalised to [0,1]; downstream does that.
    const int plane = outW * outH;
    float* dst = chwOut + oy * outW + ox;
    dst[0] = static_cast<float>(r);
    dst[plane] = static_cast<float>(g);
    dst[2 * plane] = static_cast<float>(b);
}
extern "C" void launchNV12ToRGBResizeCHW_uint8(
    const uint8_t* yPlane, int yPitch,
    const uint8_t* uvPlane, int uvPitch,
    uint8_t* chwOut,
    int outW, int outH,
    int srcW, int srcH,
    cudaStream_t stream)
{
    // Host wrapper: fused NV12→RGB + resize, planar uint8 CHW output.
    constexpr unsigned BX = 32, BY = 8;
    const dim3 grid((outW + BX - 1) / BX, (outH + BY - 1) / BY);
    nv12ToRgbResizeCHW_uint8Kernel<<<grid, dim3(BX, BY), 0, stream>>>(
        yPlane, yPitch, uvPlane, uvPitch,
        chwOut, outW, outH, srcW, srcH);
}
extern "C" void launchNV12ToRGBResizeCHW_float(
    const uint8_t* yPlane, int yPitch,
    const uint8_t* uvPlane, int uvPitch,
    float* chwOut,
    int outW, int outH,
    int srcW, int srcH,
    cudaStream_t stream)
{
    // Host wrapper: fused NV12→RGB + resize, planar float32 CHW output.
    constexpr unsigned BX = 32, BY = 8;
    const dim3 grid((outW + BX - 1) / BX, (outH + BY - 1) / BY);
    nv12ToRgbResizeCHW_floatKernel<<<grid, dim3(BX, BY), 0, stream>>>(
        yPlane, yPitch, uvPlane, uvPitch,
        chwOut, outW, outH, srcW, srcH);
}
extern "C" void launchNV12ToBGR(
    const uint8_t* yPlane, int yPitch,
    const uint8_t* uvPlane, int uvPitch,
    uint8_t* bgrOut, int outPitch,
    int width, int height,
    cudaStream_t stream)
{
    // Host wrapper: plain full-resolution NV12→BGR (display path).
    constexpr unsigned BX = 32, BY = 8;
    const dim3 block(BX, BY);
    const dim3 grid((width + BX - 1) / BX, (height + BY - 1) / BY);
    nv12ToBgrKernel<<<grid, block, 0, stream>>>(
        yPlane, yPitch, uvPlane, uvPitch,
        bgrOut, outPitch, width, height);
}
// ── Fused NV12→BGR + bilinear resize kernel (for OCR detection) ──────────
// Same algorithm as nv12ToRgbResizeKernel but outputs BGR channel order.
__global__ void nv12ToBgrResizeKernel(
    const uint8_t* __restrict__ yPlane, int yPitch,
    const uint8_t* __restrict__ uvPlane, int uvPitch,
    uint8_t* __restrict__ bgrOut, int outPitch,
    int outW, int outH,
    int srcW, int srcH)
{
    // Identical sampling to nv12ToRgbResizeKernel; only the channel order
    // of the packed output differs (BGR, for OCR detection / OpenCV).
    const int ox = blockIdx.x * blockDim.x + threadIdx.x;
    const int oy = blockIdx.y * blockDim.y + threadIdx.y;
    if (ox >= outW || oy >= outH) return;

    float sx = (static_cast<float>(ox) + 0.5f) * static_cast<float>(srcW) / static_cast<float>(outW) - 0.5f;
    float sy = (static_cast<float>(oy) + 0.5f) * static_cast<float>(srcH) / static_cast<float>(outH) - 0.5f;
    sx = fmaxf(sx, 0.f);
    sy = fmaxf(sy, 0.f);

    float Y, U, V;
    nv12_read_yuv_bilinear(yPlane, yPitch, uvPlane, uvPitch,
                           sx, sy, srcW, srcH, Y, U, V);

    uint8_t r, g, b;
    nv12_yuv_to_rgb(Y, U, V, r, g, b);
    uint8_t* dst = bgrOut + oy * outPitch + ox * 3;
    dst[0] = b;
    dst[1] = g;
    dst[2] = r;
}
extern "C" void launchNV12ToBGRResize(
    const uint8_t* yPlane, int yPitch,
    const uint8_t* uvPlane, int uvPitch,
    uint8_t* bgrOut, int outPitch,
    int outW, int outH,
    int srcW, int srcH,
    cudaStream_t stream)
{
    // Host wrapper: fused NV12→BGR + direct (stretch) resize.
    const dim3 block(32, 8);
    const dim3 grid((outW + block.x - 1) / block.x,
                    (outH + block.y - 1) / block.y);
    nv12ToBgrResizeKernel<<<grid, block, 0, stream>>>(
        yPlane, yPitch, uvPlane, uvPitch,
        bgrOut, outPitch, outW, outH, srcW, srcH);
}
// ── Center-padded NV12→RGB + letterbox resize kernel (for SCRFD) ─────────
// Same as nv12ToRgbLetterboxKernel but content is centered with (padLeft, padTop)
// offset instead of placed at top-left origin. Used by SCRFD face detector
// which uses center-padding (dw = pad_w/2, dh = pad_h/2).
__global__ void nv12ToRgbCenterLetterboxKernel(
    const uint8_t* __restrict__ yPlane, int yPitch,
    const uint8_t* __restrict__ uvPlane, int uvPitch,
    uint8_t* __restrict__ rgbOut, int outPitch,
    int outW, int outH,
    int srcW, int srcH,
    int unpadW, int unpadH,
    float invScale,
    int padLeft, int padTop)
{
    const int ox = blockIdx.x * blockDim.x + threadIdx.x;
    const int oy = blockIdx.y * blockDim.y + threadIdx.y;
    if (ox >= outW || oy >= outH) return;

    uint8_t* dst = rgbOut + oy * outPitch + ox * 3;

    // Shift into content coordinates; anything outside [0,unpadW)×[0,unpadH)
    // is centered-letterbox padding and is written black.
    const int cx = ox - padLeft;
    const int cy = oy - padTop;
    if (cx < 0 || cy < 0 || cx >= unpadW || cy >= unpadH) {
        dst[0] = 0;
        dst[1] = 0;
        dst[2] = 0;
        return;
    }

    // Pixel-centre mapping from content space into the NV12 source.
    float sx = (static_cast<float>(cx) + 0.5f) * invScale - 0.5f;
    float sy = (static_cast<float>(cy) + 0.5f) * invScale - 0.5f;
    sx = fmaxf(sx, 0.f);
    sy = fmaxf(sy, 0.f);

    float Y, U, V;
    nv12_read_yuv_bilinear(yPlane, yPitch, uvPlane, uvPitch,
                           sx, sy, srcW, srcH, Y, U, V);

    uint8_t r, g, b;
    nv12_yuv_to_rgb(Y, U, V, r, g, b);
    dst[0] = r;
    dst[1] = g;
    dst[2] = b;
}
extern "C" void launchNV12ToRGBCenterLetterbox(
    const uint8_t* yPlane, int yPitch,
    const uint8_t* uvPlane, int uvPitch,
    uint8_t* rgbOut, int outPitch,
    int outW, int outH,
    int srcW, int srcH,
    int unpadW, int unpadH,
    float invScale,
    int padLeft, int padTop,
    cudaStream_t stream)
{
    // Host wrapper: NV12→RGB with centre-padded letterbox (SCRFD-style
    // dw = pad_w/2, dh = pad_h/2).
    constexpr unsigned BX = 32, BY = 8;
    const dim3 grid((outW + BX - 1) / BX, (outH + BY - 1) / BY);
    nv12ToRgbCenterLetterboxKernel<<<grid, dim3(BX, BY), 0, stream>>>(
        yPlane, yPitch, uvPlane, uvPitch,
        rgbOut, outPitch, outW, outH,
        srcW, srcH, unpadW, unpadH, invScale,
        padLeft, padTop);
}
// ── NV12 affine warp → BGR kernel (for face alignment) ──────────────────
// Reads from full-resolution NV12 source, applies inverse affine transform,
// and outputs a small aligned face image (e.g. 112×112) in BGR order.
// The inverse affine maps each output pixel back to the NV12 source coords.
// Border handling: replicate edge (matches cv::BORDER_REPLICATE).
// Outputs BGR to match existing pipeline (ANSFaceRecognizer expects BGR).
__global__ void nv12AffineWarpToBgrKernel(
    const uint8_t* __restrict__ yPlane, int yPitch,
    const uint8_t* __restrict__ uvPlane, int uvPitch,
    uint8_t* __restrict__ bgrOut, int outPitch,
    int outW, int outH,
    int srcW, int srcH,
    float m00, float m01, float m02,
    float m10, float m11, float m12)
{
    // Face-alignment warp: each output pixel of the small aligned crop
    // (e.g. 112×112) is mapped back into the full NV12 frame via the
    // 2×3 inverse affine matrix [m00 m01 m02; m10 m11 m12].
    const int ox = blockIdx.x * blockDim.x + threadIdx.x;
    const int oy = blockIdx.y * blockDim.y + threadIdx.y;
    if (ox >= outW || oy >= outH) return;

    float sx = m00 * ox + m01 * oy + m02;
    float sy = m10 * ox + m11 * oy + m12;
    // Clamp to the frame — equivalent to cv::BORDER_REPLICATE.
    sx = fmaxf(0.f, fminf(sx, static_cast<float>(srcW - 1)));
    sy = fmaxf(0.f, fminf(sy, static_cast<float>(srcH - 1)));

    float Y, U, V;
    nv12_read_yuv_bilinear(yPlane, yPitch, uvPlane, uvPitch,
                           sx, sy, srcW, srcH, Y, U, V);

    uint8_t r, g, b;
    nv12_yuv_to_rgb(Y, U, V, r, g, b);

    // BGR order — downstream face recognizer expects OpenCV layout.
    uint8_t* dst = bgrOut + oy * outPitch + ox * 3;
    dst[0] = b;
    dst[1] = g;
    dst[2] = r;
}
extern "C" void launchNV12AffineWarpToBGR(
    const uint8_t* yPlane, int yPitch,
    const uint8_t* uvPlane, int uvPitch,
    uint8_t* bgrOut, int outPitch,
    int outW, int outH,
    int srcW, int srcH,
    float m00, float m01, float m02,
    float m10, float m11, float m12,
    cudaStream_t stream)
{
    // Host wrapper: inverse-affine NV12→BGR warp (face alignment).
    // Smaller 16×8 block — outputs here are tiny (e.g. 112×112).
    constexpr unsigned BX = 16, BY = 8;
    const dim3 grid((outW + BX - 1) / BX, (outH + BY - 1) / BY);
    nv12AffineWarpToBgrKernel<<<grid, dim3(BX, BY), 0, stream>>>(
        yPlane, yPitch, uvPlane, uvPitch,
        bgrOut, outPitch, outW, outH,
        srcW, srcH,
        m00, m01, m02, m10, m11, m12);
}
// ── High-level NV12→BGR: CPU→GPU upload, kernel, GPU→CPU download ────────
// Thread-local GPU buffers avoid per-call cudaMalloc overhead.
// Exported for cross-DLL use by video_player via GetProcAddress.
// Thread-local device buffers + stream for ANSGpuNV12ToBGR. Reused across
// calls to avoid per-call cudaMalloc/cudaStreamCreate overhead.
struct GpuNV12Buffers {
    uint8_t* d_yPlane = nullptr;   // w×h luma plane
    uint8_t* d_uvPlane = nullptr;  // w×(h/2) interleaved UV plane
    uint8_t* d_bgrOut = nullptr;   // w×h×3 packed BGR output
    int allocWidth = 0;
    int allocHeight = 0;
    cudaStream_t stream = nullptr;

    // (Re)allocate device buffers for a w×h frame. Returns true on success.
    // Fix: every cudaMalloc/cudaStreamCreate result is now checked; on ANY
    // failure everything is released so all pointers read null. Previously a
    // partial failure (e.g. Y plane allocated, UV plane not) left a mix of
    // valid and null pointers that the caller's d_yPlane-only check missed,
    // leading to a kernel launch on a null UV/output buffer.
    // (Return type changed void→bool; existing callers that ignore the
    // result still compile and can keep checking the pointers.)
    bool ensureAllocated(int w, int h) {
        if (w == allocWidth && h == allocHeight && d_yPlane) return true;
        free();
        const size_t ySize = (size_t)w * h;         // full-res luma
        const size_t uvSize = (size_t)w * (h / 2);  // half-height UV rows, w bytes each
        const size_t bgrSize = (size_t)w * h * 3;   // tightly packed BGR
        if (cudaMalloc(&d_yPlane, ySize) != cudaSuccess ||
            cudaMalloc(&d_uvPlane, uvSize) != cudaSuccess ||
            cudaMalloc(&d_bgrOut, bgrSize) != cudaSuccess ||
            cudaStreamCreate(&stream) != cudaSuccess) {
            free();  // all-or-nothing: never leave a partial allocation
            return false;
        }
        allocWidth = w;
        allocHeight = h;
        return true;
    }

    // Release all device resources; safe to call repeatedly.
    void free() {
        if (d_yPlane) { cudaFree(d_yPlane); d_yPlane = nullptr; }
        if (d_uvPlane) { cudaFree(d_uvPlane); d_uvPlane = nullptr; }
        if (d_bgrOut) { cudaFree(d_bgrOut); d_bgrOut = nullptr; }
        if (stream) { cudaStreamDestroy(stream); stream = nullptr; }
        allocWidth = allocHeight = 0;
    }
    ~GpuNV12Buffers() { free(); }
};
static thread_local GpuNV12Buffers t_bufs;
// High-level NV12→BGR for video_player (exported via GetProcAddress):
// upload Y/UV planes, run the BGR kernel, download packed BGR.
// Returns 0 on success; -1 bad arguments, -2 cudaSetDevice failed,
// -3 device buffer allocation failed, -4 CUDA error during copy/kernel/sync.
extern "C" __declspec(dllexport)
int ANSGpuNV12ToBGR(
    const uint8_t* yPlane, int yLinesize,
    const uint8_t* uvPlane, int uvLinesize,
    uint8_t* bgrOut, int bgrStep,
    int width, int height, int gpuIndex)
{
    if (!yPlane || !uvPlane || !bgrOut || width <= 0 || height <= 0) return -1;
    // Select GPU (gpuIndex < 0 means "keep the thread's current device").
    if (gpuIndex >= 0) {
        cudaError_t err = cudaSetDevice(gpuIndex);
        if (err != cudaSuccess) {
            fprintf(stderr, "ANSGpuNV12ToBGR: cudaSetDevice(%d) failed: %s\n",
                    gpuIndex, cudaGetErrorString(err));
            return -2;
        }
    }
    t_bufs.ensureAllocated(width, height);
    // Fix: check ALL device resources, not just d_yPlane. A partial
    // allocation failure (Y plane ok, UV/output/stream not) previously
    // slipped through and produced an illegal-address fault in the kernel.
    if (!t_bufs.d_yPlane || !t_bufs.d_uvPlane || !t_bufs.d_bgrOut || !t_bufs.stream)
        return -3;
    // Upload Y and UV planes (device buffers are tightly packed: pitch = width).
    cudaMemcpy2DAsync(t_bufs.d_yPlane, width,
                      yPlane, yLinesize,
                      width, height,
                      cudaMemcpyHostToDevice, t_bufs.stream);
    cudaMemcpy2DAsync(t_bufs.d_uvPlane, width,
                      uvPlane, uvLinesize,
                      width, height / 2,  // UV rows are width bytes (w/2 interleaved pairs)
                      cudaMemcpyHostToDevice, t_bufs.stream);
    // Launch BGR kernel.
    dim3 block(32, 8);
    dim3 grid((width + block.x - 1) / block.x,
              (height + block.y - 1) / block.y);
    nv12ToBgrKernel<<<grid, block, 0, t_bufs.stream>>>(
        t_bufs.d_yPlane, width,
        t_bufs.d_uvPlane, width,
        t_bufs.d_bgrOut, width * 3,
        width, height);
    // Download BGR result back to CPU.
    cudaMemcpy2DAsync(bgrOut, bgrStep,
                      t_bufs.d_bgrOut, width * 3,
                      width * 3, height,
                      cudaMemcpyDeviceToHost, t_bufs.stream);
    // Use polling sync instead of cudaStreamSynchronize to avoid holding
    // the nvcuda64 SRW lock continuously (WDDM deadlock prevention).
    // Short Sleep(0) fast path for sub-ms kernels, then Sleep(1) to give
    // cleanup operations (cuArrayDestroy, cuMemFree) a window to acquire
    // the exclusive SRW lock. Deliberate — do not replace with a blocking sync.
    cudaError_t qerr = cudaStreamQuery(t_bufs.stream);
    if (qerr == cudaErrorNotReady) {
        for (int i = 0; i < 10 && qerr == cudaErrorNotReady; ++i) {
            Sleep(0);
            qerr = cudaStreamQuery(t_bufs.stream);
        }
        while (qerr == cudaErrorNotReady) {
            Sleep(1);
            qerr = cudaStreamQuery(t_bufs.stream);
        }
    }
    // Fix: report a terminal stream-query error explicitly (previously only
    // cudaGetLastError was consulted, which can miss it).
    if (qerr != cudaSuccess) {
        fprintf(stderr, "ANSGpuNV12ToBGR: stream error: %s\n",
                cudaGetErrorString(qerr));
        (void)cudaGetLastError();  // clear the sticky error for later calls
        return -4;
    }
    // Catch launch-configuration / async execution errors.
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "ANSGpuNV12ToBGR: CUDA error: %s\n", cudaGetErrorString(err));
        return -4;
    }
    return 0;  // success
}
// ============================================================================
// NV12 rectangular crop → BGR kernel
// Reads a rectangular region from the full NV12 frame and outputs BGR.
// No resize — output is the same size as the crop region.
// ============================================================================
// Crop kernel: copies the outW×outH rectangle at (cropX, cropY) out of the
// full NV12 frame as packed BGR. One thread per output pixel, no scaling.
__global__ void nv12CropToBGRKernel(
const uint8_t* __restrict__ yPlane, int yPitch,
const uint8_t* __restrict__ uvPlane, int uvPitch,
uint8_t* __restrict__ bgrOut, int outPitch,
int outW, int outH,
int cropX, int cropY,
int srcW, int srcH)
{
const int ox = blockIdx.x * blockDim.x + threadIdx.x;
const int oy = blockIdx.y * blockDim.y + threadIdx.y;
if (ox >= outW || oy >= outH) return;
// Map output pixel to source coordinate (1:1, no scaling).
const int sx = cropX + ox;
const int sy = cropY + oy;
// Clamp to source bounds: crop rectangles extending past the frame edge
// replicate the border pixel instead of reading out of bounds.
const int cx = min(max(sx, 0), srcW - 1);
const int cy = min(max(sy, 0), srcH - 1);
// Luma: direct full-resolution fetch, no interpolation.
const float Y = static_cast<float>(yPlane[cy * yPitch + cx]);
// Chroma: NV12 stores interleaved U/V at half resolution in both axes.
// The luma coordinate is mapped to continuous chroma coords by a plain
// cx*0.5 / cy*0.5 scale and the 2×2 chroma neighbourhood is bilinearly
// interpolated. NOTE(review): this treats chroma samples as co-sited with
// even luma columns — no (0.25, 0.25) centre offset is applied. That
// matches the other kernels in this file; confirm whether exact chroma
// siting matters for this path.
const float csx = static_cast<float>(cx) * 0.5f;
const float csy = static_cast<float>(cy) * 0.5f;
const int cx0 = static_cast<int>(csx);
const int cy0 = static_cast<int>(csy);
// Clamp the +1 neighbour to the last chroma column/row.
const int cx1 = min(cx0 + 1, (srcW >> 1) - 1);
const int cy1 = min(cy0 + 1, (srcH >> 1) - 1);
const float cfx = csx - static_cast<float>(cx0);
const float cfy = csy - static_cast<float>(cy0);
// Bilinear U (even bytes of the interleaved UV rows), centred by -128.
const float u00 = static_cast<float>(uvPlane[cy0 * uvPitch + cx0 * 2]);
const float u01 = static_cast<float>(uvPlane[cy0 * uvPitch + cx1 * 2]);
const float u10 = static_cast<float>(uvPlane[cy1 * uvPitch + cx0 * 2]);
const float u11 = static_cast<float>(uvPlane[cy1 * uvPitch + cx1 * 2]);
const float U = (u00 * (1.f - cfx) * (1.f - cfy) + u01 * cfx * (1.f - cfy)
+ u10 * (1.f - cfx) * cfy + u11 * cfx * cfy) - 128.f;
// Bilinear V (odd bytes of the interleaved UV rows), centred by -128.
const float v00 = static_cast<float>(uvPlane[cy0 * uvPitch + cx0 * 2 + 1]);
const float v01 = static_cast<float>(uvPlane[cy0 * uvPitch + cx1 * 2 + 1]);
const float v10 = static_cast<float>(uvPlane[cy1 * uvPitch + cx0 * 2 + 1]);
const float v11 = static_cast<float>(uvPlane[cy1 * uvPitch + cx1 * 2 + 1]);
const float V = (v00 * (1.f - cfx) * (1.f - cfy) + v01 * cfx * (1.f - cfy)
+ v10 * (1.f - cfx) * cfy + v11 * cfx * cfy) - 128.f;
// BT.601 YUV → RGB (shared helper).
uint8_t r, g, b;
nv12_yuv_to_rgb(Y, U, V, r, g, b);
// Write BGR (OpenCV channel order).
const int outIdx = oy * outPitch + ox * 3;
bgrOut[outIdx] = b;
bgrOut[outIdx + 1] = g;
bgrOut[outIdx + 2] = r;
}
extern "C" void launchNV12CropToBGR(
    const uint8_t* yPlane, int yPitch,
    const uint8_t* uvPlane, int uvPitch,
    uint8_t* bgrOut, int outPitch,
    int outW, int outH,
    int cropX, int cropY,
    int srcW, int srcH,
    cudaStream_t stream)
{
    // Host wrapper: 1:1 rectangular NV12 crop → packed BGR.
    constexpr unsigned BX = 32, BY = 8;
    const dim3 grid((outW + BX - 1) / BX, (outH + BY - 1) / BY);
    nv12CropToBGRKernel<<<grid, dim3(BX, BY), 0, stream>>>(
        yPlane, yPitch, uvPlane, uvPitch,
        bgrOut, outPitch,
        outW, outH,
        cropX, cropY,
        srcW, srcH);
}