// nv12_to_rgb.cu — CUDA kernels for NV12→RGB and NV12→BGR conversion.
//
// Two use cases:
//  1. launchNV12ToRGB()  — inference fast path (ANSRTYOLO uploads NV12 directly)
//  2. ANSGpuNV12ToBGR()  — video_player display path (replaces slow CPU cvtColorTwoPlane)
//
// cv::cuda::cvtColor does NOT support NV12, so we use custom kernels.

#include <cstdint>
#include <cstdio>
#include <cuda_runtime.h>

// ── Shared YUV→RGB computation ───────────────────────────────────────────

// Convert one YUV sample (U/V already centered on 0) to 8-bit RGB using the
// BT.601 full-range coefficients, clamping each channel to [0, 255].
__device__ __forceinline__ void nv12_yuv_to_rgb(
    float Y, float U, float V,
    uint8_t& r, uint8_t& g, uint8_t& b)
{
    // BT.601 conversion (same as FFmpeg swscale / OpenCV cvtColorTwoPlane)
    const float R = Y + 1.402f * V;
    const float G = Y - 0.344136f * U - 0.714136f * V;
    const float B = Y + 1.772f * U;
    r = static_cast<uint8_t>(fminf(fmaxf(R, 0.f), 255.f));
    g = static_cast<uint8_t>(fminf(fmaxf(G, 0.f), 255.f));
    b = static_cast<uint8_t>(fminf(fmaxf(B, 0.f), 255.f));
}

// Read Y/U/V for luma pixel (x, y) from an NV12 frame. Chroma is sampled
// nearest-neighbour: each 2×2 luma block shares one interleaved U,V pair.
// U and V are returned centered on 0 (i.e. raw sample − 128).
__device__ __forceinline__ void nv12_read_yuv(
    const uint8_t* __restrict__ yPlane, int yPitch,
    const uint8_t* __restrict__ uvPlane, int uvPitch,
    int x, int y,
    float& Y, float& U, float& V)
{
    Y = static_cast<float>(yPlane[y * yPitch + x]);
    // UV plane is half resolution in both axes; (x & ~1) lands on the U byte
    // of the interleaved U,V pair covering this luma column.
    const int uvIdx = (y >> 1) * uvPitch + (x & ~1);
    U = static_cast<float>(uvPlane[uvIdx]) - 128.f;
    V = static_cast<float>(uvPlane[uvIdx + 1]) - 128.f;
}

// ── NV12→RGB kernel (for inference fast path) ────────────────────────────
// One thread per output pixel; launch with a 2D grid covering width×height.
// All pitches are in bytes.
__global__ void nv12ToRgbKernel(
    const uint8_t* __restrict__ yPlane, int yPitch,
    const uint8_t* __restrict__ uvPlane, int uvPitch,
    uint8_t* __restrict__ rgbOut, int outPitch,
    int width, int height)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= width || y >= height) return;

    float Y, U, V;
    nv12_read_yuv(yPlane, yPitch, uvPlane, uvPitch, x, y, Y, U, V);

    uint8_t r, g, b;
    nv12_yuv_to_rgb(Y, U, V, r, g, b);

    const int outIdx = y * outPitch + x * 3;
    rgbOut[outIdx]     = r;
    rgbOut[outIdx + 1] = g;
    rgbOut[outIdx + 2] = b;
}

// ── NV12→BGR kernel (for LabVIEW display — matches OpenCV channel order) ─
// One thread per output pixel; identical to nv12ToRgbKernel except the
// channel order written to the output buffer.
__global__ void nv12ToBgrKernel(
    const uint8_t* __restrict__ yPlane, int yPitch,
    const uint8_t* __restrict__ uvPlane, int uvPitch,
    uint8_t* __restrict__ bgrOut, int outPitch,
    int width, int height)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= width || y >= height) return;

    float Y, U, V;
    nv12_read_yuv(yPlane, yPitch, uvPlane, uvPitch, x, y, Y, U, V);

    uint8_t r, g, b;
    nv12_yuv_to_rgb(Y, U, V, r, g, b);

    const int outIdx = y * outPitch + x * 3;
    bgrOut[outIdx]     = b;  // BGR order for OpenCV
    bgrOut[outIdx + 1] = g;
    bgrOut[outIdx + 2] = r;
}

// ── Fused NV12→RGB + letterbox resize kernel ────────────────────────────
// Runs at OUTPUT resolution (e.g. 640×640), sampling from 4K NV12 source
// with bilinear interpolation. This avoids the 24MB 4K RGB intermediate
// and processes 20× fewer pixels than separate convert + resize.

// Sample Y/U/V at continuous source position (sx, sy) with bilinear
// interpolation of both luma and chroma. Callers clamp sx/sy to >= 0 before
// calling; the upper bounds are clamped here. U/V returned centered on 0.
__device__ __forceinline__ void nv12_read_yuv_bilinear(
    const uint8_t* __restrict__ yPlane, int yPitch,
    const uint8_t* __restrict__ uvPlane, int uvPitch,
    float sx, float sy, int srcW, int srcH,
    float& Y, float& U, float& V)
{
    // Bilinear interpolation for Y plane
    int x0 = static_cast<int>(sx);
    int y0 = static_cast<int>(sy);
    int x1 = min(x0 + 1, srcW - 1);
    int y1 = min(y0 + 1, srcH - 1);
    x0 = max(x0, 0);
    y0 = max(y0, 0);
    float fx = sx - static_cast<float>(x0);
    float fy = sy - static_cast<float>(y0);

    // Bilinear Y
    float y00 = static_cast<float>(yPlane[y0 * yPitch + x0]);
    float y01 = static_cast<float>(yPlane[y0 * yPitch + x1]);
    float y10 = static_cast<float>(yPlane[y1 * yPitch + x0]);
    float y11 = static_cast<float>(yPlane[y1 * yPitch + x1]);
    Y = y00 * (1.f - fx) * (1.f - fy) + y01 * fx * (1.f - fy) +
        y10 * (1.f - fx) * fy        + y11 * fx * fy;

    // UV: NV12 has interleaved U/V at half resolution.
    // Sample at corresponding chroma position.
    float csx = sx * 0.5f;
    float csy = sy * 0.5f;
    int cx0 = static_cast<int>(csx);
    int cy0 = static_cast<int>(csy);
    int cx1 = min(cx0 + 1, srcW / 2 - 1);
    int cy1 = min(cy0 + 1, srcH / 2 - 1);
    cx0 = max(cx0, 0);
    cy0 = max(cy0, 0);
    float cfx = csx - static_cast<float>(cx0);
    float cfy = csy - static_cast<float>(cy0);

    // Bilinear U
    float u00 = static_cast<float>(uvPlane[cy0 * uvPitch + cx0 * 2]);
    float u01 = static_cast<float>(uvPlane[cy0 * uvPitch + cx1 * 2]);
    float u10 = static_cast<float>(uvPlane[cy1 * uvPitch + cx0 * 2]);
    float u11 = static_cast<float>(uvPlane[cy1 * uvPitch + cx1 * 2]);
    U = (u00 * (1.f - cfx) * (1.f - cfy) + u01 * cfx * (1.f - cfy) +
         u10 * (1.f - cfx) * cfy        + u11 * cfx * cfy) - 128.f;

    // Bilinear V
    float v00 = static_cast<float>(uvPlane[cy0 * uvPitch + cx0 * 2 + 1]);
    float v01 = static_cast<float>(uvPlane[cy0 * uvPitch + cx1 * 2 + 1]);
    float v10 = static_cast<float>(uvPlane[cy1 * uvPitch + cx0 * 2 + 1]);
    float v11 = static_cast<float>(uvPlane[cy1 * uvPitch + cx1 * 2 + 1]);
    V = (v00 * (1.f - cfx) * (1.f - cfy) + v01 * cfx * (1.f - cfy) +
         v10 * (1.f - cfx) * cfy        + v11 * cfx * cfy) - 128.f;
}

// Letterboxed content is anchored at the top-left origin; pixels to the
// right/below (unpadW, unpadH) are filled black.
__global__ void nv12ToRgbLetterboxKernel(
    const uint8_t* __restrict__ yPlane, int yPitch,
    const uint8_t* __restrict__ uvPlane, int uvPitch,
    uint8_t* __restrict__ rgbOut, int outPitch,
    int outW, int outH,     // model input size (e.g. 640×640)
    int srcW, int srcH,     // NV12 source size (e.g. 3840×2160)
    int unpadW, int unpadH, // letterboxed content size within output
    float invScale)         // 1/scale = srcDim/unpadDim
{
    const int ox = blockIdx.x * blockDim.x + threadIdx.x;
    const int oy = blockIdx.y * blockDim.y + threadIdx.y;
    if (ox >= outW || oy >= outH) return;

    const int outIdx = oy * outPitch + ox * 3;

    // Letterbox padding: black for pixels outside content area
    if (ox >= unpadW || oy >= unpadH) {
        rgbOut[outIdx]     = 0;
        rgbOut[outIdx + 1] = 0;
        rgbOut[outIdx + 2] = 0;
        return;
    }

    // Map output pixel to source NV12 coordinates (pixel-center mapping)
    float sx = (static_cast<float>(ox) + 0.5f) * invScale - 0.5f;
    float sy = (static_cast<float>(oy) + 0.5f) * invScale - 0.5f;
    sx = fmaxf(sx, 0.f);
    sy = fmaxf(sy, 0.f);

    float Y, U, V;
    nv12_read_yuv_bilinear(yPlane, yPitch, uvPlane, uvPitch,
                           sx, sy, srcW, srcH, Y, U, V);

    uint8_t r, g, b;
    nv12_yuv_to_rgb(Y, U, V, r, g, b);

    rgbOut[outIdx]     = r;
    rgbOut[outIdx + 1] = g;
    rgbOut[outIdx + 2] = b;
}

// ── Host-callable wrappers ───────────────────────────────────────────────

// Launch nv12ToRgbLetterboxKernel on `stream`. Asynchronous: no sync here.
extern "C" void launchNV12ToRGBLetterbox(
    const uint8_t* yPlane, int yPitch,
    const uint8_t* uvPlane, int uvPitch,
    uint8_t* rgbOut, int outPitch,
    int outW, int outH,
    int srcW, int srcH,
    int unpadW, int unpadH,
    float invScale,
    cudaStream_t stream)
{
    dim3 block(32, 8);
    dim3 grid((outW + block.x - 1) / block.x,
              (outH + block.y - 1) / block.y);
    nv12ToRgbLetterboxKernel<<<grid, block, 0, stream>>>(
        yPlane, yPitch, uvPlane, uvPitch, rgbOut, outPitch,
        outW, outH, srcW, srcH, unpadW, unpadH, invScale);
}

// Launch nv12ToRgbKernel on `stream`. Asynchronous: no sync here.
extern "C" void launchNV12ToRGB(
    const uint8_t* yPlane, int yPitch,
    const uint8_t* uvPlane, int uvPitch,
    uint8_t* rgbOut, int outPitch,
    int width, int height,
    cudaStream_t stream)
{
    dim3 block(32, 8);
    dim3 grid((width + block.x - 1) / block.x,
              (height + block.y - 1) / block.y);
    nv12ToRgbKernel<<<grid, block, 0, stream>>>(
        yPlane, yPitch, uvPlane, uvPitch, rgbOut, outPitch, width, height);
}

// ── Fused NV12→RGB + direct resize kernel (for classification) ───────────
// Like letterbox kernel but stretches to fill the entire output (no padding).
// Used for classification models that expect direct resize, not aspect-
// preserving letterbox.
__global__ void nv12ToRgbResizeKernel(
    const uint8_t* __restrict__ yPlane, int yPitch,
    const uint8_t* __restrict__ uvPlane, int uvPitch,
    uint8_t* __restrict__ rgbOut, int outPitch,
    int outW, int outH,  // model input size (e.g. 224×224)
    int srcW, int srcH)  // NV12 source size (e.g. 2688×1520)
{
    const int ox = blockIdx.x * blockDim.x + threadIdx.x;
    const int oy = blockIdx.y * blockDim.y + threadIdx.y;
    if (ox >= outW || oy >= outH) return;

    // Map output pixel to source NV12 coordinates (stretch to fill)
    float sx = (static_cast<float>(ox) + 0.5f) *
               static_cast<float>(srcW) / static_cast<float>(outW) - 0.5f;
    float sy = (static_cast<float>(oy) + 0.5f) *
               static_cast<float>(srcH) / static_cast<float>(outH) - 0.5f;
    sx = fmaxf(sx, 0.f);
    sy = fmaxf(sy, 0.f);

    float Y, U, V;
    nv12_read_yuv_bilinear(yPlane, yPitch, uvPlane, uvPitch,
                           sx, sy, srcW, srcH, Y, U, V);

    uint8_t r, g, b;
    nv12_yuv_to_rgb(Y, U, V, r, g, b);

    const int outIdx = oy * outPitch + ox * 3;
    rgbOut[outIdx]     = r;
    rgbOut[outIdx + 1] = g;
    rgbOut[outIdx + 2] = b;
}

// Launch nv12ToRgbResizeKernel on `stream`. Asynchronous: no sync here.
extern "C" void launchNV12ToRGBResize(
    const uint8_t* yPlane, int yPitch,
    const uint8_t* uvPlane, int uvPitch,
    uint8_t* rgbOut, int outPitch,
    int outW, int outH,
    int srcW, int srcH,
    cudaStream_t stream)
{
    dim3 block(32, 8);
    dim3 grid((outW + block.x - 1) / block.x,
              (outH + block.y - 1) / block.y);
    nv12ToRgbResizeKernel<<<grid, block, 0, stream>>>(
        yPlane, yPitch, uvPlane, uvPitch, rgbOut, outPitch,
        outW, outH, srcW, srcH);
}

// ── Fused NV12→RGB + resize + CHW planar output (for SAM3 image encoder) ─
// Writes R, G, B to separate contiguous planes: [R plane][G plane][B plane]
// at model input resolution. Eliminates cv::resize + cvtColor + split + memcpy.
// Direct-resize NV12→RGB with planar (CHW) uint8 output. chwOut must hold
// 3 * outW * outH bytes; planes are tightly packed (no pitch).
__global__ void nv12ToRgbResizeCHW_uint8Kernel(
    const uint8_t* __restrict__ yPlane, int yPitch,
    const uint8_t* __restrict__ uvPlane, int uvPitch,
    uint8_t* __restrict__ chwOut,  // contiguous CHW: R plane, G plane, B plane
    int outW, int outH,
    int srcW, int srcH)
{
    const int ox = blockIdx.x * blockDim.x + threadIdx.x;
    const int oy = blockIdx.y * blockDim.y + threadIdx.y;
    if (ox >= outW || oy >= outH) return;

    float sx = (static_cast<float>(ox) + 0.5f) *
               static_cast<float>(srcW) / static_cast<float>(outW) - 0.5f;
    float sy = (static_cast<float>(oy) + 0.5f) *
               static_cast<float>(srcH) / static_cast<float>(outH) - 0.5f;
    sx = fmaxf(sx, 0.f);
    sy = fmaxf(sy, 0.f);

    float Y, U, V;
    nv12_read_yuv_bilinear(yPlane, yPitch, uvPlane, uvPitch,
                           sx, sy, srcW, srcH, Y, U, V);

    uint8_t r, g, b;
    nv12_yuv_to_rgb(Y, U, V, r, g, b);

    const int planeSize = outW * outH;
    const int idx = oy * outW + ox;
    chwOut[idx]                 = r;  // R plane
    chwOut[idx + planeSize]     = g;  // G plane
    chwOut[idx + 2 * planeSize] = b;  // B plane
}

// Same as the uint8 variant but writes float32 values in [0, 255]
// (no normalization — downstream code scales as needed).
__global__ void nv12ToRgbResizeCHW_floatKernel(
    const uint8_t* __restrict__ yPlane, int yPitch,
    const uint8_t* __restrict__ uvPlane, int uvPitch,
    float* __restrict__ chwOut,  // contiguous CHW float32 [0, 255]
    int outW, int outH,
    int srcW, int srcH)
{
    const int ox = blockIdx.x * blockDim.x + threadIdx.x;
    const int oy = blockIdx.y * blockDim.y + threadIdx.y;
    if (ox >= outW || oy >= outH) return;

    float sx = (static_cast<float>(ox) + 0.5f) *
               static_cast<float>(srcW) / static_cast<float>(outW) - 0.5f;
    float sy = (static_cast<float>(oy) + 0.5f) *
               static_cast<float>(srcH) / static_cast<float>(outH) - 0.5f;
    sx = fmaxf(sx, 0.f);
    sy = fmaxf(sy, 0.f);

    float Y, U, V;
    nv12_read_yuv_bilinear(yPlane, yPitch, uvPlane, uvPitch,
                           sx, sy, srcW, srcH, Y, U, V);

    uint8_t r, g, b;
    nv12_yuv_to_rgb(Y, U, V, r, g, b);

    const int planeSize = outW * outH;
    const int idx = oy * outW + ox;
    chwOut[idx]                 = static_cast<float>(r);
    chwOut[idx + planeSize]     = static_cast<float>(g);
    chwOut[idx + 2 * planeSize] = static_cast<float>(b);
}

// Launch nv12ToRgbResizeCHW_uint8Kernel on `stream`. Asynchronous.
extern "C" void launchNV12ToRGBResizeCHW_uint8(
    const uint8_t* yPlane, int yPitch,
    const uint8_t* uvPlane, int uvPitch,
    uint8_t* chwOut,
    int outW, int outH,
    int srcW, int srcH,
    cudaStream_t stream)
{
    dim3 block(32, 8);
    dim3 grid((outW + block.x - 1) / block.x,
              (outH + block.y - 1) / block.y);
    nv12ToRgbResizeCHW_uint8Kernel<<<grid, block, 0, stream>>>(
        yPlane, yPitch, uvPlane, uvPitch, chwOut, outW, outH, srcW, srcH);
}

// Launch nv12ToRgbResizeCHW_floatKernel on `stream`. Asynchronous.
extern "C" void launchNV12ToRGBResizeCHW_float(
    const uint8_t* yPlane, int yPitch,
    const uint8_t* uvPlane, int uvPitch,
    float* chwOut,
    int outW, int outH,
    int srcW, int srcH,
    cudaStream_t stream)
{
    dim3 block(32, 8);
    dim3 grid((outW + block.x - 1) / block.x,
              (outH + block.y - 1) / block.y);
    nv12ToRgbResizeCHW_floatKernel<<<grid, block, 0, stream>>>(
        yPlane, yPitch, uvPlane, uvPitch, chwOut, outW, outH, srcW, srcH);
}

// Launch nv12ToBgrKernel on `stream`. Asynchronous.
extern "C" void launchNV12ToBGR(
    const uint8_t* yPlane, int yPitch,
    const uint8_t* uvPlane, int uvPitch,
    uint8_t* bgrOut, int outPitch,
    int width, int height,
    cudaStream_t stream)
{
    dim3 block(32, 8);
    dim3 grid((width + block.x - 1) / block.x,
              (height + block.y - 1) / block.y);
    nv12ToBgrKernel<<<grid, block, 0, stream>>>(
        yPlane, yPitch, uvPlane, uvPitch, bgrOut, outPitch, width, height);
}

// ── Fused NV12→BGR + bilinear resize kernel (for OCR detection) ──────────
// Same algorithm as nv12ToRgbResizeKernel but outputs BGR channel order.
__global__ void nv12ToBgrResizeKernel(
    const uint8_t* __restrict__ yPlane, int yPitch,
    const uint8_t* __restrict__ uvPlane, int uvPitch,
    uint8_t* __restrict__ bgrOut, int outPitch,
    int outW, int outH,
    int srcW, int srcH)
{
    const int ox = blockIdx.x * blockDim.x + threadIdx.x;
    const int oy = blockIdx.y * blockDim.y + threadIdx.y;
    if (ox >= outW || oy >= outH) return;

    // Map output pixel to source NV12 coordinates (stretch to fill)
    float sx = (static_cast<float>(ox) + 0.5f) *
               static_cast<float>(srcW) / static_cast<float>(outW) - 0.5f;
    float sy = (static_cast<float>(oy) + 0.5f) *
               static_cast<float>(srcH) / static_cast<float>(outH) - 0.5f;
    sx = fmaxf(sx, 0.f);
    sy = fmaxf(sy, 0.f);

    float Y, U, V;
    nv12_read_yuv_bilinear(yPlane, yPitch, uvPlane, uvPitch,
                           sx, sy, srcW, srcH, Y, U, V);

    uint8_t r, g, b;
    nv12_yuv_to_rgb(Y, U, V, r, g, b);

    const int outIdx = oy * outPitch + ox * 3;
    bgrOut[outIdx]     = b;
    bgrOut[outIdx + 1] = g;
    bgrOut[outIdx + 2] = r;
}

// Launch nv12ToBgrResizeKernel on `stream`. Asynchronous.
extern "C" void launchNV12ToBGRResize(
    const uint8_t* yPlane, int yPitch,
    const uint8_t* uvPlane, int uvPitch,
    uint8_t* bgrOut, int outPitch,
    int outW, int outH,
    int srcW, int srcH,
    cudaStream_t stream)
{
    dim3 block(32, 8);
    dim3 grid((outW + block.x - 1) / block.x,
              (outH + block.y - 1) / block.y);
    nv12ToBgrResizeKernel<<<grid, block, 0, stream>>>(
        yPlane, yPitch, uvPlane, uvPitch, bgrOut, outPitch,
        outW, outH, srcW, srcH);
}

// ── Center-padded NV12→RGB + letterbox resize kernel (for SCRFD) ─────────
// Same as nv12ToRgbLetterboxKernel but content is centered with (padLeft, padTop)
// offset instead of placed at top-left origin. Used by SCRFD face detector
// which uses center-padding (dw = pad_w/2, dh = pad_h/2).
__global__ void nv12ToRgbCenterLetterboxKernel(
    const uint8_t* __restrict__ yPlane, int yPitch,
    const uint8_t* __restrict__ uvPlane, int uvPitch,
    uint8_t* __restrict__ rgbOut, int outPitch,
    int outW, int outH,
    int srcW, int srcH,
    int unpadW, int unpadH,
    float invScale,
    int padLeft, int padTop)
{
    const int ox = blockIdx.x * blockDim.x + threadIdx.x;
    const int oy = blockIdx.y * blockDim.y + threadIdx.y;
    if (ox >= outW || oy >= outH) return;

    const int outIdx = oy * outPitch + ox * 3;

    // Center-padding: black for pixels outside the content area
    const int cx = ox - padLeft;
    const int cy = oy - padTop;
    if (cx < 0 || cx >= unpadW || cy < 0 || cy >= unpadH) {
        rgbOut[outIdx]     = 0;
        rgbOut[outIdx + 1] = 0;
        rgbOut[outIdx + 2] = 0;
        return;
    }

    // Map content pixel to source NV12 coordinates
    float sx = (static_cast<float>(cx) + 0.5f) * invScale - 0.5f;
    float sy = (static_cast<float>(cy) + 0.5f) * invScale - 0.5f;
    sx = fmaxf(sx, 0.f);
    sy = fmaxf(sy, 0.f);

    float Y, U, V;
    nv12_read_yuv_bilinear(yPlane, yPitch, uvPlane, uvPitch,
                           sx, sy, srcW, srcH, Y, U, V);

    uint8_t r, g, b;
    nv12_yuv_to_rgb(Y, U, V, r, g, b);

    rgbOut[outIdx]     = r;
    rgbOut[outIdx + 1] = g;
    rgbOut[outIdx + 2] = b;
}

// Launch nv12ToRgbCenterLetterboxKernel on `stream`. Asynchronous.
extern "C" void launchNV12ToRGBCenterLetterbox(
    const uint8_t* yPlane, int yPitch,
    const uint8_t* uvPlane, int uvPitch,
    uint8_t* rgbOut, int outPitch,
    int outW, int outH,
    int srcW, int srcH,
    int unpadW, int unpadH,
    float invScale,
    int padLeft, int padTop,
    cudaStream_t stream)
{
    dim3 block(32, 8);
    dim3 grid((outW + block.x - 1) / block.x,
              (outH + block.y - 1) / block.y);
    nv12ToRgbCenterLetterboxKernel<<<grid, block, 0, stream>>>(
        yPlane, yPitch, uvPlane, uvPitch, rgbOut, outPitch,
        outW, outH, srcW, srcH, unpadW, unpadH, invScale, padLeft, padTop);
}

// ── NV12 affine warp → BGR kernel (for face alignment) ──────────────────
// Reads from full-resolution NV12 source, applies inverse affine transform,
// and outputs a small aligned face image (e.g. 112×112) in BGR order.
// The inverse affine maps each output pixel back to the NV12 source coords.
// Border handling: replicate edge (matches cv::BORDER_REPLICATE).
// Outputs BGR to match existing pipeline (ANSFaceRecognizer expects BGR).
// (m00..m12) is the 2×3 INVERSE affine matrix: source = M · (ox, oy, 1).
__global__ void nv12AffineWarpToBgrKernel(
    const uint8_t* __restrict__ yPlane, int yPitch,
    const uint8_t* __restrict__ uvPlane, int uvPitch,
    uint8_t* __restrict__ bgrOut, int outPitch,
    int outW, int outH,
    int srcW, int srcH,
    float m00, float m01, float m02,
    float m10, float m11, float m12)
{
    const int ox = blockIdx.x * blockDim.x + threadIdx.x;
    const int oy = blockIdx.y * blockDim.y + threadIdx.y;
    if (ox >= outW || oy >= outH) return;

    // Inverse affine: map output pixel to source NV12 coordinate
    float sx = m00 * ox + m01 * oy + m02;
    float sy = m10 * ox + m11 * oy + m12;

    // Border replicate: clamp to source bounds
    sx = fmaxf(0.f, fminf(sx, static_cast<float>(srcW - 1)));
    sy = fmaxf(0.f, fminf(sy, static_cast<float>(srcH - 1)));

    float Y, U, V;
    nv12_read_yuv_bilinear(yPlane, yPitch, uvPlane, uvPitch,
                           sx, sy, srcW, srcH, Y, U, V);

    uint8_t r, g, b;
    nv12_yuv_to_rgb(Y, U, V, r, g, b);

    // Output BGR order (matches OpenCV convention for downstream pipeline)
    const int outIdx = oy * outPitch + ox * 3;
    bgrOut[outIdx]     = b;
    bgrOut[outIdx + 1] = g;
    bgrOut[outIdx + 2] = r;
}

// Launch nv12AffineWarpToBgrKernel on `stream`. Asynchronous.
extern "C" void launchNV12AffineWarpToBGR(
    const uint8_t* yPlane, int yPitch,
    const uint8_t* uvPlane, int uvPitch,
    uint8_t* bgrOut, int outPitch,
    int outW, int outH,
    int srcW, int srcH,
    float m00, float m01, float m02,
    float m10, float m11, float m12,
    cudaStream_t stream)
{
    dim3 block(16, 8);
    dim3 grid((outW + block.x - 1) / block.x,
              (outH + block.y - 1) / block.y);
    nv12AffineWarpToBgrKernel<<<grid, block, 0, stream>>>(
        yPlane, yPitch, uvPlane, uvPitch, bgrOut, outPitch,
        outW, outH, srcW, srcH, m00, m01, m02, m10, m11, m12);
}

// ── High-level NV12→BGR: CPU→GPU upload, kernel, GPU→CPU download ────────
// Thread-local GPU buffers avoid per-call cudaMalloc overhead.
// Exported for cross-DLL use by video_player via GetProcAddress.
// Per-thread device-side scratch buffers for ANSGpuNV12ToBGR.
// Reallocated only when the frame size changes.
struct GpuNV12Buffers {
    uint8_t* d_yPlane  = nullptr;
    uint8_t* d_uvPlane = nullptr;
    uint8_t* d_bgrOut  = nullptr;
    int allocWidth  = 0;
    int allocHeight = 0;
    cudaStream_t stream = nullptr;

    // (Re)allocate buffers for a w×h frame. On any allocation failure all
    // buffers are released, leaving d_yPlane == nullptr so the caller can
    // detect and report the error (previously failures went unchecked).
    void ensureAllocated(int w, int h) {
        if (w == allocWidth && h == allocHeight && d_yPlane) return;
        free();
        const size_t ySize   = (size_t)w * h;
        const size_t uvSize  = (size_t)w * (h / 2);
        const size_t bgrSize = (size_t)w * h * 3;
        if (cudaMalloc(&d_yPlane,  ySize)  != cudaSuccess ||
            cudaMalloc(&d_uvPlane, uvSize) != cudaSuccess ||
            cudaMalloc(&d_bgrOut,  bgrSize) != cudaSuccess ||
            cudaStreamCreate(&stream)       != cudaSuccess) {
            free();
            return;
        }
        allocWidth  = w;
        allocHeight = h;
    }

    // Release all device resources; safe to call repeatedly.
    void free() {
        if (d_yPlane)  { cudaFree(d_yPlane);  d_yPlane  = nullptr; }
        if (d_uvPlane) { cudaFree(d_uvPlane); d_uvPlane = nullptr; }
        if (d_bgrOut)  { cudaFree(d_bgrOut);  d_bgrOut  = nullptr; }
        if (stream)    { cudaStreamDestroy(stream); stream = nullptr; }
        allocWidth = allocHeight = 0;
    }

    ~GpuNV12Buffers() { free(); }
};

static thread_local GpuNV12Buffers t_bufs;

// Synchronous NV12→BGR conversion: uploads the Y and UV planes, runs
// nv12ToBgrKernel, downloads the BGR result into bgrOut (bgrStep bytes per
// row). Returns 0 on success, negative on error (see codes below).
extern "C" __declspec(dllexport) int ANSGpuNV12ToBGR(
    const uint8_t* yPlane, int yLinesize,
    const uint8_t* uvPlane, int uvLinesize,
    uint8_t* bgrOut, int bgrStep,
    int width, int height,
    int gpuIndex)
{
    if (!yPlane || !uvPlane || !bgrOut || width <= 0 || height <= 0)
        return -1;

    // Select GPU (gpuIndex < 0 keeps the thread's current device)
    if (gpuIndex >= 0) {
        cudaError_t err = cudaSetDevice(gpuIndex);
        if (err != cudaSuccess) {
            fprintf(stderr, "ANSGpuNV12ToBGR: cudaSetDevice(%d) failed: %s\n",
                    gpuIndex, cudaGetErrorString(err));
            return -2;
        }
    }

    t_bufs.ensureAllocated(width, height);
    if (!t_bufs.d_yPlane) return -3;  // allocation failed

    // Upload Y and UV planes to GPU (device buffers are tightly packed,
    // so the device pitch is simply `width`)
    cudaMemcpy2DAsync(t_bufs.d_yPlane, width, yPlane, yLinesize,
                      width, height, cudaMemcpyHostToDevice, t_bufs.stream);
    cudaMemcpy2DAsync(t_bufs.d_uvPlane, width, uvPlane, uvLinesize,
                      width, height / 2, cudaMemcpyHostToDevice, t_bufs.stream);

    // Launch BGR kernel
    dim3 block(32, 8);
    dim3 grid((width + block.x - 1) / block.x,
              (height + block.y - 1) / block.y);
    nv12ToBgrKernel<<<grid, block, 0, t_bufs.stream>>>(
        t_bufs.d_yPlane, width, t_bufs.d_uvPlane, width,
        t_bufs.d_bgrOut, width * 3, width, height);

    // Download BGR result back to CPU
    cudaMemcpy2DAsync(bgrOut, bgrStep, t_bufs.d_bgrOut, width * 3,
                      width * 3, height, cudaMemcpyDeviceToHost, t_bufs.stream);
    cudaStreamSynchronize(t_bufs.stream);

    // Check for errors (launch-config and async execution errors surface here)
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "ANSGpuNV12ToBGR: CUDA error: %s\n",
                cudaGetErrorString(err));
        return -4;
    }
    return 0;  // success
}

// ============================================================================
// NV12 rectangular crop → BGR kernel
// Reads a rectangular region from the full NV12 frame and outputs BGR.
// No resize — output is the same size as the crop region.
// ============================================================================
__global__ void nv12CropToBGRKernel(
    const uint8_t* __restrict__ yPlane, int yPitch,
    const uint8_t* __restrict__ uvPlane, int uvPitch,
    uint8_t* __restrict__ bgrOut, int outPitch,
    int outW, int outH,
    int cropX, int cropY,
    int srcW, int srcH)
{
    const int ox = blockIdx.x * blockDim.x + threadIdx.x;
    const int oy = blockIdx.y * blockDim.y + threadIdx.y;
    if (ox >= outW || oy >= outH) return;

    // Map output pixel to source coordinate
    const int sx = cropX + ox;
    const int sy = cropY + oy;

    // Clamp to source bounds
    const int cx = min(max(sx, 0), srcW - 1);
    const int cy = min(max(sy, 0), srcH - 1);

    // Read Y
    const float Y = static_cast<float>(yPlane[cy * yPitch + cx]);

    // Bilinear UV (chroma subsampled 2:1 in both dimensions, interleaved NV12).
    // Map luma (cx,cy) → chroma continuous coords, then bilinear-interpolate.
    const float csx = static_cast<float>(cx) * 0.5f;
    const float csy = static_cast<float>(cy) * 0.5f;
    const int cx0 = static_cast<int>(csx);
    const int cy0 = static_cast<int>(csy);
    const int cx1 = min(cx0 + 1, (srcW >> 1) - 1);
    const int cy1 = min(cy0 + 1, (srcH >> 1) - 1);
    const float cfx = csx - static_cast<float>(cx0);
    const float cfy = csy - static_cast<float>(cy0);

    const float u00 = static_cast<float>(uvPlane[cy0 * uvPitch + cx0 * 2]);
    const float u01 = static_cast<float>(uvPlane[cy0 * uvPitch + cx1 * 2]);
    const float u10 = static_cast<float>(uvPlane[cy1 * uvPitch + cx0 * 2]);
    const float u11 = static_cast<float>(uvPlane[cy1 * uvPitch + cx1 * 2]);
    const float U = (u00 * (1.f - cfx) * (1.f - cfy) + u01 * cfx * (1.f - cfy) +
                     u10 * (1.f - cfx) * cfy        + u11 * cfx * cfy) - 128.f;

    const float v00 = static_cast<float>(uvPlane[cy0 * uvPitch + cx0 * 2 + 1]);
    const float v01 = static_cast<float>(uvPlane[cy0 * uvPitch + cx1 * 2 + 1]);
    const float v10 = static_cast<float>(uvPlane[cy1 * uvPitch + cx0 * 2 + 1]);
    const float v11 = static_cast<float>(uvPlane[cy1 * uvPitch + cx1 * 2 + 1]);
    const float V = (v00 * (1.f - cfx) * (1.f - cfy) + v01 * cfx * (1.f - cfy) +
                     v10 * (1.f - cfx) * cfy        + v11 * cfx * cfy) - 128.f;

    // BT.601 YUV → RGB
    uint8_t r, g, b;
    nv12_yuv_to_rgb(Y, U, V, r, g, b);

    // Write BGR (OpenCV channel order)
    const int outIdx = oy * outPitch + ox * 3;
    bgrOut[outIdx]     = b;
    bgrOut[outIdx + 1] = g;
    bgrOut[outIdx + 2] = r;
}

// Launch nv12CropToBGRKernel on `stream`. Asynchronous.
extern "C" void launchNV12CropToBGR(
    const uint8_t* yPlane, int yPitch,
    const uint8_t* uvPlane, int uvPitch,
    uint8_t* bgrOut, int outPitch,
    int outW, int outH,
    int cropX, int cropY,
    int srcW, int srcH,
    cudaStream_t stream)
{
    dim3 block(32, 8);
    dim3 grid((outW + block.x - 1) / block.x,
              (outH + block.y - 1) / block.y);
    nv12CropToBGRKernel<<<grid, block, 0, stream>>>(
        yPlane, yPitch, uvPlane, uvPitch, bgrOut, outPitch,
        outW, outH, cropX, cropY, srcW, srcH);
}