// nv12_to_rgb.cu — CUDA kernels for NV12→RGB and NV12→BGR conversion.
//
// Two use cases:
//  1. launchNV12ToRGB()  — inference fast path (ANSRTYOLO uploads NV12 directly)
//  2. ANSGpuNV12ToBGR()  — video_player display path (replaces slow CPU cvtColorTwoPlane)
//
// cv::cuda::cvtColor does NOT support NV12, so we use custom kernels.

#include <cstdint>
#include <cstdio>
#include <cuda_runtime.h>

// ── Shared YUV→RGB computation ───────────────────────────────────────────

// Convert one YUV sample (U/V already centered on 0) to 8-bit RGB using the
// BT.601 full-range coefficients, clamping each channel to [0, 255].
__device__ __forceinline__ void nv12_yuv_to_rgb(
    float Y, float U, float V,
    uint8_t& r, uint8_t& g, uint8_t& b)
{
    // BT.601 conversion (same as FFmpeg swscale / OpenCV cvtColorTwoPlane)
    const float R = Y + 1.402f * V;
    const float G = Y - 0.344136f * U - 0.714136f * V;
    const float B = Y + 1.772f * U;
    r = static_cast<uint8_t>(fminf(fmaxf(R, 0.f), 255.f));
    g = static_cast<uint8_t>(fminf(fmaxf(G, 0.f), 255.f));
    b = static_cast<uint8_t>(fminf(fmaxf(B, 0.f), 255.f));
}

// Read Y/U/V for luma pixel (x, y) from an NV12 frame. Chroma is sampled
// nearest-neighbour: each 2×2 luma block shares one interleaved U,V pair.
// U and V are returned centered on 0 (i.e. raw sample − 128).
__device__ __forceinline__ void nv12_read_yuv(
    const uint8_t* __restrict__ yPlane, int yPitch,
    const uint8_t* __restrict__ uvPlane, int uvPitch,
    int x, int y,
    float& Y, float& U, float& V)
{
    Y = static_cast<float>(yPlane[y * yPitch + x]);
    // UV plane is half resolution in both axes; (x & ~1) lands on the U byte
    // of the interleaved U,V pair covering this luma column.
    const int uvIdx = (y >> 1) * uvPitch + (x & ~1);
    U = static_cast<float>(uvPlane[uvIdx]) - 128.f;
    V = static_cast<float>(uvPlane[uvIdx + 1]) - 128.f;
}

// ── NV12→RGB kernel (for inference fast path) ────────────────────────────
// One thread per output pixel; launch with a 2D grid covering width×height.
// All pitches are in bytes.
__global__ void nv12ToRgbKernel(
    const uint8_t* __restrict__ yPlane, int yPitch,
    const uint8_t* __restrict__ uvPlane, int uvPitch,
    uint8_t* __restrict__ rgbOut, int outPitch,
    int width, int height)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= width || y >= height) return;

    float Y, U, V;
    nv12_read_yuv(yPlane, yPitch, uvPlane, uvPitch, x, y, Y, U, V);

    uint8_t r, g, b;
    nv12_yuv_to_rgb(Y, U, V, r, g, b);

    const int outIdx = y * outPitch + x * 3;
    rgbOut[outIdx]     = r;
    rgbOut[outIdx + 1] = g;
    rgbOut[outIdx + 2] = b;
}

// ── NV12→BGR kernel (for LabVIEW display — matches OpenCV channel order) ─
// One thread per output pixel; identical to nv12ToRgbKernel except the
// channel order written to the output buffer.
__global__ void nv12ToBgrKernel(
    const uint8_t* __restrict__ yPlane, int yPitch,
    const uint8_t* __restrict__ uvPlane, int uvPitch,
    uint8_t* __restrict__ bgrOut, int outPitch,
    int width, int height)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= width || y >= height) return;

    float Y, U, V;
    nv12_read_yuv(yPlane, yPitch, uvPlane, uvPitch, x, y, Y, U, V);

    uint8_t r, g, b;
    nv12_yuv_to_rgb(Y, U, V, r, g, b);

    const int outIdx = y * outPitch + x * 3;
    bgrOut[outIdx]     = b;  // BGR order for OpenCV
    bgrOut[outIdx + 1] = g;
    bgrOut[outIdx + 2] = r;
}

// ── Fused NV12→RGB + letterbox resize kernel ────────────────────────────
// Runs at OUTPUT resolution (e.g. 640×640), sampling from 4K NV12 source
// with bilinear interpolation. This avoids the 24MB 4K RGB intermediate
// and processes 20× fewer pixels than separate convert + resize.

// Sample Y/U/V at continuous source position (sx, sy) with bilinear
// interpolation of both luma and chroma. Callers clamp sx/sy to >= 0 before
// calling; the upper bounds are clamped here. U/V returned centered on 0.
__device__ __forceinline__ void nv12_read_yuv_bilinear(
    const uint8_t* __restrict__ yPlane, int yPitch,
    const uint8_t* __restrict__ uvPlane, int uvPitch,
    float sx, float sy, int srcW, int srcH,
    float& Y, float& U, float& V)
{
    // Bilinear interpolation for Y plane
    int x0 = static_cast<int>(sx);
    int y0 = static_cast<int>(sy);
    int x1 = min(x0 + 1, srcW - 1);
    int y1 = min(y0 + 1, srcH - 1);
    x0 = max(x0, 0);
    y0 = max(y0, 0);
    float fx = sx - static_cast<float>(x0);
    float fy = sy - static_cast<float>(y0);

    // Bilinear Y
    float y00 = static_cast<float>(yPlane[y0 * yPitch + x0]);
    float y01 = static_cast<float>(yPlane[y0 * yPitch + x1]);
    float y10 = static_cast<float>(yPlane[y1 * yPitch + x0]);
    float y11 = static_cast<float>(yPlane[y1 * yPitch + x1]);
    Y = y00 * (1.f - fx) * (1.f - fy) + y01 * fx * (1.f - fy) +
        y10 * (1.f - fx) * fy        + y11 * fx * fy;

    // UV: NV12 has interleaved U/V at half resolution.
    // Sample at corresponding chroma position.
    float csx = sx * 0.5f;
    float csy = sy * 0.5f;
    int cx0 = static_cast<int>(csx);
    int cy0 = static_cast<int>(csy);
    int cx1 = min(cx0 + 1, srcW / 2 - 1);
    int cy1 = min(cy0 + 1, srcH / 2 - 1);
    cx0 = max(cx0, 0);
    cy0 = max(cy0, 0);
    float cfx = csx - static_cast<float>(cx0);
    float cfy = csy - static_cast<float>(cy0);

    // Bilinear U
    float u00 = static_cast<float>(uvPlane[cy0 * uvPitch + cx0 * 2]);
    float u01 = static_cast<float>(uvPlane[cy0 * uvPitch + cx1 * 2]);
    float u10 = static_cast<float>(uvPlane[cy1 * uvPitch + cx0 * 2]);
    float u11 = static_cast<float>(uvPlane[cy1 * uvPitch + cx1 * 2]);
    U = (u00 * (1.f - cfx) * (1.f - cfy) + u01 * cfx * (1.f - cfy) +
         u10 * (1.f - cfx) * cfy        + u11 * cfx * cfy) - 128.f;

    // Bilinear V
    float v00 = static_cast<float>(uvPlane[cy0 * uvPitch + cx0 * 2 + 1]);
    float v01 = static_cast<float>(uvPlane[cy0 * uvPitch + cx1 * 2 + 1]);
    float v10 = static_cast<float>(uvPlane[cy1 * uvPitch + cx0 * 2 + 1]);
    float v11 = static_cast<float>(uvPlane[cy1 * uvPitch + cx1 * 2 + 1]);
    V = (v00 * (1.f - cfx) * (1.f - cfy) + v01 * cfx * (1.f - cfy) +
         v10 * (1.f - cfx) * cfy        + v11 * cfx * cfy) - 128.f;
}

// Letterboxed content is anchored at the top-left origin; pixels to the
// right/below (unpadW, unpadH) are filled black.
__global__ void nv12ToRgbLetterboxKernel(
    const uint8_t* __restrict__ yPlane, int yPitch,
    const uint8_t* __restrict__ uvPlane, int uvPitch,
    uint8_t* __restrict__ rgbOut, int outPitch,
    int outW, int outH,     // model input size (e.g. 640×640)
    int srcW, int srcH,     // NV12 source size (e.g. 3840×2160)
    int unpadW, int unpadH, // letterboxed content size within output
    float invScale)         // 1/scale = srcDim/unpadDim
{
    const int ox = blockIdx.x * blockDim.x + threadIdx.x;
    const int oy = blockIdx.y * blockDim.y + threadIdx.y;
    if (ox >= outW || oy >= outH) return;

    const int outIdx = oy * outPitch + ox * 3;

    // Letterbox padding: black for pixels outside content area
    if (ox >= unpadW || oy >= unpadH) {
        rgbOut[outIdx]     = 0;
        rgbOut[outIdx + 1] = 0;
        rgbOut[outIdx + 2] = 0;
        return;
    }

    // Map output pixel to source NV12 coordinates (pixel-center mapping)
    float sx = (static_cast<float>(ox) + 0.5f) * invScale - 0.5f;
    float sy = (static_cast<float>(oy) + 0.5f) * invScale - 0.5f;
    sx = fmaxf(sx, 0.f);
    sy = fmaxf(sy, 0.f);

    float Y, U, V;
    nv12_read_yuv_bilinear(yPlane, yPitch, uvPlane, uvPitch,
                           sx, sy, srcW, srcH, Y, U, V);

    uint8_t r, g, b;
    nv12_yuv_to_rgb(Y, U, V, r, g, b);

    rgbOut[outIdx]     = r;
    rgbOut[outIdx + 1] = g;
    rgbOut[outIdx + 2] = b;
}

// ── Host-callable wrappers ───────────────────────────────────────────────

// Launch nv12ToRgbLetterboxKernel on `stream`. Asynchronous: no sync here.
extern "C" void launchNV12ToRGBLetterbox(
    const uint8_t* yPlane, int yPitch,
    const uint8_t* uvPlane, int uvPitch,
    uint8_t* rgbOut, int outPitch,
    int outW, int outH,
    int srcW, int srcH,
    int unpadW, int unpadH,
    float invScale,
    cudaStream_t stream)
{
    dim3 block(32, 8);
    dim3 grid((outW + block.x - 1) / block.x,
              (outH + block.y - 1) / block.y);
    nv12ToRgbLetterboxKernel<<<grid, block, 0, stream>>>(
        yPlane, yPitch, uvPlane, uvPitch, rgbOut, outPitch,
        outW, outH, srcW, srcH, unpadW, unpadH, invScale);
}

// Launch nv12ToRgbKernel on `stream`. Asynchronous: no sync here.
extern "C" void launchNV12ToRGB(
    const uint8_t* yPlane, int yPitch,
    const uint8_t* uvPlane, int uvPitch,
    uint8_t* rgbOut, int outPitch,
    int width, int height,
    cudaStream_t stream)
{
    dim3 block(32, 8);
    dim3 grid((width + block.x - 1) / block.x,
              (height + block.y - 1) / block.y);
    nv12ToRgbKernel<<<grid, block, 0, stream>>>(
        yPlane, yPitch, uvPlane, uvPitch, rgbOut, outPitch, width, height);
}

// ── Fused NV12→RGB + direct resize kernel (for classification) ───────────
// Like letterbox kernel but stretches to fill the entire output (no padding).
// Used for classification models that expect direct resize, not aspect-
// preserving letterbox.
__global__ void nv12ToRgbResizeKernel(
    const uint8_t* __restrict__ yPlane, int yPitch,
    const uint8_t* __restrict__ uvPlane, int uvPitch,
    uint8_t* __restrict__ rgbOut, int outPitch,
    int outW, int outH,  // model input size (e.g. 224×224)
    int srcW, int srcH)  // NV12 source size (e.g. 2688×1520)
{
    const int ox = blockIdx.x * blockDim.x + threadIdx.x;
    const int oy = blockIdx.y * blockDim.y + threadIdx.y;
    if (ox >= outW || oy >= outH) return;

    // Map output pixel to source NV12 coordinates (stretch to fill)
    float sx = (static_cast<float>(ox) + 0.5f) *
               static_cast<float>(srcW) / static_cast<float>(outW) - 0.5f;
    float sy = (static_cast<float>(oy) + 0.5f) *
               static_cast<float>(srcH) / static_cast<float>(outH) - 0.5f;
    sx = fmaxf(sx, 0.f);
    sy = fmaxf(sy, 0.f);

    float Y, U, V;
    nv12_read_yuv_bilinear(yPlane, yPitch, uvPlane, uvPitch,
                           sx, sy, srcW, srcH, Y, U, V);

    uint8_t r, g, b;
    nv12_yuv_to_rgb(Y, U, V, r, g, b);

    const int outIdx = oy * outPitch + ox * 3;
    rgbOut[outIdx]     = r;
    rgbOut[outIdx + 1] = g;
    rgbOut[outIdx + 2] = b;
}

// Launch nv12ToRgbResizeKernel on `stream`. Asynchronous: no sync here.
extern "C" void launchNV12ToRGBResize(
    const uint8_t* yPlane, int yPitch,
    const uint8_t* uvPlane, int uvPitch,
    uint8_t* rgbOut, int outPitch,
    int outW, int outH,
    int srcW, int srcH,
    cudaStream_t stream)
{
    dim3 block(32, 8);
    dim3 grid((outW + block.x - 1) / block.x,
              (outH + block.y - 1) / block.y);
    nv12ToRgbResizeKernel<<<grid, block, 0, stream>>>(
        yPlane, yPitch, uvPlane, uvPitch, rgbOut, outPitch,
        outW, outH, srcW, srcH);
}

// ── Fused NV12→RGB + resize + CHW planar output (for SAM3 image encoder) ─
// Writes R, G, B to separate contiguous planes: [R plane][G plane][B plane]
// at model input resolution. Eliminates cv::resize + cvtColor + split + memcpy.
// Direct-resize NV12→RGB with planar (CHW) uint8 output. chwOut must hold
// 3 * outW * outH bytes; planes are tightly packed (no pitch).
__global__ void nv12ToRgbResizeCHW_uint8Kernel(
    const uint8_t* __restrict__ yPlane, int yPitch,
    const uint8_t* __restrict__ uvPlane, int uvPitch,
    uint8_t* __restrict__ chwOut,  // contiguous CHW: R plane, G plane, B plane
    int outW, int outH,
    int srcW, int srcH)
{
    const int ox = blockIdx.x * blockDim.x + threadIdx.x;
    const int oy = blockIdx.y * blockDim.y + threadIdx.y;
    if (ox >= outW || oy >= outH) return;

    float sx = (static_cast<float>(ox) + 0.5f) *
               static_cast<float>(srcW) / static_cast<float>(outW) - 0.5f;
    float sy = (static_cast<float>(oy) + 0.5f) *
               static_cast<float>(srcH) / static_cast<float>(outH) - 0.5f;
    sx = fmaxf(sx, 0.f);
    sy = fmaxf(sy, 0.f);

    float Y, U, V;
    nv12_read_yuv_bilinear(yPlane, yPitch, uvPlane, uvPitch,
                           sx, sy, srcW, srcH, Y, U, V);

    uint8_t r, g, b;
    nv12_yuv_to_rgb(Y, U, V, r, g, b);

    const int planeSize = outW * outH;
    const int idx = oy * outW + ox;
    chwOut[idx]                 = r;  // R plane
    chwOut[idx + planeSize]     = g;  // G plane
    chwOut[idx + 2 * planeSize] = b;  // B plane
}

// Same as the uint8 variant but writes float32 values in [0, 255]
// (no normalization — downstream code scales as needed).
__global__ void nv12ToRgbResizeCHW_floatKernel(
    const uint8_t* __restrict__ yPlane, int yPitch,
    const uint8_t* __restrict__ uvPlane, int uvPitch,
    float* __restrict__ chwOut,  // contiguous CHW float32 [0, 255]
    int outW, int outH,
    int srcW, int srcH)
{
    const int ox = blockIdx.x * blockDim.x + threadIdx.x;
    const int oy = blockIdx.y * blockDim.y + threadIdx.y;
    if (ox >= outW || oy >= outH) return;

    float sx = (static_cast<float>(ox) + 0.5f) *
               static_cast<float>(srcW) / static_cast<float>(outW) - 0.5f;
    float sy = (static_cast<float>(oy) + 0.5f) *
               static_cast<float>(srcH) / static_cast<float>(outH) - 0.5f;
    sx = fmaxf(sx, 0.f);
    sy = fmaxf(sy, 0.f);

    float Y, U, V;
    nv12_read_yuv_bilinear(yPlane, yPitch, uvPlane, uvPitch,
                           sx, sy, srcW, srcH, Y, U, V);

    uint8_t r, g, b;
    nv12_yuv_to_rgb(Y, U, V, r, g, b);

    const int planeSize = outW * outH;
    const int idx = oy * outW + ox;
    chwOut[idx]                 = static_cast<float>(r);
    chwOut[idx + planeSize]     = static_cast<float>(g);
    chwOut[idx + 2 * planeSize] = static_cast<float>(b);
}

// Launch nv12ToRgbResizeCHW_uint8Kernel on `stream`. Asynchronous.
extern "C" void launchNV12ToRGBResizeCHW_uint8(
    const uint8_t* yPlane, int yPitch,
    const uint8_t* uvPlane, int uvPitch,
    uint8_t* chwOut,
    int outW, int outH,
    int srcW, int srcH,
    cudaStream_t stream)
{
    dim3 block(32, 8);
    dim3 grid((outW + block.x - 1) / block.x,
              (outH + block.y - 1) / block.y);
    nv12ToRgbResizeCHW_uint8Kernel<<<grid, block, 0, stream>>>(
        yPlane, yPitch, uvPlane, uvPitch, chwOut, outW, outH, srcW, srcH);
}

// Launch nv12ToRgbResizeCHW_floatKernel on `stream`. Asynchronous.
extern "C" void launchNV12ToRGBResizeCHW_float(
    const uint8_t* yPlane, int yPitch,
    const uint8_t* uvPlane, int uvPitch,
    float* chwOut,
    int outW, int outH,
    int srcW, int srcH,
    cudaStream_t stream)
{
    dim3 block(32, 8);
    dim3 grid((outW + block.x - 1) / block.x,
              (outH + block.y - 1) / block.y);
    nv12ToRgbResizeCHW_floatKernel<<<grid, block, 0, stream>>>(
        yPlane, yPitch, uvPlane, uvPitch, chwOut, outW, outH, srcW, srcH);
}

// Launch nv12ToBgrKernel on `stream`. Asynchronous.
extern "C" void launchNV12ToBGR(
    const uint8_t* yPlane, int yPitch,
    const uint8_t* uvPlane, int uvPitch,
    uint8_t* bgrOut, int outPitch,
    int width, int height,
    cudaStream_t stream)
{
    dim3 block(32, 8);
    dim3 grid((width + block.x - 1) / block.x,
              (height + block.y - 1) / block.y);
    nv12ToBgrKernel<<<grid, block, 0, stream>>>(
        yPlane, yPitch, uvPlane, uvPitch, bgrOut, outPitch, width, height);
}

// ── Fused NV12→BGR + bilinear resize kernel (for OCR detection) ──────────
// Same algorithm as nv12ToRgbResizeKernel but outputs BGR channel order.
__global__ void nv12ToBgrResizeKernel(
    const uint8_t* __restrict__ yPlane, int yPitch,
    const uint8_t* __restrict__ uvPlane, int uvPitch,
    uint8_t* __restrict__ bgrOut, int outPitch,
    int outW, int outH,
    int srcW, int srcH)
{
    const int ox = blockIdx.x * blockDim.x + threadIdx.x;
    const int oy = blockIdx.y * blockDim.y + threadIdx.y;
    if (ox >= outW || oy >= outH) return;

    // Map output pixel to source NV12 coordinates (stretch to fill)
    float sx = (static_cast<float>(ox) + 0.5f) *
               static_cast<float>(srcW) / static_cast<float>(outW) - 0.5f;
    float sy = (static_cast<float>(oy) + 0.5f) *
               static_cast<float>(srcH) / static_cast<float>(outH) - 0.5f;
    sx = fmaxf(sx, 0.f);
    sy = fmaxf(sy, 0.f);

    float Y, U, V;
    nv12_read_yuv_bilinear(yPlane, yPitch, uvPlane, uvPitch,
                           sx, sy, srcW, srcH, Y, U, V);

    uint8_t r, g, b;
    nv12_yuv_to_rgb(Y, U, V, r, g, b);

    const int outIdx = oy * outPitch + ox * 3;
    bgrOut[outIdx]     = b;
    bgrOut[outIdx + 1] = g;
    bgrOut[outIdx + 2] = r;
}

// Launch nv12ToBgrResizeKernel on `stream`. Asynchronous.
extern "C" void launchNV12ToBGRResize(
    const uint8_t* yPlane, int yPitch,
    const uint8_t* uvPlane, int uvPitch,
    uint8_t* bgrOut, int outPitch,
    int outW, int outH,
    int srcW, int srcH,
    cudaStream_t stream)
{
    dim3 block(32, 8);
    dim3 grid((outW + block.x - 1) / block.x,
              (outH + block.y - 1) / block.y);
    nv12ToBgrResizeKernel<<<grid, block, 0, stream>>>(
        yPlane, yPitch, uvPlane, uvPitch, bgrOut, outPitch,
        outW, outH, srcW, srcH);
}

// ── Center-padded NV12→RGB + letterbox resize kernel (for SCRFD) ─────────
// Same as nv12ToRgbLetterboxKernel but content is centered with (padLeft, padTop)
// offset instead of placed at top-left origin. Used by SCRFD face detector
// which uses center-padding (dw = pad_w/2, dh = pad_h/2).
__global__ void nv12ToRgbCenterLetterboxKernel(
    const uint8_t* __restrict__ yPlane, int yPitch,
    const uint8_t* __restrict__ uvPlane, int uvPitch,
    uint8_t* __restrict__ rgbOut, int outPitch,
    int outW, int outH,
    int srcW, int srcH,
    int unpadW, int unpadH,
    float invScale,
    int padLeft, int padTop)
{
    const int ox = blockIdx.x * blockDim.x + threadIdx.x;
    const int oy = blockIdx.y * blockDim.y + threadIdx.y;
    if (ox >= outW || oy >= outH) return;

    const int outIdx = oy * outPitch + ox * 3;

    // Center-padding: black for pixels outside the content area
    const int cx = ox - padLeft;
    const int cy = oy - padTop;
    if (cx < 0 || cx >= unpadW || cy < 0 || cy >= unpadH) {
        rgbOut[outIdx]     = 0;
        rgbOut[outIdx + 1] = 0;
        rgbOut[outIdx + 2] = 0;
        return;
    }

    // Map content pixel to source NV12 coordinates
    float sx = (static_cast<float>(cx) + 0.5f) * invScale - 0.5f;
    float sy = (static_cast<float>(cy) + 0.5f) * invScale - 0.5f;
    sx = fmaxf(sx, 0.f);
    sy = fmaxf(sy, 0.f);

    float Y, U, V;
    nv12_read_yuv_bilinear(yPlane, yPitch, uvPlane, uvPitch,
                           sx, sy, srcW, srcH, Y, U, V);

    uint8_t r, g, b;
    nv12_yuv_to_rgb(Y, U, V, r, g, b);

    rgbOut[outIdx]     = r;
    rgbOut[outIdx + 1] = g;
    rgbOut[outIdx + 2] = b;
}

// Launch nv12ToRgbCenterLetterboxKernel on `stream`. Asynchronous.
extern "C" void launchNV12ToRGBCenterLetterbox(
    const uint8_t* yPlane, int yPitch,
    const uint8_t* uvPlane, int uvPitch,
    uint8_t* rgbOut, int outPitch,
    int outW, int outH,
    int srcW, int srcH,
    int unpadW, int unpadH,
    float invScale,
    int padLeft, int padTop,
    cudaStream_t stream)
{
    dim3 block(32, 8);
    dim3 grid((outW + block.x - 1) / block.x,
              (outH + block.y - 1) / block.y);
    nv12ToRgbCenterLetterboxKernel<<<grid, block, 0, stream>>>(
        yPlane, yPitch, uvPlane, uvPitch, rgbOut, outPitch,
        outW, outH, srcW, srcH, unpadW, unpadH, invScale, padLeft, padTop);
}

// ── NV12 affine warp → BGR kernel (for face alignment) ──────────────────
// Reads from full-resolution NV12 source, applies inverse affine transform,
// and outputs a small aligned face image (e.g. 112×112) in BGR order.
// The inverse affine maps each output pixel back to the NV12 source coords.
// Border handling: replicate edge (matches cv::BORDER_REPLICATE).
// Outputs BGR to match existing pipeline (ANSFaceRecognizer expects BGR).
// (m00..m12) is the 2×3 INVERSE affine matrix: source = M · (ox, oy, 1).
__global__ void nv12AffineWarpToBgrKernel(
    const uint8_t* __restrict__ yPlane, int yPitch,
    const uint8_t* __restrict__ uvPlane, int uvPitch,
    uint8_t* __restrict__ bgrOut, int outPitch,
    int outW, int outH,
    int srcW, int srcH,
    float m00, float m01, float m02,
    float m10, float m11, float m12)
{
    const int ox = blockIdx.x * blockDim.x + threadIdx.x;
    const int oy = blockIdx.y * blockDim.y + threadIdx.y;
    if (ox >= outW || oy >= outH) return;

    // Inverse affine: map output pixel to source NV12 coordinate
    float sx = m00 * ox + m01 * oy + m02;
    float sy = m10 * ox + m11 * oy + m12;

    // Border replicate: clamp to source bounds
    sx = fmaxf(0.f, fminf(sx, static_cast<float>(srcW - 1)));
    sy = fmaxf(0.f, fminf(sy, static_cast<float>(srcH - 1)));

    float Y, U, V;
    nv12_read_yuv_bilinear(yPlane, yPitch, uvPlane, uvPitch,
                           sx, sy, srcW, srcH, Y, U, V);

    uint8_t r, g, b;
    nv12_yuv_to_rgb(Y, U, V, r, g, b);

    // Output BGR order (matches OpenCV convention for downstream pipeline)
    const int outIdx = oy * outPitch + ox * 3;
    bgrOut[outIdx]     = b;
    bgrOut[outIdx + 1] = g;
    bgrOut[outIdx + 2] = r;
}

// Launch nv12AffineWarpToBgrKernel on `stream`. Asynchronous.
extern "C" void launchNV12AffineWarpToBGR(
    const uint8_t* yPlane, int yPitch,
    const uint8_t* uvPlane, int uvPitch,
    uint8_t* bgrOut, int outPitch,
    int outW, int outH,
    int srcW, int srcH,
    float m00, float m01, float m02,
    float m10, float m11, float m12,
    cudaStream_t stream)
{
    dim3 block(16, 8);
    dim3 grid((outW + block.x - 1) / block.x,
              (outH + block.y - 1) / block.y);
    nv12AffineWarpToBgrKernel<<<grid, block, 0, stream>>>(
        yPlane, yPitch, uvPlane, uvPitch, bgrOut, outPitch,
        outW, outH, srcW, srcH, m00, m01, m02, m10, m11, m12);
}

// ── High-level NV12→BGR: CPU→GPU upload, kernel, GPU→CPU download ────────
// Thread-local GPU buffers avoid per-call cudaMalloc overhead.
// Exported for cross-DLL use by video_player via GetProcAddress.
// Per-thread device-side scratch buffers for ANSGpuNV12ToBGR.
// Reallocated only when the frame size changes.
struct GpuNV12Buffers {
    uint8_t* d_yPlane  = nullptr;
    uint8_t* d_uvPlane = nullptr;
    uint8_t* d_bgrOut  = nullptr;
    int allocWidth  = 0;
    int allocHeight = 0;
    cudaStream_t stream = nullptr;

    // (Re)allocate buffers for a w×h frame. On any allocation failure all
    // buffers are released, leaving d_yPlane == nullptr so the caller can
    // detect and report the error (previously failures went unchecked).
    void ensureAllocated(int w, int h) {
        if (w == allocWidth && h == allocHeight && d_yPlane) return;
        free();
        const size_t ySize   = (size_t)w * h;
        const size_t uvSize  = (size_t)w * (h / 2);
        const size_t bgrSize = (size_t)w * h * 3;
        if (cudaMalloc(&d_yPlane,  ySize)  != cudaSuccess ||
            cudaMalloc(&d_uvPlane, uvSize) != cudaSuccess ||
            cudaMalloc(&d_bgrOut,  bgrSize) != cudaSuccess ||
            cudaStreamCreate(&stream)       != cudaSuccess) {
            free();
            return;
        }
        allocWidth  = w;
        allocHeight = h;
    }

    // Release all device resources; safe to call repeatedly.
    void free() {
        if (d_yPlane)  { cudaFree(d_yPlane);  d_yPlane  = nullptr; }
        if (d_uvPlane) { cudaFree(d_uvPlane); d_uvPlane = nullptr; }
        if (d_bgrOut)  { cudaFree(d_bgrOut);  d_bgrOut  = nullptr; }
        if (stream)    { cudaStreamDestroy(stream); stream = nullptr; }
        allocWidth = allocHeight = 0;
    }

    ~GpuNV12Buffers() { free(); }
};

static thread_local GpuNV12Buffers t_bufs;

// Synchronous NV12→BGR conversion: uploads the Y and UV planes, runs
// nv12ToBgrKernel, downloads the BGR result into bgrOut (bgrStep bytes per
// row). Returns 0 on success, negative on error (see codes below).
extern "C" __declspec(dllexport) int ANSGpuNV12ToBGR(
    const uint8_t* yPlane, int yLinesize,
    const uint8_t* uvPlane, int uvLinesize,
    uint8_t* bgrOut, int bgrStep,
    int width, int height,
    int gpuIndex)
{
    if (!yPlane || !uvPlane || !bgrOut || width <= 0 || height <= 0)
        return -1;

    // Select GPU (gpuIndex < 0 keeps the thread's current device)
    if (gpuIndex >= 0) {
        cudaError_t err = cudaSetDevice(gpuIndex);
        if (err != cudaSuccess) {
            fprintf(stderr, "ANSGpuNV12ToBGR: cudaSetDevice(%d) failed: %s\n",
                    gpuIndex, cudaGetErrorString(err));
            return -2;
        }
    }

    t_bufs.ensureAllocated(width, height);
    if (!t_bufs.d_yPlane) return -3;  // allocation failed

    // Upload Y and UV planes to GPU (device buffers are tightly packed,
    // so the device pitch is simply `width`)
    cudaMemcpy2DAsync(t_bufs.d_yPlane, width, yPlane, yLinesize,
                      width, height, cudaMemcpyHostToDevice, t_bufs.stream);
    cudaMemcpy2DAsync(t_bufs.d_uvPlane, width, uvPlane, uvLinesize,
                      width, height / 2, cudaMemcpyHostToDevice, t_bufs.stream);

    // Launch BGR kernel
    dim3 block(32, 8);
    dim3 grid((width + block.x - 1) / block.x,
              (height + block.y - 1) / block.y);
    nv12ToBgrKernel<<<grid, block, 0, t_bufs.stream>>>(
        t_bufs.d_yPlane, width, t_bufs.d_uvPlane, width,
        t_bufs.d_bgrOut, width * 3, width, height);

    // Download BGR result back to CPU
    cudaMemcpy2DAsync(bgrOut, bgrStep, t_bufs.d_bgrOut, width * 3,
                      width * 3, height, cudaMemcpyDeviceToHost, t_bufs.stream);
    cudaStreamSynchronize(t_bufs.stream);

    // Check for errors (launch-config and async execution errors surface here)
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "ANSGpuNV12ToBGR: CUDA error: %s\n",
                cudaGetErrorString(err));
        return -4;
    }
    return 0;  // success
}

// ============================================================================
// NV12 rectangular crop → BGR kernel
// Reads a rectangular region from the full NV12 frame and outputs BGR.
// No resize — output is the same size as the crop region.
// ============================================================================
__global__ void nv12CropToBGRKernel(
    const uint8_t* __restrict__ yPlane, int yPitch,
    const uint8_t* __restrict__ uvPlane, int uvPitch,
    uint8_t* __restrict__ bgrOut, int outPitch,
    int outW, int outH,
    int cropX, int cropY,
    int srcW, int srcH)
{
    const int ox = blockIdx.x * blockDim.x + threadIdx.x;
    const int oy = blockIdx.y * blockDim.y + threadIdx.y;
    if (ox >= outW || oy >= outH) return;

    // Map output pixel to source coordinate
    const int sx = cropX + ox;
    const int sy = cropY + oy;

    // Clamp to source bounds
    const int cx = min(max(sx, 0), srcW - 1);
    const int cy = min(max(sy, 0), srcH - 1);

    // Read Y
    const float Y = static_cast<float>(yPlane[cy * yPitch + cx]);

    // Bilinear UV (chroma subsampled 2:1 in both dimensions, interleaved NV12).
    // Map luma (cx,cy) → chroma continuous coords, then bilinear-interpolate.
    const float csx = static_cast<float>(cx) * 0.5f;
    const float csy = static_cast<float>(cy) * 0.5f;
    const int cx0 = static_cast<int>(csx);
    const int cy0 = static_cast<int>(csy);
    const int cx1 = min(cx0 + 1, (srcW >> 1) - 1);
    const int cy1 = min(cy0 + 1, (srcH >> 1) - 1);
    const float cfx = csx - static_cast<float>(cx0);
    const float cfy = csy - static_cast<float>(cy0);

    const float u00 = static_cast<float>(uvPlane[cy0 * uvPitch + cx0 * 2]);
    const float u01 = static_cast<float>(uvPlane[cy0 * uvPitch + cx1 * 2]);
    const float u10 = static_cast<float>(uvPlane[cy1 * uvPitch + cx0 * 2]);
    const float u11 = static_cast<float>(uvPlane[cy1 * uvPitch + cx1 * 2]);
    const float U = (u00 * (1.f - cfx) * (1.f - cfy) + u01 * cfx * (1.f - cfy) +
                     u10 * (1.f - cfx) * cfy        + u11 * cfx * cfy) - 128.f;

    const float v00 = static_cast<float>(uvPlane[cy0 * uvPitch + cx0 * 2 + 1]);
    const float v01 = static_cast<float>(uvPlane[cy0 * uvPitch + cx1 * 2 + 1]);
    const float v10 = static_cast<float>(uvPlane[cy1 * uvPitch + cx0 * 2 + 1]);
    const float v11 = static_cast<float>(uvPlane[cy1 * uvPitch + cx1 * 2 + 1]);
    const float V = (v00 * (1.f - cfx) * (1.f - cfy) + v01 * cfx * (1.f - cfy) +
                     v10 * (1.f - cfx) * cfy        + v11 * cfx * cfy) - 128.f;

    // BT.601 YUV → RGB
    uint8_t r, g, b;
    nv12_yuv_to_rgb(Y, U, V, r, g, b);

    // Write BGR (OpenCV channel order)
    const int outIdx = oy * outPitch + ox * 3;
    bgrOut[outIdx]     = b;
    bgrOut[outIdx + 1] = g;
    bgrOut[outIdx + 2] = r;
}

// Launch nv12CropToBGRKernel on `stream`. Asynchronous.
extern "C" void launchNV12CropToBGR(
    const uint8_t* yPlane, int yPitch,
    const uint8_t* uvPlane, int uvPitch,
    uint8_t* bgrOut, int outPitch,
    int outW, int outH,
    int cropX, int cropY,
    int srcW, int srcH,
    cudaStream_t stream)
{
    dim3 block(32, 8);
    dim3 grid((outW + block.x - 1) / block.x,
              (outH + block.y - 1) / block.y);
    nv12CropToBGRKernel<<<grid, block, 0, stream>>>(
        yPlane, yPitch, uvPlane, uvPitch, bgrOut, outPitch,
        outW, outH, cropX, cropY, srcW, srcH);
}