// File: ANSCORE/modules/ANSODEngine/NV12PreprocessHelper.cpp
// (1018 lines, 41 KiB, C++)
#include "NV12PreprocessHelper.h"
#include "ANSGpuFrameRegistry.h"
#include "ANSEngineCommon.h"
#include <opencv2/cudaimgproc.hpp>
#include <opencv2/cudawarping.hpp>
#include <opencv2/core/cuda_stream_accessor.hpp>
#include <algorithm>
#include <cstring>
#include <thread>
#ifdef _WIN32
#define WIN32_LEAN_AND_MEAN
#define NOMINMAX
#include <windows.h>
// SEH-protected memcpy: returns false on access violation instead of crashing.
// Used for NV12 plane data which may originate from D3D11VA staging textures.
// Copy `bytes` from src to dst under SEH protection.
// Returns true on success, false if the copy raised a hardware exception
// (access violation / in-page error), e.g. when a D3D11VA staging mapping
// was revoked mid-copy. NOTE: SEH (__try/__except) is legal here because
// this function contains no C++ objects requiring unwinding.
static bool safeMemcpy(void* dst, const void* src, size_t bytes) {
    __try {
        std::memcpy(dst, src, bytes);
        return true;
    }
    __except (EXCEPTION_EXECUTE_HANDLER) {
        // Swallow the fault and report failure instead of crashing.
        return false;
    }
}
// Check if a memory range is readable (non-crashing probe via VirtualQuery)
// Check whether the entire range [ptr, ptr+bytes) is committed and readable,
// without touching the memory (non-crashing probe via VirtualQuery).
//
// FIX: the previous version inspected only the region containing `ptr`,
// so a range spanning multiple VirtualQuery regions whose tail was
// unreadable would incorrectly pass. Walk region by region until the whole
// requested range is covered.
static bool isMemoryReadable(const void* ptr, size_t bytes) {
    if (!ptr || bytes == 0) return false;
    const unsigned char* cur = static_cast<const unsigned char*>(ptr);
    const unsigned char* const end = cur + bytes;
    while (cur < end) {
        MEMORY_BASIC_INFORMATION mbi{};
        if (VirtualQuery(cur, &mbi, sizeof(mbi)) == 0) return false;
        if (mbi.State != MEM_COMMIT) return false;
        // PAGE_GUARD / PAGE_NOACCESS pages fault on first touch.
        if (mbi.Protect & (PAGE_NOACCESS | PAGE_GUARD)) return false;
        const DWORD readFlags = PAGE_READONLY | PAGE_READWRITE |
                                PAGE_EXECUTE_READ | PAGE_EXECUTE_READWRITE;
        if ((mbi.Protect & readFlags) == 0) return false;
        // Advance to the first byte past this region.
        cur = static_cast<const unsigned char*>(mbi.BaseAddress) + mbi.RegionSize;
    }
    return true;
}
#else
// Non-Windows build: no SEH available, so the copy is performed directly.
// A fault here crashes as usual; callers rely on upstream validation.
static bool safeMemcpy(void* dst, const void* src, size_t bytes) {
    std::memcpy(dst, src, bytes);
    return true;
}
// Non-Windows build: no cheap non-crashing probe — optimistically assume readable.
static bool isMemoryReadable(const void*, size_t) { return true; }
#endif
// NV12→RGB CUDA kernel (nv12_to_rgb.cu)
extern "C" void launchNV12ToRGB(
const uint8_t* yPlane, int yPitch,
const uint8_t* uvPlane, int uvPitch,
uint8_t* rgbOut, int outPitch,
int width, int height,
cudaStream_t stream);
extern "C" void launchNV12ToRGBLetterbox(
const uint8_t* yPlane, int yPitch,
const uint8_t* uvPlane, int uvPitch,
uint8_t* rgbOut, int outPitch,
int outW, int outH,
int srcW, int srcH,
int unpadW, int unpadH,
float invScale,
cudaStream_t stream);
extern "C" void launchNV12ToRGBCenterLetterbox(
const uint8_t* yPlane, int yPitch,
const uint8_t* uvPlane, int uvPitch,
uint8_t* rgbOut, int outPitch,
int outW, int outH,
int srcW, int srcH,
int unpadW, int unpadH,
float invScale,
int padLeft, int padTop,
cudaStream_t stream);
extern "C" void launchNV12AffineWarpToBGR(
const uint8_t* yPlane, int yPitch,
const uint8_t* uvPlane, int uvPitch,
uint8_t* bgrOut, int outPitch,
int outW, int outH,
int srcW, int srcH,
float m00, float m01, float m02,
float m10, float m11, float m12,
cudaStream_t stream);
extern "C" void launchNV12ToRGBResize(
const uint8_t* yPlane, int yPitch,
const uint8_t* uvPlane, int uvPitch,
uint8_t* rgbOut, int outPitch,
int outW, int outH,
int srcW, int srcH,
cudaStream_t stream);
extern "C" void launchNV12ToRGBResizeCHW_uint8(
const uint8_t* yPlane, int yPitch,
const uint8_t* uvPlane, int uvPitch,
uint8_t* chwOut,
int outW, int outH,
int srcW, int srcH,
cudaStream_t stream);
extern "C" void launchNV12ToRGBResizeCHW_float(
const uint8_t* yPlane, int yPitch,
const uint8_t* uvPlane, int uvPitch,
float* chwOut,
int outW, int outH,
int srcW, int srcH,
cudaStream_t stream);
extern "C" void launchNV12ToBGRResize(
const uint8_t* yPlane, int yPitch,
const uint8_t* uvPlane, int uvPitch,
uint8_t* bgrOut, int outPitch,
int outW, int outH,
int srcW, int srcH,
cudaStream_t stream);
extern "C" void launchNV12CropToBGR(
const uint8_t* yPlane, int yPitch,
const uint8_t* uvPlane, int uvPitch,
uint8_t* bgrOut, int outPitch,
int outW, int outH,
int cropX, int cropY,
int srcW, int srcH,
cudaStream_t stream);
namespace ANSCENTER {
// Releases the pinned host staging buffer via destroy() (idempotent).
NV12PreprocessHelper::~NV12PreprocessHelper() {
    destroy();
}
// Free the pinned host staging buffer. Safe to call repeatedly; leaves the
// helper in the "no buffer" state (null pointer, zero size).
void NV12PreprocessHelper::destroy() {
    if (m_pinnedBuf == nullptr)
        return;
    cudaFreeHost(m_pinnedBuf);
    m_pinnedBuf = nullptr;
    m_pinnedBufSize = 0;
}
// Grow (never shrink) the pinned host buffer to at least `bytes`.
// On allocation failure the buffer is left null (size 0) and a warning is
// logged once per instance; callers must check m_pinnedBuf before use.
void NV12PreprocessHelper::ensurePinnedBuffer(size_t bytes, SPDLogger& logger, const char* tag) {
    if (bytes <= m_pinnedBufSize)
        return; // current buffer is already large enough

    // Drop the old, too-small allocation first.
    if (m_pinnedBuf) {
        cudaFreeHost(m_pinnedBuf);
        m_pinnedBuf = nullptr;
        m_pinnedBufSize = 0;
    }

    const cudaError_t err = cudaHostAlloc(&m_pinnedBuf, bytes, cudaHostAllocPortable);
    if (err == cudaSuccess) {
        m_pinnedBufSize = bytes;
        return;
    }

    // Allocation failed — log once, stay in the "no buffer" state.
    if (!m_nv12PinnedLogged) {
        m_nv12PinnedLogged = true;
        logger.LogWarn(std::string(tag) + "::NV12Helper",
                       "cudaHostAlloc failed (" + std::string(cudaGetErrorString(err)) +
                       ") for " + std::to_string(bytes) + " bytes",
                       __FILE__, __LINE__);
    }
    m_pinnedBuf = nullptr;
    m_pinnedBufSize = 0;
}
// Lightweight, non-blocking health probe of the CUDA context.
// Returns true while the context responds normally. After CUDA_FAIL_LIMIT
// consecutive failures, latches m_cudaContextDead and permanently disables
// GPU preprocessing for this engine instance (logged once as fatal).
bool NV12PreprocessHelper::isCudaContextHealthy(SPDLogger& logger, const char* tag) {
    if (m_cudaContextDead) return false;
    // Consume (and clear) any sticky error left by earlier async work so it
    // can be included in the diagnostic below.
    cudaError_t pending = cudaGetLastError();
    // Probe the legacy default stream; cheap and non-blocking.
    cudaError_t err = cudaStreamQuery(nullptr);
    if (err == cudaSuccess || err == cudaErrorNotReady) {
        // cudaErrorNotReady just means work is still in flight — healthy.
        m_cudaFailStreak = 0;
        return true;
    }
    ++m_cudaFailStreak;
    if (m_cudaFailStreak >= CUDA_FAIL_LIMIT && !m_cudaContextDead) {
        m_cudaContextDead = true;
        logger.LogFatal(std::string(tag) + "::NV12Helper",
                        "CUDA context permanently corrupted after " +
                        std::to_string(m_cudaFailStreak) +
                        " consecutive failures (last error: " +
                        std::string(cudaGetErrorString(err)) +
                        ", pending: " + std::string(cudaGetErrorString(pending)) +
                        "). GPU preprocessing DISABLED for this engine instance. "
                        "This is typically caused by NVDEC/CUVID surface pool exhaustion "
                        "(Error mapping a picture with CUVID: 205). "
                        "Reduce the number of concurrent HW-decoded 4K streams or "
                        "restart the application to recover.",
                        __FILE__, __LINE__);
    }
    return false;
}
// ====================================================================
// tryNV12 — attempt NV12 fast-path preprocessing
// ====================================================================
// Attempt the NV12 GPU fast path for the current frame.
//
// Reads the thread-local GpuFrameData published by the decoder thread
// (tl_currentGpuFrame) and, when usable, converts/letterboxes the NV12 frame
// into an (inputW x inputH) RGB GpuMat via the supplied kernel launcher.
// pixelFormat 1000 is a private "BGR full-res" marker from ANSVideoPlayer;
// pixelFormat 23 corresponds to AV_PIX_FMT_NV12.
//
// Returns NV12Result with succeeded=false on any unmet precondition — the
// caller then falls back to the ordinary BGR/CPU preprocessing path.
// Frame data lifetime is managed by the registry refcount, not by locks.
NV12Result NV12PreprocessHelper::tryNV12(
    const cv::Mat& inputImage, int inferenceGpu,
    int inputW, int inputH,
    const NV12KernelLauncher& launcher,
    SPDLogger& logger, const char* tag) {
    NV12Result result;
    // Skip during warmup phase
    if (m_inferenceCount < NV12_WARMUP_THRESHOLD) {
        return result;
    }
    // Get NV12 frame data from thread-local (set by RunInferenceComplete_LV)
    GpuFrameData* gpuData = tl_currentGpuFrame();
    if (!gpuData || !gpuData->yPlane) {
        return result; // no NV12 data — caller uses BGR
    }
    // ── BGR full-res path (pixFmt=1000 from ANSVideoPlayer) ──────────
    if (gpuData->pixelFormat == 1000) {
        // yPlane holds a packed BGR frame here; yLinesize is its row stride.
        cv::Mat fullResImg(gpuData->height, gpuData->width, CV_8UC3,
                           gpuData->yPlane, gpuData->yLinesize);
        if (!m_bgrFullResLogged) {
            m_bgrFullResLogged = true;
            logger.LogInfo(std::string(tag) + "::NV12Helper",
                           "BGR full-res ACTIVE: " + std::to_string(gpuData->width) + "x" +
                           std::to_string(gpuData->height) + " -> inference (display=" +
                           std::to_string(inputImage.cols) + "x" + std::to_string(inputImage.rows) + ")",
                           __FILE__, __LINE__);
        }
        result.useBgrFullRes = true;
        result.bgrFullResImg = fullResImg.clone(); // clone under lock
        // Record display→full-res scale factors when resolutions differ.
        if (inputImage.cols > 0 && inputImage.rows > 0 &&
            (fullResImg.cols != inputImage.cols || fullResImg.rows != inputImage.rows)) {
            result.bgrFullResScaleX = static_cast<float>(inputImage.cols) / static_cast<float>(fullResImg.cols);
            result.bgrFullResScaleY = static_cast<float>(inputImage.rows) / static_cast<float>(fullResImg.rows);
        }
        // (No registry lock — data kept alive by refcount)
        return result;
    }
    // ── NV12 path (pixFmt=23 = AV_PIX_FMT_NV12) ────────────────────
    if (gpuData->pixelFormat != 23) {
        return result; // unknown format — fall back to BGR
    }
    if (!gpuData->uvPlane) {
        if (!m_nv12NullLogged) {
            m_nv12NullLogged = true;
            logger.LogWarn(std::string(tag) + "::NV12Helper",
                           "Early exit: null uvPlane",
                           __FILE__, __LINE__);
        }
        return result;
    }
    const int frameW = gpuData->width;
    const int frameH = gpuData->height;
    if (frameW <= 0 || frameH <= 0) {
        if (!m_nv12DimLogged) {
            m_nv12DimLogged = true;
            logger.LogWarn(std::string(tag) + "::NV12Helper",
                           "Early exit: bad dimensions — w=" + std::to_string(frameW) +
                           " h=" + std::to_string(frameH),
                           __FILE__, __LINE__);
        }
        return result;
    }
    if (m_cudaContextDead) {
        if (!m_nv12DeadLogged) {
            m_nv12DeadLogged = true;
            logger.LogWarn(std::string(tag) + "::NV12Helper",
                           "Early exit: CUDA context dead",
                           __FILE__, __LINE__);
        }
        return result;
    }
    // Zero-copy is only valid when the planes are CUDA device pointers on
    // the inference GPU (gpuIndex < 0 means "unknown/same device").
    const bool isCudaDevice = gpuData->isCudaDevicePtr;
    const bool gpuMatch = !isCudaDevice ||
                          gpuData->gpuIndex < 0 ||
                          gpuData->gpuIndex == inferenceGpu;
    const bool useZeroCopy = isCudaDevice && gpuMatch;
    // --- Debug: log pointer state before reading ---
#if defined(ANSCORE_GPU_DEBUG) && ANSCORE_GPU_DEBUG
    {
        char _nv12_dbg[512];
        snprintf(_nv12_dbg, sizeof(_nv12_dbg),
                 "[NV12Helper] tryNV12: gpuData=%p yPlane=%p uvPlane=%p isCuda=%d "
                 "gpuIdx=%d infGpu=%d gpuMatch=%d zeroCopy=%d "
                 "gpuCacheY=%p gpuCacheUV=%p gpuCacheValid=%d refcount=%d %dx%d\n",
                 (void*)gpuData, (void*)gpuData->yPlane, (void*)gpuData->uvPlane,
                 (int)isCudaDevice, gpuData->gpuIndex, inferenceGpu,
                 (int)gpuMatch, (int)useZeroCopy,
                 gpuData->gpuCacheY, gpuData->gpuCacheUV,
                 (int)gpuData->gpuCacheValid,
                 gpuData->refcount.load(),
                 frameW, frameH);
#ifdef _WIN32
        OutputDebugStringA(_nv12_dbg);
#endif
        fprintf(stderr, "%s", _nv12_dbg);
    }
#endif
    // Effective plane pointers — for zero-copy, use CUDA device ptrs;
    // for CPU upload, use the CPU snapshot buffers.
    uint8_t* effYPlane;
    uint8_t* effUvPlane;
    int effYLinesize;
    int effUvLinesize;
    if (useZeroCopy) {
        // Same GPU: wrap owned CUDA device pointers directly
        effYPlane = gpuData->yPlane;
        effUvPlane = gpuData->uvPlane;
        effYLinesize = gpuData->yLinesize;
        effUvLinesize = gpuData->uvLinesize;
    } else if (isCudaDevice && !gpuMatch) {
        // Cross-GPU: can't use CUDA device ptrs from another GPU context.
        // Fall through to CPU upload using the CPU snapshot buffers.
        if (gpuData->cpuYPlane && gpuData->cpuUvPlane) {
            effYPlane = gpuData->cpuYPlane;
            effUvPlane = gpuData->cpuUvPlane;
            effYLinesize = gpuData->cpuYLinesize;
            effUvLinesize = gpuData->cpuUvLinesize;
            if (!m_gpuMismatchLogged) {
                m_gpuMismatchLogged = true;
                logger.LogInfo(std::string(tag) + "::NV12Helper",
                               "GPU mismatch (decode GPU " + std::to_string(gpuData->gpuIndex) +
                               " vs inference GPU " + std::to_string(inferenceGpu) +
                               ") — using CPU NV12 upload fallback",
                               __FILE__, __LINE__);
            }
        } else {
            // No CPU fallback available — fall back to BGR
            if (!m_gpuMismatchLogged) {
                m_gpuMismatchLogged = true;
                logger.LogInfo(std::string(tag) + "::NV12Helper",
                               "GPU mismatch (decode GPU " + std::to_string(gpuData->gpuIndex) +
                               " vs inference GPU " + std::to_string(inferenceGpu) +
                               ") — no CPU fallback, using BGR",
                               __FILE__, __LINE__);
            }
            return result;
        }
    } else {
        // Not CUDA device ptrs: use whatever yPlane/uvPlane point to (CPU data)
        effYPlane = gpuData->yPlane;
        effUvPlane = gpuData->uvPlane;
        effYLinesize = gpuData->yLinesize;
        effUvLinesize = gpuData->uvLinesize;
    }
    // Log path selection once — both spdlog and DebugView
    if (!m_nv12PathLogged) {
        m_nv12PathLogged = true;
        const char* pathName = useZeroCopy ? "CUDA_ZERO_COPY" : "CPU_NV12_UPLOAD";
        auto msg = std::string("Path: ") + pathName +
                   " | isCuda=" + std::to_string(isCudaDevice) +
                   " gpuMatch=" + std::to_string(gpuMatch) +
                   " decodeGpu=" + std::to_string(gpuData->gpuIndex) +
                   " infGpu=" + std::to_string(inferenceGpu) +
                   " frame=" + std::to_string(frameW) + "x" + std::to_string(frameH);
        logger.LogInfo(std::string(tag) + "::NV12Helper", msg, __FILE__, __LINE__);
#ifdef _WIN32
        auto dbg = std::string("[NV12 ACTIVE] ") + tag + " " + msg + "\n";
        OutputDebugStringA(dbg.c_str());
#endif
    }
    cv::cuda::Stream stream;
    cv::cuda::GpuMat gpuY, gpuUV;
    if (useZeroCopy) {
        // CUDA zero-copy: wrap pool buffer device pointers directly.
        // UV plane is half height; wrapped as single-channel with full width.
        gpuY = cv::cuda::GpuMat(frameH, frameW, CV_8UC1,
                                effYPlane, static_cast<size_t>(effYLinesize));
        gpuUV = cv::cuda::GpuMat(frameH / 2, frameW, CV_8UC1,
                                 effUvPlane, static_cast<size_t>(effUvLinesize));
    } else {
        // CPU path: memcpy + upload
        const size_t ySize = static_cast<size_t>(frameW) * frameH;
        const size_t uvSize = static_cast<size_t>(frameW) * frameH / 2;
        const size_t nv12Size = ySize + uvSize;
        ensurePinnedBuffer(nv12Size, logger, tag);
        if (!m_pinnedBuf) {
            return result;
        }
        // Validate NV12 plane pointers before memcpy (probe capped at 4 KiB).
        const size_t yBufNeeded = (effYLinesize == frameW)
                                      ? ySize
                                      : static_cast<size_t>(effYLinesize) * frameH;
        const size_t uvBufNeeded = (effUvLinesize == frameW)
                                       ? uvSize
                                       : static_cast<size_t>(effUvLinesize) * (frameH / 2);
        if (!isMemoryReadable(effYPlane, std::min(yBufNeeded, (size_t)4096)) ||
            !isMemoryReadable(effUvPlane, std::min(uvBufNeeded, (size_t)4096))) {
            logger.LogWarn(std::string(tag) + "::NV12Helper",
                           "NV12 plane pointers not readable — falling back to BGR",
                           __FILE__, __LINE__);
            return result;
        }
        // Copy Y then UV into the pinned buffer, de-striding row by row
        // when the source linesize exceeds the frame width.
        uint8_t* dst = static_cast<uint8_t*>(m_pinnedBuf);
        bool cpyOk = true;
        if (effYLinesize == frameW) {
            cpyOk = safeMemcpy(dst, effYPlane, ySize);
        } else {
            for (int row = 0; row < frameH && cpyOk; row++)
                cpyOk = safeMemcpy(dst + row * frameW,
                                   effYPlane + row * effYLinesize, frameW);
        }
        if (cpyOk) {
            uint8_t* uvDst = dst + ySize;
            if (effUvLinesize == frameW) {
                cpyOk = safeMemcpy(uvDst, effUvPlane, uvSize);
            } else {
                for (int row = 0; row < frameH / 2 && cpyOk; row++)
                    cpyOk = safeMemcpy(uvDst + row * frameW,
                                       effUvPlane + row * effUvLinesize, frameW);
            }
        }
        if (!cpyOk) {
            logger.LogWarn(std::string(tag) + "::NV12Helper",
                           "Access violation during NV12 memcpy — falling back to BGR",
                           __FILE__, __LINE__);
            return result;
        }
        // NV12 data safely in pinned memory — release registry lock
        // (No registry lock to release — data kept alive by refcount)
        cv::Mat pinnedY(frameH, frameW, CV_8UC1, m_pinnedBuf);
        cv::Mat pinnedUV(frameH / 2, frameW, CV_8UC1,
                         static_cast<uint8_t*>(m_pinnedBuf) + ySize);
        gpuY.upload(pinnedY, stream);
        gpuUV.upload(pinnedUV, stream);
    }
    // Use display dimensions for coordinate mapping
    const float metaW = static_cast<float>(inputImage.cols > 0 ? inputImage.cols : frameW);
    const float metaH = static_cast<float>(inputImage.rows > 0 ? inputImage.rows : frameH);
    if (metaH <= 0 || metaW <= 0) {
        if (!m_nv12MetaLogged) {
            m_nv12MetaLogged = true;
            logger.LogWarn(std::string(tag) + "::NV12Helper",
                           "Early exit: metadata dims invalid",
                           __FILE__, __LINE__);
        }
        return result;
    }
    result.metaWidth = metaW;
    result.metaHeight = metaH;
    // ratio = inverse of the letterbox scale factor (used by callers to map
    // model-space boxes back to display coordinates).
    result.ratio = 1.f / std::min(
        static_cast<float>(inputW) / metaW,
        static_cast<float>(inputH) / metaH);
    // Launch engine-specific kernel
    cv::cuda::GpuMat gpuResized;
    gpuResized.create(inputH, inputW, CV_8UC3);
    cudaStream_t rawStream = cv::cuda::StreamAccessor::getStream(stream);
#if defined(ANSCORE_GPU_DEBUG) && ANSCORE_GPU_DEBUG
    {
        char _nv12_dbg2[256];
        snprintf(_nv12_dbg2, sizeof(_nv12_dbg2),
                 "[NV12Helper] KERNEL LAUNCH: gpuY=%p(%dx%d) gpuUV=%p(%dx%d) -> %dx%d zeroCopy=%d\n",
                 (void*)gpuY.data, gpuY.cols, gpuY.rows,
                 (void*)gpuUV.data, gpuUV.cols, gpuUV.rows,
                 inputW, inputH, (int)useZeroCopy);
#ifdef _WIN32
        OutputDebugStringA(_nv12_dbg2);
#endif
        fprintf(stderr, "%s", _nv12_dbg2);
    }
#endif
    launcher(gpuY, gpuUV, gpuResized, frameW, frameH, inputW, inputH, rawStream);
    stream.waitForCompletion();
    // (No registry lock to release — data kept alive by refcount)
    // Log NV12 fast-path usage once per instance
    if (!m_nv12ActiveLogged) {
        m_nv12ActiveLogged = true;
        const char* mode = useZeroCopy ? "CUDA zero-copy" : "CPU upload";
        logger.LogInfo(std::string(tag) + "::NV12Helper",
                       std::string(mode) + " ACTIVE: " +
                       std::to_string(frameW) + "x" + std::to_string(frameH) +
                       " NV12 -> " + std::to_string(inputW) + "x" + std::to_string(inputH) +
                       " display=" + std::to_string(static_cast<int>(metaW)) + "x" +
                       std::to_string(static_cast<int>(metaH)),
                       __FILE__, __LINE__);
    }
    result.succeeded = true;
    result.gpuRGB = std::move(gpuResized);
    return result;
}
// ====================================================================
// Default YOLO kernel launcher
//
// For detection/seg/pose/OBB: fused NV12→RGB + letterbox in a single
// kernel at model input resolution (e.g. 640x640).
// For classification: separate NV12→RGB at full res, then cuda::resize.
// (Classification detection is not done here — the caller's engine
// should set up the appropriate launcher if needed. This default
// always uses the fused letterbox path, which works for all YOLO
// detection/seg/pose/OBB tasks.)
// ====================================================================
// Default launcher for YOLO detection/seg/pose/OBB models: fused NV12→RGB
// plus top-left-anchored letterbox in a single kernel at model resolution.
// When source already matches the model input, the resize is skipped.
NV12KernelLauncher NV12PreprocessHelper::defaultYOLOLauncher() {
    return [](const cv::cuda::GpuMat& gpuY, const cv::cuda::GpuMat& gpuUV,
              cv::cuda::GpuMat& gpuOut, int srcW, int srcH,
              int inputW, int inputH, cudaStream_t stream) {
        const bool sameSize = (srcW == inputW) && (srcH == inputH);
        if (sameSize) {
            // 1:1 — plain NV12→RGB conversion, no scaling needed.
            launchNV12ToRGB(
                gpuY.ptr<uint8_t>(), static_cast<int>(gpuY.step),
                gpuUV.ptr<uint8_t>(), static_cast<int>(gpuUV.step),
                gpuOut.ptr<uint8_t>(), static_cast<int>(gpuOut.step),
                srcW, srcH, stream);
            return;
        }
        // Aspect-preserving scale that fits the source inside the model input.
        const float scale = std::min(static_cast<float>(inputW) / srcW,
                                     static_cast<float>(inputH) / srcH);
        const int scaledW = static_cast<int>(scale * srcW);
        const int scaledH = static_cast<int>(scale * srcH);
        launchNV12ToRGBLetterbox(
            gpuY.ptr<uint8_t>(), static_cast<int>(gpuY.step),
            gpuUV.ptr<uint8_t>(), static_cast<int>(gpuUV.step),
            gpuOut.ptr<uint8_t>(), static_cast<int>(gpuOut.step),
            inputW, inputH,
            srcW, srcH,
            scaledW, scaledH,
            1.0f / scale, stream);
    };
}
// ====================================================================
// Classification launcher (direct resize, no letterbox)
// ====================================================================
// Launcher for classification models: fused NV12→RGB with a direct
// stretch-to-fill resize (no letterbox padding). Skips the resize when
// the source already matches the model input resolution.
NV12KernelLauncher NV12PreprocessHelper::classificationLauncher() {
    return [](const cv::cuda::GpuMat& gpuY, const cv::cuda::GpuMat& gpuUV,
              cv::cuda::GpuMat& gpuOut, int srcW, int srcH,
              int inputW, int inputH, cudaStream_t stream) {
        const int yPitch   = static_cast<int>(gpuY.step);
        const int uvPitch  = static_cast<int>(gpuUV.step);
        const int outPitch = static_cast<int>(gpuOut.step);
        if (srcW == inputW && srcH == inputH) {
            // Source already at model resolution — plain conversion.
            launchNV12ToRGB(gpuY.ptr<uint8_t>(), yPitch,
                            gpuUV.ptr<uint8_t>(), uvPitch,
                            gpuOut.ptr<uint8_t>(), outPitch,
                            srcW, srcH, stream);
        } else {
            // Stretch to fill the model input (aspect ratio not preserved).
            launchNV12ToRGBResize(gpuY.ptr<uint8_t>(), yPitch,
                                  gpuUV.ptr<uint8_t>(), uvPitch,
                                  gpuOut.ptr<uint8_t>(), outPitch,
                                  inputW, inputH,
                                  srcW, srcH, stream);
        }
    };
}
// ====================================================================
// NV12→BGR fused resize (for OCR detection)
// ====================================================================
// Thin forwarding wrapper around the fused NV12→BGR resize kernel
// (used by OCR detection). All plane/output pointers and pitches are
// forwarded unchanged; the kernel is launched on `stream` and the caller
// is responsible for synchronization.
void NV12PreprocessHelper::nv12ToBGRResize(
    const uint8_t* devY, int yPitch,
    const uint8_t* devUV, int uvPitch,
    uint8_t* bgrOut, int outPitch,
    int outW, int outH, int srcW, int srcH,
    cudaStream_t stream) {
    launchNV12ToBGRResize(devY, yPitch, devUV, uvPitch,
                          bgrOut, outPitch, outW, outH, srcW, srcH, stream);
}
// SCRFD center-padded letterbox launcher
// ====================================================================
// SCRFD launcher: aspect-preserving letterbox whose padding offsets
// (padLeft/padTop) are chosen by the caller, allowing the scaled image to
// be centered instead of anchored at the top-left corner.
NV12KernelLauncher NV12PreprocessHelper::scrfdCenterLetterboxLauncher(int padLeft, int padTop) {
    return [padLeft, padTop](const cv::cuda::GpuMat& gpuY, const cv::cuda::GpuMat& gpuUV,
                             cv::cuda::GpuMat& gpuOut, int srcW, int srcH,
                             int inputW, int inputH, cudaStream_t stream) {
        // Uniform scale that fits the source inside the model input.
        const float scale = std::min(static_cast<float>(inputW) / srcW,
                                     static_cast<float>(inputH) / srcH);
        const int scaledW = static_cast<int>(scale * srcW);
        const int scaledH = static_cast<int>(scale * srcH);
        launchNV12ToRGBCenterLetterbox(
            gpuY.ptr<uint8_t>(), static_cast<int>(gpuY.step),
            gpuUV.ptr<uint8_t>(), static_cast<int>(gpuUV.step),
            gpuOut.ptr<uint8_t>(), static_cast<int>(gpuOut.step),
            inputW, inputH,
            srcW, srcH,
            scaledW, scaledH,
            1.0f / scale,
            padLeft, padTop,
            stream);
    };
}
// ====================================================================
// NV12 affine warp for face alignment
//
// Reads full-res NV12 from the registry, applies inverse affine
// transform via CUDA kernel, outputs a small aligned face in BGR.
// The affine matrix is computed on CPU (from detected landmarks to
// ArcFace template), then scaled to map to full-res NV12 source.
// ====================================================================
// Apply an affine warp directly on the full-res NV12 frame (face alignment).
//
// `affineMatrix` maps source landmarks → output template in display-space;
// it is inverted here, then scaled by (scaleX, scaleY) to address the
// full-res NV12 frame. Output is a (outW x outH) BGR face chip, returned
// both as a GpuMat (for the recognizer) and as a downloaded cv::Mat.
// Cross-GPU frames are rejected (caller falls back to CPU warpAffine).
// NOTE(review): unlike tryNV12(), the CPU-upload branch here does not probe
// plane readability with isMemoryReadable before copying — confirm whether
// that asymmetry is intentional.
NV12PreprocessHelper::NV12AffineResult NV12PreprocessHelper::tryNV12AffineWarp(
    const cv::Mat& inputImage, int inferenceGpu,
    const cv::Mat& affineMatrix, int outW, int outH,
    float scaleX, float scaleY,
    SPDLogger& logger, const char* tag) {
    NV12AffineResult result;
    // Skip during warmup
    if (m_inferenceCount < NV12_WARMUP_THRESHOLD) {
        return result;
    }
    if (affineMatrix.empty()) {
        return result;
    }
    // Get NV12 frame data from thread-local (set by RunInferenceComplete_LV)
    GpuFrameData* gpuData = tl_currentGpuFrame();
    if (!gpuData || !gpuData->yPlane || !gpuData->uvPlane || gpuData->pixelFormat != 23) {
        return result;
    }
    const int frameW = gpuData->width;
    const int frameH = gpuData->height;
    if (frameW <= 0 || frameH <= 0 || m_cudaContextDead) {
        return result;
    }
    const bool isCudaDevice = gpuData->isCudaDevicePtr;
    const bool gpuMatch = !isCudaDevice ||
                          gpuData->gpuIndex < 0 ||
                          gpuData->gpuIndex == inferenceGpu;
    if (isCudaDevice && !gpuMatch) {
        return result; // cross-GPU — fall back to CPU warpAffine
    }
    const bool useZeroCopy = isCudaDevice && gpuMatch;
    uint8_t* effYPlane = gpuData->yPlane;
    uint8_t* effUvPlane = gpuData->uvPlane;
    int effYLinesize = gpuData->yLinesize;
    int effUvLinesize = gpuData->uvLinesize;
    // Compute inverse affine matrix and scale for full-res NV12
    cv::Mat M_inv;
    cv::invertAffineTransform(affineMatrix, M_inv);
    M_inv.convertTo(M_inv, CV_64F);
    // Scale inverse affine to map from output chip → full-res NV12 source.
    // Row 0 maps to X (multiply by scaleX), Row 1 maps to Y (multiply by scaleY)
    double* row0 = M_inv.ptr<double>(0);
    double* row1 = M_inv.ptr<double>(1);
    row0[0] *= scaleX; row0[1] *= scaleX; row0[2] *= scaleX;
    row1[0] *= scaleY; row1[1] *= scaleY; row1[2] *= scaleY;
    float m00 = static_cast<float>(row0[0]);
    float m01 = static_cast<float>(row0[1]);
    float m02 = static_cast<float>(row0[2]);
    float m10 = static_cast<float>(row1[0]);
    float m11 = static_cast<float>(row1[1]);
    float m12 = static_cast<float>(row1[2]);
    cv::cuda::Stream stream;
    cv::cuda::GpuMat gpuY, gpuUV;
    if (useZeroCopy) {
        // Wrap the device-resident planes directly (no copy).
        gpuY = cv::cuda::GpuMat(frameH, frameW, CV_8UC1,
                                effYPlane, static_cast<size_t>(effYLinesize));
        gpuUV = cv::cuda::GpuMat(frameH / 2, frameW, CV_8UC1,
                                 effUvPlane, static_cast<size_t>(effUvLinesize));
    } else {
        // CPU path: memcpy + upload
        const size_t ySize = static_cast<size_t>(frameW) * frameH;
        const size_t uvSize = static_cast<size_t>(frameW) * frameH / 2;
        const size_t nv12Size = ySize + uvSize;
        ensurePinnedBuffer(nv12Size, logger, tag);
        if (!m_pinnedBuf) {
            return result;
        }
        // Copy Y then UV, de-striding rows when linesize != width.
        uint8_t* dst = static_cast<uint8_t*>(m_pinnedBuf);
        bool cpyOk = true;
        if (effYLinesize == frameW) {
            cpyOk = safeMemcpy(dst, effYPlane, ySize);
        } else {
            for (int row = 0; row < frameH && cpyOk; row++)
                cpyOk = safeMemcpy(dst + row * frameW,
                                   effYPlane + row * effYLinesize, frameW);
        }
        if (cpyOk) {
            uint8_t* uvDst = dst + ySize;
            if (effUvLinesize == frameW) {
                cpyOk = safeMemcpy(uvDst, effUvPlane, uvSize);
            } else {
                for (int row = 0; row < frameH / 2 && cpyOk; row++)
                    cpyOk = safeMemcpy(uvDst + row * frameW,
                                       effUvPlane + row * effUvLinesize, frameW);
            }
        }
        if (!cpyOk) {
            return result;
        }
        // (No registry lock to release — data kept alive by refcount)
        cv::Mat pinnedY(frameH, frameW, CV_8UC1, m_pinnedBuf);
        cv::Mat pinnedUV(frameH / 2, frameW, CV_8UC1,
                         static_cast<uint8_t*>(m_pinnedBuf) + ySize);
        gpuY.upload(pinnedY, stream);
        gpuUV.upload(pinnedUV, stream);
    }
    // Launch affine warp kernel (outputs a BGR chip of outW x outH)
    cv::cuda::GpuMat gpuFace(outH, outW, CV_8UC3);
    cudaStream_t rawStream = cv::cuda::StreamAccessor::getStream(stream);
    launchNV12AffineWarpToBGR(
        gpuY.ptr<uint8_t>(), static_cast<int>(gpuY.step),
        gpuUV.ptr<uint8_t>(), static_cast<int>(gpuUV.step),
        gpuFace.ptr<uint8_t>(), static_cast<int>(gpuFace.step),
        outW, outH,
        frameW, frameH,
        m00, m01, m02, m10, m11, m12,
        rawStream);
    stream.waitForCompletion();
    // (No registry lock to release — data kept alive by refcount)
    // Keep GPU copy for recognizer (avoids re-upload), download CPU copy for mask
    result.gpuAlignedFace = gpuFace;
    gpuFace.download(result.alignedFaceBGR);
    result.succeeded = true;
    return result;
}
// ── NV12 rectangular crop → BGR ────────────────────────────────────
// ── NV12 rectangular crop → BGR ────────────────────────────────────
// Crops a padded bbox (given in display coordinates, scaled by
// scaleX/scaleY to full-res NV12 coordinates) out of the GPU-resident
// NV12 frame and converts it to a BGR cv::Mat on the CPU.
// Fails soft (succeeded=false) so callers can fall back to CPU cropping.
NV12PreprocessHelper::NV12CropResult NV12PreprocessHelper::tryNV12CropToBGR(
    const cv::Mat& inputImage, int inferenceGpu,
    const cv::Rect& bbox, int padding,
    float scaleX, float scaleY,
    SPDLogger& logger, const char* tag)
{
    NV12CropResult result;
    // Warmup gating — matches tryNV12(): skip until the pipeline is stable.
    if (m_inferenceCount < NV12_WARMUP_THRESHOLD)
        return result;
    // Never touch the GPU once the context is known-dead.
    if (m_cudaContextDead)
        return result;
    // NV12 frame published by the decoder for this thread (refcounted).
    auto* gpuData = tl_currentGpuFrame();
    if (!gpuData || gpuData->pixelFormat != 23)
        return result; // no NV12 data
    if (!gpuData->isCudaDevicePtr || !gpuData->yPlane || !gpuData->uvPlane)
        return result; // NV12 not on GPU
    const int frameW = gpuData->width;
    const int frameH = gpuData->height;
    // FIX: reject degenerate frame dimensions (the other NV12 paths do this).
    if (frameW <= 0 || frameH <= 0)
        return result;
    // FIX: device pointers from a different GPU are not valid in this
    // context — mirror the gpuMatch logic of tryNV12() and fall back to the
    // CPU crop path on a decode/inference GPU mismatch.
    if (gpuData->gpuIndex >= 0 && gpuData->gpuIndex != inferenceGpu)
        return result;
    // Scale bbox from display-res to full-res NV12 coords with padding.
    const int fx1 = std::max(0, static_cast<int>((bbox.x - padding) * scaleX));
    const int fy1 = std::max(0, static_cast<int>((bbox.y - padding) * scaleY));
    const int fx2 = std::min(frameW, static_cast<int>((bbox.x + bbox.width + padding) * scaleX));
    const int fy2 = std::min(frameH, static_cast<int>((bbox.y + bbox.height + padding) * scaleY));
    const int cropW = fx2 - fx1;
    const int cropH = fy2 - fy1;
    if (cropW < 5 || cropH < 5)
        return result; // too small (or inverted) to be useful
    // Ensure this thread's CUDA context targets the inference GPU.
    // (Intentionally left set afterwards — subsequent inference work on this
    // thread expects the inference GPU to remain current.)
    int currentDev = 0;
    cudaGetDevice(&currentDev);
    if (inferenceGpu >= 0 && currentDev != inferenceGpu)
        cudaSetDevice(inferenceGpu);
    // Wrap the existing GPU NV12 planes (no copy; lifetime held by refcount).
    cv::cuda::GpuMat gpuY(frameH, frameW, CV_8UC1,
                          gpuData->yPlane, gpuData->yLinesize);
    cv::cuda::GpuMat gpuUV(frameH / 2, frameW / 2, CV_8UC2,
                           gpuData->uvPlane, gpuData->uvLinesize);
    // Allocate output on GPU and launch the fused crop+convert kernel.
    cv::cuda::GpuMat gpuCrop(cropH, cropW, CV_8UC3);
    cv::cuda::Stream stream;
    cudaStream_t rawStream = cv::cuda::StreamAccessor::getStream(stream);
    launchNV12CropToBGR(
        gpuY.ptr<uint8_t>(), static_cast<int>(gpuY.step),
        gpuUV.ptr<uint8_t>(), static_cast<int>(gpuUV.step),
        gpuCrop.ptr<uint8_t>(), static_cast<int>(gpuCrop.step),
        cropW, cropH,
        fx1, fy1,
        frameW, frameH,
        rawStream);
    stream.waitForCompletion();
    // (No registry lock to release — data kept alive by refcount)
    // Download the small crop to CPU.
    gpuCrop.download(result.bgrCrop);
    result.succeeded = true;
    // One-shot diagnostic log.
    if (!m_nv12CropLogged) {
        m_nv12CropLogged = true;
        logger.LogInfo(tag,
            "NV12 GPU crop ACTIVE: " + std::to_string(cropW) + "x" + std::to_string(cropH) +
            " BGR cropped from " + std::to_string(frameW) + "x" + std::to_string(frameH) +
            " NV12 (display=" + std::to_string(inputImage.cols) + "x" + std::to_string(inputImage.rows) +
            " scaleX=" + std::to_string(scaleX) + " scaleY=" + std::to_string(scaleY) + ")",
            __FILE__, __LINE__);
    }
    return result;
}
// ====================================================================
// tryNV12DirectToBuffer — write NV12→RGB CHW directly into TRT buffer
//
// For engines that manage their own GPU buffers (e.g. SAM3 with raw
// TRT enqueueV3). Same registry lookup / GPU matching / safety logic
// as tryNV12(), but writes CHW planar output into a pre-allocated
// void* GPU buffer instead of allocating an HWC GpuMat.
// ====================================================================
// Write NV12→RGB CHW planar output directly into a caller-owned GPU buffer
// (e.g. a TensorRT input binding for enqueueV3). Same thread-local frame
// lookup / GPU matching / safety logic as tryNV12(), but no intermediate
// HWC GpuMat is allocated. `isFloat32` selects the float32 vs uint8 kernel.
//
// FIXES vs previous revision:
//  * Sleep(0) was Windows-only and unguarded, breaking non-Windows builds —
//    replaced with the portable std::this_thread::yield().
//  * The terminal cudaStreamQuery() status was ignored; a failed kernel now
//    clears the sticky error and reports failure so the caller falls back
//    to CPU preprocessing instead of consuming a garbage TRT input.
NV12PreprocessHelper::NV12DirectResult NV12PreprocessHelper::tryNV12DirectToBuffer(
    const cv::Mat& inputImage, int inferenceGpu,
    void* dstGpuBuffer, int inputW, int inputH,
    bool isFloat32,
    cudaStream_t stream,
    SPDLogger& logger, const char* tag) {
    NV12DirectResult result;
    // Skip during warmup phase
    if (m_inferenceCount < NV12_WARMUP_THRESHOLD) {
        return result;
    }
    if (!dstGpuBuffer) return result;
    // Get NV12 frame data from thread-local (set by RunInferenceComplete_LV)
    GpuFrameData* gpuData = tl_currentGpuFrame();
    if (!gpuData || !gpuData->yPlane) {
        return result; // no NV12 data — caller uses CPU path
    }
    // BGR full-res path not applicable for direct-to-buffer (caller handles its own resize)
    if (gpuData->pixelFormat == 1000) {
        return result;
    }
    // NV12 path only (pixFmt 23 = AV_PIX_FMT_NV12)
    if (gpuData->pixelFormat != 23) {
        return result;
    }
    if (!gpuData->uvPlane) {
        return result;
    }
    const int frameW = gpuData->width;
    const int frameH = gpuData->height;
    if (frameW <= 0 || frameH <= 0 || m_cudaContextDead) {
        return result;
    }
    const bool isCudaDevice = gpuData->isCudaDevicePtr;
    const bool gpuMatch = !isCudaDevice ||
                          gpuData->gpuIndex < 0 ||
                          gpuData->gpuIndex == inferenceGpu;
    const bool useZeroCopy = isCudaDevice && gpuMatch;
    if (isCudaDevice && !gpuMatch) {
        return result; // cross-GPU — fall back to CPU path
    }
    uint8_t* effYPlane = gpuData->yPlane;
    uint8_t* effUvPlane = gpuData->uvPlane;
    int effYLinesize = gpuData->yLinesize;
    int effUvLinesize = gpuData->uvLinesize;
    // Log path selection once
    if (!m_nv12PathLogged) {
        m_nv12PathLogged = true;
        const char* pathName = useZeroCopy ? "CUDA_ZERO_COPY" : "CPU_NV12_UPLOAD";
        logger.LogInfo(std::string(tag) + "::NV12Helper",
                       std::string("Path: ") + pathName +
                       " | isCuda=" + std::to_string(isCudaDevice) +
                       " gpuMatch=" + std::to_string(gpuMatch) +
                       " frame=" + std::to_string(frameW) + "x" + std::to_string(frameH) +
                       " (direct CHW " + (isFloat32 ? "float32" : "uint8") + ")",
                       __FILE__, __LINE__);
    }
    cv::cuda::GpuMat gpuY, gpuUV;
    if (useZeroCopy) {
        // Wrap device-resident planes directly (no copy).
        gpuY = cv::cuda::GpuMat(frameH, frameW, CV_8UC1,
                                effYPlane, static_cast<size_t>(effYLinesize));
        gpuUV = cv::cuda::GpuMat(frameH / 2, frameW, CV_8UC1,
                                 effUvPlane, static_cast<size_t>(effUvLinesize));
    } else {
        // CPU path: memcpy NV12 to pinned buffer, upload on the caller's stream
        const size_t ySize = static_cast<size_t>(frameW) * frameH;
        const size_t uvSize = static_cast<size_t>(frameW) * frameH / 2;
        const size_t nv12Size = ySize + uvSize;
        ensurePinnedBuffer(nv12Size, logger, tag);
        if (!m_pinnedBuf) return result;
        // Probe plane readability before touching the data (capped at 4 KiB).
        if (!isMemoryReadable(effYPlane, std::min(static_cast<size_t>(effYLinesize) * frameH, (size_t)4096)) ||
            !isMemoryReadable(effUvPlane, std::min(static_cast<size_t>(effUvLinesize) * (frameH / 2), (size_t)4096))) {
            return result;
        }
        // Copy Y then UV, de-striding rows when linesize != width.
        uint8_t* dst = static_cast<uint8_t*>(m_pinnedBuf);
        bool cpyOk = true;
        if (effYLinesize == frameW) {
            cpyOk = safeMemcpy(dst, effYPlane, ySize);
        } else {
            for (int row = 0; row < frameH && cpyOk; row++)
                cpyOk = safeMemcpy(dst + row * frameW,
                                   effYPlane + row * effYLinesize, frameW);
        }
        if (cpyOk) {
            uint8_t* uvDst = dst + ySize;
            if (effUvLinesize == frameW) {
                cpyOk = safeMemcpy(uvDst, effUvPlane, uvSize);
            } else {
                for (int row = 0; row < frameH / 2 && cpyOk; row++)
                    cpyOk = safeMemcpy(uvDst + row * frameW,
                                       effUvPlane + row * effUvLinesize, frameW);
            }
        }
        if (!cpyOk) return result;
        // (No registry lock to release — data kept alive by refcount)
        cv::cuda::Stream cvStream = cv::cuda::StreamAccessor::wrapStream(stream);
        cv::Mat pinnedY(frameH, frameW, CV_8UC1, m_pinnedBuf);
        cv::Mat pinnedUV(frameH / 2, frameW, CV_8UC1,
                         static_cast<uint8_t*>(m_pinnedBuf) + ySize);
        gpuY.upload(pinnedY, cvStream);
        gpuUV.upload(pinnedUV, cvStream);
    }
    // Launch CHW kernel directly into the caller's GPU buffer
    if (isFloat32) {
        launchNV12ToRGBResizeCHW_float(
            gpuY.ptr<uint8_t>(), static_cast<int>(gpuY.step),
            gpuUV.ptr<uint8_t>(), static_cast<int>(gpuUV.step),
            static_cast<float*>(dstGpuBuffer),
            inputW, inputH, frameW, frameH, stream);
    } else {
        launchNV12ToRGBResizeCHW_uint8(
            gpuY.ptr<uint8_t>(), static_cast<int>(gpuY.step),
            gpuUV.ptr<uint8_t>(), static_cast<int>(gpuUV.step),
            static_cast<uint8_t*>(dstGpuBuffer),
            inputW, inputH, frameW, frameH, stream);
    }
    // Use polling sync instead of cudaStreamSynchronize to avoid
    // holding nvcuda64 SRW lock continuously (WDDM deadlock prevention).
    {
        cudaError_t err = cudaStreamQuery(stream);
        while (err == cudaErrorNotReady) {
            std::this_thread::yield(); // portable (was Windows-only Sleep(0))
            err = cudaStreamQuery(stream);
        }
        if (err != cudaSuccess) {
            // Kernel/stream failure — clear the sticky error and report
            // failure so the caller falls back to CPU preprocessing.
            cudaGetLastError();
            return result;
        }
    }
    // (No registry lock to release — data kept alive by refcount)
    // Log activation once
    if (!m_nv12ActiveLogged) {
        m_nv12ActiveLogged = true;
        const char* mode = useZeroCopy ? "CUDA zero-copy" : "CPU upload";
        logger.LogInfo(std::string(tag) + "::NV12Helper",
                       std::string(mode) + " ACTIVE: " +
                       std::to_string(frameW) + "x" + std::to_string(frameH) +
                       " NV12 -> " + std::to_string(inputW) + "x" + std::to_string(inputH) +
                       " CHW " + (isFloat32 ? "float32" : "uint8"),
                       __FILE__, __LINE__);
    }
    result.succeeded = true;
    result.metaWidth = static_cast<float>(inputImage.cols > 0 ? inputImage.cols : frameW);
    result.metaHeight = static_cast<float>(inputImage.rows > 0 ? inputImage.rows : frameH);
    return result;
}
} // namespace ANSCENTER