Refactor project structure
This commit is contained in:
970
modules/ANSODEngine/NV12PreprocessHelper.cpp
Normal file
970
modules/ANSODEngine/NV12PreprocessHelper.cpp
Normal file
@@ -0,0 +1,970 @@
|
||||
#include "NV12PreprocessHelper.h"

#include "ANSGpuFrameRegistry.h"
#include "ANSEngineCommon.h"

#include <opencv2/cudaimgproc.hpp>
#include <opencv2/cudawarping.hpp>
#include <opencv2/core/cuda_stream_accessor.hpp>

#include <algorithm>
#include <cstring>
|
||||
|
||||
#ifdef _WIN32
#define WIN32_LEAN_AND_MEAN
#define NOMINMAX
#include <windows.h>
// SEH-protected memcpy: returns false on access violation instead of crashing.
// Used for NV12 plane data which may originate from D3D11VA staging textures.
// NOTE: this function deliberately contains no C++ objects requiring
// unwinding — MSVC forbids mixing __try with such objects in one function.
static bool safeMemcpy(void* dst, const void* src, size_t bytes) {
    __try {
        std::memcpy(dst, src, bytes);
        return true;
    }
    __except (EXCEPTION_EXECUTE_HANDLER) {
        return false;
    }
}
// Check if a memory range is readable (non-crashing probe via VirtualQuery).
// Only the region containing `ptr` is inspected, so this effectively probes
// the first page(s); callers therefore pass a small prefix (e.g. 4 KB) and
// rely on safeMemcpy() to catch faults deeper in the buffer.
static bool isMemoryReadable(const void* ptr, size_t bytes) {
    if (!ptr || bytes == 0) return false;
    MEMORY_BASIC_INFORMATION mbi{};
    if (VirtualQuery(ptr, &mbi, sizeof(mbi)) == 0) return false;
    if (mbi.State != MEM_COMMIT) return false;
    // Guard pages raise an exception on first touch; treat as unreadable.
    if (mbi.Protect & (PAGE_NOACCESS | PAGE_GUARD)) return false;
    const DWORD readFlags = PAGE_READONLY | PAGE_READWRITE | PAGE_EXECUTE_READ | PAGE_EXECUTE_READWRITE;
    return (mbi.Protect & readFlags) != 0;
}
#else
// Non-Windows: no SEH available — plain memcpy, assumed to succeed.
static bool safeMemcpy(void* dst, const void* src, size_t bytes) {
    std::memcpy(dst, src, bytes);
    return true;
}
// Non-Windows: no cheap non-crashing probe — optimistically report readable.
static bool isMemoryReadable(const void*, size_t) { return true; }
#endif
|
||||
|
||||
// ── NV12→RGB CUDA kernel entry points (implemented in nv12_to_rgb.cu) ──
// All launchers run asynchronously on the supplied cudaStream_t.
// Pitches (yPitch/uvPitch/outPitch) are in bytes.

// Plain NV12→RGB colour conversion at source resolution (no resize).
extern "C" void launchNV12ToRGB(
    const uint8_t* yPlane, int yPitch,
    const uint8_t* uvPlane, int uvPitch,
    uint8_t* rgbOut, int outPitch,
    int width, int height,
    cudaStream_t stream);

// Fused NV12→RGB + top-left letterbox: the unpadW x unpadH scaled image
// is placed at (0,0) of the outW x outH canvas; invScale = 1/scale maps
// output pixels back to source coordinates.
extern "C" void launchNV12ToRGBLetterbox(
    const uint8_t* yPlane, int yPitch,
    const uint8_t* uvPlane, int uvPitch,
    uint8_t* rgbOut, int outPitch,
    int outW, int outH,
    int srcW, int srcH,
    int unpadW, int unpadH,
    float invScale,
    cudaStream_t stream);

// Letterbox variant with an explicit placement offset (padLeft, padTop)
// for centred preprocessing (e.g. SCRFD).
extern "C" void launchNV12ToRGBCenterLetterbox(
    const uint8_t* yPlane, int yPitch,
    const uint8_t* uvPlane, int uvPitch,
    uint8_t* rgbOut, int outPitch,
    int outW, int outH,
    int srcW, int srcH,
    int unpadW, int unpadH,
    float invScale,
    int padLeft, int padTop,
    cudaStream_t stream);

// Inverse-affine warp from full-res NV12 to a small BGR patch
// (face alignment); m00..m12 form the 2x3 output→source mapping.
extern "C" void launchNV12AffineWarpToBGR(
    const uint8_t* yPlane, int yPitch,
    const uint8_t* uvPlane, int uvPitch,
    uint8_t* bgrOut, int outPitch,
    int outW, int outH,
    int srcW, int srcH,
    float m00, float m01, float m02,
    float m10, float m11, float m12,
    cudaStream_t stream);

// Fused NV12→RGB + direct stretch resize (no aspect preservation).
extern "C" void launchNV12ToRGBResize(
    const uint8_t* yPlane, int yPitch,
    const uint8_t* uvPlane, int uvPitch,
    uint8_t* rgbOut, int outPitch,
    int outW, int outH,
    int srcW, int srcH,
    cudaStream_t stream);

// As launchNV12ToRGBResize but writing planar CHW uint8 output
// (contiguous, no pitch) for engines with raw tensor buffers.
extern "C" void launchNV12ToRGBResizeCHW_uint8(
    const uint8_t* yPlane, int yPitch,
    const uint8_t* uvPlane, int uvPitch,
    uint8_t* chwOut,
    int outW, int outH,
    int srcW, int srcH,
    cudaStream_t stream);

// As above but writing planar CHW float32 output.
extern "C" void launchNV12ToRGBResizeCHW_float(
    const uint8_t* yPlane, int yPitch,
    const uint8_t* uvPlane, int uvPitch,
    float* chwOut,
    int outW, int outH,
    int srcW, int srcH,
    cudaStream_t stream);

// Fused NV12→BGR + direct stretch resize (BGR channel order, e.g. OCR).
extern "C" void launchNV12ToBGRResize(
    const uint8_t* yPlane, int yPitch,
    const uint8_t* uvPlane, int uvPitch,
    uint8_t* bgrOut, int outPitch,
    int outW, int outH,
    int srcW, int srcH,
    cudaStream_t stream);

// Crop an outW x outH rectangle at (cropX, cropY) from full-res NV12 and
// convert it to BGR in one pass.
extern "C" void launchNV12CropToBGR(
    const uint8_t* yPlane, int yPitch,
    const uint8_t* uvPlane, int uvPitch,
    uint8_t* bgrOut, int outPitch,
    int outW, int outH,
    int cropX, int cropY,
    int srcW, int srcH,
    cudaStream_t stream);
|
||||
|
||||
namespace ANSCENTER {
|
||||
|
||||
// Destructor — releases the pinned host staging buffer via destroy().
NV12PreprocessHelper::~NV12PreprocessHelper() {
    destroy();
}
|
||||
|
||||
void NV12PreprocessHelper::destroy() {
|
||||
if (m_pinnedBuf) {
|
||||
cudaFreeHost(m_pinnedBuf);
|
||||
m_pinnedBuf = nullptr;
|
||||
m_pinnedBufSize = 0;
|
||||
}
|
||||
}
|
||||
|
||||
// Grow the pinned host staging buffer to at least `bytes`.
//
// Growth-only: a buffer that is already large enough is reused as-is.
// On success m_pinnedBuf/m_pinnedBufSize describe the new buffer; on
// allocation failure both are reset to null/0 and a warning is logged
// once per instance (m_nv12PinnedLogged). Callers must re-check
// m_pinnedBuf after calling.
void NV12PreprocessHelper::ensurePinnedBuffer(size_t bytes, SPDLogger& logger, const char* tag) {
    if (bytes <= m_pinnedBufSize) return;  // current buffer already big enough
    if (m_pinnedBuf) {
        cudaFreeHost(m_pinnedBuf);
        m_pinnedBuf = nullptr;
        m_pinnedBufSize = 0;
    }
    // Portable pinned memory so the buffer is usable from any CUDA context.
    cudaError_t err = cudaHostAlloc(&m_pinnedBuf, bytes, cudaHostAllocPortable);
    if (err != cudaSuccess) {
        // Clear the failure from the runtime's last-error slot so it is not
        // later misattributed to an unrelated call (isCudaContextHealthy()
        // reads cudaGetLastError() as the "pending" error).
        (void)cudaGetLastError();
        if (!m_nv12PinnedLogged) {
            m_nv12PinnedLogged = true;  // warn only once per instance
            logger.LogWarn(std::string(tag) + "::NV12Helper",
                "cudaHostAlloc failed (" + std::string(cudaGetErrorString(err)) +
                ") for " + std::to_string(bytes) + " bytes",
                __FILE__, __LINE__);
        }
        m_pinnedBuf = nullptr;
        m_pinnedBufSize = 0;
        return;
    }
    m_pinnedBufSize = bytes;
}
|
||||
|
||||
// Probe whether the CUDA context is still usable.
//
// Returns true when the default stream answers cudaSuccess or
// cudaErrorNotReady. After CUDA_FAIL_LIMIT consecutive failures the
// context is marked permanently dead (m_cudaContextDead) and GPU
// preprocessing is disabled for this instance; from then on this
// returns false immediately.
bool NV12PreprocessHelper::isCudaContextHealthy(SPDLogger& logger, const char* tag) {
    if (m_cudaContextDead) return false;

    // Deliberately read-and-clear any pending error first so the stream
    // query below reflects the context state, not an earlier sticky error.
    cudaError_t pending = cudaGetLastError();
    // Query the legacy default stream as a cheap liveness check.
    cudaError_t err = cudaStreamQuery(nullptr);
    if (err == cudaSuccess || err == cudaErrorNotReady) {
        m_cudaFailStreak = 0;  // healthy — reset the failure counter
        return true;
    }

    ++m_cudaFailStreak;
    if (m_cudaFailStreak >= CUDA_FAIL_LIMIT && !m_cudaContextDead) {
        m_cudaContextDead = true;
        logger.LogFatal(std::string(tag) + "::NV12Helper",
            "CUDA context permanently corrupted after " +
            std::to_string(m_cudaFailStreak) +
            " consecutive failures (last error: " +
            std::string(cudaGetErrorString(err)) +
            ", pending: " + std::string(cudaGetErrorString(pending)) +
            "). GPU preprocessing DISABLED for this engine instance. "
            "This is typically caused by NVDEC/CUVID surface pool exhaustion "
            "(Error mapping a picture with CUVID: 205). "
            "Reduce the number of concurrent HW-decoded 4K streams or "
            "restart the application to recover.",
            __FILE__, __LINE__);
    }
    return false;
}
|
||||
|
||||
// ====================================================================
// tryNV12 — attempt NV12 fast-path preprocessing
//
// Looks up the current thread's decoded frame and, when possible,
// converts NV12 → RGB at model input resolution entirely on the GPU via
// `launcher`. Fails soft: any early return leaves result.succeeded
// false and the caller falls back to the BGR/CPU path.
//
// Paths, in order:
//   - pixelFormat 1000: frame is already full-res BGR — clone it and
//     return scale factors relative to the display-res inputImage.
//   - pixelFormat 23 (AV_PIX_FMT_NV12), same GPU: zero-copy wrap of the
//     decoder's device pointers.
//   - NV12, different GPU with CPU snapshot: pinned-memory copy + upload.
//   - anything else: fall back to BGR.
// ====================================================================
NV12Result NV12PreprocessHelper::tryNV12(
    const cv::Mat& inputImage, int inferenceGpu,
    int inputW, int inputH,
    const NV12KernelLauncher& launcher,
    SPDLogger& logger, const char* tag) {
    NV12Result result;

    // Skip during warmup phase
    if (m_inferenceCount < NV12_WARMUP_THRESHOLD) {
        return result;
    }

    // Get NV12 frame data from thread-local (set by RunInferenceComplete_LV)
    GpuFrameData* gpuData = tl_currentGpuFrame();

    if (!gpuData || !gpuData->yPlane) {
        return result; // no NV12 data — caller uses BGR
    }

    // ── BGR full-res path (pixFmt=1000 from ANSVideoPlayer) ──────────
    if (gpuData->pixelFormat == 1000) {
        // Wrap the caller-owned BGR buffer without copying (yPlane doubles
        // as the packed BGR pointer in this mode).
        cv::Mat fullResImg(gpuData->height, gpuData->width, CV_8UC3,
                           gpuData->yPlane, gpuData->yLinesize);
        if (!m_bgrFullResLogged) {
            m_bgrFullResLogged = true;  // one-shot diagnostic
            logger.LogInfo(std::string(tag) + "::NV12Helper",
                "BGR full-res ACTIVE: " + std::to_string(gpuData->width) + "x" +
                std::to_string(gpuData->height) + " -> inference (display=" +
                std::to_string(inputImage.cols) + "x" + std::to_string(inputImage.rows) + ")",
                __FILE__, __LINE__);
        }
        result.useBgrFullRes = true;
        result.bgrFullResImg = fullResImg.clone(); // clone under lock
        // Record display/full-res scale factors only when sizes differ.
        if (inputImage.cols > 0 && inputImage.rows > 0 &&
            (fullResImg.cols != inputImage.cols || fullResImg.rows != inputImage.rows)) {
            result.bgrFullResScaleX = static_cast<float>(inputImage.cols) / static_cast<float>(fullResImg.cols);
            result.bgrFullResScaleY = static_cast<float>(inputImage.rows) / static_cast<float>(fullResImg.rows);
        }
        // (No registry lock — data kept alive by refcount)
        return result;
    }

    // ── NV12 path (pixFmt=23 = AV_PIX_FMT_NV12) ────────────────────
    if (gpuData->pixelFormat != 23) {
        return result; // unknown format — fall back to BGR
    }

    if (!gpuData->uvPlane) {
        if (!m_nv12NullLogged) {
            m_nv12NullLogged = true;
            logger.LogWarn(std::string(tag) + "::NV12Helper",
                "Early exit: null uvPlane",
                __FILE__, __LINE__);
        }
        return result;
    }

    const int frameW = gpuData->width;
    const int frameH = gpuData->height;

    if (frameW <= 0 || frameH <= 0) {
        if (!m_nv12DimLogged) {
            m_nv12DimLogged = true;
            logger.LogWarn(std::string(tag) + "::NV12Helper",
                "Early exit: bad dimensions — w=" + std::to_string(frameW) +
                " h=" + std::to_string(frameH),
                __FILE__, __LINE__);
        }
        return result;
    }

    if (m_cudaContextDead) {
        if (!m_nv12DeadLogged) {
            m_nv12DeadLogged = true;
            logger.LogWarn(std::string(tag) + "::NV12Helper",
                "Early exit: CUDA context dead",
                __FILE__, __LINE__);
        }
        return result;
    }

    // Zero-copy is only valid when the decode GPU matches the inference GPU
    // (or the decoder did not report a GPU index).
    const bool isCudaDevice = gpuData->isCudaDevicePtr;
    const bool gpuMatch = !isCudaDevice ||
        gpuData->gpuIndex < 0 ||
        gpuData->gpuIndex == inferenceGpu;
    const bool useZeroCopy = isCudaDevice && gpuMatch;

    // Effective plane pointers — for zero-copy, use CUDA device ptrs;
    // for CPU upload, use the CPU snapshot buffers.
    uint8_t* effYPlane;
    uint8_t* effUvPlane;
    int effYLinesize;
    int effUvLinesize;

    if (useZeroCopy) {
        // Same GPU: wrap NVDEC device pointers directly
        effYPlane = gpuData->yPlane;
        effUvPlane = gpuData->uvPlane;
        effYLinesize = gpuData->yLinesize;
        effUvLinesize = gpuData->uvLinesize;
    } else if (isCudaDevice && !gpuMatch) {
        // Cross-GPU: can't use CUDA device ptrs from another GPU context.
        // Fall through to CPU upload using the CPU snapshot buffers.
        if (gpuData->cpuYPlane && gpuData->cpuUvPlane) {
            effYPlane = gpuData->cpuYPlane;
            effUvPlane = gpuData->cpuUvPlane;
            effYLinesize = gpuData->cpuYLinesize;
            effUvLinesize = gpuData->cpuUvLinesize;
            if (!m_gpuMismatchLogged) {
                m_gpuMismatchLogged = true;
                logger.LogInfo(std::string(tag) + "::NV12Helper",
                    "GPU mismatch (decode GPU " + std::to_string(gpuData->gpuIndex) +
                    " vs inference GPU " + std::to_string(inferenceGpu) +
                    ") — using CPU NV12 upload fallback",
                    __FILE__, __LINE__);
            }
        } else {
            // No CPU fallback available — fall back to BGR
            if (!m_gpuMismatchLogged) {
                m_gpuMismatchLogged = true;
                logger.LogInfo(std::string(tag) + "::NV12Helper",
                    "GPU mismatch (decode GPU " + std::to_string(gpuData->gpuIndex) +
                    " vs inference GPU " + std::to_string(inferenceGpu) +
                    ") — no CPU fallback, using BGR",
                    __FILE__, __LINE__);
            }
            return result;
        }
    } else {
        // Not CUDA device ptrs: use whatever yPlane/uvPlane point to (CPU data)
        effYPlane = gpuData->yPlane;
        effUvPlane = gpuData->uvPlane;
        effYLinesize = gpuData->yLinesize;
        effUvLinesize = gpuData->uvLinesize;
    }

    // Log path selection once
    if (!m_nv12PathLogged) {
        m_nv12PathLogged = true;
        const char* pathName = useZeroCopy ? "CUDA_ZERO_COPY" : "CPU_NV12_UPLOAD";
        logger.LogInfo(std::string(tag) + "::NV12Helper",
            std::string("Path: ") + pathName +
            " | isCuda=" + std::to_string(isCudaDevice) +
            " gpuMatch=" + std::to_string(gpuMatch) +
            " decodeGpu=" + std::to_string(gpuData->gpuIndex) +
            " infGpu=" + std::to_string(inferenceGpu) +
            " frame=" + std::to_string(frameW) + "x" + std::to_string(frameH),
            __FILE__, __LINE__);
    }

    cv::cuda::Stream stream;
    cv::cuda::GpuMat gpuY, gpuUV;

    if (useZeroCopy) {
        // CUDA zero-copy: wrap NVDEC device pointers directly.
        // UV is interleaved, so it is wrapped as a full-width single-channel
        // mat of half height — the kernel interprets the byte pairs.
        gpuY = cv::cuda::GpuMat(frameH, frameW, CV_8UC1,
            effYPlane, static_cast<size_t>(effYLinesize));
        gpuUV = cv::cuda::GpuMat(frameH / 2, frameW, CV_8UC1,
            effUvPlane, static_cast<size_t>(effUvLinesize));
    } else {
        // CPU path: memcpy + upload
        const size_t ySize = static_cast<size_t>(frameW) * frameH;
        const size_t uvSize = static_cast<size_t>(frameW) * frameH / 2;
        const size_t nv12Size = ySize + uvSize;
        ensurePinnedBuffer(nv12Size, logger, tag);
        if (!m_pinnedBuf) {
            return result;  // pinned allocation failed — BGR fallback
        }

        // Validate NV12 plane pointers before memcpy
        const size_t yBufNeeded = (effYLinesize == frameW)
            ? ySize
            : static_cast<size_t>(effYLinesize) * frameH;
        const size_t uvBufNeeded = (effUvLinesize == frameW)
            ? uvSize
            : static_cast<size_t>(effUvLinesize) * (frameH / 2);

        // Probe only the first 4 KB — isMemoryReadable inspects the region
        // containing the pointer; deeper faults are caught by safeMemcpy.
        if (!isMemoryReadable(effYPlane, std::min(yBufNeeded, (size_t)4096)) ||
            !isMemoryReadable(effUvPlane, std::min(uvBufNeeded, (size_t)4096))) {
            logger.LogWarn(std::string(tag) + "::NV12Helper",
                "NV12 plane pointers not readable — falling back to BGR",
                __FILE__, __LINE__);
            return result;
        }

        // Copy Y then UV into one contiguous pinned block, tightly packed
        // (row stride frameW), row-by-row when the source has padding.
        uint8_t* dst = static_cast<uint8_t*>(m_pinnedBuf);
        bool cpyOk = true;
        if (effYLinesize == frameW) {
            cpyOk = safeMemcpy(dst, effYPlane, ySize);
        } else {
            for (int row = 0; row < frameH && cpyOk; row++)
                cpyOk = safeMemcpy(dst + row * frameW,
                    effYPlane + row * effYLinesize, frameW);
        }
        if (cpyOk) {
            uint8_t* uvDst = dst + ySize;
            if (effUvLinesize == frameW) {
                cpyOk = safeMemcpy(uvDst, effUvPlane, uvSize);
            } else {
                for (int row = 0; row < frameH / 2 && cpyOk; row++)
                    cpyOk = safeMemcpy(uvDst + row * frameW,
                        effUvPlane + row * effUvLinesize, frameW);
            }
        }

        if (!cpyOk) {
            logger.LogWarn(std::string(tag) + "::NV12Helper",
                "Access violation during NV12 memcpy — falling back to BGR",
                __FILE__, __LINE__);
            return result;
        }

        // NV12 data safely in pinned memory — release registry lock
        // (No registry lock to release — data kept alive by refcount)

        // Async upload from pinned memory on the per-call stream.
        cv::Mat pinnedY(frameH, frameW, CV_8UC1, m_pinnedBuf);
        cv::Mat pinnedUV(frameH / 2, frameW, CV_8UC1,
            static_cast<uint8_t*>(m_pinnedBuf) + ySize);
        gpuY.upload(pinnedY, stream);
        gpuUV.upload(pinnedUV, stream);
    }

    // Use display dimensions for coordinate mapping
    const float metaW = static_cast<float>(inputImage.cols > 0 ? inputImage.cols : frameW);
    const float metaH = static_cast<float>(inputImage.rows > 0 ? inputImage.rows : frameH);

    if (metaH <= 0 || metaW <= 0) {
        if (!m_nv12MetaLogged) {
            m_nv12MetaLogged = true;
            logger.LogWarn(std::string(tag) + "::NV12Helper",
                "Early exit: metadata dims invalid",
                __FILE__, __LINE__);
        }
        return result;
    }

    result.metaWidth = metaW;
    result.metaHeight = metaH;
    // ratio maps model-input coordinates back to display coordinates
    // (inverse of the letterbox scale).
    result.ratio = 1.f / std::min(
        static_cast<float>(inputW) / metaW,
        static_cast<float>(inputH) / metaH);

    // Launch engine-specific kernel
    cv::cuda::GpuMat gpuResized;
    gpuResized.create(inputH, inputW, CV_8UC3);

    cudaStream_t rawStream = cv::cuda::StreamAccessor::getStream(stream);
    launcher(gpuY, gpuUV, gpuResized, frameW, frameH, inputW, inputH, rawStream);

    // Block until the kernel (and any pending uploads) finish — the NVDEC
    // frame must not be released while the kernel may still read it.
    stream.waitForCompletion();

    // (No registry lock to release — data kept alive by refcount)

    // Log NV12 fast-path usage once per instance
    if (!m_nv12ActiveLogged) {
        m_nv12ActiveLogged = true;
        const char* mode = useZeroCopy ? "CUDA zero-copy" : "CPU upload";
        logger.LogInfo(std::string(tag) + "::NV12Helper",
            std::string(mode) + " ACTIVE: " +
            std::to_string(frameW) + "x" + std::to_string(frameH) +
            " NV12 -> " + std::to_string(inputW) + "x" + std::to_string(inputH) +
            " display=" + std::to_string(static_cast<int>(metaW)) + "x" +
            std::to_string(static_cast<int>(metaH)),
            __FILE__, __LINE__);
    }

    result.succeeded = true;
    result.gpuRGB = std::move(gpuResized);
    return result;
}
|
||||
|
||||
// ====================================================================
|
||||
// Default YOLO kernel launcher
|
||||
//
|
||||
// For detection/seg/pose/OBB: fused NV12→RGB + letterbox in a single
|
||||
// kernel at model input resolution (e.g. 640x640).
|
||||
// For classification: separate NV12→RGB at full res, then cuda::resize.
|
||||
// (Classification detection is not done here — the caller's engine
|
||||
// should set up the appropriate launcher if needed. This default
|
||||
// always uses the fused letterbox path, which works for all YOLO
|
||||
// detection/seg/pose/OBB tasks.)
|
||||
// ====================================================================
|
||||
NV12KernelLauncher NV12PreprocessHelper::defaultYOLOLauncher() {
|
||||
return [](const cv::cuda::GpuMat& gpuY, const cv::cuda::GpuMat& gpuUV,
|
||||
cv::cuda::GpuMat& gpuOut, int srcW, int srcH,
|
||||
int inputW, int inputH, cudaStream_t stream) {
|
||||
if (srcW == inputW && srcH == inputH) {
|
||||
// Source matches model input — direct NV12→RGB, no resize needed
|
||||
launchNV12ToRGB(
|
||||
gpuY.ptr<uint8_t>(), static_cast<int>(gpuY.step),
|
||||
gpuUV.ptr<uint8_t>(), static_cast<int>(gpuUV.step),
|
||||
gpuOut.ptr<uint8_t>(), static_cast<int>(gpuOut.step),
|
||||
srcW, srcH, stream);
|
||||
} else {
|
||||
// Fused NV12→RGB + letterbox resize
|
||||
float r = std::min(static_cast<float>(inputW) / srcW,
|
||||
static_cast<float>(inputH) / srcH);
|
||||
int unpadW = static_cast<int>(r * srcW);
|
||||
int unpadH = static_cast<int>(r * srcH);
|
||||
float invScale = 1.0f / r;
|
||||
|
||||
launchNV12ToRGBLetterbox(
|
||||
gpuY.ptr<uint8_t>(), static_cast<int>(gpuY.step),
|
||||
gpuUV.ptr<uint8_t>(), static_cast<int>(gpuUV.step),
|
||||
gpuOut.ptr<uint8_t>(), static_cast<int>(gpuOut.step),
|
||||
inputW, inputH,
|
||||
srcW, srcH,
|
||||
unpadW, unpadH,
|
||||
invScale, stream);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
// ====================================================================
|
||||
// Classification launcher (direct resize, no letterbox)
|
||||
// ====================================================================
|
||||
NV12KernelLauncher NV12PreprocessHelper::classificationLauncher() {
|
||||
return [](const cv::cuda::GpuMat& gpuY, const cv::cuda::GpuMat& gpuUV,
|
||||
cv::cuda::GpuMat& gpuOut, int srcW, int srcH,
|
||||
int inputW, int inputH, cudaStream_t stream) {
|
||||
if (srcW == inputW && srcH == inputH) {
|
||||
// Source matches model input — direct NV12→RGB, no resize needed
|
||||
launchNV12ToRGB(
|
||||
gpuY.ptr<uint8_t>(), static_cast<int>(gpuY.step),
|
||||
gpuUV.ptr<uint8_t>(), static_cast<int>(gpuUV.step),
|
||||
gpuOut.ptr<uint8_t>(), static_cast<int>(gpuOut.step),
|
||||
srcW, srcH, stream);
|
||||
} else {
|
||||
// Fused NV12→RGB + direct resize (stretch to fill, no padding)
|
||||
launchNV12ToRGBResize(
|
||||
gpuY.ptr<uint8_t>(), static_cast<int>(gpuY.step),
|
||||
gpuUV.ptr<uint8_t>(), static_cast<int>(gpuUV.step),
|
||||
gpuOut.ptr<uint8_t>(), static_cast<int>(gpuOut.step),
|
||||
inputW, inputH,
|
||||
srcW, srcH, stream);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
// ====================================================================
// NV12→BGR fused resize (for OCR detection)
//
// Thin forwarding wrapper around launchNV12ToBGRResize: converts an
// NV12 frame (device pointers devY/devUV, pitches in bytes) to a BGR
// image of outW x outH, asynchronously on `stream`. The caller is
// responsible for synchronizing the stream before reading bgrOut.
// ====================================================================
void NV12PreprocessHelper::nv12ToBGRResize(
    const uint8_t* devY, int yPitch,
    const uint8_t* devUV, int uvPitch,
    uint8_t* bgrOut, int outPitch,
    int outW, int outH, int srcW, int srcH,
    cudaStream_t stream) {
    launchNV12ToBGRResize(devY, yPitch, devUV, uvPitch,
        bgrOut, outPitch, outW, outH, srcW, srcH, stream);
}
|
||||
|
||||
// SCRFD center-padded letterbox launcher
|
||||
// ====================================================================
|
||||
NV12KernelLauncher NV12PreprocessHelper::scrfdCenterLetterboxLauncher(int padLeft, int padTop) {
|
||||
return [padLeft, padTop](const cv::cuda::GpuMat& gpuY, const cv::cuda::GpuMat& gpuUV,
|
||||
cv::cuda::GpuMat& gpuOut, int srcW, int srcH,
|
||||
int inputW, int inputH, cudaStream_t stream) {
|
||||
float r = std::min(static_cast<float>(inputW) / srcW,
|
||||
static_cast<float>(inputH) / srcH);
|
||||
int unpadW = static_cast<int>(r * srcW);
|
||||
int unpadH = static_cast<int>(r * srcH);
|
||||
float invScale = 1.0f / r;
|
||||
|
||||
launchNV12ToRGBCenterLetterbox(
|
||||
gpuY.ptr<uint8_t>(), static_cast<int>(gpuY.step),
|
||||
gpuUV.ptr<uint8_t>(), static_cast<int>(gpuUV.step),
|
||||
gpuOut.ptr<uint8_t>(), static_cast<int>(gpuOut.step),
|
||||
inputW, inputH,
|
||||
srcW, srcH,
|
||||
unpadW, unpadH,
|
||||
invScale,
|
||||
padLeft, padTop,
|
||||
stream);
|
||||
};
|
||||
}
|
||||
|
||||
// ====================================================================
// NV12 affine warp for face alignment
//
// Reads full-res NV12 from the registry, applies inverse affine
// transform via CUDA kernel, outputs a small aligned face in BGR.
// The affine matrix is computed on CPU (from detected landmarks to
// ArcFace template), then scaled to map to full-res NV12 source.
//
// Fails soft (result.succeeded false) during warmup, on an empty
// matrix, missing/invalid NV12 data, dead CUDA context, or cross-GPU
// frames — the caller then uses CPU warpAffine instead. scaleX/scaleY
// map display-resolution landmark coordinates to full-res NV12.
// ====================================================================
NV12PreprocessHelper::NV12AffineResult NV12PreprocessHelper::tryNV12AffineWarp(
    const cv::Mat& inputImage, int inferenceGpu,
    const cv::Mat& affineMatrix, int outW, int outH,
    float scaleX, float scaleY,
    SPDLogger& logger, const char* tag) {
    NV12AffineResult result;

    // Skip during warmup
    if (m_inferenceCount < NV12_WARMUP_THRESHOLD) {
        return result;
    }

    if (affineMatrix.empty()) {
        return result;
    }

    // Get NV12 frame data from thread-local (set by RunInferenceComplete_LV)
    GpuFrameData* gpuData = tl_currentGpuFrame();

    if (!gpuData || !gpuData->yPlane || !gpuData->uvPlane || gpuData->pixelFormat != 23) {
        return result;
    }

    const int frameW = gpuData->width;
    const int frameH = gpuData->height;
    if (frameW <= 0 || frameH <= 0 || m_cudaContextDead) {
        return result;
    }

    const bool isCudaDevice = gpuData->isCudaDevicePtr;
    const bool gpuMatch = !isCudaDevice ||
        gpuData->gpuIndex < 0 ||
        gpuData->gpuIndex == inferenceGpu;

    if (isCudaDevice && !gpuMatch) {
        return result; // cross-GPU — fall back to CPU warpAffine
    }

    const bool useZeroCopy = isCudaDevice && gpuMatch;
    uint8_t* effYPlane = gpuData->yPlane;
    uint8_t* effUvPlane = gpuData->uvPlane;
    int effYLinesize = gpuData->yLinesize;
    int effUvLinesize = gpuData->uvLinesize;

    // Compute inverse affine matrix and scale for full-res NV12.
    // The kernel maps OUTPUT pixels back to SOURCE coordinates, hence
    // the inversion before scaling.
    cv::Mat M_inv;
    cv::invertAffineTransform(affineMatrix, M_inv);
    M_inv.convertTo(M_inv, CV_64F);

    // Scale inverse affine to map from 112x112 output → full-res NV12 source
    // Row 0 maps to X (multiply by scaleX), Row 1 maps to Y (multiply by scaleY)
    double* row0 = M_inv.ptr<double>(0);
    double* row1 = M_inv.ptr<double>(1);
    row0[0] *= scaleX; row0[1] *= scaleX; row0[2] *= scaleX;
    row1[0] *= scaleY; row1[1] *= scaleY; row1[2] *= scaleY;

    // Flatten the 2x3 matrix into the kernel's scalar parameters.
    float m00 = static_cast<float>(row0[0]);
    float m01 = static_cast<float>(row0[1]);
    float m02 = static_cast<float>(row0[2]);
    float m10 = static_cast<float>(row1[0]);
    float m11 = static_cast<float>(row1[1]);
    float m12 = static_cast<float>(row1[2]);

    cv::cuda::Stream stream;
    cv::cuda::GpuMat gpuY, gpuUV;

    if (useZeroCopy) {
        // Wrap NVDEC device pointers directly (no copy).
        gpuY = cv::cuda::GpuMat(frameH, frameW, CV_8UC1,
            effYPlane, static_cast<size_t>(effYLinesize));
        gpuUV = cv::cuda::GpuMat(frameH / 2, frameW, CV_8UC1,
            effUvPlane, static_cast<size_t>(effUvLinesize));
    } else {
        // CPU path: memcpy + upload
        const size_t ySize = static_cast<size_t>(frameW) * frameH;
        const size_t uvSize = static_cast<size_t>(frameW) * frameH / 2;
        const size_t nv12Size = ySize + uvSize;
        ensurePinnedBuffer(nv12Size, logger, tag);
        if (!m_pinnedBuf) {
            return result;
        }

        // Pack Y then UV contiguously (stride frameW), row-by-row when
        // the source rows are padded.
        uint8_t* dst = static_cast<uint8_t*>(m_pinnedBuf);
        bool cpyOk = true;
        if (effYLinesize == frameW) {
            cpyOk = safeMemcpy(dst, effYPlane, ySize);
        } else {
            for (int row = 0; row < frameH && cpyOk; row++)
                cpyOk = safeMemcpy(dst + row * frameW,
                    effYPlane + row * effYLinesize, frameW);
        }
        if (cpyOk) {
            uint8_t* uvDst = dst + ySize;
            if (effUvLinesize == frameW) {
                cpyOk = safeMemcpy(uvDst, effUvPlane, uvSize);
            } else {
                for (int row = 0; row < frameH / 2 && cpyOk; row++)
                    cpyOk = safeMemcpy(uvDst + row * frameW,
                        effUvPlane + row * effUvLinesize, frameW);
            }
        }

        if (!cpyOk) {
            return result;  // access violation mid-copy — soft fail
        }

        // (No registry lock to release — data kept alive by refcount)

        cv::Mat pinnedY(frameH, frameW, CV_8UC1, m_pinnedBuf);
        cv::Mat pinnedUV(frameH / 2, frameW, CV_8UC1,
            static_cast<uint8_t*>(m_pinnedBuf) + ySize);
        gpuY.upload(pinnedY, stream);
        gpuUV.upload(pinnedUV, stream);
    }

    // Launch affine warp kernel (outputs BGR 112x112)
    cv::cuda::GpuMat gpuFace(outH, outW, CV_8UC3);
    cudaStream_t rawStream = cv::cuda::StreamAccessor::getStream(stream);

    launchNV12AffineWarpToBGR(
        gpuY.ptr<uint8_t>(), static_cast<int>(gpuY.step),
        gpuUV.ptr<uint8_t>(), static_cast<int>(gpuUV.step),
        gpuFace.ptr<uint8_t>(), static_cast<int>(gpuFace.step),
        outW, outH,
        frameW, frameH,
        m00, m01, m02, m10, m11, m12,
        rawStream);

    // Synchronize before the source frame can be released by the caller.
    stream.waitForCompletion();

    // (No registry lock to release — data kept alive by refcount)

    // Keep GPU copy for recognizer (avoids re-upload), download CPU copy for mask
    result.gpuAlignedFace = gpuFace;
    gpuFace.download(result.alignedFaceBGR);
    result.succeeded = true;
    return result;
}
|
||||
|
||||
// ── NV12 rectangular crop → BGR ────────────────────────────────────
|
||||
NV12PreprocessHelper::NV12CropResult NV12PreprocessHelper::tryNV12CropToBGR(
|
||||
const cv::Mat& inputImage, int inferenceGpu,
|
||||
const cv::Rect& bbox, int padding,
|
||||
float scaleX, float scaleY,
|
||||
SPDLogger& logger, const char* tag)
|
||||
{
|
||||
NV12CropResult result;
|
||||
|
||||
// Warmup gating
|
||||
if (m_inferenceCount < NV12_WARMUP_THRESHOLD)
|
||||
return result;
|
||||
|
||||
// CUDA health check
|
||||
if (m_cudaContextDead)
|
||||
return result;
|
||||
|
||||
// Get NV12 frame data from thread-local (set by RunInferenceComplete_LV)
|
||||
auto* gpuData = tl_currentGpuFrame();
|
||||
if (!gpuData || gpuData->pixelFormat != 23)
|
||||
return result; // no NV12 data
|
||||
|
||||
if (!gpuData->isCudaDevicePtr || !gpuData->yPlane || !gpuData->uvPlane)
|
||||
return result; // NV12 not on GPU
|
||||
|
||||
const int frameW = gpuData->width;
|
||||
const int frameH = gpuData->height;
|
||||
|
||||
// Scale bbox from display-res to full-res NV12 coords with padding
|
||||
const int fx1 = std::max(0, static_cast<int>((bbox.x - padding) * scaleX));
|
||||
const int fy1 = std::max(0, static_cast<int>((bbox.y - padding) * scaleY));
|
||||
const int fx2 = std::min(frameW, static_cast<int>((bbox.x + bbox.width + padding) * scaleX));
|
||||
const int fy2 = std::min(frameH, static_cast<int>((bbox.y + bbox.height + padding) * scaleY));
|
||||
const int cropW = fx2 - fx1;
|
||||
const int cropH = fy2 - fy1;
|
||||
|
||||
if (cropW < 5 || cropH < 5)
|
||||
return result;
|
||||
|
||||
// GPU device matching
|
||||
int currentDev = 0;
|
||||
cudaGetDevice(¤tDev);
|
||||
if (inferenceGpu >= 0 && currentDev != inferenceGpu)
|
||||
cudaSetDevice(inferenceGpu);
|
||||
|
||||
// Wrap existing GPU NV12 planes
|
||||
cv::cuda::GpuMat gpuY(frameH, frameW, CV_8UC1,
|
||||
gpuData->yPlane, gpuData->yLinesize);
|
||||
cv::cuda::GpuMat gpuUV(frameH / 2, frameW / 2, CV_8UC2,
|
||||
gpuData->uvPlane, gpuData->uvLinesize);
|
||||
|
||||
// Allocate output on GPU
|
||||
cv::cuda::GpuMat gpuCrop(cropH, cropW, CV_8UC3);
|
||||
|
||||
// Launch crop kernel
|
||||
cv::cuda::Stream stream;
|
||||
cudaStream_t rawStream = cv::cuda::StreamAccessor::getStream(stream);
|
||||
|
||||
launchNV12CropToBGR(
|
||||
gpuY.ptr<uint8_t>(), static_cast<int>(gpuY.step),
|
||||
gpuUV.ptr<uint8_t>(), static_cast<int>(gpuUV.step),
|
||||
gpuCrop.ptr<uint8_t>(), static_cast<int>(gpuCrop.step),
|
||||
cropW, cropH,
|
||||
fx1, fy1,
|
||||
frameW, frameH,
|
||||
rawStream);
|
||||
|
||||
stream.waitForCompletion();
|
||||
|
||||
// (No registry lock to release — data kept alive by refcount)
|
||||
|
||||
// Download small crop to CPU
|
||||
gpuCrop.download(result.bgrCrop);
|
||||
result.succeeded = true;
|
||||
|
||||
// One-shot diagnostic log
|
||||
if (!m_nv12CropLogged) {
|
||||
m_nv12CropLogged = true;
|
||||
logger.LogInfo(tag,
|
||||
"NV12 GPU crop ACTIVE: " + std::to_string(cropW) + "x" + std::to_string(cropH) +
|
||||
" BGR cropped from " + std::to_string(frameW) + "x" + std::to_string(frameH) +
|
||||
" NV12 (display=" + std::to_string(inputImage.cols) + "x" + std::to_string(inputImage.rows) +
|
||||
" scaleX=" + std::to_string(scaleX) + " scaleY=" + std::to_string(scaleY) + ")",
|
||||
__FILE__, __LINE__);
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
// ====================================================================
|
||||
// tryNV12DirectToBuffer — write NV12→RGB CHW directly into TRT buffer
|
||||
//
|
||||
// For engines that manage their own GPU buffers (e.g. SAM3 with raw
|
||||
// TRT enqueueV3). Same registry lookup / GPU matching / safety logic
|
||||
// as tryNV12(), but writes CHW planar output into a pre-allocated
|
||||
// void* GPU buffer instead of allocating an HWC GpuMat.
|
||||
// ====================================================================
|
||||
NV12PreprocessHelper::NV12DirectResult NV12PreprocessHelper::tryNV12DirectToBuffer(
    const cv::Mat& inputImage, int inferenceGpu,
    void* dstGpuBuffer, int inputW, int inputH,
    bool isFloat32,
    cudaStream_t stream,
    SPDLogger& logger, const char* tag) {
    NV12DirectResult result;

    // Skip during warmup phase: early frames always take the caller's CPU
    // path before the GPU fast path is enabled.
    if (m_inferenceCount < NV12_WARMUP_THRESHOLD) {
        return result;
    }

    if (!dstGpuBuffer) return result;

    // Get NV12 frame data from thread-local (set by RunInferenceComplete_LV)
    GpuFrameData* gpuData = tl_currentGpuFrame();

    if (!gpuData || !gpuData->yPlane) {
        return result; // no NV12 data — caller uses CPU path
    }

    // BGR full-res path not applicable for direct-to-buffer (SAM3 handles its own resize)
    if (gpuData->pixelFormat == 1000) {
        return result;
    }

    // NV12 path only
    if (gpuData->pixelFormat != 23) {
        return result;
    }

    if (!gpuData->uvPlane) {
        return result;
    }

    const int frameW = gpuData->width;
    const int frameH = gpuData->height;

    if (frameW <= 0 || frameH <= 0 || m_cudaContextDead) {
        return result;
    }

    // Zero-copy is only safe when the planes are CUDA device pointers on the
    // same GPU the inference engine runs on (or the producer left the GPU
    // index untagged, i.e. gpuIndex < 0).
    const bool isCudaDevice = gpuData->isCudaDevicePtr;
    const bool gpuMatch = !isCudaDevice ||
                          gpuData->gpuIndex < 0 ||
                          gpuData->gpuIndex == inferenceGpu;
    const bool useZeroCopy = isCudaDevice && gpuMatch;

    if (isCudaDevice && !gpuMatch) {
        return result; // cross-GPU — fall back to CPU path
    }

    uint8_t* effYPlane = gpuData->yPlane;
    uint8_t* effUvPlane = gpuData->uvPlane;
    int effYLinesize = gpuData->yLinesize;
    int effUvLinesize = gpuData->uvLinesize;

    // Log path selection once
    if (!m_nv12PathLogged) {
        m_nv12PathLogged = true;
        const char* pathName = useZeroCopy ? "CUDA_ZERO_COPY" : "CPU_NV12_UPLOAD";
        logger.LogInfo(std::string(tag) + "::NV12Helper",
            std::string("Path: ") + pathName +
            " | isCuda=" + std::to_string(isCudaDevice) +
            " gpuMatch=" + std::to_string(gpuMatch) +
            " frame=" + std::to_string(frameW) + "x" + std::to_string(frameH) +
            " (direct CHW " + (isFloat32 ? "float32" : "uint8") + ")",
            __FILE__, __LINE__);
    }

    cv::cuda::GpuMat gpuY, gpuUV;

    if (useZeroCopy) {
        // Wrap the decoder's device planes without copying. NV12 layout:
        // full-resolution Y plane plus interleaved UV at half vertical size.
        gpuY = cv::cuda::GpuMat(frameH, frameW, CV_8UC1,
                                effYPlane, static_cast<size_t>(effYLinesize));
        gpuUV = cv::cuda::GpuMat(frameH / 2, frameW, CV_8UC1,
                                 effUvPlane, static_cast<size_t>(effUvLinesize));
    } else {
        // CPU path: memcpy NV12 to pinned buffer, upload
        const size_t ySize = static_cast<size_t>(frameW) * frameH;
        const size_t uvSize = static_cast<size_t>(frameW) * frameH / 2;
        const size_t nv12Size = ySize + uvSize;
        ensurePinnedBuffer(nv12Size, logger, tag);
        if (!m_pinnedBuf) return result;

        // Probe only the first page of each plane (cheap, non-crashing);
        // safeMemcpy below still SEH-guards the full copy on Windows.
        if (!isMemoryReadable(effYPlane, std::min(static_cast<size_t>(effYLinesize) * frameH, (size_t)4096)) ||
            !isMemoryReadable(effUvPlane, std::min(static_cast<size_t>(effUvLinesize) * (frameH / 2), (size_t)4096))) {
            return result;
        }

        uint8_t* dst = static_cast<uint8_t*>(m_pinnedBuf);
        bool cpyOk = true;
        // Y plane: one bulk memcpy when rows are tightly packed, otherwise
        // row-by-row to strip the linesize padding.
        if (effYLinesize == frameW) {
            cpyOk = safeMemcpy(dst, effYPlane, ySize);
        } else {
            for (int row = 0; row < frameH && cpyOk; row++)
                cpyOk = safeMemcpy(dst + row * frameW,
                                   effYPlane + row * effYLinesize, frameW);
        }
        if (cpyOk) {
            uint8_t* uvDst = dst + ySize;
            if (effUvLinesize == frameW) {
                cpyOk = safeMemcpy(uvDst, effUvPlane, uvSize);
            } else {
                for (int row = 0; row < frameH / 2 && cpyOk; row++)
                    cpyOk = safeMemcpy(uvDst + row * frameW,
                                       effUvPlane + row * effUvLinesize, frameW);
            }
        }
        if (!cpyOk) return result;

        // (No registry lock to release — data kept alive by refcount)

        // Async upload from the pinned staging buffer; ordered before the
        // kernel because both use the same stream.
        cv::cuda::Stream cvStream = cv::cuda::StreamAccessor::wrapStream(stream);
        cv::Mat pinnedY(frameH, frameW, CV_8UC1, m_pinnedBuf);
        cv::Mat pinnedUV(frameH / 2, frameW, CV_8UC1,
                         static_cast<uint8_t*>(m_pinnedBuf) + ySize);
        gpuY.upload(pinnedY, cvStream);
        gpuUV.upload(pinnedUV, cvStream);
    }

    // Launch CHW kernel directly into TRT buffer
    if (isFloat32) {
        launchNV12ToRGBResizeCHW_float(
            gpuY.ptr<uint8_t>(), static_cast<int>(gpuY.step),
            gpuUV.ptr<uint8_t>(), static_cast<int>(gpuUV.step),
            static_cast<float*>(dstGpuBuffer),
            inputW, inputH, frameW, frameH, stream);
    } else {
        launchNV12ToRGBResizeCHW_uint8(
            gpuY.ptr<uint8_t>(), static_cast<int>(gpuY.step),
            gpuUV.ptr<uint8_t>(), static_cast<int>(gpuUV.step),
            static_cast<uint8_t*>(dstGpuBuffer),
            inputW, inputH, frameW, frameH, stream);
    }

    // FIX: surface both launch-configuration errors (cudaGetLastError) and
    // asynchronous execution faults (cudaStreamSynchronize return value)
    // before reporting success. Previously a failed kernel still returned
    // succeeded=true, handing TRT an unwritten/garbage input buffer.
    const cudaError_t launchErr = cudaGetLastError();
    const cudaError_t syncErr = cudaStreamSynchronize(stream);
    if (launchErr != cudaSuccess || syncErr != cudaSuccess) {
        const cudaError_t err = (launchErr != cudaSuccess) ? launchErr : syncErr;
        logger.LogInfo(std::string(tag) + "::NV12Helper",
            std::string("NV12 direct CHW path failed: ") +
            cudaGetErrorString(err) + " - falling back to CPU path",
            __FILE__, __LINE__);
        // NOTE(review): a sticky context error (e.g. illegal address) may
        // warrant setting m_cudaContextDead here — confirm against the
        // member's other write sites before doing so.
        return result; // succeeded stays false — caller uses CPU path
    }

    // (No registry lock to release — data kept alive by refcount)

    // Log activation once
    if (!m_nv12ActiveLogged) {
        m_nv12ActiveLogged = true;
        const char* mode = useZeroCopy ? "CUDA zero-copy" : "CPU upload";
        logger.LogInfo(std::string(tag) + "::NV12Helper",
            std::string(mode) + " ACTIVE: " +
            std::to_string(frameW) + "x" + std::to_string(frameH) +
            " NV12 -> " + std::to_string(inputW) + "x" + std::to_string(inputH) +
            " CHW " + (isFloat32 ? "float32" : "uint8"),
            __FILE__, __LINE__);
    }

    result.succeeded = true;
    // Metadata for coordinate mapping: prefer the display size carried by
    // inputImage, fall back to the decoded frame size when it is empty.
    result.metaWidth = static_cast<float>(inputImage.cols > 0 ? inputImage.cols : frameW);
    result.metaHeight = static_cast<float>(inputImage.rows > 0 ? inputImage.rows : frameH);
    return result;
}
|
||||
|
||||
} // namespace ANSCENTER
|
||||
Reference in New Issue
Block a user