ANSCORE/modules/ANSCV/ANSGpuFrameOps.h

#pragma once
// ANSGpuFrameOps.h — FFmpeg-aware convenience functions for ANSGpuFrameRegistry.
//
// This header requires FFmpeg headers (libavutil/frame.h) and provides
// typed attach/invalidate/remove operations that handle av_frame_clone/free.
//
// NEW DESIGN: Instead of storing AVFrame* references (which lock NVDEC surfaces),
// we snapshot the CPU NV12 planes into malloc'd buffers and release the AVFrames
// immediately. This prevents decoder surface pool exhaustion when many clones
// hold references to the same frame.
//
// Include this in ANSCV/ANSRTSP (which link FFmpeg). For projects without
// FFmpeg (ANSODEngine), include ANSGpuFrameRegistry.h directly and use
// gpu_frame_lookup() + the GpuFrameData plane pointers.

#include "ANSGpuFrameRegistry.h"
#include "GpuNV12SlotPool.h"

extern "C" {
#include "libavutil/frame.h"
}

#include <cuda_runtime.h>
#include <cstring>
#include <cstdlib>
#include <thread>
#include <mutex>
#include <cstdio>

#ifdef _WIN32
#include <windows.h>
#endif

// Debug logging macro for GPU frame operations.
// Define ANSCORE_GPU_DEBUG=1 to enable verbose per-frame GPU logging.
#ifndef GPU_FRAME_DBG
#if defined(ANSCORE_GPU_DEBUG) && ANSCORE_GPU_DEBUG
#ifdef _WIN32
#define GPU_FRAME_DBG(fmt, ...) do { \
    char _gpu_dbg_buf[512]; \
    snprintf(_gpu_dbg_buf, sizeof(_gpu_dbg_buf), "[GpuFrameOps] " fmt "\n", ##__VA_ARGS__); \
    OutputDebugStringA(_gpu_dbg_buf); \
    fprintf(stderr, "%s", _gpu_dbg_buf); \
} while(0)
#else
#define GPU_FRAME_DBG(fmt, ...) \
    fprintf(stderr, "[GpuFrameOps] " fmt "\n", ##__VA_ARGS__)
#endif
#else
#define GPU_FRAME_DBG(fmt, ...) ((void)0)
#endif
#endif

namespace anscv_gpu_ops {
namespace detail {

// Snapshot NV12 Y and UV planes from an AVFrame into malloc'd buffers.
// Returns true on success. Caller owns the output buffers.
inline bool snapshotNV12Planes(const AVFrame* nv12,
                                uint8_t*& outY, int& outYLinesize,
                                uint8_t*& outUV, int& outUVLinesize,
                                int& outWidth, int& outHeight) {
    if (!nv12 || !nv12->data[0] || !nv12->data[1])
        return false;

    outWidth  = nv12->width;
    outHeight = nv12->height;
    outYLinesize  = nv12->width;   // Packed (no alignment padding)
    outUVLinesize = nv12->width;   // UV interleaved: width bytes per row

    size_t yBytes  = static_cast<size_t>(outYLinesize) * outHeight;
    size_t uvBytes = static_cast<size_t>(outUVLinesize) * (outHeight / 2);

    outY  = static_cast<uint8_t*>(std::malloc(yBytes));
    outUV = static_cast<uint8_t*>(std::malloc(uvBytes));

    if (!outY || !outUV) {
        std::free(outY);
        std::free(outUV);
        outY = nullptr;
        outUV = nullptr;
        return false;
    }

    // Copy line-by-line (source may have padding via linesize > width)
    const int srcYLinesize  = nv12->linesize[0];
    const int srcUVLinesize = nv12->linesize[1];

    for (int row = 0; row < outHeight; ++row) {
        std::memcpy(outY + row * outYLinesize,
                    nv12->data[0] + row * srcYLinesize,
                    outWidth);
    }
    for (int row = 0; row < outHeight / 2; ++row) {
        std::memcpy(outUV + row * outUVLinesize,
                    nv12->data[1] + row * srcUVLinesize,
                    outWidth);
    }

    return true;
}

// Drain pending GPU device pointers and cudaFree them.
// Uses time-based safety: only frees entries queued >100ms ago, guaranteeing
// all CUDA kernels reading from them have completed (kernels take <10ms).
// NO cudaDeviceSynchronize — zero blocking of GPU pipeline.
//
// If forceAll=true, drains ALL entries with cudaDeviceSynchronize first
// (used only by Destroy/Reconnect for final cleanup).
inline void drainAndFreeGpuPending(bool forceAll = false) {
    static constexpr int SAFE_AGE_MS = 100; // 100ms >> 10ms kernel duration
    auto gpuPending = ANSGpuFrameRegistry::instance().drain_gpu_pending(
        forceAll ? 0 : SAFE_AGE_MS);
    if (gpuPending.empty()) return;
    GPU_FRAME_DBG("drainGpuPending: freeing %zu GPU ptrs (force=%d)", gpuPending.size(), (int)forceAll);
    int prevDev = -1;
    cudaGetDevice(&prevDev);
    if (forceAll) {
        // Final cleanup — sync all devices first
        cudaDeviceSynchronize();
    }
    for (auto& entry : gpuPending) {
        if (entry.ptr) {
            if (entry.deviceIdx >= 0)
                cudaSetDevice(entry.deviceIdx);
            cudaError_t err = cudaFree(entry.ptr);
            if (err != cudaSuccess) {
                GPU_FRAME_DBG("drainGpuPending: cudaFree FAILED err=%d (%s)",
                              (int)err, cudaGetErrorString(err));
            }
        }
    }
    if (prevDev >= 0)
        cudaSetDevice(prevDev);
}

} // namespace detail
} // namespace anscv_gpu_ops

// Attach NV12/YUV frame keyed by cv::Mat* pointer.
// Snapshots CPU NV12 planes into owned malloc'd buffers, then releases the AVFrame.
// TAKES OWNERSHIP of nv12 — caller must NOT av_frame_free after this call.
inline void gpu_frame_attach(cv::Mat* mat, AVFrame* nv12, int gpuIdx, int64_t pts) {
    if (!mat || !nv12) return;

    GpuFrameData data{};
    data.gpuIndex    = gpuIdx;
    data.pts         = pts;
    data.pixelFormat = nv12->format;
    data.width       = nv12->width;
    data.height      = nv12->height;

    // Snapshot NV12 planes to owned buffers
    bool ok = anscv_gpu_ops::detail::snapshotNV12Planes(
        nv12,
        data.cpuYPlane, data.cpuYLinesize,
        data.cpuUvPlane, data.cpuUvLinesize,
        data.width, data.height);

    // Keep legacy pointers for backward compat during transition
    data.yPlane    = data.cpuYPlane;
    data.uvPlane   = data.cpuUvPlane;
    data.yLinesize = data.cpuYLinesize;
    data.uvLinesize = data.cpuUvLinesize;

    // Store AVFrame for legacy cleanup (will be freed by drain_pending)
    data.avframe = nv12;

    void* old = ANSGpuFrameRegistry::instance().attach(mat, std::move(data));
    if (old) {
        // Defer old frame's AVFrame free
        auto& reg = ANSGpuFrameRegistry::instance();
        auto lk = reg.acquire_lock();
        reg.pushPendingFree_locked(old);
    }

    // NOTE: No drain_pending() here (hot path). Freed by evict_stale.
}

// Attach CUDA HW frame — copies NV12 from NVDEC surfaces to owned GPU memory.
// TAKES OWNERSHIP of cudaFrame AND cpuNV12 — caller must NOT av_frame_free after.
//
// D2D copy: SYNCHRONOUS cudaMemcpy2D from NVDEC surfaces into a GpuNV12Slot
// buffer from the global pool.  Data is valid immediately after the call returns.
// AVFrame is freed immediately (NVDEC surface returned to decoder pool).
//
// The slot is protected by a 200ms cooldown after the GpuFrameData's refcount
// drops to 0, guaranteeing that all in-flight GPU kernels (which complete in
// <10ms) have finished reading from the buffer before it can be reused.
//
// slot: pre-acquired from GpuNV12SlotPool::instance().acquire().
//       If non-null, D2D copy goes into slot buffers (no per-frame alloc).
//       If nullptr, falls back to per-frame cudaMallocPitch (legacy path).
//
// Fallback: cpuYPlane/cpuUvPlane hold CPU-side NV12 snapshot for cross-GPU
// inference (when decode GPU != inference GPU).
inline void gpu_frame_attach_cuda(cv::Mat* mat, AVFrame* cudaFrame, int gpuIdx, int64_t pts,
                                   AVFrame* cpuNV12 = nullptr,
                                   GpuNV12Slot* slot = nullptr) {
    if (!mat || !cudaFrame) {
        GPU_FRAME_DBG("attach_cuda: SKIP mat=%p cudaFrame=%p", (void*)mat, (void*)cudaFrame);
        return;
    }

    const int w = cudaFrame->width;
    const int h = cudaFrame->height;
    GPU_FRAME_DBG("attach_cuda: START mat=%p %dx%d gpu=%d nvdecY=%p nvdecUV=%p slot=%p",
                  (void*)mat, w, h, gpuIdx,
                  (void*)cudaFrame->data[0], (void*)cudaFrame->data[1], (void*)slot);

    GpuFrameData data{};
    data.gpuIndex        = gpuIdx;
    data.pts             = pts;
    data.width           = w;
    data.height          = h;
    data.pixelFormat     = 23; // AV_PIX_FMT_NV12

    // NOTE: CPU NV12 snapshot is DEFERRED — only taken if pool D2D fails.
    // For 4K frames, the snapshot is ~12MB malloc+memcpy+free per frame.
    // Skipping it when the pool path succeeds (the common case) eliminates
    // ~276MB/s of CPU heap allocation churn that causes process-level stalls.

    // --- D2D copy: NVDEC surface → GPU buffer ---
    bool d2dOk = false;

    if (slot && slot->bufY && slot->bufUV && slot->pitchY > 0 && slot->pitchUV > 0) {
        // --- Global pool path: D2D copy on per-slot non-blocking stream ---
        // cudaMemcpy2DAsync + cudaStreamSynchronize(slotStream):
        //   - Non-blocking stream avoids NULL-stream implicit sync with inference
        //   - Sync waits ONLY for the 2 copies (~1.5ms for 4K, ~0.3ms for 1080p)
        //   - Data valid after sync — av_frame_free is safe
        int prevDev = -1;
        cudaGetDevice(&prevDev);
        if (gpuIdx >= 0) cudaSetDevice(gpuIdx);

        cudaStream_t copyStream = static_cast<cudaStream_t>(slot->copyStream);
        cudaError_t e3, e4;

        if (copyStream) {
            e3 = cudaMemcpy2DAsync(slot->bufY,  slot->pitchY,
                                    cudaFrame->data[0], cudaFrame->linesize[0],
                                    w, h, cudaMemcpyDeviceToDevice, copyStream);
            e4 = cudaMemcpy2DAsync(slot->bufUV, slot->pitchUV,
                                    cudaFrame->data[1], cudaFrame->linesize[1],
                                    w, h / 2, cudaMemcpyDeviceToDevice, copyStream);
            // NO cudaStreamSynchronize here — let the copy run asynchronously.
            // The camera thread is NOT blocked by the WDDM SRW lock.
            // Inference will call cudaStreamSynchronize(d2dCopyStream) in tryNV12()
            // before reading the buffer.  By that time (~50-200ms later), the copy
            // (~0.3ms for 1080p, ~1.5ms for 4K) has long completed, so the sync
            // returns immediately with zero blocking.
        } else {
            e3 = cudaMemcpy2D(slot->bufY,  slot->pitchY,
                               cudaFrame->data[0], cudaFrame->linesize[0],
                               w, h, cudaMemcpyDeviceToDevice);
            e4 = cudaMemcpy2D(slot->bufUV, slot->pitchUV,
                               cudaFrame->data[1], cudaFrame->linesize[1],
                               w, h / 2, cudaMemcpyDeviceToDevice);
        }

        if (prevDev >= 0) cudaSetDevice(prevDev);

        if (e3 == cudaSuccess && e4 == cudaSuccess) {
            data.isCudaDevicePtr = true;
            data.yPlane          = static_cast<uint8_t*>(slot->bufY);
            data.uvPlane         = static_cast<uint8_t*>(slot->bufUV);
            data.yLinesize       = static_cast<int>(slot->pitchY);
            data.uvLinesize      = static_cast<int>(slot->pitchUV);
            data.poolSlot        = slot;
            data.d2dCopyStream   = copyStream;  // Inference syncs on this before reading
            d2dOk = true;
            GPU_FRAME_DBG("attach_cuda: D2D OK (global pool, async) Y=%p UV=%p yPitch=%zu uvPitch=%zu stream=%p",
                          slot->bufY, slot->bufUV, slot->pitchY, slot->pitchUV, copyStream);
        } else {
            GPU_FRAME_DBG("attach_cuda: D2D COPY FAILED (pool) e3=%d e4=%d — fallback",
                          (int)e3, (int)e4);
            slot->state.store(GpuNV12Slot::STATE_FREE, std::memory_order_release);
        }
    }

    if (!d2dOk && !slot) {
        // --- Legacy path: per-frame cudaMallocPitch (for modules without pool) ---
        const size_t yBytes  = static_cast<size_t>(w) * h;
        const size_t uvBytes = static_cast<size_t>(w) * (h / 2);
        const size_t totalBytes = yBytes + uvBytes;

        if (ANSGpuFrameRegistry::instance().canAllocateGpuCache(totalBytes)) {
            int prevDev = -1;
            cudaGetDevice(&prevDev);
            if (gpuIdx >= 0) cudaSetDevice(gpuIdx);

            void*  ownedY  = nullptr;
            void*  ownedUV = nullptr;
            size_t yPitch  = 0;
            size_t uvPitch = 0;

            cudaError_t e1 = cudaMallocPitch(&ownedY,  &yPitch,  w, h);
            cudaError_t e2 = cudaMallocPitch(&ownedUV, &uvPitch, w, h / 2);

            if (e1 == cudaSuccess && e2 == cudaSuccess) {
                cudaError_t e3 = cudaMemcpy2D(ownedY,  yPitch,
                                               cudaFrame->data[0], cudaFrame->linesize[0],
                                               w, h, cudaMemcpyDeviceToDevice);
                cudaError_t e4 = cudaMemcpy2D(ownedUV, uvPitch,
                                               cudaFrame->data[1], cudaFrame->linesize[1],
                                               w, h / 2, cudaMemcpyDeviceToDevice);

                if (e3 == cudaSuccess && e4 == cudaSuccess) {
                    data.isCudaDevicePtr = true;
                    data.yPlane          = static_cast<uint8_t*>(ownedY);
                    data.uvPlane         = static_cast<uint8_t*>(ownedUV);
                    data.yLinesize       = static_cast<int>(yPitch);
                    data.uvLinesize      = static_cast<int>(uvPitch);
                    data.gpuCacheY          = ownedY;
                    data.gpuCacheUV         = ownedUV;
                    data.gpuCacheYPitch     = yPitch;
                    data.gpuCacheUVPitch    = uvPitch;
                    data.gpuCacheDeviceIdx  = gpuIdx;
                    data.gpuCacheValid      = true;
                    data.gpuCacheBytes      = yPitch * h + uvPitch * (h / 2);
                    ANSGpuFrameRegistry::instance().onGpuCacheCreated(data.gpuCacheBytes);
                    d2dOk = true;
                    GPU_FRAME_DBG("attach_cuda: D2D OK ownedY=%p ownedUV=%p yPitch=%zu uvPitch=%zu bytes=%zu",
                                  ownedY, ownedUV, yPitch, uvPitch, data.gpuCacheBytes);
                } else {
                    GPU_FRAME_DBG("attach_cuda: D2D COPY FAILED e3=%d e4=%d — fallback CPU",
                                  (int)e3, (int)e4);
                    cudaFree(ownedY);
                    cudaFree(ownedUV);
                }
            } else {
                GPU_FRAME_DBG("attach_cuda: cudaMallocPitch FAILED e1=%d e2=%d — fallback CPU",
                              (int)e1, (int)e2);
                if (e1 == cudaSuccess) cudaFree(ownedY);
                if (e2 == cudaSuccess) cudaFree(ownedUV);
            }

            if (prevDev >= 0) cudaSetDevice(prevDev);
        }
    }

    if (!d2dOk) {
        // D2D failed or no slot — take CPU NV12 snapshot now (before freeing cpuNV12).
        // This is the ONLY path where the CPU snapshot is needed.  Skipping it
        // on the pool-success path avoids ~12MB malloc+memcpy+free per 4K frame.
        if (cpuNV12) {
            anscv_gpu_ops::detail::snapshotNV12Planes(
                cpuNV12,
                data.cpuYPlane, data.cpuYLinesize,
                data.cpuUvPlane, data.cpuUvLinesize,
                data.width, data.height);
        }
        GPU_FRAME_DBG("attach_cuda: FALLBACK CPU-only cpuY=%p cpuUV=%p",
                      (void*)data.cpuYPlane, (void*)data.cpuUvPlane);
        data.isCudaDevicePtr = false;
        data.yPlane          = data.cpuYPlane;
        data.uvPlane         = data.cpuUvPlane;
        data.yLinesize       = data.cpuYLinesize;
        data.uvLinesize      = data.cpuUvLinesize;
    }

    // AVFrame lifetime management:
    // - If D2D was ASYNC (d2dCopyStream != null): keep cudaFrame alive in
    //   GpuFrameData.avframe so the NVDEC surface (copy source) remains valid
    //   until the async copy completes.  The AVFrame is freed when GpuFrameData
    //   is released (after inference), by which time the 0.3ms copy is long done.
    // - If D2D was SYNC or failed: push to pending free immediately (old behavior).
    if (data.d2dCopyStream && cudaFrame) {
        // Async D2D — keep AVFrame alive, inference will outlive the copy
        data.avframe = cudaFrame;
        GPU_FRAME_DBG("attach_cuda: keeping AVFrame alive for async D2D cudaFrame=%p",
                      (void*)cudaFrame);
    } else {
        // Sync D2D or fallback — safe to defer free now
        GPU_FRAME_DBG("attach_cuda: deferring AVFrame free cudaFrame=%p",
                      (void*)cudaFrame);
        if (cudaFrame) {
            auto& reg = ANSGpuFrameRegistry::instance();
            auto lk = reg.acquire_lock();
            reg.pushPendingFree_locked(cudaFrame);
        }
        data.avframe = nullptr;
    }
    // cpuNV12 is always safe to defer — CPU snapshot (if taken) is already copied
    if (cpuNV12) {
        auto& reg = ANSGpuFrameRegistry::instance();
        auto lk = reg.acquire_lock();
        reg.pushPendingFree_locked(cpuNV12);
    }
    data.cpuAvframe = nullptr;

    GPU_FRAME_DBG("attach_cuda: FINAL yPlane=%p uvPlane=%p isCuda=%d poolSlot=%p",
                  (void*)data.yPlane, (void*)data.uvPlane, (int)data.isCudaDevicePtr,
                  (void*)data.poolSlot);

    void* old = ANSGpuFrameRegistry::instance().attach(mat, std::move(data));
    if (old) {
        // Old frame's AVFrame returned — defer its free too
        auto& reg = ANSGpuFrameRegistry::instance();
        auto lk = reg.acquire_lock();
        reg.pushPendingFree_locked(old);
    }

    // NOTE: No drain_pending() here (hot path). AVFrames accumulate in
    // m_pendingFree and are freed by gpu_frame_evict_stale() which runs
    // every 500ms from anscv_mat_replace. This removes av_frame_free
    // (5-20ms SRW lock per call) from the camera frame-grabbing path.
}

// Release entry by cv::Mat* and free any returned AVFrames.
// GPU device pointers are deferred to TTL eviction or explicit cleanup.
// Safe if not in map (no-op).
inline void gpu_frame_remove(cv::Mat* mat) {
    if (!mat) return;

    GPU_FRAME_DBG("gpu_frame_remove: mat=%p", (void*)mat);
    ANSGpuFrameRegistry::instance().release(mat);

    // NOTE: No drain_pending() here (hot path). AVFrames freed by evict_stale.
}

// Alias for remove — used in ANSCV mutating functions to drop stale GPU data.
inline void gpu_frame_invalidate(cv::Mat* mat) {
    gpu_frame_remove(mat);
}

// Run TTL eviction + drain pending. Call periodically from camera threads.
// TTL eviction is throttled to every 500ms (EVICT_CHECK_INTERVAL_MS).
// GPU buffer cleanup is safe here because:
//   1. Only frames >3 seconds old are evicted (kernels take <10ms)
//   2. cudaDeviceSynchronize() ensures all in-flight kernels are done
//   3. At 500ms interval, one sync per 500ms is ~0.1ms cost (acceptable)
//      vs per-frame sync which caused 900ms spikes
inline void gpu_frame_evict_stale() {
    ANSGpuFrameRegistry::instance().evictStaleFrames();

    // Drain and free AVFrames on a background thread to avoid blocking the
    // camera hot path.  av_frame_free on CUDA-mapped frames can take 5-20ms
    // per call due to nvcuda64 SRW lock.  The background thread frees them
    // periodically (every 50ms) in batches.
    {
        static std::once_flag s_initOnce;
        static std::mutex s_avFreeMutex;
        static std::vector<void*> s_avFreeQueue;

        // Move pending AVFrames to the background queue
        auto pending = ANSGpuFrameRegistry::instance().drain_pending();
        if (!pending.empty()) {
            std::lock_guard<std::mutex> lock(s_avFreeMutex);
            s_avFreeQueue.insert(s_avFreeQueue.end(), pending.begin(), pending.end());
        }

        // Start background free thread on first call
        std::call_once(s_initOnce, []() {
            std::thread([]() {
                while (true) {
                    std::vector<void*> batch;
                    {
                        std::lock_guard<std::mutex> lock(s_avFreeMutex);
                        batch.swap(s_avFreeQueue);
                    }
                    for (void* p : batch) {
                        AVFrame* f = static_cast<AVFrame*>(p);
                        av_frame_free(&f);
                    }
                    std::this_thread::sleep_for(std::chrono::milliseconds(50));
                }
            }).detach();
        });
    }

    // Free GPU device pointers from evicted/released frames (legacy path).
    // Pool-backed frames (ANSRTSP) don't add to this list (gpuCacheY=nullptr).
    anscv_gpu_ops::detail::drainAndFreeGpuPending();
}