modules/ANSCV/ANSGpuFrameOps.h

#pragma once
// ANSGpuFrameOps.h — FFmpeg-aware convenience functions for ANSGpuFrameRegistry.
//
// This header requires FFmpeg headers (libavutil/frame.h) and provides
// typed attach/invalidate/remove operations that take ownership of AVFrames
// and release them with av_frame_free.
//
// NEW DESIGN: Instead of storing AVFrame* references (which lock NVDEC surfaces),
// we snapshot the CPU NV12 planes into malloc'd buffers and release the AVFrames
// immediately. This prevents decoder surface pool exhaustion when many clones
// hold references to the same frame.
//
// Include this in ANSCV/ANSRTSP (which link FFmpeg). For projects without
// FFmpeg (ANSODEngine), include ANSGpuFrameRegistry.h directly and use
// gpu_frame_lookup() + the GpuFrameData plane pointers.

#include "ANSGpuFrameRegistry.h"

extern "C" {
#include "libavutil/frame.h"
}

#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <cstring>

namespace anscv_gpu_ops {
namespace detail {

// Snapshot the Y plane (data[0]) and the interleaved UV plane (data[1]) of an
// NV12 AVFrame into two tightly packed, malloc'd buffers.
//
// Parameters:
//   nv12               - source frame; read only, never modified or freed here.
//   outY / outUV       - receive the new buffers. The caller owns them and must
//                        release each with std::free().
//   outYLinesize       - bytes per row of outY (== width, no padding).
//   outUVLinesize      - bytes per row of outUV (width rounded up to even, so
//                        the last U/V pair of an odd-width frame stays whole).
//   outWidth/outHeight - receive the frame dimensions.
//
// Returns true on success. On failure returns false and:
//   - invalid input (null frame, null plane, non-positive size): no output is
//     touched;
//   - allocation failure: outY and outUV are set to nullptr, the remaining
//     outputs are left untouched.
//
// NOTE(review): neither nv12->format nor the source linesizes are validated.
// A plane 1 narrower than the UV row size (e.g. planar YUV instead of NV12)
// would be over-read — confirm callers only pass NV12 frames.
inline bool snapshotNV12Planes(const AVFrame* nv12,
                                uint8_t*& outY, int& outYLinesize,
                                uint8_t*& outUV, int& outUVLinesize,
                                int& outWidth, int& outHeight) {
    if (!nv12 || !nv12->data[0] || !nv12->data[1])
        return false;

    const int width  = nv12->width;
    const int height = nv12->height;
    if (width <= 0 || height <= 0)
        return false;

    // 4:2:0 chroma is subsampled by two in both directions, rounding UP:
    // an odd height still has a final chroma row, an odd width a final U/V pair.
    const std::size_t yRowBytes  = static_cast<std::size_t>(width);
    const std::size_t yRows      = static_cast<std::size_t>(height);
    const std::size_t uvRowBytes = yRowBytes + (yRowBytes & 1u);
    const std::size_t uvRows     = yRows / 2 + (yRows & 1u);

    uint8_t* yBuf  = static_cast<uint8_t*>(std::malloc(yRowBytes * yRows));
    uint8_t* uvBuf = static_cast<uint8_t*>(std::malloc(uvRowBytes * uvRows));
    if (!yBuf || !uvBuf) {
        std::free(yBuf);
        std::free(uvBuf);
        outY  = nullptr;
        outUV = nullptr;
        return false;
    }

    // Copies `rows` rows of `rowBytes` bytes into a packed destination.
    // The source stride may include padding (or be negative for flipped
    // frames), so offsets are computed in ptrdiff_t rather than int.
    const auto copyPlane = [](uint8_t* dst, const uint8_t* src, int srcStride,
                              std::size_t rowBytes, std::size_t rows) {
        if (srcStride > 0 && static_cast<std::size_t>(srcStride) == rowBytes) {
            // Source is already packed: one contiguous copy.
            std::memcpy(dst, src, rowBytes * rows);
            return;
        }
        for (std::size_t row = 0; row < rows; ++row) {
            std::memcpy(dst + row * rowBytes,
                        src + static_cast<std::ptrdiff_t>(row) * srcStride,
                        rowBytes);
        }
    };

    copyPlane(yBuf,  nv12->data[0], nv12->linesize[0], yRowBytes,  yRows);
    copyPlane(uvBuf, nv12->data[1], nv12->linesize[1], uvRowBytes, uvRows);

    // Publish results only once everything succeeded, so a failed snapshot
    // never leaves the caller with half-updated geometry.
    outY          = yBuf;
    outUV         = uvBuf;
    outYLinesize  = static_cast<int>(yRowBytes);
    outUVLinesize = static_cast<int>(uvRowBytes);
    outWidth      = width;
    outHeight     = height;
    return true;
}

} // namespace detail
} // namespace anscv_gpu_ops

// Attach an NV12/YUV frame to the registry entry keyed by the cv::Mat* pointer.
//
// Planes 0 and 1 of the frame are snapshotted into owned malloc'd buffers
// (cpuYPlane / cpuUvPlane); the legacy yPlane / uvPlane fields alias those same
// buffers. The AVFrame itself is stored in the entry (data.avframe) rather than
// freed immediately; frames handed back by attach() and drain_pending() are
// av_frame_free'd here.
//
// Parameters:
//   mat    - registry key. If null, nothing is attached and nv12 is freed.
//   nv12   - source frame. If null the call is a no-op.
//   gpuIdx - stored as the entry's gpuIndex.
//   pts    - stored as the entry's pts.
//
// TAKES OWNERSHIP of nv12 — caller must NOT av_frame_free after this call.
// This also holds on the early-exit path: a frame that cannot be attached is
// freed instead of being leaked.
inline void gpu_frame_attach(cv::Mat* mat, AVFrame* nv12, int gpuIdx, int64_t pts) {
    if (!mat || !nv12) {
        // Ownership was already transferred to us; av_frame_free is a no-op
        // when nv12 is null.
        av_frame_free(&nv12);
        return;
    }

    GpuFrameData data{};
    data.gpuIndex    = gpuIdx;
    data.pts         = pts;
    data.pixelFormat = nv12->format;
    data.width       = nv12->width;
    data.height      = nv12->height;

    // Snapshot NV12 planes to owned buffers. The result is deliberately not
    // acted upon: on failure the entry is still attached, with the CPU plane
    // pointers left null / at their GpuFrameData{} defaults.
    static_cast<void>(anscv_gpu_ops::detail::snapshotNV12Planes(
        nv12,
        data.cpuYPlane, data.cpuYLinesize,
        data.cpuUvPlane, data.cpuUvLinesize,
        data.width, data.height));

    // Keep legacy pointers for backward compat during transition
    data.yPlane     = data.cpuYPlane;
    data.uvPlane    = data.cpuUvPlane;
    data.yLinesize  = data.cpuYLinesize;
    data.uvLinesize = data.cpuUvLinesize;

    // Store AVFrame for legacy cleanup (will be freed by drain_pending)
    data.avframe = nv12;

    // attach() may hand back the frame of the entry it replaced
    void* old = ANSGpuFrameRegistry::instance().attach(mat, std::move(data));
    if (old) {
        AVFrame* oldFrame = static_cast<AVFrame*>(old);
        av_frame_free(&oldFrame);
    }

    // Free stale entries evicted by TTL or previous attach
    auto pending = ANSGpuFrameRegistry::instance().drain_pending();
    for (void* p : pending) {
        AVFrame* stale = static_cast<AVFrame*>(p);
        av_frame_free(&stale);
    }
}

// Attach CUDA HW frame — keeps CUDA device pointers for zero-copy inference.
// TAKES OWNERSHIP of cudaFrame AND cpuNV12 — caller must NOT av_frame_free after.
// This also holds on the early-exit path: if mat or cudaFrame is null, whatever
// frames were passed in are freed here instead of being leaked.
//
// Primary path: yPlane/uvPlane point to CUDA device pointers from the cloned
// AVFrame (data[0]/data[1]).  The cloned AVFrame keeps the NVDEC surface alive
// until gpu_frame_remove() is called after inference.  With 4 cameras each
// holding ~1 surface, this uses 4 of NVDEC's 25-32 surface pool — safe.
//
// Fallback: cpuYPlane/cpuUvPlane hold CPU-side NV12 snapshot for cross-GPU
// inference (when decode GPU != inference GPU, CUDA device ptrs aren't
// accessible from another GPU context).
//
// Parameters:
//   mat       - registry key. If null, nothing is attached.
//   cudaFrame - hardware frame whose data[0]/data[1] and linesize[0]/[1] are
//               recorded as the Y / UV planes. If null, nothing is attached.
//   gpuIdx    - stored as the entry's gpuIndex.
//   pts       - stored as the entry's pts.
//   cpuNV12   - optional CPU copy of the frame; when given, its planes are
//               snapshotted into cpuYPlane / cpuUvPlane.
inline void gpu_frame_attach_cuda(cv::Mat* mat, AVFrame* cudaFrame, int gpuIdx, int64_t pts,
                                   AVFrame* cpuNV12 = nullptr) {
    if (!mat || !cudaFrame) {
        // Ownership was already transferred to us; av_frame_free is a no-op
        // for whichever of the two pointers is null.
        av_frame_free(&cudaFrame);
        av_frame_free(&cpuNV12);
        return;
    }

    GpuFrameData data{};
    data.gpuIndex        = gpuIdx;
    data.pts             = pts;
    data.width           = cudaFrame->width;
    data.height          = cudaFrame->height;
    data.pixelFormat     = 23; // AV_PIX_FMT_NV12 — the underlying sw_format

    // Primary: CUDA device pointers from NVDEC (zero-copy on same GPU)
    data.isCudaDevicePtr = true;
    data.yPlane          = cudaFrame->data[0];   // CUDA device ptr: Y plane
    data.uvPlane         = cudaFrame->data[1];   // CUDA device ptr: UV plane
    data.yLinesize       = cudaFrame->linesize[0];
    data.uvLinesize      = cudaFrame->linesize[1];

    // Fallback: snapshot CPU NV12 for cross-GPU inference.
    // The result is deliberately ignored: if the snapshot fails the entry is
    // still attached with only the CUDA pointers populated. On success the
    // snapshot also rewrites data.width / data.height with cpuNV12's size.
    if (cpuNV12) {
        static_cast<void>(anscv_gpu_ops::detail::snapshotNV12Planes(
            cpuNV12,
            data.cpuYPlane, data.cpuYLinesize,
            data.cpuUvPlane, data.cpuUvLinesize,
            data.width, data.height));
    }

    // Store AVFrames for cleanup (cudaFrame keeps NVDEC surface alive)
    data.avframe    = cudaFrame;
    data.cpuAvframe = cpuNV12;

    // attach() may hand back the frame of the entry it replaced
    void* old = ANSGpuFrameRegistry::instance().attach(mat, std::move(data));
    if (old) {
        AVFrame* oldFrame = static_cast<AVFrame*>(old);
        av_frame_free(&oldFrame);
    }

    // Free any frames the registry has queued for release
    auto pending = ANSGpuFrameRegistry::instance().drain_pending();
    for (void* p : pending) {
        AVFrame* stale = static_cast<AVFrame*>(p);
        av_frame_free(&stale);
    }
}

// Drop the registry entry keyed by `mat`, then free every AVFrame the registry
// has queued for release. A null `mat` is a no-op.
inline void gpu_frame_remove(cv::Mat* mat) {
    if (mat == nullptr) {
        return;
    }

    auto&& registry = ANSGpuFrameRegistry::instance();
    registry.release(mat);

    // Frames queued by this release, or by an earlier eviction, are freed here.
    auto queuedFrames = registry.drain_pending();
    for (void* handle : queuedFrames) {
        auto* frame = static_cast<AVFrame*>(handle);
        av_frame_free(&frame);
    }

    // GPU device pointers that became pending are drained but intentionally
    // NOT cudaFree'd here: cudaFree needs a CUDA context on the calling thread.
    // Per the original note, NV12PreprocessHelper is expected to handle them
    // and the pointers stay tracked in the budget.
    // NOTE(review): the drained list is discarded right here — confirm that
    // draining does not make these pointers unreachable for that later cleanup.
    static_cast<void>(gpu_frame_drain_gpu_pending());
}

// Alias for gpu_frame_remove() — used in ANSCV mutating functions to drop stale GPU data.
// Forwards `mat` unchanged; a null `mat` is a no-op because gpu_frame_remove()
// returns early on null.
inline void gpu_frame_invalidate(cv::Mat* mat) {
    gpu_frame_remove(mat);
}

// Ask the registry to evict stale frames, then free every AVFrame it has
// queued for release. Intended to be called periodically from camera threads.
inline void gpu_frame_evict_stale() {
    auto&& registry = ANSGpuFrameRegistry::instance();
    registry.evictStaleFrames();

    auto queuedFrames = registry.drain_pending();
    for (void* handle : queuedFrames) {
        auto* frame = static_cast<AVFrame*>(handle);
        av_frame_free(&frame);
    }
}
Initial setup for CLion 2026-03-28 16:54:11 +11:00			`#pragma once`
			`// ANSGpuFrameOps.h — FFmpeg-aware convenience functions for ANSGpuFrameRegistry.`
			`//`
			`// This header requires FFmpeg headers (libavutil/frame.h) and provides`
			`// typed attach/invalidate/remove operations that handle av_frame_clone/free.`
			`//`
			`// NEW DESIGN: Instead of storing AVFrame* references (which lock NVDEC surfaces),`
			`// we snapshot the CPU NV12 planes into malloc'd buffers and release the AVFrames`
			`// immediately. This prevents decoder surface pool exhaustion when many clones`
			`// hold references to the same frame.`
			`//`
			`// Include this in ANSCV/ANSRTSP (which link FFmpeg). For projects without`
			`// FFmpeg (ANSODEngine), include ANSGpuFrameRegistry.h directly and use`
			`// gpu_frame_lookup() + the GpuFrameData plane pointers.`

			`#include "ANSGpuFrameRegistry.h"`

			`extern "C" {`
			`#include "libavutil/frame.h"`
			`}`

			`#include <cstring>`
			`#include <cstdlib>`

			`namespace anscv_gpu_ops {`
			`namespace detail {`

			`// Snapshot NV12 Y and UV planes from an AVFrame into malloc'd buffers.`
			`// Returns true on success. Caller owns the output buffers.`
			`inline bool snapshotNV12Planes(const AVFrame* nv12,`
			`uint8_t*& outY, int& outYLinesize,`
			`uint8_t*& outUV, int& outUVLinesize,`
			`int& outWidth, int& outHeight) {`
			`if (!nv12 \|\| !nv12->data[0] \|\| !nv12->data[1])`
			`return false;`

			`outWidth = nv12->width;`
			`outHeight = nv12->height;`
			`outYLinesize = nv12->width; // Packed (no alignment padding)`
			`outUVLinesize = nv12->width; // UV interleaved: width bytes per row`

			`size_t yBytes = static_cast<size_t>(outYLinesize) * outHeight;`
			`size_t uvBytes = static_cast<size_t>(outUVLinesize) * (outHeight / 2);`

			`outY = static_cast<uint8_t*>(std::malloc(yBytes));`
			`outUV = static_cast<uint8_t*>(std::malloc(uvBytes));`

			`if (!outY \|\| !outUV) {`
			`std::free(outY);`
			`std::free(outUV);`
			`outY = nullptr;`
			`outUV = nullptr;`
			`return false;`
			`}`

			`// Copy line-by-line (source may have padding via linesize > width)`
			`const int srcYLinesize = nv12->linesize[0];`
			`const int srcUVLinesize = nv12->linesize[1];`

			`for (int row = 0; row < outHeight; ++row) {`
			`std::memcpy(outY + row * outYLinesize,`
			`nv12->data[0] + row * srcYLinesize,`
			`outWidth);`
			`}`
			`for (int row = 0; row < outHeight / 2; ++row) {`
			`std::memcpy(outUV + row * outUVLinesize,`
			`nv12->data[1] + row * srcUVLinesize,`
			`outWidth);`
			`}`

			`return true;`
			`}`

			`} // namespace detail`
			`} // namespace anscv_gpu_ops`

			`// Attach NV12/YUV frame keyed by cv::Mat* pointer.`
			`// Snapshots CPU NV12 planes into owned malloc'd buffers, then releases the AVFrame.`
			`// TAKES OWNERSHIP of nv12 — caller must NOT av_frame_free after this call.`
			`inline void gpu_frame_attach(cv::Mat* mat, AVFrame* nv12, int gpuIdx, int64_t pts) {`
			`if (!mat \|\| !nv12) return;`

			`GpuFrameData data{};`
			`data.gpuIndex = gpuIdx;`
			`data.pts = pts;`
			`data.pixelFormat = nv12->format;`
			`data.width = nv12->width;`
			`data.height = nv12->height;`

			`// Snapshot NV12 planes to owned buffers`
			`bool ok = anscv_gpu_ops::detail::snapshotNV12Planes(`
			`nv12,`
			`data.cpuYPlane, data.cpuYLinesize,`
			`data.cpuUvPlane, data.cpuUvLinesize,`
			`data.width, data.height);`

			`// Keep legacy pointers for backward compat during transition`
			`data.yPlane = data.cpuYPlane;`
			`data.uvPlane = data.cpuUvPlane;`
			`data.yLinesize = data.cpuYLinesize;`
			`data.uvLinesize = data.cpuUvLinesize;`

			`// Store AVFrame for legacy cleanup (will be freed by drain_pending)`
			`data.avframe = nv12;`

			`void* old = ANSGpuFrameRegistry::instance().attach(mat, std::move(data));`
			`if (old) {`
			`AVFrame* oldFrame = static_cast<AVFrame*>(old);`
			`av_frame_free(&oldFrame);`
			`}`

			`// Free stale entries evicted by TTL or previous attach`
			`auto pending = ANSGpuFrameRegistry::instance().drain_pending();`
			`for (void* p : pending) {`
			`AVFrame* stale = static_cast<AVFrame*>(p);`
			`av_frame_free(&stale);`
			`}`
			`}`

			`// Attach CUDA HW frame — keeps CUDA device pointers for zero-copy inference.`
			`// TAKES OWNERSHIP of cudaFrame AND cpuNV12 — caller must NOT av_frame_free after.`
			`//`
			`// Primary path: yPlane/uvPlane point to CUDA device pointers from the cloned`
			`// AVFrame (data[0]/data[1]). The cloned AVFrame keeps the NVDEC surface alive`
			`// until gpu_frame_remove() is called after inference. With 4 cameras each`
			`// holding ~1 surface, this uses 4 of NVDEC's 25-32 surface pool — safe.`
			`//`
			`// Fallback: cpuYPlane/cpuUvPlane hold CPU-side NV12 snapshot for cross-GPU`
			`// inference (when decode GPU != inference GPU, CUDA device ptrs aren't`
			`// accessible from another GPU context).`
			`inline void gpu_frame_attach_cuda(cv::Mat* mat, AVFrame* cudaFrame, int gpuIdx, int64_t pts,`
			`AVFrame* cpuNV12 = nullptr) {`
			`if (!mat \|\| !cudaFrame) return;`

			`GpuFrameData data{};`
			`data.gpuIndex = gpuIdx;`
			`data.pts = pts;`
			`data.width = cudaFrame->width;`
			`data.height = cudaFrame->height;`
			`data.pixelFormat = 23; // AV_PIX_FMT_NV12 — the underlying sw_format`

			`// Primary: CUDA device pointers from NVDEC (zero-copy on same GPU)`
			`data.isCudaDevicePtr = true;`
			`data.yPlane = cudaFrame->data[0]; // CUDA device ptr: Y plane`
			`data.uvPlane = cudaFrame->data[1]; // CUDA device ptr: UV plane`
			`data.yLinesize = cudaFrame->linesize[0];`
			`data.uvLinesize = cudaFrame->linesize[1];`

			`// Fallback: snapshot CPU NV12 for cross-GPU inference`
			`if (cpuNV12) {`
			`anscv_gpu_ops::detail::snapshotNV12Planes(`
			`cpuNV12,`
			`data.cpuYPlane, data.cpuYLinesize,`
			`data.cpuUvPlane, data.cpuUvLinesize,`
			`data.width, data.height);`
			`}`

			`// Store AVFrames for cleanup (cudaFrame keeps NVDEC surface alive)`
			`data.avframe = cudaFrame;`
			`data.cpuAvframe = cpuNV12;`

			`void* old = ANSGpuFrameRegistry::instance().attach(mat, std::move(data));`
			`if (old) {`
			`AVFrame* oldFrame = static_cast<AVFrame*>(old);`
			`av_frame_free(&oldFrame);`
			`}`

			`auto pending = ANSGpuFrameRegistry::instance().drain_pending();`
			`for (void* p : pending) {`
			`AVFrame* stale = static_cast<AVFrame*>(p);`
			`av_frame_free(&stale);`
			`}`
			`}`

			`// Release entry by cv::Mat* and free any returned AVFrames. Safe if not in map (no-op).`
			`inline void gpu_frame_remove(cv::Mat* mat) {`
			`if (!mat) return;`

			`ANSGpuFrameRegistry::instance().release(mat);`

			`// Free any AVFrames that became pending from this release or prior eviction`
			`auto pending = ANSGpuFrameRegistry::instance().drain_pending();`
			`for (void* p : pending) {`
			`AVFrame* stale = static_cast<AVFrame*>(p);`
			`av_frame_free(&stale);`
			`}`

			`// Free any GPU device pointers that became pending`
			`auto gpuPending = gpu_frame_drain_gpu_pending();`
			`// NOTE: cudaFree requires CUDA context — caller must be on a CUDA-capable thread.`
			`// If not, these will leak. In practice, gpu_frame_remove is called from ANSCV`
			`// camera threads which do have CUDA context.`
			`// For safety, we skip cudaFree here and let NV12PreprocessHelper handle it.`
			`// The GPU pointers are tracked in the budget and will be accounted for.`
			`(void)gpuPending;`
			`}`

			`// Alias for remove — used in ANSCV mutating functions to drop stale GPU data.`
			`inline void gpu_frame_invalidate(cv::Mat* mat) {`
			`gpu_frame_remove(mat);`
			`}`

			`// Run TTL eviction + drain pending. Call periodically from camera threads.`
			`inline void gpu_frame_evict_stale() {`
			`ANSGpuFrameRegistry::instance().evictStaleFrames();`

			`auto pending = ANSGpuFrameRegistry::instance().drain_pending();`
			`for (void* p : pending) {`
			`AVFrame* stale = static_cast<AVFrame*>(p);`
			`av_frame_free(&stale);`
			`}`
			`}`