// ANSCORE/include/ANSGpuFrameRegistry.h

#pragma once
// ANSGpuFrameRegistry.h — Side-table registry associating cv::Mat pointers
// with GPU-friendly NV12 frame data for fast-path inference.
//
// Key: cv::Mat* (the heap-allocated pointer from anscv_mat_new), NOT datastart.
// This survives deep copies (CloneImage_S) because each clone gets its own key
// pointing to the same shared GpuFrameData via reference counting.
//
// When RTSP HW decode produces an NV12 AVFrame, we snapshot the CPU NV12 planes
// into owned buffers and register them keyed by the cv::Mat*. When CloneImage_S
// is called, addRef() links the new Mat* to the same GpuFrameData (refcount++).
// When inference runs, it reads the NV12 data via a thread-local pointer set by
// RunInferenceComplete_LV — no registry lookup needed in the engine hot path.
//
// Cleanup:
//   - anscv_mat_delete() calls release() → refcount--; frees when 0
//   - anscv_mat_replace() calls release() on old Mat* → same
//   - TTL eviction catches stuck tasks (frames older than 3s with refcount > 0)
//
// Safety layers:
//   1. Refcount cap (64) — prevents runaway refs from bugs
//   2. Frame TTL (3s) — force-frees frames held by stuck tasks
//   3. Global VRAM budget (1GB) — caps GPU cache allocation
//
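// Thread-safe: public methods lock internally, except the *_unlocked accessors
// (the caller must hold the lock obtained from acquire_lock()) and the VRAM
// byte counter, which is a lock-free atomic.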
//
// NOTE: This header is FFmpeg-free. CPU NV12 snapshots are owned malloc'd buffers.
// The opaque `avframe`/`cpuAvframe` pointers are retained for ANSCV to free via av_frame_free.

#include <unordered_map>
#include <unordered_set>
#include <vector>
#include <mutex>
#include <atomic>
#include <cstdint>
#include <cstdlib>
#include <cstdio>
#include <chrono>
#include <opencv2/core/mat.hpp>

#ifdef _WIN32
#include <windows.h>
#endif

// Debug logging for registry operations — both stderr and OutputDebugString.
//
// Usage: REG_DBG("fmt %d", value);  (printf-style format + arguments)
//   - Every message is prefixed with "[Registry] " and terminated with '\n'.
//   - Defined only when the includer has not already supplied its own REG_DBG,
//     so a translation unit can override or silence it before including this header.
//   - Windows: the message is formatted into a 512-byte stack buffer (snprintf
//     truncates anything longer), then sent to OutputDebugStringA and stderr.
//   - Other platforms: the message is written straight to stderr.
//   - `##__VA_ARGS__` is a GNU-style extension that swallows the trailing comma
//     when the macro is invoked with a format string only.
#ifndef REG_DBG
#ifdef _WIN32
#define REG_DBG(fmt, ...) do { \
    char _reg_buf[512]; \
    snprintf(_reg_buf, sizeof(_reg_buf), "[Registry] " fmt "\n", ##__VA_ARGS__); \
    OutputDebugStringA(_reg_buf); \
    fprintf(stderr, "%s", _reg_buf); \
} while(0)
#else
#define REG_DBG(fmt, ...) fprintf(stderr, "[Registry] " fmt "\n", ##__VA_ARGS__)
#endif
#endif

// Registry safety limits.
//   GPU_CACHE_BUDGET_DEFAULT — default cap on total GPU cache bytes (1 GiB).
//   MAX_FRAME_REFCOUNT       — upper bound on references to a single frame.
//   FRAME_TTL_SECONDS        — age after which a frame is force-freed.
//   EVICT_CHECK_INTERVAL_MS  — minimum spacing between TTL eviction scans.
static constexpr size_t GPU_CACHE_BUDGET_DEFAULT = size_t{1} << 30;
static constexpr int    MAX_FRAME_REFCOUNT       = 64;
static constexpr int    FRAME_TTL_SECONDS        = 3;
static constexpr int    EVICT_CHECK_INTERVAL_MS  = 500;

// One deferred GPU deallocation: a device pointer plus the index of the device
// it belongs to, so whoever drains the queue can cudaSetDevice before freeing.
struct GpuPendingFreeEntry {
    void* ptr{nullptr};     // device pointer awaiting release
    int   deviceIdx{-1};    // owning device index; -1 when unset
};

// Per-frame payload stored in the registry: the CPU NV12 snapshot, an optional
// GPU upload cache, legacy AVFrame handles, frame metadata, a reference count
// and an optional owner callback.
//
// The struct has no destructor: the registry that holds it is responsible for
// releasing the owned buffers. It is move-constructible only (no copy, no
// assignment) because std::atomic cannot be copied or moved implicitly.
struct GpuFrameData {
    // --- CPU NV12 snapshot (OWNED malloc'd buffers, independent of decoder) ---
    uint8_t*  cpuYPlane    = nullptr;   // malloc'd Y plane copy
    uint8_t*  cpuUvPlane   = nullptr;   // malloc'd UV plane copy
    int       cpuYLinesize = 0;         // Bytes per row in Y plane
    int       cpuUvLinesize = 0;        // Bytes per row in UV plane

    // --- GPU upload cache (created on first inference, shared across tasks) ---
    void*     gpuCacheY    = nullptr;   // cudaMalloc'd Y on inference GPU
    void*     gpuCacheUV   = nullptr;   // cudaMalloc'd UV on inference GPU
    size_t    gpuCacheYPitch  = 0;      // Pitch of cached Y plane
    size_t    gpuCacheUVPitch = 0;      // Pitch of cached UV plane
    size_t    gpuCacheBytes   = 0;      // Total VRAM bytes (for budget tracking)
    int       gpuCacheDeviceIdx = -1;   // GPU index where cache lives
    bool      gpuCacheValid = false;    // true after first upload
    // gpuCacheMutex is NOT here — use the registry mutex for cache creation

    // --- Legacy opaque AVFrame pointers (freed by ANSCV via av_frame_free) ---
    void*     avframe      = nullptr;   // Original CUDA or CPU AVFrame (owned)
    void*     cpuAvframe   = nullptr;   // CPU fallback AVFrame (owned, may be nullptr)

    // --- Frame metadata ---
    int       width        = 0;
    int       height       = 0;
    int       pixelFormat  = 0;         // 23=NV12, 1000=BGR full-res
    int       gpuIndex     = -1;        // GPU that decoded this frame
    int64_t   pts          = 0;         // Presentation timestamp
    bool      isCudaDevicePtr = false;  // Legacy: true if original was CUDA zero-copy

    // --- Legacy NV12 plane pointers (point into avframe, used during transition) ---
    // TODO: Remove once all consumers use cpuYPlane/cpuUvPlane via thread-local
    uint8_t*  yPlane       = nullptr;
    uint8_t*  uvPlane      = nullptr;
    int       yLinesize    = 0;
    int       uvLinesize   = 0;

    // --- Lifecycle ---
    std::atomic<int> refcount{1};                        // number of references to this frame
    std::chrono::steady_clock::time_point createdAt;     // used for age-based eviction

    // --- Owner callback (for per-client inference guard) ---
    // When the last reference to this frame drops, onReleaseFn is called
    // with ownerClient to decrement the RTSP client's in-flight counter.
    // This lets Destroy() wait for in-flight inference to finish before
    // freeing NVDEC surfaces (fixes LabVIEW crash).
    void*  ownerClient   = nullptr;
    void (*onReleaseFn)(void*) = nullptr;

    // Default constructor: every field takes its in-class initializer.
    GpuFrameData() = default;

    // Move constructor (std::atomic is neither copyable nor movable, so the
    // refcount value is transferred with an explicit load).
    // Ownership of every buffer, handle and callback moves to *this; the source
    // is left holding nothing so that it can be destroyed or discarded safely.
    GpuFrameData(GpuFrameData&& o) noexcept
        : cpuYPlane(o.cpuYPlane), cpuUvPlane(o.cpuUvPlane)
        , cpuYLinesize(o.cpuYLinesize), cpuUvLinesize(o.cpuUvLinesize)
        , gpuCacheY(o.gpuCacheY), gpuCacheUV(o.gpuCacheUV)
        , gpuCacheYPitch(o.gpuCacheYPitch), gpuCacheUVPitch(o.gpuCacheUVPitch)
        , gpuCacheBytes(o.gpuCacheBytes), gpuCacheDeviceIdx(o.gpuCacheDeviceIdx)
        , gpuCacheValid(o.gpuCacheValid)
        , avframe(o.avframe), cpuAvframe(o.cpuAvframe)
        , width(o.width), height(o.height), pixelFormat(o.pixelFormat)
        , gpuIndex(o.gpuIndex), pts(o.pts), isCudaDevicePtr(o.isCudaDevicePtr)
        , yPlane(o.yPlane), uvPlane(o.uvPlane)
        , yLinesize(o.yLinesize), uvLinesize(o.uvLinesize)
        , refcount(o.refcount.load()), createdAt(o.createdAt)
        , ownerClient(o.ownerClient), onReleaseFn(o.onReleaseFn)
    {
        // Null out source to prevent double-free of owned pointers
        o.cpuYPlane = nullptr;
        o.cpuUvPlane = nullptr;
        o.gpuCacheY = nullptr;
        o.gpuCacheUV = nullptr;
        o.avframe = nullptr;
        o.cpuAvframe = nullptr;
        o.yPlane = nullptr;
        o.uvPlane = nullptr;
        o.gpuCacheBytes = 0;
        // The cache pointers are gone, so the source must not keep claiming
        // that it holds a valid GPU cache.
        o.gpuCacheValid = false;
        o.ownerClient = nullptr;
        o.onReleaseFn = nullptr;
    }

    // No copy
    GpuFrameData(const GpuFrameData&) = delete;
    GpuFrameData& operator=(const GpuFrameData&) = delete;
};

class ANSGpuFrameRegistry {
public:
    // Process-wide singleton. On Windows, header-only static locals are per-DLL.
    // ANSCV.dll exports ANSGpuFrameRegistry_GetInstance() (defined in
    // ANSGpuFrameRegistry.cpp); other DLLs find it via GetProcAddress at runtime.
    static ANSGpuFrameRegistry& instance() {
#ifdef _WIN32
        static ANSGpuFrameRegistry* s_inst = resolveProcessWide();
        return *s_inst;
#else
        static ANSGpuFrameRegistry reg;
        return reg;
#endif
    }

    // --- Attach: register a new GpuFrameData keyed by cv::Mat* ---
    // Allocates GpuFrameData on heap. Takes ownership of avframe/cpuAvframe.
    // Returns old avframe pointer if this Mat* was already registered (caller must av_frame_free).
    void* attach(cv::Mat* mat, GpuFrameData&& data) {
        if (!mat) return nullptr;
        void* oldAvframe = nullptr;

        // Capture old frame's owner callback to invoke OUTSIDE m_mutex
        void* oldOwner = nullptr;
        void (*oldReleaseFn)(void*) = nullptr;

        data.createdAt = std::chrono::steady_clock::now();
        data.refcount.store(1);

        auto* heapData = new GpuFrameData(std::move(data));
        REG_DBG("attach mat=%p new frame=%p yPlane=%p gpuCacheY=%p isCuda=%d %dx%d",
                (void*)mat, (void*)heapData,
                (void*)heapData->yPlane, heapData->gpuCacheY,
                (int)heapData->isCudaDevicePtr,
                heapData->width, heapData->height);

        {
            std::lock_guard<std::mutex> lock(m_mutex);

            // If this Mat* already has an entry, release the old one
            auto it = m_map.find(mat);
            if (it != m_map.end()) {
                auto* oldFrame = it->second;
                int oldRef = oldFrame->refcount.fetch_sub(1);
                if (oldRef <= 1) {
                    oldOwner = oldFrame->ownerClient;
                    oldReleaseFn = oldFrame->onReleaseFn;
                    oldAvframe = oldFrame->avframe;
                    if (oldFrame->cpuAvframe)
                        m_pendingFree.push_back(oldFrame->cpuAvframe);
                    freeOwnedBuffers_locked(oldFrame);
                    m_frameSet.erase(oldFrame);
                    delete oldFrame;
                }
                // If oldRef > 1, other clones still reference it — just unlink this Mat*
                m_map.erase(it);
            }

            m_map[mat] = heapData;
            m_frameSet.insert(heapData);
        }

        // Notify old frame's owner OUTSIDE m_mutex
        if (oldReleaseFn && oldOwner) {
            oldReleaseFn(oldOwner);
        }

        return oldAvframe;  // Caller must av_frame_free if non-null
    }

    // --- addRef: link a cloned cv::Mat* to the same GpuFrameData as src ---
    // Returns true if successful, false if src not found or refcount cap reached.
    bool addRef(cv::Mat* src, cv::Mat* dst) {
        if (!src || !dst || src == dst) return false;

        std::lock_guard<std::mutex> lock(m_mutex);

        auto it = m_map.find(src);
        if (it == m_map.end()) return false;

        auto* frame = it->second;
        int current = frame->refcount.load();
        if (current >= MAX_FRAME_REFCOUNT) {
            return false;  // Cap reached — caller falls back to BGR
        }

        frame->refcount.fetch_add(1);
        m_map[dst] = frame;
        return true;
    }

    // --- release: decrement refcount for this Mat*, free if 0 ---
    // Returns avframe pointer to free (or nullptr) via pendingFree.
    // Caller must drain_pending() and av_frame_free each returned pointer.
    void release(cv::Mat* mat) {
        if (!mat) return;

        // Capture owner callback to invoke OUTSIDE m_mutex (deadlock safety)
        void* owner = nullptr;
        void (*releaseFn)(void*) = nullptr;

        {
            std::lock_guard<std::mutex> lock(m_mutex);

            auto it = m_map.find(mat);
            if (it == m_map.end()) return;

            auto* frame = it->second;
            m_map.erase(it);

            int oldRef = frame->refcount.fetch_sub(1);
            REG_DBG("release mat=%p refcount %d->%d yPlane=%p gpuCacheY=%p owner=%p",
                    (void*)mat, oldRef, oldRef - 1,
                    (void*)frame->yPlane, frame->gpuCacheY, frame->ownerClient);
            if (oldRef <= 1) {
                // Capture owner callback before deleting frame
                owner = frame->ownerClient;
                releaseFn = frame->onReleaseFn;
                REG_DBG("LAST REF — freeing frame=%p cpuY=%p gpuCacheY=%p gpuCacheUV=%p bytes=%zu",
                        (void*)frame, (void*)frame->cpuYPlane,
                        frame->gpuCacheY, frame->gpuCacheUV, frame->gpuCacheBytes);
                // Last reference — free everything
                if (frame->avframe)
                    m_pendingFree.push_back(frame->avframe);
                if (frame->cpuAvframe)
                    m_pendingFree.push_back(frame->cpuAvframe);
                freeOwnedBuffers_locked(frame);
                m_frameSet.erase(frame);
                delete frame;
            }
        }

        // Notify owner OUTSIDE m_mutex — prevents lock-ordering deadlock
        // with ANSRTSPClient::_mutex (used by Destroy's condition_variable wait)
        if (releaseFn && owner) {
            REG_DBG("calling onReleaseFn owner=%p", owner);
            releaseFn(owner);
        }
    }

    // --- lookup: find GpuFrameData by cv::Mat* (locking) ---
    GpuFrameData* lookup(cv::Mat* mat) {
        std::lock_guard<std::mutex> lock(m_mutex);
        auto it = m_map.find(mat);
        return (it != m_map.end()) ? it->second : nullptr;
    }

    // --- lookup_unlocked: caller MUST hold lock via acquire_lock() ---
    GpuFrameData* lookup_unlocked(cv::Mat* mat) {
        auto it = m_map.find(mat);
        return (it != m_map.end()) ? it->second : nullptr;
    }

    // --- Backward-compat: lookup by datastart (for transition period) ---
    // Searches all entries for matching datastart. O(n) — avoid in hot path.
    GpuFrameData* lookup_by_datastart(const uchar* datastart) {
        std::lock_guard<std::mutex> lock(m_mutex);
        return lookup_by_datastart_unlocked(datastart);
    }

    GpuFrameData* lookup_by_datastart_unlocked(const uchar* datastart) {
        if (!datastart) return nullptr;
        for (auto& [mat, frame] : m_map) {
            if (mat && mat->datastart == datastart)
                return frame;
        }
        return nullptr;
    }

    // Acquire the registry lock explicitly.
    std::unique_lock<std::mutex> acquire_lock() {
        return std::unique_lock<std::mutex>(m_mutex);
    }

    // Number of map entries (Mat* keys) — caller MUST hold lock.
    size_t size_unlocked() const { return m_map.size(); }

    // Number of unique frames alive — caller MUST hold lock.
    size_t frame_count_unlocked() const { return m_frameSet.size(); }

    // --- Drain pending avframe pointers for caller to av_frame_free ---
    std::vector<void*> drain_pending() {
        std::lock_guard<std::mutex> lock(m_mutex);
        std::vector<void*> result;
        result.swap(m_pendingFree);
        return result;
    }

    // --- Drain pending GPU device pointers for caller to cudaFree ---
    // Each entry includes the device index for cudaSetDevice before cudaFree.
    std::vector<GpuPendingFreeEntry> drain_gpu_pending() {
        std::lock_guard<std::mutex> lock(m_mutex);
        std::vector<GpuPendingFreeEntry> result;
        result.swap(m_pendingGpuFree);
        return result;
    }

    // --- TTL eviction: force-free frames older than FRAME_TTL_SECONDS ---
    // Call periodically from camera threads (piggybacked on mat_replace).
    void evictStaleFrames() {
        auto now = std::chrono::steady_clock::now();

        // Throttle: skip if called too frequently
        {
            std::lock_guard<std::mutex> lock(m_mutex);
            if (now - m_lastEvictCheck < std::chrono::milliseconds(EVICT_CHECK_INTERVAL_MS))
                return;
            m_lastEvictCheck = now;
        }

        // Collect owner callbacks to invoke OUTSIDE m_mutex
        struct OwnerCallback { void* client; void (*fn)(void*); };
        std::vector<OwnerCallback> callbacks;

        {
            std::lock_guard<std::mutex> lock(m_mutex);
            for (auto it = m_frameSet.begin(); it != m_frameSet.end(); ) {
                auto* frame = *it;
                auto age_s = std::chrono::duration_cast<std::chrono::seconds>(
                    now - frame->createdAt).count();
                if (age_s > FRAME_TTL_SECONDS && frame->refcount.load() > 0) {
                    // Capture owner callback before deleting
                    if (frame->onReleaseFn && frame->ownerClient) {
                        callbacks.push_back({frame->ownerClient, frame->onReleaseFn});
                    }
                    // Force cleanup — remove all Mat* keys pointing to this frame
                    for (auto jt = m_map.begin(); jt != m_map.end(); ) {
                        if (jt->second == frame)
                            jt = m_map.erase(jt);
                        else
                            ++jt;
                    }
                    // Push avframes to pendingFree
                    if (frame->avframe)
                        m_pendingFree.push_back(frame->avframe);
                    if (frame->cpuAvframe)
                        m_pendingFree.push_back(frame->cpuAvframe);
                    freeOwnedBuffers_locked(frame);
                    it = m_frameSet.erase(it);
                    delete frame;
                } else {
                    ++it;
                }
            }
        }

        // Notify owners OUTSIDE m_mutex
        for (auto& cb : callbacks) {
            cb.fn(cb.client);
        }
    }

    // --- VRAM budget management ---
    bool canAllocateGpuCache(size_t bytes) const {
        return m_totalGpuCacheBytes.load(std::memory_order_relaxed) + bytes <= m_gpuCacheBudget;
    }

    void onGpuCacheCreated(size_t bytes) {
        m_totalGpuCacheBytes.fetch_add(bytes, std::memory_order_relaxed);
    }

    void onGpuCacheFreed(size_t bytes) {
        // Prevent underflow
        size_t old = m_totalGpuCacheBytes.load(std::memory_order_relaxed);
        while (old >= bytes) {
            if (m_totalGpuCacheBytes.compare_exchange_weak(old, old - bytes,
                    std::memory_order_relaxed))
                break;
        }
    }

    size_t totalGpuCacheBytes() const {
        return m_totalGpuCacheBytes.load(std::memory_order_relaxed);
    }

    void setGpuCacheBudget(size_t bytes) { m_gpuCacheBudget = bytes; }
    size_t gpuCacheBudget() const { return m_gpuCacheBudget; }

    // --- Invalidate owner: nullify all callbacks for a client being destroyed ---
    // Called by Destroy() on timeout to prevent callbacks into a deleted object.
    void invalidateOwner(void* client) {
        if (!client) return;
        std::lock_guard<std::mutex> lock(m_mutex);
        for (auto* frame : m_frameSet) {
            if (frame->ownerClient == client) {
                frame->ownerClient = nullptr;
                frame->onReleaseFn = nullptr;
            }
        }
    }

    // --- Force-release all frames owned by a client ---
    // Called by Destroy() BEFORE close() to free GPU buffers while the CUDA
    // context is still alive.  Without this, unreleased clones (e.g. 70 cloned
    // images held by LabVIEW AI tasks that haven't finished) keep gpuCacheY/UV
    // allocated.  When close() destroys the CUDA context, those buffers become
    // orphaned and later cudaFree calls crash.
    //
    // This force-frees ALL owned buffers for frames belonging to this client,
    // removes all Mat* keys pointing to them, and deletes the GpuFrameData.
    // Returns the number of frames force-released.
    int forceReleaseByOwner(void* client) {
        if (!client) return 0;
        int count = 0;

        std::lock_guard<std::mutex> lock(m_mutex);

        for (auto it = m_frameSet.begin(); it != m_frameSet.end(); ) {
            auto* frame = *it;
            if (frame->ownerClient == client) {
                REG_DBG("forceReleaseByOwner: frame=%p refcount=%d gpuCacheY=%p gpuCacheUV=%p bytes=%zu",
                        (void*)frame, frame->refcount.load(),
                        frame->gpuCacheY, frame->gpuCacheUV, frame->gpuCacheBytes);

                // Remove all Mat* keys pointing to this frame
                for (auto jt = m_map.begin(); jt != m_map.end(); ) {
                    if (jt->second == frame)
                        jt = m_map.erase(jt);
                    else
                        ++jt;
                }

                // Free owned buffers (CPU + GPU pending)
                if (frame->avframe)
                    m_pendingFree.push_back(frame->avframe);
                if (frame->cpuAvframe)
                    m_pendingFree.push_back(frame->cpuAvframe);
                freeOwnedBuffers_locked(frame);
                it = m_frameSet.erase(it);
                delete frame;
                ++count;
            } else {
                ++it;
            }
        }

        if (count > 0) {
            REG_DBG("forceReleaseByOwner: force-released %d frames for client=%p", count, client);
        }
        return count;
    }

private:
    ANSGpuFrameRegistry() = default;

#ifdef _WIN32
    static ANSGpuFrameRegistry* resolveProcessWide();
#endif

    // Free malloc'd CPU NV12 buffers and GPU cache (but NOT avframe/cpuAvframe —
    // those go to pendingFree for the caller to av_frame_free).
    void freeOwnedBuffers_locked(GpuFrameData* frame) {
        REG_DBG("freeOwnedBuffers: frame=%p cpuY=%p cpuUV=%p gpuCacheY=%p gpuCacheUV=%p bytes=%zu dev=%d",
                (void*)frame, (void*)frame->cpuYPlane, (void*)frame->cpuUvPlane,
                frame->gpuCacheY, frame->gpuCacheUV,
                frame->gpuCacheBytes, frame->gpuCacheDeviceIdx);
        if (frame->cpuYPlane) {
            std::free(frame->cpuYPlane);
            frame->cpuYPlane = nullptr;
        }
        if (frame->cpuUvPlane) {
            std::free(frame->cpuUvPlane);
            frame->cpuUvPlane = nullptr;
        }
        // GPU cache freed via CUDA — push to deferred list with device index
        // so the caller (ANSGpuFrameOps.h) can cudaSetDevice + cudaFree.
        if (frame->gpuCacheBytes > 0) {
            onGpuCacheFreed(frame->gpuCacheBytes);
            frame->gpuCacheValid = false;
            frame->gpuCacheBytes = 0;
            int devIdx = frame->gpuCacheDeviceIdx;
            if (frame->gpuCacheY)
                m_pendingGpuFree.push_back({frame->gpuCacheY, devIdx});
            if (frame->gpuCacheUV)
                m_pendingGpuFree.push_back({frame->gpuCacheUV, devIdx});
            frame->gpuCacheY = nullptr;
            frame->gpuCacheUV = nullptr;
        }
    }

    std::mutex m_mutex;
    std::unordered_map<cv::Mat*, GpuFrameData*> m_map;
    std::unordered_set<GpuFrameData*> m_frameSet;  // All unique frames (for TTL scan)
    std::vector<void*> m_pendingFree;     // AVFrame* pointers to av_frame_free
    std::vector<GpuPendingFreeEntry> m_pendingGpuFree;  // CUDA device pointers to cudaFree
    std::atomic<size_t> m_totalGpuCacheBytes{0};
    size_t m_gpuCacheBudget = GPU_CACHE_BUDGET_DEFAULT;
    std::chrono::steady_clock::time_point m_lastEvictCheck;
};

// ── Convenience free functions (FFmpeg-agnostic) ────────────────────────

// Find the frame data registered for a cv::Mat* (the registry's primary key).
// Forwards to the process-wide registry; yields nullptr when nothing is registered.
inline GpuFrameData* gpu_frame_lookup(cv::Mat* mat) {
    ANSGpuFrameRegistry& registry = ANSGpuFrameRegistry::instance();
    GpuFrameData* const found = registry.lookup(mat);
    return found;
}

// Transitional overload: find the frame data by a Mat's datastart pointer.
// The registry scans every entry (O(n)), so keep this out of hot paths.
inline GpuFrameData* gpu_frame_lookup(const uchar* datastart) {
    ANSGpuFrameRegistry& registry = ANSGpuFrameRegistry::instance();
    GpuFrameData* const found = registry.lookup_by_datastart(datastart);
    return found;
}

// Link a cloned Mat* (dst) to the frame data already registered for src.
// Forwards to the process-wide registry and reports whether the link was made.
inline bool gpu_frame_addref(cv::Mat* src, cv::Mat* dst) {
    ANSGpuFrameRegistry& registry = ANSGpuFrameRegistry::instance();
    const bool linked = registry.addRef(src, dst);
    return linked;
}

// Drain GPU device pointers that need cudaFree.
// Caller must cudaSetDevice(entry.deviceIdx) + cudaFree(entry.ptr) for each.
inline std::vector<GpuPendingFreeEntry> gpu_frame_drain_gpu_pending() {
    return ANSGpuFrameRegistry::instance().drain_gpu_pending();
}