2026-03-28 16:54:11 +11:00
|
|
|
#pragma once
|
|
|
|
|
// ANSGpuFrameRegistry.h — Side-table registry associating cv::Mat pointers
|
|
|
|
|
// with GPU-friendly NV12 frame data for fast-path inference.
|
|
|
|
|
//
|
|
|
|
|
// Key: cv::Mat* (the heap-allocated pointer from anscv_mat_new), NOT datastart.
|
|
|
|
|
// This survives deep copies (CloneImage_S) because each clone gets its own key
|
|
|
|
|
// pointing to the same shared GpuFrameData via reference counting.
|
|
|
|
|
//
|
|
|
|
|
// When RTSP HW decode produces an NV12 AVFrame, we snapshot the CPU NV12 planes
|
|
|
|
|
// into owned buffers and register them keyed by the cv::Mat*. When CloneImage_S
|
|
|
|
|
// is called, addRef() links the new Mat* to the same GpuFrameData (refcount++).
|
|
|
|
|
// When inference runs, it reads the NV12 data via a thread-local pointer set by
|
|
|
|
|
// RunInferenceComplete_LV — no registry lookup needed in the engine hot path.
|
|
|
|
|
//
|
|
|
|
|
// Cleanup:
|
|
|
|
|
// - anscv_mat_delete() calls release() → refcount--; frees when 0
|
|
|
|
|
// - anscv_mat_replace() calls release() on old Mat* → same
|
|
|
|
|
// - TTL eviction catches stuck tasks (frames older than 3s with refcount > 0)
|
|
|
|
|
//
|
|
|
|
|
// Safety layers:
|
|
|
|
|
// 1. Refcount cap (64) — prevents runaway refs from bugs
|
|
|
|
|
// 2. Frame TTL (3s) — force-frees frames held by stuck tasks
|
|
|
|
|
// 3. Global VRAM budget (1GB) — caps GPU cache allocation
|
|
|
|
|
//
|
|
|
|
|
// Thread-safe: all methods lock internally.
|
|
|
|
|
//
|
|
|
|
|
// NOTE: This header is FFmpeg-free. CPU NV12 snapshots are owned malloc'd buffers.
|
|
|
|
|
// The opaque `avframe`/`cpuAvframe` pointers are retained for ANSCV to free via av_frame_free.
|
|
|
|
|
|
|
|
|
|
#include <unordered_map>
|
|
|
|
|
#include <unordered_set>
|
|
|
|
|
#include <vector>
|
|
|
|
|
#include <mutex>
|
|
|
|
|
#include <atomic>
|
|
|
|
|
#include <cstdint>
|
|
|
|
|
#include <cstdlib>
|
2026-04-02 22:07:27 +11:00
|
|
|
#include <cstdio>
|
2026-03-28 16:54:11 +11:00
|
|
|
#include <chrono>
|
|
|
|
|
#include <opencv2/core/mat.hpp>
|
|
|
|
|
|
2026-04-02 22:07:27 +11:00
|
|
|
#ifdef _WIN32
|
|
|
|
|
#include <windows.h>
|
|
|
|
|
#endif
|
|
|
|
|
|
2026-04-03 14:51:52 +11:00
|
|
|
// Debug logging for registry operations.
// Define ANSCORE_GPU_DEBUG=1 to enable verbose per-frame GPU logging.
// Guarded by #ifndef so a build system or including TU may predefine REG_DBG.
#ifndef REG_DBG
#if defined(ANSCORE_GPU_DEBUG) && ANSCORE_GPU_DEBUG
#ifdef _WIN32
// Windows: mirror each message to the debugger (OutputDebugStringA, visible in
// DebugView/VS Output) AND to stderr. Message is truncated to 512 bytes.
#define REG_DBG(fmt, ...) do { \
    char _reg_buf[512]; \
    snprintf(_reg_buf, sizeof(_reg_buf), "[Registry] " fmt "\n", ##__VA_ARGS__); \
    OutputDebugStringA(_reg_buf); \
    fprintf(stderr, "%s", _reg_buf); \
} while(0)
#else
// Non-Windows: stderr only.
#define REG_DBG(fmt, ...) fprintf(stderr, "[Registry] " fmt "\n", ##__VA_ARGS__)
#endif
#else
// Logging disabled: expands to a no-op statement so call sites stay valid.
#define REG_DBG(fmt, ...) ((void)0)
#endif
#endif
|
|
|
|
|
|
2026-04-03 14:51:52 +11:00
|
|
|
// GpuNV12Slot definition needed by freeOwnedBuffers_locked() (accesses inUse atomic).
|
|
|
|
|
#include "GpuNV12SlotPool.h"
|
|
|
|
|
|
2026-03-28 16:54:11 +11:00
|
|
|
// Safety constants (see "Safety layers" in the file header).
// NOTE: `inline constexpr` (C++17) gives each constant a single program-wide
// entity instead of the per-translation-unit copies that `static constexpr`
// produces in a header — same values, cleaner linkage, no ODR issues.
inline constexpr int MAX_FRAME_REFCOUNT = 64;            // Refcount cap — prevents runaway refs from bugs
inline constexpr int FRAME_TTL_SECONDS = 3;              // Force-free frames held longer than this by stuck tasks
inline constexpr size_t GPU_CACHE_BUDGET_DEFAULT = 1ULL * 1024 * 1024 * 1024; // 1GB VRAM cap for GPU cache
inline constexpr int EVICT_CHECK_INTERVAL_MS = 500;      // Throttle interval for evictStaleFrames()
|
|
|
|
|
|
2026-04-02 22:07:27 +11:00
|
|
|
// Entry for deferred GPU memory deallocation (tracks device index for cudaSetDevice).
// Entries are queued in ANSGpuFrameRegistry::m_pendingGpuFree and drained by the
// caller, which must select the device before freeing the pointer.
struct GpuPendingFreeEntry {
    void* ptr = nullptr;        // CUDA device pointer to be freed (opaque here — no cuda_runtime.h)
    int deviceIdx = -1;         // GPU index the allocation lives on; -1 = unknown
    std::chrono::steady_clock::time_point queuedAt; // When this entry was queued (for min-age draining)
};
|
|
|
|
|
|
2026-03-28 16:54:11 +11:00
|
|
|
// Per-frame payload shared (via refcount) by every cv::Mat* clone that refers
// to the same decoded video frame. Created by attach(), destroyed by the
// registry when the last reference drops (or on TTL eviction / force-release).
// Move-only: the std::atomic refcount makes it neither copyable nor movable by
// default, so a manual move constructor transfers ownership of all pointers.
struct GpuFrameData {
    // --- CPU NV12 snapshot (OWNED malloc'd buffers, independent of decoder) ---
    uint8_t* cpuYPlane = nullptr;   // malloc'd Y plane copy
    uint8_t* cpuUvPlane = nullptr;  // malloc'd UV plane copy
    int cpuYLinesize = 0;           // Bytes per row in Y plane
    int cpuUvLinesize = 0;          // Bytes per row in UV plane

    // --- GPU upload cache (created on first inference, shared across tasks) ---
    void* gpuCacheY = nullptr;      // cudaMalloc'd Y on inference GPU
    void* gpuCacheUV = nullptr;     // cudaMalloc'd UV on inference GPU
    size_t gpuCacheYPitch = 0;      // Pitch of cached Y plane
    size_t gpuCacheUVPitch = 0;     // Pitch of cached UV plane
    size_t gpuCacheBytes = 0;       // Total VRAM bytes (for budget tracking)
    int gpuCacheDeviceIdx = -1;     // GPU index where cache lives
    bool gpuCacheValid = false;     // true after first upload
    // gpuCacheMutex is NOT here — use the registry mutex for cache creation

    // --- Legacy opaque AVFrame pointers (freed by ANSCV via av_frame_free) ---
    void* avframe = nullptr;        // Original CUDA or CPU AVFrame (owned)
    void* cpuAvframe = nullptr;     // CPU fallback AVFrame (owned, may be nullptr)

    // --- Frame metadata ---
    int width = 0;
    int height = 0;
    int pixelFormat = 0;            // 23=NV12, 1000=BGR full-res
    int gpuIndex = -1;              // GPU that decoded this frame
    int64_t pts = 0;                // Presentation timestamp
    bool isCudaDevicePtr = false;   // Legacy: true if original was CUDA zero-copy

    // --- Legacy NV12 plane pointers (point into avframe, used during transition) ---
    // TODO: Remove once all consumers use cpuYPlane/cpuUvPlane via thread-local
    uint8_t* yPlane = nullptr;
    uint8_t* uvPlane = nullptr;
    int yLinesize = 0;
    int uvLinesize = 0;

    // --- Lifecycle ---
    std::atomic<int> refcount{1};   // Number of cv::Mat* keys referencing this frame
    std::chrono::steady_clock::time_point createdAt; // Set by attach(); used for TTL eviction

    // --- Owner callback (for per-client inference guard) ---
    // When the last reference to this frame drops, onReleaseFn is called
    // with ownerClient to decrement the RTSP client's in-flight counter.
    // This lets Destroy() wait for in-flight inference to finish before
    // freeing NVDEC surfaces (fixes LabVIEW crash).
    void* ownerClient = nullptr;
    void (*onReleaseFn)(void*) = nullptr;

    // --- Global pool slot (from GpuNV12SlotPool) ---
    // When non-null, yPlane/uvPlane point into this slot's buffers.
    // Released (slot->inUse = false) in freeOwnedBuffers_locked() when
    // the frame's refcount drops to 0 — guarantees the buffer is not
    // freed while any consumer is still reading it.
    GpuNV12Slot* poolSlot = nullptr;

    // --- Async D2D copy stream ---
    // The CUDA stream used for the async D2D copy from NVDEC surface to pool buffer.
    // Inference MUST call cudaStreamSynchronize on this before reading yPlane/uvPlane
    // to ensure the copy has completed. Stored as void* to avoid cuda_runtime.h here.
    // nullptr means D2D was synchronous (legacy path) or no D2D copy was done.
    void* d2dCopyStream = nullptr;

    // Default constructor
    GpuFrameData() = default;

    // Move constructor (std::atomic is neither copyable nor movable).
    // Copies every field (refcount via load()), then nulls the source's
    // owning/aliasing pointers so its destruction cannot double-free.
    GpuFrameData(GpuFrameData&& o) noexcept
        : cpuYPlane(o.cpuYPlane), cpuUvPlane(o.cpuUvPlane)
        , cpuYLinesize(o.cpuYLinesize), cpuUvLinesize(o.cpuUvLinesize)
        , gpuCacheY(o.gpuCacheY), gpuCacheUV(o.gpuCacheUV)
        , gpuCacheYPitch(o.gpuCacheYPitch), gpuCacheUVPitch(o.gpuCacheUVPitch)
        , gpuCacheBytes(o.gpuCacheBytes), gpuCacheDeviceIdx(o.gpuCacheDeviceIdx)
        , gpuCacheValid(o.gpuCacheValid)
        , avframe(o.avframe), cpuAvframe(o.cpuAvframe)
        , width(o.width), height(o.height), pixelFormat(o.pixelFormat)
        , gpuIndex(o.gpuIndex), pts(o.pts), isCudaDevicePtr(o.isCudaDevicePtr)
        , yPlane(o.yPlane), uvPlane(o.uvPlane)
        , yLinesize(o.yLinesize), uvLinesize(o.uvLinesize)
        , refcount(o.refcount.load()), createdAt(o.createdAt)
        , ownerClient(o.ownerClient), onReleaseFn(o.onReleaseFn)
        , poolSlot(o.poolSlot)
        , d2dCopyStream(o.d2dCopyStream)
    {
        // Null out source to prevent double-free of owned pointers
        o.cpuYPlane = nullptr;
        o.cpuUvPlane = nullptr;
        o.gpuCacheY = nullptr;
        o.gpuCacheUV = nullptr;
        o.avframe = nullptr;
        o.cpuAvframe = nullptr;
        o.yPlane = nullptr;
        o.uvPlane = nullptr;
        o.gpuCacheBytes = 0;
        o.ownerClient = nullptr;
        o.onReleaseFn = nullptr;
        o.poolSlot = nullptr;
        o.d2dCopyStream = nullptr;
    }

    // No copy
    GpuFrameData(const GpuFrameData&) = delete;
    GpuFrameData& operator=(const GpuFrameData&) = delete;
};
|
|
|
|
|
|
|
|
|
|
class ANSGpuFrameRegistry {
|
|
|
|
|
public:
|
|
|
|
|
// Process-wide singleton. On Windows, header-only static locals are per-DLL.
|
|
|
|
|
// ANSCV.dll exports ANSGpuFrameRegistry_GetInstance() (defined in
|
|
|
|
|
// ANSGpuFrameRegistry.cpp); other DLLs find it via GetProcAddress at runtime.
|
|
|
|
|
static ANSGpuFrameRegistry& instance() {
|
|
|
|
|
#ifdef _WIN32
|
|
|
|
|
static ANSGpuFrameRegistry* s_inst = resolveProcessWide();
|
|
|
|
|
return *s_inst;
|
|
|
|
|
#else
|
|
|
|
|
static ANSGpuFrameRegistry reg;
|
|
|
|
|
return reg;
|
|
|
|
|
#endif
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// --- Attach: register a new GpuFrameData keyed by cv::Mat* ---
|
|
|
|
|
// Allocates GpuFrameData on heap. Takes ownership of avframe/cpuAvframe.
|
|
|
|
|
// Returns old avframe pointer if this Mat* was already registered (caller must av_frame_free).
|
|
|
|
|
void* attach(cv::Mat* mat, GpuFrameData&& data) {
|
|
|
|
|
if (!mat) return nullptr;
|
|
|
|
|
void* oldAvframe = nullptr;
|
|
|
|
|
|
2026-04-02 22:07:27 +11:00
|
|
|
// Capture old frame's owner callback to invoke OUTSIDE m_mutex
|
|
|
|
|
void* oldOwner = nullptr;
|
|
|
|
|
void (*oldReleaseFn)(void*) = nullptr;
|
|
|
|
|
|
2026-03-28 16:54:11 +11:00
|
|
|
data.createdAt = std::chrono::steady_clock::now();
|
|
|
|
|
data.refcount.store(1);
|
|
|
|
|
|
|
|
|
|
auto* heapData = new GpuFrameData(std::move(data));
|
2026-04-02 22:07:27 +11:00
|
|
|
REG_DBG("attach mat=%p new frame=%p yPlane=%p gpuCacheY=%p isCuda=%d %dx%d",
|
|
|
|
|
(void*)mat, (void*)heapData,
|
|
|
|
|
(void*)heapData->yPlane, heapData->gpuCacheY,
|
|
|
|
|
(int)heapData->isCudaDevicePtr,
|
|
|
|
|
heapData->width, heapData->height);
|
2026-03-28 16:54:11 +11:00
|
|
|
|
2026-04-02 22:07:27 +11:00
|
|
|
{
|
|
|
|
|
std::lock_guard<std::mutex> lock(m_mutex);
|
2026-03-28 16:54:11 +11:00
|
|
|
|
2026-04-02 22:07:27 +11:00
|
|
|
// If this Mat* already has an entry, release the old one
|
|
|
|
|
auto it = m_map.find(mat);
|
|
|
|
|
if (it != m_map.end()) {
|
|
|
|
|
auto* oldFrame = it->second;
|
|
|
|
|
int oldRef = oldFrame->refcount.fetch_sub(1);
|
|
|
|
|
if (oldRef <= 1) {
|
|
|
|
|
oldOwner = oldFrame->ownerClient;
|
|
|
|
|
oldReleaseFn = oldFrame->onReleaseFn;
|
|
|
|
|
oldAvframe = oldFrame->avframe;
|
|
|
|
|
if (oldFrame->cpuAvframe)
|
|
|
|
|
m_pendingFree.push_back(oldFrame->cpuAvframe);
|
|
|
|
|
freeOwnedBuffers_locked(oldFrame);
|
|
|
|
|
m_frameSet.erase(oldFrame);
|
|
|
|
|
delete oldFrame;
|
|
|
|
|
}
|
|
|
|
|
// If oldRef > 1, other clones still reference it — just unlink this Mat*
|
|
|
|
|
m_map.erase(it);
|
2026-03-28 16:54:11 +11:00
|
|
|
}
|
2026-04-02 22:07:27 +11:00
|
|
|
|
|
|
|
|
m_map[mat] = heapData;
|
|
|
|
|
m_frameSet.insert(heapData);
|
2026-03-28 16:54:11 +11:00
|
|
|
}
|
|
|
|
|
|
2026-04-02 22:07:27 +11:00
|
|
|
// Notify old frame's owner OUTSIDE m_mutex
|
|
|
|
|
if (oldReleaseFn && oldOwner) {
|
|
|
|
|
oldReleaseFn(oldOwner);
|
|
|
|
|
}
|
2026-03-28 16:54:11 +11:00
|
|
|
|
|
|
|
|
return oldAvframe; // Caller must av_frame_free if non-null
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// --- addRef: link a cloned cv::Mat* to the same GpuFrameData as src ---
|
|
|
|
|
// Returns true if successful, false if src not found or refcount cap reached.
|
|
|
|
|
bool addRef(cv::Mat* src, cv::Mat* dst) {
|
|
|
|
|
if (!src || !dst || src == dst) return false;
|
|
|
|
|
|
|
|
|
|
std::lock_guard<std::mutex> lock(m_mutex);
|
|
|
|
|
|
|
|
|
|
auto it = m_map.find(src);
|
|
|
|
|
if (it == m_map.end()) return false;
|
|
|
|
|
|
|
|
|
|
auto* frame = it->second;
|
|
|
|
|
int current = frame->refcount.load();
|
|
|
|
|
if (current >= MAX_FRAME_REFCOUNT) {
|
|
|
|
|
return false; // Cap reached — caller falls back to BGR
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
frame->refcount.fetch_add(1);
|
|
|
|
|
m_map[dst] = frame;
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// --- release: decrement refcount for this Mat*, free if 0 ---
|
|
|
|
|
// Returns avframe pointer to free (or nullptr) via pendingFree.
|
|
|
|
|
// Caller must drain_pending() and av_frame_free each returned pointer.
|
|
|
|
|
void release(cv::Mat* mat) {
|
|
|
|
|
if (!mat) return;
|
|
|
|
|
|
2026-04-02 22:07:27 +11:00
|
|
|
// Capture owner callback to invoke OUTSIDE m_mutex (deadlock safety)
|
|
|
|
|
void* owner = nullptr;
|
|
|
|
|
void (*releaseFn)(void*) = nullptr;
|
2026-03-28 16:54:11 +11:00
|
|
|
|
2026-04-02 22:07:27 +11:00
|
|
|
{
|
|
|
|
|
std::lock_guard<std::mutex> lock(m_mutex);
|
2026-03-28 16:54:11 +11:00
|
|
|
|
2026-04-02 22:07:27 +11:00
|
|
|
auto it = m_map.find(mat);
|
|
|
|
|
if (it == m_map.end()) return;
|
|
|
|
|
|
|
|
|
|
auto* frame = it->second;
|
|
|
|
|
m_map.erase(it);
|
|
|
|
|
|
|
|
|
|
int oldRef = frame->refcount.fetch_sub(1);
|
|
|
|
|
REG_DBG("release mat=%p refcount %d->%d yPlane=%p gpuCacheY=%p owner=%p",
|
|
|
|
|
(void*)mat, oldRef, oldRef - 1,
|
|
|
|
|
(void*)frame->yPlane, frame->gpuCacheY, frame->ownerClient);
|
|
|
|
|
if (oldRef <= 1) {
|
|
|
|
|
// Capture owner callback before deleting frame
|
|
|
|
|
owner = frame->ownerClient;
|
|
|
|
|
releaseFn = frame->onReleaseFn;
|
|
|
|
|
REG_DBG("LAST REF — freeing frame=%p cpuY=%p gpuCacheY=%p gpuCacheUV=%p bytes=%zu",
|
|
|
|
|
(void*)frame, (void*)frame->cpuYPlane,
|
|
|
|
|
frame->gpuCacheY, frame->gpuCacheUV, frame->gpuCacheBytes);
|
|
|
|
|
// Last reference — free everything
|
|
|
|
|
if (frame->avframe)
|
|
|
|
|
m_pendingFree.push_back(frame->avframe);
|
|
|
|
|
if (frame->cpuAvframe)
|
|
|
|
|
m_pendingFree.push_back(frame->cpuAvframe);
|
|
|
|
|
freeOwnedBuffers_locked(frame);
|
|
|
|
|
m_frameSet.erase(frame);
|
|
|
|
|
delete frame;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Notify owner OUTSIDE m_mutex — prevents lock-ordering deadlock
|
|
|
|
|
// with ANSRTSPClient::_mutex (used by Destroy's condition_variable wait)
|
|
|
|
|
if (releaseFn && owner) {
|
|
|
|
|
REG_DBG("calling onReleaseFn owner=%p", owner);
|
|
|
|
|
releaseFn(owner);
|
2026-03-28 16:54:11 +11:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// --- lookup: find GpuFrameData by cv::Mat* (locking) ---
|
|
|
|
|
GpuFrameData* lookup(cv::Mat* mat) {
|
|
|
|
|
std::lock_guard<std::mutex> lock(m_mutex);
|
|
|
|
|
auto it = m_map.find(mat);
|
|
|
|
|
return (it != m_map.end()) ? it->second : nullptr;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// --- lookup_unlocked: caller MUST hold lock via acquire_lock() ---
|
|
|
|
|
GpuFrameData* lookup_unlocked(cv::Mat* mat) {
|
|
|
|
|
auto it = m_map.find(mat);
|
|
|
|
|
return (it != m_map.end()) ? it->second : nullptr;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// --- Backward-compat: lookup by datastart (for transition period) ---
|
|
|
|
|
// Searches all entries for matching datastart. O(n) — avoid in hot path.
|
|
|
|
|
GpuFrameData* lookup_by_datastart(const uchar* datastart) {
|
|
|
|
|
std::lock_guard<std::mutex> lock(m_mutex);
|
|
|
|
|
return lookup_by_datastart_unlocked(datastart);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
GpuFrameData* lookup_by_datastart_unlocked(const uchar* datastart) {
|
|
|
|
|
if (!datastart) return nullptr;
|
|
|
|
|
for (auto& [mat, frame] : m_map) {
|
|
|
|
|
if (mat && mat->datastart == datastart)
|
|
|
|
|
return frame;
|
|
|
|
|
}
|
|
|
|
|
return nullptr;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Acquire the registry lock explicitly.
|
|
|
|
|
std::unique_lock<std::mutex> acquire_lock() {
|
|
|
|
|
return std::unique_lock<std::mutex>(m_mutex);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Number of map entries (Mat* keys) — caller MUST hold lock.
|
|
|
|
|
size_t size_unlocked() const { return m_map.size(); }
|
|
|
|
|
|
|
|
|
|
// Number of unique frames alive — caller MUST hold lock.
|
|
|
|
|
size_t frame_count_unlocked() const { return m_frameSet.size(); }
|
|
|
|
|
|
|
|
|
|
// --- Drain pending avframe pointers for caller to av_frame_free ---
|
|
|
|
|
std::vector<void*> drain_pending() {
|
|
|
|
|
std::lock_guard<std::mutex> lock(m_mutex);
|
|
|
|
|
std::vector<void*> result;
|
|
|
|
|
result.swap(m_pendingFree);
|
|
|
|
|
return result;
|
|
|
|
|
}
|
|
|
|
|
|
2026-04-04 10:09:47 +11:00
|
|
|
// Push an AVFrame* (as void*) for deferred freeing.
|
|
|
|
|
// Caller MUST hold the lock via acquire_lock().
|
|
|
|
|
void pushPendingFree_locked(void* ptr) {
|
|
|
|
|
if (ptr) m_pendingFree.push_back(ptr);
|
|
|
|
|
}
|
|
|
|
|
|
2026-03-28 16:54:11 +11:00
|
|
|
// --- Drain pending GPU device pointers for caller to cudaFree ---
|
2026-04-02 22:07:27 +11:00
|
|
|
// Each entry includes the device index for cudaSetDevice before cudaFree.
|
2026-04-03 14:51:52 +11:00
|
|
|
// If minAgeMs > 0, only drain entries older than minAgeMs milliseconds.
|
|
|
|
|
// This allows time-based safety: entries queued >100ms ago are guaranteed
|
|
|
|
|
// safe to free because all CUDA kernels complete in <10ms.
|
|
|
|
|
std::vector<GpuPendingFreeEntry> drain_gpu_pending(int minAgeMs = 0) {
|
2026-03-28 16:54:11 +11:00
|
|
|
std::lock_guard<std::mutex> lock(m_mutex);
|
2026-04-03 14:51:52 +11:00
|
|
|
if (minAgeMs <= 0) {
|
|
|
|
|
// Drain all (used by Destroy/Reconnect with cudaDeviceSynchronize)
|
|
|
|
|
std::vector<GpuPendingFreeEntry> result;
|
|
|
|
|
result.swap(m_pendingGpuFree);
|
|
|
|
|
return result;
|
|
|
|
|
}
|
|
|
|
|
// Drain only entries older than minAgeMs
|
|
|
|
|
auto now = std::chrono::steady_clock::now();
|
|
|
|
|
auto threshold = std::chrono::milliseconds(minAgeMs);
|
|
|
|
|
std::vector<GpuPendingFreeEntry> ready;
|
|
|
|
|
std::vector<GpuPendingFreeEntry> notReady;
|
|
|
|
|
for (auto& entry : m_pendingGpuFree) {
|
|
|
|
|
if (now - entry.queuedAt >= threshold)
|
|
|
|
|
ready.push_back(entry);
|
|
|
|
|
else
|
|
|
|
|
notReady.push_back(entry);
|
|
|
|
|
}
|
|
|
|
|
m_pendingGpuFree = std::move(notReady);
|
|
|
|
|
return ready;
|
2026-03-28 16:54:11 +11:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// --- TTL eviction: force-free frames older than FRAME_TTL_SECONDS ---
|
|
|
|
|
// Call periodically from camera threads (piggybacked on mat_replace).
|
|
|
|
|
void evictStaleFrames() {
|
|
|
|
|
auto now = std::chrono::steady_clock::now();
|
|
|
|
|
|
|
|
|
|
// Throttle: skip if called too frequently
|
|
|
|
|
{
|
|
|
|
|
std::lock_guard<std::mutex> lock(m_mutex);
|
|
|
|
|
if (now - m_lastEvictCheck < std::chrono::milliseconds(EVICT_CHECK_INTERVAL_MS))
|
|
|
|
|
return;
|
|
|
|
|
m_lastEvictCheck = now;
|
|
|
|
|
}
|
|
|
|
|
|
2026-04-02 22:07:27 +11:00
|
|
|
// Collect owner callbacks to invoke OUTSIDE m_mutex
|
|
|
|
|
struct OwnerCallback { void* client; void (*fn)(void*); };
|
|
|
|
|
std::vector<OwnerCallback> callbacks;
|
|
|
|
|
|
|
|
|
|
{
|
|
|
|
|
std::lock_guard<std::mutex> lock(m_mutex);
|
|
|
|
|
for (auto it = m_frameSet.begin(); it != m_frameSet.end(); ) {
|
|
|
|
|
auto* frame = *it;
|
|
|
|
|
auto age_s = std::chrono::duration_cast<std::chrono::seconds>(
|
|
|
|
|
now - frame->createdAt).count();
|
|
|
|
|
if (age_s > FRAME_TTL_SECONDS && frame->refcount.load() > 0) {
|
|
|
|
|
// Capture owner callback before deleting
|
|
|
|
|
if (frame->onReleaseFn && frame->ownerClient) {
|
|
|
|
|
callbacks.push_back({frame->ownerClient, frame->onReleaseFn});
|
|
|
|
|
}
|
|
|
|
|
// Force cleanup — remove all Mat* keys pointing to this frame
|
|
|
|
|
for (auto jt = m_map.begin(); jt != m_map.end(); ) {
|
|
|
|
|
if (jt->second == frame)
|
|
|
|
|
jt = m_map.erase(jt);
|
|
|
|
|
else
|
|
|
|
|
++jt;
|
|
|
|
|
}
|
|
|
|
|
// Push avframes to pendingFree
|
|
|
|
|
if (frame->avframe)
|
|
|
|
|
m_pendingFree.push_back(frame->avframe);
|
|
|
|
|
if (frame->cpuAvframe)
|
|
|
|
|
m_pendingFree.push_back(frame->cpuAvframe);
|
|
|
|
|
freeOwnedBuffers_locked(frame);
|
|
|
|
|
it = m_frameSet.erase(it);
|
|
|
|
|
delete frame;
|
|
|
|
|
} else {
|
|
|
|
|
++it;
|
2026-03-28 16:54:11 +11:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
2026-04-02 22:07:27 +11:00
|
|
|
|
|
|
|
|
// Notify owners OUTSIDE m_mutex
|
|
|
|
|
for (auto& cb : callbacks) {
|
|
|
|
|
cb.fn(cb.client);
|
|
|
|
|
}
|
2026-03-28 16:54:11 +11:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// --- VRAM budget management ---
|
|
|
|
|
bool canAllocateGpuCache(size_t bytes) const {
|
|
|
|
|
return m_totalGpuCacheBytes.load(std::memory_order_relaxed) + bytes <= m_gpuCacheBudget;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void onGpuCacheCreated(size_t bytes) {
|
|
|
|
|
m_totalGpuCacheBytes.fetch_add(bytes, std::memory_order_relaxed);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void onGpuCacheFreed(size_t bytes) {
|
|
|
|
|
// Prevent underflow
|
|
|
|
|
size_t old = m_totalGpuCacheBytes.load(std::memory_order_relaxed);
|
|
|
|
|
while (old >= bytes) {
|
|
|
|
|
if (m_totalGpuCacheBytes.compare_exchange_weak(old, old - bytes,
|
|
|
|
|
std::memory_order_relaxed))
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
size_t totalGpuCacheBytes() const {
|
|
|
|
|
return m_totalGpuCacheBytes.load(std::memory_order_relaxed);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void setGpuCacheBudget(size_t bytes) { m_gpuCacheBudget = bytes; }
|
|
|
|
|
size_t gpuCacheBudget() const { return m_gpuCacheBudget; }
|
|
|
|
|
|
2026-04-02 22:07:27 +11:00
|
|
|
// --- Invalidate owner: nullify all callbacks for a client being destroyed ---
|
|
|
|
|
// Called by Destroy() on timeout to prevent callbacks into a deleted object.
|
|
|
|
|
void invalidateOwner(void* client) {
|
|
|
|
|
if (!client) return;
|
|
|
|
|
std::lock_guard<std::mutex> lock(m_mutex);
|
|
|
|
|
for (auto* frame : m_frameSet) {
|
|
|
|
|
if (frame->ownerClient == client) {
|
|
|
|
|
frame->ownerClient = nullptr;
|
|
|
|
|
frame->onReleaseFn = nullptr;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// --- Force-release all frames owned by a client ---
|
|
|
|
|
// Called by Destroy() BEFORE close() to free GPU buffers while the CUDA
|
|
|
|
|
// context is still alive. Without this, unreleased clones (e.g. 70 cloned
|
|
|
|
|
// images held by LabVIEW AI tasks that haven't finished) keep gpuCacheY/UV
|
|
|
|
|
// allocated. When close() destroys the CUDA context, those buffers become
|
|
|
|
|
// orphaned and later cudaFree calls crash.
|
|
|
|
|
//
|
|
|
|
|
// This force-frees ALL owned buffers for frames belonging to this client,
|
|
|
|
|
// removes all Mat* keys pointing to them, and deletes the GpuFrameData.
|
|
|
|
|
// Returns the number of frames force-released.
|
|
|
|
|
int forceReleaseByOwner(void* client) {
|
|
|
|
|
if (!client) return 0;
|
|
|
|
|
int count = 0;
|
|
|
|
|
|
|
|
|
|
std::lock_guard<std::mutex> lock(m_mutex);
|
|
|
|
|
|
|
|
|
|
for (auto it = m_frameSet.begin(); it != m_frameSet.end(); ) {
|
|
|
|
|
auto* frame = *it;
|
|
|
|
|
if (frame->ownerClient == client) {
|
|
|
|
|
REG_DBG("forceReleaseByOwner: frame=%p refcount=%d gpuCacheY=%p gpuCacheUV=%p bytes=%zu",
|
|
|
|
|
(void*)frame, frame->refcount.load(),
|
|
|
|
|
frame->gpuCacheY, frame->gpuCacheUV, frame->gpuCacheBytes);
|
|
|
|
|
|
|
|
|
|
// Remove all Mat* keys pointing to this frame
|
|
|
|
|
for (auto jt = m_map.begin(); jt != m_map.end(); ) {
|
|
|
|
|
if (jt->second == frame)
|
|
|
|
|
jt = m_map.erase(jt);
|
|
|
|
|
else
|
|
|
|
|
++jt;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Free owned buffers (CPU + GPU pending)
|
|
|
|
|
if (frame->avframe)
|
|
|
|
|
m_pendingFree.push_back(frame->avframe);
|
|
|
|
|
if (frame->cpuAvframe)
|
|
|
|
|
m_pendingFree.push_back(frame->cpuAvframe);
|
|
|
|
|
freeOwnedBuffers_locked(frame);
|
|
|
|
|
it = m_frameSet.erase(it);
|
|
|
|
|
delete frame;
|
|
|
|
|
++count;
|
|
|
|
|
} else {
|
|
|
|
|
++it;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (count > 0) {
|
|
|
|
|
REG_DBG("forceReleaseByOwner: force-released %d frames for client=%p", count, client);
|
|
|
|
|
}
|
|
|
|
|
return count;
|
|
|
|
|
}
|
|
|
|
|
|
2026-03-28 16:54:11 +11:00
|
|
|
private:
|
|
|
|
|
ANSGpuFrameRegistry() = default;
|
|
|
|
|
|
|
|
|
|
#ifdef _WIN32
|
|
|
|
|
static ANSGpuFrameRegistry* resolveProcessWide();
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
// Free malloc'd CPU NV12 buffers and GPU cache (but NOT avframe/cpuAvframe —
|
|
|
|
|
// those go to pendingFree for the caller to av_frame_free).
|
|
|
|
|
void freeOwnedBuffers_locked(GpuFrameData* frame) {
|
2026-04-03 14:51:52 +11:00
|
|
|
REG_DBG("freeOwnedBuffers: frame=%p cpuY=%p cpuUV=%p gpuCacheY=%p gpuCacheUV=%p bytes=%zu dev=%d poolSlot=%p",
|
2026-04-02 22:07:27 +11:00
|
|
|
(void*)frame, (void*)frame->cpuYPlane, (void*)frame->cpuUvPlane,
|
|
|
|
|
frame->gpuCacheY, frame->gpuCacheUV,
|
2026-04-03 14:51:52 +11:00
|
|
|
frame->gpuCacheBytes, frame->gpuCacheDeviceIdx, (void*)frame->poolSlot);
|
|
|
|
|
// Release global pool slot via DEFERRED release — the slot enters a
|
|
|
|
|
// "cooling" state for SLOT_COOLDOWN_MS (200ms) before it becomes
|
|
|
|
|
// available for reuse. This guarantees that any in-flight GPU kernels
|
|
|
|
|
// (launched asynchronously by inference engines) have completed reading
|
|
|
|
|
// from the buffer. CPU refcount→0 does NOT mean the GPU is done.
|
|
|
|
|
if (frame->poolSlot) {
|
|
|
|
|
GpuNV12SlotPool::deferRelease(frame->poolSlot);
|
|
|
|
|
frame->poolSlot = nullptr;
|
|
|
|
|
// yPlane/uvPlane pointed into the pool slot — null them to
|
|
|
|
|
// prevent any stale reads after this point.
|
|
|
|
|
frame->yPlane = nullptr;
|
|
|
|
|
frame->uvPlane = nullptr;
|
|
|
|
|
}
|
2026-03-28 16:54:11 +11:00
|
|
|
if (frame->cpuYPlane) {
|
|
|
|
|
std::free(frame->cpuYPlane);
|
|
|
|
|
frame->cpuYPlane = nullptr;
|
|
|
|
|
}
|
|
|
|
|
if (frame->cpuUvPlane) {
|
|
|
|
|
std::free(frame->cpuUvPlane);
|
|
|
|
|
frame->cpuUvPlane = nullptr;
|
|
|
|
|
}
|
2026-04-02 22:07:27 +11:00
|
|
|
// GPU cache freed via CUDA — push to deferred list with device index
|
|
|
|
|
// so the caller (ANSGpuFrameOps.h) can cudaSetDevice + cudaFree.
|
2026-03-28 16:54:11 +11:00
|
|
|
if (frame->gpuCacheBytes > 0) {
|
|
|
|
|
onGpuCacheFreed(frame->gpuCacheBytes);
|
|
|
|
|
frame->gpuCacheValid = false;
|
|
|
|
|
frame->gpuCacheBytes = 0;
|
2026-04-02 22:07:27 +11:00
|
|
|
int devIdx = frame->gpuCacheDeviceIdx;
|
2026-04-03 14:51:52 +11:00
|
|
|
auto now = std::chrono::steady_clock::now();
|
2026-03-28 16:54:11 +11:00
|
|
|
if (frame->gpuCacheY)
|
2026-04-03 14:51:52 +11:00
|
|
|
m_pendingGpuFree.push_back({frame->gpuCacheY, devIdx, now});
|
2026-03-28 16:54:11 +11:00
|
|
|
if (frame->gpuCacheUV)
|
2026-04-03 14:51:52 +11:00
|
|
|
m_pendingGpuFree.push_back({frame->gpuCacheUV, devIdx, now});
|
2026-03-28 16:54:11 +11:00
|
|
|
frame->gpuCacheY = nullptr;
|
|
|
|
|
frame->gpuCacheUV = nullptr;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
std::mutex m_mutex;
|
|
|
|
|
std::unordered_map<cv::Mat*, GpuFrameData*> m_map;
|
|
|
|
|
std::unordered_set<GpuFrameData*> m_frameSet; // All unique frames (for TTL scan)
|
|
|
|
|
std::vector<void*> m_pendingFree; // AVFrame* pointers to av_frame_free
|
2026-04-02 22:07:27 +11:00
|
|
|
std::vector<GpuPendingFreeEntry> m_pendingGpuFree; // CUDA device pointers to cudaFree
|
2026-03-28 16:54:11 +11:00
|
|
|
std::atomic<size_t> m_totalGpuCacheBytes{0};
|
|
|
|
|
size_t m_gpuCacheBudget = GPU_CACHE_BUDGET_DEFAULT;
|
|
|
|
|
std::chrono::steady_clock::time_point m_lastEvictCheck;
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
// ── Convenience free functions (FFmpeg-agnostic) ────────────────────────
|
|
|
|
|
|
|
|
|
|
// Lookup by cv::Mat* pointer (primary key).
// Returns nullptr when the Mat* has no registered frame. Thread-safe
// (delegates to the locking ANSGpuFrameRegistry::lookup).
inline GpuFrameData* gpu_frame_lookup(cv::Mat* mat) {
    return ANSGpuFrameRegistry::instance().lookup(mat);
}
|
|
|
|
|
|
|
|
|
|
// Backward-compat: lookup by datastart (O(n) — avoid in hot path).
// Scans all registered Mats for one whose datastart matches; returns nullptr
// if none does. Thread-safe (delegates to the locking registry method).
inline GpuFrameData* gpu_frame_lookup(const uchar* datastart) {
    return ANSGpuFrameRegistry::instance().lookup_by_datastart(datastart);
}
|
|
|
|
|
|
|
|
|
|
// Add ref: link clone Mat* to same GpuFrameData as src Mat*.
// Returns false when src is unregistered or MAX_FRAME_REFCOUNT is reached
// (caller then falls back to the BGR path). Thread-safe.
inline bool gpu_frame_addref(cv::Mat* src, cv::Mat* dst) {
    return ANSGpuFrameRegistry::instance().addRef(src, dst);
}
|
|
|
|
|
|
|
|
|
|
// Drain GPU device pointers that need cudaFree.
// Caller must cudaSetDevice(entry.deviceIdx) + cudaFree(entry.ptr) for each.
// Drains ALL pending entries (minAgeMs = 0 — the default of drain_gpu_pending).
inline std::vector<GpuPendingFreeEntry> gpu_frame_drain_gpu_pending() {
    return ANSGpuFrameRegistry::instance().drain_gpu_pending();
}
|