Initial setup for CLion

This commit is contained in:
2026-03-28 16:54:11 +11:00
parent 239cc02591
commit 7b4134133c
1136 changed files with 811916 additions and 0 deletions

View File

@@ -0,0 +1,414 @@
#pragma once
// ANSGpuFrameRegistry.h — Side-table registry associating cv::Mat pointers
// with GPU-friendly NV12 frame data for fast-path inference.
//
// Key: cv::Mat* (the heap-allocated pointer from anscv_mat_new), NOT datastart.
// This survives deep copies (CloneImage_S) because each clone gets its own key
// pointing to the same shared GpuFrameData via reference counting.
//
// When RTSP HW decode produces an NV12 AVFrame, we snapshot the CPU NV12 planes
// into owned buffers and register them keyed by the cv::Mat*. When CloneImage_S
// is called, addRef() links the new Mat* to the same GpuFrameData (refcount++).
// When inference runs, it reads the NV12 data via a thread-local pointer set by
// RunInferenceComplete_LV — no registry lookup needed in the engine hot path.
//
// Cleanup:
// - anscv_mat_delete() calls release() → refcount--; frees when 0
// - anscv_mat_replace() calls release() on old Mat* → same
// - TTL eviction catches stuck tasks (frames older than 3s with refcount > 0)
//
// Safety layers:
// 1. Refcount cap (64) — prevents runaway refs from bugs
// 2. Frame TTL (3s) — force-frees frames held by stuck tasks
// 3. Global VRAM budget (1GB) — caps GPU cache allocation
//
// Thread-safe: all methods lock internally.
//
// NOTE: This header is FFmpeg-free. CPU NV12 snapshots are owned malloc'd buffers.
// The opaque `avframe`/`cpuAvframe` pointers are retained for ANSCV to free via av_frame_free.
#include <atomic>
#include <chrono>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <mutex>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
#include <opencv2/core/mat.hpp>
// Safety constants.
// `inline constexpr` (C++17) yields ONE shared entity across every TU that
// includes this header; `static constexpr` at namespace scope would give each
// TU its own internal-linkage copy.
inline constexpr int MAX_FRAME_REFCOUNT = 64;      // Refcount cap — prevents runaway refs from bugs
inline constexpr int FRAME_TTL_SECONDS = 3;        // Frames older than this are force-freed by TTL eviction
inline constexpr size_t GPU_CACHE_BUDGET_DEFAULT = 1ULL * 1024 * 1024 * 1024; // 1GB GPU cache budget
inline constexpr int EVICT_CHECK_INTERVAL_MS = 500; // Minimum interval between TTL eviction scans
// One decoded frame's CPU NV12 snapshot plus its (optional) GPU upload cache.
// Shared between cv::Mat* keys via ANSGpuFrameRegistry refcounting; owned
// malloc'd buffers and opaque AVFrame pointers are freed by the registry /
// ANSCV, never by this struct's destructor.
struct GpuFrameData {
    // --- CPU NV12 snapshot (OWNED malloc'd buffers, independent of decoder) ---
    uint8_t* cpuYPlane = nullptr;   // malloc'd Y plane copy
    uint8_t* cpuUvPlane = nullptr;  // malloc'd UV plane copy
    int cpuYLinesize = 0;           // Bytes per row in Y plane
    int cpuUvLinesize = 0;          // Bytes per row in UV plane

    // --- GPU upload cache (created on first inference, shared across tasks) ---
    void* gpuCacheY = nullptr;      // cudaMalloc'd Y on inference GPU
    void* gpuCacheUV = nullptr;     // cudaMalloc'd UV on inference GPU
    size_t gpuCacheYPitch = 0;      // Pitch of cached Y plane
    size_t gpuCacheUVPitch = 0;     // Pitch of cached UV plane
    size_t gpuCacheBytes = 0;       // Total VRAM bytes (for budget tracking)
    int gpuCacheDeviceIdx = -1;     // GPU index where cache lives
    bool gpuCacheValid = false;     // true after first upload
    // gpuCacheMutex is NOT here — use the registry mutex for cache creation

    // --- Legacy opaque AVFrame pointers (freed by ANSCV via av_frame_free) ---
    void* avframe = nullptr;        // Original CUDA or CPU AVFrame (owned)
    void* cpuAvframe = nullptr;     // CPU fallback AVFrame (owned, may be nullptr)

    // --- Frame metadata ---
    int width = 0;
    int height = 0;
    int pixelFormat = 0;            // 23=NV12, 1000=BGR full-res
    int gpuIndex = -1;              // GPU that decoded this frame
    int64_t pts = 0;                // Presentation timestamp
    bool isCudaDevicePtr = false;   // Legacy: true if original was CUDA zero-copy

    // --- Legacy NV12 plane pointers (point into avframe, used during transition) ---
    // TODO: Remove once all consumers use cpuYPlane/cpuUvPlane via thread-local
    uint8_t* yPlane = nullptr;
    uint8_t* uvPlane = nullptr;
    int yLinesize = 0;
    int uvLinesize = 0;

    // --- Lifecycle ---
    std::atomic<int> refcount{1};
    std::chrono::steady_clock::time_point createdAt;

    GpuFrameData() = default;

    // Move constructor (std::atomic is neither copyable nor movable).
    // Every OWNED pointer is transferred with std::exchange so the moved-from
    // object can never double-free. gpuCacheValid and gpuCacheBytes are ALSO
    // reset on the source: otherwise a reader could observe
    // gpuCacheValid == true next to nulled gpuCacheY/gpuCacheUV and
    // dereference garbage, and budget accounting could double-count.
    GpuFrameData(GpuFrameData&& o) noexcept
        : cpuYPlane(std::exchange(o.cpuYPlane, nullptr))
        , cpuUvPlane(std::exchange(o.cpuUvPlane, nullptr))
        , cpuYLinesize(o.cpuYLinesize), cpuUvLinesize(o.cpuUvLinesize)
        , gpuCacheY(std::exchange(o.gpuCacheY, nullptr))
        , gpuCacheUV(std::exchange(o.gpuCacheUV, nullptr))
        , gpuCacheYPitch(o.gpuCacheYPitch), gpuCacheUVPitch(o.gpuCacheUVPitch)
        , gpuCacheBytes(std::exchange(o.gpuCacheBytes, size_t{0}))
        , gpuCacheDeviceIdx(o.gpuCacheDeviceIdx)
        , gpuCacheValid(std::exchange(o.gpuCacheValid, false))
        , avframe(std::exchange(o.avframe, nullptr))
        , cpuAvframe(std::exchange(o.cpuAvframe, nullptr))
        , width(o.width), height(o.height), pixelFormat(o.pixelFormat)
        , gpuIndex(o.gpuIndex), pts(o.pts), isCudaDevicePtr(o.isCudaDevicePtr)
        , yPlane(std::exchange(o.yPlane, nullptr))
        , uvPlane(std::exchange(o.uvPlane, nullptr))
        , yLinesize(o.yLinesize), uvLinesize(o.uvLinesize)
        , refcount(o.refcount.load()), createdAt(o.createdAt)
    {}

    // Rule of Five spelled out: no copy, and no move-assignment either
    // (previously it was silently unavailable; make that explicit).
    GpuFrameData(const GpuFrameData&) = delete;
    GpuFrameData& operator=(const GpuFrameData&) = delete;
    GpuFrameData& operator=(GpuFrameData&&) = delete;
};
class ANSGpuFrameRegistry {
public:
// Process-wide singleton. On Windows, header-only static locals are per-DLL.
// ANSCV.dll exports ANSGpuFrameRegistry_GetInstance() (defined in
// ANSGpuFrameRegistry.cpp); other DLLs find it via GetProcAddress at runtime.
static ANSGpuFrameRegistry& instance() {
#ifdef _WIN32
static ANSGpuFrameRegistry* s_inst = resolveProcessWide();
return *s_inst;
#else
static ANSGpuFrameRegistry reg;
return reg;
#endif
}
// --- Attach: register a new GpuFrameData keyed by cv::Mat* ---
// Allocates GpuFrameData on heap. Takes ownership of avframe/cpuAvframe.
// Returns old avframe pointer if this Mat* was already registered (caller must av_frame_free).
void* attach(cv::Mat* mat, GpuFrameData&& data) {
if (!mat) return nullptr;
void* oldAvframe = nullptr;
data.createdAt = std::chrono::steady_clock::now();
data.refcount.store(1);
auto* heapData = new GpuFrameData(std::move(data));
std::lock_guard<std::mutex> lock(m_mutex);
// If this Mat* already has an entry, release the old one
auto it = m_map.find(mat);
if (it != m_map.end()) {
auto* oldFrame = it->second;
int oldRef = oldFrame->refcount.fetch_sub(1);
if (oldRef <= 1) {
oldAvframe = oldFrame->avframe;
if (oldFrame->cpuAvframe)
m_pendingFree.push_back(oldFrame->cpuAvframe);
freeOwnedBuffers_locked(oldFrame);
m_frameSet.erase(oldFrame);
delete oldFrame;
}
// If oldRef > 1, other clones still reference it — just unlink this Mat*
m_map.erase(it);
}
m_map[mat] = heapData;
m_frameSet.insert(heapData);
return oldAvframe; // Caller must av_frame_free if non-null
}
// --- addRef: link a cloned cv::Mat* to the same GpuFrameData as src ---
// Returns true if successful, false if src not found or refcount cap reached.
bool addRef(cv::Mat* src, cv::Mat* dst) {
if (!src || !dst || src == dst) return false;
std::lock_guard<std::mutex> lock(m_mutex);
auto it = m_map.find(src);
if (it == m_map.end()) return false;
auto* frame = it->second;
int current = frame->refcount.load();
if (current >= MAX_FRAME_REFCOUNT) {
return false; // Cap reached — caller falls back to BGR
}
frame->refcount.fetch_add(1);
m_map[dst] = frame;
return true;
}
// --- release: decrement refcount for this Mat*, free if 0 ---
// Returns avframe pointer to free (or nullptr) via pendingFree.
// Caller must drain_pending() and av_frame_free each returned pointer.
void release(cv::Mat* mat) {
if (!mat) return;
std::lock_guard<std::mutex> lock(m_mutex);
auto it = m_map.find(mat);
if (it == m_map.end()) return;
auto* frame = it->second;
m_map.erase(it);
int oldRef = frame->refcount.fetch_sub(1);
if (oldRef <= 1) {
// Last reference — free everything
if (frame->avframe)
m_pendingFree.push_back(frame->avframe);
if (frame->cpuAvframe)
m_pendingFree.push_back(frame->cpuAvframe);
freeOwnedBuffers_locked(frame);
m_frameSet.erase(frame);
delete frame;
}
}
// --- lookup: find GpuFrameData by cv::Mat* (locking) ---
GpuFrameData* lookup(cv::Mat* mat) {
std::lock_guard<std::mutex> lock(m_mutex);
auto it = m_map.find(mat);
return (it != m_map.end()) ? it->second : nullptr;
}
// --- lookup_unlocked: caller MUST hold lock via acquire_lock() ---
GpuFrameData* lookup_unlocked(cv::Mat* mat) {
auto it = m_map.find(mat);
return (it != m_map.end()) ? it->second : nullptr;
}
// --- Backward-compat: lookup by datastart (for transition period) ---
// Searches all entries for matching datastart. O(n) — avoid in hot path.
GpuFrameData* lookup_by_datastart(const uchar* datastart) {
std::lock_guard<std::mutex> lock(m_mutex);
return lookup_by_datastart_unlocked(datastart);
}
GpuFrameData* lookup_by_datastart_unlocked(const uchar* datastart) {
if (!datastart) return nullptr;
for (auto& [mat, frame] : m_map) {
if (mat && mat->datastart == datastart)
return frame;
}
return nullptr;
}
// Acquire the registry lock explicitly.
std::unique_lock<std::mutex> acquire_lock() {
return std::unique_lock<std::mutex>(m_mutex);
}
// Number of map entries (Mat* keys) — caller MUST hold lock.
size_t size_unlocked() const { return m_map.size(); }
// Number of unique frames alive — caller MUST hold lock.
size_t frame_count_unlocked() const { return m_frameSet.size(); }
// --- Drain pending avframe pointers for caller to av_frame_free ---
std::vector<void*> drain_pending() {
std::lock_guard<std::mutex> lock(m_mutex);
std::vector<void*> result;
result.swap(m_pendingFree);
return result;
}
// --- Drain pending GPU device pointers for caller to cudaFree ---
std::vector<void*> drain_gpu_pending() {
std::lock_guard<std::mutex> lock(m_mutex);
std::vector<void*> result;
result.swap(m_pendingGpuFree);
return result;
}
// --- TTL eviction: force-free frames older than FRAME_TTL_SECONDS ---
// Call periodically from camera threads (piggybacked on mat_replace).
void evictStaleFrames() {
auto now = std::chrono::steady_clock::now();
// Throttle: skip if called too frequently
{
std::lock_guard<std::mutex> lock(m_mutex);
if (now - m_lastEvictCheck < std::chrono::milliseconds(EVICT_CHECK_INTERVAL_MS))
return;
m_lastEvictCheck = now;
}
std::lock_guard<std::mutex> lock(m_mutex);
for (auto it = m_frameSet.begin(); it != m_frameSet.end(); ) {
auto* frame = *it;
auto age_s = std::chrono::duration_cast<std::chrono::seconds>(
now - frame->createdAt).count();
if (age_s > FRAME_TTL_SECONDS && frame->refcount.load() > 0) {
// Force cleanup — remove all Mat* keys pointing to this frame
for (auto jt = m_map.begin(); jt != m_map.end(); ) {
if (jt->second == frame)
jt = m_map.erase(jt);
else
++jt;
}
// Push avframes to pendingFree
if (frame->avframe)
m_pendingFree.push_back(frame->avframe);
if (frame->cpuAvframe)
m_pendingFree.push_back(frame->cpuAvframe);
freeOwnedBuffers_locked(frame);
it = m_frameSet.erase(it);
delete frame;
} else {
++it;
}
}
}
// --- VRAM budget management ---
bool canAllocateGpuCache(size_t bytes) const {
return m_totalGpuCacheBytes.load(std::memory_order_relaxed) + bytes <= m_gpuCacheBudget;
}
void onGpuCacheCreated(size_t bytes) {
m_totalGpuCacheBytes.fetch_add(bytes, std::memory_order_relaxed);
}
void onGpuCacheFreed(size_t bytes) {
// Prevent underflow
size_t old = m_totalGpuCacheBytes.load(std::memory_order_relaxed);
while (old >= bytes) {
if (m_totalGpuCacheBytes.compare_exchange_weak(old, old - bytes,
std::memory_order_relaxed))
break;
}
}
size_t totalGpuCacheBytes() const {
return m_totalGpuCacheBytes.load(std::memory_order_relaxed);
}
void setGpuCacheBudget(size_t bytes) { m_gpuCacheBudget = bytes; }
size_t gpuCacheBudget() const { return m_gpuCacheBudget; }
private:
ANSGpuFrameRegistry() = default;
#ifdef _WIN32
static ANSGpuFrameRegistry* resolveProcessWide();
#endif
// Free malloc'd CPU NV12 buffers and GPU cache (but NOT avframe/cpuAvframe —
// those go to pendingFree for the caller to av_frame_free).
void freeOwnedBuffers_locked(GpuFrameData* frame) {
if (frame->cpuYPlane) {
std::free(frame->cpuYPlane);
frame->cpuYPlane = nullptr;
}
if (frame->cpuUvPlane) {
std::free(frame->cpuUvPlane);
frame->cpuUvPlane = nullptr;
}
// GPU cache freed via CUDA — caller (ANSODEngine) must handle this
// since we can't call cudaFree from this FFmpeg-free header.
// The gpuCacheBytes are tracked; actual deallocation happens in
// NV12PreprocessHelper or a GPU-aware cleanup path.
if (frame->gpuCacheBytes > 0) {
onGpuCacheFreed(frame->gpuCacheBytes);
// Mark as invalid so no one reads stale pointers
frame->gpuCacheValid = false;
frame->gpuCacheBytes = 0;
// NOTE: gpuCacheY/gpuCacheUV device pointers are leaked here
// unless the caller handles GPU cleanup. This is addressed in
// Step 8 (NV12PreprocessHelper) where cudaFree is available.
// For now, push to a separate GPU-free list.
if (frame->gpuCacheY)
m_pendingGpuFree.push_back(frame->gpuCacheY);
if (frame->gpuCacheUV)
m_pendingGpuFree.push_back(frame->gpuCacheUV);
frame->gpuCacheY = nullptr;
frame->gpuCacheUV = nullptr;
}
}
std::mutex m_mutex;
std::unordered_map<cv::Mat*, GpuFrameData*> m_map;
std::unordered_set<GpuFrameData*> m_frameSet; // All unique frames (for TTL scan)
std::vector<void*> m_pendingFree; // AVFrame* pointers to av_frame_free
std::vector<void*> m_pendingGpuFree; // CUDA device pointers to cudaFree
std::atomic<size_t> m_totalGpuCacheBytes{0};
size_t m_gpuCacheBudget = GPU_CACHE_BUDGET_DEFAULT;
std::chrono::steady_clock::time_point m_lastEvictCheck;
};
// ── Convenience free functions (FFmpeg-agnostic) ────────────────────────
// Find the GpuFrameData registered for this cv::Mat* (primary key).
// Returns nullptr when the Mat has no registered frame.
inline GpuFrameData* gpu_frame_lookup(cv::Mat* mat) {
    auto& registry = ANSGpuFrameRegistry::instance();
    return registry.lookup(mat);
}
// Backward-compat overload: find a frame by its Mat's datastart pointer.
// Linear scan over all entries — O(n), keep out of the hot path.
inline GpuFrameData* gpu_frame_lookup(const uchar* datastart) {
    auto& registry = ANSGpuFrameRegistry::instance();
    return registry.lookup_by_datastart(datastart);
}
// Link a cloned Mat* to the same shared GpuFrameData as its source Mat*.
// Returns false if src is unregistered or the refcount cap was hit.
inline bool gpu_frame_addref(cv::Mat* src, cv::Mat* dst) {
    auto& registry = ANSGpuFrameRegistry::instance();
    return registry.addRef(src, dst);
}
// Take ownership of the queued CUDA device pointers awaiting deallocation.
// The caller is responsible for cudaFree-ing every returned pointer.
inline std::vector<void*> gpu_frame_drain_gpu_pending() {
    auto& registry = ANSGpuFrameRegistry::instance();
    return registry.drain_gpu_pending();
}