Fix NV12 crash issue when recreating the camera object
(the previous "new structure" approach did not work)
This commit is contained in:
@@ -42,8 +42,10 @@
|
||||
#include <windows.h>
|
||||
#endif
|
||||
|
||||
// Debug logging for registry operations — both stderr and OutputDebugString.
|
||||
// Debug logging for registry operations.
|
||||
// Define ANSCORE_GPU_DEBUG=1 to enable verbose per-frame GPU logging.
|
||||
#ifndef REG_DBG
|
||||
#if defined(ANSCORE_GPU_DEBUG) && ANSCORE_GPU_DEBUG
|
||||
#ifdef _WIN32
|
||||
#define REG_DBG(fmt, ...) do { \
|
||||
char _reg_buf[512]; \
|
||||
@@ -54,7 +56,13 @@
|
||||
#else
|
||||
#define REG_DBG(fmt, ...) fprintf(stderr, "[Registry] " fmt "\n", ##__VA_ARGS__)
|
||||
#endif
|
||||
#else
|
||||
#define REG_DBG(fmt, ...) ((void)0)
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// GpuNV12Slot definition needed by freeOwnedBuffers_locked() (accesses inUse atomic).
|
||||
#include "GpuNV12SlotPool.h"
|
||||
|
||||
// Safety constants
|
||||
static constexpr int MAX_FRAME_REFCOUNT = 64;
|
||||
@@ -66,6 +74,7 @@ static constexpr int EVICT_CHECK_INTERVAL_MS = 500;
|
||||
// One GPU device pointer queued for deferred cudaFree by the caller.
// The caller must cudaSetDevice(deviceIdx) before freeing ptr.
struct GpuPendingFreeEntry {
    void* ptr = nullptr;   // Device pointer to free (cudaMalloc'd buffer)
    int deviceIdx = -1;    // CUDA device index the pointer was allocated on
    std::chrono::steady_clock::time_point queuedAt; // When this entry was queued
};
|
||||
|
||||
struct GpuFrameData {
|
||||
@@ -116,6 +125,13 @@ struct GpuFrameData {
|
||||
void* ownerClient = nullptr;
|
||||
void (*onReleaseFn)(void*) = nullptr;
|
||||
|
||||
// --- Global pool slot (from GpuNV12SlotPool) ---
|
||||
// When non-null, yPlane/uvPlane point into this slot's buffers.
|
||||
// Released (slot->inUse = false) in freeOwnedBuffers_locked() when
|
||||
// the frame's refcount drops to 0 — guarantees the buffer is not
|
||||
// freed while any consumer is still reading it.
|
||||
GpuNV12Slot* poolSlot = nullptr;
|
||||
|
||||
// Default constructor
|
||||
GpuFrameData() = default;
|
||||
|
||||
@@ -134,6 +150,7 @@ struct GpuFrameData {
|
||||
, yLinesize(o.yLinesize), uvLinesize(o.uvLinesize)
|
||||
, refcount(o.refcount.load()), createdAt(o.createdAt)
|
||||
, ownerClient(o.ownerClient), onReleaseFn(o.onReleaseFn)
|
||||
, poolSlot(o.poolSlot)
|
||||
{
|
||||
// Null out source to prevent double-free of owned pointers
|
||||
o.cpuYPlane = nullptr;
|
||||
@@ -147,6 +164,7 @@ struct GpuFrameData {
|
||||
o.gpuCacheBytes = 0;
|
||||
o.ownerClient = nullptr;
|
||||
o.onReleaseFn = nullptr;
|
||||
o.poolSlot = nullptr;
|
||||
}
|
||||
|
||||
// No copy
|
||||
@@ -344,11 +362,30 @@ public:
|
||||
|
||||
// --- Drain pending GPU device pointers for caller to cudaFree ---
|
||||
// Each entry includes the device index for cudaSetDevice before cudaFree.
|
||||
std::vector<GpuPendingFreeEntry> drain_gpu_pending() {
|
||||
// If minAgeMs > 0, only drain entries older than minAgeMs milliseconds.
|
||||
// This allows time-based safety: entries queued >100ms ago are guaranteed
|
||||
// safe to free because all CUDA kernels complete in <10ms.
|
||||
std::vector<GpuPendingFreeEntry> drain_gpu_pending(int minAgeMs = 0) {
|
||||
std::lock_guard<std::mutex> lock(m_mutex);
|
||||
std::vector<GpuPendingFreeEntry> result;
|
||||
result.swap(m_pendingGpuFree);
|
||||
return result;
|
||||
if (minAgeMs <= 0) {
|
||||
// Drain all (used by Destroy/Reconnect with cudaDeviceSynchronize)
|
||||
std::vector<GpuPendingFreeEntry> result;
|
||||
result.swap(m_pendingGpuFree);
|
||||
return result;
|
||||
}
|
||||
// Drain only entries older than minAgeMs
|
||||
auto now = std::chrono::steady_clock::now();
|
||||
auto threshold = std::chrono::milliseconds(minAgeMs);
|
||||
std::vector<GpuPendingFreeEntry> ready;
|
||||
std::vector<GpuPendingFreeEntry> notReady;
|
||||
for (auto& entry : m_pendingGpuFree) {
|
||||
if (now - entry.queuedAt >= threshold)
|
||||
ready.push_back(entry);
|
||||
else
|
||||
notReady.push_back(entry);
|
||||
}
|
||||
m_pendingGpuFree = std::move(notReady);
|
||||
return ready;
|
||||
}
|
||||
|
||||
// --- TTL eviction: force-free frames older than FRAME_TTL_SECONDS ---
|
||||
@@ -506,10 +543,23 @@ private:
|
||||
// Free malloc'd CPU NV12 buffers and GPU cache (but NOT avframe/cpuAvframe —
|
||||
// those go to pendingFree for the caller to av_frame_free).
|
||||
void freeOwnedBuffers_locked(GpuFrameData* frame) {
|
||||
REG_DBG("freeOwnedBuffers: frame=%p cpuY=%p cpuUV=%p gpuCacheY=%p gpuCacheUV=%p bytes=%zu dev=%d",
|
||||
REG_DBG("freeOwnedBuffers: frame=%p cpuY=%p cpuUV=%p gpuCacheY=%p gpuCacheUV=%p bytes=%zu dev=%d poolSlot=%p",
|
||||
(void*)frame, (void*)frame->cpuYPlane, (void*)frame->cpuUvPlane,
|
||||
frame->gpuCacheY, frame->gpuCacheUV,
|
||||
frame->gpuCacheBytes, frame->gpuCacheDeviceIdx);
|
||||
frame->gpuCacheBytes, frame->gpuCacheDeviceIdx, (void*)frame->poolSlot);
|
||||
// Release global pool slot via DEFERRED release — the slot enters a
// "cooling" state for SLOT_COOLDOWN_MS (50ms) before it becomes
// available for reuse. This guarantees that any in-flight GPU kernels
// (launched asynchronously by inference engines) have completed reading
// from the buffer. CPU refcount→0 does NOT mean the GPU is done.
|
||||
if (frame->poolSlot) {
|
||||
GpuNV12SlotPool::deferRelease(frame->poolSlot);
|
||||
frame->poolSlot = nullptr;
|
||||
// yPlane/uvPlane pointed into the pool slot — null them to
|
||||
// prevent any stale reads after this point.
|
||||
frame->yPlane = nullptr;
|
||||
frame->uvPlane = nullptr;
|
||||
}
|
||||
if (frame->cpuYPlane) {
|
||||
std::free(frame->cpuYPlane);
|
||||
frame->cpuYPlane = nullptr;
|
||||
@@ -525,10 +575,11 @@ private:
|
||||
frame->gpuCacheValid = false;
|
||||
frame->gpuCacheBytes = 0;
|
||||
int devIdx = frame->gpuCacheDeviceIdx;
|
||||
auto now = std::chrono::steady_clock::now();
|
||||
if (frame->gpuCacheY)
|
||||
m_pendingGpuFree.push_back({frame->gpuCacheY, devIdx});
|
||||
m_pendingGpuFree.push_back({frame->gpuCacheY, devIdx, now});
|
||||
if (frame->gpuCacheUV)
|
||||
m_pendingGpuFree.push_back({frame->gpuCacheUV, devIdx});
|
||||
m_pendingGpuFree.push_back({frame->gpuCacheUV, devIdx, now});
|
||||
frame->gpuCacheY = nullptr;
|
||||
frame->gpuCacheUV = nullptr;
|
||||
}
|
||||
|
||||
161
include/GpuNV12SlotPool.h
Normal file
161
include/GpuNV12SlotPool.h
Normal file
@@ -0,0 +1,161 @@
|
||||
#pragma once
|
||||
// GpuNV12SlotPool.h — Process-wide GPU NV12 buffer pool.
|
||||
//
|
||||
// Provides pre-allocated CUDA buffer slots (Y + UV planes) that are shared
|
||||
// across all RTSP camera instances. Slots are acquired per-frame by
|
||||
// GetRTSPCVImage and released back to the pool when the GpuFrameData's
|
||||
// refcount drops to 0 in freeOwnedBuffers_locked().
|
||||
//
|
||||
// KEY DESIGN: Slots are NEVER freed when a camera is destroyed — they are
|
||||
// recycled. This decouples GPU buffer lifetime from camera lifetime, so
|
||||
// inference engines can safely read NV12 data even after the camera object
|
||||
// that produced it has been deleted and recreated (the LabVIEW reconnect
|
||||
// pattern: ReleaseHandle → Destroy → delete → CreateHandle).
|
||||
//
|
||||
// TIME-DELAYED RELEASE: When a GpuFrameData's refcount drops to 0, the
|
||||
// slot is NOT immediately available. It enters a "cooling" state for
|
||||
// SLOT_COOLDOWN_MS (50ms) to guarantee that any in-flight GPU kernels
|
||||
// (launched asynchronously by inference engines) have completed reading
|
||||
// from the buffer. CUDA kernels typically complete in <10ms, so 50ms
|
||||
// provides a 5x safety margin. The cooldown is kept short to minimize
|
||||
// the number of slots in COOLING, which prevents POOL FULL events.
|
||||
// POOL FULL triggers per-frame cudaMalloc/cudaFree, which holds the
|
||||
// nvcuda64 SRW lock and causes cascading stalls on other cameras'
|
||||
// cudaMemcpy2D operations.
|
||||
//
|
||||
// Thread-safe: acquire() locks internally, deferRelease() is lock-free.
|
||||
//
|
||||
// Cross-DLL: uses the same resolveProcessWide() singleton pattern as
|
||||
// ANSGpuFrameRegistry. ANSCV.dll owns the canonical instance; other DLLs
|
||||
// find it via GetProcAddress("GpuNV12SlotPool_GetInstance").
|
||||
|
||||
#include <vector>
|
||||
#include <memory>
|
||||
#include <mutex>
|
||||
#include <atomic>
|
||||
#include <cstdint>
|
||||
#include <cstdio>
|
||||
#include <chrono>
|
||||
|
||||
#ifdef _WIN32
|
||||
#include <windows.h>
|
||||
#endif
|
||||
|
||||
// Safety constants
|
||||
static constexpr int GPU_NV12_POOL_MAX_SLOTS = 64;
|
||||
static constexpr int SLOT_COOLDOWN_MS = 50; // Time after CPU release before slot reuse
|
||||
// GPU kernels complete in <10ms; 50ms = 5× margin
|
||||
|
||||
// Debug logging for pool operations.
|
||||
// Define ANSCORE_GPU_DEBUG=1 to enable verbose per-frame GPU logging.
|
||||
// In production, these are silent to avoid OutputDebugString/fprintf
|
||||
// lock contention (measured: 500-2000 calls/sec causes process stalls).
|
||||
#ifndef NV12POOL_DBG
|
||||
#if defined(ANSCORE_GPU_DEBUG) && ANSCORE_GPU_DEBUG
|
||||
#ifdef _WIN32
|
||||
#define NV12POOL_DBG(fmt, ...) do { \
|
||||
char _p_buf[512]; \
|
||||
snprintf(_p_buf, sizeof(_p_buf), "[NV12Pool] " fmt "\n", ##__VA_ARGS__); \
|
||||
OutputDebugStringA(_p_buf); \
|
||||
fprintf(stderr, "%s", _p_buf); \
|
||||
} while(0)
|
||||
#else
|
||||
#define NV12POOL_DBG(fmt, ...) fprintf(stderr, "[NV12Pool] " fmt "\n", ##__VA_ARGS__)
|
||||
#endif
|
||||
#else
|
||||
#define NV12POOL_DBG(fmt, ...) ((void)0)
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// One pre-allocated GPU NV12 buffer pair (Y + UV planes) owned by the pool.
// Pointers are pitched CUDA allocations; lifetime is managed by the pool,
// never by the camera that acquired the slot.
struct GpuNV12Slot {
    void* bufY = nullptr;  // cudaMallocPitch'd Y plane
    void* bufUV = nullptr; // cudaMallocPitch'd UV plane
    size_t pitchY = 0;     // Row pitch of bufY in bytes
    size_t pitchUV = 0;    // Row pitch of bufUV in bytes
    int width = 0;         // Resolution this slot was allocated for
    int height = 0;
    int gpuIdx = -1;       // GPU device index

    // Slot lifecycle state:
    //   FREE    (0) = available for acquire()
    //   ACTIVE  (1) = owned by a GpuFrameData (D2D copy + inference reading)
    //   COOLING (2) = CPU released but GPU kernel may still be reading;
    //                 becomes FREE after SLOT_COOLDOWN_MS elapses.
    static constexpr int STATE_FREE = 0;
    static constexpr int STATE_ACTIVE = 1;
    static constexpr int STATE_COOLING = 2;
    std::atomic<int> state{STATE_FREE};

    // Timestamp when the slot entered COOLING state.
    // Only meaningful when state == STATE_COOLING.
    std::chrono::steady_clock::time_point cooldownStart;

    // Per-slot CUDA stream for D2D copy (non-blocking).
    // CRITICAL: cudaMemcpy2D (no stream arg) uses the NULL stream, which on
    // WDDM implicitly synchronizes with ALL other streams before executing.
    // This means the D2D copy must wait for all inference kernels to finish
    // first — causing 1-2 second stalls. Using a dedicated non-blocking
    // stream avoids this implicit sync entirely.
    // Stored as void* to avoid pulling cuda_runtime.h into the header.
    void* copyStream = nullptr; // cudaStream_t
};
|
||||
|
||||
// Process-wide pool of reusable GPU NV12 slots.
// Thread-safe: acquire() and the diagnostic counters lock m_mutex;
// deferRelease() is lock-free (atomic state transition).
class GpuNV12SlotPool {
public:
    // Process-wide singleton (same pattern as ANSGpuFrameRegistry).
    // On Windows the canonical instance is resolved across DLL boundaries
    // via resolveProcessWide(); elsewhere a plain function-local static.
    static GpuNV12SlotPool& instance() {
#ifdef _WIN32
        static GpuNV12SlotPool* s_inst = resolveProcessWide();
        return *s_inst;
#else
        static GpuNV12SlotPool pool;
        return pool;
#endif
    }

    // Acquire a free slot matching (gpuIdx, w, h).
    // Drains cooled-down slots first, then looks for a FREE match.
    // If none, allocates a new one (up to GPU_NV12_POOL_MAX_SLOTS).
    // Returns nullptr if pool full — caller falls back to CPU path.
    GpuNV12Slot* acquire(int gpuIdx, int w, int h);

    // Deferred release: moves slot from ACTIVE → COOLING.
    // Called from freeOwnedBuffers_locked() when GpuFrameData refcount → 0.
    // The slot becomes FREE after SLOT_COOLDOWN_MS elapses (checked in acquire).
    // Lock-free publish: cooldownStart is written BEFORE the release-store on
    // state, so any reader that observes COOLING also sees a valid timestamp.
    static void deferRelease(GpuNV12Slot* slot) {
        if (slot) {
            slot->cooldownStart = std::chrono::steady_clock::now();
            slot->state.store(GpuNV12Slot::STATE_COOLING, std::memory_order_release);
        }
    }

    // Number of allocated slots (for diagnostics).
    size_t slotCount() const {
        std::lock_guard<std::mutex> lock(m_mutex);
        return m_slots.size();
    }

    // Number of in-use slots — any state other than FREE, i.e. ACTIVE or
    // COOLING (for diagnostics).
    size_t activeCount() const {
        std::lock_guard<std::mutex> lock(m_mutex);
        size_t count = 0;
        for (auto& s : m_slots) {
            if (s->state.load(std::memory_order_relaxed) != GpuNV12Slot::STATE_FREE) ++count;
        }
        return count;
    }

private:
    GpuNV12SlotPool() = default;

#ifdef _WIN32
    // Cross-DLL singleton resolution: ANSCV.dll owns the canonical instance;
    // other DLLs find it via GetProcAddress("GpuNV12SlotPool_GetInstance").
    static GpuNV12SlotPool* resolveProcessWide();
#endif

    // Transition all COOLING slots that have exceeded SLOT_COOLDOWN_MS to FREE.
    // Called at the start of acquire() under the lock.
    void drainCooledSlots_locked();

    // mutable so the const diagnostic accessors above can take the lock.
    mutable std::mutex m_mutex;
    std::vector<std::unique_ptr<GpuNV12Slot>> m_slots;
};
|
||||
Reference in New Issue
Block a user