Fix NV12 crash issue when recreating the camera object

(new structure) does not work
This commit is contained in:
2026-04-03 14:51:52 +11:00
parent 958cab6ae3
commit 6fb09830c5
16 changed files with 854 additions and 209 deletions

View File

@@ -42,8 +42,10 @@
#include <windows.h>
#endif
// Debug logging for registry operations — both stderr and OutputDebugString.
// Debug logging for registry operations.
// Define ANSCORE_GPU_DEBUG=1 to enable verbose per-frame GPU logging.
#ifndef REG_DBG
#if defined(ANSCORE_GPU_DEBUG) && ANSCORE_GPU_DEBUG
#ifdef _WIN32
#define REG_DBG(fmt, ...) do { \
char _reg_buf[512]; \
@@ -54,7 +56,13 @@
#else
#define REG_DBG(fmt, ...) fprintf(stderr, "[Registry] " fmt "\n", ##__VA_ARGS__)
#endif
#else
#define REG_DBG(fmt, ...) ((void)0)
#endif
#endif
// GpuNV12Slot definition needed by freeOwnedBuffers_locked() (accesses inUse atomic).
#include "GpuNV12SlotPool.h"
// Safety constants
static constexpr int MAX_FRAME_REFCOUNT = 64;
@@ -66,6 +74,7 @@ static constexpr int EVICT_CHECK_INTERVAL_MS = 500;
// One deferred cudaFree request: a GPU device pointer together with the
// CUDA device index it was allocated on, so the caller can cudaSetDevice
// before cudaFree. queuedAt enables the age-based safety drain in
// drain_gpu_pending(minAgeMs).
// NOTE: instances are aggregate-initialized as {ptr, deviceIdx, now}
// elsewhere in this file — member order is part of the interface.
struct GpuPendingFreeEntry {
void* ptr = nullptr; // device pointer awaiting cudaFree; nullptr = unset
int deviceIdx = -1; // CUDA device ordinal; -1 = unknown/unset
std::chrono::steady_clock::time_point queuedAt; // When this entry was queued
};
struct GpuFrameData {
@@ -116,6 +125,13 @@ struct GpuFrameData {
void* ownerClient = nullptr;
void (*onReleaseFn)(void*) = nullptr;
// --- Global pool slot (from GpuNV12SlotPool) ---
// When non-null, yPlane/uvPlane point into this slot's buffers.
// Released (slot->inUse = false) in freeOwnedBuffers_locked() when
// the frame's refcount drops to 0 — guarantees the buffer is not
// freed while any consumer is still reading it.
GpuNV12Slot* poolSlot = nullptr;
// Default constructor
GpuFrameData() = default;
@@ -134,6 +150,7 @@ struct GpuFrameData {
, yLinesize(o.yLinesize), uvLinesize(o.uvLinesize)
, refcount(o.refcount.load()), createdAt(o.createdAt)
, ownerClient(o.ownerClient), onReleaseFn(o.onReleaseFn)
, poolSlot(o.poolSlot)
{
// Null out source to prevent double-free of owned pointers
o.cpuYPlane = nullptr;
@@ -147,6 +164,7 @@ struct GpuFrameData {
o.gpuCacheBytes = 0;
o.ownerClient = nullptr;
o.onReleaseFn = nullptr;
o.poolSlot = nullptr;
}
// No copy
@@ -344,11 +362,30 @@ public:
// --- Drain pending GPU device pointers for caller to cudaFree ---
// Each entry includes the device index for cudaSetDevice before cudaFree.
std::vector<GpuPendingFreeEntry> drain_gpu_pending() {
// If minAgeMs > 0, only drain entries older than minAgeMs milliseconds.
// This allows time-based safety: entries queued >100ms ago are guaranteed
// safe to free because all CUDA kernels complete in <10ms.
std::vector<GpuPendingFreeEntry> drain_gpu_pending(int minAgeMs = 0) {
std::lock_guard<std::mutex> lock(m_mutex);
std::vector<GpuPendingFreeEntry> result;
result.swap(m_pendingGpuFree);
return result;
if (minAgeMs <= 0) {
// Drain all (used by Destroy/Reconnect with cudaDeviceSynchronize)
std::vector<GpuPendingFreeEntry> result;
result.swap(m_pendingGpuFree);
return result;
}
// Drain only entries older than minAgeMs
auto now = std::chrono::steady_clock::now();
auto threshold = std::chrono::milliseconds(minAgeMs);
std::vector<GpuPendingFreeEntry> ready;
std::vector<GpuPendingFreeEntry> notReady;
for (auto& entry : m_pendingGpuFree) {
if (now - entry.queuedAt >= threshold)
ready.push_back(entry);
else
notReady.push_back(entry);
}
m_pendingGpuFree = std::move(notReady);
return ready;
}
// --- TTL eviction: force-free frames older than FRAME_TTL_SECONDS ---
@@ -506,10 +543,23 @@ private:
// Free malloc'd CPU NV12 buffers and GPU cache (but NOT avframe/cpuAvframe —
// those go to pendingFree for the caller to av_frame_free).
void freeOwnedBuffers_locked(GpuFrameData* frame) {
REG_DBG("freeOwnedBuffers: frame=%p cpuY=%p cpuUV=%p gpuCacheY=%p gpuCacheUV=%p bytes=%zu dev=%d",
REG_DBG("freeOwnedBuffers: frame=%p cpuY=%p cpuUV=%p gpuCacheY=%p gpuCacheUV=%p bytes=%zu dev=%d poolSlot=%p",
(void*)frame, (void*)frame->cpuYPlane, (void*)frame->cpuUvPlane,
frame->gpuCacheY, frame->gpuCacheUV,
frame->gpuCacheBytes, frame->gpuCacheDeviceIdx);
frame->gpuCacheBytes, frame->gpuCacheDeviceIdx, (void*)frame->poolSlot);
// Release global pool slot via DEFERRED release — the slot enters a
// "cooling" state for SLOT_COOLDOWN_MS (200ms) before it becomes
// available for reuse. This guarantees that any in-flight GPU kernels
// (launched asynchronously by inference engines) have completed reading
// from the buffer. CPU refcount→0 does NOT mean the GPU is done.
if (frame->poolSlot) {
GpuNV12SlotPool::deferRelease(frame->poolSlot);
frame->poolSlot = nullptr;
// yPlane/uvPlane pointed into the pool slot — null them to
// prevent any stale reads after this point.
frame->yPlane = nullptr;
frame->uvPlane = nullptr;
}
if (frame->cpuYPlane) {
std::free(frame->cpuYPlane);
frame->cpuYPlane = nullptr;
@@ -525,10 +575,11 @@ private:
frame->gpuCacheValid = false;
frame->gpuCacheBytes = 0;
int devIdx = frame->gpuCacheDeviceIdx;
auto now = std::chrono::steady_clock::now();
if (frame->gpuCacheY)
m_pendingGpuFree.push_back({frame->gpuCacheY, devIdx});
m_pendingGpuFree.push_back({frame->gpuCacheY, devIdx, now});
if (frame->gpuCacheUV)
m_pendingGpuFree.push_back({frame->gpuCacheUV, devIdx});
m_pendingGpuFree.push_back({frame->gpuCacheUV, devIdx, now});
frame->gpuCacheY = nullptr;
frame->gpuCacheUV = nullptr;
}