Fix NV12 crash when recreating the camera object
This commit is contained in:
@@ -34,15 +34,40 @@
|
||||
#include <atomic>
|
||||
#include <cstdint>
|
||||
#include <cstdlib>
|
||||
#include <cstdio>
|
||||
#include <chrono>
|
||||
#include <opencv2/core/mat.hpp>
|
||||
|
||||
#ifdef _WIN32
|
||||
#include <windows.h>
|
||||
#endif
|
||||
|
||||
// Debug logging for registry operations — both stderr and OutputDebugString.
//
// On Windows the message is formatted once into a fixed 512-byte stack
// buffer (snprintf truncates safely if the message is longer), then sent to
// the VS/DebugView channel via OutputDebugStringA AND echoed to stderr.
// On other platforms it is a single fprintf to stderr.
//
// NOTE(review): `##__VA_ARGS__` (swallow the comma when no varargs are
// passed) is a GNU/MSVC extension, not standard C++17 — confirm every
// target compiler accepts it.
// The #ifndef guard lets a translation unit pre-define its own REG_DBG
// (e.g. a no-op) before including this header.
#ifndef REG_DBG
#ifdef _WIN32
// do { ... } while(0) makes the macro behave as a single statement
// (safe inside unbraced if/else).
#define REG_DBG(fmt, ...) do { \
    char _reg_buf[512]; \
    snprintf(_reg_buf, sizeof(_reg_buf), "[Registry] " fmt "\n", ##__VA_ARGS__); \
    OutputDebugStringA(_reg_buf); \
    fprintf(stderr, "%s", _reg_buf); \
} while(0)
#else
#define REG_DBG(fmt, ...) fprintf(stderr, "[Registry] " fmt "\n", ##__VA_ARGS__)
#endif
#endif
|
||||
|
||||
// Safety constants
// Sanity cap on a frame's refcount — NOTE(review): the enforcement site is
// not visible in this view; confirm where attach/addref checks it.
static constexpr int MAX_FRAME_REFCOUNT = 64;
// Frames older than this (with live references) are force-evicted by the
// TTL scan, which then notifies their owner callbacks.
static constexpr int FRAME_TTL_SECONDS = 3;
// Default VRAM budget for cached NV12 planes (see setGpuCacheBudget()).
static constexpr size_t GPU_CACHE_BUDGET_DEFAULT = 1ULL * 1024 * 1024 * 1024; // 1GB
// Presumably the throttle interval between eviction scans (paired with
// m_lastEvictCheck) — TODO confirm against the evict path, not visible here.
static constexpr int EVICT_CHECK_INTERVAL_MS = 500;
|
||||
|
||||
// Entry for deferred GPU memory deallocation (tracks device index for cudaSetDevice).
// This header stays CUDA-free, so frames push their device pointers here and a
// GPU-aware caller later drains the list, doing cudaSetDevice(deviceIdx)
// followed by cudaFree(ptr) for each entry.
struct GpuPendingFreeEntry {
    void* ptr = nullptr;  // CUDA device pointer to free; nullptr = empty entry
    int deviceIdx = -1;   // CUDA device ordinal the pointer was allocated on; -1 = unknown
};
|
||||
|
||||
struct GpuFrameData {
|
||||
// --- CPU NV12 snapshot (OWNED malloc'd buffers, independent of decoder) ---
|
||||
uint8_t* cpuYPlane = nullptr; // malloc'd Y plane copy
|
||||
@@ -83,6 +108,14 @@ struct GpuFrameData {
|
||||
std::atomic<int> refcount{1};
|
||||
std::chrono::steady_clock::time_point createdAt;
|
||||
|
||||
// --- Owner callback (for per-client inference guard) ---
|
||||
// When the last reference to this frame drops, onReleaseFn is called
|
||||
// with ownerClient to decrement the RTSP client's in-flight counter.
|
||||
// This lets Destroy() wait for in-flight inference to finish before
|
||||
// freeing NVDEC surfaces (fixes LabVIEW crash).
|
||||
void* ownerClient = nullptr;
|
||||
void (*onReleaseFn)(void*) = nullptr;
|
||||
|
||||
// Default constructor
|
||||
GpuFrameData() = default;
|
||||
|
||||
@@ -100,6 +133,7 @@ struct GpuFrameData {
|
||||
, yPlane(o.yPlane), uvPlane(o.uvPlane)
|
||||
, yLinesize(o.yLinesize), uvLinesize(o.uvLinesize)
|
||||
, refcount(o.refcount.load()), createdAt(o.createdAt)
|
||||
, ownerClient(o.ownerClient), onReleaseFn(o.onReleaseFn)
|
||||
{
|
||||
// Null out source to prevent double-free of owned pointers
|
||||
o.cpuYPlane = nullptr;
|
||||
@@ -111,6 +145,8 @@ struct GpuFrameData {
|
||||
o.yPlane = nullptr;
|
||||
o.uvPlane = nullptr;
|
||||
o.gpuCacheBytes = 0;
|
||||
o.ownerClient = nullptr;
|
||||
o.onReleaseFn = nullptr;
|
||||
}
|
||||
|
||||
// No copy
|
||||
@@ -140,32 +176,50 @@ public:
|
||||
if (!mat) return nullptr;
|
||||
void* oldAvframe = nullptr;
|
||||
|
||||
// Capture old frame's owner callback to invoke OUTSIDE m_mutex
|
||||
void* oldOwner = nullptr;
|
||||
void (*oldReleaseFn)(void*) = nullptr;
|
||||
|
||||
data.createdAt = std::chrono::steady_clock::now();
|
||||
data.refcount.store(1);
|
||||
|
||||
auto* heapData = new GpuFrameData(std::move(data));
|
||||
REG_DBG("attach mat=%p new frame=%p yPlane=%p gpuCacheY=%p isCuda=%d %dx%d",
|
||||
(void*)mat, (void*)heapData,
|
||||
(void*)heapData->yPlane, heapData->gpuCacheY,
|
||||
(int)heapData->isCudaDevicePtr,
|
||||
heapData->width, heapData->height);
|
||||
|
||||
std::lock_guard<std::mutex> lock(m_mutex);
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(m_mutex);
|
||||
|
||||
// If this Mat* already has an entry, release the old one
|
||||
auto it = m_map.find(mat);
|
||||
if (it != m_map.end()) {
|
||||
auto* oldFrame = it->second;
|
||||
int oldRef = oldFrame->refcount.fetch_sub(1);
|
||||
if (oldRef <= 1) {
|
||||
oldAvframe = oldFrame->avframe;
|
||||
if (oldFrame->cpuAvframe)
|
||||
m_pendingFree.push_back(oldFrame->cpuAvframe);
|
||||
freeOwnedBuffers_locked(oldFrame);
|
||||
m_frameSet.erase(oldFrame);
|
||||
delete oldFrame;
|
||||
// If this Mat* already has an entry, release the old one
|
||||
auto it = m_map.find(mat);
|
||||
if (it != m_map.end()) {
|
||||
auto* oldFrame = it->second;
|
||||
int oldRef = oldFrame->refcount.fetch_sub(1);
|
||||
if (oldRef <= 1) {
|
||||
oldOwner = oldFrame->ownerClient;
|
||||
oldReleaseFn = oldFrame->onReleaseFn;
|
||||
oldAvframe = oldFrame->avframe;
|
||||
if (oldFrame->cpuAvframe)
|
||||
m_pendingFree.push_back(oldFrame->cpuAvframe);
|
||||
freeOwnedBuffers_locked(oldFrame);
|
||||
m_frameSet.erase(oldFrame);
|
||||
delete oldFrame;
|
||||
}
|
||||
// If oldRef > 1, other clones still reference it — just unlink this Mat*
|
||||
m_map.erase(it);
|
||||
}
|
||||
// If oldRef > 1, other clones still reference it — just unlink this Mat*
|
||||
m_map.erase(it);
|
||||
|
||||
m_map[mat] = heapData;
|
||||
m_frameSet.insert(heapData);
|
||||
}
|
||||
|
||||
m_map[mat] = heapData;
|
||||
m_frameSet.insert(heapData);
|
||||
// Notify old frame's owner OUTSIDE m_mutex
|
||||
if (oldReleaseFn && oldOwner) {
|
||||
oldReleaseFn(oldOwner);
|
||||
}
|
||||
|
||||
return oldAvframe; // Caller must av_frame_free if non-null
|
||||
}
|
||||
@@ -197,24 +251,46 @@ public:
|
||||
void release(cv::Mat* mat) {
|
||||
if (!mat) return;
|
||||
|
||||
std::lock_guard<std::mutex> lock(m_mutex);
|
||||
// Capture owner callback to invoke OUTSIDE m_mutex (deadlock safety)
|
||||
void* owner = nullptr;
|
||||
void (*releaseFn)(void*) = nullptr;
|
||||
|
||||
auto it = m_map.find(mat);
|
||||
if (it == m_map.end()) return;
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(m_mutex);
|
||||
|
||||
auto* frame = it->second;
|
||||
m_map.erase(it);
|
||||
auto it = m_map.find(mat);
|
||||
if (it == m_map.end()) return;
|
||||
|
||||
int oldRef = frame->refcount.fetch_sub(1);
|
||||
if (oldRef <= 1) {
|
||||
// Last reference — free everything
|
||||
if (frame->avframe)
|
||||
m_pendingFree.push_back(frame->avframe);
|
||||
if (frame->cpuAvframe)
|
||||
m_pendingFree.push_back(frame->cpuAvframe);
|
||||
freeOwnedBuffers_locked(frame);
|
||||
m_frameSet.erase(frame);
|
||||
delete frame;
|
||||
auto* frame = it->second;
|
||||
m_map.erase(it);
|
||||
|
||||
int oldRef = frame->refcount.fetch_sub(1);
|
||||
REG_DBG("release mat=%p refcount %d->%d yPlane=%p gpuCacheY=%p owner=%p",
|
||||
(void*)mat, oldRef, oldRef - 1,
|
||||
(void*)frame->yPlane, frame->gpuCacheY, frame->ownerClient);
|
||||
if (oldRef <= 1) {
|
||||
// Capture owner callback before deleting frame
|
||||
owner = frame->ownerClient;
|
||||
releaseFn = frame->onReleaseFn;
|
||||
REG_DBG("LAST REF — freeing frame=%p cpuY=%p gpuCacheY=%p gpuCacheUV=%p bytes=%zu",
|
||||
(void*)frame, (void*)frame->cpuYPlane,
|
||||
frame->gpuCacheY, frame->gpuCacheUV, frame->gpuCacheBytes);
|
||||
// Last reference — free everything
|
||||
if (frame->avframe)
|
||||
m_pendingFree.push_back(frame->avframe);
|
||||
if (frame->cpuAvframe)
|
||||
m_pendingFree.push_back(frame->cpuAvframe);
|
||||
freeOwnedBuffers_locked(frame);
|
||||
m_frameSet.erase(frame);
|
||||
delete frame;
|
||||
}
|
||||
}
|
||||
|
||||
// Notify owner OUTSIDE m_mutex — prevents lock-ordering deadlock
|
||||
// with ANSRTSPClient::_mutex (used by Destroy's condition_variable wait)
|
||||
if (releaseFn && owner) {
|
||||
REG_DBG("calling onReleaseFn owner=%p", owner);
|
||||
releaseFn(owner);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -267,9 +343,10 @@ public:
|
||||
}
|
||||
|
||||
// --- Drain pending GPU device pointers for caller to cudaFree ---
|
||||
std::vector<void*> drain_gpu_pending() {
|
||||
// Each entry includes the device index for cudaSetDevice before cudaFree.
|
||||
std::vector<GpuPendingFreeEntry> drain_gpu_pending() {
|
||||
std::lock_guard<std::mutex> lock(m_mutex);
|
||||
std::vector<void*> result;
|
||||
std::vector<GpuPendingFreeEntry> result;
|
||||
result.swap(m_pendingGpuFree);
|
||||
return result;
|
||||
}
|
||||
@@ -287,31 +364,46 @@ public:
|
||||
m_lastEvictCheck = now;
|
||||
}
|
||||
|
||||
std::lock_guard<std::mutex> lock(m_mutex);
|
||||
for (auto it = m_frameSet.begin(); it != m_frameSet.end(); ) {
|
||||
auto* frame = *it;
|
||||
auto age_s = std::chrono::duration_cast<std::chrono::seconds>(
|
||||
now - frame->createdAt).count();
|
||||
if (age_s > FRAME_TTL_SECONDS && frame->refcount.load() > 0) {
|
||||
// Force cleanup — remove all Mat* keys pointing to this frame
|
||||
for (auto jt = m_map.begin(); jt != m_map.end(); ) {
|
||||
if (jt->second == frame)
|
||||
jt = m_map.erase(jt);
|
||||
else
|
||||
++jt;
|
||||
// Collect owner callbacks to invoke OUTSIDE m_mutex
|
||||
struct OwnerCallback { void* client; void (*fn)(void*); };
|
||||
std::vector<OwnerCallback> callbacks;
|
||||
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(m_mutex);
|
||||
for (auto it = m_frameSet.begin(); it != m_frameSet.end(); ) {
|
||||
auto* frame = *it;
|
||||
auto age_s = std::chrono::duration_cast<std::chrono::seconds>(
|
||||
now - frame->createdAt).count();
|
||||
if (age_s > FRAME_TTL_SECONDS && frame->refcount.load() > 0) {
|
||||
// Capture owner callback before deleting
|
||||
if (frame->onReleaseFn && frame->ownerClient) {
|
||||
callbacks.push_back({frame->ownerClient, frame->onReleaseFn});
|
||||
}
|
||||
// Force cleanup — remove all Mat* keys pointing to this frame
|
||||
for (auto jt = m_map.begin(); jt != m_map.end(); ) {
|
||||
if (jt->second == frame)
|
||||
jt = m_map.erase(jt);
|
||||
else
|
||||
++jt;
|
||||
}
|
||||
// Push avframes to pendingFree
|
||||
if (frame->avframe)
|
||||
m_pendingFree.push_back(frame->avframe);
|
||||
if (frame->cpuAvframe)
|
||||
m_pendingFree.push_back(frame->cpuAvframe);
|
||||
freeOwnedBuffers_locked(frame);
|
||||
it = m_frameSet.erase(it);
|
||||
delete frame;
|
||||
} else {
|
||||
++it;
|
||||
}
|
||||
// Push avframes to pendingFree
|
||||
if (frame->avframe)
|
||||
m_pendingFree.push_back(frame->avframe);
|
||||
if (frame->cpuAvframe)
|
||||
m_pendingFree.push_back(frame->cpuAvframe);
|
||||
freeOwnedBuffers_locked(frame);
|
||||
it = m_frameSet.erase(it);
|
||||
delete frame;
|
||||
} else {
|
||||
++it;
|
||||
}
|
||||
}
|
||||
|
||||
// Notify owners OUTSIDE m_mutex
|
||||
for (auto& cb : callbacks) {
|
||||
cb.fn(cb.client);
|
||||
}
|
||||
}
|
||||
|
||||
// --- VRAM budget management ---
|
||||
@@ -340,6 +432,70 @@ public:
|
||||
void setGpuCacheBudget(size_t bytes) { m_gpuCacheBudget = bytes; }
|
||||
size_t gpuCacheBudget() const { return m_gpuCacheBudget; }
|
||||
|
||||
// --- Invalidate owner: nullify all callbacks for a client being destroyed ---
|
||||
// Called by Destroy() on timeout to prevent callbacks into a deleted object.
|
||||
void invalidateOwner(void* client) {
|
||||
if (!client) return;
|
||||
std::lock_guard<std::mutex> lock(m_mutex);
|
||||
for (auto* frame : m_frameSet) {
|
||||
if (frame->ownerClient == client) {
|
||||
frame->ownerClient = nullptr;
|
||||
frame->onReleaseFn = nullptr;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// --- Force-release all frames owned by a client ---
|
||||
// Called by Destroy() BEFORE close() to free GPU buffers while the CUDA
|
||||
// context is still alive. Without this, unreleased clones (e.g. 70 cloned
|
||||
// images held by LabVIEW AI tasks that haven't finished) keep gpuCacheY/UV
|
||||
// allocated. When close() destroys the CUDA context, those buffers become
|
||||
// orphaned and later cudaFree calls crash.
|
||||
//
|
||||
// This force-frees ALL owned buffers for frames belonging to this client,
|
||||
// removes all Mat* keys pointing to them, and deletes the GpuFrameData.
|
||||
// Returns the number of frames force-released.
|
||||
int forceReleaseByOwner(void* client) {
|
||||
if (!client) return 0;
|
||||
int count = 0;
|
||||
|
||||
std::lock_guard<std::mutex> lock(m_mutex);
|
||||
|
||||
for (auto it = m_frameSet.begin(); it != m_frameSet.end(); ) {
|
||||
auto* frame = *it;
|
||||
if (frame->ownerClient == client) {
|
||||
REG_DBG("forceReleaseByOwner: frame=%p refcount=%d gpuCacheY=%p gpuCacheUV=%p bytes=%zu",
|
||||
(void*)frame, frame->refcount.load(),
|
||||
frame->gpuCacheY, frame->gpuCacheUV, frame->gpuCacheBytes);
|
||||
|
||||
// Remove all Mat* keys pointing to this frame
|
||||
for (auto jt = m_map.begin(); jt != m_map.end(); ) {
|
||||
if (jt->second == frame)
|
||||
jt = m_map.erase(jt);
|
||||
else
|
||||
++jt;
|
||||
}
|
||||
|
||||
// Free owned buffers (CPU + GPU pending)
|
||||
if (frame->avframe)
|
||||
m_pendingFree.push_back(frame->avframe);
|
||||
if (frame->cpuAvframe)
|
||||
m_pendingFree.push_back(frame->cpuAvframe);
|
||||
freeOwnedBuffers_locked(frame);
|
||||
it = m_frameSet.erase(it);
|
||||
delete frame;
|
||||
++count;
|
||||
} else {
|
||||
++it;
|
||||
}
|
||||
}
|
||||
|
||||
if (count > 0) {
|
||||
REG_DBG("forceReleaseByOwner: force-released %d frames for client=%p", count, client);
|
||||
}
|
||||
return count;
|
||||
}
|
||||
|
||||
private:
|
||||
ANSGpuFrameRegistry() = default;
|
||||
|
||||
@@ -350,6 +506,10 @@ private:
|
||||
// Free malloc'd CPU NV12 buffers and GPU cache (but NOT avframe/cpuAvframe —
|
||||
// those go to pendingFree for the caller to av_frame_free).
|
||||
void freeOwnedBuffers_locked(GpuFrameData* frame) {
|
||||
REG_DBG("freeOwnedBuffers: frame=%p cpuY=%p cpuUV=%p gpuCacheY=%p gpuCacheUV=%p bytes=%zu dev=%d",
|
||||
(void*)frame, (void*)frame->cpuYPlane, (void*)frame->cpuUvPlane,
|
||||
frame->gpuCacheY, frame->gpuCacheUV,
|
||||
frame->gpuCacheBytes, frame->gpuCacheDeviceIdx);
|
||||
if (frame->cpuYPlane) {
|
||||
std::free(frame->cpuYPlane);
|
||||
frame->cpuYPlane = nullptr;
|
||||
@@ -358,23 +518,17 @@ private:
|
||||
std::free(frame->cpuUvPlane);
|
||||
frame->cpuUvPlane = nullptr;
|
||||
}
|
||||
// GPU cache freed via CUDA — caller (ANSODEngine) must handle this
|
||||
// since we can't call cudaFree from this FFmpeg-free header.
|
||||
// The gpuCacheBytes are tracked; actual deallocation happens in
|
||||
// NV12PreprocessHelper or a GPU-aware cleanup path.
|
||||
// GPU cache freed via CUDA — push to deferred list with device index
|
||||
// so the caller (ANSGpuFrameOps.h) can cudaSetDevice + cudaFree.
|
||||
if (frame->gpuCacheBytes > 0) {
|
||||
onGpuCacheFreed(frame->gpuCacheBytes);
|
||||
// Mark as invalid so no one reads stale pointers
|
||||
frame->gpuCacheValid = false;
|
||||
frame->gpuCacheBytes = 0;
|
||||
// NOTE: gpuCacheY/gpuCacheUV device pointers are leaked here
|
||||
// unless the caller handles GPU cleanup. This is addressed in
|
||||
// Step 8 (NV12PreprocessHelper) where cudaFree is available.
|
||||
// For now, push to a separate GPU-free list.
|
||||
int devIdx = frame->gpuCacheDeviceIdx;
|
||||
if (frame->gpuCacheY)
|
||||
m_pendingGpuFree.push_back(frame->gpuCacheY);
|
||||
m_pendingGpuFree.push_back({frame->gpuCacheY, devIdx});
|
||||
if (frame->gpuCacheUV)
|
||||
m_pendingGpuFree.push_back(frame->gpuCacheUV);
|
||||
m_pendingGpuFree.push_back({frame->gpuCacheUV, devIdx});
|
||||
frame->gpuCacheY = nullptr;
|
||||
frame->gpuCacheUV = nullptr;
|
||||
}
|
||||
@@ -384,7 +538,7 @@ private:
|
||||
std::unordered_map<cv::Mat*, GpuFrameData*> m_map;
|
||||
std::unordered_set<GpuFrameData*> m_frameSet; // All unique frames (for TTL scan)
|
||||
std::vector<void*> m_pendingFree; // AVFrame* pointers to av_frame_free
|
||||
std::vector<void*> m_pendingGpuFree; // CUDA device pointers to cudaFree
|
||||
std::vector<GpuPendingFreeEntry> m_pendingGpuFree; // CUDA device pointers to cudaFree
|
||||
std::atomic<size_t> m_totalGpuCacheBytes{0};
|
||||
size_t m_gpuCacheBudget = GPU_CACHE_BUDGET_DEFAULT;
|
||||
std::chrono::steady_clock::time_point m_lastEvictCheck;
|
||||
@@ -408,7 +562,7 @@ inline bool gpu_frame_addref(cv::Mat* src, cv::Mat* dst) {
|
||||
}
|
||||
|
||||
// Drain GPU device pointers that need cudaFree.
|
||||
// Caller must cudaFree each returned pointer.
|
||||
inline std::vector<void*> gpu_frame_drain_gpu_pending() {
|
||||
// Caller must cudaSetDevice(entry.deviceIdx) + cudaFree(entry.ptr) for each.
|
||||
inline std::vector<GpuPendingFreeEntry> gpu_frame_drain_gpu_pending() {
|
||||
return ANSGpuFrameRegistry::instance().drain_gpu_pending();
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user