Fix NV12 crash when recreating the camera object

(new structure) does not work
This commit is contained in:
2026-04-03 14:51:52 +11:00
parent 958cab6ae3
commit 6fb09830c5
16 changed files with 854 additions and 209 deletions

View File

@@ -1,6 +1,7 @@
#include "ANSRTSP.h"
#include "ANSMatRegistry.h"
#include "ANSGpuFrameOps.h"
#include "GpuNV12SlotPool.h"
#include <memory>
#include <format>
#include "media_codec.h"
@@ -23,8 +24,9 @@ extern "C"
// Note: per-instance thread safety is handled by ANSRTSPClient::_mutex
// Mat registry thread safety is handled by anscv_mat_replace's internal registry_mutex
// Debug logging — goes to both stderr AND OutputDebugString (DebugView).
// Debug logging. Define ANSCORE_GPU_DEBUG=1 to enable verbose per-frame logging.
#ifndef RTSP_DBG
#if defined(ANSCORE_GPU_DEBUG) && ANSCORE_GPU_DEBUG
#ifdef _WIN32
#define RTSP_DBG(fmt, ...) do { \
char _rtsp_buf[512]; \
@@ -35,6 +37,9 @@ extern "C"
#else
#define RTSP_DBG(fmt, ...) fprintf(stderr, fmt "\n", ##__VA_ARGS__)
#endif
#else
#define RTSP_DBG(fmt, ...) ((void)0)
#endif
#endif
static bool ansrtspLicenceValid = false;
// Global once_flag to protect license checking
@@ -62,6 +67,7 @@ namespace ANSCENTER {
// Destructor: tears the client down by delegating to Destroy().
// NOTE(review): this destructor is declared noexcept, so any exception that
// escapes Destroy() (which builds log messages with std::format) would end in
// std::terminate — confirm Destroy() cannot throw.
ANSRTSPClient::~ANSRTSPClient() noexcept {
Destroy();
}
void ANSRTSPClient::Destroy() {
// Move the player client pointer out of the lock scope, then
// close it OUTSIDE the mutex. close() calls cuArrayDestroy /
@@ -80,69 +86,44 @@ namespace ANSCENTER {
}
}
// --- Inference guard: wait for in-flight frames to finish ---
// GetRTSPCVImage increments _inFlightFrames when it hands out
// a GPU frame; the registry decrements it when the frame is
// released after inference completes. We wait here so that
// close() doesn't free NVDEC surfaces while TensorRT is
// still reading from them (the LabVIEW crash root cause).
// --- Inference guard: wait for in-flight D2D copies to finish ---
// With synchronous D2D copy, in-flight means "currently inside
// GetRTSPCVImage between TryIncrementInFlight and attach_cuda".
// This is typically <1ms, so the wait is very fast.
int inFlight = _inFlightFrames.load(std::memory_order_acquire);
if (inFlight > 0) {
_logger.LogInfo("ANSRTSPClient::Destroy",
std::format("waiting for {} in-flight inference frame(s)...", inFlight),
std::format("waiting for {} in-flight frame(s)...", inFlight),
__FILE__, __LINE__);
bool done = _inFlightDone.wait_for(lock, std::chrono::seconds(5), [this] {
return _inFlightFrames.load(std::memory_order_acquire) <= 0;
});
if (!done) {
_logger.LogWarn("ANSRTSPClient::Destroy",
std::format("timed out waiting for in-flight frames "
"(still {} in-flight) — force-releasing GPU frames",
_inFlightFrames.load()),
std::format("timed out — still {} in-flight", _inFlightFrames.load()),
__FILE__, __LINE__);
}
}
// Force-release ALL GPU frames owned by this client BEFORE close().
// Unreleased clones (e.g. LabVIEW AI tasks still holding cloned
// cv::Mat*) keep gpuCacheY/gpuCacheUV allocated. We must cudaFree
// them NOW while the CUDA context is still alive. After close()
// destroys the context, cudaFree would crash.
int forceReleased = ANSGpuFrameRegistry::instance().forceReleaseByOwner(this);
if (forceReleased > 0) {
_logger.LogWarn("ANSRTSPClient::Destroy",
std::format("force-released {} GPU frame(s) with unreleased clones", forceReleased),
__FILE__, __LINE__);
// Drain and cudaFree the GPU buffers while CUDA context is alive
// Sync all GPU streams before freeing to avoid illegal access
cudaDeviceSynchronize();
auto gpuPending = ANSGpuFrameRegistry::instance().drain_gpu_pending();
if (!gpuPending.empty()) {
RTSP_DBG("[Destroy] cudaFree %zu GPU ptrs before close()", gpuPending.size());
int prevDev = -1;
cudaGetDevice(&prevDev);
for (auto& entry : gpuPending) {
if (entry.ptr) {
if (entry.deviceIdx >= 0) cudaSetDevice(entry.deviceIdx);
cudaFree(entry.ptr);
}
}
if (prevDev >= 0) cudaSetDevice(prevDev);
}
// Also drain any pending AVFrames
auto avPending = ANSGpuFrameRegistry::instance().drain_pending();
for (void* p : avPending) {
AVFrame* f = static_cast<AVFrame*>(p);
av_frame_free(&f);
}
}
// Invalidate owner callbacks so stale GpuFrameData don't try to
// call DecrementInFlight on this (soon-to-be-deleted) object.
// The GpuFrameData and their global pool slots remain alive —
// inference engines can safely keep reading from them.
ANSGpuFrameRegistry::instance().invalidateOwner(this);
_inFlightFrames.store(0, std::memory_order_release);
// NO forceReleaseByOwner — frames survive camera deletion.
// Pool slot buffers are global (GpuNV12SlotPool) — NOT owned
// by this camera. They are recycled when inference finishes
// (GpuFrameData refcount → 0 → slot.inUse = false).
// NO cudaDeviceSynchronize — no GPU buffers to free here.
// NO DestroyGpuPool — per-camera pool has been removed.
clientToClose = std::move(_playerClient);
}
// CUDA cleanup happens here, outside the mutex — now safe.
// All GPU frames owned by this client have been force-freed above.
// close() destroys the NVDEC decoder ONLY. Pool slot buffers
// (regular cudaMallocPitch allocations) are untouched — they
// belong to the global GpuNV12SlotPool, not the decoder.
if (clientToClose) {
clientToClose->close();
}
@@ -232,66 +213,44 @@ namespace ANSCENTER {
bool ANSRTSPClient::Reconnect() {
// 1. Mark as not-playing under the mutex FIRST. This makes GetImage()
// return the cached _pLastFrame instead of calling into the player,
// preventing use-after-free when close() destroys CUDA resources.
// and blocks new TryIncrementInFlight calls.
{
std::unique_lock<std::recursive_mutex> lock(_mutex);
_isPlaying = false;
// --- Inference guard: wait for in-flight frames to finish ---
// Same guard as Destroy(): close() will free NVDEC surfaces, so
// we must wait for any inference engines still reading NV12 data
// via zero-copy CUDA device pointers.
// --- Inference guard: wait for in-flight D2D copies to finish ---
// With synchronous D2D copy, in-flight means "currently inside
// GetRTSPCVImage between TryIncrementInFlight and attach_cuda".
// This is typically <1ms, so the wait is very fast.
int inFlight = _inFlightFrames.load(std::memory_order_acquire);
if (inFlight > 0) {
_logger.LogInfo("ANSRTSPClient::Reconnect",
std::format("waiting for {} in-flight inference frame(s)...", inFlight),
std::format("waiting for {} in-flight frame(s)...", inFlight),
__FILE__, __LINE__);
bool done = _inFlightDone.wait_for(lock, std::chrono::seconds(5), [this] {
return _inFlightFrames.load(std::memory_order_acquire) <= 0;
});
if (!done) {
_logger.LogWarn("ANSRTSPClient::Reconnect",
std::format("timed out waiting for in-flight frames "
"(still {} in-flight) — force-releasing GPU frames",
_inFlightFrames.load()),
std::format("timed out — still {} in-flight", _inFlightFrames.load()),
__FILE__, __LINE__);
}
}
// Force-release GPU frames before close() — same as Destroy().
int forceReleased = ANSGpuFrameRegistry::instance().forceReleaseByOwner(this);
if (forceReleased > 0) {
_logger.LogWarn("ANSRTSPClient::Reconnect",
std::format("force-released {} GPU frame(s) with unreleased clones", forceReleased),
__FILE__, __LINE__);
// Sync all GPU streams before freeing
cudaDeviceSynchronize();
auto gpuPending = ANSGpuFrameRegistry::instance().drain_gpu_pending();
if (!gpuPending.empty()) {
int prevDev = -1;
cudaGetDevice(&prevDev);
for (auto& entry : gpuPending) {
if (entry.ptr) {
if (entry.deviceIdx >= 0) cudaSetDevice(entry.deviceIdx);
cudaFree(entry.ptr);
}
}
if (prevDev >= 0) cudaSetDevice(prevDev);
}
auto avPending = ANSGpuFrameRegistry::instance().drain_pending();
for (void* p : avPending) {
AVFrame* f = static_cast<AVFrame*>(p);
av_frame_free(&f);
}
}
// Invalidate owner callbacks — prevents stale DecrementInFlight
// calls after Reconnect re-creates the decoder.
// Frames and their global pool slots remain alive for inference.
ANSGpuFrameRegistry::instance().invalidateOwner(this);
_inFlightFrames.store(0, std::memory_order_release);
// NO forceReleaseByOwner — frames survive reconnect.
// NO cudaDeviceSynchronize — no GPU buffers to free.
// NO DestroyGpuPool — per-camera pool has been removed.
}
// 2. close() does CUDA cleanup (cuArrayDestroy/cuMemFree) — run outside
// _mutex to avoid deadlocking with nvcuda64 SRW lock held by inference.
// Safe now because GetImage()/GetNV12Frame() won't touch the player
// while _isPlaying == false, and all in-flight frames have been released.
// 2. close() destroys NVDEC decoder ONLY — run outside _mutex to
// avoid deadlocking with nvcuda64 SRW lock held by inference.
// Pool slot buffers are global and untouched.
_logger.LogInfo("ANSRTSPClient::Reconnect",
"calling close() — NVDEC decoder will be destroyed", __FILE__, __LINE__);
RTSP_DBG("[Reconnect] BEFORE close() this=%p", (void*)this);
@@ -1071,6 +1030,8 @@ extern "C" __declspec(dllexport) int GetRTSPCVImage(
}
try {
auto t0 = std::chrono::steady_clock::now();
// Get image (shallow copy - reference counted, fast)
cv::Mat img = (*Handle)->GetImage(width, height, timeStamp);
@@ -1082,6 +1043,8 @@ extern "C" __declspec(dllexport) int GetRTSPCVImage(
// Thread-safe Mat pointer swap (anscv_mat_replace has its own internal lock)
anscv_mat_replace(image, std::move(img));
auto t1 = std::chrono::steady_clock::now();
// Attach NV12 frame for GPU fast-path inference (side-table registry)
// attach() takes ownership — do NOT av_frame_free here
//
@@ -1101,7 +1064,11 @@ extern "C" __declspec(dllexport) int GetRTSPCVImage(
cudaHW->width, cudaHW->height,
(void*)cudaHW->data[0], (void*)cudaHW->data[1]);
AVFrame* cpuNV12 = (*Handle)->GetNV12Frame();
gpu_frame_attach_cuda(*image, cudaHW, gpuIdx, timeStamp, cpuNV12);
// Acquire a slot from the global pool — survives camera Destroy.
GpuNV12Slot* slot = GpuNV12SlotPool::instance().acquire(
gpuIdx, cudaHW->width, cudaHW->height);
gpu_frame_attach_cuda(*image, cudaHW, gpuIdx, timeStamp, cpuNV12, slot);
} else {
// HW decode not active — try CPU NV12
AVFrame* nv12 = (*Handle)->GetNV12Frame();
@@ -1114,11 +1081,11 @@ extern "C" __declspec(dllexport) int GetRTSPCVImage(
// TryIncrementInFlight already incremented; DecrementInFlight fires
// when the last clone of this frame is released after inference.
auto* gpuData = ANSGpuFrameRegistry::instance().lookup(*image);
RTSP_DBG("[GetRTSPCVImage] after attach: gpuData=%p yPlane=%p isCuda=%d gpuCacheY=%p",
RTSP_DBG("[GetRTSPCVImage] after attach: gpuData=%p yPlane=%p isCuda=%d poolSlot=%p",
(void*)gpuData,
gpuData ? (void*)gpuData->yPlane : nullptr,
gpuData ? (int)gpuData->isCudaDevicePtr : -1,
gpuData ? gpuData->gpuCacheY : nullptr);
gpuData ? (void*)gpuData->poolSlot : nullptr);
if (gpuData) {
gpuData->ownerClient = *Handle;
gpuData->onReleaseFn = [](void* client) {
@@ -1136,6 +1103,20 @@ extern "C" __declspec(dllexport) int GetRTSPCVImage(
RTSP_DBG("[GetRTSPCVImage] SKIP CUDA — player not playing (reconnecting?)");
}
// Lightweight timing via spdlog (no OutputDebugString).
// Logs only when the frame grab + D2D exceeds 50ms — helps diagnose stalls
// without the overhead of per-frame debug logging.
auto t2 = std::chrono::steady_clock::now();
double getImageMs = std::chrono::duration<double, std::milli>(t1 - t0).count();
double cudaMs = std::chrono::duration<double, std::milli>(t2 - t1).count();
double totalMs = getImageMs + cudaMs;
if (totalMs > 50.0) {
(*Handle)->_logger.LogWarn("GetRTSPCVImage",
std::format("SLOW FRAME: total={:.1f}ms (getImage={:.1f}ms cuda={:.1f}ms) {}x{}",
totalMs, getImageMs, cudaMs, width, height),
__FILE__, __LINE__);
}
return 1; // Success
}
catch (const cv::Exception& e) {