Fix NV12 crash when recreating the camera object

This commit is contained in:
2026-04-02 22:07:27 +11:00
parent 4bedf3a3a2
commit 958cab6ae3
25 changed files with 1459 additions and 393 deletions

View File

@@ -2,6 +2,7 @@
#include "ANSMatRegistry.h"
#include "ANSGpuFrameOps.h"
#include <memory>
#include <format>
#include "media_codec.h"
#include <cstdint>
#include <cuda_runtime.h>
@@ -21,6 +22,20 @@ extern "C"
}
// Note: per-instance thread safety is handled by ANSRTSPClient::_mutex
// Mat registry thread safety is handled by anscv_mat_replace's internal registry_mutex
// Debug trace helper. Takes a printf-style format plus arguments, appends a
// newline and writes the result to stderr. On Windows the same text is also
// passed to OutputDebugStringA so it shows up in DebugView; there the message
// is first formatted into a 512-byte scratch buffer, so longer messages are
// truncated. The definition is skipped if RTSP_DBG already exists.
#if !defined(RTSP_DBG)
#if defined(_WIN32)
#define RTSP_DBG(fmt, ...) \
    do { \
        char _rtsp_buf[512]; \
        snprintf(_rtsp_buf, sizeof(_rtsp_buf), fmt "\n", ##__VA_ARGS__); \
        OutputDebugStringA(_rtsp_buf); \
        fprintf(stderr, "%s", _rtsp_buf); \
    } while (0)
#else
#define RTSP_DBG(fmt, ...) fprintf(stderr, fmt "\n", ##__VA_ARGS__)
#endif
#endif
// File-local licence state; starts out as "not valid".
// NOTE(review): presumably written by VerifyGlobalANSRTSPLicense() — its body
// is not visible in this view, confirm before relying on it.
static bool ansrtspLicenceValid = false;
// Global once_flag to protect license checking
static std::once_flag ansrtspLicenseOnceFlag;
@@ -48,19 +63,88 @@ namespace ANSCENTER {
Destroy();
}
// Tears down this client's player and every GPU resource handed out on its
// behalf. After it returns, _playerClient is empty and _isPlaying is false,
// so calling Destroy() a second time is harmless.
//
// The order of the steps matters:
//   1. Under _mutex: stop the stream and clear _isPlaying. Stopping first
//      flushes the video decoder and keeps the RTSP callback thread from
//      feeding more frames into decode() while we tear down.
//   2. Still under _mutex: wait (bounded to 5 s) for in-flight inference
//      frames, then force-release whatever is still registered to this
//      client and free the drained buffers while the CUDA context is alive.
//   3. Outside _mutex: close() the player. close() calls cuArrayDestroy /
//      cuMemFree which acquire an EXCLUSIVE SRW lock inside nvcuda64. If we
//      held _mutex during close(), and another thread held the nvcuda64 SRW
//      lock (e.g. cuStreamSynchronize during inference), we would deadlock:
//      Stop() -> _mutex -> nvcuda64 vs inference -> nvcuda64 -> (blocked by
//      exclusive waiter).
void ANSRTSPClient::Destroy() {
    // Receives ownership of the player so it can be closed after unlocking.
    decltype(_playerClient) clientToClose;
    {
        std::unique_lock<std::recursive_mutex> lock(_mutex);

        // NOTE(review): stop() still runs under _mutex here, whereas Stop()
        // deliberately calls it outside the lock — confirm this is acceptable
        // on paths that reach Destroy() without a prior Stop().
        if (_playerClient && _isPlaying) {
            _playerClient->stop();
        }
        // Clear the flag unconditionally: once _playerClient is moved out
        // below, the frame getters rely on _isPlaying == false to avoid
        // dereferencing it.
        _isPlaying = false;

        // --- Inference guard: wait for in-flight frames to finish ---
        // GetRTSPCVImage increments _inFlightFrames when it hands out
        // a GPU frame; the registry decrements it when the frame is
        // released after inference completes. We wait here so that
        // close() doesn't free NVDEC surfaces while TensorRT is
        // still reading from them (the LabVIEW crash root cause).
        int inFlight = _inFlightFrames.load(std::memory_order_acquire);
        if (inFlight > 0) {
            _logger.LogInfo("ANSRTSPClient::Destroy",
                std::format("waiting for {} in-flight inference frame(s)...", inFlight),
                __FILE__, __LINE__);
            // wait_for releases _mutex while blocked, so the release callback
            // can make progress; the 5 s cap keeps teardown from hanging.
            bool done = _inFlightDone.wait_for(lock, std::chrono::seconds(5), [this] {
                return _inFlightFrames.load(std::memory_order_acquire) <= 0;
            });
            if (!done) {
                _logger.LogWarn("ANSRTSPClient::Destroy",
                    std::format("timed out waiting for in-flight frames "
                                "(still {} in-flight) — force-releasing GPU frames",
                                _inFlightFrames.load()),
                    __FILE__, __LINE__);
            }
        }

        // Force-release ALL GPU frames owned by this client BEFORE close().
        // Unreleased clones (e.g. LabVIEW AI tasks still holding cloned
        // cv::Mat*) keep gpuCacheY/gpuCacheUV allocated. We must cudaFree
        // them NOW while the CUDA context is still alive. After close()
        // destroys the context, cudaFree would crash.
        int forceReleased = ANSGpuFrameRegistry::instance().forceReleaseByOwner(this);
        if (forceReleased > 0) {
            _logger.LogWarn("ANSRTSPClient::Destroy",
                std::format("force-released {} GPU frame(s) with unreleased clones", forceReleased),
                __FILE__, __LINE__);

            // Sync all GPU streams before freeing to avoid illegal access,
            // then drain and cudaFree the GPU buffers.
            cudaDeviceSynchronize();
            auto gpuPending = ANSGpuFrameRegistry::instance().drain_gpu_pending();
            if (!gpuPending.empty()) {
                RTSP_DBG("[Destroy] cudaFree %zu GPU ptrs before close()", gpuPending.size());
                // Remember the caller's current device so it can be restored
                // after freeing buffers that live on other devices.
                int prevDev = -1;
                cudaGetDevice(&prevDev);
                for (auto& entry : gpuPending) {
                    if (entry.ptr) {
                        if (entry.deviceIdx >= 0) cudaSetDevice(entry.deviceIdx);
                        cudaFree(entry.ptr);
                    }
                }
                if (prevDev >= 0) cudaSetDevice(prevDev);
            }

            // Also drain any pending AVFrames
            auto avPending = ANSGpuFrameRegistry::instance().drain_pending();
            for (void* p : avPending) {
                AVFrame* f = static_cast<AVFrame*>(p);
                av_frame_free(&f);
            }
        }

        ANSGpuFrameRegistry::instance().invalidateOwner(this);
        _inFlightFrames.store(0, std::memory_order_release);
        clientToClose = std::move(_playerClient);
    }

    // CUDA cleanup happens here, outside the mutex — now safe.
    // All GPU frames owned by this client have been force-freed above.
    if (clientToClose) {
        clientToClose->close();
    }
}
static void VerifyGlobalANSRTSPLicense(const std::string& licenseKey) {
@@ -146,10 +230,81 @@ namespace ANSCENTER {
_playerClient->setCrop(crop);
}
// Closes the current RTSP session and starts it again.
//
// Returns true when play() succeeds after the re-setup, false otherwise
// (including when there is no player left to reconnect, e.g. after
// Destroy() has moved it out).
//
// Phases:
//   1. Under _mutex: clear _isPlaying, wait (bounded to 5 s) for in-flight
//      inference frames, then force-release and free GPU frames owned by
//      this client.
//   2. Outside _mutex: close() the player.
//   3. Under _mutex again: Setup() and play().
bool ANSRTSPClient::Reconnect() {
    // Snapshot of the player taken under the lock, so phase 2 never reads
    // the _playerClient member while another thread may be modifying it.
    decltype(_playerClient.get()) player = nullptr;

    // 1. Mark as not-playing under the mutex FIRST. This makes GetImage()
    //    return the cached _pLastFrame instead of calling into the player,
    //    preventing use-after-free when close() destroys CUDA resources.
    {
        std::unique_lock<std::recursive_mutex> lock(_mutex);
        _isPlaying = false;

        player = _playerClient.get();
        if (!player) {
            _logger.LogWarn("ANSRTSPClient::Reconnect",
                "no player client (already destroyed?) - nothing to reconnect",
                __FILE__, __LINE__);
            return false;
        }

        // --- Inference guard: wait for in-flight frames to finish ---
        // Same guard as Destroy(): close() will free NVDEC surfaces, so
        // we must wait for any inference engines still reading NV12 data
        // via zero-copy CUDA device pointers.
        int inFlight = _inFlightFrames.load(std::memory_order_acquire);
        if (inFlight > 0) {
            _logger.LogInfo("ANSRTSPClient::Reconnect",
                std::format("waiting for {} in-flight inference frame(s)...", inFlight),
                __FILE__, __LINE__);
            // wait_for releases _mutex while blocked; the 5 s cap keeps a
            // leaked frame from stalling the reconnect forever.
            bool done = _inFlightDone.wait_for(lock, std::chrono::seconds(5), [this] {
                return _inFlightFrames.load(std::memory_order_acquire) <= 0;
            });
            if (!done) {
                _logger.LogWarn("ANSRTSPClient::Reconnect",
                    std::format("timed out waiting for in-flight frames "
                                "(still {} in-flight) — force-releasing GPU frames",
                                _inFlightFrames.load()),
                    __FILE__, __LINE__);
            }
        }

        // Force-release GPU frames before close() — same as Destroy().
        int forceReleased = ANSGpuFrameRegistry::instance().forceReleaseByOwner(this);
        if (forceReleased > 0) {
            _logger.LogWarn("ANSRTSPClient::Reconnect",
                std::format("force-released {} GPU frame(s) with unreleased clones", forceReleased),
                __FILE__, __LINE__);

            // Sync all GPU streams before freeing
            cudaDeviceSynchronize();
            auto gpuPending = ANSGpuFrameRegistry::instance().drain_gpu_pending();
            if (!gpuPending.empty()) {
                // Remember the caller's current device so it can be restored
                // after freeing buffers that live on other devices.
                int prevDev = -1;
                cudaGetDevice(&prevDev);
                for (auto& entry : gpuPending) {
                    if (entry.ptr) {
                        if (entry.deviceIdx >= 0) cudaSetDevice(entry.deviceIdx);
                        cudaFree(entry.ptr);
                    }
                }
                if (prevDev >= 0) cudaSetDevice(prevDev);
            }

            auto avPending = ANSGpuFrameRegistry::instance().drain_pending();
            for (void* p : avPending) {
                AVFrame* f = static_cast<AVFrame*>(p);
                av_frame_free(&f);
            }
        }

        ANSGpuFrameRegistry::instance().invalidateOwner(this);
        _inFlightFrames.store(0, std::memory_order_release);
    }

    // 2. close() does CUDA cleanup (cuArrayDestroy/cuMemFree) — run outside
    //    _mutex to avoid deadlocking with nvcuda64 SRW lock held by inference.
    //    Safe now because GetImage()/GetNV12Frame() won't touch the player
    //    while _isPlaying == false, and all in-flight frames have been released.
    //    NOTE(review): `player` is a non-owning snapshot; a Destroy() running
    //    concurrently on another thread could still release the object while
    //    close() is executing here. Callers must not overlap the two.
    _logger.LogInfo("ANSRTSPClient::Reconnect",
        "calling close() — NVDEC decoder will be destroyed", __FILE__, __LINE__);
    RTSP_DBG("[Reconnect] BEFORE close() this=%p", (void*)this);
    player->close();
    RTSP_DBG("[Reconnect] AFTER close() this=%p", (void*)this);

    // 3. Re-setup and play under the mutex.
    std::lock_guard<std::recursive_mutex> relock(_mutex);
    if (!_playerClient) {
        // Destroy() took the player away while we were closing it.
        _logger.LogWarn("ANSRTSPClient::Reconnect",
            "player client was released during reconnect - aborting",
            __FILE__, __LINE__);
        return false;
    }
    _logger.LogInfo("ANSRTSPClient::Reconnect",
        "calling Setup() + play()", __FILE__, __LINE__);
    Setup();
    _isPlaying = _playerClient->play();
    RTSP_DBG("[Reconnect] DONE isPlaying=%d this=%p", (int)_isPlaying, (void*)this);
    return _isPlaying;
}
void ANSRTSPClient::EnableAudio(bool status) {
@@ -169,11 +324,23 @@ namespace ANSCENTER {
}
// Stops playback if the client is currently playing. Always returns true.
//
// The player pointer is grabbed and _isPlaying cleared under the lock, then
// stop() is called OUTSIDE the mutex. stop() internally calls
// StopVideoDecoder -> decoder->flush() which does CUDA calls that can block
// on the nvcuda64 SRW lock. Holding _mutex during that time blocks all other
// operations on this client and contributes to the convoy when many clients
// stop at once.
bool ANSRTSPClient::Stop() {
    // Non-owning; stays null when there is nothing to stop.
    CRtspPlayer* player = nullptr;
    {
        std::lock_guard<std::recursive_mutex> lock(_mutex);
        if (_isPlaying) {
            // Clearing the flag first means a concurrent Stop() sees "not
            // playing" and does not call stop() a second time.
            _isPlaying = false;
            player = _playerClient.get();
        }
    }
    // NOTE(review): `player` is used after the lock is dropped; this assumes
    // nothing destroys the player (e.g. Destroy()) until stop() returns.
    if (player) {
        player->stop();
    }
    return true;
}
bool ANSRTSPClient::Pause() {
@@ -759,10 +926,12 @@ namespace ANSCENTER {
}
// Returns the player's current NV12 frame, or nullptr while the client is
// not playing. The returned frame is a clone; the caller must release it
// with av_frame_free.
AVFrame* ANSRTSPClient::GetNV12Frame() {
    std::lock_guard<std::recursive_mutex> guard(_mutex);
    // Not playing can mean the player is mid-reconnect with its CUDA
    // resources already freed, so the player must not be touched then.
    return _isPlaying ? _playerClient->getNV12Frame() : nullptr;
}
// Returns the player's current CUDA hardware frame, or nullptr while the
// client is not playing.
AVFrame* ANSRTSPClient::GetCudaHWFrame() {
    std::lock_guard<std::recursive_mutex> guard(_mutex);
    // Not playing can mean the player is mid-reconnect with its CUDA
    // resources already freed, so the player must not be touched then.
    return _isPlaying ? _playerClient->getCudaHWFrame() : nullptr;
}
bool ANSRTSPClient::IsCudaHWAccel() {
@@ -810,6 +979,11 @@ extern "C" __declspec(dllexport) int CreateANSRTSPHandle(ANSCENTER::ANSRTSPClien
if (_username.empty() && _password.empty()) result = ptr->Init(licenseKey, url);
else result = ptr->Init(licenseKey, username, password, url);
if (result) {
// Default to CUDA/NVDEC HW decoding (mode 7) for NV12 zero-copy
// fast path. LabVIEW may not call SetRTSPHWDecoding after
// destroy+recreate cycles, so this ensures the new handle always
// uses the GPU decode path instead of falling back to D3D11VA/CPU.
ptr->SetHWDecoding(7); // HW_DECODING_CUDA
*Handle = ptr.release();
extern void anscv_unregister_handle(void*);
extern void anscv_register_handle(void*, void(*)(void*));
@@ -830,9 +1004,37 @@ extern "C" __declspec(dllexport) int ReleaseANSRTSPHandle(ANSCENTER::ANSRTSPClie
try {
extern void anscv_unregister_handle(void*);
anscv_unregister_handle(*Handle);
// unique_ptr destructor calls ~ANSRTSPClient which calls Destroy() — no need to call Destroy() separately
std::unique_ptr<ANSCENTER::ANSRTSPClient> ptr(*Handle);
// Grab the raw pointer and NULL the caller's handle immediately.
// This prevents the caller (LabVIEW) from issuing new calls.
ANSCENTER::ANSRTSPClient* raw = *Handle;
*Handle = nullptr;
// Mark as not-playing under _mutex ONLY. This makes
// GetImage()/GetNV12Frame()/GetCudaHWFrame() return empty/null
// on any subsequent call, and prevents NEW NV12 GPU surface
// pointers from being handed out.
//
// Do NOT call Destroy()/close() here — close() frees the
// NVDEC GPU surfaces (cuArrayDestroy/cuMemFree) which may
// still be in use by a CUDA inference kernel that received
// the NV12 pointer from a GetRTSPCVImage call that already
// completed before this Release was called.
{
// Use the client's _mutex to safely set _isPlaying = false.
// This is the same lock GetImage/GetNV12Frame acquire.
raw->Stop(); // sets _isPlaying = false, stops playback
}
// Defer the full cleanup (Destroy + delete) to a background thread
// so LabVIEW's UI thread is not blocked. Destroy() now waits
// precisely for in-flight inference to finish (via _inFlightFrames
// counter + condition variable) instead of the old 500ms sleep hack.
std::thread([raw]() {
try { raw->Destroy(); } catch (...) {}
try { delete raw; } catch (...) {}
}).detach();
return 0;
} catch (...) {
if (Handle) *Handle = nullptr;
@@ -882,19 +1084,56 @@ extern "C" __declspec(dllexport) int GetRTSPCVImage(
// Attach NV12 frame for GPU fast-path inference (side-table registry)
// attach() takes ownership — do NOT av_frame_free here
//
// CRITICAL: TryIncrementInFlight() MUST be called BEFORE GetCudaHWFrame().
// It atomically checks _isPlaying and increments _inFlightFrames under
// the same mutex, so Reconnect() cannot call close() while we're doing
// the D2D copy from NVDEC surfaces inside gpu_frame_attach_cuda().
int gpuIdx = (*Handle)->GetHWDecodingGpuIndex();
AVFrame* cudaHW = (*Handle)->GetCudaHWFrame();
if (cudaHW) {
// CUDA zero-copy: frame data[0]/data[1] are CUDA device pointers.
// Also attach CPU NV12 as fallback for cross-GPU inference
// (when decode GPU != inference GPU, CUDA ptrs aren't accessible).
AVFrame* cpuNV12 = (*Handle)->GetNV12Frame();
gpu_frame_attach_cuda(*image, cudaHW, gpuIdx, timeStamp, cpuNV12);
} else {
AVFrame* nv12 = (*Handle)->GetNV12Frame();
if (nv12) {
gpu_frame_attach(*image, nv12, gpuIdx, timeStamp);
bool inFlightGuardHeld = (*Handle)->TryIncrementInFlight();
RTSP_DBG("[GetRTSPCVImage] mat=%p gpuIdx=%d inFlightGuard=%d",
(void*)*image, gpuIdx, (int)inFlightGuardHeld);
if (inFlightGuardHeld) {
AVFrame* cudaHW = (*Handle)->GetCudaHWFrame();
if (cudaHW) {
RTSP_DBG("[GetRTSPCVImage] cudaHW: %dx%d data[0]=%p data[1]=%p",
cudaHW->width, cudaHW->height,
(void*)cudaHW->data[0], (void*)cudaHW->data[1]);
AVFrame* cpuNV12 = (*Handle)->GetNV12Frame();
gpu_frame_attach_cuda(*image, cudaHW, gpuIdx, timeStamp, cpuNV12);
} else {
// HW decode not active — try CPU NV12
AVFrame* nv12 = (*Handle)->GetNV12Frame();
if (nv12) {
gpu_frame_attach(*image, nv12, gpuIdx, timeStamp);
}
}
// Wire up the registry callback to release the in-flight guard.
// TryIncrementInFlight already incremented; DecrementInFlight fires
// when the last clone of this frame is released after inference.
auto* gpuData = ANSGpuFrameRegistry::instance().lookup(*image);
RTSP_DBG("[GetRTSPCVImage] after attach: gpuData=%p yPlane=%p isCuda=%d gpuCacheY=%p",
(void*)gpuData,
gpuData ? (void*)gpuData->yPlane : nullptr,
gpuData ? (int)gpuData->isCudaDevicePtr : -1,
gpuData ? gpuData->gpuCacheY : nullptr);
if (gpuData) {
gpuData->ownerClient = *Handle;
gpuData->onReleaseFn = [](void* client) {
static_cast<ANSCENTER::ANSRTSPClient*>(client)->DecrementInFlight();
};
// NOTE: Do NOT call IncrementInFlight() again here —
// TryIncrementInFlight() already did it above.
} else {
// No gpuData registered (attach failed?) — release the guard
(*Handle)->DecrementInFlight();
}
} else {
// Player is stopping/reconnecting — skip CUDA path entirely.
// GetImage() already returned a cached BGR frame, which is safe.
RTSP_DBG("[GetRTSPCVImage] SKIP CUDA — player not playing (reconnecting?)");
}
return 1; // Success