Disable the NV12 path for ANSCV by default. Currently uses cv::Mat** directly.
This commit is contained in:
@@ -213,44 +213,44 @@ namespace ANSCENTER {
|
||||
bool ANSRTSPClient::Reconnect() {
|
||||
// 1. Mark as not-playing under the mutex FIRST. This makes GetImage()
|
||||
// return the cached _pLastFrame instead of calling into the player,
|
||||
// and blocks new TryIncrementInFlight calls.
|
||||
// and blocks new TryIncrementInFlight calls (no new NV12 attachments).
|
||||
{
|
||||
std::unique_lock<std::recursive_mutex> lock(_mutex);
|
||||
_isPlaying = false;
|
||||
|
||||
// --- Inference guard: wait for in-flight D2D copies to finish ---
|
||||
// With synchronous D2D copy, in-flight means "currently inside
|
||||
// GetRTSPCVImage between TryIncrementInFlight and attach_cuda".
|
||||
// This is typically <1ms, so the wait is very fast.
|
||||
// --- Inference guard: wait for ALL in-flight inference to finish ---
|
||||
// _inFlightFrames tracks frames from GetRTSPCVImage through to the
|
||||
// end of inference (DecrementInFlight fires when last clone is released).
|
||||
// We MUST wait for this to reach 0 before calling close(), because
|
||||
// inference may still be reading NV12 pool buffer data that depends
|
||||
// on the NVDEC decoder context being alive.
|
||||
//
|
||||
// DO NOT force-reset _inFlightFrames or invalidate onReleaseFn —
|
||||
// let inference finish naturally so DecrementInFlight fires correctly.
|
||||
int inFlight = _inFlightFrames.load(std::memory_order_acquire);
|
||||
if (inFlight > 0) {
|
||||
_logger.LogInfo("ANSRTSPClient::Reconnect",
|
||||
std::format("waiting for {} in-flight frame(s)...", inFlight),
|
||||
std::format("waiting for {} in-flight inference(s) to complete...", inFlight),
|
||||
__FILE__, __LINE__);
|
||||
bool done = _inFlightDone.wait_for(lock, std::chrono::seconds(5), [this] {
|
||||
bool done = _inFlightDone.wait_for(lock, std::chrono::seconds(10), [this] {
|
||||
return _inFlightFrames.load(std::memory_order_acquire) <= 0;
|
||||
});
|
||||
if (!done) {
|
||||
_logger.LogWarn("ANSRTSPClient::Reconnect",
|
||||
std::format("timed out — still {} in-flight", _inFlightFrames.load()),
|
||||
std::format("timed out — still {} in-flight, proceeding with close()",
|
||||
_inFlightFrames.load()),
|
||||
__FILE__, __LINE__);
|
||||
// Force-reset only on timeout as last resort
|
||||
ANSGpuFrameRegistry::instance().invalidateOwner(this);
|
||||
_inFlightFrames.store(0, std::memory_order_release);
|
||||
}
|
||||
}
|
||||
|
||||
// Invalidate owner callbacks — prevents stale DecrementInFlight
|
||||
// calls after Reconnect re-creates the decoder.
|
||||
// Frames and their global pool slots remain alive for inference.
|
||||
ANSGpuFrameRegistry::instance().invalidateOwner(this);
|
||||
_inFlightFrames.store(0, std::memory_order_release);
|
||||
|
||||
// NO forceReleaseByOwner — frames survive reconnect.
|
||||
// NO cudaDeviceSynchronize — no GPU buffers to free.
|
||||
// NO DestroyGpuPool — per-camera pool has been removed.
|
||||
}
|
||||
|
||||
// 2. close() destroys NVDEC decoder ONLY — run outside _mutex to
|
||||
// avoid deadlocking with nvcuda64 SRW lock held by inference.
|
||||
// Pool slot buffers are global and untouched.
|
||||
// avoid deadlocking with nvcuda64 SRW lock held by other cameras.
|
||||
// At this point, all inference using this camera's NV12 data has
|
||||
// completed (or timed out), so close() is safe.
|
||||
_logger.LogInfo("ANSRTSPClient::Reconnect",
|
||||
"calling close() — NVDEC decoder will be destroyed", __FILE__, __LINE__);
|
||||
RTSP_DBG("[Reconnect] BEFORE close() this=%p", (void*)this);
|
||||
@@ -883,6 +883,14 @@ namespace ANSCENTER {
|
||||
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
||||
_playerClient->setImageQuality(mode); // 0=fast (AI), 1=quality (display)
|
||||
}
|
||||
void ANSRTSPClient::SetTargetFPS(double intervalMs) {
|
||||
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
||||
_playerClient->setTargetFPS(intervalMs); // 0=no limit, 100=~10FPS, 200=~5FPS
|
||||
}
|
||||
void ANSRTSPClient::SetNV12FastPath(bool enable) {
|
||||
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
||||
_useNV12FastPath = enable;
|
||||
}
|
||||
AVFrame* ANSRTSPClient::GetNV12Frame() {
|
||||
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
||||
if (!_isPlaying) return nullptr; // Player may be mid-reconnect (CUDA resources freed)
|
||||
@@ -1045,67 +1053,60 @@ extern "C" __declspec(dllexport) int GetRTSPCVImage(
|
||||
|
||||
auto t1 = std::chrono::steady_clock::now();
|
||||
|
||||
// Attach NV12 frame for GPU fast-path inference (side-table registry)
|
||||
// attach() takes ownership — do NOT av_frame_free here
|
||||
//
|
||||
// CRITICAL: TryIncrementInFlight() MUST be called BEFORE GetCudaHWFrame().
|
||||
// It atomically checks _isPlaying and increments _inFlightFrames under
|
||||
// the same mutex, so Reconnect() cannot call close() while we're doing
|
||||
// the D2D copy from NVDEC surfaces inside gpu_frame_attach_cuda().
|
||||
int gpuIdx = (*Handle)->GetHWDecodingGpuIndex();
|
||||
bool inFlightGuardHeld = (*Handle)->TryIncrementInFlight();
|
||||
RTSP_DBG("[GetRTSPCVImage] mat=%p gpuIdx=%d inFlightGuard=%d",
|
||||
(void*)*image, gpuIdx, (int)inFlightGuardHeld);
|
||||
// NV12 GPU fast path: attach NV12 frame data for zero-copy inference.
|
||||
// When disabled (_useNV12FastPath=false), the original stable CPU path is used:
|
||||
// GetImage() returns BGR cv::Mat in CPU RAM → no CUDA calls → no SRW lock contention.
|
||||
// When enabled, D2D copies NV12 from NVDEC to pool buffers for GPU inference.
|
||||
if ((*Handle)->IsNV12FastPath()) {
|
||||
int gpuIdx = (*Handle)->GetHWDecodingGpuIndex();
|
||||
bool inFlightGuardHeld = (*Handle)->TryIncrementInFlight();
|
||||
RTSP_DBG("[GetRTSPCVImage] mat=%p gpuIdx=%d inFlightGuard=%d",
|
||||
(void*)*image, gpuIdx, (int)inFlightGuardHeld);
|
||||
|
||||
if (inFlightGuardHeld) {
|
||||
AVFrame* cudaHW = (*Handle)->GetCudaHWFrame();
|
||||
if (cudaHW) {
|
||||
RTSP_DBG("[GetRTSPCVImage] cudaHW: %dx%d data[0]=%p data[1]=%p",
|
||||
cudaHW->width, cudaHW->height,
|
||||
(void*)cudaHW->data[0], (void*)cudaHW->data[1]);
|
||||
if (inFlightGuardHeld) {
|
||||
AVFrame* cudaHW = (*Handle)->GetCudaHWFrame();
|
||||
if (cudaHW) {
|
||||
RTSP_DBG("[GetRTSPCVImage] cudaHW: %dx%d data[0]=%p data[1]=%p",
|
||||
cudaHW->width, cudaHW->height,
|
||||
(void*)cudaHW->data[0], (void*)cudaHW->data[1]);
|
||||
|
||||
// Acquire a slot from the global pool — survives camera Destroy.
|
||||
GpuNV12Slot* slot = GpuNV12SlotPool::instance().acquire(
|
||||
gpuIdx, cudaHW->width, cudaHW->height);
|
||||
// Acquire a slot from the global pool — survives camera Destroy.
|
||||
GpuNV12Slot* slot = GpuNV12SlotPool::instance().acquire(
|
||||
gpuIdx, cudaHW->width, cudaHW->height);
|
||||
|
||||
// Only fetch CPU NV12 if pool slot unavailable (cross-GPU fallback).
|
||||
// When slot is valid, the D2D copy goes GPU→GPU and CPU NV12 is never used.
|
||||
// Skipping av_frame_clone + av_frame_free saves ~0.1ms per frame.
|
||||
AVFrame* cpuNV12 = slot ? nullptr : (*Handle)->GetNV12Frame();
|
||||
gpu_frame_attach_cuda(*image, cudaHW, gpuIdx, timeStamp, cpuNV12, slot);
|
||||
} else {
|
||||
// HW decode not active — try CPU NV12
|
||||
AVFrame* nv12 = (*Handle)->GetNV12Frame();
|
||||
if (nv12) {
|
||||
gpu_frame_attach(*image, nv12, gpuIdx, timeStamp);
|
||||
// Only fetch CPU NV12 if pool slot unavailable (cross-GPU fallback).
|
||||
AVFrame* cpuNV12 = slot ? nullptr : (*Handle)->GetNV12Frame();
|
||||
gpu_frame_attach_cuda(*image, cudaHW, gpuIdx, timeStamp, cpuNV12, slot);
|
||||
} else {
|
||||
// HW decode not active — try CPU NV12
|
||||
AVFrame* nv12 = (*Handle)->GetNV12Frame();
|
||||
if (nv12) {
|
||||
gpu_frame_attach(*image, nv12, gpuIdx, timeStamp);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Wire up the registry callback to release the in-flight guard.
|
||||
// TryIncrementInFlight already incremented; DecrementInFlight fires
|
||||
// when the last clone of this frame is released after inference.
|
||||
auto* gpuData = ANSGpuFrameRegistry::instance().lookup(*image);
|
||||
RTSP_DBG("[GetRTSPCVImage] after attach: gpuData=%p yPlane=%p isCuda=%d poolSlot=%p",
|
||||
(void*)gpuData,
|
||||
gpuData ? (void*)gpuData->yPlane : nullptr,
|
||||
gpuData ? (int)gpuData->isCudaDevicePtr : -1,
|
||||
gpuData ? (void*)gpuData->poolSlot : nullptr);
|
||||
if (gpuData) {
|
||||
gpuData->ownerClient = *Handle;
|
||||
gpuData->onReleaseFn = [](void* client) {
|
||||
static_cast<ANSCENTER::ANSRTSPClient*>(client)->DecrementInFlight();
|
||||
};
|
||||
// NOTE: Do NOT call IncrementInFlight() again here —
|
||||
// TryIncrementInFlight() already did it above.
|
||||
// Wire up the registry callback to release the in-flight guard.
|
||||
auto* gpuData = ANSGpuFrameRegistry::instance().lookup(*image);
|
||||
RTSP_DBG("[GetRTSPCVImage] after attach: gpuData=%p yPlane=%p isCuda=%d poolSlot=%p",
|
||||
(void*)gpuData,
|
||||
gpuData ? (void*)gpuData->yPlane : nullptr,
|
||||
gpuData ? (int)gpuData->isCudaDevicePtr : -1,
|
||||
gpuData ? (void*)gpuData->poolSlot : nullptr);
|
||||
if (gpuData) {
|
||||
gpuData->ownerClient = *Handle;
|
||||
gpuData->onReleaseFn = [](void* client) {
|
||||
static_cast<ANSCENTER::ANSRTSPClient*>(client)->DecrementInFlight();
|
||||
};
|
||||
} else {
|
||||
(*Handle)->DecrementInFlight();
|
||||
}
|
||||
} else {
|
||||
// No gpuData registered (attach failed?) — release the guard
|
||||
(*Handle)->DecrementInFlight();
|
||||
RTSP_DBG("[GetRTSPCVImage] SKIP CUDA — player not playing (reconnecting?)");
|
||||
}
|
||||
} else {
|
||||
// Player is stopping/reconnecting — skip CUDA path entirely.
|
||||
// GetImage() already returned a cached BGR frame, which is safe.
|
||||
RTSP_DBG("[GetRTSPCVImage] SKIP CUDA — player not playing (reconnecting?)");
|
||||
}
|
||||
// else: original CPU path — cv::Mat** contains BGR data in CPU RAM.
|
||||
// No CUDA calls, no pool slots, no GPU frame registry.
|
||||
// Inference uses cv::Mat directly (upload to GPU in engine).
|
||||
|
||||
// Lightweight timing — logs only when frame grab + D2D exceeds 500ms.
|
||||
// Goes to both spdlog (console/file) AND OutputDebugString (DebugView)
|
||||
@@ -1115,7 +1116,7 @@ extern "C" __declspec(dllexport) int GetRTSPCVImage(
|
||||
double getImageMs = std::chrono::duration<double, std::milli>(t1 - t0).count();
|
||||
double cudaMs = std::chrono::duration<double, std::milli>(t2 - t1).count();
|
||||
double totalMs = getImageMs + cudaMs;
|
||||
if (totalMs > 50.0) {
|
||||
if (totalMs > 500.0) {
|
||||
auto msg = std::format("SLOW FRAME: total={:.1f}ms (getImage={:.1f}ms cuda={:.1f}ms) {}x{}",
|
||||
totalMs, getImageMs, cudaMs, width, height);
|
||||
(*Handle)->_logger.LogWarn("GetRTSPCVImage", msg, __FILE__, __LINE__);
|
||||
@@ -1452,6 +1453,18 @@ extern "C" __declspec(dllexport) void SetRTSPDisplayResolution(ANSCENTER::ANSRTS
|
||||
(*Handle)->SetDisplayResolution(width, height);
|
||||
} catch (...) { }
|
||||
}
|
||||
/// C export: forwards a frame-pacing interval to the client's SetTargetFPS().
///
/// @param Handle      Pointer to the client pointer; the call is a no-op when
///                    either level is null.
/// @param intervalMs  Interval in milliseconds: 0 = no limit, 100 = ~10 FPS,
///                    200 = ~5 FPS.
///
/// Any exception thrown by the call is swallowed and never reaches the caller.
extern "C" __declspec(dllexport) void SetRTSPTargetFPS(ANSCENTER::ANSRTSPClient** Handle, double intervalMs) {
    if (!Handle || !*Handle) {
        return;
    }
    try {
        ANSCENTER::ANSRTSPClient* const client = *Handle;
        client->SetTargetFPS(intervalMs);
    }
    catch (...) {
        // Swallowed: nothing is reported back across the C boundary.
    }
}
|
||||
/// C export: turns the client's NV12 fast path on or off.
///
/// @param Handle  Pointer to the client pointer; the call is a no-op when
///                either level is null.
/// @param enable  0 = original CPU path (stable); any non-zero value selects
///                the NV12 GPU fast path.
///
/// Any exception thrown by the call is swallowed and never reaches the caller.
extern "C" __declspec(dllexport) void SetRTSPNV12FastPath(ANSCENTER::ANSRTSPClient** Handle, int enable) {
    if (!Handle || !*Handle) {
        return;
    }
    try {
        const bool useFastPath = (enable != 0);
        (*Handle)->SetNV12FastPath(useFastPath);
    }
    catch (...) {
        // Swallowed: nothing is reported back across the C boundary.
    }
}
|
||||
extern "C" __declspec(dllexport) int SetCropFlagRTSP(ANSCENTER::ANSRTSPClient** Handle, int cropFlag) {
|
||||
if (Handle == nullptr || *Handle == nullptr) return -1;
|
||||
try {
|
||||
|
||||
Reference in New Issue
Block a user