Fix RTSP stall by releasing _mutex during BGR conversion

getImage() previously held _mutex across the 4K NV12->BGR sws_scale in
avframeToCVMat, blocking the decoder callback (onVideoFrame) for 100-300ms
per frame. Under multi-camera load this cascaded into 5-21s frame stalls
and STALE PTS events in the log.

- avframeToCVMat: drop outer _mutex. NV12/YUV420P paths touch no shared
  state; avframeAnyToCvmat still locks internally for swsCtx.
- getImage: split into two short locked phases with the BGR conversion
  unlocked between them. Decoder callbacks can push new frames and run
  the CUDA HW capture path in parallel with the reader's conversion.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-20 08:23:28 +10:00
parent 3418090042
commit 0925aa1d63

View File

@@ -1360,7 +1360,10 @@ cv::Mat CVideoPlayer::avframeYUV420PToCvMat(const AVFrame* frame) {
}
cv::Mat CVideoPlayer::avframeToCVMat(const AVFrame* pFrame) {
std::lock_guard<std::recursive_mutex> lock(_mutex);
// No _mutex here: caller (getImage) releases the mutex before invoking this
// so the expensive NV12/YUV420P→BGR conversion does not block onVideoFrame.
// NV12/YUV420P paths touch only the caller-owned AVFrame clone and benign
// member reads. avframeAnyToCvmat() takes its own lock for swsCtx.
try {
// 1. Validate input frame
if (!pFrame || !pFrame->data[0] || pFrame->width <= 10 || pFrame->height <= 10) {
@@ -2233,87 +2236,98 @@ void CVideoPlayer::onAudioFrame(AVFrame* frame)
cv::Mat CVideoPlayer::getImage(int& width, int& height, int64_t& pts) {
try {
// Lock the mutex using RAII (ensures unlock even in exceptions)
std::lock_guard<std::recursive_mutex> lock(_mutex); // Protect against concurrent access
AVFrame* frameToProcess = nullptr;
uint64_t currentSeq = 0;
if (!m_bPlaying) {
// Return the last valid frame if playback is stopped
width = m_currentImage.cols;
height = m_currentImage.rows;
pts = m_pts;
return m_currentImage; // Shallow copy (reference counted, safe under mutex)
}
// --- Phase 1: short locked section — examine state, pull latest frame ---
{
std::lock_guard<std::recursive_mutex> lock(_mutex);
// While waiting for keyframe or during settle period after restart,
// return the last good cached image to avoid showing corrupted frames
if (m_bWaitingForKeyframe || m_cleanFrameCount < SETTLE_FRAME_COUNT) {
width = m_currentImage.cols;
height = m_currentImage.rows;
pts = m_pts;
return m_currentImage; // Last good frame (may be empty on first-ever start)
}
// Fast path: check if a new frame has arrived using sequence counter
// This avoids expensive av_frame_clone + NV12→BGR conversion when frame hasn't changed
uint64_t currentSeq = g_frameQueue.getSequence();
if (currentSeq == m_lastFrameSeq && !m_currentImage.empty()) {
width = m_currentImage.cols;
height = m_currentImage.rows;
pts = m_pts;
return m_currentImage; // Same frame, skip all conversion
}
// Get latest frame from queue
if (g_frameQueue.isEmpty()) {
width = m_currentImage.cols;
height = m_currentImage.rows;
pts = m_pts;
std::cerr << "No frame available in getImage()" << std::endl;
return cv::Mat(); // Return an empty cv::Mat() if no frame is available
}
AVFrame* frameToProcess = g_frameQueue.getLatestFrame();
if (!frameToProcess) {
// If no frame available, return last valid image
width = m_currentImage.cols;
height = m_currentImage.rows;
pts = m_pts;
return cv::Mat(); // Return an empty cv::Mat() if no frame is available
}
try {
// Convert AVFrame to cv::Mat
m_currentImage = avframeToCVMat(frameToProcess);
// Update timestamp and sequence if conversion is successful
if (!m_currentImage.empty()) {
m_pts++;
m_lastFrameSeq = currentSeq; // Mark this sequence as processed
if (!m_bPlaying) {
width = m_currentImage.cols;
height = m_currentImage.rows;
pts = m_pts;
return m_currentImage; // Shallow copy (reference counted)
}
// While waiting for keyframe or during settle period after restart,
// return the last good cached image to avoid showing corrupted frames
if (m_bWaitingForKeyframe || m_cleanFrameCount < SETTLE_FRAME_COUNT) {
width = m_currentImage.cols;
height = m_currentImage.rows;
pts = m_pts;
return m_currentImage;
}
// Fast path: same frame as last call — skip clone + BGR conversion
currentSeq = g_frameQueue.getSequence();
if (currentSeq == m_lastFrameSeq && !m_currentImage.empty()) {
width = m_currentImage.cols;
height = m_currentImage.rows;
pts = m_pts;
return m_currentImage;
}
if (g_frameQueue.isEmpty()) {
width = m_currentImage.cols;
height = m_currentImage.rows;
pts = m_pts;
std::cerr << "No frame available in getImage()" << std::endl;
return cv::Mat();
}
// getLatestFrame() clones the AVFrame — we own it from here
frameToProcess = g_frameQueue.getLatestFrame();
if (!frameToProcess) {
width = m_currentImage.cols;
height = m_currentImage.rows;
pts = m_pts;
return cv::Mat();
}
}
// --- _mutex released here ---
// At 4K NV12, cvtColorTwoPlane takes ~100-300 ms on CPU; during that
// window the decoder callback (onVideoFrame) is free to push the next
// frame and the CUDA HW capture path can run in parallel.
cv::Mat converted;
try {
converted = avframeToCVMat(frameToProcess);
}
catch (const std::exception& e) {
std::cerr << "Exception while converting AVFrame to cv::Mat: " << e.what() << std::endl;
}
// Preserve raw YUV/NV12 frame for GPU fast-path inference
// (NV12 from HW decode, YUV420P/YUVJ420P from SW decode)
if (frameToProcess &&
(frameToProcess->format == AV_PIX_FMT_NV12 ||
frameToProcess->format == AV_PIX_FMT_YUV420P ||
frameToProcess->format == AV_PIX_FMT_YUVJ420P)) {
if (m_currentNV12Frame) av_frame_free(&m_currentNV12Frame);
m_currentNV12Frame = av_frame_clone(frameToProcess);
// --- Phase 2: short locked section — publish new frame state ---
cv::Mat result; // Snapshot taken under the lock, returned after release.
{
std::lock_guard<std::recursive_mutex> lock(_mutex);
if (!converted.empty()) {
m_currentImage = converted;
m_pts++;
m_lastFrameSeq = currentSeq;
}
// Preserve raw YUV/NV12 frame for GPU fast-path inference
// (NV12 from HW decode, YUV420P/YUVJ420P from SW decode)
if (frameToProcess &&
(frameToProcess->format == AV_PIX_FMT_NV12 ||
frameToProcess->format == AV_PIX_FMT_YUV420P ||
frameToProcess->format == AV_PIX_FMT_YUVJ420P)) {
if (m_currentNV12Frame) av_frame_free(&m_currentNV12Frame);
m_currentNV12Frame = av_frame_clone(frameToProcess);
}
width = m_currentImage.cols;
height = m_currentImage.rows;
pts = m_pts;
result = m_currentImage; // Shallow copy under lock — refcount keeps buffer alive
}
// Free the cloned frame to avoid memory leaks
av_frame_free(&frameToProcess);
// Update frame dimensions and PTS
width = m_currentImage.cols;
height = m_currentImage.rows;
pts = m_pts;
// Return the processed image (shallow copy — caller gets reference-counted Mat)
return m_currentImage;
return result;
}
catch (const std::exception& e) {
std::cerr << "Unexpected exception in getImage(): " << e.what() << std::endl;