Fix RTSP stall by releasing _mutex during BGR conversion

getImage() previously held _mutex across the 4K NV12->BGR sws_scale in avframeToCVMat, blocking the decoder callback (onVideoFrame) for 100-300 ms per frame. Under multi-camera load this cascaded into 5-21 s frame stalls and STALE PTS events in the log.

- avframeToCVMat: drop the outer _mutex. The NV12/YUV420P paths touch only the caller-owned AVFrame clone plus benign member reads; avframeAnyToCvmat still locks internally for swsCtx.
- getImage: split into two short locked phases, with the BGR conversion running unlocked between them. Decoder callbacks can push new frames and run the CUDA HW capture path in parallel with the reader's conversion.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-20 08:23:28 +10:00
parent 3418090042
commit 0925aa1d63
1 changed file with 84 additions and 70 deletions
--- a/MediaClient/media/video_player.cpp
+++ b/MediaClient/media/video_player.cpp
@@ -1360,7 +1360,10 @@ cv::Mat CVideoPlayer::avframeYUV420PToCvMat(const AVFrame* frame) {
 }

 cv::Mat CVideoPlayer::avframeToCVMat(const AVFrame* pFrame) {
-	std::lock_guard<std::recursive_mutex> lock(_mutex);
+	// No _mutex here: caller (getImage) releases the mutex before invoking this
+	// so the expensive NV12/YUV420P→BGR conversion does not block onVideoFrame.
+	// NV12/YUV420P paths touch only the caller-owned AVFrame clone and benign
+	// member reads. avframeAnyToCvmat() takes its own lock for swsCtx.
 	try {
 		// 1. Validate input frame
 		if (!pFrame || !pFrame->data[0] || pFrame->width <= 10 || pFrame->height <= 10) {
@@ -2233,87 +2236,98 @@ void CVideoPlayer::onAudioFrame(AVFrame* frame)

 cv::Mat CVideoPlayer::getImage(int& width, int& height, int64_t& pts) {
 	try {
-		// Lock the mutex using RAII (ensures unlock even in exceptions)
-		std::lock_guard<std::recursive_mutex> lock(_mutex);  // Protect against concurrent access
+		AVFrame* frameToProcess = nullptr;
+		uint64_t currentSeq = 0;

-		if (!m_bPlaying) {
-			// Return the last valid frame if playback is stopped
-			width = m_currentImage.cols;
-			height = m_currentImage.rows;
-			pts = m_pts;
-			return m_currentImage;  // Shallow copy (reference counted, safe under mutex)
-		}
+		// --- Phase 1: short locked section — examine state, pull latest frame ---
+		{
+			std::lock_guard<std::recursive_mutex> lock(_mutex);

-		// While waiting for keyframe or during settle period after restart,
-		// return the last good cached image to avoid showing corrupted frames
-		if (m_bWaitingForKeyframe || m_cleanFrameCount < SETTLE_FRAME_COUNT) {
-			width = m_currentImage.cols;
-			height = m_currentImage.rows;
-			pts = m_pts;
-			return m_currentImage;  // Last good frame (may be empty on first-ever start)
-		}
-
-		// Fast path: check if a new frame has arrived using sequence counter
-		// This avoids expensive av_frame_clone + NV12→BGR conversion when frame hasn't changed
-		uint64_t currentSeq = g_frameQueue.getSequence();
-		if (currentSeq == m_lastFrameSeq && !m_currentImage.empty()) {
-			width = m_currentImage.cols;
-			height = m_currentImage.rows;
-			pts = m_pts;
-			return m_currentImage;  // Same frame, skip all conversion
-		}
-
-		// Get latest frame from queue
-		if (g_frameQueue.isEmpty()) {
-			width = m_currentImage.cols;
-			height = m_currentImage.rows;
-			pts = m_pts;
-			std::cerr << "No frame available in getImage()" << std::endl;
-			return cv::Mat();  // Return an empty cv::Mat() if no frame is available
-		}
-		AVFrame* frameToProcess = g_frameQueue.getLatestFrame();
-		if (!frameToProcess) {
-			// If no frame available, return last valid image
-			width = m_currentImage.cols;
-			height = m_currentImage.rows;
-			pts = m_pts;
-			return cv::Mat();  // Return an empty cv::Mat() if no frame is available
-		}
-
-		try {
-			// Convert AVFrame to cv::Mat
-			m_currentImage = avframeToCVMat(frameToProcess);
-
-			// Update timestamp and sequence if conversion is successful
-			if (!m_currentImage.empty()) {
-				m_pts++;
-				m_lastFrameSeq = currentSeq;  // Mark this sequence as processed
+			if (!m_bPlaying) {
+				width = m_currentImage.cols;
+				height = m_currentImage.rows;
+				pts = m_pts;
+				return m_currentImage;  // Shallow copy (reference counted)
 			}
+
+			// While waiting for keyframe or during settle period after restart,
+			// return the last good cached image to avoid showing corrupted frames
+			if (m_bWaitingForKeyframe || m_cleanFrameCount < SETTLE_FRAME_COUNT) {
+				width = m_currentImage.cols;
+				height = m_currentImage.rows;
+				pts = m_pts;
+				return m_currentImage;
+			}
+
+			// Fast path: same frame as last call — skip clone + BGR conversion
+			currentSeq = g_frameQueue.getSequence();
+			if (currentSeq == m_lastFrameSeq && !m_currentImage.empty()) {
+				width = m_currentImage.cols;
+				height = m_currentImage.rows;
+				pts = m_pts;
+				return m_currentImage;
+			}
+
+			if (g_frameQueue.isEmpty()) {
+				width = m_currentImage.cols;
+				height = m_currentImage.rows;
+				pts = m_pts;
+				std::cerr << "No frame available in getImage()" << std::endl;
+				return cv::Mat();
+			}
+
+			// getLatestFrame() clones the AVFrame — we own it from here
+			frameToProcess = g_frameQueue.getLatestFrame();
+			if (!frameToProcess) {
+				width = m_currentImage.cols;
+				height = m_currentImage.rows;
+				pts = m_pts;
+				return cv::Mat();
+			}
+		}
+		// --- _mutex released here ---
+		// At 4K NV12, cvtColorTwoPlane takes ~100–300 ms on CPU; during that
+		// window the decoder callback (onVideoFrame) is free to push the next
+		// frame and the CUDA HW capture path can run in parallel.
+
+		cv::Mat converted;
+		try {
+			converted = avframeToCVMat(frameToProcess);
 		}
 		catch (const std::exception& e) {
 			std::cerr << "Exception while converting AVFrame to cv::Mat: " << e.what() << std::endl;
 		}

-		// Preserve raw YUV/NV12 frame for GPU fast-path inference
-		// (NV12 from HW decode, YUV420P/YUVJ420P from SW decode)
-		if (frameToProcess &&
-			(frameToProcess->format == AV_PIX_FMT_NV12 ||
-			 frameToProcess->format == AV_PIX_FMT_YUV420P ||
-			 frameToProcess->format == AV_PIX_FMT_YUVJ420P)) {
-			if (m_currentNV12Frame) av_frame_free(&m_currentNV12Frame);
-			m_currentNV12Frame = av_frame_clone(frameToProcess);
+		// --- Phase 2: short locked section — publish new frame state ---
+		cv::Mat result;  // Snapshot taken under the lock, returned after release.
+		{
+			std::lock_guard<std::recursive_mutex> lock(_mutex);
+
+			if (!converted.empty()) {
+				m_currentImage = converted;
+				m_pts++;
+				m_lastFrameSeq = currentSeq;
+			}
+
+			// Preserve raw YUV/NV12 frame for GPU fast-path inference
+			// (NV12 from HW decode, YUV420P/YUVJ420P from SW decode)
+			if (frameToProcess &&
+				(frameToProcess->format == AV_PIX_FMT_NV12 ||
+				 frameToProcess->format == AV_PIX_FMT_YUV420P ||
+				 frameToProcess->format == AV_PIX_FMT_YUVJ420P)) {
+				if (m_currentNV12Frame) av_frame_free(&m_currentNV12Frame);
+				m_currentNV12Frame = av_frame_clone(frameToProcess);
+			}
+
+			width = m_currentImage.cols;
+			height = m_currentImage.rows;
+			pts = m_pts;
+			result = m_currentImage;  // Shallow copy under lock — refcount keeps buffer alive
 		}

-		// Free the cloned frame to avoid memory leaks
 		av_frame_free(&frameToProcess);

-		// Update frame dimensions and PTS
-		width = m_currentImage.cols;
-		height = m_currentImage.rows;
-		pts = m_pts;
-
-		// Return the processed image (shallow copy — caller gets reference-counted Mat)
-		return m_currentImage;
+		return result;
 	}
 	catch (const std::exception& e) {
 		std::cerr << "Unexpected exception in getImage(): " << e.what() << std::endl;