Use software decoder by default

This commit is contained in:
2026-04-04 20:19:54 +11:00
parent 3a21026790
commit e134ebdf15
24 changed files with 693 additions and 215 deletions

View File

@@ -258,7 +258,15 @@ void CRtspPlayer::stop()
// Set flags BEFORE stopping decoder so TCP rx thread stops calling decode()
m_bPlaying = FALSE;
m_bPaused = FALSE;
CVideoPlayer::StopVideoDecoder(); // Stop the video decoder
CVideoPlayer::StopVideoDecoder(); // Stop the video decoder + uninit (free VRAM)
// Close RTSP connection and shut down RX threads.
// Without this, stopped cameras keep TCP/UDP threads running,
// sockets open, and receiving network data — wasting CPU and
// network resources. With 100 cameras and only 5 running,
// 95 idle threads would consume CPU for no purpose.
// Start() → Setup() → open() will reconnect when needed.
m_rtsp.rtsp_close();
}
BOOL CRtspPlayer::pause()

View File

@@ -1275,6 +1275,90 @@ cv::Mat CVideoPlayer::avframeNV12ToCvMat(const AVFrame* frame)
return cv::Mat();
}
}
// Convert a planar YUV420P/YUVJ420P AVFrame to a BGR cv::Mat using OpenCV
// only (no sws_scale). YUV420P has 3 separate planes: Y (full res), U and V
// (half res each). OpenCV's cvtColor(COLOR_YUV2BGR_I420) expects a single
// contiguous buffer laid out as [Y: W×H][U: W/2 × H/2][V: W/2 × H/2],
// i.e. height = H*3/2, width = W, single channel.
// Returns an empty Mat on null/invalid frame or on exception.
cv::Mat CVideoPlayer::avframeYUV420PToCvMat(const AVFrame* frame) {
    try {
        if (!frame || frame->width <= 0 || frame->height <= 0) {
            return cv::Mat();
        }
        const int width = frame->width;
        const int height = frame->height;
        const int uvWidth = width / 2;
        const int uvHeight = height / 2;
        const int yStride = frame->linesize[0];
        const int uStride = frame->linesize[1];
        const int vStride = frame->linesize[2];

        cv::Mat yuv;
        if (yStride == width && uStride == uvWidth && vStride == uvWidth &&
            frame->data[1] == frame->data[0] + width * height &&
            frame->data[2] == frame->data[1] + uvWidth * uvHeight) {
            // Fast path: all three planes are packed contiguously with
            // stride == width — wrap the decoder buffer directly, zero copy.
            yuv = cv::Mat(height * 3 / 2, width, CV_8UC1, frame->data[0]);
        } else {
            // Slow path: planes have padding (linesize > width) or are not
            // adjacent in memory — copy each plane into a contiguous buffer.
            yuv.create(height * 3 / 2, width, CV_8UC1);
            // Copies one plane; memcpy the whole plane when unpadded,
            // otherwise row by row to strip the stride padding.
            auto copyPlane = [](uint8_t* out, const uint8_t* src,
                                int srcStride, int w, int h) {
                if (srcStride == w) {
                    std::memcpy(out, src, static_cast<size_t>(w) * h);
                } else {
                    for (int row = 0; row < h; ++row) {
                        std::memcpy(out + row * w, src + row * srcStride, w);
                    }
                }
            };
            uint8_t* dst = yuv.data;
            copyPlane(dst, frame->data[0], yStride, width, height);
            dst += width * height;
            copyPlane(dst, frame->data[1], uStride, uvWidth, uvHeight);
            dst += uvWidth * uvHeight;
            copyPlane(dst, frame->data[2], vStride, uvWidth, uvHeight);
        }

        // Single conversion tail shared by both paths (the original
        // duplicated this block in the fast and slow paths).
        cv::Mat bgrImage;
        cv::cvtColor(yuv, bgrImage, cv::COLOR_YUV2BGR_I420);
        if (m_nImageQuality == 1) {
            // Quality mode: expand limited-range luma (16-235) to full
            // range (0-255): out = (in - 16) * 255/219.
            bgrImage.convertTo(bgrImage, -1, 255.0 / 219.0, -16.0 * 255.0 / 219.0);
        }
        return bgrImage;
    }
    catch (const std::exception& e) {
        std::cerr << "Exception in avframeYUV420PToCvMat: " << e.what() << std::endl;
        return cv::Mat();
    }
}
cv::Mat CVideoPlayer::avframeToCVMat(const AVFrame* pFrame) {
std::lock_guard<std::recursive_mutex> lock(_mutex);
try {
@@ -1287,8 +1371,9 @@ cv::Mat CVideoPlayer::avframeToCVMat(const AVFrame* pFrame) {
switch (pFrame->format) {
case AV_PIX_FMT_NV12:
return avframeNV12ToCvMat(pFrame);
case AV_PIX_FMT_YUV420P:
case AV_PIX_FMT_YUVJ420P:
return avframeAnyToCvmat(pFrame);
return avframeYUV420PToCvMat(pFrame);
default:
return avframeAnyToCvmat(pFrame);
@@ -1305,7 +1390,7 @@ CVideoPlayer::CVideoPlayer() :
, m_bAudioInited(FALSE)
, m_bPlaying(FALSE)
, m_bPaused(FALSE)
, m_nHWDecoding(HW_DECODING_AUTO)//(HW_DECODING_AUTO)// HW_DECODING_D3D11 //HW_DECODING_DISABLE
, m_nHWDecoding(HW_DECODING_DISABLE)// Software decode by default — saves VRAM (no NVDEC DPB surfaces)
, m_bUpdown(FALSE)
, m_bSnapshot(FALSE)
, m_nSnapVideoFmt(AV_PIX_FMT_YUVJ420P)
@@ -1740,6 +1825,13 @@ void CVideoPlayer::StopVideoDecoder() {
// Flush decoder to drain and discard any buffered frames,
// so stale reference frames don't corrupt the next session
decoder->flush();
// Free NVDEC decoder context and all GPU surfaces (DPB buffers).
// Stopped cameras should not hold VRAM — with 100 cameras created
// but only 5 running, the 95 idle decoders would consume ~5-10 GB.
// The decoder will be re-initialized automatically when the next
// video packet arrives after Start() is called.
decoder->uninit();
m_bVideoInited = FALSE;
}
// Clear queue but KEEP m_currentImage and m_lastJpegImage —
// getImage()/getJpegImage() will return the last good frame while decoder stabilizes
@@ -1842,6 +1934,13 @@ void CVideoPlayer::setTargetFPS(double intervalMs)
m_targetIntervalMs = intervalMs;
m_targetFPSInitialized = false; // reset timing on change
}
// Milliseconds elapsed since the decoder delivered its most recent frame.
// Returns 0.0 until the first frame has been recorded.
double CVideoPlayer::getLastFrameAgeMs()
{
    std::lock_guard<std::recursive_mutex> lock(_mutex);
    if (!m_lastDecoderFrameTimeSet) {
        return 0.0;  // no frame seen yet
    }
    const auto elapsed = std::chrono::steady_clock::now() - m_lastDecoderFrameTime;
    return std::chrono::duration<double, std::milli>(elapsed).count();
}
void CVideoPlayer::playVideo(uint8* data, int len, uint32 ts, uint16 seq)
{
if (m_bRecording)
@@ -2061,6 +2160,11 @@ void CVideoPlayer::onVideoFrame(AVFrame* frame)
}
}
// Record wall-clock time of every decoded frame (even rate-limited ones).
// Used by getLastFrameAgeMs() to detect truly stale cameras.
m_lastDecoderFrameTime = std::chrono::steady_clock::now();
m_lastDecoderFrameTimeSet = true;
// --- Frame rate limiting ---
// Skip post-decode processing (clone, queue push, CUDA clone) if not enough
// time has elapsed since the last processed frame. The decode itself still

View File

@@ -148,6 +148,7 @@ public:
// Image quality mode: 0=fast (OpenCV BT.601, ~2ms), 1=quality (sws BT.709+range, ~12ms)
virtual void setImageQuality(int mode) { m_nImageQuality = mode; }
void setTargetFPS(double intervalMs); // Set minimum interval between processed frames in ms (0 = no limit, 100 = ~10 FPS)
double getLastFrameAgeMs(); // Milliseconds since last frame arrived from decoder (0 if no frame yet)
virtual void setRtpMulticast(BOOL flag) {}
virtual void setRtpOverUdp(BOOL flag) {}
@@ -223,6 +224,7 @@ protected:
cv::Mat avframeAnyToCvmat(const AVFrame* frame);
cv::Mat avframeNV12ToCvMat(const AVFrame* frame);
cv::Mat avframeYUV420PToCvMat(const AVFrame* frame); // YUV420P/YUVJ420P → BGR (OpenCV, no sws_scale)
cv::Mat avframeYUVJ420PToCvmat(const AVFrame* frame);
cv::Mat avframeToCVMat(const AVFrame* frame);
@@ -273,6 +275,12 @@ protected:
std::chrono::steady_clock::time_point m_lastProcessedTime; // timestamp of last processed frame
bool m_targetFPSInitialized = false; // first-frame flag
// Wall-clock timestamp of last frame received from the decoder (NOT from getImage).
// Updated in onVideoFrame() for EVERY decoded frame, even rate-limited ones.
// Used by LabVIEW to detect truly stale cameras vs rate-limited ones.
std::chrono::steady_clock::time_point m_lastDecoderFrameTime;
bool m_lastDecoderFrameTimeSet = false;
BOOL m_bPlaying;
BOOL m_bPaused;