Improve ANSCV with software decoder:

Thread-local staging Mat (video_player.cpp:1400-1407) — single biggest win. Eliminates the 12 MB per-call malloc/free cycle.
Contiguous get_buffer2 allocator (video_decoder.cpp:35-102) — keeps the 3 bulk memcpys cache-friendly. Would also enable FAST/zero-copy for resolutions where visible_h % 64 == 0.
SW-decoder thread config (video_decoder.cpp:528-540) — thread_count=0, thread_type=FRAME|SLICE. FRAME is downgraded to SLICE-only by AV_CODEC_FLAG_LOW_DELAY, but decode throughput is sufficient for your input rate.
SetTargetFPS(100) delivery throttle (already there) — caps onVideoFrame post-decode work at 10 FPS. Keeps the caller path warm-cached.
Instrumentation — [MEDIA_DecInit] / [MEDIA_Convert] / [MEDIA_SWDec] / [MEDIA_Timing] / [MEDIA_JpegTiming] — always-on regression detector, zero cost when ANSCORE_DEBUGVIEW=OFF.
This commit is contained in:
2026-04-20 12:18:43 +10:00
parent adf32da2a2
commit 9f0a10a4c8
13 changed files with 431 additions and 201 deletions

View File

@@ -6,10 +6,101 @@
#include "media_parse.h"
#include <memory>
#include "ANSLicense.h" // ANS_DBG macro (gated by ANSCORE_DEBUGVIEW)
extern "C" {
#include "libavutil/imgutils.h"
#include "libavutil/buffer.h"
#include "libavutil/mem.h"
}
// ---------------------------------------------------------------------------
// Contiguous YUV420P allocator — trims per-call malloc overhead and enables
// the zero-copy fast path in avframeYUV420PToCvMat for resolutions where the
// codec's aligned height happens to equal the visible height.
// (4K HEVC at 2160 rows still needs 2176-row alignment → one 16-row gap
// between Y and U remains; the fast path stays off for that case but the
// single-block layout still improves cache behaviour for the bulk memcpy.)
// ---------------------------------------------------------------------------
namespace {
// AVBufferRef free callback for the single contiguous Y+U+V block handed out
// by CVideoDecoder::contiguousGetBuffer2. The opaque pointer is unused; the
// block was obtained with av_mallocz, so av_free is the matching release.
void anscore_contiguous_free(void* /*opaque*/, uint8_t* block) {
    av_free(block);
}
} // namespace
uint32 g_hw_decoder_nums = 0;
uint32 g_hw_decoder_max = 4; // Hardware decoding resources are limited, Limit up to 4 hardware decoding sessions
void* g_hw_decoder_mutex = sys_os_create_mutex();
// Custom AVCodecContext::get_buffer2 for the SOFTWARE decoder.
// Packs the Y, U and V planes of 8-bit 4:2:0 frames into ONE av_malloc block
// so downstream conversion can either wrap the frame zero-copy or hit a tight
// bulk memcpy. Anything it cannot handle safely is delegated to
// avcodec_default_get_buffer2, preserving stock behaviour.
// Returns 0 on success or a negative AVERROR code.
int CVideoDecoder::contiguousGetBuffer2(AVCodecContext* s, AVFrame* frame, int flags) {
    // Never touch HW surfaces — those are owned by the hwframe pool.
    if (s->hw_frames_ctx) {
        return avcodec_default_get_buffer2(s, frame, flags);
    }
    // Only pack planar 8-bit 4:2:0. Everything else (NV12 from unpackers, 10-bit
    // YUV, 4:2:2, 4:4:4, RGB, paletted, …) goes through the stock allocator.
    if (frame->format != AV_PIX_FMT_YUV420P && frame->format != AV_PIX_FMT_YUVJ420P) {
        return avcodec_default_get_buffer2(s, frame, flags);
    }
    if (frame->width <= 0 || frame->height <= 0) {
        return avcodec_default_get_buffer2(s, frame, flags);
    }
    // Ask the codec for the minimum aligned dimensions it needs. For HEVC
    // this typically rounds up to a multiple of 64 (the CTU size); for H.264
    // to a multiple of 16. stride_align[i] is the per-plane linesize alignment.
    int aligned_w = frame->width;
    int aligned_h = frame->height;
    int stride_align[AV_NUM_DATA_POINTERS] = {0};
    avcodec_align_dimensions2(s, &aligned_w, &aligned_h, stride_align);
    // Round up to the strictest stride_align across all planes (simpler and
    // safe — FFmpeg only asks for alignment, not exact equality). FFmpeg
    // reports power-of-two alignments, which the align_up below relies on;
    // the floor of 32 covers AVX2 load requirements.
    int max_align = 32;
    for (int i = 0; i < AV_NUM_DATA_POINTERS; ++i) {
        if (stride_align[i] > max_align) max_align = stride_align[i];
    }
    auto align_up = [](int v, int a) { return (v + a - 1) & ~(a - 1); };
    const int y_stride = align_up(aligned_w, max_align);
    // FIX: chroma linesizes must satisfy the SAME stride_align requirement as
    // luma — stride_align[i] constrains linesize[i] directly; it is NOT halved
    // for subsampled planes. The previous max_align/2 could hand libavcodec
    // under-aligned U/V strides (e.g. 16 when its AVX2 paths assume 32),
    // risking misaligned SIMD accesses. A few padding bytes per chroma row is
    // the correct price.
    const int uv_stride = align_up(aligned_w / 2, max_align);
    const int y_h = aligned_h;
    const int uv_h = (aligned_h + 1) / 2;
    const size_t y_sz = (size_t)y_stride * y_h;
    const size_t uv_sz = (size_t)uv_stride * uv_h;
    // Trailing padding so FFmpeg's overreading SIMD readers stay in-bounds.
    const size_t total = y_sz + 2 * uv_sz + AV_INPUT_BUFFER_PADDING_SIZE;
    // av_mallocz: zeroed so the padding gap/rows never leak stale data into
    // deblocking or downstream copies.
    uint8_t* buf = (uint8_t*)av_mallocz(total);
    if (!buf) {
        return AVERROR(ENOMEM);
    }
    AVBufferRef* ref = av_buffer_create(buf, (int)total,
                                        anscore_contiguous_free, nullptr, 0);
    if (!ref) {
        av_free(buf);
        return AVERROR(ENOMEM);
    }
    // Reset every slot before publishing ours — the frame may carry stale
    // pointers from a previous allocator.
    for (int i = 0; i < AV_NUM_DATA_POINTERS; ++i) {
        frame->buf[i] = nullptr;
        frame->data[i] = nullptr;
        frame->linesize[i] = 0;
    }
    // Single refcounted buffer owns all three planes; data[1]/data[2] are
    // interior pointers into it.
    frame->buf[0] = ref;
    frame->data[0] = buf;
    frame->data[1] = buf + y_sz;
    frame->data[2] = buf + y_sz + uv_sz;
    frame->linesize[0] = y_stride;
    frame->linesize[1] = uv_stride;
    frame->linesize[2] = uv_stride;
    frame->extended_data = frame->data;
    return 0;
}
// ---------------------------------------------------------------------------
// HWDecoderPool implementation
// ---------------------------------------------------------------------------
@@ -424,6 +515,30 @@ BOOL CVideoDecoder::init(enum AVCodecID codec, uint8* extradata, int extradata_s
}
}
// Configure multi-threading for the SOFTWARE decoder.
// Hardware decoders (NVDEC, DXVA2/D3D11VA, QSV, VideoToolbox) do their
// own parallelism inside the GPU/fixed-function block and ignore these
// fields — so we only enable threading when HW init was skipped (hwMode
// == HW_DECODING_DISABLE) or failed (fell back to SW).
//
// Without this, libavcodec's HEVC/H.264 decoder runs on a single core,
// which on 4K HEVC streams is ~80–120 ms per frame. Frame + slice
// threading on a 24-thread CPU typically brings that down to 10–20 ms.
// thread_count = 0 lets FFmpeg auto-pick (capped internally ~16).
if (!m_bHardwareDecoderEnabled) {
m_pContext->thread_count = 0;
m_pContext->thread_type = FF_THREAD_FRAME | FF_THREAD_SLICE;
// Install contiguous Y+U+V allocator. This packs all three planes
// into a single av_malloc block so the BGR-conversion fast path
// (avframeYUV420PToCvMat) can either wrap the frame zero-copy, or
// at minimum hit a tight 3-call bulk memcpy with good cache locality
// instead of per-row copies into a freshly allocated staging Mat.
// HW decoders must NEVER have get_buffer2 overridden — they use
// hw_frames_ctx for surface management.
m_pContext->get_buffer2 = &CVideoDecoder::contiguousGetBuffer2;
}
// FIXED: Use avcodec_open2 instead of avcodec_thread_open
if (avcodec_open2(m_pContext, m_pCodec, NULL) < 0)
{
@@ -432,6 +547,27 @@ BOOL CVideoDecoder::init(enum AVCodecID codec, uint8* extradata, int extradata_s
return FALSE;
}
// Debug: one-shot visibility into which decoder actually got opened.
// m_bHardwareDecoderEnabled is set by hwDecoderInit() on success; when
// hwMode == HW_DECODING_DISABLE or hwDecoderInit failed, it stays FALSE
// and the SW decoder (avcodec_find_decoder) is used.
// active_thread_type is what FFmpeg actually negotiated after open2
// (bit 1 = FF_THREAD_FRAME, bit 2 = FF_THREAD_SLICE).
ANS_DBG("MEDIA_DecInit",
"avcodec_open2 OK codec=%s(%s) %dx%d hwMode=%d hwEnabled=%d cudaHW=%d gpu=%d "
"threads=%d thread_type_req=0x%x active=0x%x -> %s decoder",
m_pCodec->name ? m_pCodec->name : "?",
m_pCodec->long_name ? m_pCodec->long_name : "?",
m_pContext->width, m_pContext->height,
hwMode,
(int)m_bHardwareDecoderEnabled,
(int)m_bCudaHWAccel,
m_hwGpuIndex,
m_pContext->thread_count,
m_pContext->thread_type,
m_pContext->active_thread_type,
m_bHardwareDecoderEnabled ? "HARDWARE" : "SOFTWARE");
m_pFrame = av_frame_alloc();
if (NULL == m_pFrame)
{

View File

@@ -147,6 +147,15 @@ public:
// Non-owning accessor for the underlying FFmpeg codec context.
// The pointer remains owned by this decoder — callers must not close or free
// it, and must not use it past the decoder's lifetime. May be NULL before a
// successful init(). (NOTE(review): name keeps the existing "AVCode" spelling
// for API compatibility.)
AVCodecContext* getAVCodeContext() {
return m_pContext;
}
// Custom AVCodecContext::get_buffer2 callback used by the SOFTWARE decoder.
// Allocates Y, U, and V planes of YUV420P / YUVJ420P frames in a SINGLE
// contiguous av_malloc block so that CVideoPlayer::avframeYUV420PToCvMat
// can wrap them zero-copy into an I420 cv::Mat when the allocated height
// matches the visible height (i.e. no codec padding rows between planes).
// For unhandled formats (HW surfaces, 10-bit, 4:2:2, 4:4:4, planar-alpha,
// …) it delegates to avcodec_default_get_buffer2, preserving correctness.
static int contiguousGetBuffer2(AVCodecContext* s, AVFrame* frame, int flags);
private:
BOOL readFrame();
int render(AVFrame* frame);

View File

@@ -28,12 +28,24 @@ extern "C"
#include <string>
#include <vector>
#include <chrono>
#include <atomic>
#include <libswscale/swscale.h>
#if defined(__SSE2__) || defined(_M_X64) || defined(_M_AMD64)
#include <emmintrin.h>
#define HAS_SSE2 1
#endif
#include "ANSLicense.h" // ANS_DBG macro (gated by ANSCORE_DEBUGVIEW)
// libyuv: SIMD-accelerated YUV↔RGB conversion with native strided-plane input.
// Replaces the memcpy-into-staging + cv::cvtColor(COLOR_YUV2BGR_I420) chain
// in avframeYUV420PToCvMat with a direct I420→RGB24 (== OpenCV BGR memory
// order) call. When the submodule isn't checked out, ANSCORE_HAS_LIBYUV is
// not defined and we fall back to the pre-libyuv path.
#if defined(ANSCORE_HAS_LIBYUV) && ANSCORE_HAS_LIBYUV
#include "libyuv/convert_argb.h" // libyuv::I420ToRGB24
#endif
void VideoDecoderCallback(AVFrame* frame, void* userdata)
@@ -1284,6 +1296,71 @@ cv::Mat CVideoPlayer::avframeYUV420PToCvMat(const AVFrame* frame) {
const int width = frame->width;
const int height = frame->height;
// Debug: confirm this SW-decode conversion is actually hit.
// Throttled to ~1 log/sec at 30 fps to keep DebugView readable.
// Gated by ANSCORE_DEBUGVIEW — compiles to nothing in production.
{
static std::atomic<uint64_t> s_swCallCount{0};
uint64_t n = s_swCallCount.fetch_add(1, std::memory_order_relaxed);
if ((n % 30) == 0) {
const char* fmtName = av_get_pix_fmt_name((AVPixelFormat)frame->format);
const bool contig =
(frame->linesize[0] == width &&
frame->linesize[1] == width / 2 &&
frame->linesize[2] == width / 2 &&
frame->data[1] == frame->data[0] + width * height &&
frame->data[2] == frame->data[1] + (width / 2) * (height / 2));
// Report the codec's allocated Y-plane height (inferred from
// the Y/U pointer spacing and Y stride). Lets us see whether
// our custom get_buffer2 achieved alloc_h == visible_h.
const int yStrideDbg = frame->linesize[0] > 0 ? frame->linesize[0] : 1;
const int alloc_h_y = (int)((frame->data[1] - frame->data[0]) / yStrideDbg);
#if defined(ANSCORE_HAS_LIBYUV) && ANSCORE_HAS_LIBYUV
const char* pathLabel = "LIBYUV/I420ToRGB24";
#else
const char* pathLabel =
contig ? "FAST/zero-copy" :
(frame->linesize[0] == width) ? "SLOW/bulk-memcpy" :
"SLOW/per-row-copy";
#endif
(void)contig; // silence unused warning when libyuv is on
ANS_DBG("MEDIA_SWDec",
"avframeYUV420PToCvMat ENTRY call#%llu fmt=%s visible=%dx%d alloc_h_y=%d "
"linesize=[%d,%d,%d] path=%s (this=%p)",
(unsigned long long)n,
fmtName ? fmtName : "?",
width, height, alloc_h_y,
frame->linesize[0], frame->linesize[1], frame->linesize[2],
pathLabel,
(void*)this);
}
}
#if defined(ANSCORE_HAS_LIBYUV) && ANSCORE_HAS_LIBYUV
// libyuv path: direct I420 (3 strided planes) → RGB24 (== BGR in memory
// order for libyuv, matches cv::Mat CV_8UC3 default). No staging buffer,
// no memcpy, no cv::cvtColor — one SIMD-optimized sweep.
//
// libyuv's "RGB24" is B,G,R per pixel in memory (see RGB24ToARGBRow_C
// in libyuv/source/row_common.cc where src[0]=b, src[1]=g, src[2]=r).
// That matches OpenCV's BGR layout — safe to wrap in CV_8UC3.
cv::Mat bgrImage(height, width, CV_8UC3);
int ret = libyuv::I420ToRGB24(
frame->data[0], frame->linesize[0],
frame->data[1], frame->linesize[1],
frame->data[2], frame->linesize[2],
bgrImage.data, static_cast<int>(bgrImage.step),
width, height);
if (ret != 0) {
std::cerr << "libyuv::I420ToRGB24 failed with ret=" << ret << std::endl;
return cv::Mat();
}
if (m_nImageQuality == 1) {
bgrImage.convertTo(bgrImage, -1, 255.0 / 219.0, -16.0 * 255.0 / 219.0);
}
return bgrImage;
#else
// YUV420P has 3 separate planes: Y (full res), U (half), V (half).
// OpenCV's cvtColor(COLOR_YUV2BGR_I420) expects a single contiguous buffer
// with Y on top (H rows) and U,V stacked below (H/2 rows total).
@@ -1309,12 +1386,25 @@ cv::Mat CVideoPlayer::avframeYUV420PToCvMat(const AVFrame* frame) {
return bgrImage;
}
// Slow path: planes have padding (linesize > width) — copy to contiguous buffer
// Slow path: planes have padding (linesize > width) OR Y/U/V live in
// non-adjacent buffers. Copy into a single I420-layout staging buffer
// so cvtColor(COLOR_YUV2BGR_I420) can process it in one SIMD sweep.
const int uvWidth = width / 2;
const int uvHeight = height / 2;
const int totalSize = width * height + uvWidth * uvHeight * 2;
cv::Mat yuv(height * 3 / 2, width, CV_8UC1);
// Thread-local staging Mat — reused across calls to avoid a 12 MB malloc
// on every 4K frame. Each decoder runs on its own worker thread, so
// thread_local is the right granularity (no cross-thread sharing, no
// locking). The Mat reallocates only when dimensions change.
static thread_local cv::Mat s_yuvStaging;
if (s_yuvStaging.rows != height * 3 / 2 ||
s_yuvStaging.cols != width ||
s_yuvStaging.type() != CV_8UC1 ||
!s_yuvStaging.isContinuous()) {
s_yuvStaging.create(height * 3 / 2, width, CV_8UC1);
}
cv::Mat& yuv = s_yuvStaging;
uint8_t* dst = yuv.data;
// Copy Y plane (line by line if stride != width)
@@ -1352,6 +1442,7 @@ cv::Mat CVideoPlayer::avframeYUV420PToCvMat(const AVFrame* frame) {
bgrImage.convertTo(bgrImage, -1, 255.0 / 219.0, -16.0 * 255.0 / 219.0);
}
return bgrImage;
#endif // ANSCORE_HAS_LIBYUV
}
catch (const std::exception& e) {
std::cerr << "Exception in avframeYUV420PToCvMat: " << e.what() << std::endl;
@@ -1371,13 +1462,54 @@ cv::Mat CVideoPlayer::avframeToCVMat(const AVFrame* pFrame) {
return cv::Mat();
}
// One-shot diagnostic: print the pixel format the first time through so
// we can see which branch of the switch below is taken. Remove after use.
static bool s_loggedFmt = false;
if (!s_loggedFmt) {
s_loggedFmt = true;
const char* name = av_get_pix_fmt_name((AVPixelFormat)pFrame->format);
fprintf(stderr, "[avframeToCVMat] first frame format=%d (%s) %dx%d\n",
pFrame->format, name ? name : "?", pFrame->width, pFrame->height);
ANS_DBG("MEDIA_Convert",
"avframeToCVMat FIRST-FRAME fmt=%d(%s) %dx%d HWDecoding=%d (this=%p)",
pFrame->format, name ? name : "?",
pFrame->width, pFrame->height,
m_nHWDecoding, (void*)this);
}
// Per-branch throttled trace so we can see the dispatch at runtime.
// Gated by ANSCORE_DEBUGVIEW — zero overhead in production.
static std::atomic<uint64_t> s_dispatchCount{0};
const uint64_t dispN = s_dispatchCount.fetch_add(1, std::memory_order_relaxed);
const bool logThis = ((dispN % 30) == 0);
switch (pFrame->format) {
case AV_PIX_FMT_NV12:
if (logThis) {
ANS_DBG("MEDIA_Convert",
"DISPATCH call#%llu fmt=NV12 %dx%d -> avframeNV12ToCvMat (HW-decode path)",
(unsigned long long)dispN, pFrame->width, pFrame->height);
}
return avframeNV12ToCvMat(pFrame);
case AV_PIX_FMT_YUV420P:
case AV_PIX_FMT_YUVJ420P:
if (logThis) {
ANS_DBG("MEDIA_Convert",
"DISPATCH call#%llu fmt=%s %dx%d -> avframeYUV420PToCvMat (SW-decode path)",
(unsigned long long)dispN,
(pFrame->format == AV_PIX_FMT_YUVJ420P) ? "YUVJ420P" : "YUV420P",
pFrame->width, pFrame->height);
}
return avframeYUV420PToCvMat(pFrame);
default:
if (logThis) {
const char* name = av_get_pix_fmt_name((AVPixelFormat)pFrame->format);
ANS_DBG("MEDIA_Convert",
"DISPATCH call#%llu fmt=%d(%s) %dx%d -> avframeAnyToCvmat (sws_scale fallback)",
(unsigned long long)dispN,
pFrame->format, name ? name : "?",
pFrame->width, pFrame->height);
}
return avframeAnyToCvmat(pFrame);
}
@@ -2239,6 +2371,12 @@ cv::Mat CVideoPlayer::getImage(int& width, int& height, int64_t& pts) {
AVFrame* frameToProcess = nullptr;
uint64_t currentSeq = 0;
// Timing breakdown — gated by ANSCORE_DEBUGVIEW (zero overhead in production).
// t0 = entry, t1 = after pulling frame from queue, t2 = after YUV->BGR,
// t3 = after publish. Throttled to every 30 full-path calls (~1/sec @30fps).
using clk = std::chrono::steady_clock;
const auto t0 = clk::now();
// --- Phase 1: short locked section — examine state, pull latest frame ---
{
std::lock_guard<std::recursive_mutex> lock(_mutex);
@@ -2289,6 +2427,7 @@ cv::Mat CVideoPlayer::getImage(int& width, int& height, int64_t& pts) {
// At 4K NV12, cvtColorTwoPlane takes ~100–300 ms on CPU; during that
// window the decoder callback (onVideoFrame) is free to push the next
// frame and the CUDA HW capture path can run in parallel.
const auto t1 = clk::now();
cv::Mat converted;
try {
@@ -2297,6 +2436,7 @@ cv::Mat CVideoPlayer::getImage(int& width, int& height, int64_t& pts) {
catch (const std::exception& e) {
std::cerr << "Exception while converting AVFrame to cv::Mat: " << e.what() << std::endl;
}
const auto t2 = clk::now();
// --- Phase 2: short locked section — publish new frame state ---
cv::Mat result; // Snapshot taken under the lock, returned after release.
@@ -2327,6 +2467,26 @@ cv::Mat CVideoPlayer::getImage(int& width, int& height, int64_t& pts) {
av_frame_free(&frameToProcess);
// Emit timing breakdown. Throttled so DebugView / stderr stay readable.
{
static std::atomic<uint64_t> s_timingCount{0};
const uint64_t n = s_timingCount.fetch_add(1, std::memory_order_relaxed);
if ((n % 30) == 0) {
const auto t3 = clk::now();
auto ms = [](clk::time_point a, clk::time_point b) {
return std::chrono::duration<double, std::milli>(b - a).count();
};
ANS_DBG("MEDIA_Timing",
"getImage call#%llu pull=%.2fms convert=%.2fms publish=%.2fms total=%.2fms "
"size=%dx%d seq=%llu (this=%p)",
(unsigned long long)n,
ms(t0, t1), ms(t1, t2), ms(t2, t3), ms(t0, t3),
width, height,
(unsigned long long)currentSeq,
(void*)this);
}
}
return result;
}
catch (const std::exception& e) {
@@ -2342,9 +2502,14 @@ cv::Mat CVideoPlayer::getImage(int& width, int& height, int64_t& pts) {
std::string CVideoPlayer::getJpegImage(int& width, int& height, int64_t& pts) {
try {
// Timing breakdown — gated by ANSCORE_DEBUGVIEW (zero overhead in production).
using clk = std::chrono::steady_clock;
const auto t0 = clk::now();
// Use same _mutex as getImage() to protect shared state consistently
// recursive_mutex allows nested calls to avframeToJpegString → _mutex
std::lock_guard<std::recursive_mutex> lock(_mutex);
const auto t1 = clk::now();
// While waiting for keyframe or during settle period after restart,
// return the last good cached JPEG to avoid showing corrupted frames
@@ -2359,6 +2524,10 @@ std::string CVideoPlayer::getJpegImage(int& width, int& height, int64_t& pts) {
if (!frameToProcess) {
return m_lastJpegImage; // Return the last valid JPEG image if no frame is available
}
const auto t2 = clk::now();
const int frameFmt = frameToProcess->format;
const int frameW = frameToProcess->width;
const int frameH = frameToProcess->height;
try {
if (frameToProcess->format == AV_PIX_FMT_NV12) {
@@ -2373,6 +2542,7 @@ std::string CVideoPlayer::getJpegImage(int& width, int& height, int64_t& pts) {
av_frame_free(&frameToProcess);
return m_lastJpegImage;
}
const auto t3 = clk::now();
av_frame_free(&frameToProcess);
@@ -2392,6 +2562,28 @@ std::string CVideoPlayer::getJpegImage(int& width, int& height, int64_t& pts) {
m_lastJpegImage = std::move(m_jpegImage); // Move instead of copy
}
// Throttled timing breakdown for the JPEG hot path.
{
static std::atomic<uint64_t> s_jpegTimingCount{0};
const uint64_t n = s_jpegTimingCount.fetch_add(1, std::memory_order_relaxed);
if ((n % 30) == 0) {
const auto t4 = clk::now();
auto ms = [](clk::time_point a, clk::time_point b) {
return std::chrono::duration<double, std::milli>(b - a).count();
};
const char* fmtName = av_get_pix_fmt_name((AVPixelFormat)frameFmt);
ANS_DBG("MEDIA_JpegTiming",
"getJpegImage call#%llu lock=%.2fms pull=%.2fms encode=%.2fms publish=%.2fms "
"total=%.2fms src_fmt=%s %dx%d jpeg_bytes=%zu (this=%p)",
(unsigned long long)n,
ms(t0, t1), ms(t1, t2), ms(t2, t3), ms(t3, t4), ms(t0, t4),
fmtName ? fmtName : "?",
frameW, frameH,
m_lastJpegImage.size(),
(void*)this);
}
}
// Return the most recent valid JPEG image
return m_lastJpegImage;
}