Enable log information. Disable NPU in U9

2026-04-21 15:48:27 +10:00
parent 00f6e2f852
commit 97d814936d
18 changed files with 301 additions and 54 deletions
--- a/MediaClient/media/video_decoder.cpp
+++ b/MediaClient/media/video_decoder.cpp
@@ -4,6 +4,7 @@
 #include "lock.h"
 #include "media_codec.h"
 #include "media_parse.h"
+#include <atomic>
 #include <memory>

 #include "ANSLicense.h"   // ANS_DBG macro (gated by ANSCORE_DEBUGVIEW)
@@ -14,6 +15,16 @@ extern "C" {
 #include "libavutil/mem.h"
 }

+// ---------------------------------------------------------------------------
+//  Leak diagnostics — exported counters for media allocation balance.
+//  Alloc/free counts only ever increase (bytes-in-flight is the one net value).
+//  If (allocs - frees) climbs monotonically over time, the allocator is leaking.
+//  Read by the MEDIA_Leak heartbeat in video_player.cpp (every 60 s).
+// ---------------------------------------------------------------------------
+std::atomic<int64_t> g_contiguousAllocs{0};          // +1 after av_buffer_create succeeds in contiguousGetBuffer2
+std::atomic<int64_t> g_contiguousFrees{0};           // +1 on every anscore_contiguous_free call
+std::atomic<int64_t> g_contiguousBytesInFlight{0};   // sum(total) of unfreed buffers
+
 // ---------------------------------------------------------------------------
 //  Contiguous YUV420P allocator — trims per-call malloc overhead and enables
 //  the zero-copy fast path in avframeYUV420PToCvMat for resolutions where the
@@ -23,7 +34,20 @@ extern "C" {
 //  single-block layout still improves cache behaviour for the bulk memcpy.)
 // ---------------------------------------------------------------------------
 namespace {
-    void anscore_contiguous_free(void* /*opaque*/, uint8_t* data) {
+    // Opaque payload passed to av_buffer_create() so the free callback can
+    // subtract the exact byte count being returned (no global lookup needed).
+    struct ContiguousOpaque {
+        size_t bytes;   // buffer size ("total") recorded at allocation; subtracted from g_contiguousBytesInFlight on free
+    };
+
+    void anscore_contiguous_free(void* opaque, uint8_t* data) {   // free callback registered through av_buffer_create()
+        if (opaque) {   // null payload: no byte count to give back
+            auto* o = static_cast<ContiguousOpaque*>(opaque);
+            g_contiguousBytesInFlight.fetch_sub(static_cast<int64_t>(o->bytes),
+                                                std::memory_order_relaxed);
+            delete o;   // payload was allocated with new (std::nothrow) by the allocator path
+        }
+        g_contiguousFrees.fetch_add(1, std::memory_order_relaxed);   // counted whether or not a payload was present
        av_free(data);
    }
 }
@@ -77,13 +101,24 @@ int CVideoDecoder::contiguousGetBuffer2(AVCodecContext* s, AVFrame* frame, int f
        return AVERROR(ENOMEM);
    }

-    AVBufferRef* ref = av_buffer_create(buf, (int)total,
-                                        anscore_contiguous_free, nullptr, 0);
-    if (!ref) {
+    auto* opaque = new (std::nothrow) ContiguousOpaque{total};
+    if (!opaque) {
        av_free(buf);
        return AVERROR(ENOMEM);
    }

+    AVBufferRef* ref = av_buffer_create(buf, (int)total,
+                                        anscore_contiguous_free, opaque, 0);
+    if (!ref) {
+        delete opaque;
+        av_free(buf);
+        return AVERROR(ENOMEM);
+    }
+
+    g_contiguousAllocs.fetch_add(1, std::memory_order_relaxed);
+    g_contiguousBytesInFlight.fetch_add(static_cast<int64_t>(total),
+                                        std::memory_order_relaxed);
+
    for (int i = 0; i < AV_NUM_DATA_POINTERS; ++i) {
        frame->buf[i]      = nullptr;
        frame->data[i]     = nullptr;
--- a/MediaClient/media/video_player.cpp
+++ b/MediaClient/media/video_player.cpp
@@ -37,6 +37,22 @@ extern "C"

 #include "ANSLicense.h"   // ANS_DBG macro (gated by ANSCORE_DEBUGVIEW)

+// ---------------------------------------------------------------------------
+//  Leak diagnostics — definitions for counters declared extern in header.
+//  Also references counters defined in video_decoder.cpp so the heartbeat
+//  below can report media allocator balance in a single line.
+// ---------------------------------------------------------------------------
+std::atomic<int64_t> g_queueClones{0};    // successful av_frame_clone calls in FrameQueue
+std::atomic<int64_t> g_queueFrees{0};     // av_frame_free of queue frames / clones taken from the queue
+std::atomic<int64_t> g_nv12Clones{0};     // successful clones into m_currentNV12Frame
+std::atomic<int64_t> g_nv12Frees{0};      // av_frame_free(&m_currentNV12Frame)
+std::atomic<int64_t> g_cudaHWClones{0};   // successful clones into m_currentCudaHWFrame
+std::atomic<int64_t> g_cudaHWFrees{0};    // av_frame_free(&m_currentCudaHWFrame)
+
+extern std::atomic<int64_t> g_contiguousAllocs;          // defined in video_decoder.cpp
+extern std::atomic<int64_t> g_contiguousFrees;           // defined in video_decoder.cpp
+extern std::atomic<int64_t> g_contiguousBytesInFlight;   // defined in video_decoder.cpp
+
 // libyuv: SIMD-accelerated YUV↔RGB conversion with native strided-plane input.
 // Replaces the memcpy-into-staging + cv::cvtColor(COLOR_YUV2BGR_I420) chain
 // in avframeYUV420PToCvMat with a direct I420→RGB24 (== OpenCV BGR memory
@@ -1629,10 +1645,12 @@ void CVideoPlayer::close()
 	closeAudio();
 	if (m_currentNV12Frame) {
 		av_frame_free(&m_currentNV12Frame);
+		g_nv12Frees.fetch_add(1, std::memory_order_relaxed);
 		m_currentNV12Frame = nullptr;
 	}
 	if (m_currentCudaHWFrame) {
 		av_frame_free(&m_currentCudaHWFrame);
+		g_cudaHWFrees.fetch_add(1, std::memory_order_relaxed);
 		m_currentCudaHWFrame = nullptr;
 	}
 	if (m_pSnapFrame)
@@ -2329,8 +2347,12 @@ void CVideoPlayer::onVideoFrame(AVFrame* frame)
 		// and we can safely clone the CUDA frame without deadlock risk.
 		// cloneCudaHWFrame_unlocked() is safe because decoder._mutex is already held.
 		if (m_pVideoDecoder && m_pVideoDecoder->isCudaHWAccel()) {
-			if (m_currentCudaHWFrame) av_frame_free(&m_currentCudaHWFrame);
+			if (m_currentCudaHWFrame) {
+				av_frame_free(&m_currentCudaHWFrame);
+				g_cudaHWFrees.fetch_add(1, std::memory_order_relaxed);
+			}
 			m_currentCudaHWFrame = m_pVideoDecoder->cloneCudaHWFrame_unlocked();
+			if (m_currentCudaHWFrame) g_cudaHWClones.fetch_add(1, std::memory_order_relaxed);
 		}

 		// Track how many clean frames have arrived since keyframe
@@ -2455,8 +2477,12 @@ cv::Mat CVideoPlayer::getImage(int& width, int& height, int64_t& pts) {
 				(frameToProcess->format == AV_PIX_FMT_NV12 ||
 				 frameToProcess->format == AV_PIX_FMT_YUV420P ||
 				 frameToProcess->format == AV_PIX_FMT_YUVJ420P)) {
-				if (m_currentNV12Frame) av_frame_free(&m_currentNV12Frame);
+				if (m_currentNV12Frame) {
+					av_frame_free(&m_currentNV12Frame);
+					g_nv12Frees.fetch_add(1, std::memory_order_relaxed);
+				}
 				m_currentNV12Frame = av_frame_clone(frameToProcess);
+				if (m_currentNV12Frame) g_nv12Clones.fetch_add(1, std::memory_order_relaxed);
 			}

 			width = m_currentImage.cols;
@@ -2466,6 +2492,49 @@ cv::Mat CVideoPlayer::getImage(int& width, int& height, int64_t& pts) {
 		}

 		av_frame_free(&frameToProcess);
+		g_queueFrees.fetch_add(1, std::memory_order_relaxed);
+
+		// Leak diagnostics — one heartbeat every 60 s across the whole process.
+		// Each counter pair (allocs, frees) should stay balanced. A monotonic
+		// rise in (allocs - frees) identifies the leaking pool. The bytesMB
+		// field covers the ~12 MB/frame contiguous YUV420P buffers specifically —
+		// watch for a steady climb there while the counters look balanced (a
+		// refcount leak in a held clone would show that shape).
+		{
+			using clk = std::chrono::steady_clock;
+			static std::atomic<long long> s_nextLeakLogTick{0};
+			const long long tick = clk::now().time_since_epoch().count();
+			long long expected = s_nextLeakLogTick.load(std::memory_order_relaxed);
+			if (tick >= expected) {
+				const long long deadline = tick +
+					std::chrono::duration_cast<clk::duration>(
+						std::chrono::seconds(60)).count();
+				// Claim the next window — first writer wins so only one thread logs.
+				if (s_nextLeakLogTick.compare_exchange_strong(
+						expected, deadline, std::memory_order_relaxed)) {
+					const int64_t qA = g_queueClones.load(std::memory_order_relaxed);
+					const int64_t qF = g_queueFrees.load(std::memory_order_relaxed);
+					const int64_t nvA = g_nv12Clones.load(std::memory_order_relaxed);
+					const int64_t nvF = g_nv12Frees.load(std::memory_order_relaxed);
+					const int64_t cuA = g_cudaHWClones.load(std::memory_order_relaxed);
+					const int64_t cuF = g_cudaHWFrees.load(std::memory_order_relaxed);
+					const int64_t cgA = g_contiguousAllocs.load(std::memory_order_relaxed);
+					const int64_t cgF = g_contiguousFrees.load(std::memory_order_relaxed);
+					const int64_t cgB = g_contiguousBytesInFlight.load(std::memory_order_relaxed);
+					ANS_DBG("MEDIA_Leak",
+						"queue(C=%lld F=%lld net=%lld depth=%zu) "
+						"nv12(C=%lld F=%lld net=%lld) "
+						"cudaHW(C=%lld F=%lld net=%lld) "
+						"contig(A=%lld F=%lld net=%lld bytesMB=%.1f)",
+						(long long)qA, (long long)qF, (long long)(qA - qF),
+						g_frameQueue.size(),
+						(long long)nvA, (long long)nvF, (long long)(nvA - nvF),
+						(long long)cuA, (long long)cuF, (long long)(cuA - cuF),
+						(long long)cgA, (long long)cgF, (long long)(cgA - cgF),
+						(double)cgB / (1024.0 * 1024.0));
+				}
+			}
+		}

 		// Emit timing breakdown. Throttled so DebugView / stderr stay readable.
 		{
@@ -2540,11 +2609,13 @@ std::string CVideoPlayer::getJpegImage(int& width, int& height, int64_t& pts) {
 		catch (const std::exception& e) {
 			std::cerr << "Exception while converting AVFrame to JPEG string: " << e.what() << std::endl;
 			av_frame_free(&frameToProcess);
+			g_queueFrees.fetch_add(1, std::memory_order_relaxed);
 			return m_lastJpegImage;
 		}
 		const auto t3 = clk::now();

 		av_frame_free(&frameToProcess);
+		g_queueFrees.fetch_add(1, std::memory_order_relaxed);

 		if (m_pts < INT64_MAX) {
 			m_pts++;
--- a/MediaClient/media/video_player.h
+++ b/MediaClient/media/video_player.h
@@ -15,8 +15,18 @@
 #include <opencv2/highgui.hpp>
 #include <opencv2/opencv.hpp>
 #include <turbojpeg.h>
+#include <atomic>
 #include <chrono>

+// Leak diagnostics — clone/free counter pairs reported by the MEDIA_Leak heartbeat.
+// Defined in video_player.cpp; also incremented from FrameQueue here.
+extern std::atomic<int64_t> g_queueClones;         // av_frame_clone from FrameQueue
+extern std::atomic<int64_t> g_queueFrees;          // av_frame_free from FrameQueue
+extern std::atomic<int64_t> g_nv12Clones;          // m_currentNV12Frame = av_frame_clone
+extern std::atomic<int64_t> g_nv12Frees;           // av_frame_free(&m_currentNV12Frame)
+extern std::atomic<int64_t> g_cudaHWClones;        // m_currentCudaHWFrame = clone
+extern std::atomic<int64_t> g_cudaHWFrees;         // av_frame_free(&m_currentCudaHWFrame)
+
 typedef struct
 {
    uint32          SyncTimestamp;
@@ -46,6 +56,7 @@ public:
            std::cerr << "Failed to clone AVFrame!" << std::endl;
            return;
        }
+        g_queueClones.fetch_add(1, std::memory_order_relaxed);

        frameQueue.push(frameCopy);
        m_frameSeq++;  // New frame arrived
@@ -55,6 +66,7 @@ public:
            AVFrame* oldFrame = frameQueue.front();
            frameQueue.pop();
            av_frame_free(&oldFrame);
+            g_queueFrees.fetch_add(1, std::memory_order_relaxed);
        }
    }

@@ -73,7 +85,15 @@ public:
        }

        // Clone the latest frame before returning it
-        return av_frame_clone(frameQueue.back());
+        AVFrame* clone = av_frame_clone(frameQueue.back());
+        if (clone) g_queueClones.fetch_add(1, std::memory_order_relaxed);
+        return clone;
+    }
+
+    // Current queue depth, read under queueMutex — a point-in-time snapshot for the leak heartbeat.
+    size_t size() {
+        std::lock_guard<std::mutex> lock(queueMutex);   // hold the queue lock while reading the depth
+        return frameQueue.size();   // value may be stale as soon as the lock is released
    }

    // Retrieve and remove the oldest frame from the queue
@@ -102,6 +122,7 @@ public:
            AVFrame* frame = frameQueue.front();
            frameQueue.pop();
            av_frame_free(&frame);
+            g_queueFrees.fetch_add(1, std::memory_order_relaxed);
        }
        m_frameSeq = 0;
    }