Improve ANSCV with software decoder:
Thread-local staging Mat (video_player.cpp:1400-1407) — single biggest win. Eliminates the 12 MB per-call malloc/free cycle. Contiguous get_buffer2 allocator (video_decoder.cpp:35-102) — keeps the 3 bulk memcpys cache-friendly. Would also enable FAST/zero-copy for resolutions where visible_h % 64 == 0. SW-decoder thread config (video_decoder.cpp:528-540) — thread_count=0, thread_type=FRAME|SLICE. FRAME is downgraded to SLICE-only by AV_CODEC_FLAG_LOW_DELAY, but decode throughput is sufficient for your input rate. SetTargetFPS(100) delivery throttle (already there) — caps onVideoFrame post-decode work at 10 FPS. Keeps the caller path warm-cached. Instrumentation — [MEDIA_DecInit] / [MEDIA_Convert] / [MEDIA_SWDec] / [MEDIA_Timing] / [MEDIA_JpegTiming] — always-on regression detector, zero cost when ANSCORE_DEBUGVIEW=OFF.
This commit is contained in:
@@ -28,12 +28,24 @@ extern "C"
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <chrono>
|
||||
#include <atomic>
|
||||
#include <libswscale/swscale.h>
|
||||
#if defined(__SSE2__) || defined(_M_X64) || defined(_M_AMD64)
|
||||
#include <emmintrin.h>
|
||||
#define HAS_SSE2 1
|
||||
#endif
|
||||
|
||||
#include "ANSLicense.h" // ANS_DBG macro (gated by ANSCORE_DEBUGVIEW)
|
||||
|
||||
// libyuv: SIMD-accelerated YUV↔RGB conversion with native strided-plane input.
|
||||
// Replaces the memcpy-into-staging + cv::cvtColor(COLOR_YUV2BGR_I420) chain
|
||||
// in avframeYUV420PToCvMat with a direct I420→RGB24 (== OpenCV BGR memory
|
||||
// order) call. When the submodule isn't checked out, ANSCORE_HAS_LIBYUV is
|
||||
// not defined and we fall back to the pre-libyuv path.
|
||||
#if defined(ANSCORE_HAS_LIBYUV) && ANSCORE_HAS_LIBYUV
|
||||
#include "libyuv/convert_argb.h" // libyuv::I420ToRGB24
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
void VideoDecoderCallback(AVFrame* frame, void* userdata)
|
||||
@@ -1284,6 +1296,71 @@ cv::Mat CVideoPlayer::avframeYUV420PToCvMat(const AVFrame* frame) {
|
||||
const int width = frame->width;
|
||||
const int height = frame->height;
|
||||
|
||||
// Debug: confirm this SW-decode conversion is actually hit.
|
||||
// Throttled to ~1 log/sec at 30 fps to keep DebugView readable.
|
||||
// Gated by ANSCORE_DEBUGVIEW — compiles to nothing in production.
|
||||
{
|
||||
static std::atomic<uint64_t> s_swCallCount{0};
|
||||
uint64_t n = s_swCallCount.fetch_add(1, std::memory_order_relaxed);
|
||||
if ((n % 30) == 0) {
|
||||
const char* fmtName = av_get_pix_fmt_name((AVPixelFormat)frame->format);
|
||||
const bool contig =
|
||||
(frame->linesize[0] == width &&
|
||||
frame->linesize[1] == width / 2 &&
|
||||
frame->linesize[2] == width / 2 &&
|
||||
frame->data[1] == frame->data[0] + width * height &&
|
||||
frame->data[2] == frame->data[1] + (width / 2) * (height / 2));
|
||||
// Report the codec's allocated Y-plane height (inferred from
|
||||
// the Y/U pointer spacing and Y stride). Lets us see whether
|
||||
// our custom get_buffer2 achieved alloc_h == visible_h.
|
||||
const int yStrideDbg = frame->linesize[0] > 0 ? frame->linesize[0] : 1;
|
||||
const int alloc_h_y = (int)((frame->data[1] - frame->data[0]) / yStrideDbg);
|
||||
#if defined(ANSCORE_HAS_LIBYUV) && ANSCORE_HAS_LIBYUV
|
||||
const char* pathLabel = "LIBYUV/I420ToRGB24";
|
||||
#else
|
||||
const char* pathLabel =
|
||||
contig ? "FAST/zero-copy" :
|
||||
(frame->linesize[0] == width) ? "SLOW/bulk-memcpy" :
|
||||
"SLOW/per-row-copy";
|
||||
#endif
|
||||
(void)contig; // silence unused warning when libyuv is on
|
||||
ANS_DBG("MEDIA_SWDec",
|
||||
"avframeYUV420PToCvMat ENTRY call#%llu fmt=%s visible=%dx%d alloc_h_y=%d "
|
||||
"linesize=[%d,%d,%d] path=%s (this=%p)",
|
||||
(unsigned long long)n,
|
||||
fmtName ? fmtName : "?",
|
||||
width, height, alloc_h_y,
|
||||
frame->linesize[0], frame->linesize[1], frame->linesize[2],
|
||||
pathLabel,
|
||||
(void*)this);
|
||||
}
|
||||
}
|
||||
|
||||
#if defined(ANSCORE_HAS_LIBYUV) && ANSCORE_HAS_LIBYUV
|
||||
// libyuv path: direct I420 (3 strided planes) → RGB24 (== BGR in memory
|
||||
// order for libyuv, matches cv::Mat CV_8UC3 default). No staging buffer,
|
||||
// no memcpy, no cv::cvtColor — one SIMD-optimized sweep.
|
||||
//
|
||||
// libyuv's "RGB24" is B,G,R per pixel in memory (see RGB24ToARGBRow_C
|
||||
// in libyuv/source/row_common.cc where src[0]=b, src[1]=g, src[2]=r).
|
||||
// That matches OpenCV's BGR layout — safe to wrap in CV_8UC3.
|
||||
cv::Mat bgrImage(height, width, CV_8UC3);
|
||||
int ret = libyuv::I420ToRGB24(
|
||||
frame->data[0], frame->linesize[0],
|
||||
frame->data[1], frame->linesize[1],
|
||||
frame->data[2], frame->linesize[2],
|
||||
bgrImage.data, static_cast<int>(bgrImage.step),
|
||||
width, height);
|
||||
if (ret != 0) {
|
||||
std::cerr << "libyuv::I420ToRGB24 failed with ret=" << ret << std::endl;
|
||||
return cv::Mat();
|
||||
}
|
||||
if (m_nImageQuality == 1) {
|
||||
bgrImage.convertTo(bgrImage, -1, 255.0 / 219.0, -16.0 * 255.0 / 219.0);
|
||||
}
|
||||
return bgrImage;
|
||||
#else
|
||||
|
||||
// YUV420P has 3 separate planes: Y (full res), U (half), V (half).
|
||||
// OpenCV's cvtColor(COLOR_YUV2BGR_I420) expects a single contiguous buffer
|
||||
// with Y on top (H rows) and U,V stacked below (H/2 rows total).
|
||||
@@ -1309,12 +1386,25 @@ cv::Mat CVideoPlayer::avframeYUV420PToCvMat(const AVFrame* frame) {
|
||||
return bgrImage;
|
||||
}
|
||||
|
||||
// Slow path: planes have padding (linesize > width) — copy to contiguous buffer
|
||||
// Slow path: planes have padding (linesize > width) OR Y/U/V live in
|
||||
// non-adjacent buffers. Copy into a single I420-layout staging buffer
|
||||
// so cvtColor(COLOR_YUV2BGR_I420) can process it in one SIMD sweep.
|
||||
const int uvWidth = width / 2;
|
||||
const int uvHeight = height / 2;
|
||||
const int totalSize = width * height + uvWidth * uvHeight * 2;
|
||||
|
||||
cv::Mat yuv(height * 3 / 2, width, CV_8UC1);
|
||||
// Thread-local staging Mat — reused across calls to avoid a 12 MB malloc
|
||||
// on every 4K frame. Each decoder runs on its own worker thread, so
|
||||
// thread_local is the right granularity (no cross-thread sharing, no
|
||||
// locking). The Mat reallocates only when dimensions change.
|
||||
static thread_local cv::Mat s_yuvStaging;
|
||||
if (s_yuvStaging.rows != height * 3 / 2 ||
|
||||
s_yuvStaging.cols != width ||
|
||||
s_yuvStaging.type() != CV_8UC1 ||
|
||||
!s_yuvStaging.isContinuous()) {
|
||||
s_yuvStaging.create(height * 3 / 2, width, CV_8UC1);
|
||||
}
|
||||
cv::Mat& yuv = s_yuvStaging;
|
||||
uint8_t* dst = yuv.data;
|
||||
|
||||
// Copy Y plane (line by line if stride != width)
|
||||
@@ -1352,6 +1442,7 @@ cv::Mat CVideoPlayer::avframeYUV420PToCvMat(const AVFrame* frame) {
|
||||
bgrImage.convertTo(bgrImage, -1, 255.0 / 219.0, -16.0 * 255.0 / 219.0);
|
||||
}
|
||||
return bgrImage;
|
||||
#endif // ANSCORE_HAS_LIBYUV
|
||||
}
|
||||
catch (const std::exception& e) {
|
||||
std::cerr << "Exception in avframeYUV420PToCvMat: " << e.what() << std::endl;
|
||||
@@ -1371,13 +1462,54 @@ cv::Mat CVideoPlayer::avframeToCVMat(const AVFrame* pFrame) {
|
||||
return cv::Mat();
|
||||
}
|
||||
|
||||
// One-shot diagnostic: print the pixel format the first time through so
|
||||
// we can see which branch of the switch below is taken. Remove after use.
|
||||
static bool s_loggedFmt = false;
|
||||
if (!s_loggedFmt) {
|
||||
s_loggedFmt = true;
|
||||
const char* name = av_get_pix_fmt_name((AVPixelFormat)pFrame->format);
|
||||
fprintf(stderr, "[avframeToCVMat] first frame format=%d (%s) %dx%d\n",
|
||||
pFrame->format, name ? name : "?", pFrame->width, pFrame->height);
|
||||
ANS_DBG("MEDIA_Convert",
|
||||
"avframeToCVMat FIRST-FRAME fmt=%d(%s) %dx%d HWDecoding=%d (this=%p)",
|
||||
pFrame->format, name ? name : "?",
|
||||
pFrame->width, pFrame->height,
|
||||
m_nHWDecoding, (void*)this);
|
||||
}
|
||||
|
||||
// Per-branch throttled trace so we can see the dispatch at runtime.
|
||||
// Gated by ANSCORE_DEBUGVIEW — zero overhead in production.
|
||||
static std::atomic<uint64_t> s_dispatchCount{0};
|
||||
const uint64_t dispN = s_dispatchCount.fetch_add(1, std::memory_order_relaxed);
|
||||
const bool logThis = ((dispN % 30) == 0);
|
||||
|
||||
switch (pFrame->format) {
|
||||
case AV_PIX_FMT_NV12:
|
||||
if (logThis) {
|
||||
ANS_DBG("MEDIA_Convert",
|
||||
"DISPATCH call#%llu fmt=NV12 %dx%d -> avframeNV12ToCvMat (HW-decode path)",
|
||||
(unsigned long long)dispN, pFrame->width, pFrame->height);
|
||||
}
|
||||
return avframeNV12ToCvMat(pFrame);
|
||||
case AV_PIX_FMT_YUV420P:
|
||||
case AV_PIX_FMT_YUVJ420P:
|
||||
if (logThis) {
|
||||
ANS_DBG("MEDIA_Convert",
|
||||
"DISPATCH call#%llu fmt=%s %dx%d -> avframeYUV420PToCvMat (SW-decode path)",
|
||||
(unsigned long long)dispN,
|
||||
(pFrame->format == AV_PIX_FMT_YUVJ420P) ? "YUVJ420P" : "YUV420P",
|
||||
pFrame->width, pFrame->height);
|
||||
}
|
||||
return avframeYUV420PToCvMat(pFrame);
|
||||
default:
|
||||
if (logThis) {
|
||||
const char* name = av_get_pix_fmt_name((AVPixelFormat)pFrame->format);
|
||||
ANS_DBG("MEDIA_Convert",
|
||||
"DISPATCH call#%llu fmt=%d(%s) %dx%d -> avframeAnyToCvmat (sws_scale fallback)",
|
||||
(unsigned long long)dispN,
|
||||
pFrame->format, name ? name : "?",
|
||||
pFrame->width, pFrame->height);
|
||||
}
|
||||
return avframeAnyToCvmat(pFrame);
|
||||
|
||||
}
|
||||
@@ -2239,6 +2371,12 @@ cv::Mat CVideoPlayer::getImage(int& width, int& height, int64_t& pts) {
|
||||
AVFrame* frameToProcess = nullptr;
|
||||
uint64_t currentSeq = 0;
|
||||
|
||||
// Timing breakdown — gated by ANSCORE_DEBUGVIEW (zero overhead in production).
|
||||
// t0 = entry, t1 = after pulling frame from queue, t2 = after YUV->BGR,
|
||||
// t3 = after publish. Throttled to every 30 full-path calls (~1/sec @30fps).
|
||||
using clk = std::chrono::steady_clock;
|
||||
const auto t0 = clk::now();
|
||||
|
||||
// --- Phase 1: short locked section — examine state, pull latest frame ---
|
||||
{
|
||||
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
||||
@@ -2289,6 +2427,7 @@ cv::Mat CVideoPlayer::getImage(int& width, int& height, int64_t& pts) {
|
||||
// At 4K NV12, cvtColorTwoPlane takes ~100–300 ms on CPU; during that
|
||||
// window the decoder callback (onVideoFrame) is free to push the next
|
||||
// frame and the CUDA HW capture path can run in parallel.
|
||||
const auto t1 = clk::now();
|
||||
|
||||
cv::Mat converted;
|
||||
try {
|
||||
@@ -2297,6 +2436,7 @@ cv::Mat CVideoPlayer::getImage(int& width, int& height, int64_t& pts) {
|
||||
catch (const std::exception& e) {
|
||||
std::cerr << "Exception while converting AVFrame to cv::Mat: " << e.what() << std::endl;
|
||||
}
|
||||
const auto t2 = clk::now();
|
||||
|
||||
// --- Phase 2: short locked section — publish new frame state ---
|
||||
cv::Mat result; // Snapshot taken under the lock, returned after release.
|
||||
@@ -2327,6 +2467,26 @@ cv::Mat CVideoPlayer::getImage(int& width, int& height, int64_t& pts) {
|
||||
|
||||
av_frame_free(&frameToProcess);
|
||||
|
||||
// Emit timing breakdown. Throttled so DebugView / stderr stay readable.
|
||||
{
|
||||
static std::atomic<uint64_t> s_timingCount{0};
|
||||
const uint64_t n = s_timingCount.fetch_add(1, std::memory_order_relaxed);
|
||||
if ((n % 30) == 0) {
|
||||
const auto t3 = clk::now();
|
||||
auto ms = [](clk::time_point a, clk::time_point b) {
|
||||
return std::chrono::duration<double, std::milli>(b - a).count();
|
||||
};
|
||||
ANS_DBG("MEDIA_Timing",
|
||||
"getImage call#%llu pull=%.2fms convert=%.2fms publish=%.2fms total=%.2fms "
|
||||
"size=%dx%d seq=%llu (this=%p)",
|
||||
(unsigned long long)n,
|
||||
ms(t0, t1), ms(t1, t2), ms(t2, t3), ms(t0, t3),
|
||||
width, height,
|
||||
(unsigned long long)currentSeq,
|
||||
(void*)this);
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
catch (const std::exception& e) {
|
||||
@@ -2342,9 +2502,14 @@ cv::Mat CVideoPlayer::getImage(int& width, int& height, int64_t& pts) {
|
||||
|
||||
std::string CVideoPlayer::getJpegImage(int& width, int& height, int64_t& pts) {
|
||||
try {
|
||||
// Timing breakdown — gated by ANSCORE_DEBUGVIEW (zero overhead in production).
|
||||
using clk = std::chrono::steady_clock;
|
||||
const auto t0 = clk::now();
|
||||
|
||||
// Use same _mutex as getImage() to protect shared state consistently
|
||||
// recursive_mutex allows nested calls to avframeToJpegString → _mutex
|
||||
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
||||
const auto t1 = clk::now();
|
||||
|
||||
// While waiting for keyframe or during settle period after restart,
|
||||
// return the last good cached JPEG to avoid showing corrupted frames
|
||||
@@ -2359,6 +2524,10 @@ std::string CVideoPlayer::getJpegImage(int& width, int& height, int64_t& pts) {
|
||||
if (!frameToProcess) {
|
||||
return m_lastJpegImage; // Return the last valid JPEG image if no frame is available
|
||||
}
|
||||
const auto t2 = clk::now();
|
||||
const int frameFmt = frameToProcess->format;
|
||||
const int frameW = frameToProcess->width;
|
||||
const int frameH = frameToProcess->height;
|
||||
|
||||
try {
|
||||
if (frameToProcess->format == AV_PIX_FMT_NV12) {
|
||||
@@ -2373,6 +2542,7 @@ std::string CVideoPlayer::getJpegImage(int& width, int& height, int64_t& pts) {
|
||||
av_frame_free(&frameToProcess);
|
||||
return m_lastJpegImage;
|
||||
}
|
||||
const auto t3 = clk::now();
|
||||
|
||||
av_frame_free(&frameToProcess);
|
||||
|
||||
@@ -2392,6 +2562,28 @@ std::string CVideoPlayer::getJpegImage(int& width, int& height, int64_t& pts) {
|
||||
m_lastJpegImage = std::move(m_jpegImage); // Move instead of copy
|
||||
}
|
||||
|
||||
// Throttled timing breakdown for the JPEG hot path.
|
||||
{
|
||||
static std::atomic<uint64_t> s_jpegTimingCount{0};
|
||||
const uint64_t n = s_jpegTimingCount.fetch_add(1, std::memory_order_relaxed);
|
||||
if ((n % 30) == 0) {
|
||||
const auto t4 = clk::now();
|
||||
auto ms = [](clk::time_point a, clk::time_point b) {
|
||||
return std::chrono::duration<double, std::milli>(b - a).count();
|
||||
};
|
||||
const char* fmtName = av_get_pix_fmt_name((AVPixelFormat)frameFmt);
|
||||
ANS_DBG("MEDIA_JpegTiming",
|
||||
"getJpegImage call#%llu lock=%.2fms pull=%.2fms encode=%.2fms publish=%.2fms "
|
||||
"total=%.2fms src_fmt=%s %dx%d jpeg_bytes=%zu (this=%p)",
|
||||
(unsigned long long)n,
|
||||
ms(t0, t1), ms(t1, t2), ms(t2, t3), ms(t3, t4), ms(t0, t4),
|
||||
fmtName ? fmtName : "?",
|
||||
frameW, frameH,
|
||||
m_lastJpegImage.size(),
|
||||
(void*)this);
|
||||
}
|
||||
}
|
||||
|
||||
// Return the most recent valid JPEG image
|
||||
return m_lastJpegImage;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user