Improve ANSCV with software decoder:
Thread-local staging Mat (video_player.cpp:1400-1407) — single biggest win. Eliminates the 12 MB per-call malloc/free cycle. Contiguous get_buffer2 allocator (video_decoder.cpp:35-102) — keeps the 3 bulk memcpys cache-friendly. Would also enable FAST/zero-copy for resolutions where visible_h % 64 == 0. SW-decoder thread config (video_decoder.cpp:528-540) — thread_count=0, thread_type=FRAME|SLICE. FRAME is downgraded to SLICE-only by AV_CODEC_FLAG_LOW_DELAY, but decode throughput is sufficient for your input rate. SetTargetFPS(100) delivery throttle (already there) — caps onVideoFrame post-decode work at 10 FPS. Keeps the caller path warm-cached. Instrumentation — [MEDIA_DecInit] / [MEDIA_Convert] / [MEDIA_SWDec] / [MEDIA_Timing] / [MEDIA_JpegTiming] — always-on regression detector, zero cost when ANSCORE_DEBUGVIEW=OFF.
This commit is contained in:
@@ -6,10 +6,101 @@
|
||||
#include "media_parse.h"
|
||||
#include <memory>
|
||||
|
||||
#include "ANSLicense.h" // ANS_DBG macro (gated by ANSCORE_DEBUGVIEW)
|
||||
|
||||
extern "C" {
|
||||
#include "libavutil/imgutils.h"
|
||||
#include "libavutil/buffer.h"
|
||||
#include "libavutil/mem.h"
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Contiguous YUV420P allocator — trims per-call malloc overhead and enables
|
||||
// the zero-copy fast path in avframeYUV420PToCvMat for resolutions where the
|
||||
// codec's aligned height happens to equal the visible height.
|
||||
// (4K HEVC at 2160 rows still needs 2176-row alignment → one 16-row gap
|
||||
// between Y and U remains; the fast path stays off for that case but the
|
||||
// single-block layout still improves cache behaviour for the bulk memcpy.)
|
||||
// ---------------------------------------------------------------------------
|
||||
namespace {
|
||||
void anscore_contiguous_free(void* /*opaque*/, uint8_t* data) {
|
||||
av_free(data);
|
||||
}
|
||||
}
|
||||
|
||||
uint32 g_hw_decoder_nums = 0;
|
||||
uint32 g_hw_decoder_max = 4; // Hardware decoding resources are limited, Limit up to 4 hardware decoding sessions
|
||||
void* g_hw_decoder_mutex = sys_os_create_mutex();
|
||||
|
||||
/**
 * get_buffer2 callback that backs a planar 8-bit 4:2:0 frame with ONE
 * zero-filled allocation laid out as [Y plane][U plane][V plane][padding].
 *
 * The request is forwarded to avcodec_default_get_buffer2() when:
 *   - the context has a hw_frames_ctx (hardware surfaces),
 *   - the frame format is neither AV_PIX_FMT_YUV420P nor AV_PIX_FMT_YUVJ420P,
 *   - the frame width or height is not positive.
 *
 * @param s      Codec context requesting the buffer.
 * @param frame  Frame to populate. On success buf[0], data[0..2],
 *               linesize[0..2] and extended_data are set; every other
 *               buf/data/linesize slot is cleared.
 * @param flags  Passed through to the default allocator on the fallback
 *               paths; not otherwise used.
 * @return 0 on success, AVERROR(ENOMEM) if the allocation or the AVBufferRef
 *         creation fails, or the default allocator's result on a fallback path.
 */
int CVideoDecoder::contiguousGetBuffer2(AVCodecContext* s, AVFrame* frame, int flags) {
    // Never touch HW surfaces — those are owned by the hwframe pool.
    if (s->hw_frames_ctx) {
        return avcodec_default_get_buffer2(s, frame, flags);
    }

    // Only pack planar 8-bit 4:2:0. Everything else (NV12 from unpackers,
    // 10-bit YUV, 4:2:2, 4:4:4, RGB, paletted, …) goes through the stock
    // allocator.
    if (frame->format != AV_PIX_FMT_YUV420P && frame->format != AV_PIX_FMT_YUVJ420P) {
        return avcodec_default_get_buffer2(s, frame, flags);
    }
    if (frame->width <= 0 || frame->height <= 0) {
        return avcodec_default_get_buffer2(s, frame, flags);
    }

    // Ask the codec for the minimum aligned dimensions it needs. For HEVC
    // this typically rounds up to a multiple of 64 (the CTU size); for H.264
    // to a multiple of 16. stride_align[i] is the per-plane linesize alignment.
    int aligned_w = frame->width;
    int aligned_h = frame->height;
    int stride_align[AV_NUM_DATA_POINTERS] = {0};
    avcodec_align_dimensions2(s, &aligned_w, &aligned_h, stride_align);

    // Use the strictest stride_align across all planes (never less than 32)
    // for every plane — FFmpeg only asks for a multiple of the per-plane
    // alignment, not exact equality.
    int max_align = 32;
    for (int i = 0; i < AV_NUM_DATA_POINTERS; ++i) {
        if (stride_align[i] > max_align) max_align = stride_align[i];
    }

    // Rounds v up to a multiple of a; valid only for power-of-two a.
    // NOTE(review): assumes every stride_align[] entry is a power of two — confirm.
    auto align_up = [](int v, int a) { return (v + a - 1) & ~(a - 1); };

    // The chroma linesize must itself be a multiple of the required
    // alignment. Aligning it to only max_align / 2 can produce, for example,
    // a 400-byte chroma stride for an 800-wide stream when 32-byte alignment
    // is required. Align chroma to the full max_align and derive the luma
    // stride as exactly twice that, so that:
    //   - every plane's linesize is a multiple of max_align,
    //   - linesize[0] == 2 * linesize[1] still holds,
    //   - each plane still starts on a max_align multiple of the base pointer.
    const int uv_stride = align_up((aligned_w + 1) / 2, max_align);
    const int y_stride  = 2 * uv_stride;
    const int y_h       = aligned_h;
    const int uv_h      = (aligned_h + 1) / 2;

    const size_t y_sz  = (size_t)y_stride * y_h;
    const size_t uv_sz = (size_t)uv_stride * uv_h;
    // Trailing padding absorbs small over-reads past the end of the V plane.
    const size_t total = y_sz + 2 * uv_sz + AV_INPUT_BUFFER_PADDING_SIZE;

    // NOTE(review): this allocates and zero-fills a fresh block on every
    // call; a buffer pool would avoid the repeated malloc + memset.
    uint8_t* buf = (uint8_t*)av_mallocz(total);
    if (!buf) {
        return AVERROR(ENOMEM);
    }

    // The AVBufferRef takes ownership of buf; anscore_contiguous_free
    // releases it once the last reference is dropped.
    AVBufferRef* ref = av_buffer_create(buf, (int)total,
                                        anscore_contiguous_free, nullptr, 0);
    if (!ref) {
        av_free(buf);
        return AVERROR(ENOMEM);
    }

    // Start from a clean slate so unused plane slots are null/zero.
    for (int i = 0; i < AV_NUM_DATA_POINTERS; ++i) {
        frame->buf[i]      = nullptr;
        frame->data[i]     = nullptr;
        frame->linesize[i] = 0;
    }

    // A single buffer reference backs all three planes.
    frame->buf[0]      = ref;
    frame->data[0]     = buf;
    frame->data[1]     = buf + y_sz;
    frame->data[2]     = buf + y_sz + uv_sz;
    frame->linesize[0] = y_stride;
    frame->linesize[1] = uv_stride;
    frame->linesize[2] = uv_stride;
    frame->extended_data = frame->data;

    return 0;
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// HWDecoderPool implementation
|
||||
// ---------------------------------------------------------------------------
|
||||
@@ -424,6 +515,30 @@ BOOL CVideoDecoder::init(enum AVCodecID codec, uint8* extradata, int extradata_s
|
||||
}
|
||||
}
|
||||
|
||||
// Configure multi-threading for the SOFTWARE decoder.
|
||||
// Hardware decoders (NVDEC, DXVA2/D3D11VA, QSV, VideoToolbox) do their
|
||||
// own parallelism inside the GPU/fixed-function block and ignore these
|
||||
// fields — so we only enable threading when HW init was skipped (hwMode
|
||||
// == HW_DECODING_DISABLE) or failed (fell back to SW).
|
||||
//
|
||||
// Without this, libavcodec's HEVC/H.264 decoder runs on a single core,
|
||||
// which on 4K HEVC streams is ~80–120 ms per frame. Frame + slice
|
||||
// threading on a 24-thread CPU typically brings that down to 10–20 ms.
|
||||
// thread_count = 0 lets FFmpeg auto-pick (capped internally ~16).
|
||||
if (!m_bHardwareDecoderEnabled) {
|
||||
m_pContext->thread_count = 0;
|
||||
m_pContext->thread_type = FF_THREAD_FRAME | FF_THREAD_SLICE;
|
||||
|
||||
// Install contiguous Y+U+V allocator. This packs all three planes
|
||||
// into a single av_malloc block so the BGR-conversion fast path
|
||||
// (avframeYUV420PToCvMat) can either wrap the frame zero-copy, or
|
||||
// at minimum hit a tight 3-call bulk memcpy with good cache locality
|
||||
// instead of per-row copies into a freshly allocated staging Mat.
|
||||
// HW decoders must NEVER have get_buffer2 overridden — they use
|
||||
// hw_frames_ctx for surface management.
|
||||
m_pContext->get_buffer2 = &CVideoDecoder::contiguousGetBuffer2;
|
||||
}
|
||||
|
||||
// FIXED: Use avcodec_open2 instead of avcodec_thread_open
|
||||
if (avcodec_open2(m_pContext, m_pCodec, NULL) < 0)
|
||||
{
|
||||
@@ -432,6 +547,27 @@ BOOL CVideoDecoder::init(enum AVCodecID codec, uint8* extradata, int extradata_s
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
// Debug: one-shot visibility into which decoder actually got opened.
|
||||
// m_bHardwareDecoderEnabled is set by hwDecoderInit() on success; when
|
||||
// hwMode == HW_DECODING_DISABLE or hwDecoderInit failed, it stays FALSE
|
||||
// and the SW decoder (avcodec_find_decoder) is used.
|
||||
// active_thread_type is what FFmpeg actually negotiated after open2
|
||||
// (value 0x1 = FF_THREAD_FRAME, value 0x2 = FF_THREAD_SLICE).
|
||||
ANS_DBG("MEDIA_DecInit",
|
||||
"avcodec_open2 OK codec=%s(%s) %dx%d hwMode=%d hwEnabled=%d cudaHW=%d gpu=%d "
|
||||
"threads=%d thread_type_req=0x%x active=0x%x -> %s decoder",
|
||||
m_pCodec->name ? m_pCodec->name : "?",
|
||||
m_pCodec->long_name ? m_pCodec->long_name : "?",
|
||||
m_pContext->width, m_pContext->height,
|
||||
hwMode,
|
||||
(int)m_bHardwareDecoderEnabled,
|
||||
(int)m_bCudaHWAccel,
|
||||
m_hwGpuIndex,
|
||||
m_pContext->thread_count,
|
||||
m_pContext->thread_type,
|
||||
m_pContext->active_thread_type,
|
||||
m_bHardwareDecoderEnabled ? "HARDWARE" : "SOFTWARE");
|
||||
|
||||
m_pFrame = av_frame_alloc();
|
||||
if (NULL == m_pFrame)
|
||||
{
|
||||
|
||||
Reference in New Issue
Block a user