Files
ANSCORE/MediaClient/media/video_decoder.cpp
2026-04-21 09:26:02 +10:00

1151 lines
37 KiB
C++
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#include "video_decoder.h"
#include "avcodec_mutex.h"
#include "lock.h"
#include "media_codec.h"
#include "media_parse.h"
#include <memory>
#include "ANSLicense.h" // ANS_DBG macro (gated by ANSCORE_DEBUGVIEW)
extern "C" {
#include "libavutil/imgutils.h"
#include "libavutil/buffer.h"
#include "libavutil/mem.h"
}
// ---------------------------------------------------------------------------
// Contiguous YUV420P allocator — trims per-call malloc overhead and enables
// the zero-copy fast path in avframeYUV420PToCvMat for resolutions where the
// codec's aligned height happens to equal the visible height.
// (4K HEVC at 2160 rows still needs 2176-row alignment → one 16-row gap
// between Y and U remains; the fast path stays off for that case but the
// single-block layout still improves cache behaviour for the bulk memcpy.)
// ---------------------------------------------------------------------------
namespace {
void anscore_contiguous_free(void* /*opaque*/, uint8_t* data) {
av_free(data);
}
}
// Legacy global HW-session accounting. Superseded by HWDecoderPool when it is
// configured (see hwDecoderInit/uninit), kept for backward compatibility.
uint32 g_hw_decoder_nums = 0; // currently active hardware decode sessions
uint32 g_hw_decoder_max = 4; // Hardware decoding resources are limited, Limit up to 4 hardware decoding sessions
// NOTE(review): dynamic initialization at static-init time — runs before
// main(); assumes sys_os_create_mutex is safe that early. Confirm no other
// translation unit touches this mutex during its own static initialization.
void* g_hw_decoder_mutex = sys_os_create_mutex();
// Custom AVCodecContext::get_buffer2 callback: allocates Y, U and V planes of
// an 8-bit planar 4:2:0 frame in ONE contiguous av_malloc block (layout
// described in the comment block above). Installed only for the software
// decode path in init(). Returns 0 on success or a negative AVERROR code;
// any unsupported case is delegated to avcodec_default_get_buffer2.
int CVideoDecoder::contiguousGetBuffer2(AVCodecContext* s, AVFrame* frame, int flags) {
    // Never touch HW surfaces — those are owned by the hwframe pool.
    if (s->hw_frames_ctx) {
        return avcodec_default_get_buffer2(s, frame, flags);
    }
    // Only pack planar 8-bit 4:2:0. Everything else (NV12 from unpackers, 10-bit
    // YUV, 4:2:2, 4:4:4, RGB, paletted, …) goes through the stock allocator.
    if (frame->format != AV_PIX_FMT_YUV420P && frame->format != AV_PIX_FMT_YUVJ420P) {
        return avcodec_default_get_buffer2(s, frame, flags);
    }
    // Dimensions not yet known (e.g. before the first SPS) — let FFmpeg handle it.
    if (frame->width <= 0 || frame->height <= 0) {
        return avcodec_default_get_buffer2(s, frame, flags);
    }
    // Ask the codec for the minimum aligned dimensions it needs. For HEVC
    // this typically rounds up to a multiple of 64 (the CTU size); for H.264
    // to a multiple of 16. stride_align[i] is the per-plane linesize alignment.
    int aligned_w = frame->width;
    int aligned_h = frame->height;
    int stride_align[AV_NUM_DATA_POINTERS] = {0};
    avcodec_align_dimensions2(s, &aligned_w, &aligned_h, stride_align);
    // Round up to the strictest stride_align across all planes (simpler and
    // safe — FFmpeg only asks for alignment, not exact equality).
    int max_align = 32;
    for (int i = 0; i < AV_NUM_DATA_POINTERS; ++i) {
        if (stride_align[i] > max_align) max_align = stride_align[i];
    }
    // NOTE: align_up assumes `a` is a power of two (stride_align values are).
    auto align_up = [](int v, int a) { return (v + a - 1) & ~(a - 1); };
    const int y_stride = align_up(aligned_w, max_align);
    const int uv_stride = align_up(aligned_w / 2, max_align / 2 > 0 ? max_align / 2 : 16);
    const int y_h = aligned_h;
    const int uv_h = (aligned_h + 1) / 2;
    const size_t y_sz = (size_t)y_stride * y_h;
    const size_t uv_sz = (size_t)uv_stride * uv_h;
    // AV_INPUT_BUFFER_PADDING_SIZE tail keeps optimized readers that overread
    // past the last plane safe.
    const size_t total = y_sz + 2 * uv_sz + AV_INPUT_BUFFER_PADDING_SIZE;
    uint8_t* buf = (uint8_t*)av_mallocz(total);
    if (!buf) {
        return AVERROR(ENOMEM);
    }
    // Single refcounted buffer owns the whole block; freed via
    // anscore_contiguous_free when the last frame reference drops.
    AVBufferRef* ref = av_buffer_create(buf, (int)total,
        anscore_contiguous_free, nullptr, 0);
    if (!ref) {
        av_free(buf);
        return AVERROR(ENOMEM);
    }
    // Clear all pointer/linesize slots before wiring up the three planes.
    for (int i = 0; i < AV_NUM_DATA_POINTERS; ++i) {
        frame->buf[i] = nullptr;
        frame->data[i] = nullptr;
        frame->linesize[i] = 0;
    }
    // buf[0] carries the only ownership reference; data[1]/data[2] are
    // interior pointers into the same block (U follows Y, V follows U).
    frame->buf[0] = ref;
    frame->data[0] = buf;
    frame->data[1] = buf + y_sz;
    frame->data[2] = buf + y_sz + uv_sz;
    frame->linesize[0] = y_stride;
    frame->linesize[1] = uv_stride;
    frame->linesize[2] = uv_stride;
    frame->extended_data = frame->data;
    return 0;
}
// ---------------------------------------------------------------------------
// HWDecoderPool implementation
// ---------------------------------------------------------------------------
// Meyers singleton: constructed on first use, thread-safe since C++11.
HWDecoderPool& HWDecoderPool::instance() {
    static HWDecoderPool s_pool;
    return s_pool;
}
// Convenience overload: apply one uniform session cap to every GPU.
void HWDecoderPool::configure(int numGpus, int maxPerGpu) {
    configure(std::vector<int>(numGpus, maxPerGpu));
}
// Install per-GPU session caps and reset active counts to zero.
// Also mirrors the aggregate cap into the legacy g_hw_decoder_max global for
// code that still reads it.
void HWDecoderPool::configure(const std::vector<int>& maxPerGpuList) {
    std::lock_guard<std::mutex> lock(m_mutex);
    m_maxPerGpu = maxPerGpuList;
    m_activePerGpu.assign(maxPerGpuList.size(), 0);
    m_configured = true;
    int total = 0;
    for (int cap : m_maxPerGpu) {
        total += cap;
    }
    g_hw_decoder_max = static_cast<uint32>(total);
    const int gpuCount = static_cast<int>(m_maxPerGpu.size());
    for (int gpu = 0; gpu < gpuCount; ++gpu) {
        fprintf(stderr, "[HWDecode] HWDecoderPool: GPU[%d] max=%d sessions\n", gpu, m_maxPerGpu[gpu]);
    }
    fprintf(stderr, "[HWDecode] HWDecoderPool: configured %d GPU(s), %d total sessions\n",
            gpuCount, total);
}
// True once configure() has been called at least once.
// NOTE(review): m_configured is read without m_mutex — a caller racing with
// configure() may observe a stale value. Benign if configuration happens once
// at startup before decoders are created; confirm that assumption.
bool HWDecoderPool::isConfigured() const {
    return m_configured;
}
// Reserve one decode session. Tries the caller's preferred GPU first (keeps
// decode and inference on the same device for NV12 zero-copy and avoids
// cross-GPU pointer access → sticky "illegal memory access" CUDA errors),
// then falls back to the least-loaded GPU with spare capacity.
// Returns the GPU index on success, -1 when unconfigured or fully saturated.
int HWDecoderPool::acquireSlot(int preferredGpu) {
    std::lock_guard<std::mutex> lock(m_mutex);
    if (!m_configured || m_activePerGpu.empty()) return -1;
    const int gpuCount = static_cast<int>(m_activePerGpu.size());
    const bool preferenceValid = (preferredGpu >= 0 && preferredGpu < gpuCount);
    if (preferenceValid) {
        if (m_activePerGpu[preferredGpu] < m_maxPerGpu[preferredGpu]) {
            ++m_activePerGpu[preferredGpu];
            fprintf(stderr, "[HWDecode] HWDecoderPool: acquired slot on PREFERRED GPU[%d] (%d/%d)\n",
                    preferredGpu, m_activePerGpu[preferredGpu], m_maxPerGpu[preferredGpu]);
            return preferredGpu;
        }
        fprintf(stderr, "[HWDecode] HWDecoderPool: preferred GPU[%d] at capacity (%d/%d), falling back to least-loaded\n",
                preferredGpu, m_activePerGpu[preferredGpu], m_maxPerGpu[preferredGpu]);
    }
    // Least-loaded GPU that still has a free slot (first index wins ties).
    int chosen = -1;
    for (int gpu = 0; gpu < gpuCount; ++gpu) {
        if (m_activePerGpu[gpu] >= m_maxPerGpu[gpu]) continue;
        if (chosen < 0 || m_activePerGpu[gpu] < m_activePerGpu[chosen]) {
            chosen = gpu;
        }
    }
    if (chosen >= 0) {
        ++m_activePerGpu[chosen];
        fprintf(stderr, "[HWDecode] HWDecoderPool: acquired slot on GPU[%d] (%d/%d)\n",
                chosen, m_activePerGpu[chosen], m_maxPerGpu[chosen]);
    }
    return chosen;
}
// Return one previously acquired session on gpuIndex.
// Out-of-range indices and already-zero counters are silently ignored.
void HWDecoderPool::releaseSlot(int gpuIndex) {
    std::lock_guard<std::mutex> lock(m_mutex);
    if (!m_configured) return;
    const bool validIndex = (gpuIndex >= 0 && gpuIndex < static_cast<int>(m_activePerGpu.size()));
    if (!validIndex || m_activePerGpu[gpuIndex] <= 0) return;
    --m_activePerGpu[gpuIndex];
    fprintf(stderr, "[HWDecode] HWDecoderPool: released slot on GPU[%d] (%d/%d)\n",
            gpuIndex, m_activePerGpu[gpuIndex], m_maxPerGpu[gpuIndex]);
}
// Sum of the per-GPU session caps.
// NOTE(review): reads m_maxPerGpu without m_mutex — safe after configure()
// completes, racy while it runs.
int HWDecoderPool::getTotalMax() const {
    int sum = 0;
    for (size_t i = 0; i < m_maxPerGpu.size(); ++i) {
        sum += m_maxPerGpu[i];
    }
    return sum;
}
// Sum of currently active sessions across all GPUs.
// NOTE(review): unsynchronized read, same caveat as getTotalMax().
int HWDecoderPool::getTotalActive() const {
    int sum = 0;
    for (size_t i = 0; i < m_activePerGpu.size(); ++i) {
        sum += m_activePerGpu[i];
    }
    return sum;
}
// ---------------------------------------------------------------------------
// SharedHWDeviceCtx implementation
// ---------------------------------------------------------------------------
// Meyers singleton: one process-wide cache of HW device contexts.
SharedHWDeviceCtx& SharedHWDeviceCtx::instance() {
    static SharedHWDeviceCtx s_instance;
    return s_instance;
}
SharedHWDeviceCtx::~SharedHWDeviceCtx() {
    // Intentionally empty — do NOT release GPU/D3D11 resources here.
    // This destructor runs during DLL_PROCESS_DETACH while the OS loader
    // lock is held. Releasing D3D11/NVIDIA resources requires driver
    // worker threads that also need the loader lock → deadlock.
    // The OS reclaims all GPU resources when the process exits.
    // (Explicit cleanup, if ever needed before exit, goes through releaseAll().)
}
// Hand out a reference to the shared AVHWDeviceContext for gpuIndex, creating
// it on first use. Sharing one device context across decode sessions keeps
// per-session GPU memory overhead down (each CUDA context costs ~50-100MB;
// see NVDEC Programming Guide, "Multi-session decoding").
// Returns a new AVBufferRef the caller owns, or nullptr on creation failure.
AVBufferRef* SharedHWDeviceCtx::acquire(int gpuIndex, AVHWDeviceType type) {
    std::lock_guard<std::mutex> lock(m_mutex);
    if (gpuIndex < 0) gpuIndex = 0;
    // Make sure the cache covers this GPU index.
    if (static_cast<int>(m_cache.size()) <= gpuIndex) {
        m_cache.resize(gpuIndex + 1);
    }
    GpuCtx& slot = m_cache[gpuIndex];
    // Cache hit: same GPU, same device type → hand out another reference.
    if (slot.ctx != nullptr && slot.type == type) {
        fprintf(stderr, "[HWDecode] SharedHWDeviceCtx: reusing shared context for GPU[%d]\n", gpuIndex);
        return av_buffer_ref(slot.ctx);
    }
    // Device type changed for this GPU — drop the stale context first.
    if (slot.ctx != nullptr) {
        av_buffer_unref(&slot.ctx);
        slot.ctx = nullptr;
    }
    // The device string selects the adapter/GPU by ordinal.
    char adapterStr[16] = {};
    snprintf(adapterStr, sizeof(adapterStr), "%d", gpuIndex);
    const int rc = av_hwdevice_ctx_create(&slot.ctx, type, adapterStr, nullptr, 0);
    if (rc < 0) {
        char error_buf[AV_ERROR_MAX_STRING_SIZE];
        av_strerror(rc, error_buf, sizeof(error_buf));
        fprintf(stderr, "[HWDecode] SharedHWDeviceCtx: FAILED to create context for GPU[%d]: %s\n",
                gpuIndex, error_buf);
        slot.ctx = nullptr;
        return nullptr;
    }
    slot.type = type;
    fprintf(stderr, "[HWDecode] SharedHWDeviceCtx: created shared context for GPU[%d] type=%s\n",
            gpuIndex, av_hwdevice_get_type_name(type));
    // Return a new reference (caller owns it).
    return av_buffer_ref(slot.ctx);
}
// Drop every cached device-context reference and empty the cache.
// Callers must not invoke this during process teardown (see destructor note).
void SharedHWDeviceCtx::releaseAll() {
    std::lock_guard<std::mutex> lock(m_mutex);
    for (size_t i = 0; i < m_cache.size(); ++i) {
        if (m_cache[i].ctx != nullptr) {
            av_buffer_unref(&m_cache[i].ctx);
            m_cache[i].ctx = nullptr;
        }
    }
    m_cache.clear();
}
// C-style trampoline for AVCodecContext::get_format: ctx->opaque carries the
// CVideoDecoder instance (set in hwDecoderInit); delegate to the member
// implementation and return its chosen format.
enum AVPixelFormat getHWFormat(AVCodecContext* ctx, const enum AVPixelFormat* pix_fmts)
{
    AVPixelFormat chosen = AV_PIX_FMT_NONE;
    CVideoDecoder* decoder = (CVideoDecoder*)ctx->opaque;
    decoder->getHWFormat(ctx, pix_fmts, &chosen);
    return chosen;
}
// Pick full-range YUVJ420P from the decoder's candidate list when offered;
// otherwise keep the context's current pixel format.
enum AVPixelFormat get_hw_format(AVCodecContext* ctx, const enum AVPixelFormat* pix_fmts)
{
    const enum AVPixelFormat* candidate = pix_fmts;
    while (*candidate != -1) {
        if (AV_PIX_FMT_YUVJ420P == *candidate) {
            return AV_PIX_FMT_YUVJ420P;
        }
        ++candidate;
    }
    // If YUVJ420P is not available, fall back to default
    return ctx->pix_fmt;
}
// Construct an idle decoder: all flags cleared, all FFmpeg objects null.
// init() allocates and opens the actual codec.
CVideoDecoder::CVideoDecoder()
{
    // Flags / scalar state
    m_bInited = FALSE;
    m_bRunning = FALSE;
    m_bHardwareDecoderEnabled = FALSE;
    m_bCudaHWAccel = false;
    m_hwGpuIndex = -1;
    m_hwPixFmt = AV_PIX_FMT_NONE;
    // FFmpeg objects — owned, created in init()/hwDecoderInit()
    m_pCodec = nullptr;
    m_pContext = nullptr;
    m_pFrame = nullptr;
    m_pSoftFrame = nullptr;
    m_pCudaHWFrame = nullptr;
    m_pHWDeviceCtx = nullptr;
    // Client callback wiring
    m_pCallback = nullptr;
    m_pUserdata = nullptr;
}
// Ownership transfer: hand the held CUDA HW frame (may be null) to the caller
// and forget it, so the decoder will not free it later.
AVFrame* CVideoDecoder::takeCudaHWFrame() {
    std::lock_guard<std::recursive_mutex> lock(_mutex);
    AVFrame* handedOff = m_pCudaHWFrame;
    m_pCudaHWFrame = nullptr;
    return handedOff;
}
// Caller MUST already hold _mutex (called from decode thread's callback chain).
// Returns a clone so the original m_pCudaHWFrame stays valid for the decode
// loop; null when no HW frame is held.
AVFrame* CVideoDecoder::cloneCudaHWFrame_unlocked() {
    if (m_pCudaHWFrame == nullptr) {
        return nullptr;
    }
    return av_frame_clone(m_pCudaHWFrame);
}
CVideoDecoder::~CVideoDecoder()
{
    // Full teardown. Safe on a never-initialized decoder: uninit() checks
    // every pointer before freeing it.
    uninit();
}
// Tear down all decoder state: drain the codec, free frames, close and free
// the codec context, release the HW device reference and its pool/counter
// reservation. Idempotent; ordering below is deliberate (frames before
// context, context before HW device ref).
void CVideoDecoder::uninit()
{
    std::lock_guard<std::recursive_mutex> lock(_mutex);
    // [MEDIA_DecClose] heartbeat — paired with [MEDIA_DecInit] for leak diagnosis.
    // Pair count over a long run reveals whether avcodec_open2 calls are
    // matched by full teardowns. If close-count < init-count, the FFmpeg
    // codec context (and its custom get_buffer2 arena) is leaking per reopen.
    {
        static std::atomic<uint64_t> s_closeCount{0};
        const uint64_t n = s_closeCount.fetch_add(1) + 1;
        ANS_DBG("MEDIA_DecClose",
            "uninit ENTRY #%llu inited=%d codec=%s %dx%d hwEnabled=%d cudaHW=%d gpu=%d (this=%p)",
            (unsigned long long)n,
            (int)m_bInited,
            (m_pCodec && m_pCodec->name) ? m_pCodec->name : "?",
            m_pContext ? m_pContext->width : 0,
            m_pContext ? m_pContext->height : 0,
            (int)m_bHardwareDecoderEnabled,
            (int)m_bCudaHWAccel,
            m_hwGpuIndex,
            (void*)this);
    }
    // Stop processing first
    // Backup first — flush() below checks m_bRunning indirectly via decode paths;
    // the original running state is restored at the end.
    BOOL wasRunning = m_bRunning;
    m_bRunning = FALSE;
    flush();
    // FIXED: Clean up frames before context to avoid use-after-free
    if (m_pFrame)
    {
        av_frame_free(&m_pFrame);
        m_pFrame = NULL;
    }
    if (m_pSoftFrame)
    {
        av_frame_free(&m_pSoftFrame);
        m_pSoftFrame = NULL;
    }
    if (m_pCudaHWFrame)
    {
        av_frame_free(&m_pCudaHWFrame);
        m_pCudaHWFrame = NULL;
    }
    if (m_pContext)
    {
        // FIXED: Free extradata before freeing context
        // (extradata is set to NULL afterwards so avcodec_free_context does
        // not free it a second time).
        if (m_pContext->extradata) {
            av_free(m_pContext->extradata);
            m_pContext->extradata = NULL;
            m_pContext->extradata_size = 0;
        }
        // FIXED: Properly release hardware context reference
        // (this is the per-context ref created in hwDecoderInit; the shared
        // m_pHWDeviceCtx ref is released separately below).
        if (m_pContext->hw_device_ctx) {
            av_buffer_unref(&m_pContext->hw_device_ctx);
            m_pContext->hw_device_ctx = NULL;
        }
        // FIXED: Close codec before freeing context
        avcodec_close(m_pContext);
        avcodec_free_context(&m_pContext);
        m_pContext = NULL;
    }
    // Only decrement hardware decoder count if it was actually enabled
    if (m_pHWDeviceCtx && m_bHardwareDecoderEnabled)
    {
        av_buffer_unref(&m_pHWDeviceCtx);
        m_pHWDeviceCtx = NULL;
        // Release via per-GPU pool or legacy global counter
        HWDecoderPool& pool = HWDecoderPool::instance();
        if (pool.isConfigured() && m_hwGpuIndex >= 0) {
            pool.releaseSlot(m_hwGpuIndex);
        } else {
            CLock hw_lock(g_hw_decoder_mutex);
            if (g_hw_decoder_nums > 0) {
                g_hw_decoder_nums--;
            }
        }
        m_hwGpuIndex = -1;
        m_bHardwareDecoderEnabled = FALSE;
    }
    // Restore running state if needed
    m_bRunning = wasRunning;
    m_pCodec = NULL;
    m_bInited = FALSE;
}
// Allocate and open an FFmpeg decoder.
//   codec           - FFmpeg codec id (H.264/HEVC/MJPEG/MPEG-4 get size probing)
//   extradata/_size - optional codec private data; parsed for width/height and
//                     copied (padded) into the codec context
//   hwMode          - HW_DECODING_* selector; HW_DECODING_DISABLE forces SW
//   preferredGpu    - GPU hint forwarded to hwDecoderInit (pool path only)
// Returns TRUE on success. Every failure path calls uninit() so partial state
// never leaks. Note: this overload does NOT set m_bRunning; the int-codec
// overload below does that on success.
BOOL CVideoDecoder::init(enum AVCodecID codec, uint8* extradata, int extradata_size, int hwMode, int preferredGpu)
{
    std::lock_guard<std::recursive_mutex> lock(_mutex);
    // Clean up any existing state
    if (m_bInited) {
        uninit();
    }
    int width = 0;
    int height = 0;
    // Probe the coded frame size from extradata so the context has dimensions
    // before avcodec_open2 (useful for HW decoder surface sizing).
    if (extradata && extradata_size > 0)
    {
        int vcodec = VIDEO_CODEC_NONE;
        if (AV_CODEC_ID_H264 == codec)
        {
            vcodec = VIDEO_CODEC_H264;
        }
        else if (AV_CODEC_ID_HEVC == codec)
        {
            vcodec = VIDEO_CODEC_H265;
        }
        else if (AV_CODEC_ID_MJPEG == codec)
        {
            vcodec = VIDEO_CODEC_JPEG;
        }
        else if (AV_CODEC_ID_MPEG4 == codec)
        {
            vcodec = VIDEO_CODEC_MP4;
        }
        avc_parse_video_size(vcodec, extradata, extradata_size, &width, &height);
    }
#ifdef ANDROID
    // Android: prefer MediaCodec HW decoders for streams at or above QVGA;
    // fall back to the generic SW decoder when unavailable.
    if (HW_DECODING_DISABLE != hwMode && width * height >= 320 * 240)
    {
        if (AV_CODEC_ID_H264 == codec)
        {
            m_pCodec = avcodec_find_decoder_by_name("h264_mediacodec");
        }
        else if (AV_CODEC_ID_HEVC == codec)
        {
            m_pCodec = avcodec_find_decoder_by_name("hevc_mediacodec");
        }
        else if (AV_CODEC_ID_MPEG4 == codec)
        {
            m_pCodec = avcodec_find_decoder_by_name("mpeg4_mediacodec");
        }
    }
    if (NULL == m_pCodec)
    {
        m_pCodec = avcodec_find_decoder(codec);
    }
#else
    m_pCodec = avcodec_find_decoder(codec);
#endif
    if (NULL == m_pCodec)
    {
        log_print(HT_LOG_ERR, "%s, m_pCodec is NULL for codec %d\r\n", __FUNCTION__, codec);
        return FALSE;
    }
    m_pContext = avcodec_alloc_context3(m_pCodec);
    if (NULL == m_pContext)
    {
        log_print(HT_LOG_ERR, "%s, avcodec_alloc_context3 failed\r\n", __FUNCTION__);
        return FALSE;
    }
    m_pContext->width = width;
    m_pContext->height = height;
    // Low-latency live-stream tuning: no output delay, tolerate and emit
    // corrupt frames rather than stalling.
    m_pContext->flags |= AV_CODEC_FLAG_LOW_DELAY;
    m_pContext->flags2 |= AV_CODEC_FLAG2_FAST;
    m_pContext->flags |= AV_CODEC_FLAG_OUTPUT_CORRUPT;
    m_pContext->err_recognition = AV_EF_IGNORE_ERR;
    av_opt_set_int(m_pContext, "refcounted_frames", 1, 0);
    // Initialize hardware decoder
    if (HW_DECODING_DISABLE != hwMode) {
        int hw_ret = hwDecoderInit(m_pContext, hwMode, preferredGpu);
        if (hw_ret < 0) {
            log_print(HT_LOG_WARN, "%s, hwDecoderInit failed with error %d, falling back to software decoding\r\n", __FUNCTION__, hw_ret);
        }
    }
    // Handle extradata
    if (extradata && extradata_size > 0)
    {
        // Padded allocation per FFmpeg's extradata contract.
        int size = extradata_size + AV_INPUT_BUFFER_PADDING_SIZE;
        m_pContext->extradata = (uint8*)av_mallocz(size);
        if (m_pContext->extradata)
        {
            m_pContext->extradata_size = extradata_size;
            memcpy(m_pContext->extradata, extradata, extradata_size);
        }
        else
        {
            log_print(HT_LOG_ERR, "%s, Failed to allocate extradata\r\n", __FUNCTION__);
            uninit(); // FIXED: Clean up on failure
            return FALSE;
        }
    }
    // Configure multi-threading for the SOFTWARE decoder.
    // Hardware decoders (NVDEC, DXVA2/D3D11VA, QSV, VideoToolbox) do their
    // own parallelism inside the GPU/fixed-function block and ignore these
    // fields — so we only enable threading when HW init was skipped (hwMode
    // == HW_DECODING_DISABLE) or failed (fell back to SW).
    //
    // Without this, libavcodec's HEVC/H.264 decoder runs on a single core,
    // which on 4K HEVC streams is ~80–120 ms per frame. Frame + slice
    // threading on a 24-thread CPU typically brings that down to 10–20 ms.
    // thread_count = 0 lets FFmpeg auto-pick (capped internally ~16).
    if (!m_bHardwareDecoderEnabled) {
        m_pContext->thread_count = 0;
        m_pContext->thread_type = FF_THREAD_FRAME | FF_THREAD_SLICE;
        // Install contiguous Y+U+V allocator. This packs all three planes
        // into a single av_malloc block so the BGR-conversion fast path
        // (avframeYUV420PToCvMat) can either wrap the frame zero-copy, or
        // at minimum hit a tight 3-call bulk memcpy with good cache locality
        // instead of per-row copies into a freshly allocated staging Mat.
        // HW decoders must NEVER have get_buffer2 overridden — they use
        // hw_frames_ctx for surface management.
        m_pContext->get_buffer2 = &CVideoDecoder::contiguousGetBuffer2;
    }
    // FIXED: Use avcodec_open2 instead of avcodec_thread_open
    if (avcodec_open2(m_pContext, m_pCodec, NULL) < 0)
    {
        log_print(HT_LOG_ERR, "%s, avcodec_open2 failed\r\n", __FUNCTION__);
        uninit(); // FIXED: Clean up on failure
        return FALSE;
    }
    // Debug: one-shot visibility into which decoder actually got opened.
    // m_bHardwareDecoderEnabled is set by hwDecoderInit() on success; when
    // hwMode == HW_DECODING_DISABLE or hwDecoderInit failed, it stays FALSE
    // and the SW decoder (avcodec_find_decoder) is used.
    // active_thread_type is what FFmpeg actually negotiated after open2
    // (bit 1 = FF_THREAD_FRAME, bit 2 = FF_THREAD_SLICE).
    ANS_DBG("MEDIA_DecInit",
        "avcodec_open2 OK codec=%s(%s) %dx%d hwMode=%d hwEnabled=%d cudaHW=%d gpu=%d "
        "threads=%d thread_type_req=0x%x active=0x%x -> %s decoder",
        m_pCodec->name ? m_pCodec->name : "?",
        m_pCodec->long_name ? m_pCodec->long_name : "?",
        m_pContext->width, m_pContext->height,
        hwMode,
        (int)m_bHardwareDecoderEnabled,
        (int)m_bCudaHWAccel,
        m_hwGpuIndex,
        m_pContext->thread_count,
        m_pContext->thread_type,
        m_pContext->active_thread_type,
        m_bHardwareDecoderEnabled ? "HARDWARE" : "SOFTWARE");
    // Working frames: m_pFrame receives decoder output; m_pSoftFrame is the
    // system-memory destination for HW-frame transfers (see readFrame()).
    m_pFrame = av_frame_alloc();
    if (NULL == m_pFrame)
    {
        log_print(HT_LOG_ERR, "%s, av_frame_alloc failed for m_pFrame\r\n", __FUNCTION__);
        uninit(); // FIXED: Clean up on failure
        return FALSE;
    }
    m_pSoftFrame = av_frame_alloc();
    if (NULL == m_pSoftFrame)
    {
        log_print(HT_LOG_ERR, "%s, av_frame_alloc failed for m_pSoftFrame\r\n", __FUNCTION__);
        uninit(); // FIXED: Clean up on failure
        return FALSE;
    }
    m_bInited = TRUE;
    //m_bRunning = TRUE;
    return TRUE;
}
// Convenience overload: map the project codec id to an FFmpeg id, delegate,
// and start the decoder only when initialization succeeded.
BOOL CVideoDecoder::init(int codec, uint8* extradata, int extradata_size, int hwMode, int preferredGpu)
{
    const BOOL ok = init(to_video_avcodecid(codec), extradata, extradata_size, hwMode, preferredGpu);
    if (ok) {
        m_bRunning = TRUE; // Set running only if initialization succeeded
    }
    return ok;
}
// Set up hardware-accelerated decoding on `ctx`.
//   hwMode       - HW_DECODING_* selector (HW_DECODING_DISABLE → no-op, 0)
//   preferredGpu - GPU hint for the per-GPU pool path
// On success returns 0 with m_bHardwareDecoderEnabled set and a session slot
// reserved (pool or legacy counter). On any failure returns -1 with the
// reservation released, so callers can fall back to software decoding.
int CVideoDecoder::hwDecoderInit(AVCodecContext* ctx, int hwMode, int preferredGpu) {
    std::string hwtype;
    enum AVHWDeviceType type = AV_HWDEVICE_TYPE_NONE;
    if (hwMode == HW_DECODING_DISABLE) {
        return 0; // Hardware decoding is disabled
    }
    // -- Per-GPU pool path (preferred) or legacy global path ----------------
    int assignedGpu = -1;
    bool legacyReserved = false;
    HWDecoderPool& pool = HWDecoderPool::instance();
    if (pool.isConfigured()) {
        // Per-GPU: acquire a slot, preferring the caller's requested GPU
        // (e.g. inference GPU for NV12 zero-copy alignment)
        assignedGpu = pool.acquireSlot(preferredGpu);
        if (assignedGpu < 0) {
            log_print(HT_LOG_WARN, "%s, All GPU HW decoder slots full (%d/%d total)\r\n",
                __FUNCTION__, pool.getTotalActive(), pool.getTotalMax());
            return -1;
        }
    } else {
        // Legacy: single global counter.
        // FIXED: reserve the slot (increment) under the SAME lock acquisition
        // as the capacity check. Previously the check ran here but the
        // increment happened at the end of this function under a second lock,
        // so two threads could both pass the check and exceed g_hw_decoder_max.
        CLock lock(g_hw_decoder_mutex);
        if (g_hw_decoder_max > 0 && g_hw_decoder_nums >= g_hw_decoder_max) {
            log_print(HT_LOG_WARN, "%s, Maximum number of hardware decoders reached (%d/%d)\r\n",
                __FUNCTION__, g_hw_decoder_nums, g_hw_decoder_max);
            return -1;
        }
        g_hw_decoder_nums++;
        legacyReserved = true;
    }
    // Undo whichever reservation was made above; called on every failure path.
    auto releaseReservation = [&]() {
        if (assignedGpu >= 0) {
            pool.releaseSlot(assignedGpu);
        } else if (legacyReserved) {
            CLock lock(g_hw_decoder_mutex);
            if (g_hw_decoder_nums > 0) {
                g_hw_decoder_nums--;
            }
        }
    };
    // Determine the hardware type based on platform and hardware mode
    if (!getHardwareTypeForPlatform(hwMode, hwtype)) {
        log_print(HT_LOG_WARN, "%s, Unsupported hardware mode %d for the current platform\r\n", __FUNCTION__, hwMode);
        releaseReservation();
        return -1;
    }
    // Find the hardware device type by name
    type = av_hwdevice_find_type_by_name(hwtype.c_str());
    if (type == AV_HWDEVICE_TYPE_NONE) {
        log_print(HT_LOG_WARN, "%s, Hardware device type %s is not supported\r\n", __FUNCTION__, hwtype.c_str());
        logSupportedHwTypes();
        releaseReservation();
        return -1;
    }
    // Find a hardware configuration that supports the specified device type
    // (also records the HW pixel format in m_hwPixFmt for getHWFormat).
    if (!findHwConfigForDeviceType(type)) {
        log_print(HT_LOG_WARN, "%s, Decoder %s does not support the specified hardware device type %s\r\n",
            __FUNCTION__, m_pCodec->long_name, av_hwdevice_get_type_name(type));
        releaseReservation();
        return -1;
    }
    // Get or create a shared HW device context for this GPU.
    // NVIDIA recommends sharing CUDA contexts across decode sessions to minimize
    // GPU memory overhead (each CUDA context costs ~50-100MB).
    // See: NVDEC Video Decoder API Programming Guide, Section "Multi-session decoding"
    int gpuIdx = (assignedGpu >= 0) ? assignedGpu : 0;
    m_pHWDeviceCtx = SharedHWDeviceCtx::instance().acquire(gpuIdx, type);
    if (!m_pHWDeviceCtx) {
        log_print(HT_LOG_ERR, "%s, Failed to acquire shared HW device context, type=%s, gpu=%d\r\n",
            __FUNCTION__, av_hwdevice_get_type_name(type), gpuIdx);
        releaseReservation();
        return -1;
    }
    // Configure the codec context to use the shared hardware device.
    // opaque carries `this` so the ::getHWFormat trampoline can reach us.
    ctx->opaque = this;
    ctx->get_format = ::getHWFormat;
    ctx->hw_device_ctx = av_buffer_ref(m_pHWDeviceCtx);
    ctx->err_recognition = AV_EF_IGNORE_ERR;
    ctx->flags2 |= AV_CODEC_FLAG2_EXPORT_MVS;
    // Reserve extra NVDEC surfaces for application-held av_frame_clone() references.
    // The clone chain (decoder → player → registry) holds ~2 surfaces simultaneously
    // (decoder's clone + registry's clone; getCudaHWFrame uses ownership transfer).
    // Without this, the default pool (num_decode_surfaces + 2) can run out under
    // load with many concurrent streams, causing the decoder to stall.
    ctx->extra_hw_frames = 2;
    // Track which GPU this decoder is on
    m_hwGpuIndex = assignedGpu;
    m_bHardwareDecoderEnabled = TRUE;
    m_bCudaHWAccel = (type == AV_HWDEVICE_TYPE_CUDA);
    // NOTE: the legacy counter was already incremented at reservation time;
    // uninit() decrements it (or releases the pool slot) on teardown.
    log_print(HT_LOG_INFO, "%s, Successfully initialized hardware decoder %s on GPU[%d] (%d/%d)\r\n",
        __FUNCTION__, av_hwdevice_get_type_name(type),
        gpuIdx,
        pool.isConfigured() ? pool.getTotalActive() : g_hw_decoder_nums,
        pool.isConfigured() ? pool.getTotalMax() : g_hw_decoder_max);
    return 0;
}
// Resume decoding: discard any stale buffered codec state, then raise the
// running flag so decode()/render() accept work again.
void CVideoDecoder::Start() {
    std::lock_guard<std::recursive_mutex> lock(_mutex);
    if (m_pContext != nullptr) {
        avcodec_flush_buffers(m_pContext);
    }
    m_bRunning = TRUE;
    log_print(HT_LOG_INFO, "%s, Video decoder started\r\n", __FUNCTION__);
}
// Signal the decode loop to stop. Deliberately lock-free: see below.
void CVideoDecoder::Stop() {
    // Atomically signal the decoder to stop WITHOUT acquiring _mutex.
    // decode() holds _mutex while inside avcodec_send_packet / CUDA calls
    // that can block on the nvcuda64 SRW lock for a long time.
    // If we waited for _mutex here, Stop() would deadlock whenever a
    // concurrent decode() is stuck waiting for a CUDA operation held by
    // an inference thread.
    // (m_bRunning is an atomic — the .store below relies on that.)
    m_bRunning.store(FALSE, std::memory_order_release);
    log_print(HT_LOG_INFO, "%s, Video decoder stopped\r\n", __FUNCTION__);
}
// Log all supported hardware types
void CVideoDecoder::logSupportedHwTypes() {
enum AVHWDeviceType type = AV_HWDEVICE_TYPE_NONE;
log_print(HT_LOG_INFO, "%s, Available hardware device types:\r\n", __FUNCTION__);
while ((type = av_hwdevice_iterate_types(type)) != AV_HWDEVICE_TYPE_NONE) {
log_print(HT_LOG_INFO, "%s, - %s\r\n", __FUNCTION__, av_hwdevice_get_type_name(type));
}
}
// Platform-specific function to determine the hardware type based on the mode
bool CVideoDecoder::getHardwareTypeForPlatform(int hwMode, std::string& hwtype) {
#if __WINDOWS_OS__
switch (hwMode) {
case HW_DECODING_D3D11: hwtype = "d3d11va"; break;
case HW_DECODING_DXVA: hwtype = "dxva2"; break;
case HW_DECODING_CUDA: hwtype = "cuda"; break;
case HW_DECODING_AUTO:
hwtype = "cuda";
if (av_hwdevice_find_type_by_name(hwtype.c_str()) == AV_HWDEVICE_TYPE_NONE) {
hwtype = "d3d11va";
if (av_hwdevice_find_type_by_name(hwtype.c_str()) == AV_HWDEVICE_TYPE_NONE) {
hwtype = "dxva2";
}
}
break;
default: return false;
}
#elif defined(IOS)
switch (hwMode) {
case HW_DECODING_VIDEOTOOLBOX: hwtype = "videotoolbox"; break;
case HW_DECODING_OPENCL: hwtype = "opencl"; break;
case HW_DECODING_AUTO:
hwtype = "videotoolbox";
if (av_hwdevice_find_type_by_name(hwtype.c_str()) == AV_HWDEVICE_TYPE_NONE) {
hwtype = "opencl";
}
break;
default: return false;
}
#elif defined(ANDROID)
if (hwMode == HW_DECODING_MEDIACODEC || hwMode == HW_DECODING_AUTO) {
hwtype = "mediacodec";
}
else {
return false;
}
#elif __LINUX_OS__
switch (hwMode) {
case HW_DECODING_VAAPI: hwtype = "vaapi"; break;
case HW_DECODING_OPENCL: hwtype = "opencl"; break;
case HW_DECODING_AUTO:
hwtype = "vaapi";
if (av_hwdevice_find_type_by_name(hwtype.c_str()) == AV_HWDEVICE_TYPE_NONE) {
hwtype = "opencl";
}
break;
default: return false;
}
#else
return false; // Unsupported platform
#endif
return true;
}
// Find a hardware configuration that matches the specified device type
bool CVideoDecoder::findHwConfigForDeviceType(AVHWDeviceType type) {
for (int i = 0;; i++) {
const AVCodecHWConfig* config = avcodec_get_hw_config(m_pCodec, i);
if (!config) {
return false; // No matching hardware configuration found
}
if (config->methods & AV_CODEC_HW_CONFIG_METHOD_HW_DEVICE_CTX && config->device_type == type) {
m_hwPixFmt = config->pix_fmt;
return true;
}
}
}
// Choose a pixel format from the decoder's candidate list, in priority order:
//   1. the HW surface format recorded by findHwConfigForDeviceType,
//   2. any full-range YUVJ planar format,
//   3. any limited-range YUV planar format,
//   4. the first offered format.
// Writes the choice to *dst and returns TRUE; FALSE only when the list is empty.
BOOL CVideoDecoder::getHWFormat(AVCodecContext* ctx, const AVPixelFormat* pix_fmts, AVPixelFormat* dst)
{
    *dst = AV_PIX_FMT_NONE;
    // Scan the terminator-delimited candidate list for the first format the
    // predicate accepts; store it in *dst on a hit.
    auto scan = [&](auto accept) -> BOOL {
        for (const AVPixelFormat* p = pix_fmts; *p != -1; ++p) {
            if (accept(*p)) {
                *dst = *p;
                return TRUE;
            }
        }
        return FALSE;
    };
    // First, attempt to use the hardware pixel format if available.
    if (scan([this](AVPixelFormat f) { return f == m_hwPixFmt; })) {
        return TRUE;
    }
    // If the hardware format is not offered, fall back to YUVJ planar.
    if (scan([](AVPixelFormat f) {
            return f == AV_PIX_FMT_YUVJ420P ||
                   f == AV_PIX_FMT_YUVJ422P ||
                   f == AV_PIX_FMT_YUVJ444P;
        })) {
        return TRUE;
    }
    // Then limited-range YUV planar.
    if (scan([](AVPixelFormat f) {
            return f == AV_PIX_FMT_YUV420P ||
                   f == AV_PIX_FMT_YUV422P ||
                   f == AV_PIX_FMT_YUV444P;
        })) {
        return TRUE;
    }
    // As a last resort, accept whatever the decoder lists first.
    if (*pix_fmts != -1) {
        *dst = *pix_fmts;
        return TRUE;
    }
    log_print(HT_LOG_ERR, "%s, Failed to get HW surface format\r\n", __FUNCTION__);
    return FALSE;
}
// Coded frame width, or 0 before init().
int CVideoDecoder::getWidth()
{
    std::lock_guard<std::recursive_mutex> lock(_mutex);
    return m_pContext ? m_pContext->width : 0;
}
// Coded frame height, or 0 before init().
int CVideoDecoder::getHeight()
{
    std::lock_guard<std::recursive_mutex> lock(_mutex);
    return m_pContext ? m_pContext->height : 0;
}
double CVideoDecoder::getFrameRate()
{
std::lock_guard<std::recursive_mutex> lock(_mutex);
if (m_pContext)
{
if (m_pContext->framerate.den > 0)
{
return (double)((double)(m_pContext->framerate.num) / m_pContext->framerate.den);
}
}
return 0;
}
// Feed one packet to the decoder and drain whatever frames it produces.
// When the codec's input queue is full (EAGAIN), drain via readFrame() and
// retry the send up to maxRetries times. Returns TRUE when the packet was
// accepted (or EOF) and the drain succeeded.
BOOL CVideoDecoder::decode(AVPacket* pkt)
{
    std::lock_guard<std::recursive_mutex> lock(_mutex);
    if (!m_bInited) {
        log_print(HT_LOG_ERR, "%s, Decoder not initialized\r\n", __FUNCTION__);
        return FALSE;
    }
    if (!m_bRunning) {
        log_print(HT_LOG_WARN, "%s, Decoder not running\r\n", __FUNCTION__);
        return FALSE;
    }
    if (!m_pContext) {
        log_print(HT_LOG_ERR, "%s, Context is NULL\r\n", __FUNCTION__);
        return FALSE;
    }
    const int maxRetries = 3;
    // Send, and on EAGAIN alternate drain/resend (same send→drain→send
    // sequence as a classic send/receive loop).
    int ret = avcodec_send_packet(m_pContext, pkt);
    for (int attempt = 0; ret == AVERROR(EAGAIN) && attempt < maxRetries; ++attempt) {
        if (!readFrame()) {
            log_print(HT_LOG_ERR, "%s, Failed to read frame during retry %d\r\n", __FUNCTION__, attempt);
            return FALSE;
        }
        Sleep(1); // Reduced sleep time
        ret = avcodec_send_packet(m_pContext, pkt);
    }
    // Any error other than EOF is fatal for this packet.
    if (ret < 0 && ret != AVERROR_EOF) {
        char error_buf[AV_ERROR_MAX_STRING_SIZE];
        av_strerror(ret, error_buf, sizeof(error_buf));
        log_print(HT_LOG_ERR, "%s, avcodec_send_packet failed: %s (ret=%d)\r\n", __FUNCTION__, error_buf, ret);
        return FALSE;
    }
    // If the packet was successfully sent, proceed to read frame
    return readFrame();
}
// Convenience wrapper: wrap a raw bitstream buffer in an AVPacket (with
// FFmpeg-managed, properly padded storage), stamp pts/dts, and decode it.
BOOL CVideoDecoder::decode(uint8* data, int len, int64_t pts)
{
    std::lock_guard<std::recursive_mutex> lock(_mutex);
    if (!m_bInited) {
        log_print(HT_LOG_ERR, "%s, Decoder not initialized\r\n", __FUNCTION__);
        return FALSE;
    }
    if (!m_bRunning) {
        log_print(HT_LOG_WARN, "%s, Decoder not running\r\n", __FUNCTION__);
        return FALSE;
    }
    if (!data || len <= 0) {
        log_print(HT_LOG_ERR, "%s, Invalid input data\r\n", __FUNCTION__);
        return FALSE;
    }
    AVPacket* packet = av_packet_alloc();
    if (!packet) {
        log_print(HT_LOG_ERR, "%s, Failed to allocate AVPacket\r\n", __FUNCTION__);
        return FALSE;
    }
    // av_new_packet allocates len bytes plus the required input padding.
    const int rc = av_new_packet(packet, len);
    if (rc < 0) {
        char error_buf[AV_ERROR_MAX_STRING_SIZE];
        av_strerror(rc, error_buf, sizeof(error_buf));
        log_print(HT_LOG_ERR, "%s, Failed to allocate packet data: %s\r\n", __FUNCTION__, error_buf);
        av_packet_free(&packet);
        return FALSE;
    }
    memcpy(packet->data, data, len);
    // No separate dts available from this API — reuse pts.
    packet->pts = pts;
    packet->dts = pts;
    const BOOL result = decode(packet);
    // av_packet_free releases both the packet and its data buffer.
    av_packet_free(&packet);
    return result;
}
// Drain every frame currently available from the codec, pushing each through
// render(). For HW frames: optionally clone the CUDA surface for zero-copy
// inference, then transfer to system memory for display.
// Returns TRUE when the drain ended cleanly (EAGAIN with at least one frame
// processed, or EOF); FALSE on a decode error or EAGAIN with zero frames.
// NOTE(review): callers (decode retry loop) treat the zero-frame EAGAIN
// FALSE as a failure — appears intentional; confirm.
BOOL CVideoDecoder::readFrame()
{
    int ret = 0;
    AVFrame* tmp_frame = NULL;
    BOOL frame_processed = FALSE;
    while (ret >= 0)
    {
        ret = avcodec_receive_frame(m_pContext, m_pFrame);
        if (ret == AVERROR(EAGAIN)) {
            // Need more input data
            return frame_processed ? TRUE : FALSE;
        }
        else if (ret == AVERROR_EOF) {
            // End of stream
            return TRUE;
        }
        else if (ret < 0) {
            char error_buf[AV_ERROR_MAX_STRING_SIZE];
            av_strerror(ret, error_buf, sizeof(error_buf));
            log_print(HT_LOG_ERR, "%s, avcodec_receive_frame failed: %s (ret=%d)\r\n", __FUNCTION__, error_buf, ret);
            return FALSE;
        }
        // Check if we got a valid frame
        if (!m_pFrame || m_pFrame->width <= 0 || m_pFrame->height <= 0) {
            log_print(HT_LOG_WARN, "%s, Received invalid frame\r\n", __FUNCTION__);
            av_frame_unref(m_pFrame);
            continue;
        }
        if (m_pFrame->format == m_hwPixFmt)
        {
            // CUDA HW accel: clone the HW frame BEFORE transfer so inference
            // can use CUDA device pointers directly (zero-copy, no upload).
            // Any previously held clone is replaced (freed) here.
            if (m_bCudaHWAccel) {
                if (m_pCudaHWFrame) av_frame_free(&m_pCudaHWFrame);
                m_pCudaHWFrame = av_frame_clone(m_pFrame);
            }
            // FIXED: Ensure m_pSoftFrame is properly initialized before transfer
            av_frame_unref(m_pSoftFrame); // Clear any previous data
            // Hardware frame - transfer to software (needed for display)
            ret = av_hwframe_transfer_data(m_pSoftFrame, m_pFrame, 0);
            if (ret < 0)
            {
                char error_buf[AV_ERROR_MAX_STRING_SIZE];
                av_strerror(ret, error_buf, sizeof(error_buf));
                log_print(HT_LOG_ERR, "%s, Error transferring hardware frame to system memory: %s (ret=%d)\r\n",
                    __FUNCTION__, error_buf, ret);
                av_frame_unref(m_pFrame);
                // Drop the clone too — its surface belongs to the failed frame.
                if (m_pCudaHWFrame) av_frame_free(&m_pCudaHWFrame);
                continue;
            }
            // Copy timing information (transfer does not carry timestamps)
            m_pSoftFrame->pts = m_pFrame->pts;
            m_pSoftFrame->pkt_dts = m_pFrame->pkt_dts;
            m_pSoftFrame->best_effort_timestamp = m_pFrame->best_effort_timestamp;
            tmp_frame = m_pSoftFrame;
        }
        else
        {
            // Software frame - use directly
            tmp_frame = m_pFrame;
        }
        // Render the frame
        if (tmp_frame) {
            render(tmp_frame);
            frame_processed = TRUE;
        }
        // FIXED: Ensure proper cleanup of frame references
        // (unref AFTER render — the callback must not retain tmp_frame
        // without cloning it.)
        if (tmp_frame == m_pSoftFrame) {
            av_frame_unref(m_pSoftFrame);
        }
        av_frame_unref(m_pFrame);
    }
    return TRUE;
}
// Deliver a decoded frame to the registered callback.
// Returns 1 on delivery (or when no callback is registered), 0 when stopped,
// given a null frame, or the callback throws.
int CVideoDecoder::render(AVFrame* frame)
{
    if (!frame || !m_bRunning) {
        return 0;
    }
    if (!m_pCallback) {
        return 1;
    }
    try {
        m_pCallback(frame, m_pUserdata);
    }
    catch (...) {
        // Callbacks come from client code — never let an exception escape
        // into the decode loop.
        log_print(HT_LOG_ERR, "%s, Exception in callback function\r\n", __FUNCTION__);
        return 0;
    }
    return 1;
}
// Drain frames buffered inside the codec, then reset its internal state.
// No-op for codecs without AV_CODEC_CAP_DELAY (nothing can be buffered).
// After the NULL-packet drain, avcodec_flush_buffers is required so the
// context accepts new packets again (send_packet returns AVERROR_EOF while
// in draining mode).
void CVideoDecoder::flush()
{
    std::lock_guard<std::recursive_mutex> lock(_mutex);
    if (NULL == m_pContext ||
        NULL == m_pContext->codec ||
        !(m_pContext->codec->capabilities & AV_CODEC_CAP_DELAY))
    {
        return;
    }
    log_print(HT_LOG_INFO, "%s, Flushing decoder buffers\r\n", __FUNCTION__);
    // Send NULL packet to enter draining mode
    avcodec_send_packet(m_pContext, NULL);
    // FIXED: m_pFrame can be NULL when flush() runs from uninit() after a
    // partially failed init() (context opened, but av_frame_alloc for
    // m_pFrame failed). Draining into a NULL frame would crash inside
    // libavcodec — skip straight to the buffer reset in that case.
    if (m_pFrame) {
        // Drain and discard all remaining frames
        while (true) {
            int ret = avcodec_receive_frame(m_pContext, m_pFrame);
            if (ret == AVERROR_EOF || ret == AVERROR(EAGAIN)) {
                break;
            }
            if (ret < 0) {
                char error_buf[AV_ERROR_MAX_STRING_SIZE];
                av_strerror(ret, error_buf, sizeof(error_buf));
                log_print(HT_LOG_WARN, "%s, Error during flush: %s\r\n", __FUNCTION__, error_buf);
                break;
            }
            av_frame_unref(m_pFrame);
        }
    }
    // Reset codec state so new packets are accepted after draining.
    // (m_pContext is known non-NULL here — checked at entry.)
    avcodec_flush_buffers(m_pContext);
}