1151 lines
37 KiB
C++
1151 lines
37 KiB
C++
|
||
#include "video_decoder.h"
|
||
#include "avcodec_mutex.h"
|
||
#include "lock.h"
|
||
#include "media_codec.h"
|
||
#include "media_parse.h"
|
||
#include <memory>
|
||
|
||
#include "ANSLicense.h" // ANS_DBG macro (gated by ANSCORE_DEBUGVIEW)
|
||
|
||
extern "C" {
|
||
#include "libavutil/imgutils.h"
|
||
#include "libavutil/buffer.h"
|
||
#include "libavutil/mem.h"
|
||
}
|
||
|
||
// ---------------------------------------------------------------------------
|
||
// Contiguous YUV420P allocator — trims per-call malloc overhead and enables
|
||
// the zero-copy fast path in avframeYUV420PToCvMat for resolutions where the
|
||
// codec's aligned height happens to equal the visible height.
|
||
// (4K HEVC at 2160 rows still needs 2176-row alignment → one 16-row gap
|
||
// between Y and U remains; the fast path stays off for that case but the
|
||
// single-block layout still improves cache behaviour for the bulk memcpy.)
|
||
// ---------------------------------------------------------------------------
|
||
namespace {
|
||
void anscore_contiguous_free(void* /*opaque*/, uint8_t* data) {
|
||
av_free(data);
|
||
}
|
||
}
|
||
|
||
// Count of currently-open hardware decode sessions (legacy global path; used
// only when HWDecoderPool has not been configured — see hwDecoderInit/uninit).
uint32 g_hw_decoder_nums = 0;
uint32 g_hw_decoder_max = 4; // Hardware decoding resources are limited, Limit up to 4 hardware decoding sessions
// Guards the two counters above (taken via CLock).
// NOTE(review): sys_os_create_mutex runs at static-initialization time —
// assumes no ordering dependency on other translation units; confirm.
void* g_hw_decoder_mutex = sys_os_create_mutex();
|
||
|
||
int CVideoDecoder::contiguousGetBuffer2(AVCodecContext* s, AVFrame* frame, int flags) {
    // Custom AVCodecContext.get_buffer2: packs Y, U and V into ONE contiguous
    // av_malloc block so downstream conversion can wrap the frame zero-copy or
    // hit a tight 3-call bulk memcpy. Anything it cannot handle is delegated
    // to avcodec_default_get_buffer2. Returns 0 or AVERROR(ENOMEM).
    // Must be static/stateless: frame-threaded decoders call it concurrently.

    // Never touch HW surfaces — those are owned by the hwframe pool.
    if (s->hw_frames_ctx) {
        return avcodec_default_get_buffer2(s, frame, flags);
    }
    // Only pack planar 8-bit 4:2:0. Everything else (NV12 from unpackers, 10-bit
    // YUV, 4:2:2, 4:4:4, RGB, paletted, …) goes through the stock allocator.
    if (frame->format != AV_PIX_FMT_YUV420P && frame->format != AV_PIX_FMT_YUVJ420P) {
        return avcodec_default_get_buffer2(s, frame, flags);
    }
    if (frame->width <= 0 || frame->height <= 0) {
        return avcodec_default_get_buffer2(s, frame, flags);
    }

    // Ask the codec for the minimum aligned dimensions it needs. For HEVC
    // this typically rounds up to a multiple of 64 (the CTU size); for H.264
    // to a multiple of 16. stride_align[i] is the per-plane linesize alignment.
    int aligned_w = frame->width;
    int aligned_h = frame->height;
    int stride_align[AV_NUM_DATA_POINTERS] = {0};
    avcodec_align_dimensions2(s, &aligned_w, &aligned_h, stride_align);

    // Round up to the strictest stride_align across all planes (simpler and
    // safe — FFmpeg only asks for alignment, not exact equality).
    int max_align = 32;
    for (int i = 0; i < AV_NUM_DATA_POINTERS; ++i) {
        if (stride_align[i] > max_align) max_align = stride_align[i];
    }

    // Power-of-two rounding; valid because stride_align values and the 32
    // floor are powers of two.
    auto align_up = [](int v, int a) { return (v + a - 1) & ~(a - 1); };

    const int y_stride = align_up(aligned_w, max_align);
    // FIXED: the chroma linesize must satisfy the same per-plane alignment
    // contract as luma. The previous code aligned U/V to max_align/2, which
    // can violate stride_align[1]/[2] (e.g. producing 16-byte alignment when
    // the codec asked for 32) and break SIMD paths that assume aligned rows.
    // Over-aligning to max_align is always acceptable.
    const int uv_stride = align_up(aligned_w / 2, max_align);
    const int y_h = aligned_h;
    const int uv_h = (aligned_h + 1) / 2;

    const size_t y_sz = (size_t)y_stride * y_h;
    const size_t uv_sz = (size_t)uv_stride * uv_h;
    // Trailing padding: FFmpeg readers may overread up to
    // AV_INPUT_BUFFER_PADDING_SIZE bytes past the last plane.
    const size_t total = y_sz + 2 * uv_sz + AV_INPUT_BUFFER_PADDING_SIZE;

    uint8_t* buf = (uint8_t*)av_mallocz(total);
    if (!buf) {
        return AVERROR(ENOMEM);
    }

    // One AVBufferRef owns the whole arena; its free callback releases the
    // single allocation.
    AVBufferRef* ref = av_buffer_create(buf, (int)total,
                                        anscore_contiguous_free, nullptr, 0);
    if (!ref) {
        av_free(buf);
        return AVERROR(ENOMEM);
    }

    // Clear every slot first — the codec may hand us a reused, dirty AVFrame.
    for (int i = 0; i < AV_NUM_DATA_POINTERS; ++i) {
        frame->buf[i] = nullptr;
        frame->data[i] = nullptr;
        frame->linesize[i] = 0;
    }
    frame->buf[0] = ref;                 // single ref keeps all three planes alive
    frame->data[0] = buf;                // Y
    frame->data[1] = buf + y_sz;         // U, immediately after Y
    frame->data[2] = buf + y_sz + uv_sz; // V, immediately after U
    frame->linesize[0] = y_stride;
    frame->linesize[1] = uv_stride;
    frame->linesize[2] = uv_stride;
    frame->extended_data = frame->data;

    return 0;
}
|
||
|
||
// ---------------------------------------------------------------------------
|
||
// HWDecoderPool implementation
|
||
// ---------------------------------------------------------------------------
|
||
HWDecoderPool& HWDecoderPool::instance() {
    // Meyers singleton — construction is thread-safe since C++11.
    static HWDecoderPool s_pool;
    return s_pool;
}
|
||
|
||
void HWDecoderPool::configure(int numGpus, int maxPerGpu) {
|
||
// Uniform limit: same max for all GPUs
|
||
std::vector<int> limits(numGpus, maxPerGpu);
|
||
configure(limits);
|
||
}
|
||
|
||
void HWDecoderPool::configure(const std::vector<int>& maxPerGpuList) {
|
||
std::lock_guard<std::mutex> lock(m_mutex);
|
||
m_maxPerGpu = maxPerGpuList;
|
||
m_activePerGpu.assign(maxPerGpuList.size(), 0);
|
||
m_configured = true;
|
||
// Also update legacy global for backward compatibility
|
||
int total = 0;
|
||
for (int m : m_maxPerGpu) total += m;
|
||
g_hw_decoder_max = static_cast<uint32>(total);
|
||
for (int i = 0; i < static_cast<int>(m_maxPerGpu.size()); ++i) {
|
||
fprintf(stderr, "[HWDecode] HWDecoderPool: GPU[%d] max=%d sessions\n", i, m_maxPerGpu[i]);
|
||
}
|
||
fprintf(stderr, "[HWDecode] HWDecoderPool: configured %d GPU(s), %d total sessions\n",
|
||
static_cast<int>(m_maxPerGpu.size()), total);
|
||
}
|
||
|
||
bool HWDecoderPool::isConfigured() const {
    // NOTE(review): unsynchronized read — configure() writes m_configured
    // under m_mutex. Benign only if configuration completes before any
    // decoder starts; confirm, or make m_mutex mutable and lock here.
    return m_configured;
}
|
||
|
||
int HWDecoderPool::acquireSlot(int preferredGpu) {
|
||
std::lock_guard<std::mutex> lock(m_mutex);
|
||
if (!m_configured || m_activePerGpu.empty()) return -1;
|
||
|
||
// If caller requested a specific GPU (e.g. to match inference GPU for NV12 zero-copy),
|
||
// try that GPU first. This avoids cross-GPU device pointer access which causes
|
||
// "illegal memory access" sticky CUDA errors.
|
||
if (preferredGpu >= 0 && preferredGpu < static_cast<int>(m_activePerGpu.size())) {
|
||
if (m_activePerGpu[preferredGpu] < m_maxPerGpu[preferredGpu]) {
|
||
m_activePerGpu[preferredGpu]++;
|
||
fprintf(stderr, "[HWDecode] HWDecoderPool: acquired slot on PREFERRED GPU[%d] (%d/%d)\n",
|
||
preferredGpu, m_activePerGpu[preferredGpu], m_maxPerGpu[preferredGpu]);
|
||
return preferredGpu;
|
||
}
|
||
fprintf(stderr, "[HWDecode] HWDecoderPool: preferred GPU[%d] at capacity (%d/%d), falling back to least-loaded\n",
|
||
preferredGpu, m_activePerGpu[preferredGpu], m_maxPerGpu[preferredGpu]);
|
||
}
|
||
|
||
// Fallback: find the GPU with the fewest active sessions that still has capacity
|
||
int bestGpu = -1;
|
||
int bestCount = INT_MAX;
|
||
for (int i = 0; i < static_cast<int>(m_activePerGpu.size()); ++i) {
|
||
if (m_activePerGpu[i] < m_maxPerGpu[i] && m_activePerGpu[i] < bestCount) {
|
||
bestCount = m_activePerGpu[i];
|
||
bestGpu = i;
|
||
}
|
||
}
|
||
|
||
if (bestGpu >= 0) {
|
||
m_activePerGpu[bestGpu]++;
|
||
fprintf(stderr, "[HWDecode] HWDecoderPool: acquired slot on GPU[%d] (%d/%d)\n",
|
||
bestGpu, m_activePerGpu[bestGpu], m_maxPerGpu[bestGpu]);
|
||
}
|
||
return bestGpu;
|
||
}
|
||
|
||
void HWDecoderPool::releaseSlot(int gpuIndex) {
|
||
std::lock_guard<std::mutex> lock(m_mutex);
|
||
if (!m_configured) return;
|
||
if (gpuIndex >= 0 && gpuIndex < static_cast<int>(m_activePerGpu.size())) {
|
||
if (m_activePerGpu[gpuIndex] > 0) {
|
||
m_activePerGpu[gpuIndex]--;
|
||
fprintf(stderr, "[HWDecode] HWDecoderPool: released slot on GPU[%d] (%d/%d)\n",
|
||
gpuIndex, m_activePerGpu[gpuIndex], m_maxPerGpu[gpuIndex]);
|
||
}
|
||
}
|
||
}
|
||
|
||
int HWDecoderPool::getTotalMax() const {
    // Sum of the per-GPU session caps.
    // NOTE(review): unsynchronized read of m_maxPerGpu while configure()
    // writes it under m_mutex — safe only if configure() runs before any
    // concurrent use; confirm.
    int total = 0;
    for (int m : m_maxPerGpu) total += m;
    return total;
}
|
||
|
||
int HWDecoderPool::getTotalActive() const {
    // Sum of currently-active sessions across all GPUs.
    // NOTE(review): unsynchronized read — acquireSlot/releaseSlot mutate
    // m_activePerGpu under m_mutex; value may be momentarily stale. Used for
    // logging/diagnostics in this file, where that is tolerable.
    int total = 0;
    for (int c : m_activePerGpu) total += c;
    return total;
}
|
||
|
||
// ---------------------------------------------------------------------------
|
||
// SharedHWDeviceCtx implementation
|
||
// ---------------------------------------------------------------------------
|
||
SharedHWDeviceCtx& SharedHWDeviceCtx::instance() {
    // Thread-safe lazy singleton (C++11 magic static).
    static SharedHWDeviceCtx s_instance;
    return s_instance;
}
|
||
|
||
SharedHWDeviceCtx::~SharedHWDeviceCtx() {
    // Intentionally empty — do NOT release GPU/D3D11 resources here.
    // This destructor runs during DLL_PROCESS_DETACH while the OS loader
    // lock is held. Releasing D3D11/NVIDIA resources requires driver
    // worker threads that also need the loader lock → deadlock.
    // The OS reclaims all GPU resources when the process exits.
    // (Explicit cleanup, when safe, goes through releaseAll() instead.)
}
|
||
|
||
AVBufferRef* SharedHWDeviceCtx::acquire(int gpuIndex, AVHWDeviceType type) {
    // Return a NEW reference to a per-GPU shared AV hardware device context,
    // creating (or recreating, on type change) the cached context on demand.
    // The caller owns the returned ref and must av_buffer_unref it.
    // Returns nullptr on creation failure.
    std::lock_guard<std::mutex> lock(m_mutex);

    // Grow cache if needed; negative indices collapse onto GPU 0.
    if (gpuIndex < 0) gpuIndex = 0;
    if (static_cast<int>(m_cache.size()) <= gpuIndex) {
        m_cache.resize(gpuIndex + 1);
    }

    GpuCtx& slot = m_cache[gpuIndex];

    // If already created for this GPU and same type, return a new reference
    if (slot.ctx && slot.type == type) {
        fprintf(stderr, "[HWDecode] SharedHWDeviceCtx: reusing shared context for GPU[%d]\n", gpuIndex);
        return av_buffer_ref(slot.ctx);
    }

    // Release old context if type changed (pool's ref only; decoders holding
    // their own refs keep the old context alive until they close).
    if (slot.ctx) {
        av_buffer_unref(&slot.ctx);
        slot.ctx = nullptr;
    }

    // Create new HW device context for this GPU. The device string is the
    // GPU ordinal rendered as decimal (what av_hwdevice_ctx_create expects
    // for adapter/device selection).
    char adapterStr[16] = {};
    snprintf(adapterStr, sizeof(adapterStr), "%d", gpuIndex);

    int err = av_hwdevice_ctx_create(&slot.ctx, type, adapterStr, nullptr, 0);
    if (err < 0) {
        char error_buf[AV_ERROR_MAX_STRING_SIZE];
        av_strerror(err, error_buf, sizeof(error_buf));
        fprintf(stderr, "[HWDecode] SharedHWDeviceCtx: FAILED to create context for GPU[%d]: %s\n",
                gpuIndex, error_buf);
        slot.ctx = nullptr;
        return nullptr;
    }

    slot.type = type;
    fprintf(stderr, "[HWDecode] SharedHWDeviceCtx: created shared context for GPU[%d] type=%s\n",
            gpuIndex, av_hwdevice_get_type_name(type));

    // Return a new reference (caller owns it)
    return av_buffer_ref(slot.ctx);
}
|
||
|
||
void SharedHWDeviceCtx::releaseAll() {
|
||
std::lock_guard<std::mutex> lock(m_mutex);
|
||
for (auto& slot : m_cache) {
|
||
if (slot.ctx) {
|
||
av_buffer_unref(&slot.ctx);
|
||
slot.ctx = nullptr;
|
||
}
|
||
}
|
||
m_cache.clear();
|
||
}
|
||
|
||
enum AVPixelFormat getHWFormat(AVCodecContext* ctx, const enum AVPixelFormat* pix_fmts)
{
    // Trampoline for AVCodecContext.get_format: ctx->opaque carries the
    // owning CVideoDecoder (wired up in hwDecoderInit), which does the
    // actual selection.
    AVPixelFormat chosen = AV_PIX_FMT_NONE;
    CVideoDecoder* self = (CVideoDecoder*)ctx->opaque;
    self->getHWFormat(ctx, pix_fmts, &chosen);
    return chosen;
}
|
||
|
||
enum AVPixelFormat get_hw_format(AVCodecContext* ctx, const enum AVPixelFormat* pix_fmts)
{
    // get_format callback preferring 8-bit full-range 4:2:0 (YUVJ420P).
    //
    // FIXED: per the FFmpeg get_format contract, the callback must return one
    // of the formats offered in pix_fmts (or AV_PIX_FMT_NONE). The previous
    // fallback returned ctx->pix_fmt, which is not guaranteed to appear in
    // the offered list and can break format negotiation. Fall back to the
    // first offered format instead — FFmpeg orders the list by decoder
    // preference.
    for (const enum AVPixelFormat* p = pix_fmts; *p != -1; p++) {
        if (*p == AV_PIX_FMT_YUVJ420P) {
            return AV_PIX_FMT_YUVJ420P;
        }
    }
    // If YUVJ420P is not available, take the decoder's first choice.
    if (pix_fmts && *pix_fmts != -1) {
        return *pix_fmts;
    }
    return AV_PIX_FMT_NONE; // empty list: decline, decoder will error out
}
|
||
|
||
CVideoDecoder::CVideoDecoder()
{
    // Start from a fully-cleared state; init() populates everything.
    m_bInited = FALSE;
    m_bRunning = FALSE;
    m_bHardwareDecoderEnabled = FALSE;
    m_bCudaHWAccel = false;
    m_hwGpuIndex = -1;

    // FFmpeg objects — allocated lazily in init().
    m_pCodec = nullptr;
    m_pContext = nullptr;
    m_pFrame = nullptr;
    m_pSoftFrame = nullptr;
    m_pCudaHWFrame = nullptr;

    // Consumer callback + its context pointer.
    m_pCallback = nullptr;
    m_pUserdata = nullptr;

    // HW-decode negotiation state, set by hwDecoderInit() on success.
    m_hwPixFmt = AV_PIX_FMT_NONE;
    m_pHWDeviceCtx = nullptr;
}
|
||
|
||
AVFrame* CVideoDecoder::takeCudaHWFrame() {
    // Ownership transfer: hand the caller the pending CUDA frame (may be
    // null) and clear our slot so each frame is consumed exactly once.
    std::lock_guard<std::recursive_mutex> lock(_mutex);
    AVFrame* taken = m_pCudaHWFrame;
    m_pCudaHWFrame = nullptr;
    return taken;
}
|
||
|
||
AVFrame* CVideoDecoder::cloneCudaHWFrame_unlocked() {
    // Caller MUST already hold _mutex (invoked from the decode thread's
    // callback chain). Hands back a clone so the original m_pCudaHWFrame
    // stays valid for the decode loop.
    if (!m_pCudaHWFrame) {
        return nullptr;
    }
    return av_frame_clone(m_pCudaHWFrame);
}
|
||
|
||
CVideoDecoder::~CVideoDecoder()
{
    // Full teardown; uninit() takes _mutex itself and null-checks every
    // resource, so this is safe even if init() never ran or already failed.
    uninit();
}
|
||
|
||
void CVideoDecoder::uninit()
{
    // Tear down the decoder completely: frames → codec context → HW device
    // reference, in that order. Safe to call repeatedly — every release is
    // guarded by a null check and pointers are nulled after freeing.
    std::lock_guard<std::recursive_mutex> lock(_mutex);

    // [MEDIA_DecClose] heartbeat — paired with [MEDIA_DecInit] for leak diagnosis.
    // Pair count over a long run reveals whether avcodec_open2 calls are
    // matched by full teardowns. If close-count < init-count, the FFmpeg
    // codec context (and its custom get_buffer2 arena) is leaking per reopen.
    {
        static std::atomic<uint64_t> s_closeCount{0};
        const uint64_t n = s_closeCount.fetch_add(1) + 1;
        ANS_DBG("MEDIA_DecClose",
                "uninit ENTRY #%llu inited=%d codec=%s %dx%d hwEnabled=%d cudaHW=%d gpu=%d (this=%p)",
                (unsigned long long)n,
                (int)m_bInited,
                (m_pCodec && m_pCodec->name) ? m_pCodec->name : "?",
                m_pContext ? m_pContext->width : 0,
                m_pContext ? m_pContext->height : 0,
                (int)m_bHardwareDecoderEnabled,
                (int)m_bCudaHWAccel,
                m_hwGpuIndex,
                (void*)this);
    }

    // Stop processing first
    // Backup first — the previous running state is restored at the end so
    // init() can call uninit() without permanently stopping the decoder.
    BOOL wasRunning = m_bRunning;
    m_bRunning = FALSE;

    // Drain any frames still buffered inside the codec before freeing.
    flush();

    // FIXED: Clean up frames before context to avoid use-after-free
    if (m_pFrame)
    {
        av_frame_free(&m_pFrame);
        m_pFrame = NULL;
    }

    if (m_pSoftFrame)
    {
        av_frame_free(&m_pSoftFrame);
        m_pSoftFrame = NULL;
    }

    if (m_pCudaHWFrame)
    {
        av_frame_free(&m_pCudaHWFrame);
        m_pCudaHWFrame = NULL;
    }

    if (m_pContext)
    {
        // FIXED: Free extradata before freeing context
        // (nulled afterwards so avcodec_free_context cannot double-free it)
        if (m_pContext->extradata) {
            av_free(m_pContext->extradata);
            m_pContext->extradata = NULL;
            m_pContext->extradata_size = 0;
        }

        // FIXED: Properly release hardware context reference
        if (m_pContext->hw_device_ctx) {
            av_buffer_unref(&m_pContext->hw_device_ctx);
            m_pContext->hw_device_ctx = NULL;
        }

        // FIXED: Close codec before freeing context
        avcodec_close(m_pContext);
        avcodec_free_context(&m_pContext);
        m_pContext = NULL;
    }

    // Only decrement hardware decoder count if it was actually enabled
    if (m_pHWDeviceCtx && m_bHardwareDecoderEnabled)
    {
        av_buffer_unref(&m_pHWDeviceCtx);
        m_pHWDeviceCtx = NULL;

        // Release via per-GPU pool or legacy global counter
        HWDecoderPool& pool = HWDecoderPool::instance();
        if (pool.isConfigured() && m_hwGpuIndex >= 0) {
            pool.releaseSlot(m_hwGpuIndex);
        } else {
            CLock hw_lock(g_hw_decoder_mutex);
            if (g_hw_decoder_nums > 0) {
                g_hw_decoder_nums--;
            }
        }
        m_hwGpuIndex = -1;
        m_bHardwareDecoderEnabled = FALSE;
    }
    // Restore running state if needed
    m_bRunning = wasRunning;
    m_pCodec = NULL;
    m_bInited = FALSE;
}
|
||
|
||
BOOL CVideoDecoder::init(enum AVCodecID codec, uint8* extradata, int extradata_size, int hwMode, int preferredGpu)
{
    // Create and open a decoder for `codec`.
    //   extradata/extradata_size — codec private data (SPS/PPS etc.); also
    //                              parsed up-front for the stream dimensions.
    //   hwMode       — HW_DECODING_* selector; HW_DECODING_DISABLE forces SW.
    //   preferredGpu — GPU hint forwarded to hwDecoderInit (pool placement).
    // Returns TRUE on success. On any failure the partially-built state is
    // torn down via uninit() and FALSE is returned.
    std::lock_guard<std::recursive_mutex> lock(_mutex);

    // Clean up any existing state
    if (m_bInited) {
        uninit();
    }

    int width = 0;
    int height = 0;

    // Probe the coded dimensions from extradata so the context can be
    // pre-sized before avcodec_open2.
    if (extradata && extradata_size > 0)
    {
        int vcodec = VIDEO_CODEC_NONE;

        if (AV_CODEC_ID_H264 == codec)
        {
            vcodec = VIDEO_CODEC_H264;
        }
        else if (AV_CODEC_ID_HEVC == codec)
        {
            vcodec = VIDEO_CODEC_H265;
        }
        else if (AV_CODEC_ID_MJPEG == codec)
        {
            vcodec = VIDEO_CODEC_JPEG;
        }
        else if (AV_CODEC_ID_MPEG4 == codec)
        {
            vcodec = VIDEO_CODEC_MP4;
        }

        avc_parse_video_size(vcodec, extradata, extradata_size, &width, &height);
    }

#ifdef ANDROID
    // On Android, prefer the MediaCodec wrappers for non-trivial resolutions
    // when HW decoding is allowed; fall back to the generic SW decoder below.
    if (HW_DECODING_DISABLE != hwMode && width * height >= 320 * 240)
    {
        if (AV_CODEC_ID_H264 == codec)
        {
            m_pCodec = avcodec_find_decoder_by_name("h264_mediacodec");
        }
        else if (AV_CODEC_ID_HEVC == codec)
        {
            m_pCodec = avcodec_find_decoder_by_name("hevc_mediacodec");
        }
        else if (AV_CODEC_ID_MPEG4 == codec)
        {
            m_pCodec = avcodec_find_decoder_by_name("mpeg4_mediacodec");
        }
    }

    if (NULL == m_pCodec)
    {
        m_pCodec = avcodec_find_decoder(codec);
    }
#else
    m_pCodec = avcodec_find_decoder(codec);
#endif

    if (NULL == m_pCodec)
    {
        log_print(HT_LOG_ERR, "%s, m_pCodec is NULL for codec %d\r\n", __FUNCTION__, codec);
        return FALSE;
    }

    m_pContext = avcodec_alloc_context3(m_pCodec);
    if (NULL == m_pContext)
    {
        log_print(HT_LOG_ERR, "%s, avcodec_alloc_context3 failed\r\n", __FUNCTION__);
        return FALSE;
    }

    m_pContext->width = width;
    m_pContext->height = height;

    // Low-latency, error-tolerant configuration for live streams.
    m_pContext->flags |= AV_CODEC_FLAG_LOW_DELAY;
    m_pContext->flags2 |= AV_CODEC_FLAG2_FAST;
    m_pContext->flags |= AV_CODEC_FLAG_OUTPUT_CORRUPT;
    m_pContext->err_recognition = AV_EF_IGNORE_ERR;

    av_opt_set_int(m_pContext, "refcounted_frames", 1, 0);

    // Initialize hardware decoder
    if (HW_DECODING_DISABLE != hwMode) {
        int hw_ret = hwDecoderInit(m_pContext, hwMode, preferredGpu);
        if (hw_ret < 0) {
            log_print(HT_LOG_WARN, "%s, hwDecoderInit failed with error %d, falling back to software decoding\r\n", __FUNCTION__, hw_ret);
        }
    }

    // Handle extradata
    if (extradata && extradata_size > 0)
    {
        // Extra padding bytes per the FFmpeg extradata contract.
        int size = extradata_size + AV_INPUT_BUFFER_PADDING_SIZE;

        m_pContext->extradata = (uint8*)av_mallocz(size);
        if (m_pContext->extradata)
        {
            m_pContext->extradata_size = extradata_size;
            memcpy(m_pContext->extradata, extradata, extradata_size);
        }
        else
        {
            log_print(HT_LOG_ERR, "%s, Failed to allocate extradata\r\n", __FUNCTION__);
            uninit(); // FIXED: Clean up on failure
            return FALSE;
        }
    }

    // Configure multi-threading for the SOFTWARE decoder.
    // Hardware decoders (NVDEC, DXVA2/D3D11VA, QSV, VideoToolbox) do their
    // own parallelism inside the GPU/fixed-function block and ignore these
    // fields — so we only enable threading when HW init was skipped (hwMode
    // == HW_DECODING_DISABLE) or failed (fell back to SW).
    //
    // Without this, libavcodec's HEVC/H.264 decoder runs on a single core,
    // which on 4K HEVC streams is ~80–120 ms per frame. Frame + slice
    // threading on a 24-thread CPU typically brings that down to 10–20 ms.
    // thread_count = 0 lets FFmpeg auto-pick (capped internally ~16).
    if (!m_bHardwareDecoderEnabled) {
        m_pContext->thread_count = 0;
        m_pContext->thread_type = FF_THREAD_FRAME | FF_THREAD_SLICE;

        // Install contiguous Y+U+V allocator. This packs all three planes
        // into a single av_malloc block so the BGR-conversion fast path
        // (avframeYUV420PToCvMat) can either wrap the frame zero-copy, or
        // at minimum hit a tight 3-call bulk memcpy with good cache locality
        // instead of per-row copies into a freshly allocated staging Mat.
        // HW decoders must NEVER have get_buffer2 overridden — they use
        // hw_frames_ctx for surface management.
        m_pContext->get_buffer2 = &CVideoDecoder::contiguousGetBuffer2;
    }

    // FIXED: Use avcodec_open2 instead of avcodec_thread_open
    if (avcodec_open2(m_pContext, m_pCodec, NULL) < 0)
    {
        log_print(HT_LOG_ERR, "%s, avcodec_open2 failed\r\n", __FUNCTION__);
        uninit(); // FIXED: Clean up on failure
        return FALSE;
    }

    // Debug: one-shot visibility into which decoder actually got opened.
    // m_bHardwareDecoderEnabled is set by hwDecoderInit() on success; when
    // hwMode == HW_DECODING_DISABLE or hwDecoderInit failed, it stays FALSE
    // and the SW decoder (avcodec_find_decoder) is used.
    // active_thread_type is what FFmpeg actually negotiated after open2
    // (bit 1 = FF_THREAD_FRAME, bit 2 = FF_THREAD_SLICE).
    ANS_DBG("MEDIA_DecInit",
            "avcodec_open2 OK codec=%s(%s) %dx%d hwMode=%d hwEnabled=%d cudaHW=%d gpu=%d "
            "threads=%d thread_type_req=0x%x active=0x%x -> %s decoder",
            m_pCodec->name ? m_pCodec->name : "?",
            m_pCodec->long_name ? m_pCodec->long_name : "?",
            m_pContext->width, m_pContext->height,
            hwMode,
            (int)m_bHardwareDecoderEnabled,
            (int)m_bCudaHWAccel,
            m_hwGpuIndex,
            m_pContext->thread_count,
            m_pContext->thread_type,
            m_pContext->active_thread_type,
            m_bHardwareDecoderEnabled ? "HARDWARE" : "SOFTWARE");

    m_pFrame = av_frame_alloc();
    if (NULL == m_pFrame)
    {
        log_print(HT_LOG_ERR, "%s, av_frame_alloc failed for m_pFrame\r\n", __FUNCTION__);
        uninit(); // FIXED: Clean up on failure
        return FALSE;
    }

    m_pSoftFrame = av_frame_alloc();
    if (NULL == m_pSoftFrame)
    {
        log_print(HT_LOG_ERR, "%s, av_frame_alloc failed for m_pSoftFrame\r\n", __FUNCTION__);
        uninit(); // FIXED: Clean up on failure
        return FALSE;
    }

    m_bInited = TRUE;
    //m_bRunning = TRUE;
    return TRUE;
}
|
||
|
||
BOOL CVideoDecoder::init(int codec, uint8* extradata, int extradata_size, int hwMode, int preferredGpu)
{
    // Convenience overload: map the integer codec id onto AVCodecID and
    // defer to the main init(). The decoder is only marked running when
    // initialization actually succeeded.
    const BOOL ok = init(to_video_avcodecid(codec), extradata, extradata_size, hwMode, preferredGpu);
    if (ok) {
        m_bRunning = TRUE;
    }
    return ok;
}
|
||
|
||
int CVideoDecoder::hwDecoderInit(AVCodecContext* ctx, int hwMode, int preferredGpu) {
    // Set up hardware-accelerated decoding on `ctx`.
    //   hwMode       — HW_DECODING_* selector; DISABLE is a no-op success.
    //   preferredGpu — GPU hint for pool placement (NV12 zero-copy locality).
    // Returns 0 on success/disabled, <0 on failure (callers fall back to SW).
    // On success this populates m_pHWDeviceCtx, m_hwGpuIndex,
    // m_bHardwareDecoderEnabled and m_bCudaHWAccel.
    // (FIXED: removed unused local `int err = 0;`.)
    std::string hwtype;
    enum AVHWDeviceType type = AV_HWDEVICE_TYPE_NONE;

    if (hwMode == HW_DECODING_DISABLE) {
        return 0; // Hardware decoding is disabled
    }

    // -- Per-GPU pool path (preferred) or legacy global path ----------------
    int assignedGpu = -1;
    HWDecoderPool& pool = HWDecoderPool::instance();

    if (pool.isConfigured()) {
        // Per-GPU: acquire a slot, preferring the caller's requested GPU
        // (e.g. inference GPU for NV12 zero-copy alignment)
        assignedGpu = pool.acquireSlot(preferredGpu);
        if (assignedGpu < 0) {
            log_print(HT_LOG_WARN, "%s, All GPU HW decoder slots full (%d/%d total)\r\n",
                      __FUNCTION__, pool.getTotalActive(), pool.getTotalMax());
            return -1;
        }
    } else {
        // Legacy: single global counter.
        // NOTE(review): check-here / increment-later under separate lock
        // acquisitions is a TOCTOU — two threads can both pass this check and
        // exceed g_hw_decoder_max. Pre-existing behavior kept; the configured
        // pool path does not have this race.
        CLock lock(g_hw_decoder_mutex);
        if (g_hw_decoder_max > 0 && g_hw_decoder_nums >= g_hw_decoder_max) {
            log_print(HT_LOG_WARN, "%s, Maximum number of hardware decoders reached (%d/%d)\r\n",
                      __FUNCTION__, g_hw_decoder_nums, g_hw_decoder_max);
            return -1;
        }
    }

    // Determine the hardware type based on platform and hardware mode
    if (!getHardwareTypeForPlatform(hwMode, hwtype)) {
        log_print(HT_LOG_WARN, "%s, Unsupported hardware mode %d for the current platform\r\n", __FUNCTION__, hwMode);
        if (assignedGpu >= 0) pool.releaseSlot(assignedGpu);
        return -1;
    }

    // Find the hardware device type by name
    type = av_hwdevice_find_type_by_name(hwtype.c_str());
    if (type == AV_HWDEVICE_TYPE_NONE) {
        log_print(HT_LOG_WARN, "%s, Hardware device type %s is not supported\r\n", __FUNCTION__, hwtype.c_str());
        logSupportedHwTypes();
        if (assignedGpu >= 0) pool.releaseSlot(assignedGpu);
        return -1;
    }

    // Find a hardware configuration that supports the specified device type
    // (also records the HW pixel format in m_hwPixFmt for getHWFormat).
    if (!findHwConfigForDeviceType(type)) {
        log_print(HT_LOG_WARN, "%s, Decoder %s does not support the specified hardware device type %s\r\n",
                  __FUNCTION__, m_pCodec->long_name, av_hwdevice_get_type_name(type));
        if (assignedGpu >= 0) pool.releaseSlot(assignedGpu);
        return -1;
    }

    // Get or create a shared HW device context for this GPU.
    // NVIDIA recommends sharing CUDA contexts across decode sessions to minimize
    // GPU memory overhead (each CUDA context costs ~50-100MB).
    // See: NVDEC Video Decoder API Programming Guide, Section "Multi-session decoding"
    int gpuIdx = (assignedGpu >= 0) ? assignedGpu : 0;
    m_pHWDeviceCtx = SharedHWDeviceCtx::instance().acquire(gpuIdx, type);
    if (!m_pHWDeviceCtx) {
        log_print(HT_LOG_ERR, "%s, Failed to acquire shared HW device context, type=%s, gpu=%d\r\n",
                  __FUNCTION__, av_hwdevice_get_type_name(type), gpuIdx);
        if (assignedGpu >= 0) pool.releaseSlot(assignedGpu);
        return -1;
    }

    // Configure the codec context to use the shared hardware device.
    // opaque carries `this` so the free-function getHWFormat trampoline can
    // reach the member selection logic.
    ctx->opaque = this;
    ctx->get_format = ::getHWFormat;
    ctx->hw_device_ctx = av_buffer_ref(m_pHWDeviceCtx);
    ctx->err_recognition = AV_EF_IGNORE_ERR;
    ctx->flags2 |= AV_CODEC_FLAG2_EXPORT_MVS;

    // Reserve extra NVDEC surfaces for application-held av_frame_clone() references.
    // The clone chain (decoder → player → registry) holds ~2 surfaces simultaneously
    // (decoder's clone + registry's clone; getCudaHWFrame uses ownership transfer).
    // Without this, the default pool (num_decode_surfaces + 2) can run out under
    // load with many concurrent streams, causing the decoder to stall.
    ctx->extra_hw_frames = 2;

    // Track which GPU this decoder is on
    m_hwGpuIndex = assignedGpu;
    m_bHardwareDecoderEnabled = TRUE;
    m_bCudaHWAccel = (type == AV_HWDEVICE_TYPE_CUDA);

    // Legacy counter (for backward compatibility)
    if (!pool.isConfigured()) {
        CLock lock(g_hw_decoder_mutex);
        g_hw_decoder_nums++;
    }

    log_print(HT_LOG_INFO, "%s, Successfully initialized hardware decoder %s on GPU[%d] (%d/%d)\r\n",
              __FUNCTION__, av_hwdevice_get_type_name(type),
              gpuIdx,
              pool.isConfigured() ? pool.getTotalActive() : g_hw_decoder_nums,
              pool.isConfigured() ? pool.getTotalMax() : g_hw_decoder_max);

    return 0;
}
|
||
|
||
void CVideoDecoder::Start() {
|
||
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
||
if (m_pContext) {
|
||
avcodec_flush_buffers(m_pContext);
|
||
}
|
||
m_bRunning = TRUE;
|
||
log_print(HT_LOG_INFO, "%s, Video decoder started\r\n", __FUNCTION__);
|
||
}
|
||
|
||
void CVideoDecoder::Stop() {
    // Atomically signal the decoder to stop WITHOUT acquiring _mutex.
    // decode() holds _mutex while inside avcodec_send_packet / CUDA calls
    // that can block on the nvcuda64 SRW lock for a long time.
    // If we waited for _mutex here, Stop() would deadlock whenever a
    // concurrent decode() is stuck waiting for a CUDA operation held by
    // an inference thread.
    // (release order pairs with decode()'s read of m_bRunning.)
    m_bRunning.store(FALSE, std::memory_order_release);
    log_print(HT_LOG_INFO, "%s, Video decoder stopped\r\n", __FUNCTION__);
}
|
||
|
||
// Log all supported hardware types
|
||
void CVideoDecoder::logSupportedHwTypes() {
|
||
enum AVHWDeviceType type = AV_HWDEVICE_TYPE_NONE;
|
||
log_print(HT_LOG_INFO, "%s, Available hardware device types:\r\n", __FUNCTION__);
|
||
while ((type = av_hwdevice_iterate_types(type)) != AV_HWDEVICE_TYPE_NONE) {
|
||
log_print(HT_LOG_INFO, "%s, - %s\r\n", __FUNCTION__, av_hwdevice_get_type_name(type));
|
||
}
|
||
}
|
||
|
||
// Platform-specific function to determine the hardware type based on the mode
|
||
// Platform-specific function to determine the hardware type based on the mode.
// Maps the HW_DECODING_* mode onto an FFmpeg hwdevice type NAME (written into
// `hwtype`); returns false when the mode has no mapping on this platform.
// AUTO modes probe av_hwdevice_find_type_by_name to pick the best available
// backend in preference order.
bool CVideoDecoder::getHardwareTypeForPlatform(int hwMode, std::string& hwtype) {
#if __WINDOWS_OS__
    switch (hwMode) {
    case HW_DECODING_D3D11: hwtype = "d3d11va"; break;
    case HW_DECODING_DXVA: hwtype = "dxva2"; break;
    case HW_DECODING_CUDA: hwtype = "cuda"; break;
    case HW_DECODING_AUTO:
        // Preference order: CUDA → D3D11VA → DXVA2.
        hwtype = "cuda";
        if (av_hwdevice_find_type_by_name(hwtype.c_str()) == AV_HWDEVICE_TYPE_NONE) {
            hwtype = "d3d11va";
            if (av_hwdevice_find_type_by_name(hwtype.c_str()) == AV_HWDEVICE_TYPE_NONE) {
                hwtype = "dxva2";
            }
        }
        break;
    default: return false;
    }
#elif defined(IOS)
    switch (hwMode) {
    case HW_DECODING_VIDEOTOOLBOX: hwtype = "videotoolbox"; break;
    case HW_DECODING_OPENCL: hwtype = "opencl"; break;
    case HW_DECODING_AUTO:
        // Preference order: VideoToolbox → OpenCL.
        hwtype = "videotoolbox";
        if (av_hwdevice_find_type_by_name(hwtype.c_str()) == AV_HWDEVICE_TYPE_NONE) {
            hwtype = "opencl";
        }
        break;
    default: return false;
    }
#elif defined(ANDROID)
    if (hwMode == HW_DECODING_MEDIACODEC || hwMode == HW_DECODING_AUTO) {
        hwtype = "mediacodec";
    }
    else {
        return false;
    }
#elif __LINUX_OS__
    switch (hwMode) {
    case HW_DECODING_VAAPI: hwtype = "vaapi"; break;
    case HW_DECODING_OPENCL: hwtype = "opencl"; break;
    case HW_DECODING_AUTO:
        // Preference order: VAAPI → OpenCL.
        hwtype = "vaapi";
        if (av_hwdevice_find_type_by_name(hwtype.c_str()) == AV_HWDEVICE_TYPE_NONE) {
            hwtype = "opencl";
        }
        break;
    default: return false;
    }
#else
    return false; // Unsupported platform
#endif
    return true;
}
|
||
|
||
// Find a hardware configuration that matches the specified device type
|
||
// Find a hardware configuration that matches the specified device type
bool CVideoDecoder::findHwConfigForDeviceType(AVHWDeviceType type) {
    // Walk the codec's advertised HW configs until one matches `type` and
    // supports the hw_device_ctx method; record its pixel format in
    // m_hwPixFmt for the get_format callback. Returns false if none match.
    int index = 0;
    while (true) {
        const AVCodecHWConfig* cfg = avcodec_get_hw_config(m_pCodec, index++);
        if (cfg == nullptr) {
            return false; // exhausted the list without a match
        }
        const bool viaDeviceCtx =
            (cfg->methods & AV_CODEC_HW_CONFIG_METHOD_HW_DEVICE_CTX) != 0;
        if (viaDeviceCtx && cfg->device_type == type) {
            m_hwPixFmt = cfg->pix_fmt;
            return true;
        }
    }
}
|
||
|
||
BOOL CVideoDecoder::getHWFormat(AVCodecContext* ctx, const AVPixelFormat* pix_fmts, AVPixelFormat* dst)
{
    // Select the decoder output format from the candidate list, in order of
    // preference: the HW surface format found in findHwConfigForDeviceType,
    // then full-range (J) planar YUV, then limited-range planar YUV, and
    // finally whatever the decoder listed first. Writes the choice to *dst.
    *dst = AV_PIX_FMT_NONE;

    // Scan the terminator-delimited candidate list with a predicate;
    // stores the first accepted format in *dst.
    auto pick = [&](auto&& accept) -> BOOL {
        for (const AVPixelFormat* p = pix_fmts; *p != -1; p++) {
            if (accept(*p)) {
                *dst = *p;
                return TRUE;
            }
        }
        return FALSE;
    };

    // 1) Hardware pixel format, when offered.
    if (pick([&](AVPixelFormat f) { return f == m_hwPixFmt; })) {
        return TRUE;
    }
    // 2) Full-range planar YUV.
    if (pick([](AVPixelFormat f) {
            return f == AV_PIX_FMT_YUVJ420P ||
                   f == AV_PIX_FMT_YUVJ422P ||
                   f == AV_PIX_FMT_YUVJ444P;
        })) {
        return TRUE;
    }
    // 3) Limited-range planar YUV.
    if (pick([](AVPixelFormat f) {
            return f == AV_PIX_FMT_YUV420P ||
                   f == AV_PIX_FMT_YUV422P ||
                   f == AV_PIX_FMT_YUV444P;
        })) {
        return TRUE;
    }
    // 4) Last resort: the decoder's first choice, if the list is non-empty.
    if (*pix_fmts != -1) {
        *dst = *pix_fmts;
        return TRUE;
    }

    log_print(HT_LOG_ERR, "%s, Failed to get HW surface format\r\n", __FUNCTION__);

    return FALSE;
}
|
||
|
||
int CVideoDecoder::getWidth()
|
||
{
|
||
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
||
if (m_pContext)
|
||
{
|
||
return m_pContext->width;
|
||
}
|
||
return 0;
|
||
}
|
||
|
||
int CVideoDecoder::getHeight()
|
||
{
|
||
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
||
if (m_pContext)
|
||
{
|
||
return m_pContext->height;
|
||
}
|
||
return 0;
|
||
}
|
||
|
||
double CVideoDecoder::getFrameRate()
|
||
{
|
||
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
||
if (m_pContext)
|
||
{
|
||
if (m_pContext->framerate.den > 0)
|
||
{
|
||
return (double)((double)(m_pContext->framerate.num) / m_pContext->framerate.den);
|
||
}
|
||
}
|
||
return 0;
|
||
}
|
||
|
||
BOOL CVideoDecoder::decode(AVPacket* pkt)
{
    // Feed one packet into the decoder and drain any resulting frames via
    // readFrame(). Returns FALSE when the decoder is not ready or FFmpeg
    // reports a non-EOF error; TRUE otherwise.
    std::lock_guard<std::recursive_mutex> lock(_mutex);

    if (!m_bInited)
    {
        log_print(HT_LOG_ERR, "%s, Decoder not initialized\r\n", __FUNCTION__);
        return FALSE;
    }

    if (!m_bRunning)
    {
        log_print(HT_LOG_WARN, "%s, Decoder not running\r\n", __FUNCTION__);
        return FALSE;
    }

    if (!m_pContext) {
        log_print(HT_LOG_ERR, "%s, Context is NULL\r\n", __FUNCTION__);
        return FALSE;
    }

    int ret;
    int retryCount = 0;
    const int maxRetries = 3;

    // Attempt to send packet to decoder. EAGAIN means the decoder's output
    // queue is full — drain frames via readFrame() and retry, bounded by
    // maxRetries so a stalled consumer cannot wedge this thread.
    while ((ret = avcodec_send_packet(m_pContext, pkt)) == AVERROR(EAGAIN) &&
           retryCount < maxRetries)
    {
        if (!readFrame())
        {
            log_print(HT_LOG_ERR, "%s, Failed to read frame during retry %d\r\n", __FUNCTION__, retryCount);
            return FALSE;
        }
        Sleep(1); // Reduced sleep time
        retryCount++;
    }

    // Check for other errors (EOF is tolerated — the drain below still runs)
    if (ret < 0 && ret != AVERROR_EOF)
    {
        char error_buf[AV_ERROR_MAX_STRING_SIZE];
        av_strerror(ret, error_buf, sizeof(error_buf));
        log_print(HT_LOG_ERR, "%s, avcodec_send_packet failed: %s (ret=%d)\r\n", __FUNCTION__, error_buf, ret);
        return FALSE;
    }

    // If the packet was successfully sent, proceed to read frame
    return readFrame();
}
|
||
// Wrap a raw elementary-stream buffer in a properly padded AVPacket and
// delegate to decode(AVPacket*). The supplied timestamp is applied to
// both pts and dts.
//
// @param data  encoded bitstream bytes (must be non-NULL)
// @param len   byte count (must be > 0)
// @param pts   presentation timestamp stamped onto the packet
// @return result of the underlying packet decode, FALSE on setup error.
BOOL CVideoDecoder::decode(uint8* data, int len, int64_t pts)
{
    std::lock_guard<std::recursive_mutex> lock(_mutex);

    if (!m_bInited)
    {
        log_print(HT_LOG_ERR, "%s, Decoder not initialized\r\n", __FUNCTION__);
        return FALSE;
    }
    if (!m_bRunning)
    {
        log_print(HT_LOG_WARN, "%s, Decoder not running\r\n", __FUNCTION__);
        return FALSE;
    }
    if (NULL == data || len <= 0)
    {
        log_print(HT_LOG_ERR, "%s, Invalid input data\r\n", __FUNCTION__);
        return FALSE;
    }

    AVPacket* packet = av_packet_alloc();
    if (NULL == packet)
    {
        log_print(HT_LOG_ERR, "%s, Failed to allocate AVPacket\r\n", __FUNCTION__);
        return FALSE;
    }

    // av_new_packet() sizes the buffer with the input padding FFmpeg
    // requires and sets up refcounting, so av_packet_free() below
    // releases both the packet and its data.
    int ret = av_new_packet(packet, len);
    if (ret < 0)
    {
        char error_buf[AV_ERROR_MAX_STRING_SIZE];
        av_strerror(ret, error_buf, sizeof(error_buf));
        log_print(HT_LOG_ERR, "%s, Failed to allocate packet data: %s\r\n", __FUNCTION__, error_buf);
        av_packet_free(&packet);
        return FALSE;
    }

    memcpy(packet->data, data, len);
    packet->pts = pts;
    packet->dts = pts;

    BOOL result = decode(packet); // recursive mutex makes the nested lock safe

    av_packet_free(&packet);

    return result;
}
|
||
|
||
// Drain every decoded frame currently queued in the codec and hand each
// one to render(). Hardware frames (format == m_hwPixFmt) are first
// downloaded into m_pSoftFrame; when CUDA HW accel is enabled a clone of
// the device frame is additionally kept in m_pCudaHWFrame so inference
// can consume CUDA device pointers without an upload.
//
// @return TRUE when the drain ended normally (EAGAIN after at least
//         trying, or EOF), FALSE on a decode error.
BOOL CVideoDecoder::readFrame()
{
    BOOL frame_processed = FALSE;

    for (;;)
    {
        int ret = avcodec_receive_frame(m_pContext, m_pFrame);
        if (ret == AVERROR(EAGAIN))
        {
            // Decoder needs more input.
            return frame_processed ? TRUE : FALSE;
        }
        if (ret == AVERROR_EOF)
        {
            // End of stream.
            return TRUE;
        }
        if (ret < 0)
        {
            char error_buf[AV_ERROR_MAX_STRING_SIZE];
            av_strerror(ret, error_buf, sizeof(error_buf));
            log_print(HT_LOG_ERR, "%s, avcodec_receive_frame failed: %s (ret=%d)\r\n", __FUNCTION__, error_buf, ret);
            return FALSE;
        }

        // Drop frames without usable geometry.
        if (!m_pFrame || m_pFrame->width <= 0 || m_pFrame->height <= 0)
        {
            log_print(HT_LOG_WARN, "%s, Received invalid frame\r\n", __FUNCTION__);
            av_frame_unref(m_pFrame);
            continue;
        }

        AVFrame* tmp_frame = NULL;

        if (m_pFrame->format == m_hwPixFmt)
        {
            // CUDA HW accel: clone the HW frame BEFORE transfer so inference
            // can use CUDA device pointers directly (zero-copy, no upload).
            if (m_bCudaHWAccel)
            {
                if (m_pCudaHWFrame) av_frame_free(&m_pCudaHWFrame);
                m_pCudaHWFrame = av_frame_clone(m_pFrame);
            }

            av_frame_unref(m_pSoftFrame); // clear any previous payload

            // Download the hardware surface to system memory for display.
            // BUGFIX: keep the transfer status in its own variable. The old
            // code stored it in the loop-control variable, so a failed
            // transfer made the `continue` exit the while(ret >= 0) loop and
            // report success instead of draining the remaining frames.
            int xfer = av_hwframe_transfer_data(m_pSoftFrame, m_pFrame, 0);
            if (xfer < 0)
            {
                char error_buf[AV_ERROR_MAX_STRING_SIZE];
                av_strerror(xfer, error_buf, sizeof(error_buf));
                log_print(HT_LOG_ERR, "%s, Error transferring hardware frame to system memory: %s (ret=%d)\r\n",
                          __FUNCTION__, error_buf, xfer);
                av_frame_unref(m_pFrame);
                if (m_pCudaHWFrame) av_frame_free(&m_pCudaHWFrame);
                continue; // keep draining subsequent frames
            }

            // Timing metadata does not survive the transfer; copy it over.
            m_pSoftFrame->pts = m_pFrame->pts;
            m_pSoftFrame->pkt_dts = m_pFrame->pkt_dts;
            m_pSoftFrame->best_effort_timestamp = m_pFrame->best_effort_timestamp;

            tmp_frame = m_pSoftFrame;
        }
        else
        {
            // Software frame - use directly.
            tmp_frame = m_pFrame;
        }

        if (tmp_frame)
        {
            render(tmp_frame);
            frame_processed = TRUE;
        }

        // Release this iteration's references before receiving the next frame.
        if (tmp_frame == m_pSoftFrame)
        {
            av_frame_unref(m_pSoftFrame);
        }
        av_frame_unref(m_pFrame);
    }
}
|
||
|
||
// Deliver a decoded frame to the registered callback, if one is set.
// Exceptions escaping the callback are caught and logged so a faulty
// consumer cannot take down the decode loop.
//
// @param frame  decoded frame to hand out (software or HW-mapped)
// @return 1 on success (or when no callback is registered),
//         0 when not running, frame is NULL, or the callback threw.
int CVideoDecoder::render(AVFrame* frame)
{
    if (!m_bRunning || NULL == frame)
    {
        return 0;
    }

    if (NULL == m_pCallback)
    {
        return 1;
    }

    try
    {
        m_pCallback(frame, m_pUserdata);
    }
    catch (...)
    {
        log_print(HT_LOG_ERR, "%s, Exception in callback function\r\n", __FUNCTION__);
        return 0;
    }

    return 1;
}
|
||
|
||
void CVideoDecoder::flush()
|
||
{
|
||
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
||
|
||
if (NULL == m_pContext ||
|
||
NULL == m_pContext->codec ||
|
||
!(m_pContext->codec->capabilities & AV_CODEC_CAP_DELAY))
|
||
{
|
||
return;
|
||
}
|
||
|
||
log_print(HT_LOG_INFO, "%s, Flushing decoder buffers\r\n", __FUNCTION__);
|
||
|
||
// Send NULL packet to flush
|
||
avcodec_send_packet(m_pContext, NULL);
|
||
|
||
// FIXED: Drain all frames after flushing
|
||
while (true) {
|
||
int ret = avcodec_receive_frame(m_pContext, m_pFrame);
|
||
if (ret == AVERROR_EOF || ret == AVERROR(EAGAIN)) {
|
||
break;
|
||
}
|
||
if (ret < 0) {
|
||
char error_buf[AV_ERROR_MAX_STRING_SIZE];
|
||
av_strerror(ret, error_buf, sizeof(error_buf));
|
||
log_print(HT_LOG_WARN, "%s, Error during flush: %s\r\n", __FUNCTION__, error_buf);
|
||
break;
|
||
}
|
||
// Process the frame if needed, or just unref it
|
||
av_frame_unref(m_pFrame);
|
||
}
|
||
|
||
// Also flush the codec buffers
|
||
if (m_pContext) {
|
||
avcodec_flush_buffers(m_pContext);
|
||
}
|
||
}
|