Files
ANSCORE/MediaClient/media/video_decoder.cpp
2026-04-21 09:26:02 +10:00

1151 lines
37 KiB
C++
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#include "video_decoder.h"
#include "avcodec_mutex.h"
#include "lock.h"
#include "media_codec.h"
#include "media_parse.h"
#include <memory>
#include "ANSLicense.h" // ANS_DBG macro (gated by ANSCORE_DEBUGVIEW)
extern "C" {
#include "libavutil/imgutils.h"
#include "libavutil/buffer.h"
#include "libavutil/mem.h"
}
// ---------------------------------------------------------------------------
// Contiguous YUV420P allocator — trims per-call malloc overhead and enables
// the zero-copy fast path in avframeYUV420PToCvMat for resolutions where the
// codec's aligned height happens to equal the visible height.
// (4K HEVC at 2160 rows still needs 2176-row alignment → one 16-row gap
// between Y and U remains; the fast path stays off for that case but the
// single-block layout still improves cache behaviour for the bulk memcpy.)
// ---------------------------------------------------------------------------
namespace {
void anscore_contiguous_free(void* /*opaque*/, uint8_t* data) {
av_free(data);
}
}
// Legacy global HW-session accounting. Superseded by HWDecoderPool when it is
// configured (see hwDecoderInit/uninit), kept for backward compatibility.
uint32 g_hw_decoder_nums = 0; // currently active hardware decode sessions
uint32 g_hw_decoder_max = 4; // Hardware decoding resources are limited, Limit up to 4 hardware decoding sessions
// NOTE(review): dynamic initialization at static-init time — runs before
// main(); assumes sys_os_create_mutex is safe that early. Confirm no other
// translation unit touches this mutex during its own static initialization.
void* g_hw_decoder_mutex = sys_os_create_mutex();
// Custom AVCodecContext::get_buffer2 callback: allocates Y, U and V planes of
// an 8-bit planar 4:2:0 frame in ONE contiguous av_malloc block (layout
// described in the comment block above). Installed only for the software
// decode path in init(). Returns 0 on success or a negative AVERROR code;
// any unsupported case is delegated to avcodec_default_get_buffer2.
int CVideoDecoder::contiguousGetBuffer2(AVCodecContext* s, AVFrame* frame, int flags) {
    // Never touch HW surfaces — those are owned by the hwframe pool.
    if (s->hw_frames_ctx) {
        return avcodec_default_get_buffer2(s, frame, flags);
    }
    // Only pack planar 8-bit 4:2:0. Everything else (NV12 from unpackers, 10-bit
    // YUV, 4:2:2, 4:4:4, RGB, paletted, …) goes through the stock allocator.
    if (frame->format != AV_PIX_FMT_YUV420P && frame->format != AV_PIX_FMT_YUVJ420P) {
        return avcodec_default_get_buffer2(s, frame, flags);
    }
    // Dimensions not yet known (e.g. before the first SPS) — let FFmpeg handle it.
    if (frame->width <= 0 || frame->height <= 0) {
        return avcodec_default_get_buffer2(s, frame, flags);
    }
    // Ask the codec for the minimum aligned dimensions it needs. For HEVC
    // this typically rounds up to a multiple of 64 (the CTU size); for H.264
    // to a multiple of 16. stride_align[i] is the per-plane linesize alignment.
    int aligned_w = frame->width;
    int aligned_h = frame->height;
    int stride_align[AV_NUM_DATA_POINTERS] = {0};
    avcodec_align_dimensions2(s, &aligned_w, &aligned_h, stride_align);
    // Round up to the strictest stride_align across all planes (simpler and
    // safe — FFmpeg only asks for alignment, not exact equality).
    int max_align = 32;
    for (int i = 0; i < AV_NUM_DATA_POINTERS; ++i) {
        if (stride_align[i] > max_align) max_align = stride_align[i];
    }
    // NOTE: align_up assumes `a` is a power of two (stride_align values are).
    auto align_up = [](int v, int a) { return (v + a - 1) & ~(a - 1); };
    const int y_stride = align_up(aligned_w, max_align);
    const int uv_stride = align_up(aligned_w / 2, max_align / 2 > 0 ? max_align / 2 : 16);
    const int y_h = aligned_h;
    const int uv_h = (aligned_h + 1) / 2;
    const size_t y_sz = (size_t)y_stride * y_h;
    const size_t uv_sz = (size_t)uv_stride * uv_h;
    // AV_INPUT_BUFFER_PADDING_SIZE tail keeps optimized readers that overread
    // past the last plane safe.
    const size_t total = y_sz + 2 * uv_sz + AV_INPUT_BUFFER_PADDING_SIZE;
    uint8_t* buf = (uint8_t*)av_mallocz(total);
    if (!buf) {
        return AVERROR(ENOMEM);
    }
    // Single refcounted buffer owns the whole block; freed via
    // anscore_contiguous_free when the last frame reference drops.
    AVBufferRef* ref = av_buffer_create(buf, (int)total,
        anscore_contiguous_free, nullptr, 0);
    if (!ref) {
        av_free(buf);
        return AVERROR(ENOMEM);
    }
    // Clear all pointer/linesize slots before wiring up the three planes.
    for (int i = 0; i < AV_NUM_DATA_POINTERS; ++i) {
        frame->buf[i] = nullptr;
        frame->data[i] = nullptr;
        frame->linesize[i] = 0;
    }
    // buf[0] carries the only ownership reference; data[1]/data[2] are
    // interior pointers into the same block (U follows Y, V follows U).
    frame->buf[0] = ref;
    frame->data[0] = buf;
    frame->data[1] = buf + y_sz;
    frame->data[2] = buf + y_sz + uv_sz;
    frame->linesize[0] = y_stride;
    frame->linesize[1] = uv_stride;
    frame->linesize[2] = uv_stride;
    frame->extended_data = frame->data;
    return 0;
}
// ---------------------------------------------------------------------------
// HWDecoderPool implementation
// ---------------------------------------------------------------------------
// Meyers singleton: constructed on first use, thread-safe since C++11.
HWDecoderPool& HWDecoderPool::instance() {
    static HWDecoderPool s_pool;
    return s_pool;
}
// Convenience overload: apply one uniform session cap to every GPU.
void HWDecoderPool::configure(int numGpus, int maxPerGpu) {
    configure(std::vector<int>(numGpus, maxPerGpu));
}
// Install per-GPU session caps and reset active counts to zero.
// Also mirrors the aggregate cap into the legacy g_hw_decoder_max global for
// code that still reads it.
void HWDecoderPool::configure(const std::vector<int>& maxPerGpuList) {
    std::lock_guard<std::mutex> lock(m_mutex);
    m_maxPerGpu = maxPerGpuList;
    m_activePerGpu.assign(maxPerGpuList.size(), 0);
    m_configured = true;
    int total = 0;
    for (int cap : m_maxPerGpu) {
        total += cap;
    }
    g_hw_decoder_max = static_cast<uint32>(total);
    const int gpuCount = static_cast<int>(m_maxPerGpu.size());
    for (int gpu = 0; gpu < gpuCount; ++gpu) {
        fprintf(stderr, "[HWDecode] HWDecoderPool: GPU[%d] max=%d sessions\n", gpu, m_maxPerGpu[gpu]);
    }
    fprintf(stderr, "[HWDecode] HWDecoderPool: configured %d GPU(s), %d total sessions\n",
            gpuCount, total);
}
// True once configure() has been called at least once.
// NOTE(review): m_configured is read without m_mutex — a caller racing with
// configure() may observe a stale value. Benign if configuration happens once
// at startup before decoders are created; confirm that assumption.
bool HWDecoderPool::isConfigured() const {
    return m_configured;
}
// Reserve one decode session. Tries the caller's preferred GPU first (keeps
// decode and inference on the same device for NV12 zero-copy and avoids
// cross-GPU pointer access → sticky "illegal memory access" CUDA errors),
// then falls back to the least-loaded GPU with spare capacity.
// Returns the GPU index on success, -1 when unconfigured or fully saturated.
int HWDecoderPool::acquireSlot(int preferredGpu) {
    std::lock_guard<std::mutex> lock(m_mutex);
    if (!m_configured || m_activePerGpu.empty()) return -1;
    const int gpuCount = static_cast<int>(m_activePerGpu.size());
    const bool preferenceValid = (preferredGpu >= 0 && preferredGpu < gpuCount);
    if (preferenceValid) {
        if (m_activePerGpu[preferredGpu] < m_maxPerGpu[preferredGpu]) {
            ++m_activePerGpu[preferredGpu];
            fprintf(stderr, "[HWDecode] HWDecoderPool: acquired slot on PREFERRED GPU[%d] (%d/%d)\n",
                    preferredGpu, m_activePerGpu[preferredGpu], m_maxPerGpu[preferredGpu]);
            return preferredGpu;
        }
        fprintf(stderr, "[HWDecode] HWDecoderPool: preferred GPU[%d] at capacity (%d/%d), falling back to least-loaded\n",
                preferredGpu, m_activePerGpu[preferredGpu], m_maxPerGpu[preferredGpu]);
    }
    // Least-loaded GPU that still has a free slot (first index wins ties).
    int chosen = -1;
    for (int gpu = 0; gpu < gpuCount; ++gpu) {
        if (m_activePerGpu[gpu] >= m_maxPerGpu[gpu]) continue;
        if (chosen < 0 || m_activePerGpu[gpu] < m_activePerGpu[chosen]) {
            chosen = gpu;
        }
    }
    if (chosen >= 0) {
        ++m_activePerGpu[chosen];
        fprintf(stderr, "[HWDecode] HWDecoderPool: acquired slot on GPU[%d] (%d/%d)\n",
                chosen, m_activePerGpu[chosen], m_maxPerGpu[chosen]);
    }
    return chosen;
}
// Return one previously acquired session on gpuIndex.
// Out-of-range indices and already-zero counters are silently ignored.
void HWDecoderPool::releaseSlot(int gpuIndex) {
    std::lock_guard<std::mutex> lock(m_mutex);
    if (!m_configured) return;
    const bool validIndex = (gpuIndex >= 0 && gpuIndex < static_cast<int>(m_activePerGpu.size()));
    if (!validIndex || m_activePerGpu[gpuIndex] <= 0) return;
    --m_activePerGpu[gpuIndex];
    fprintf(stderr, "[HWDecode] HWDecoderPool: released slot on GPU[%d] (%d/%d)\n",
            gpuIndex, m_activePerGpu[gpuIndex], m_maxPerGpu[gpuIndex]);
}
// Sum of the per-GPU session caps.
// NOTE(review): reads m_maxPerGpu without m_mutex — safe after configure()
// completes, racy while it runs.
int HWDecoderPool::getTotalMax() const {
    int sum = 0;
    for (size_t i = 0; i < m_maxPerGpu.size(); ++i) {
        sum += m_maxPerGpu[i];
    }
    return sum;
}
// Sum of currently active sessions across all GPUs.
// NOTE(review): unsynchronized read, same caveat as getTotalMax().
int HWDecoderPool::getTotalActive() const {
    int sum = 0;
    for (size_t i = 0; i < m_activePerGpu.size(); ++i) {
        sum += m_activePerGpu[i];
    }
    return sum;
}
// ---------------------------------------------------------------------------
// SharedHWDeviceCtx implementation
// ---------------------------------------------------------------------------
// Meyers singleton: one process-wide cache of HW device contexts.
SharedHWDeviceCtx& SharedHWDeviceCtx::instance() {
    static SharedHWDeviceCtx s_instance;
    return s_instance;
}
SharedHWDeviceCtx::~SharedHWDeviceCtx() {
    // Intentionally empty — do NOT release GPU/D3D11 resources here.
    // This destructor runs during DLL_PROCESS_DETACH while the OS loader
    // lock is held. Releasing D3D11/NVIDIA resources requires driver
    // worker threads that also need the loader lock → deadlock.
    // The OS reclaims all GPU resources when the process exits.
    // (Explicit cleanup, if ever needed before exit, goes through releaseAll().)
}
// Hand out a reference to the shared AVHWDeviceContext for gpuIndex, creating
// it on first use. Sharing one device context across decode sessions keeps
// per-session GPU memory overhead down (each CUDA context costs ~50-100MB;
// see NVDEC Programming Guide, "Multi-session decoding").
// Returns a new AVBufferRef the caller owns, or nullptr on creation failure.
AVBufferRef* SharedHWDeviceCtx::acquire(int gpuIndex, AVHWDeviceType type) {
    std::lock_guard<std::mutex> lock(m_mutex);
    if (gpuIndex < 0) gpuIndex = 0;
    // Make sure the cache covers this GPU index.
    if (static_cast<int>(m_cache.size()) <= gpuIndex) {
        m_cache.resize(gpuIndex + 1);
    }
    GpuCtx& slot = m_cache[gpuIndex];
    // Cache hit: same GPU, same device type → hand out another reference.
    if (slot.ctx != nullptr && slot.type == type) {
        fprintf(stderr, "[HWDecode] SharedHWDeviceCtx: reusing shared context for GPU[%d]\n", gpuIndex);
        return av_buffer_ref(slot.ctx);
    }
    // Device type changed for this GPU — drop the stale context first.
    if (slot.ctx != nullptr) {
        av_buffer_unref(&slot.ctx);
        slot.ctx = nullptr;
    }
    // The device string selects the adapter/GPU by ordinal.
    char adapterStr[16] = {};
    snprintf(adapterStr, sizeof(adapterStr), "%d", gpuIndex);
    const int rc = av_hwdevice_ctx_create(&slot.ctx, type, adapterStr, nullptr, 0);
    if (rc < 0) {
        char error_buf[AV_ERROR_MAX_STRING_SIZE];
        av_strerror(rc, error_buf, sizeof(error_buf));
        fprintf(stderr, "[HWDecode] SharedHWDeviceCtx: FAILED to create context for GPU[%d]: %s\n",
                gpuIndex, error_buf);
        slot.ctx = nullptr;
        return nullptr;
    }
    slot.type = type;
    fprintf(stderr, "[HWDecode] SharedHWDeviceCtx: created shared context for GPU[%d] type=%s\n",
            gpuIndex, av_hwdevice_get_type_name(type));
    // Return a new reference (caller owns it).
    return av_buffer_ref(slot.ctx);
}
// Drop every cached device-context reference and empty the cache.
// Callers must not invoke this during process teardown (see destructor note).
void SharedHWDeviceCtx::releaseAll() {
    std::lock_guard<std::mutex> lock(m_mutex);
    for (size_t i = 0; i < m_cache.size(); ++i) {
        if (m_cache[i].ctx != nullptr) {
            av_buffer_unref(&m_cache[i].ctx);
            m_cache[i].ctx = nullptr;
        }
    }
    m_cache.clear();
}
// C-style trampoline for AVCodecContext::get_format: ctx->opaque carries the
// CVideoDecoder instance (set in hwDecoderInit); delegate to the member
// implementation and return its chosen format.
enum AVPixelFormat getHWFormat(AVCodecContext* ctx, const enum AVPixelFormat* pix_fmts)
{
    AVPixelFormat chosen = AV_PIX_FMT_NONE;
    CVideoDecoder* decoder = (CVideoDecoder*)ctx->opaque;
    decoder->getHWFormat(ctx, pix_fmts, &chosen);
    return chosen;
}
// Pick full-range YUVJ420P from the decoder's candidate list when offered;
// otherwise keep the context's current pixel format.
enum AVPixelFormat get_hw_format(AVCodecContext* ctx, const enum AVPixelFormat* pix_fmts)
{
    const enum AVPixelFormat* candidate = pix_fmts;
    while (*candidate != -1) {
        if (AV_PIX_FMT_YUVJ420P == *candidate) {
            return AV_PIX_FMT_YUVJ420P;
        }
        ++candidate;
    }
    // If YUVJ420P is not available, fall back to default
    return ctx->pix_fmt;
}
// Construct an idle decoder: all flags cleared, all FFmpeg objects null.
// init() allocates and opens the actual codec.
CVideoDecoder::CVideoDecoder()
{
    // Flags / scalar state
    m_bInited = FALSE;
    m_bRunning = FALSE;
    m_bHardwareDecoderEnabled = FALSE;
    m_bCudaHWAccel = false;
    m_hwGpuIndex = -1;
    m_hwPixFmt = AV_PIX_FMT_NONE;
    // FFmpeg objects — owned, created in init()/hwDecoderInit()
    m_pCodec = nullptr;
    m_pContext = nullptr;
    m_pFrame = nullptr;
    m_pSoftFrame = nullptr;
    m_pCudaHWFrame = nullptr;
    m_pHWDeviceCtx = nullptr;
    // Client callback wiring
    m_pCallback = nullptr;
    m_pUserdata = nullptr;
}
// Ownership transfer: hand the held CUDA HW frame (may be null) to the caller
// and forget it, so the decoder will not free it later.
AVFrame* CVideoDecoder::takeCudaHWFrame() {
    std::lock_guard<std::recursive_mutex> lock(_mutex);
    AVFrame* handedOff = m_pCudaHWFrame;
    m_pCudaHWFrame = nullptr;
    return handedOff;
}
// Caller MUST already hold _mutex (called from decode thread's callback chain).
// Returns a clone so the original m_pCudaHWFrame stays valid for the decode
// loop; null when no HW frame is held.
AVFrame* CVideoDecoder::cloneCudaHWFrame_unlocked() {
    if (m_pCudaHWFrame == nullptr) {
        return nullptr;
    }
    return av_frame_clone(m_pCudaHWFrame);
}
CVideoDecoder::~CVideoDecoder()
{
    // Full teardown. Safe on a never-initialized decoder: uninit() checks
    // every pointer before freeing it.
    uninit();
}
// Tear down all decoder state: drain the codec, free frames, close and free
// the codec context, release the HW device reference and its pool/counter
// reservation. Idempotent; ordering below is deliberate (frames before
// context, context before HW device ref).
void CVideoDecoder::uninit()
{
    std::lock_guard<std::recursive_mutex> lock(_mutex);
    // [MEDIA_DecClose] heartbeat — paired with [MEDIA_DecInit] for leak diagnosis.
    // Pair count over a long run reveals whether avcodec_open2 calls are
    // matched by full teardowns. If close-count < init-count, the FFmpeg
    // codec context (and its custom get_buffer2 arena) is leaking per reopen.
    {
        static std::atomic<uint64_t> s_closeCount{0};
        const uint64_t n = s_closeCount.fetch_add(1) + 1;
        ANS_DBG("MEDIA_DecClose",
            "uninit ENTRY #%llu inited=%d codec=%s %dx%d hwEnabled=%d cudaHW=%d gpu=%d (this=%p)",
            (unsigned long long)n,
            (int)m_bInited,
            (m_pCodec && m_pCodec->name) ? m_pCodec->name : "?",
            m_pContext ? m_pContext->width : 0,
            m_pContext ? m_pContext->height : 0,
            (int)m_bHardwareDecoderEnabled,
            (int)m_bCudaHWAccel,
            m_hwGpuIndex,
            (void*)this);
    }
    // Stop processing first
    // Backup first — flush() below checks m_bRunning indirectly via decode paths;
    // the original running state is restored at the end.
    BOOL wasRunning = m_bRunning;
    m_bRunning = FALSE;
    flush();
    // FIXED: Clean up frames before context to avoid use-after-free
    if (m_pFrame)
    {
        av_frame_free(&m_pFrame);
        m_pFrame = NULL;
    }
    if (m_pSoftFrame)
    {
        av_frame_free(&m_pSoftFrame);
        m_pSoftFrame = NULL;
    }
    if (m_pCudaHWFrame)
    {
        av_frame_free(&m_pCudaHWFrame);
        m_pCudaHWFrame = NULL;
    }
    if (m_pContext)
    {
        // FIXED: Free extradata before freeing context
        // (extradata is set to NULL afterwards so avcodec_free_context does
        // not free it a second time).
        if (m_pContext->extradata) {
            av_free(m_pContext->extradata);
            m_pContext->extradata = NULL;
            m_pContext->extradata_size = 0;
        }
        // FIXED: Properly release hardware context reference
        // (this is the per-context ref created in hwDecoderInit; the shared
        // m_pHWDeviceCtx ref is released separately below).
        if (m_pContext->hw_device_ctx) {
            av_buffer_unref(&m_pContext->hw_device_ctx);
            m_pContext->hw_device_ctx = NULL;
        }
        // FIXED: Close codec before freeing context
        avcodec_close(m_pContext);
        avcodec_free_context(&m_pContext);
        m_pContext = NULL;
    }
    // Only decrement hardware decoder count if it was actually enabled
    if (m_pHWDeviceCtx && m_bHardwareDecoderEnabled)
    {
        av_buffer_unref(&m_pHWDeviceCtx);
        m_pHWDeviceCtx = NULL;
        // Release via per-GPU pool or legacy global counter
        HWDecoderPool& pool = HWDecoderPool::instance();
        if (pool.isConfigured() && m_hwGpuIndex >= 0) {
            pool.releaseSlot(m_hwGpuIndex);
        } else {
            CLock hw_lock(g_hw_decoder_mutex);
            if (g_hw_decoder_nums > 0) {
                g_hw_decoder_nums--;
            }
        }
        m_hwGpuIndex = -1;
        m_bHardwareDecoderEnabled = FALSE;
    }
    // Restore running state if needed
    m_bRunning = wasRunning;
    m_pCodec = NULL;
    m_bInited = FALSE;
}
// Allocate and open an FFmpeg decoder.
//   codec           - FFmpeg codec id (H.264/HEVC/MJPEG/MPEG-4 get size probing)
//   extradata/_size - optional codec private data; parsed for width/height and
//                     copied (padded) into the codec context
//   hwMode          - HW_DECODING_* selector; HW_DECODING_DISABLE forces SW
//   preferredGpu    - GPU hint forwarded to hwDecoderInit (pool path only)
// Returns TRUE on success. Every failure path calls uninit() so partial state
// never leaks. Note: this overload does NOT set m_bRunning; the int-codec
// overload below does that on success.
BOOL CVideoDecoder::init(enum AVCodecID codec, uint8* extradata, int extradata_size, int hwMode, int preferredGpu)
{
    std::lock_guard<std::recursive_mutex> lock(_mutex);
    // Clean up any existing state
    if (m_bInited) {
        uninit();
    }
    int width = 0;
    int height = 0;
    // Probe the coded frame size from extradata so the context has dimensions
    // before avcodec_open2 (useful for HW decoder surface sizing).
    if (extradata && extradata_size > 0)
    {
        int vcodec = VIDEO_CODEC_NONE;
        if (AV_CODEC_ID_H264 == codec)
        {
            vcodec = VIDEO_CODEC_H264;
        }
        else if (AV_CODEC_ID_HEVC == codec)
        {
            vcodec = VIDEO_CODEC_H265;
        }
        else if (AV_CODEC_ID_MJPEG == codec)
        {
            vcodec = VIDEO_CODEC_JPEG;
        }
        else if (AV_CODEC_ID_MPEG4 == codec)
        {
            vcodec = VIDEO_CODEC_MP4;
        }
        avc_parse_video_size(vcodec, extradata, extradata_size, &width, &height);
    }
#ifdef ANDROID
    // Android: prefer MediaCodec HW decoders for streams at or above QVGA;
    // fall back to the generic SW decoder when unavailable.
    if (HW_DECODING_DISABLE != hwMode && width * height >= 320 * 240)
    {
        if (AV_CODEC_ID_H264 == codec)
        {
            m_pCodec = avcodec_find_decoder_by_name("h264_mediacodec");
        }
        else if (AV_CODEC_ID_HEVC == codec)
        {
            m_pCodec = avcodec_find_decoder_by_name("hevc_mediacodec");
        }
        else if (AV_CODEC_ID_MPEG4 == codec)
        {
            m_pCodec = avcodec_find_decoder_by_name("mpeg4_mediacodec");
        }
    }
    if (NULL == m_pCodec)
    {
        m_pCodec = avcodec_find_decoder(codec);
    }
#else
    m_pCodec = avcodec_find_decoder(codec);
#endif
    if (NULL == m_pCodec)
    {
        log_print(HT_LOG_ERR, "%s, m_pCodec is NULL for codec %d\r\n", __FUNCTION__, codec);
        return FALSE;
    }
    m_pContext = avcodec_alloc_context3(m_pCodec);
    if (NULL == m_pContext)
    {
        log_print(HT_LOG_ERR, "%s, avcodec_alloc_context3 failed\r\n", __FUNCTION__);
        return FALSE;
    }
    m_pContext->width = width;
    m_pContext->height = height;
    // Low-latency live-stream tuning: no output delay, tolerate and emit
    // corrupt frames rather than stalling.
    m_pContext->flags |= AV_CODEC_FLAG_LOW_DELAY;
    m_pContext->flags2 |= AV_CODEC_FLAG2_FAST;
    m_pContext->flags |= AV_CODEC_FLAG_OUTPUT_CORRUPT;
    m_pContext->err_recognition = AV_EF_IGNORE_ERR;
    av_opt_set_int(m_pContext, "refcounted_frames", 1, 0);
    // Initialize hardware decoder
    if (HW_DECODING_DISABLE != hwMode) {
        int hw_ret = hwDecoderInit(m_pContext, hwMode, preferredGpu);
        if (hw_ret < 0) {
            log_print(HT_LOG_WARN, "%s, hwDecoderInit failed with error %d, falling back to software decoding\r\n", __FUNCTION__, hw_ret);
        }
    }
    // Handle extradata
    if (extradata && extradata_size > 0)
    {
        // Padded allocation per FFmpeg's extradata contract.
        int size = extradata_size + AV_INPUT_BUFFER_PADDING_SIZE;
        m_pContext->extradata = (uint8*)av_mallocz(size);
        if (m_pContext->extradata)
        {
            m_pContext->extradata_size = extradata_size;
            memcpy(m_pContext->extradata, extradata, extradata_size);
        }
        else
        {
            log_print(HT_LOG_ERR, "%s, Failed to allocate extradata\r\n", __FUNCTION__);
            uninit(); // FIXED: Clean up on failure
            return FALSE;
        }
    }
    // Configure multi-threading for the SOFTWARE decoder.
    // Hardware decoders (NVDEC, DXVA2/D3D11VA, QSV, VideoToolbox) do their
    // own parallelism inside the GPU/fixed-function block and ignore these
    // fields — so we only enable threading when HW init was skipped (hwMode
    // == HW_DECODING_DISABLE) or failed (fell back to SW).
    //
    // Without this, libavcodec's HEVC/H.264 decoder runs on a single core,
    // which on 4K HEVC streams is ~80–120 ms per frame. Frame + slice
    // threading on a 24-thread CPU typically brings that down to 10–20 ms.
    // thread_count = 0 lets FFmpeg auto-pick (capped internally ~16).
    if (!m_bHardwareDecoderEnabled) {
        m_pContext->thread_count = 0;
        m_pContext->thread_type = FF_THREAD_FRAME | FF_THREAD_SLICE;
        // Install contiguous Y+U+V allocator. This packs all three planes
        // into a single av_malloc block so the BGR-conversion fast path
        // (avframeYUV420PToCvMat) can either wrap the frame zero-copy, or
        // at minimum hit a tight 3-call bulk memcpy with good cache locality
        // instead of per-row copies into a freshly allocated staging Mat.
        // HW decoders must NEVER have get_buffer2 overridden — they use
        // hw_frames_ctx for surface management.
        m_pContext->get_buffer2 = &CVideoDecoder::contiguousGetBuffer2;
    }
    // FIXED: Use avcodec_open2 instead of avcodec_thread_open
    if (avcodec_open2(m_pContext, m_pCodec, NULL) < 0)
    {
        log_print(HT_LOG_ERR, "%s, avcodec_open2 failed\r\n", __FUNCTION__);
        uninit(); // FIXED: Clean up on failure
        return FALSE;
    }
    // Debug: one-shot visibility into which decoder actually got opened.
    // m_bHardwareDecoderEnabled is set by hwDecoderInit() on success; when
    // hwMode == HW_DECODING_DISABLE or hwDecoderInit failed, it stays FALSE
    // and the SW decoder (avcodec_find_decoder) is used.
    // active_thread_type is what FFmpeg actually negotiated after open2
    // (bit 1 = FF_THREAD_FRAME, bit 2 = FF_THREAD_SLICE).
    ANS_DBG("MEDIA_DecInit",
        "avcodec_open2 OK codec=%s(%s) %dx%d hwMode=%d hwEnabled=%d cudaHW=%d gpu=%d "
        "threads=%d thread_type_req=0x%x active=0x%x -> %s decoder",
        m_pCodec->name ? m_pCodec->name : "?",
        m_pCodec->long_name ? m_pCodec->long_name : "?",
        m_pContext->width, m_pContext->height,
        hwMode,
        (int)m_bHardwareDecoderEnabled,
        (int)m_bCudaHWAccel,
        m_hwGpuIndex,
        m_pContext->thread_count,
        m_pContext->thread_type,
        m_pContext->active_thread_type,
        m_bHardwareDecoderEnabled ? "HARDWARE" : "SOFTWARE");
    // Working frames: m_pFrame receives decoder output; m_pSoftFrame is the
    // system-memory destination for HW-frame transfers (see readFrame()).
    m_pFrame = av_frame_alloc();
    if (NULL == m_pFrame)
    {
        log_print(HT_LOG_ERR, "%s, av_frame_alloc failed for m_pFrame\r\n", __FUNCTION__);
        uninit(); // FIXED: Clean up on failure
        return FALSE;
    }
    m_pSoftFrame = av_frame_alloc();
    if (NULL == m_pSoftFrame)
    {
        log_print(HT_LOG_ERR, "%s, av_frame_alloc failed for m_pSoftFrame\r\n", __FUNCTION__);
        uninit(); // FIXED: Clean up on failure
        return FALSE;
    }
    m_bInited = TRUE;
    //m_bRunning = TRUE;
    return TRUE;
}
// Convenience overload: map the project codec id to an FFmpeg id, delegate,
// and start the decoder only when initialization succeeded.
BOOL CVideoDecoder::init(int codec, uint8* extradata, int extradata_size, int hwMode, int preferredGpu)
{
    const BOOL ok = init(to_video_avcodecid(codec), extradata, extradata_size, hwMode, preferredGpu);
    if (ok) {
        m_bRunning = TRUE; // Set running only if initialization succeeded
    }
    return ok;
}
// Set up hardware-accelerated decoding on `ctx`.
//   hwMode       - HW_DECODING_* selector (HW_DECODING_DISABLE → no-op, 0)
//   preferredGpu - GPU hint for the per-GPU pool path
// On success returns 0 with m_bHardwareDecoderEnabled set and a session slot
// reserved (pool or legacy counter). On any failure returns -1 with the
// reservation released, so callers can fall back to software decoding.
int CVideoDecoder::hwDecoderInit(AVCodecContext* ctx, int hwMode, int preferredGpu) {
    std::string hwtype;
    enum AVHWDeviceType type = AV_HWDEVICE_TYPE_NONE;
    if (hwMode == HW_DECODING_DISABLE) {
        return 0; // Hardware decoding is disabled
    }
    // -- Per-GPU pool path (preferred) or legacy global path ----------------
    int assignedGpu = -1;
    bool legacyReserved = false;
    HWDecoderPool& pool = HWDecoderPool::instance();
    if (pool.isConfigured()) {
        // Per-GPU: acquire a slot, preferring the caller's requested GPU
        // (e.g. inference GPU for NV12 zero-copy alignment)
        assignedGpu = pool.acquireSlot(preferredGpu);
        if (assignedGpu < 0) {
            log_print(HT_LOG_WARN, "%s, All GPU HW decoder slots full (%d/%d total)\r\n",
                __FUNCTION__, pool.getTotalActive(), pool.getTotalMax());
            return -1;
        }
    } else {
        // Legacy: single global counter.
        // FIXED: reserve the slot (increment) under the SAME lock acquisition
        // as the capacity check. Previously the check ran here but the
        // increment happened at the end of this function under a second lock,
        // so two threads could both pass the check and exceed g_hw_decoder_max.
        CLock lock(g_hw_decoder_mutex);
        if (g_hw_decoder_max > 0 && g_hw_decoder_nums >= g_hw_decoder_max) {
            log_print(HT_LOG_WARN, "%s, Maximum number of hardware decoders reached (%d/%d)\r\n",
                __FUNCTION__, g_hw_decoder_nums, g_hw_decoder_max);
            return -1;
        }
        g_hw_decoder_nums++;
        legacyReserved = true;
    }
    // Undo whichever reservation was made above; called on every failure path.
    auto releaseReservation = [&]() {
        if (assignedGpu >= 0) {
            pool.releaseSlot(assignedGpu);
        } else if (legacyReserved) {
            CLock lock(g_hw_decoder_mutex);
            if (g_hw_decoder_nums > 0) {
                g_hw_decoder_nums--;
            }
        }
    };
    // Determine the hardware type based on platform and hardware mode
    if (!getHardwareTypeForPlatform(hwMode, hwtype)) {
        log_print(HT_LOG_WARN, "%s, Unsupported hardware mode %d for the current platform\r\n", __FUNCTION__, hwMode);
        releaseReservation();
        return -1;
    }
    // Find the hardware device type by name
    type = av_hwdevice_find_type_by_name(hwtype.c_str());
    if (type == AV_HWDEVICE_TYPE_NONE) {
        log_print(HT_LOG_WARN, "%s, Hardware device type %s is not supported\r\n", __FUNCTION__, hwtype.c_str());
        logSupportedHwTypes();
        releaseReservation();
        return -1;
    }
    // Find a hardware configuration that supports the specified device type
    // (also records the HW pixel format in m_hwPixFmt for getHWFormat).
    if (!findHwConfigForDeviceType(type)) {
        log_print(HT_LOG_WARN, "%s, Decoder %s does not support the specified hardware device type %s\r\n",
            __FUNCTION__, m_pCodec->long_name, av_hwdevice_get_type_name(type));
        releaseReservation();
        return -1;
    }
    // Get or create a shared HW device context for this GPU.
    // NVIDIA recommends sharing CUDA contexts across decode sessions to minimize
    // GPU memory overhead (each CUDA context costs ~50-100MB).
    // See: NVDEC Video Decoder API Programming Guide, Section "Multi-session decoding"
    int gpuIdx = (assignedGpu >= 0) ? assignedGpu : 0;
    m_pHWDeviceCtx = SharedHWDeviceCtx::instance().acquire(gpuIdx, type);
    if (!m_pHWDeviceCtx) {
        log_print(HT_LOG_ERR, "%s, Failed to acquire shared HW device context, type=%s, gpu=%d\r\n",
            __FUNCTION__, av_hwdevice_get_type_name(type), gpuIdx);
        releaseReservation();
        return -1;
    }
    // Configure the codec context to use the shared hardware device.
    // opaque carries `this` so the ::getHWFormat trampoline can reach us.
    ctx->opaque = this;
    ctx->get_format = ::getHWFormat;
    ctx->hw_device_ctx = av_buffer_ref(m_pHWDeviceCtx);
    ctx->err_recognition = AV_EF_IGNORE_ERR;
    ctx->flags2 |= AV_CODEC_FLAG2_EXPORT_MVS;
    // Reserve extra NVDEC surfaces for application-held av_frame_clone() references.
    // The clone chain (decoder → player → registry) holds ~2 surfaces simultaneously
    // (decoder's clone + registry's clone; getCudaHWFrame uses ownership transfer).
    // Without this, the default pool (num_decode_surfaces + 2) can run out under
    // load with many concurrent streams, causing the decoder to stall.
    ctx->extra_hw_frames = 2;
    // Track which GPU this decoder is on
    m_hwGpuIndex = assignedGpu;
    m_bHardwareDecoderEnabled = TRUE;
    m_bCudaHWAccel = (type == AV_HWDEVICE_TYPE_CUDA);
    // NOTE: the legacy counter was already incremented at reservation time;
    // uninit() decrements it (or releases the pool slot) on teardown.
    log_print(HT_LOG_INFO, "%s, Successfully initialized hardware decoder %s on GPU[%d] (%d/%d)\r\n",
        __FUNCTION__, av_hwdevice_get_type_name(type),
        gpuIdx,
        pool.isConfigured() ? pool.getTotalActive() : g_hw_decoder_nums,
        pool.isConfigured() ? pool.getTotalMax() : g_hw_decoder_max);
    return 0;
}
// Resume decoding: discard any stale buffered codec state, then raise the
// running flag so decode()/render() accept work again.
void CVideoDecoder::Start() {
    std::lock_guard<std::recursive_mutex> lock(_mutex);
    if (m_pContext != nullptr) {
        avcodec_flush_buffers(m_pContext);
    }
    m_bRunning = TRUE;
    log_print(HT_LOG_INFO, "%s, Video decoder started\r\n", __FUNCTION__);
}
// Signal the decode loop to stop. Deliberately lock-free: see below.
void CVideoDecoder::Stop() {
    // Atomically signal the decoder to stop WITHOUT acquiring _mutex.
    // decode() holds _mutex while inside avcodec_send_packet / CUDA calls
    // that can block on the nvcuda64 SRW lock for a long time.
    // If we waited for _mutex here, Stop() would deadlock whenever a
    // concurrent decode() is stuck waiting for a CUDA operation held by
    // an inference thread.
    // (m_bRunning is an atomic — the .store below relies on that.)
    m_bRunning.store(FALSE, std::memory_order_release);
    log_print(HT_LOG_INFO, "%s, Video decoder stopped\r\n", __FUNCTION__);
}
// Log all supported hardware types
void CVideoDecoder::logSupportedHwTypes() {
enum AVHWDeviceType type = AV_HWDEVICE_TYPE_NONE;
log_print(HT_LOG_INFO, "%s, Available hardware device types:\r\n", __FUNCTION__);
while ((type = av_hwdevice_iterate_types(type)) != AV_HWDEVICE_TYPE_NONE) {
log_print(HT_LOG_INFO, "%s, - %s\r\n", __FUNCTION__, av_hwdevice_get_type_name(type));
}
}
// Platform-specific function to determine the hardware type based on the mode
bool CVideoDecoder::getHardwareTypeForPlatform(int hwMode, std::string& hwtype) {
#if __WINDOWS_OS__
switch (hwMode) {
case HW_DECODING_D3D11: hwtype = "d3d11va"; break;
case HW_DECODING_DXVA: hwtype = "dxva2"; break;
case HW_DECODING_CUDA: hwtype = "cuda"; break;
case HW_DECODING_AUTO:
hwtype = "cuda";
if (av_hwdevice_find_type_by_name(hwtype.c_str()) == AV_HWDEVICE_TYPE_NONE) {
hwtype = "d3d11va";
if (av_hwdevice_find_type_by_name(hwtype.c_str()) == AV_HWDEVICE_TYPE_NONE) {
hwtype = "dxva2";
}
}
break;
default: return false;
}
#elif defined(IOS)
switch (hwMode) {
case HW_DECODING_VIDEOTOOLBOX: hwtype = "videotoolbox"; break;
case HW_DECODING_OPENCL: hwtype = "opencl"; break;
case HW_DECODING_AUTO:
hwtype = "videotoolbox";
if (av_hwdevice_find_type_by_name(hwtype.c_str()) == AV_HWDEVICE_TYPE_NONE) {
hwtype = "opencl";
}
break;
default: return false;
}
#elif defined(ANDROID)
if (hwMode == HW_DECODING_MEDIACODEC || hwMode == HW_DECODING_AUTO) {
hwtype = "mediacodec";
}
else {
return false;
}
#elif __LINUX_OS__
switch (hwMode) {
case HW_DECODING_VAAPI: hwtype = "vaapi"; break;
case HW_DECODING_OPENCL: hwtype = "opencl"; break;
case HW_DECODING_AUTO:
hwtype = "vaapi";
if (av_hwdevice_find_type_by_name(hwtype.c_str()) == AV_HWDEVICE_TYPE_NONE) {
hwtype = "opencl";
}
break;
default: return false;
}
#else
return false; // Unsupported platform
#endif
return true;
}
// Find a hardware configuration that matches the specified device type
bool CVideoDecoder::findHwConfigForDeviceType(AVHWDeviceType type) {
for (int i = 0;; i++) {
const AVCodecHWConfig* config = avcodec_get_hw_config(m_pCodec, i);
if (!config) {
return false; // No matching hardware configuration found
}
if (config->methods & AV_CODEC_HW_CONFIG_METHOD_HW_DEVICE_CTX && config->device_type == type) {
m_hwPixFmt = config->pix_fmt;
return true;
}
}
}
// Choose a pixel format from the decoder's candidate list, in priority order:
//   1. the HW surface format recorded by findHwConfigForDeviceType,
//   2. any full-range YUVJ planar format,
//   3. any limited-range YUV planar format,
//   4. the first offered format.
// Writes the choice to *dst and returns TRUE; FALSE only when the list is empty.
BOOL CVideoDecoder::getHWFormat(AVCodecContext* ctx, const AVPixelFormat* pix_fmts, AVPixelFormat* dst)
{
    *dst = AV_PIX_FMT_NONE;
    // Scan the terminator-delimited candidate list for the first format the
    // predicate accepts; store it in *dst on a hit.
    auto scan = [&](auto accept) -> BOOL {
        for (const AVPixelFormat* p = pix_fmts; *p != -1; ++p) {
            if (accept(*p)) {
                *dst = *p;
                return TRUE;
            }
        }
        return FALSE;
    };
    // First, attempt to use the hardware pixel format if available.
    if (scan([this](AVPixelFormat f) { return f == m_hwPixFmt; })) {
        return TRUE;
    }
    // If the hardware format is not offered, fall back to YUVJ planar.
    if (scan([](AVPixelFormat f) {
            return f == AV_PIX_FMT_YUVJ420P ||
                   f == AV_PIX_FMT_YUVJ422P ||
                   f == AV_PIX_FMT_YUVJ444P;
        })) {
        return TRUE;
    }
    // Then limited-range YUV planar.
    if (scan([](AVPixelFormat f) {
            return f == AV_PIX_FMT_YUV420P ||
                   f == AV_PIX_FMT_YUV422P ||
                   f == AV_PIX_FMT_YUV444P;
        })) {
        return TRUE;
    }
    // As a last resort, accept whatever the decoder lists first.
    if (*pix_fmts != -1) {
        *dst = *pix_fmts;
        return TRUE;
    }
    log_print(HT_LOG_ERR, "%s, Failed to get HW surface format\r\n", __FUNCTION__);
    return FALSE;
}
// Coded frame width, or 0 before init().
int CVideoDecoder::getWidth()
{
    std::lock_guard<std::recursive_mutex> lock(_mutex);
    return m_pContext ? m_pContext->width : 0;
}
// Coded frame height, or 0 before init().
int CVideoDecoder::getHeight()
{
    std::lock_guard<std::recursive_mutex> lock(_mutex);
    return m_pContext ? m_pContext->height : 0;
}
double CVideoDecoder::getFrameRate()
{
std::lock_guard<std::recursive_mutex> lock(_mutex);
if (m_pContext)
{
if (m_pContext->framerate.den > 0)
{
return (double)((double)(m_pContext->framerate.num) / m_pContext->framerate.den);
}
}
return 0;
}
// Feed one packet to the decoder and drain whatever frames it produces.
// When the codec's input queue is full (EAGAIN), drain via readFrame() and
// retry the send up to maxRetries times. Returns TRUE when the packet was
// accepted (or EOF) and the drain succeeded.
BOOL CVideoDecoder::decode(AVPacket* pkt)
{
    std::lock_guard<std::recursive_mutex> lock(_mutex);
    if (!m_bInited) {
        log_print(HT_LOG_ERR, "%s, Decoder not initialized\r\n", __FUNCTION__);
        return FALSE;
    }
    if (!m_bRunning) {
        log_print(HT_LOG_WARN, "%s, Decoder not running\r\n", __FUNCTION__);
        return FALSE;
    }
    if (!m_pContext) {
        log_print(HT_LOG_ERR, "%s, Context is NULL\r\n", __FUNCTION__);
        return FALSE;
    }
    const int maxRetries = 3;
    // Send, and on EAGAIN alternate drain/resend (same send→drain→send
    // sequence as a classic send/receive loop).
    int ret = avcodec_send_packet(m_pContext, pkt);
    for (int attempt = 0; ret == AVERROR(EAGAIN) && attempt < maxRetries; ++attempt) {
        if (!readFrame()) {
            log_print(HT_LOG_ERR, "%s, Failed to read frame during retry %d\r\n", __FUNCTION__, attempt);
            return FALSE;
        }
        Sleep(1); // Reduced sleep time
        ret = avcodec_send_packet(m_pContext, pkt);
    }
    // Any error other than EOF is fatal for this packet.
    if (ret < 0 && ret != AVERROR_EOF) {
        char error_buf[AV_ERROR_MAX_STRING_SIZE];
        av_strerror(ret, error_buf, sizeof(error_buf));
        log_print(HT_LOG_ERR, "%s, avcodec_send_packet failed: %s (ret=%d)\r\n", __FUNCTION__, error_buf, ret);
        return FALSE;
    }
    // If the packet was successfully sent, proceed to read frame
    return readFrame();
}
// Convenience wrapper: wrap a raw bitstream buffer in an AVPacket (with
// FFmpeg-managed, properly padded storage), stamp pts/dts, and decode it.
BOOL CVideoDecoder::decode(uint8* data, int len, int64_t pts)
{
    std::lock_guard<std::recursive_mutex> lock(_mutex);
    if (!m_bInited) {
        log_print(HT_LOG_ERR, "%s, Decoder not initialized\r\n", __FUNCTION__);
        return FALSE;
    }
    if (!m_bRunning) {
        log_print(HT_LOG_WARN, "%s, Decoder not running\r\n", __FUNCTION__);
        return FALSE;
    }
    if (!data || len <= 0) {
        log_print(HT_LOG_ERR, "%s, Invalid input data\r\n", __FUNCTION__);
        return FALSE;
    }
    AVPacket* packet = av_packet_alloc();
    if (!packet) {
        log_print(HT_LOG_ERR, "%s, Failed to allocate AVPacket\r\n", __FUNCTION__);
        return FALSE;
    }
    // av_new_packet allocates len bytes plus the required input padding.
    const int rc = av_new_packet(packet, len);
    if (rc < 0) {
        char error_buf[AV_ERROR_MAX_STRING_SIZE];
        av_strerror(rc, error_buf, sizeof(error_buf));
        log_print(HT_LOG_ERR, "%s, Failed to allocate packet data: %s\r\n", __FUNCTION__, error_buf);
        av_packet_free(&packet);
        return FALSE;
    }
    memcpy(packet->data, data, len);
    // No separate dts available from this API — reuse pts.
    packet->pts = pts;
    packet->dts = pts;
    const BOOL result = decode(packet);
    // av_packet_free releases both the packet and its data buffer.
    av_packet_free(&packet);
    return result;
}
// Drain every frame currently available from the codec, pushing each through
// render(). For HW frames: optionally clone the CUDA surface for zero-copy
// inference, then transfer to system memory for display.
// Returns TRUE when the drain ended cleanly (EAGAIN with at least one frame
// processed, or EOF); FALSE on a decode error or EAGAIN with zero frames.
// NOTE(review): callers (decode retry loop) treat the zero-frame EAGAIN
// FALSE as a failure — appears intentional; confirm.
BOOL CVideoDecoder::readFrame()
{
    int ret = 0;
    AVFrame* tmp_frame = NULL;
    BOOL frame_processed = FALSE;
    while (ret >= 0)
    {
        ret = avcodec_receive_frame(m_pContext, m_pFrame);
        if (ret == AVERROR(EAGAIN)) {
            // Need more input data
            return frame_processed ? TRUE : FALSE;
        }
        else if (ret == AVERROR_EOF) {
            // End of stream
            return TRUE;
        }
        else if (ret < 0) {
            char error_buf[AV_ERROR_MAX_STRING_SIZE];
            av_strerror(ret, error_buf, sizeof(error_buf));
            log_print(HT_LOG_ERR, "%s, avcodec_receive_frame failed: %s (ret=%d)\r\n", __FUNCTION__, error_buf, ret);
            return FALSE;
        }
        // Check if we got a valid frame
        if (!m_pFrame || m_pFrame->width <= 0 || m_pFrame->height <= 0) {
            log_print(HT_LOG_WARN, "%s, Received invalid frame\r\n", __FUNCTION__);
            av_frame_unref(m_pFrame);
            continue;
        }
        if (m_pFrame->format == m_hwPixFmt)
        {
            // CUDA HW accel: clone the HW frame BEFORE transfer so inference
            // can use CUDA device pointers directly (zero-copy, no upload).
            // Any previously held clone is replaced (freed) here.
            if (m_bCudaHWAccel) {
                if (m_pCudaHWFrame) av_frame_free(&m_pCudaHWFrame);
                m_pCudaHWFrame = av_frame_clone(m_pFrame);
            }
            // FIXED: Ensure m_pSoftFrame is properly initialized before transfer
            av_frame_unref(m_pSoftFrame); // Clear any previous data
            // Hardware frame - transfer to software (needed for display)
            ret = av_hwframe_transfer_data(m_pSoftFrame, m_pFrame, 0);
            if (ret < 0)
            {
                char error_buf[AV_ERROR_MAX_STRING_SIZE];
                av_strerror(ret, error_buf, sizeof(error_buf));
                log_print(HT_LOG_ERR, "%s, Error transferring hardware frame to system memory: %s (ret=%d)\r\n",
                    __FUNCTION__, error_buf, ret);
                av_frame_unref(m_pFrame);
                // Drop the clone too — its surface belongs to the failed frame.
                if (m_pCudaHWFrame) av_frame_free(&m_pCudaHWFrame);
                continue;
            }
            // Copy timing information (transfer does not carry timestamps)
            m_pSoftFrame->pts = m_pFrame->pts;
            m_pSoftFrame->pkt_dts = m_pFrame->pkt_dts;
            m_pSoftFrame->best_effort_timestamp = m_pFrame->best_effort_timestamp;
            tmp_frame = m_pSoftFrame;
        }
        else
        {
            // Software frame - use directly
            tmp_frame = m_pFrame;
        }
        // Render the frame
        if (tmp_frame) {
            render(tmp_frame);
            frame_processed = TRUE;
        }
        // FIXED: Ensure proper cleanup of frame references
        // (unref AFTER render — the callback must not retain tmp_frame
        // without cloning it.)
        if (tmp_frame == m_pSoftFrame) {
            av_frame_unref(m_pSoftFrame);
        }
        av_frame_unref(m_pFrame);
    }
    return TRUE;
}
// Deliver a decoded frame to the registered callback.
// Returns 1 on delivery (or when no callback is registered), 0 when stopped,
// given a null frame, or the callback throws.
int CVideoDecoder::render(AVFrame* frame)
{
    if (!frame || !m_bRunning) {
        return 0;
    }
    if (!m_pCallback) {
        return 1;
    }
    try {
        m_pCallback(frame, m_pUserdata);
    }
    catch (...) {
        // Callbacks come from client code — never let an exception escape
        // into the decode loop.
        log_print(HT_LOG_ERR, "%s, Exception in callback function\r\n", __FUNCTION__);
        return 0;
    }
    return 1;
}
// Drain frames buffered inside the codec, then reset its internal state.
// No-op for codecs without AV_CODEC_CAP_DELAY (nothing can be buffered).
// After the NULL-packet drain, avcodec_flush_buffers is required so the
// context accepts new packets again (send_packet returns AVERROR_EOF while
// in draining mode).
void CVideoDecoder::flush()
{
    std::lock_guard<std::recursive_mutex> lock(_mutex);
    if (NULL == m_pContext ||
        NULL == m_pContext->codec ||
        !(m_pContext->codec->capabilities & AV_CODEC_CAP_DELAY))
    {
        return;
    }
    log_print(HT_LOG_INFO, "%s, Flushing decoder buffers\r\n", __FUNCTION__);
    // Send NULL packet to enter draining mode
    avcodec_send_packet(m_pContext, NULL);
    // FIXED: m_pFrame can be NULL when flush() runs from uninit() after a
    // partially failed init() (context opened, but av_frame_alloc for
    // m_pFrame failed). Draining into a NULL frame would crash inside
    // libavcodec — skip straight to the buffer reset in that case.
    if (m_pFrame) {
        // Drain and discard all remaining frames
        while (true) {
            int ret = avcodec_receive_frame(m_pContext, m_pFrame);
            if (ret == AVERROR_EOF || ret == AVERROR(EAGAIN)) {
                break;
            }
            if (ret < 0) {
                char error_buf[AV_ERROR_MAX_STRING_SIZE];
                av_strerror(ret, error_buf, sizeof(error_buf));
                log_print(HT_LOG_WARN, "%s, Error during flush: %s\r\n", __FUNCTION__, error_buf);
                break;
            }
            av_frame_unref(m_pFrame);
        }
    }
    // Reset codec state so new packets are accepted after draining.
    // (m_pContext is known non-NULL here — checked at entry.)
    avcodec_flush_buffers(m_pContext);
}