#include "video_decoder.h"
#include "avcodec_mutex.h"
#include "lock.h"
#include "media_codec.h"
#include "media_parse.h"
// NOTE(review): the original text contained a sixth "#include" whose header
// name was lost in a whitespace-mangled paste — confirm against the repo.

// Legacy global hardware-decoder accounting, used only when HWDecoderPool
// has not been configured (see hwDecoderInit).
uint32 g_hw_decoder_nums = 0;
uint32 g_hw_decoder_max = 4; // Hardware decoding resources are limited, Limit up to 4 hardware decoding sessions
void* g_hw_decoder_mutex = sys_os_create_mutex();

// ---------------------------------------------------------------------------
// HWDecoderPool implementation
// ---------------------------------------------------------------------------

// Meyers-singleton accessor: one pool per process.
HWDecoderPool& HWDecoderPool::instance()
{
    static HWDecoderPool pool;
    return pool;
}

// Configure every GPU with the same per-GPU session limit.
void HWDecoderPool::configure(int numGpus, int maxPerGpu)
{
    // Uniform limit: same max for all GPUs
    // FIXED: restore template argument lost in the mangled source
    // (bare "std::vector limits" does not name a type pre-CTAD and is unclear even with it).
    std::vector<int> limits(numGpus, maxPerGpu);
    configure(limits);
}

// Configure per-GPU session limits, reset all active counters, and mirror the
// total into the legacy g_hw_decoder_max global.
void HWDecoderPool::configure(const std::vector<int>& maxPerGpuList)
{
    std::lock_guard lock(m_mutex);

    m_maxPerGpu = maxPerGpuList;
    m_activePerGpu.assign(maxPerGpuList.size(), 0);
    m_configured = true;

    // Also update legacy global for backward compatibility
    int total = 0;
    for (int m : m_maxPerGpu) total += m;
    g_hw_decoder_max = static_cast<uint32>(total); // FIXED: static_cast had no type argument

    for (int i = 0; i < static_cast<int>(m_maxPerGpu.size()); ++i) {
        fprintf(stderr, "[HWDecode] HWDecoderPool: GPU[%d] max=%d sessions\n", i, m_maxPerGpu[i]);
    }
    fprintf(stderr, "[HWDecode] HWDecoderPool: configured %d GPU(s), %d total sessions\n",
            static_cast<int>(m_maxPerGpu.size()), total);
}

// NOTE(review): read without holding m_mutex — assumes m_configured is only
// flipped once during startup; confirm callers tolerate a stale read.
bool HWDecoderPool::isConfigured() const
{
    return m_configured;
}

// Acquire one decode slot. Tries preferredGpu first (to co-locate with the
// inference GPU for NV12 zero-copy), then falls back to the least-loaded GPU
// with spare capacity. Returns the GPU index, or -1 if all GPUs are full.
int HWDecoderPool::acquireSlot(int preferredGpu)
{
    std::lock_guard lock(m_mutex);

    if (!m_configured || m_activePerGpu.empty()) return -1;

    // If caller requested a specific GPU (e.g. to match inference GPU for NV12
    // zero-copy), try that GPU first. This avoids cross-GPU device pointer
    // access which causes "illegal memory access" sticky CUDA errors.
    if (preferredGpu >= 0 && preferredGpu < static_cast<int>(m_activePerGpu.size())) {
        if (m_activePerGpu[preferredGpu] < m_maxPerGpu[preferredGpu]) {
            m_activePerGpu[preferredGpu]++;
            fprintf(stderr, "[HWDecode] HWDecoderPool: acquired slot on PREFERRED GPU[%d] (%d/%d)\n",
                    preferredGpu, m_activePerGpu[preferredGpu], m_maxPerGpu[preferredGpu]);
            return preferredGpu;
        }
        fprintf(stderr, "[HWDecode] HWDecoderPool: preferred GPU[%d] at capacity (%d/%d), falling back to least-loaded\n",
                preferredGpu, m_activePerGpu[preferredGpu], m_maxPerGpu[preferredGpu]);
    }

    // Fallback: find the GPU with the fewest active sessions that still has capacity
    int bestGpu = -1;
    int bestCount = INT_MAX;
    for (int i = 0; i < static_cast<int>(m_activePerGpu.size()); ++i) {
        if (m_activePerGpu[i] < m_maxPerGpu[i] && m_activePerGpu[i] < bestCount) {
            bestCount = m_activePerGpu[i];
            bestGpu = i;
        }
    }

    if (bestGpu >= 0) {
        m_activePerGpu[bestGpu]++;
        fprintf(stderr, "[HWDecode] HWDecoderPool: acquired slot on GPU[%d] (%d/%d)\n",
                bestGpu, m_activePerGpu[bestGpu], m_maxPerGpu[bestGpu]);
    }
    return bestGpu;
}

// Return a previously acquired slot. Safe to call with an out-of-range or
// negative index (no-op); the counter never goes below zero.
void HWDecoderPool::releaseSlot(int gpuIndex)
{
    std::lock_guard lock(m_mutex);

    if (!m_configured) return;

    if (gpuIndex >= 0 && gpuIndex < static_cast<int>(m_activePerGpu.size())) {
        if (m_activePerGpu[gpuIndex] > 0) {
            m_activePerGpu[gpuIndex]--;
            fprintf(stderr, "[HWDecode] HWDecoderPool: released slot on GPU[%d] (%d/%d)\n",
                    gpuIndex, m_activePerGpu[gpuIndex], m_maxPerGpu[gpuIndex]);
        }
    }
}

// Total capacity across all configured GPUs.
// NOTE(review): reads m_maxPerGpu without holding m_mutex, matching the
// original code — safe only if configure() is not called concurrently.
int HWDecoderPool::getTotalMax() const
{
    int total = 0;
    for (int m : m_maxPerGpu) total += m;
    return total;
}

// Total active sessions across all configured GPUs (same locking caveat).
int HWDecoderPool::getTotalActive() const
{
    int total = 0;
    for (int c : m_activePerGpu) total += c;
    return total;
}

// ---------------------------------------------------------------------------
// SharedHWDeviceCtx implementation
// ---------------------------------------------------------------------------

// Meyers-singleton accessor: one shared-context cache per process.
SharedHWDeviceCtx& SharedHWDeviceCtx::instance()
{
    static SharedHWDeviceCtx inst;
    return inst;
}
SharedHWDeviceCtx::~SharedHWDeviceCtx() { // Intentionally empty — do NOT release GPU/D3D11 resources here. // This destructor runs during DLL_PROCESS_DETACH while the OS loader // lock is held. Releasing D3D11/NVIDIA resources requires driver // worker threads that also need the loader lock → deadlock. // The OS reclaims all GPU resources when the process exits. } AVBufferRef* SharedHWDeviceCtx::acquire(int gpuIndex, AVHWDeviceType type) { std::lock_guard lock(m_mutex); // Grow cache if needed if (gpuIndex < 0) gpuIndex = 0; if (static_cast(m_cache.size()) <= gpuIndex) { m_cache.resize(gpuIndex + 1); } GpuCtx& slot = m_cache[gpuIndex]; // If already created for this GPU and same type, return a new reference if (slot.ctx && slot.type == type) { fprintf(stderr, "[HWDecode] SharedHWDeviceCtx: reusing shared context for GPU[%d]\n", gpuIndex); return av_buffer_ref(slot.ctx); } // Release old context if type changed if (slot.ctx) { av_buffer_unref(&slot.ctx); slot.ctx = nullptr; } // Create new HW device context for this GPU char adapterStr[16] = {}; snprintf(adapterStr, sizeof(adapterStr), "%d", gpuIndex); int err = av_hwdevice_ctx_create(&slot.ctx, type, adapterStr, nullptr, 0); if (err < 0) { char error_buf[AV_ERROR_MAX_STRING_SIZE]; av_strerror(err, error_buf, sizeof(error_buf)); fprintf(stderr, "[HWDecode] SharedHWDeviceCtx: FAILED to create context for GPU[%d]: %s\n", gpuIndex, error_buf); slot.ctx = nullptr; return nullptr; } slot.type = type; fprintf(stderr, "[HWDecode] SharedHWDeviceCtx: created shared context for GPU[%d] type=%s\n", gpuIndex, av_hwdevice_get_type_name(type)); // Return a new reference (caller owns it) return av_buffer_ref(slot.ctx); } void SharedHWDeviceCtx::releaseAll() { std::lock_guard lock(m_mutex); for (auto& slot : m_cache) { if (slot.ctx) { av_buffer_unref(&slot.ctx); slot.ctx = nullptr; } } m_cache.clear(); } enum AVPixelFormat getHWFormat(AVCodecContext* ctx, const enum AVPixelFormat* pix_fmts) { CVideoDecoder* pthis = 
(CVideoDecoder*)ctx->opaque; AVPixelFormat dst_pix_fmt = AV_PIX_FMT_NONE; pthis->getHWFormat(ctx, pix_fmts, &dst_pix_fmt); return dst_pix_fmt; } enum AVPixelFormat get_hw_format(AVCodecContext* ctx, const enum AVPixelFormat* pix_fmts) { for (const enum AVPixelFormat* p = pix_fmts; *p != -1; p++) { if (*p == AV_PIX_FMT_YUVJ420P) { return AV_PIX_FMT_YUVJ420P; } } // If YUVJ420P is not available, fall back to default return ctx->pix_fmt; } CVideoDecoder::CVideoDecoder() { m_bInited = FALSE; m_bRunning = FALSE; m_bHardwareDecoderEnabled = FALSE; m_bCudaHWAccel = false; m_hwGpuIndex = -1; m_pCodec = NULL; m_pContext = NULL; m_pFrame = NULL; m_pSoftFrame = NULL; m_pCudaHWFrame = NULL; m_pCallback = NULL; m_pUserdata = NULL; m_hwPixFmt = AV_PIX_FMT_NONE; m_pHWDeviceCtx = NULL; } AVFrame* CVideoDecoder::takeCudaHWFrame() { std::lock_guard lock(_mutex); AVFrame* result = m_pCudaHWFrame; m_pCudaHWFrame = nullptr; return result; } AVFrame* CVideoDecoder::cloneCudaHWFrame_unlocked() { // Caller MUST already hold _mutex (called from decode thread's callback chain). // Returns a clone so the original m_pCudaHWFrame stays valid for the decode loop. return m_pCudaHWFrame ? 
av_frame_clone(m_pCudaHWFrame) : nullptr; } CVideoDecoder::~CVideoDecoder() { uninit(); } void CVideoDecoder::uninit() { std::lock_guard lock(_mutex); // Stop processing first // Backup first BOOL wasRunning = m_bRunning; m_bRunning = FALSE; flush(); // FIXED: Clean up frames before context to avoid use-after-free if (m_pFrame) { av_frame_free(&m_pFrame); m_pFrame = NULL; } if (m_pSoftFrame) { av_frame_free(&m_pSoftFrame); m_pSoftFrame = NULL; } if (m_pCudaHWFrame) { av_frame_free(&m_pCudaHWFrame); m_pCudaHWFrame = NULL; } if (m_pContext) { // FIXED: Free extradata before freeing context if (m_pContext->extradata) { av_free(m_pContext->extradata); m_pContext->extradata = NULL; m_pContext->extradata_size = 0; } // FIXED: Properly release hardware context reference if (m_pContext->hw_device_ctx) { av_buffer_unref(&m_pContext->hw_device_ctx); m_pContext->hw_device_ctx = NULL; } // FIXED: Close codec before freeing context avcodec_close(m_pContext); avcodec_free_context(&m_pContext); m_pContext = NULL; } // Only decrement hardware decoder count if it was actually enabled if (m_pHWDeviceCtx && m_bHardwareDecoderEnabled) { av_buffer_unref(&m_pHWDeviceCtx); m_pHWDeviceCtx = NULL; // Release via per-GPU pool or legacy global counter HWDecoderPool& pool = HWDecoderPool::instance(); if (pool.isConfigured() && m_hwGpuIndex >= 0) { pool.releaseSlot(m_hwGpuIndex); } else { CLock hw_lock(g_hw_decoder_mutex); if (g_hw_decoder_nums > 0) { g_hw_decoder_nums--; } } m_hwGpuIndex = -1; m_bHardwareDecoderEnabled = FALSE; } // Restore running state if needed m_bRunning = wasRunning; m_pCodec = NULL; m_bInited = FALSE; } BOOL CVideoDecoder::init(enum AVCodecID codec, uint8* extradata, int extradata_size, int hwMode, int preferredGpu) { std::lock_guard lock(_mutex); // Clean up any existing state if (m_bInited) { uninit(); } int width = 0; int height = 0; if (extradata && extradata_size > 0) { int vcodec = VIDEO_CODEC_NONE; if (AV_CODEC_ID_H264 == codec) { vcodec = VIDEO_CODEC_H264; } 
else if (AV_CODEC_ID_HEVC == codec) { vcodec = VIDEO_CODEC_H265; } else if (AV_CODEC_ID_MJPEG == codec) { vcodec = VIDEO_CODEC_JPEG; } else if (AV_CODEC_ID_MPEG4 == codec) { vcodec = VIDEO_CODEC_MP4; } avc_parse_video_size(vcodec, extradata, extradata_size, &width, &height); } #ifdef ANDROID if (HW_DECODING_DISABLE != hwMode && width * height >= 320 * 240) { if (AV_CODEC_ID_H264 == codec) { m_pCodec = avcodec_find_decoder_by_name("h264_mediacodec"); } else if (AV_CODEC_ID_HEVC == codec) { m_pCodec = avcodec_find_decoder_by_name("hevc_mediacodec"); } else if (AV_CODEC_ID_MPEG4 == codec) { m_pCodec = avcodec_find_decoder_by_name("mpeg4_mediacodec"); } } if (NULL == m_pCodec) { m_pCodec = avcodec_find_decoder(codec); } #else m_pCodec = avcodec_find_decoder(codec); #endif if (NULL == m_pCodec) { log_print(HT_LOG_ERR, "%s, m_pCodec is NULL for codec %d\r\n", __FUNCTION__, codec); return FALSE; } m_pContext = avcodec_alloc_context3(m_pCodec); if (NULL == m_pContext) { log_print(HT_LOG_ERR, "%s, avcodec_alloc_context3 failed\r\n", __FUNCTION__); return FALSE; } m_pContext->width = width; m_pContext->height = height; m_pContext->flags |= AV_CODEC_FLAG_LOW_DELAY; m_pContext->flags2 |= AV_CODEC_FLAG2_FAST; m_pContext->flags |= AV_CODEC_FLAG_OUTPUT_CORRUPT; m_pContext->err_recognition = AV_EF_IGNORE_ERR; av_opt_set_int(m_pContext, "refcounted_frames", 1, 0); // Initialize hardware decoder if (HW_DECODING_DISABLE != hwMode) { int hw_ret = hwDecoderInit(m_pContext, hwMode, preferredGpu); if (hw_ret < 0) { log_print(HT_LOG_WARN, "%s, hwDecoderInit failed with error %d, falling back to software decoding\r\n", __FUNCTION__, hw_ret); } } // Handle extradata if (extradata && extradata_size > 0) { int size = extradata_size + AV_INPUT_BUFFER_PADDING_SIZE; m_pContext->extradata = (uint8*)av_mallocz(size); if (m_pContext->extradata) { m_pContext->extradata_size = extradata_size; memcpy(m_pContext->extradata, extradata, extradata_size); } else { log_print(HT_LOG_ERR, "%s, Failed to 
allocate extradata\r\n", __FUNCTION__); uninit(); // FIXED: Clean up on failure return FALSE; } } // FIXED: Use avcodec_open2 instead of avcodec_thread_open if (avcodec_open2(m_pContext, m_pCodec, NULL) < 0) { log_print(HT_LOG_ERR, "%s, avcodec_open2 failed\r\n", __FUNCTION__); uninit(); // FIXED: Clean up on failure return FALSE; } m_pFrame = av_frame_alloc(); if (NULL == m_pFrame) { log_print(HT_LOG_ERR, "%s, av_frame_alloc failed for m_pFrame\r\n", __FUNCTION__); uninit(); // FIXED: Clean up on failure return FALSE; } m_pSoftFrame = av_frame_alloc(); if (NULL == m_pSoftFrame) { log_print(HT_LOG_ERR, "%s, av_frame_alloc failed for m_pSoftFrame\r\n", __FUNCTION__); uninit(); // FIXED: Clean up on failure return FALSE; } m_bInited = TRUE; //m_bRunning = TRUE; return TRUE; } BOOL CVideoDecoder::init(int codec, uint8* extradata, int extradata_size, int hwMode, int preferredGpu) { BOOL result = init(to_video_avcodecid(codec), extradata, extradata_size, hwMode, preferredGpu); if (result) { m_bRunning = TRUE; // Set running only if initialization succeeded } return result; } int CVideoDecoder::hwDecoderInit(AVCodecContext* ctx, int hwMode, int preferredGpu) { int err = 0; std::string hwtype; enum AVHWDeviceType type = AV_HWDEVICE_TYPE_NONE; if (hwMode == HW_DECODING_DISABLE) { return 0; // Hardware decoding is disabled } // -- Per-GPU pool path (preferred) or legacy global path ---------------- int assignedGpu = -1; HWDecoderPool& pool = HWDecoderPool::instance(); if (pool.isConfigured()) { // Per-GPU: acquire a slot, preferring the caller's requested GPU // (e.g. 
inference GPU for NV12 zero-copy alignment) assignedGpu = pool.acquireSlot(preferredGpu); if (assignedGpu < 0) { log_print(HT_LOG_WARN, "%s, All GPU HW decoder slots full (%d/%d total)\r\n", __FUNCTION__, pool.getTotalActive(), pool.getTotalMax()); return -1; } } else { // Legacy: single global counter CLock lock(g_hw_decoder_mutex); if (g_hw_decoder_max > 0 && g_hw_decoder_nums >= g_hw_decoder_max) { log_print(HT_LOG_WARN, "%s, Maximum number of hardware decoders reached (%d/%d)\r\n", __FUNCTION__, g_hw_decoder_nums, g_hw_decoder_max); return -1; } } // Determine the hardware type based on platform and hardware mode if (!getHardwareTypeForPlatform(hwMode, hwtype)) { log_print(HT_LOG_WARN, "%s, Unsupported hardware mode %d for the current platform\r\n", __FUNCTION__, hwMode); if (assignedGpu >= 0) pool.releaseSlot(assignedGpu); return -1; } // Find the hardware device type by name type = av_hwdevice_find_type_by_name(hwtype.c_str()); if (type == AV_HWDEVICE_TYPE_NONE) { log_print(HT_LOG_WARN, "%s, Hardware device type %s is not supported\r\n", __FUNCTION__, hwtype.c_str()); logSupportedHwTypes(); if (assignedGpu >= 0) pool.releaseSlot(assignedGpu); return -1; } // Find a hardware configuration that supports the specified device type if (!findHwConfigForDeviceType(type)) { log_print(HT_LOG_WARN, "%s, Decoder %s does not support the specified hardware device type %s\r\n", __FUNCTION__, m_pCodec->long_name, av_hwdevice_get_type_name(type)); if (assignedGpu >= 0) pool.releaseSlot(assignedGpu); return -1; } // Get or create a shared HW device context for this GPU. // NVIDIA recommends sharing CUDA contexts across decode sessions to minimize // GPU memory overhead (each CUDA context costs ~50-100MB). // See: NVDEC Video Decoder API Programming Guide, Section "Multi-session decoding" int gpuIdx = (assignedGpu >= 0) ? 
assignedGpu : 0; m_pHWDeviceCtx = SharedHWDeviceCtx::instance().acquire(gpuIdx, type); if (!m_pHWDeviceCtx) { log_print(HT_LOG_ERR, "%s, Failed to acquire shared HW device context, type=%s, gpu=%d\r\n", __FUNCTION__, av_hwdevice_get_type_name(type), gpuIdx); if (assignedGpu >= 0) pool.releaseSlot(assignedGpu); return -1; } // Configure the codec context to use the shared hardware device ctx->opaque = this; ctx->get_format = ::getHWFormat; ctx->hw_device_ctx = av_buffer_ref(m_pHWDeviceCtx); ctx->err_recognition = AV_EF_IGNORE_ERR; ctx->flags2 |= AV_CODEC_FLAG2_EXPORT_MVS; // Reserve extra NVDEC surfaces for application-held av_frame_clone() references. // The clone chain (decoder → player → registry) holds ~2 surfaces simultaneously // (decoder's clone + registry's clone; getCudaHWFrame uses ownership transfer). // Without this, the default pool (num_decode_surfaces + 2) can run out under // load with many concurrent streams, causing the decoder to stall. ctx->extra_hw_frames = 2; // Track which GPU this decoder is on m_hwGpuIndex = assignedGpu; m_bHardwareDecoderEnabled = TRUE; m_bCudaHWAccel = (type == AV_HWDEVICE_TYPE_CUDA); // Legacy counter (for backward compatibility) if (!pool.isConfigured()) { CLock lock(g_hw_decoder_mutex); g_hw_decoder_nums++; } log_print(HT_LOG_INFO, "%s, Successfully initialized hardware decoder %s on GPU[%d] (%d/%d)\r\n", __FUNCTION__, av_hwdevice_get_type_name(type), gpuIdx, pool.isConfigured() ? pool.getTotalActive() : g_hw_decoder_nums, pool.isConfigured() ? pool.getTotalMax() : g_hw_decoder_max); return 0; } void CVideoDecoder::Start() { std::lock_guard lock(_mutex); if (m_pContext) { avcodec_flush_buffers(m_pContext); } m_bRunning = TRUE; log_print(HT_LOG_INFO, "%s, Video decoder started\r\n", __FUNCTION__); } void CVideoDecoder::Stop() { // Atomically signal the decoder to stop WITHOUT acquiring _mutex. 
// decode() holds _mutex while inside avcodec_send_packet / CUDA calls // that can block on the nvcuda64 SRW lock for a long time. // If we waited for _mutex here, Stop() would deadlock whenever a // concurrent decode() is stuck waiting for a CUDA operation held by // an inference thread. m_bRunning.store(FALSE, std::memory_order_release); log_print(HT_LOG_INFO, "%s, Video decoder stopped\r\n", __FUNCTION__); } // Log all supported hardware types void CVideoDecoder::logSupportedHwTypes() { enum AVHWDeviceType type = AV_HWDEVICE_TYPE_NONE; log_print(HT_LOG_INFO, "%s, Available hardware device types:\r\n", __FUNCTION__); while ((type = av_hwdevice_iterate_types(type)) != AV_HWDEVICE_TYPE_NONE) { log_print(HT_LOG_INFO, "%s, - %s\r\n", __FUNCTION__, av_hwdevice_get_type_name(type)); } } // Platform-specific function to determine the hardware type based on the mode bool CVideoDecoder::getHardwareTypeForPlatform(int hwMode, std::string& hwtype) { #if __WINDOWS_OS__ switch (hwMode) { case HW_DECODING_D3D11: hwtype = "d3d11va"; break; case HW_DECODING_DXVA: hwtype = "dxva2"; break; case HW_DECODING_CUDA: hwtype = "cuda"; break; case HW_DECODING_AUTO: hwtype = "cuda"; if (av_hwdevice_find_type_by_name(hwtype.c_str()) == AV_HWDEVICE_TYPE_NONE) { hwtype = "d3d11va"; if (av_hwdevice_find_type_by_name(hwtype.c_str()) == AV_HWDEVICE_TYPE_NONE) { hwtype = "dxva2"; } } break; default: return false; } #elif defined(IOS) switch (hwMode) { case HW_DECODING_VIDEOTOOLBOX: hwtype = "videotoolbox"; break; case HW_DECODING_OPENCL: hwtype = "opencl"; break; case HW_DECODING_AUTO: hwtype = "videotoolbox"; if (av_hwdevice_find_type_by_name(hwtype.c_str()) == AV_HWDEVICE_TYPE_NONE) { hwtype = "opencl"; } break; default: return false; } #elif defined(ANDROID) if (hwMode == HW_DECODING_MEDIACODEC || hwMode == HW_DECODING_AUTO) { hwtype = "mediacodec"; } else { return false; } #elif __LINUX_OS__ switch (hwMode) { case HW_DECODING_VAAPI: hwtype = "vaapi"; break; case HW_DECODING_OPENCL: hwtype = 
"opencl"; break; case HW_DECODING_AUTO: hwtype = "vaapi"; if (av_hwdevice_find_type_by_name(hwtype.c_str()) == AV_HWDEVICE_TYPE_NONE) { hwtype = "opencl"; } break; default: return false; } #else return false; // Unsupported platform #endif return true; } // Find a hardware configuration that matches the specified device type bool CVideoDecoder::findHwConfigForDeviceType(AVHWDeviceType type) { for (int i = 0;; i++) { const AVCodecHWConfig* config = avcodec_get_hw_config(m_pCodec, i); if (!config) { return false; // No matching hardware configuration found } if (config->methods & AV_CODEC_HW_CONFIG_METHOD_HW_DEVICE_CTX && config->device_type == type) { m_hwPixFmt = config->pix_fmt; return true; } } } BOOL CVideoDecoder::getHWFormat(AVCodecContext* ctx, const AVPixelFormat* pix_fmts, AVPixelFormat* dst) { const AVPixelFormat* p; *dst = AV_PIX_FMT_NONE; // First, attempt to use hardware pixel format if available for (p = pix_fmts; *p != -1; p++) { if (*p == m_hwPixFmt) { *dst = *p; return TRUE; } } // If hardware format is not supported, fall back to YUVJ420P for (p = pix_fmts; *p != -1; p++) { if (*p == AV_PIX_FMT_YUVJ420P || *p == AV_PIX_FMT_YUVJ422P || *p == AV_PIX_FMT_YUVJ444P) { *dst = *p; return TRUE; } } // As a last resort, use other formats (YUV420P) for (p = pix_fmts; *p != -1; p++) { if (*p == AV_PIX_FMT_YUV420P || *p == AV_PIX_FMT_YUV422P || *p == AV_PIX_FMT_YUV444P) { *dst = *p; return TRUE; } } if (*pix_fmts != -1) { *dst = *pix_fmts; return TRUE; } log_print(HT_LOG_ERR, "%s, Failed to get HW surface format\r\n", __FUNCTION__); return FALSE; } int CVideoDecoder::getWidth() { std::lock_guard lock(_mutex); if (m_pContext) { return m_pContext->width; } return 0; } int CVideoDecoder::getHeight() { std::lock_guard lock(_mutex); if (m_pContext) { return m_pContext->height; } return 0; } double CVideoDecoder::getFrameRate() { std::lock_guard lock(_mutex); if (m_pContext) { if (m_pContext->framerate.den > 0) { return (double)((double)(m_pContext->framerate.num) / 
m_pContext->framerate.den); } } return 0; } BOOL CVideoDecoder::decode(AVPacket* pkt) { std::lock_guard lock(_mutex); if (!m_bInited) { log_print(HT_LOG_ERR, "%s, Decoder not initialized\r\n", __FUNCTION__); return FALSE; } if (!m_bRunning) { log_print(HT_LOG_WARN, "%s, Decoder not running\r\n", __FUNCTION__); return FALSE; } if (!m_pContext) { log_print(HT_LOG_ERR, "%s, Context is NULL\r\n", __FUNCTION__); return FALSE; } int ret; int retryCount = 0; const int maxRetries = 3; // Attempt to send packet to decoder while ((ret = avcodec_send_packet(m_pContext, pkt)) == AVERROR(EAGAIN) && retryCount < maxRetries) { if (!readFrame()) { log_print(HT_LOG_ERR, "%s, Failed to read frame during retry %d\r\n", __FUNCTION__, retryCount); return FALSE; } Sleep(1); // Reduced sleep time retryCount++; } // Check for other errors if (ret < 0 && ret != AVERROR_EOF) { char error_buf[AV_ERROR_MAX_STRING_SIZE]; av_strerror(ret, error_buf, sizeof(error_buf)); log_print(HT_LOG_ERR, "%s, avcodec_send_packet failed: %s (ret=%d)\r\n", __FUNCTION__, error_buf, ret); return FALSE; } // If the packet was successfully sent, proceed to read frame return readFrame(); } BOOL CVideoDecoder::decode(uint8* data, int len, int64_t pts) { std::lock_guard lock(_mutex); if (!m_bInited) { log_print(HT_LOG_ERR, "%s, Decoder not initialized\r\n", __FUNCTION__); return FALSE; } if (!m_bRunning) { log_print(HT_LOG_WARN, "%s, Decoder not running\r\n", __FUNCTION__); return FALSE; } if (!data || len <= 0) { log_print(HT_LOG_ERR, "%s, Invalid input data\r\n", __FUNCTION__); return FALSE; } // Allocate packet AVPacket* packet = av_packet_alloc(); if (!packet) { log_print(HT_LOG_ERR, "%s, Failed to allocate AVPacket\r\n", __FUNCTION__); return FALSE; } // FIXED: Use av_new_packet() to properly allocate and manage packet data int ret = av_new_packet(packet, len); if (ret < 0) { char error_buf[AV_ERROR_MAX_STRING_SIZE]; av_strerror(ret, error_buf, sizeof(error_buf)); log_print(HT_LOG_ERR, "%s, Failed to allocate 
packet data: %s\r\n", __FUNCTION__, error_buf); av_packet_free(&packet); return FALSE; } // Copy data - av_new_packet() already allocated the buffer with proper padding memcpy(packet->data, data, len); // Set packet timing information packet->pts = pts; packet->dts = pts; // Call decode function BOOL result = decode(packet); // Clean up - av_packet_free will properly handle the data buffer av_packet_free(&packet); return result; } BOOL CVideoDecoder::readFrame() { int ret = 0; AVFrame* tmp_frame = NULL; BOOL frame_processed = FALSE; while (ret >= 0) { ret = avcodec_receive_frame(m_pContext, m_pFrame); if (ret == AVERROR(EAGAIN)) { // Need more input data return frame_processed ? TRUE : FALSE; } else if (ret == AVERROR_EOF) { // End of stream return TRUE; } else if (ret < 0) { char error_buf[AV_ERROR_MAX_STRING_SIZE]; av_strerror(ret, error_buf, sizeof(error_buf)); log_print(HT_LOG_ERR, "%s, avcodec_receive_frame failed: %s (ret=%d)\r\n", __FUNCTION__, error_buf, ret); return FALSE; } // Check if we got a valid frame if (!m_pFrame || m_pFrame->width <= 0 || m_pFrame->height <= 0) { log_print(HT_LOG_WARN, "%s, Received invalid frame\r\n", __FUNCTION__); av_frame_unref(m_pFrame); continue; } if (m_pFrame->format == m_hwPixFmt) { // CUDA HW accel: clone the HW frame BEFORE transfer so inference // can use CUDA device pointers directly (zero-copy, no upload). 
if (m_bCudaHWAccel) { if (m_pCudaHWFrame) av_frame_free(&m_pCudaHWFrame); m_pCudaHWFrame = av_frame_clone(m_pFrame); } // FIXED: Ensure m_pSoftFrame is properly initialized before transfer av_frame_unref(m_pSoftFrame); // Clear any previous data // Hardware frame - transfer to software (needed for display) ret = av_hwframe_transfer_data(m_pSoftFrame, m_pFrame, 0); if (ret < 0) { char error_buf[AV_ERROR_MAX_STRING_SIZE]; av_strerror(ret, error_buf, sizeof(error_buf)); log_print(HT_LOG_ERR, "%s, Error transferring hardware frame to system memory: %s (ret=%d)\r\n", __FUNCTION__, error_buf, ret); av_frame_unref(m_pFrame); if (m_pCudaHWFrame) av_frame_free(&m_pCudaHWFrame); continue; } // Copy timing information m_pSoftFrame->pts = m_pFrame->pts; m_pSoftFrame->pkt_dts = m_pFrame->pkt_dts; m_pSoftFrame->best_effort_timestamp = m_pFrame->best_effort_timestamp; tmp_frame = m_pSoftFrame; } else { // Software frame - use directly tmp_frame = m_pFrame; } // Render the frame if (tmp_frame) { render(tmp_frame); frame_processed = TRUE; } // FIXED: Ensure proper cleanup of frame references if (tmp_frame == m_pSoftFrame) { av_frame_unref(m_pSoftFrame); } av_frame_unref(m_pFrame); } return TRUE; } int CVideoDecoder::render(AVFrame* frame) { if (!m_bRunning || !frame) { return 0; } if (m_pCallback) { try { m_pCallback(frame, m_pUserdata); } catch (...) 
{ log_print(HT_LOG_ERR, "%s, Exception in callback function\r\n", __FUNCTION__); return 0; } } return 1; } void CVideoDecoder::flush() { std::lock_guard lock(_mutex); if (NULL == m_pContext || NULL == m_pContext->codec || !(m_pContext->codec->capabilities & AV_CODEC_CAP_DELAY)) { return; } log_print(HT_LOG_INFO, "%s, Flushing decoder buffers\r\n", __FUNCTION__); // Send NULL packet to flush avcodec_send_packet(m_pContext, NULL); // FIXED: Drain all frames after flushing while (true) { int ret = avcodec_receive_frame(m_pContext, m_pFrame); if (ret == AVERROR_EOF || ret == AVERROR(EAGAIN)) { break; } if (ret < 0) { char error_buf[AV_ERROR_MAX_STRING_SIZE]; av_strerror(ret, error_buf, sizeof(error_buf)); log_print(HT_LOG_WARN, "%s, Error during flush: %s\r\n", __FUNCTION__, error_buf); break; } // Process the frame if needed, or just unref it av_frame_unref(m_pFrame); } // Also flush the codec buffers if (m_pContext) { avcodec_flush_buffers(m_pContext); } }