|
|
|
#ifndef VIDEO_DECODER_H
|
|
|
|
|
#define VIDEO_DECODER_H
|
|
|
|
|
#include "sys_inc.h"
|
|
|
|
|
#include "media_format.h"
|
|
|
|
|
#include <string>
|
|
|
|
#include <atomic>
|
|
|
|
#include <mutex>
|
|
|
|
|
#include <vector>
|
|
|
|
|
extern "C"
|
|
|
|
|
{
|
|
|
|
|
#include "libavcodec/avcodec.h"
|
|
|
|
|
#include "libavutil/avutil.h"
|
|
|
|
|
#include "libswscale/swscale.h"
|
|
|
|
|
#include "libavformat/avformat.h"
|
|
|
|
|
#include <libavutil/opt.h>
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// ---------------------------------------------------------------------------
// Hardware-acceleration mode selectors, passed as the hwMode argument to
// CVideoDecoder::init(). Each value selects a platform video-acceleration
// backend (mapped to an FFmpeg AVHWDeviceType by getHardwareTypeForPlatform).
// ---------------------------------------------------------------------------
#define HW_DECODING_AUTO 0 // automatic select video acceleration hardware
#define HW_DECODING_D3D11 1 // D3D11 video acceleration
#define HW_DECODING_DXVA 2 // DXVA video acceleration
#define HW_DECODING_VAAPI 3 // VAAPI video acceleration
#define HW_DECODING_OPENCL 4 // OPENCL video acceleration
#define HW_DECODING_VIDEOTOOLBOX 5 // VideoToolBox video acceleration
#define HW_DECODING_MEDIACODEC 6 // MediaCodec video acceleration
#define HW_DECODING_CUDA 7 // CUDA/NVDEC — decoded NV12 stays in GPU VRAM
#define HW_DECODING_DISABLE -1 // disable video acceleration

// Legacy global limit (default: 4) on concurrent HW decoder sessions.
// Still honored when HWDecoderPool is not configured (see isConfigured()).
extern uint32 g_hw_decoder_max;
|
|
|
|
|
|
|
|
|
|
// ---------------------------------------------------------------------------
// HWDecoderPool -- per-GPU hardware decoder session manager
//
// Tracks active HW decoder sessions per GPU and distributes new sessions
// to the GPU with the fewest active decoders (least-loaded).
//
// Usage:
//   // Auto-configure from outside (e.g., ANSRTSP):
//   HWDecoderPool::instance().configure(numGpus, maxSessionsPerGpu);
//
//   // Or leave unconfigured -- falls back to legacy g_hw_decoder_max behaviour.
//
// Thread-safety: counters are guarded by m_mutex; the declared methods are
// intended to be callable from any thread (implementation in the .cpp —
// NOTE(review): confirm every method locks m_mutex).
// ---------------------------------------------------------------------------
class HWDecoderPool {
public:
    // Process-wide singleton accessor.
    static HWDecoderPool& instance();

    // Configure uniform per-GPU limits. Call once at startup before creating decoders.
    // numGpus GPUs, each allowed maxPerGpu concurrent decoder sessions.
    void configure(int numGpus, int maxPerGpu);

    // Configure per-GPU limits individually (different GPUs may have different capabilities).
    // maxPerGpuList[i] is the session limit for GPU index i.
    void configure(const std::vector<int>& maxPerGpuList);

    // Is the pool configured with per-GPU tracking? When false, callers fall
    // back to the legacy g_hw_decoder_max global limit.
    bool isConfigured() const;

    // Try to acquire a HW decoder slot. Returns the GPU index to use,
    // or -1 if all GPUs are at capacity.
    // If preferredGpu >= 0, prefer that GPU (e.g. to match inference GPU for zero-copy).
    // Falls back to least-loaded if preferred GPU is at capacity.
    // Every successful acquireSlot() must be paired with releaseSlot() on the
    // returned GPU index, or the slot leaks.
    int acquireSlot(int preferredGpu = -1);

    // Release a HW decoder slot on the given GPU (pass the index returned by
    // acquireSlot).
    void releaseSlot(int gpuIndex);

    // Get total max sessions across all GPUs (sum of m_maxPerGpu).
    int getTotalMax() const;

    // Get number of active sessions across all GPUs (sum of m_activePerGpu).
    int getTotalActive() const;

private:
    // Singleton: construct only via instance().
    HWDecoderPool() = default;

    std::mutex m_mutex;                 // guards the three fields below
    bool m_configured = false;          // set by configure(); queried by isConfigured()
    std::vector<int> m_maxPerGpu;       // max session limit per GPU
    std::vector<int> m_activePerGpu;    // active session count per GPU
};
|
|
|
|
|
|
|
|
|
|
// ---------------------------------------------------------------------------
// SharedHWDeviceCtx -- per-GPU shared AVHWDeviceContext cache
//
// NVIDIA recommends sharing CUDA contexts across decode sessions to reduce
// GPU memory overhead. This cache creates one AVHWDeviceContext per GPU
// and shares it (via av_buffer_ref) across all decoder sessions on that GPU.
//
// Thread-safe: all methods lock internally.
// ---------------------------------------------------------------------------
class SharedHWDeviceCtx {
public:
    // Process-wide singleton accessor.
    static SharedHWDeviceCtx& instance();

    // Get (or create) a shared HW device context for the given GPU index and device type.
    // Returns a new av_buffer_ref to the shared context (caller must av_buffer_unref).
    // Returns nullptr on failure.
    AVBufferRef* acquire(int gpuIndex, AVHWDeviceType type);

    // Release all cached contexts (call at shutdown).
    // NOTE(review): outstanding refs held by live decoders keep the underlying
    // device context alive past this call — confirm against the .cpp.
    void releaseAll();

private:
    // Singleton: construct only via instance().
    SharedHWDeviceCtx() = default;
    // Destructor releases any cached contexts still held.
    ~SharedHWDeviceCtx();

    // Cached device context for one GPU index (slot i in m_cache == GPU i).
    struct GpuCtx {
        AVBufferRef* ctx = nullptr;                  // owning ref held by the cache
        AVHWDeviceType type = AV_HWDEVICE_TYPE_NONE; // device type ctx was created with
    };

    std::mutex m_mutex;           // guards m_cache
    std::vector<GpuCtx> m_cache;  // indexed by GPU index
};
|
|
|
|
|
|
|
|
|
|
typedef void (*VDCB)(AVFrame* frame, void* pUserdata);
|
|
|
|
|
|
|
|
|
|
// FFmpeg-based video decoder with optional hardware acceleration.
// A session is driven as: init() -> [Start()] -> decode()* -> [flush()] ->
// Stop()/uninit(). Decoded frames are delivered through the VDCB callback.
class CVideoDecoder
{
public:
    CVideoDecoder();
    ~CVideoDecoder();

public:
    // Initialize a decoding session.
    //   codec          : media_format codec id (int overload) or FFmpeg
    //                    AVCodecID (enum overload).
    //   extradata      : optional out-of-band codec config (e.g. SPS/PPS);
    //                    may be NULL.
    //   extradata_size : byte length of extradata.
    //   hwMode         : one of the HW_DECODING_* selectors above.
    //   preferredGpu   : GPU to prefer for HW decoding (-1 = let
    //                    HWDecoderPool pick the least-loaded GPU).
    // Returns TRUE on success.
    BOOL init(int codec, uint8* extradata = NULL, int extradata_size = 0, int hwMode = HW_DECODING_AUTO, int preferredGpu = -1);
    BOOL init(enum AVCodecID codec, uint8* extradata = NULL, int extradata_size = 0, int hwMode = HW_DECODING_AUTO, int preferredGpu = -1);
    // Tear down the session and free codec/device resources. Safe to call
    // after init() failure (implementation in the .cpp).
    void uninit();
    // Stream geometry / timing as reported by the codec context.
    int getWidth();
    int getHeight();
    double getFrameRate();

    // Decode one encoded access unit.
    //   data/len overload : raw bitstream buffer, optional presentation
    //                       timestamp (AV_NOPTS_VALUE = unknown).
    //   AVPacket overload : caller-prepared packet.
    // Returns TRUE on success; decoded frames arrive via the callback.
    BOOL decode(uint8* data, int len, int64_t pts = AV_NOPTS_VALUE);
    BOOL decode(AVPacket* pkt);
    // Register the decoded-frame callback. Not synchronized with decode() —
    // set it before Start()/decode() begins.
    void setCallback(VDCB pCallback, void* pUserdata) { m_pCallback = pCallback; m_pUserdata = pUserdata; }
    // AVCodecContext::get_format helper: pick the HW pixel format from the
    // candidate list; writes the choice to *dst. Returns TRUE if found.
    BOOL getHWFormat(AVCodecContext* ctx, const AVPixelFormat* pix_fmts, AVPixelFormat* dst);
    // Map an HW_DECODING_* mode to the platform's FFmpeg device-type name
    // (written to hwtype). Returns false if unsupported on this platform.
    bool getHardwareTypeForPlatform(int hwMode, std::string& hwtype);
    // Does the selected codec expose a hw_config for the given device type?
    bool findHwConfigForDeviceType(AVHWDeviceType type);
    // Log every AVHWDeviceType FFmpeg reports as available (diagnostics).
    void logSupportedHwTypes();
    // TRUE when init() successfully enabled a hardware decoder.
    BOOL isHardwareDecoderEnabled() const { return m_bHardwareDecoderEnabled; }
    // GPU index assigned by HWDecoderPool, or -1 in legacy (unpooled) mode.
    int getHWGpuIndex() const { return m_hwGpuIndex; }
    // true when the active HW backend is AV_HWDEVICE_TYPE_CUDA (frames stay
    // in GPU VRAM as NV12).
    bool isCudaHWAccel() const { return m_bCudaHWAccel; }
    // Returns the CUDA HW frame (device pointers). Caller takes ownership.
    AVFrame* takeCudaHWFrame();
    // Clone CUDA HW frame without locking — caller MUST already hold _mutex
    // (used by onVideoFrame callback which runs inside decode()'s lock scope).
    AVFrame* cloneCudaHWFrame_unlocked();
    // Begin/stop accepting decode work (toggles m_bRunning — see .cpp).
    void Start();
    void Stop();
    // Drain buffered frames from the codec (end-of-stream handling).
    void flush();
    // Raw access to the underlying codec context. Borrowed pointer: owned by
    // this decoder, invalidated by uninit().
    AVCodecContext* getAVCodeContext() {
        return m_pContext;
    }

private:
    // Pull decoded frame(s) from the codec after a send. Returns TRUE on success.
    BOOL readFrame();
    // Deliver one decoded frame (download from HW surface if needed, then callback).
    int render(AVFrame* frame);
    // Set up the hardware device context on ctx for hwMode / preferredGpu.
    int hwDecoderInit(AVCodecContext* ctx, int hwMode, int preferredGpu = -1);

private:
    BOOL m_bInited;                   // init() completed successfully
    std::atomic<BOOL> m_bRunning;     // Start()/Stop() gate, read cross-thread
    BOOL m_bHardwareDecoderEnabled;   // Track if hardware decoder is enabled
    bool m_bCudaHWAccel;              // true when using AV_HWDEVICE_TYPE_CUDA
    int m_hwGpuIndex;                 // GPU index assigned by HWDecoderPool (-1 = legacy)
    AVFrame* m_pCudaHWFrame;          // Cloned CUDA HW frame (device ptrs) for inference
    const AVCodec* m_pCodec;          // selected decoder (FFmpeg-owned, not freed here)
    AVCodecContext* m_pContext;       // codec context (owned; freed in uninit())
    AVFrame* m_pFrame;                // decode target frame
    AVFrame* m_pSoftFrame;            // CPU-side frame for HW-surface downloads
    VDCB m_pCallback;                 // decoded-frame callback (may be NULL)
    void* m_pUserdata;                // opaque pointer passed back to m_pCallback
    AVPixelFormat m_hwPixFmt;         // HW pixel format chosen by getHWFormat()
    AVBufferRef* m_pHWDeviceCtx;      // ref to (possibly shared) HW device context
    std::recursive_mutex _mutex;      // guards decode state; recursive so the
                                      // frame callback may re-enter (see
                                      // cloneCudaHWFrame_unlocked)
};
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
|