#ifndef VIDEO_DECODER_H #define VIDEO_DECODER_H #include "sys_inc.h" #include "media_format.h" #include #include #include #include extern "C" { #include "libavcodec/avcodec.h" #include "libavutil/avutil.h" #include "libswscale/swscale.h" #include "libavformat/avformat.h" #include } #define HW_DECODING_AUTO 0 // automatic select video acceleration hardware #define HW_DECODING_D3D11 1 // D3D11 video acceleration #define HW_DECODING_DXVA 2 // DXVA video acceleration #define HW_DECODING_VAAPI 3 // VAAPI video acceleration #define HW_DECODING_OPENCL 4 // OPENCL video acceleration #define HW_DECODING_VIDEOTOOLBOX 5 // VideoToolBox video acceleration #define HW_DECODING_MEDIACODEC 6 // MediaCodec video acceleration #define HW_DECODING_CUDA 7 // CUDA/NVDEC — decoded NV12 stays in GPU VRAM #define HW_DECODING_DISABLE -1 // disable video acceleration // Legacy global limit (default: 4). Still works if HWDecoderPool is not configured. extern uint32 g_hw_decoder_max; // --------------------------------------------------------------------------- // HWDecoderPool -- per-GPU hardware decoder session manager // // Tracks active HW decoder sessions per GPU and distributes new sessions // to the GPU with the fewest active decoders (least-loaded). // // Usage: // // Auto-configure from outside (e.g., ANSRTSP): // HWDecoderPool::instance().configure(numGpus, maxSessionsPerGpu); // // // Or leave unconfigured -- falls back to legacy g_hw_decoder_max behaviour. // --------------------------------------------------------------------------- class HWDecoderPool { public: static HWDecoderPool& instance(); // Configure uniform per-GPU limits. Call once at startup before creating decoders. void configure(int numGpus, int maxPerGpu); // Configure per-GPU limits individually (different GPUs may have different capabilities). void configure(const std::vector& maxPerGpuList); // Is the pool configured with per-GPU tracking? bool isConfigured() const; // Try to acquire a HW decoder slot. Returns the GPU index to use, // or -1 if all GPUs are at capacity. // If preferredGpu >= 0, prefer that GPU (e.g. to match inference GPU for zero-copy). // Falls back to least-loaded if preferred GPU is at capacity. int acquireSlot(int preferredGpu = -1); // Release a HW decoder slot on the given GPU. void releaseSlot(int gpuIndex); // Get total max sessions across all GPUs. int getTotalMax() const; // Get number of active sessions across all GPUs. int getTotalActive() const; private: HWDecoderPool() = default; std::mutex m_mutex; bool m_configured = false; std::vector m_maxPerGpu; // max session limit per GPU std::vector m_activePerGpu; // active session count per GPU }; // --------------------------------------------------------------------------- // SharedHWDeviceCtx -- per-GPU shared AVHWDeviceContext cache // // NVIDIA recommends sharing CUDA contexts across decode sessions to reduce // GPU memory overhead. This cache creates one AVHWDeviceContext per GPU // and shares it (via av_buffer_ref) across all decoder sessions on that GPU. // // Thread-safe: all methods lock internally. // --------------------------------------------------------------------------- class SharedHWDeviceCtx { public: static SharedHWDeviceCtx& instance(); // Get (or create) a shared HW device context for the given GPU index and device type. // Returns a new av_buffer_ref to the shared context (caller must av_buffer_unref). // Returns nullptr on failure. AVBufferRef* acquire(int gpuIndex, AVHWDeviceType type); // Release all cached contexts (call at shutdown). void releaseAll(); private: SharedHWDeviceCtx() = default; ~SharedHWDeviceCtx(); struct GpuCtx { AVBufferRef* ctx = nullptr; AVHWDeviceType type = AV_HWDEVICE_TYPE_NONE; }; std::mutex m_mutex; std::vector m_cache; }; typedef void (*VDCB)(AVFrame* frame, void* pUserdata); class CVideoDecoder { public: CVideoDecoder(); ~CVideoDecoder(); public: BOOL init(int codec, uint8* extradata = NULL, int extradata_size = 0, int hwMode = HW_DECODING_AUTO, int preferredGpu = -1); BOOL init(enum AVCodecID codec, uint8* extradata = NULL, int extradata_size = 0, int hwMode = HW_DECODING_AUTO, int preferredGpu = -1); void uninit(); int getWidth(); int getHeight(); double getFrameRate(); BOOL decode(uint8* data, int len, int64_t pts = AV_NOPTS_VALUE); BOOL decode(AVPacket* pkt); void setCallback(VDCB pCallback, void* pUserdata) { m_pCallback = pCallback; m_pUserdata = pUserdata; } BOOL getHWFormat(AVCodecContext* ctx, const AVPixelFormat* pix_fmts, AVPixelFormat* dst); bool getHardwareTypeForPlatform(int hwMode, std::string& hwtype); bool findHwConfigForDeviceType(AVHWDeviceType type); void logSupportedHwTypes(); BOOL isHardwareDecoderEnabled() const { return m_bHardwareDecoderEnabled; } int getHWGpuIndex() const { return m_hwGpuIndex; } bool isCudaHWAccel() const { return m_bCudaHWAccel; } // Returns the CUDA HW frame (device pointers). Caller takes ownership. AVFrame* takeCudaHWFrame(); // Clone CUDA HW frame without locking — caller MUST already hold _mutex // (used by onVideoFrame callback which runs inside decode()'s lock scope). AVFrame* cloneCudaHWFrame_unlocked(); void Start(); void Stop(); void flush(); AVCodecContext* getAVCodeContext() { return m_pContext; } private: BOOL readFrame(); int render(AVFrame* frame); int hwDecoderInit(AVCodecContext* ctx, int hwMode, int preferredGpu = -1); private: BOOL m_bInited; std::atomic m_bRunning; BOOL m_bHardwareDecoderEnabled; // Track if hardware decoder is enabled bool m_bCudaHWAccel; // true when using AV_HWDEVICE_TYPE_CUDA int m_hwGpuIndex; // GPU index assigned by HWDecoderPool (-1 = legacy) AVFrame* m_pCudaHWFrame; // Cloned CUDA HW frame (device ptrs) for inference const AVCodec* m_pCodec; AVCodecContext* m_pContext; AVFrame* m_pFrame; AVFrame* m_pSoftFrame; VDCB m_pCallback; void* m_pUserdata; AVPixelFormat m_hwPixFmt; AVBufferRef* m_pHWDeviceCtx; std::recursive_mutex _mutex; }; #endif