|
|
|
#ifndef VIDEO_DECODER_H
|
|
|
|
|
#define VIDEO_DECODER_H
|
|
|
|
|
#include "sys_inc.h"
|
|
|
|
|
#include "media_format.h"
|
|
|
|
|
#include <string>
|
|
|
|
#include <atomic>
|
|
|
|
#include <mutex>
|
|
|
|
|
#include <vector>
|
|
|
|
|
extern "C"
|
|
|
|
|
{
|
|
|
|
|
#include "libavcodec/avcodec.h"
|
|
|
|
|
#include "libavutil/avutil.h"
|
|
|
|
|
#include "libswscale/swscale.h"
|
|
|
|
|
#include "libavformat/avformat.h"
|
|
|
|
|
#include <libavutil/opt.h>
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// ---------------------------------------------------------------------------
// Hardware-acceleration mode selectors, passed as the hwMode argument to
// CVideoDecoder::init(). Each value selects a platform video-acceleration
// backend (mapped to an FFmpeg AVHWDeviceType by getHardwareTypeForPlatform).
// ---------------------------------------------------------------------------
#define HW_DECODING_AUTO 0 // automatic select video acceleration hardware
#define HW_DECODING_D3D11 1 // D3D11 video acceleration
#define HW_DECODING_DXVA 2 // DXVA video acceleration
#define HW_DECODING_VAAPI 3 // VAAPI video acceleration
#define HW_DECODING_OPENCL 4 // OPENCL video acceleration
#define HW_DECODING_VIDEOTOOLBOX 5 // VideoToolBox video acceleration
#define HW_DECODING_MEDIACODEC 6 // MediaCodec video acceleration
#define HW_DECODING_CUDA 7 // CUDA/NVDEC — decoded NV12 stays in GPU VRAM
#define HW_DECODING_DISABLE -1 // disable video acceleration

// Legacy global limit (default: 4) on concurrent HW decoder sessions.
// Still honored when HWDecoderPool is not configured (see isConfigured()).
extern uint32 g_hw_decoder_max;
|
|
|
|
|
|
|
|
|
|
// ---------------------------------------------------------------------------
// HWDecoderPool -- per-GPU hardware decoder session manager
//
// Tracks active HW decoder sessions per GPU and distributes new sessions
// to the GPU with the fewest active decoders (least-loaded).
//
// Usage:
//   // Auto-configure from outside (e.g., ANSRTSP):
//   HWDecoderPool::instance().configure(numGpus, maxSessionsPerGpu);
//
//   // Or leave unconfigured -- falls back to legacy g_hw_decoder_max behaviour.
//
// Thread-safety: counters are guarded by m_mutex; the declared methods are
// intended to be callable from any thread (implementation in the .cpp —
// NOTE(review): confirm every method locks m_mutex).
// ---------------------------------------------------------------------------
class HWDecoderPool {
public:
    // Process-wide singleton accessor.
    static HWDecoderPool& instance();

    // Configure uniform per-GPU limits. Call once at startup before creating decoders.
    // numGpus GPUs, each allowed maxPerGpu concurrent decoder sessions.
    void configure(int numGpus, int maxPerGpu);

    // Configure per-GPU limits individually (different GPUs may have different capabilities).
    // maxPerGpuList[i] is the session limit for GPU index i.
    void configure(const std::vector<int>& maxPerGpuList);

    // Is the pool configured with per-GPU tracking? When false, callers fall
    // back to the legacy g_hw_decoder_max global limit.
    bool isConfigured() const;

    // Try to acquire a HW decoder slot. Returns the GPU index to use,
    // or -1 if all GPUs are at capacity.
    // If preferredGpu >= 0, prefer that GPU (e.g. to match inference GPU for zero-copy).
    // Falls back to least-loaded if preferred GPU is at capacity.
    // Every successful acquireSlot() must be paired with releaseSlot() on the
    // returned GPU index, or the slot leaks.
    int acquireSlot(int preferredGpu = -1);

    // Release a HW decoder slot on the given GPU (pass the index returned by
    // acquireSlot).
    void releaseSlot(int gpuIndex);

    // Get total max sessions across all GPUs (sum of m_maxPerGpu).
    int getTotalMax() const;

    // Get number of active sessions across all GPUs (sum of m_activePerGpu).
    int getTotalActive() const;

private:
    // Singleton: construct only via instance().
    HWDecoderPool() = default;

    std::mutex m_mutex;                 // guards the three fields below
    bool m_configured = false;          // set by configure(); queried by isConfigured()
    std::vector<int> m_maxPerGpu;       // max session limit per GPU
    std::vector<int> m_activePerGpu;    // active session count per GPU
};
|
|
|
|
|
|
|
|
|
|
// ---------------------------------------------------------------------------
// SharedHWDeviceCtx -- per-GPU shared AVHWDeviceContext cache
//
// NVIDIA recommends sharing CUDA contexts across decode sessions to reduce
// GPU memory overhead. This cache creates one AVHWDeviceContext per GPU
// and shares it (via av_buffer_ref) across all decoder sessions on that GPU.
//
// Thread-safe: all methods lock internally.
// ---------------------------------------------------------------------------
class SharedHWDeviceCtx {
public:
    // Process-wide singleton accessor.
    static SharedHWDeviceCtx& instance();

    // Get (or create) a shared HW device context for the given GPU index and device type.
    // Returns a new av_buffer_ref to the shared context (caller must av_buffer_unref).
    // Returns nullptr on failure.
    AVBufferRef* acquire(int gpuIndex, AVHWDeviceType type);

    // Release all cached contexts (call at shutdown).
    // NOTE(review): outstanding refs held by live decoders keep the underlying
    // device context alive past this call — confirm against the .cpp.
    void releaseAll();

private:
    // Singleton: construct only via instance().
    SharedHWDeviceCtx() = default;
    // Destructor releases any cached contexts still held.
    ~SharedHWDeviceCtx();

    // Cached device context for one GPU index (slot i in m_cache == GPU i).
    struct GpuCtx {
        AVBufferRef* ctx = nullptr;                  // owning ref held by the cache
        AVHWDeviceType type = AV_HWDEVICE_TYPE_NONE; // device type ctx was created with
    };

    std::mutex m_mutex;           // guards m_cache
    std::vector<GpuCtx> m_cache;  // indexed by GPU index
};
|
|
|
|
|
|
|
|
|
|
typedef void (*VDCB)(AVFrame* frame, void* pUserdata);
|
|
|
|
|
|
|
|
|
|
// FFmpeg-based video decoder with optional hardware acceleration.
// A session is driven as: init() -> [Start()] -> decode()* -> [flush()] ->
// Stop()/uninit(). Decoded frames are delivered through the VDCB callback.
class CVideoDecoder
{
public:
    CVideoDecoder();
    ~CVideoDecoder();

public:
    // Initialize a decoding session.
    //   codec          : media_format codec id (int overload) or FFmpeg
    //                    AVCodecID (enum overload).
    //   extradata      : optional out-of-band codec config (e.g. SPS/PPS);
    //                    may be NULL.
    //   extradata_size : byte length of extradata.
    //   hwMode         : one of the HW_DECODING_* selectors above.
    //   preferredGpu   : GPU to prefer for HW decoding (-1 = let
    //                    HWDecoderPool pick the least-loaded GPU).
    // Returns TRUE on success.
    BOOL init(int codec, uint8* extradata = NULL, int extradata_size = 0, int hwMode = HW_DECODING_AUTO, int preferredGpu = -1);
    BOOL init(enum AVCodecID codec, uint8* extradata = NULL, int extradata_size = 0, int hwMode = HW_DECODING_AUTO, int preferredGpu = -1);
    // Tear down the session and free codec/device resources. Safe to call
    // after init() failure (implementation in the .cpp).
    void uninit();
    // Stream geometry / timing as reported by the codec context.
    int getWidth();
    int getHeight();
    double getFrameRate();

    // Decode one encoded access unit.
    //   data/len overload : raw bitstream buffer, optional presentation
    //                       timestamp (AV_NOPTS_VALUE = unknown).
    //   AVPacket overload : caller-prepared packet.
    // Returns TRUE on success; decoded frames arrive via the callback.
    BOOL decode(uint8* data, int len, int64_t pts = AV_NOPTS_VALUE);
    BOOL decode(AVPacket* pkt);
    // Register the decoded-frame callback. Not synchronized with decode() —
    // set it before Start()/decode() begins.
    void setCallback(VDCB pCallback, void* pUserdata) { m_pCallback = pCallback; m_pUserdata = pUserdata; }
    // AVCodecContext::get_format helper: pick the HW pixel format from the
    // candidate list; writes the choice to *dst. Returns TRUE if found.
    BOOL getHWFormat(AVCodecContext* ctx, const AVPixelFormat* pix_fmts, AVPixelFormat* dst);
    // Map an HW_DECODING_* mode to the platform's FFmpeg device-type name
    // (written to hwtype). Returns false if unsupported on this platform.
    bool getHardwareTypeForPlatform(int hwMode, std::string& hwtype);
    // Does the selected codec expose a hw_config for the given device type?
    bool findHwConfigForDeviceType(AVHWDeviceType type);
    // Log every AVHWDeviceType FFmpeg reports as available (diagnostics).
    void logSupportedHwTypes();
    // TRUE when init() successfully enabled a hardware decoder.
    BOOL isHardwareDecoderEnabled() const { return m_bHardwareDecoderEnabled; }
    // GPU index assigned by HWDecoderPool, or -1 in legacy (unpooled) mode.
    int getHWGpuIndex() const { return m_hwGpuIndex; }
    // true when the active HW backend is AV_HWDEVICE_TYPE_CUDA (frames stay
    // in GPU VRAM as NV12).
    bool isCudaHWAccel() const { return m_bCudaHWAccel; }
    // Returns the CUDA HW frame (device pointers). Caller takes ownership.
    AVFrame* takeCudaHWFrame();
    // Clone CUDA HW frame without locking — caller MUST already hold _mutex
    // (used by onVideoFrame callback which runs inside decode()'s lock scope).
    AVFrame* cloneCudaHWFrame_unlocked();
    // Begin/stop accepting decode work (toggles m_bRunning — see .cpp).
    void Start();
    void Stop();
    // Drain buffered frames from the codec (end-of-stream handling).
    void flush();
    // Raw access to the underlying codec context. Borrowed pointer: owned by
    // this decoder, invalidated by uninit().
    AVCodecContext* getAVCodeContext() {
        return m_pContext;
    }

private:
    // Pull decoded frame(s) from the codec after a send. Returns TRUE on success.
    BOOL readFrame();
    // Deliver one decoded frame (download from HW surface if needed, then callback).
    int render(AVFrame* frame);
    // Set up the hardware device context on ctx for hwMode / preferredGpu.
    int hwDecoderInit(AVCodecContext* ctx, int hwMode, int preferredGpu = -1);

private:
    BOOL m_bInited;                   // init() completed successfully
    std::atomic<BOOL> m_bRunning;     // Start()/Stop() gate, read cross-thread
    BOOL m_bHardwareDecoderEnabled;   // Track if hardware decoder is enabled
    bool m_bCudaHWAccel;              // true when using AV_HWDEVICE_TYPE_CUDA
    int m_hwGpuIndex;                 // GPU index assigned by HWDecoderPool (-1 = legacy)
    AVFrame* m_pCudaHWFrame;          // Cloned CUDA HW frame (device ptrs) for inference
    const AVCodec* m_pCodec;          // selected decoder (FFmpeg-owned, not freed here)
    AVCodecContext* m_pContext;       // codec context (owned; freed in uninit())
    AVFrame* m_pFrame;                // decode target frame
    AVFrame* m_pSoftFrame;            // CPU-side frame for HW-surface downloads
    VDCB m_pCallback;                 // decoded-frame callback (may be NULL)
    void* m_pUserdata;                // opaque pointer passed back to m_pCallback
    AVPixelFormat m_hwPixFmt;         // HW pixel format chosen by getHWFormat()
    AVBufferRef* m_pHWDeviceCtx;      // ref to (possibly shared) HW device context
    std::recursive_mutex _mutex;      // guards decode state; recursive so the
                                      // frame callback may re-enter (see
                                      // cloneCudaHWFrame_unlocked)
};
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
|