Files
ANSCORE/MediaClient/media/video_player.cpp
Tuan Nghia Nguyen 9f0a10a4c8 Improve ANSCV with software decoder:
Thread-local staging Mat (video_player.cpp:1400-1407) — single biggest win. Eliminates the 12 MB per-call malloc/free cycle.
Contiguous get_buffer2 allocator (video_decoder.cpp:35-102) — keeps the 3 bulk memcpys cache-friendly. Would also enable FAST/zero-copy for resolutions where visible_h % 64 == 0.
SW-decoder thread config (video_decoder.cpp:528-540) — thread_count=0, thread_type=FRAME|SLICE. FRAME is downgraded to SLICE-only by AV_CODEC_FLAG_LOW_DELAY, but decode throughput is sufficient for your input rate.
SetTargetFPS(100) delivery throttle (already there) — caps onVideoFrame post-decode work at 10 FPS. Keeps the caller path warm-cached.
Instrumentation — [MEDIA_DecInit] / [MEDIA_Convert] / [MEDIA_SWDec] / [MEDIA_Timing] / [MEDIA_JpegTiming] — always-on regression detector, zero cost when ANSCORE_DEBUGVIEW=OFF.
2026-04-20 12:18:43 +10:00

2601 lines
78 KiB
C++
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#include "sys_inc.h"
#include "media_util.h"
#include "media_parse.h"
#include "media_codec.h"
#include "h264.h"
#include "h265.h"
#include "video_player.h"
extern "C"
{
#include <libavcodec/avcodec.h>
#include <libavformat/avformat.h>
#include <libswscale/swscale.h>
#include <libswresample/swresample.h>
#include <libavutil/intreadwrite.h>
#include <libavutil/avstring.h>
#include <libavutil/base64.h>
#include <libavutil/imgutils.h>
}
#if __WINDOWS_OS__
#include "audio_play_win.h"
#elif defined(IOS)
#include "video_render_sdl.h"
#include "audio_play_mac.h"
#elif __LINUX_OS__
#include "video_render_sdl.h"
#include "audio_play_qt.h"
#endif
#include <string>
#include <vector>
#include <chrono>
#include <atomic>
#include <libswscale/swscale.h>
#if defined(__SSE2__) || defined(_M_X64) || defined(_M_AMD64)
#include <emmintrin.h>
#define HAS_SSE2 1
#endif
#include "ANSLicense.h" // ANS_DBG macro (gated by ANSCORE_DEBUGVIEW)
// libyuv: SIMD-accelerated YUV↔RGB conversion with native strided-plane input.
// Replaces the memcpy-into-staging + cv::cvtColor(COLOR_YUV2BGR_I420) chain
// in avframeYUV420PToCvMat with a direct I420→RGB24 (== OpenCV BGR memory
// order) call. When the submodule isn't checked out, ANSCORE_HAS_LIBYUV is
// not defined and we fall back to the pre-libyuv path.
#if defined(ANSCORE_HAS_LIBYUV) && ANSCORE_HAS_LIBYUV
#include "libyuv/convert_argb.h" // libyuv::I420ToRGB24
#endif
// Trampoline registered with the decoder: forwards each decoded video
// frame to the owning CVideoPlayer instance carried in `userdata`.
void VideoDecoderCallback(AVFrame* frame, void* userdata)
{
    auto* player = static_cast<CVideoPlayer*>(userdata);
    player->onVideoFrame(frame);
}
// Trampoline registered with the decoder: forwards each decoded audio
// frame to the owning CVideoPlayer instance carried in `userdata`.
void AudioDecoderCallback(AVFrame* frame, void* userdata)
{
    auto* player = static_cast<CVideoPlayer*>(userdata);
    player->onAudioFrame(frame);
}
// Store the crop bounding box (consumed later by cropFrame) under the player lock.
void CVideoPlayer::setBbox(cv::Rect bbox) {
    std::lock_guard<std::recursive_mutex> guard(_mutex);
    m_Bbox = bbox;
}
// Enable/disable cropping (consumed later by cropFrame) under the player lock.
void CVideoPlayer::setCrop(bool crop) {
    std::lock_guard<std::recursive_mutex> guard(_mutex);
    m_bCrop = crop;
}
// Build a zero-copy "view" frame whose plane pointers alias srcFrame's pixel
// data, offset to the (clamped) bounding box. No pixel buffers are allocated
// and no refs are taken, so the returned frame is only valid while srcFrame
// is alive; free it with av_frame_free (releases only the AVFrame shell).
// Returns nullptr when cropping is disabled, playback has stopped, the source
// is null, or the clamped box is degenerate (<= 10 px in either dimension) —
// callers treat nullptr as "use the original frame".
AVFrame* CVideoPlayer::cropFrame(const AVFrame* srcFrame, cv::Rect bBox, bool cropFlag) {
std::lock_guard<std::recursive_mutex> lock(_mutex);
try {
// Validate prerequisites
if (!cropFlag || !srcFrame || !m_bPlaying) {
return nullptr;
}
// Ensure the bounding box is within the source frame's boundaries
// (x/y clamped first, then width/height clamped to what remains).
bBox.x = std::clamp(bBox.x, 0, srcFrame->width);
bBox.y = std::clamp(bBox.y, 0, srcFrame->height);
bBox.width = std::clamp(bBox.width, 0, srcFrame->width - bBox.x);
bBox.height = std::clamp(bBox.height, 0, srcFrame->height - bBox.y);
// Validate the bounding box dimensions
if (bBox.width <= 10 || bBox.height <= 10) {
std::cerr << "Invalid bounding box dimensions for cropping." << std::endl;
return nullptr;
}
// Allocate memory for the cropped frame
AVFrame* croppedFrame = av_frame_alloc();
if (!croppedFrame) {
std::cerr << "Failed to allocate memory for the cropped frame." << std::endl;
return nullptr;
}
// Set cropped frame attributes
croppedFrame->format = srcFrame->format;
croppedFrame->width = bBox.width;
croppedFrame->height = bBox.height;
// *** REMOVED: Don't allocate buffer since we're only setting pointers ***
// The cropFrameData() function will set pointers to the original frame's data
// Crop the frame based on its format
if (!cropFrameData(srcFrame, croppedFrame, bBox)) {
av_frame_free(&croppedFrame);
return nullptr;
}
return croppedFrame;
}
catch (const std::exception& e) {
std::cerr << "Exception in CVideoPlayer::cropFrame: " << e.what() << std::endl;
return nullptr;
}
}
// Redirect croppedFrame's plane pointers into srcFrame according to the
// format's chroma subsampling. Returns false for unsupported pixel formats.
bool CVideoPlayer::cropFrameData(const AVFrame* srcFrame, AVFrame* croppedFrame, const cv::Rect& bBox) {
    std::lock_guard<std::recursive_mutex> guard(_mutex);
    try {
        const int x = bBox.x;
        const int y = bBox.y;
        switch (srcFrame->format) {
        case AV_PIX_FMT_YUVJ444P:
        case AV_PIX_FMT_YUV444P:
            // 4:4:4 — all three planes are full resolution.
            cropPlane(srcFrame, croppedFrame, 0, x, y, 1, 1);
            cropPlane(srcFrame, croppedFrame, 1, x, y, 1, 1);
            cropPlane(srcFrame, croppedFrame, 2, x, y, 1, 1);
            return true;
        case AV_PIX_FMT_YUVJ422P:
        case AV_PIX_FMT_YUV422P:
            // 4:2:2 — chroma halved horizontally only.
            cropPlane(srcFrame, croppedFrame, 0, x, y, 1, 1);
            cropPlane(srcFrame, croppedFrame, 1, x / 2, y, 1, 1);
            cropPlane(srcFrame, croppedFrame, 2, x / 2, y, 1, 1);
            return true;
        case AV_PIX_FMT_YUVJ420P:
        case AV_PIX_FMT_YUV420P:
            // 4:2:0 — chroma halved in both directions.
            cropPlane(srcFrame, croppedFrame, 0, x, y, 1, 1);
            cropPlane(srcFrame, croppedFrame, 1, x / 2, y / 2, 1, 1);
            cropPlane(srcFrame, croppedFrame, 2, x / 2, y / 2, 1, 1);
            return true;
        case AV_PIX_FMT_NV12:
            // NV12 — single interleaved UV plane; stepX=2 converts the
            // chroma-sample offset into a byte offset within the pair row.
            cropPlane(srcFrame, croppedFrame, 0, x, y, 1, 1);
            cropPlane(srcFrame, croppedFrame, 1, x / 2, y / 2, 2, 1);
            return true;
        default:
            std::cerr << "Unsupported pixel format: " << av_get_pix_fmt_name((AVPixelFormat)srcFrame->format) << std::endl;
            return false;
        }
    }
    catch (const std::exception& e) {
        std::cerr << "Exception in cropFrameData: " << e.what() << std::endl;
        return false;
    }
}
// Alias one plane of croppedFrame onto the cropped region of srcFrame.
// No pixels are copied; the destination keeps the source stride.
// offsetX is pre-scaled to chroma samples by the caller; subsampleX turns it
// into a byte offset (2 for NV12's interleaved UV pairs). subsampleY is
// currently unused but kept for interface symmetry.
void CVideoPlayer::cropPlane(const AVFrame* srcFrame, AVFrame* croppedFrame, int planeIndex, int offsetX, int offsetY, int subsampleX, int subsampleY) {
    std::lock_guard<std::recursive_mutex> guard(_mutex);
    try {
        uint8_t* base = srcFrame->data[planeIndex];
        const int stride = srcFrame->linesize[planeIndex];
        croppedFrame->data[planeIndex] = base + offsetY * stride + offsetX * subsampleX;
        croppedFrame->linesize[planeIndex] = stride;
    }
    catch (const std::exception& e) {
        std::cerr << "Exception in cropPlane: " << e.what() << std::endl;
    }
}
// Convert an NV12 AVFrame into a newly allocated, owned YUVJ420P frame.
// The Y plane is copied row by row (source/destination strides may differ);
// the interleaved NV12 UV plane is deinterleaved into planar U and V using
// the same SSE2 kernel as encodeNV12ToJPEG_TurboJPEG, with a scalar tail.
// Returns nullptr on invalid input or allocation failure; caller frees the
// result with av_frame_free.
AVFrame* CVideoPlayer::convertNV12ToYUVJ420P(const AVFrame* nv12Frame) {
    std::lock_guard<std::recursive_mutex> lock(_mutex);
    AVFrame* yuvjFrame = nullptr;
    try {
        if (!nv12Frame || !nv12Frame->data[0] || nv12Frame->width <= 10 || nv12Frame->height <= 10) {
            std::cerr << "Invalid or empty NV12 frame data, or invalid dimensions." << std::endl;
            return nullptr;
        }
        int width = nv12Frame->width;
        int height = nv12Frame->height;
        // Allocate the destination frame shell.
        yuvjFrame = av_frame_alloc();
        if (!yuvjFrame) {
            std::cerr << "Failed to allocate YUVJ420P frame" << std::endl;
            return nullptr;
        }
        yuvjFrame->format = AV_PIX_FMT_YUVJ420P;
        yuvjFrame->width = width;
        yuvjFrame->height = height;
        // YUVJ420P is full-range by definition; tag it so downstream
        // consumers (MJPEG encoder, sws) don't assume limited range.
        yuvjFrame->color_range = AVCOL_RANGE_JPEG;
        // Allocate the pixel buffers (32-byte aligned).
        if (av_frame_get_buffer(yuvjFrame, 32) < 0) {
            std::cerr << "Failed to allocate buffer for YUVJ420P" << std::endl;
            av_frame_free(&yuvjFrame);
            return nullptr;
        }
        // Copy the luma plane row by row (strides may differ between frames).
        for (int j = 0; j < height; ++j) {
            memcpy(yuvjFrame->data[0] + j * yuvjFrame->linesize[0],
                nv12Frame->data[0] + j * nv12Frame->linesize[0], width);
        }
        // Deinterleave the NV12 UV plane into planar U and V.
        uint8_t* nv12_uv = nv12Frame->data[1];
        uint8_t* yuvj_u = yuvjFrame->data[1];
        uint8_t* yuvj_v = yuvjFrame->data[2];
        int uvWidth = width / 2;
        int uvHeight = height / 2;
        for (int j = 0; j < uvHeight; ++j) {
            uint8_t* nv12Row = nv12_uv + j * nv12Frame->linesize[1];
            uint8_t* uRow = yuvj_u + j * yuvjFrame->linesize[1];
            uint8_t* vRow = yuvj_v + j * yuvjFrame->linesize[2];
            int i = 0;
#ifdef HAS_SSE2
            // SSE2 fast path: 16 UV pairs (32 bytes) per iteration — same
            // kernel as the deinterleave in encodeNV12ToJPEG_TurboJPEG.
            for (; i + 15 < uvWidth; i += 16) {
                __m128i uv0 = _mm_loadu_si128((__m128i*)(nv12Row + i * 2));
                __m128i uv1 = _mm_loadu_si128((__m128i*)(nv12Row + i * 2 + 16));
                // Even bytes = U, odd bytes = V.
                __m128i mask = _mm_set1_epi16(0x00FF);
                __m128i u0 = _mm_and_si128(uv0, mask);
                __m128i u1 = _mm_and_si128(uv1, mask);
                __m128i v0 = _mm_srli_epi16(uv0, 8);
                __m128i v1 = _mm_srli_epi16(uv1, 8);
                _mm_storeu_si128((__m128i*)(uRow + i), _mm_packus_epi16(u0, u1));
                _mm_storeu_si128((__m128i*)(vRow + i), _mm_packus_epi16(v0, v1));
            }
#endif
            // Scalar tail (and full path on non-SSE2 builds).
            for (; i < uvWidth; ++i) {
                uRow[i] = nv12Row[i * 2];     // Extract U
                vRow[i] = nv12Row[i * 2 + 1]; // Extract V
            }
        }
        return yuvjFrame;
    }
    catch (const std::exception& e) {
        std::cerr << "Exception in convertNV12ToYUVJ420P: " << e.what() << std::endl;
        // Prevent a leak of the partially built frame.
        if (yuvjFrame) {
            av_frame_free(&yuvjFrame);
        }
        return nullptr;
    }
}
std::string CVideoPlayer::avframeYUVJ420PToJpegStringUsingFFMpeg(const AVFrame* pFrame) {
std::lock_guard<std::recursive_mutex> lock(_mutex);
try {
if (!m_bPlaying) {
return "";
}
if (!pFrame || !pFrame->data[0] || pFrame->width <= 10 || pFrame->height <= 10) {
std::cerr << "Invalid or empty frame data, or invalid dimensions." << std::endl;
return "";
}
AVCodec* jpegCodec = avcodec_find_encoder(AV_CODEC_ID_MJPEG);
if (!jpegCodec) {
std::cerr << "Failed to find MJPEG encoder." << std::endl;
return "";
}
AVCodecContext* jpegContext = avcodec_alloc_context3(jpegCodec);
if (!jpegContext) {
std::cerr << "Failed to allocate codec context." << std::endl;
return "";
}
int imageSize = std::max(pFrame->width, pFrame->height);
AVPixelFormat pixFmt = AV_PIX_FMT_YUVJ420P;// Fix to use YUVJ420P for all resolutions
jpegContext->pix_fmt = pixFmt;
jpegContext->time_base.num = 1;
jpegContext->time_base.den = 30;
jpegContext->compression_level = 10;
jpegContext->flags |= AV_CODEC_FLAG_QSCALE; // Enable quality scale
jpegContext->global_quality = 90 * FF_QP2LAMBDA; // Adjust quality (90 is near lossless)
AVFrame* convertedFrame = nullptr;
AVPacket packet;
av_init_packet(&packet);
packet.data = nullptr;
packet.size = 0;
bool isSuccess = false;
std::string jpegData;
// Determine if conversion is needed based on the pixel format
if ((pFrame->format == AV_PIX_FMT_YUVJ420P) ||
(pFrame->format == AV_PIX_FMT_YUV420P))
{
jpegContext->width = pFrame->width;
jpegContext->height = pFrame->height;
if (avcodec_open2(jpegContext, jpegCodec, NULL) >= 0) {
if (avcodec_send_frame(jpegContext, pFrame) >= 0) {
if (avcodec_receive_packet(jpegContext, &packet) >= 0) {
jpegData.assign(reinterpret_cast<char*>(packet.data), packet.size);
m_Width = pFrame->width;
m_Height = pFrame->height;
m_pts = m_pts + 1;
isSuccess = true;
}
}
}
}
else {
// Conversion is needed to AV_PIX_FMT_YUVJ420P
initSwsContext(pFrame->width, pFrame->height, static_cast<AVPixelFormat>(pFrame->format));
convertedFrame = av_frame_alloc();
if (convertedFrame) {
convertedFrame->format = pixFmt;
convertedFrame->width = pFrame->width;
convertedFrame->height = pFrame->height;
convertedFrame->color_range = AVCOL_RANGE_JPEG;
if (av_frame_get_buffer(convertedFrame, 32) >= 0) {
sws_scale(swsCtx, pFrame->data, pFrame->linesize, 0, pFrame->height,
convertedFrame->data, convertedFrame->linesize);
jpegContext->width = convertedFrame->width;
jpegContext->height = convertedFrame->height;
if (avcodec_open2(jpegContext, jpegCodec, NULL) >= 0) {
if (avcodec_send_frame(jpegContext, convertedFrame) >= 0) {
if (avcodec_receive_packet(jpegContext, &packet) >= 0) {
// Successfully encoded to JPEG
jpegData.assign(reinterpret_cast<char*>(packet.data), packet.size);
m_Width = convertedFrame->width;
m_Height = convertedFrame->height;
m_pts = m_pts + 1;
isSuccess = true;
}
}
}
}
}
av_frame_free(&convertedFrame); // Free the converted frame if allocated
}
// Cleanup
av_packet_unref(&packet); // Free the packet data
avcodec_free_context(&jpegContext); // Free the codec context
// Return the JPEG data as a string if successful, otherwise an empty string
return isSuccess ? jpegData : "";
}
catch (const std::exception& e) {
std::cerr << "Exception in avframeToJpegString: " << e.what() << std::endl;
return ""; // Return empty string on error
}
}
// Compress a planar YUV(J)420P frame to JPEG via TurboJPEG (quality 85,
// fast DCT). Uses a thread-local output buffer to avoid per-call malloc.
// Returns the JPEG bytes, or "" on invalid input / compression failure.
std::string CVideoPlayer::avframeYUVJ420PToJpegStringUsingTurboJPEG(const AVFrame* pFrame) {
    std::lock_guard<std::recursive_mutex> lock(_mutex);
    try {
        if (!m_bPlaying || !pFrame || !pFrame->data[0] || pFrame->width <= 10 || pFrame->height <= 10) {
            return "";
        }
        // Ensure TurboJPEG instance is valid.
        if (!_tjInstance) {
            return "";
        }
        unsigned char* yuvPlanes[3] = { pFrame->data[0], pFrame->data[1], pFrame->data[2] };
        int strides[3] = { pFrame->linesize[0], pFrame->linesize[1], pFrame->linesize[2] };
        int width = pFrame->width;
        int height = pFrame->height;
        constexpr int subsampling = TJSAMP_420;
        constexpr int quality = 85;
        // Thread-local output buffer, grown on demand, to avoid malloc/free
        // overhead on every frame. (The previously declared yuvBuffer was
        // never used and has been removed.)
        static thread_local std::vector<unsigned char> jpegBuffer;
        unsigned long jpegBufferSize = tjBufSize(width, height, subsampling);
        if (jpegBuffer.size() < jpegBufferSize) {
            jpegBuffer.resize(jpegBufferSize);
        }
        unsigned char* jpegDataPtr = jpegBuffer.data();
        // BUGFIX: TurboJPEG only reuses a caller-supplied buffer when *jpegSize
        // carries its capacity on entry; passing 0 made it allocate a fresh
        // buffer on every call, defeating the thread-local optimization.
        unsigned long jpegSize = jpegBufferSize;
        // Convert YUV planes to JPEG using TurboJPEG.
        int ret = tjCompressFromYUVPlanes(
            _tjInstance,
            (const unsigned char**)yuvPlanes,
            width,
            strides,
            height,
            subsampling,
            &jpegDataPtr, // Using preallocated buffer
            &jpegSize,
            quality,
            TJFLAG_FASTDCT | TJFLAG_FASTUPSAMPLE
        );
        if (ret < 0) {
            return "";
        }
        // If TurboJPEG still reallocated (shouldn't happen with the capacity
        // set, but defensively), copy out and free its buffer.
        if (jpegDataPtr != jpegBuffer.data()) {
            std::string jpegString(reinterpret_cast<char*>(jpegDataPtr), jpegSize);
            tjFree(jpegDataPtr);
            return jpegString;
        }
        // Convert to std::string (single copy out of the reused buffer).
        return std::string(reinterpret_cast<char*>(jpegDataPtr), jpegSize);
    }
    catch (const std::exception& e) {
        std::cerr << "Exception in avframeToJpegString: " << e.what() << std::endl;
        return ""; // Return empty string on error
    }
}
// Encode a YUVJ420P frame to JPEG with FFmpeg's MJPEG encoder using optimal
// Huffman tables. `quality` maps to the encoder's qscale (higher = better).
// Returns the JPEG bytes, or "" on any failure.
std::string CVideoPlayer::encodeYUVJ420PToJPEG(AVFrame* frame, int quality) {
    std::lock_guard<std::recursive_mutex> lock(_mutex);
    try {
        if (!frame || frame->format != AV_PIX_FMT_YUVJ420P) {
            std::cerr << "Invalid frame format (must be YUVJ420P)" << std::endl;
            return "";
        }
        // Find MJPEG encoder.
        AVCodec* codec = avcodec_find_encoder(AV_CODEC_ID_MJPEG);
        if (!codec) {
            std::cerr << "JPEG encoder not found" << std::endl;
            return "";
        }
        // Allocate codec context.
        AVCodecContext* codecCtx = avcodec_alloc_context3(codec);
        if (!codecCtx) {
            std::cerr << "Failed to allocate codec context" << std::endl;
            return "";
        }
        // Set encoding parameters.
        codecCtx->pix_fmt = AV_PIX_FMT_YUVJ420P; // Use full-range YUV for better quality
        codecCtx->width = frame->width;
        codecCtx->height = frame->height;
        codecCtx->time_base.num = 1;
        codecCtx->time_base.den = 30;
        codecCtx->gop_size = 1;
        codecCtx->max_b_frames = 0;
        codecCtx->compression_level = 10; // Increase quality
        codecCtx->flags |= AV_CODEC_FLAG_QSCALE; // Enable quality scale
        codecCtx->global_quality = quality * FF_QP2LAMBDA; // Adjust quality (90 is near lossless)
        // Enable optimal Huffman tables.
        AVDictionary* opts = nullptr;
        av_dict_set(&opts, "huffman", "optimal", 0);
        // Open codec.
        if (avcodec_open2(codecCtx, codec, &opts) < 0) {
            std::cerr << "Failed to open JPEG encoder" << std::endl;
            av_dict_free(&opts); // BUGFIX: opts was leaked on this path
            avcodec_free_context(&codecCtx);
            return "";
        }
        // BUGFIX: free the dictionary right after open — avcodec_open2 leaves
        // unconsumed entries in it, and the original code leaked it on every
        // send/receive error return below.
        av_dict_free(&opts);
        AVPacket pkt;
        av_init_packet(&pkt);
        pkt.data = nullptr;
        pkt.size = 0;
        // Send frame to encoder.
        if (avcodec_send_frame(codecCtx, frame) < 0) {
            std::cerr << "Failed to send frame for encoding" << std::endl;
            avcodec_free_context(&codecCtx);
            return "";
        }
        // Receive encoded packet.
        if (avcodec_receive_packet(codecCtx, &pkt) < 0) {
            std::cerr << "Failed to receive encoded packet" << std::endl;
            avcodec_free_context(&codecCtx);
            return "";
        }
        // Convert to string and clean up.
        std::string jpegString(reinterpret_cast<char*>(pkt.data), pkt.size);
        av_packet_unref(&pkt);
        avcodec_free_context(&codecCtx);
        return jpegString;
    }
    catch (const std::exception& e) {
        std::cerr << "Exception in encodeYUVJ420PToJPEG: " << e.what() << std::endl;
        return ""; // Return empty string on error
    }
}
// Top-level "frame → JPEG" path for planar YUV pipelines.
// Normalizes the input (NV12 → YUVJ420P, other formats → YUVJ420P via sws),
// optionally crops to m_Bbox, then compresses with TurboJPEG. On success
// updates m_Width/m_Height and bumps m_pts. Returns "" on any failure.
// Ownership notes: croppedFrame may alias pFrame/convertedFrame (cropFrame
// returns nullptr on failure and we then reuse the source), so the frees
// below are guarded by pointer-identity checks — preserve their order.
std::string CVideoPlayer::avframeYUVJ420PToJpegString(const AVFrame* spFrame) {
std::lock_guard<std::recursive_mutex> lock(_mutex);
AVFrame* croppedFrame = nullptr;
AVFrame* convertedFrame = nullptr;
AVFrame* convertedNV12Frame = nullptr;
AVFrame* pFrame = const_cast<AVFrame*>(spFrame); // Default to original frame
bool isSuccess = false;
std::string jpegData;
try {
if (!m_bPlaying) {
return "";
}
if (!spFrame || !spFrame->data[0] || spFrame->width <= 10 || spFrame->height <= 10) {
std::cerr << "Invalid or empty frame data, or invalid dimensions." << std::endl;
return "";
}
// Convert NV12 to YUVJ420P if needed; on conversion failure pFrame stays
// NV12 and falls through to the sws conversion branch below.
if (pFrame->format == AV_PIX_FMT_NV12) {
convertedNV12Frame = convertNV12ToYUVJ420P(spFrame);
if (convertedNV12Frame) {
pFrame = convertedNV12Frame; // Use the converted frame
}
}
// Process the frame if it's already in YUVJ420P or YUV420P.
if ((pFrame->format == AV_PIX_FMT_YUVJ420P) || (pFrame->format == AV_PIX_FMT_YUV420P)) {
croppedFrame = cropFrame(pFrame, m_Bbox, m_bCrop);
if (!croppedFrame) {
croppedFrame = pFrame; // Use original frame if cropping failed
}
// TurboJPEG handles all resolutions efficiently
jpegData = avframeYUVJ420PToJpegStringUsingTurboJPEG(croppedFrame);
if (!jpegData.empty()) {
m_Width = croppedFrame->width;
m_Height = croppedFrame->height;
m_pts = m_pts + 1;
isSuccess = true;
}
// Free cropped frame only when it is a distinct crop view (not an alias).
if (croppedFrame != pFrame) {
av_frame_free(&croppedFrame);
croppedFrame = nullptr;
}
}
else { // Convert non-YUVJ420P frames via the shared SwsContext.
initSwsContext(pFrame->width, pFrame->height, static_cast<AVPixelFormat>(pFrame->format));
convertedFrame = av_frame_alloc();
if (convertedFrame) {
convertedFrame->format = AV_PIX_FMT_YUVJ420P;
convertedFrame->width = pFrame->width;
convertedFrame->height = pFrame->height;
convertedFrame->color_range = AVCOL_RANGE_JPEG;
if (av_frame_get_buffer(convertedFrame, 32) >= 0) {
sws_scale(swsCtx, pFrame->data, pFrame->linesize, 0, pFrame->height,
convertedFrame->data, convertedFrame->linesize);
croppedFrame = cropFrame(convertedFrame, m_Bbox, m_bCrop);
if (!croppedFrame) {
croppedFrame = convertedFrame; // Use converted frame if cropping failed
}
// TurboJPEG handles all resolutions efficiently
jpegData = avframeYUVJ420PToJpegStringUsingTurboJPEG(croppedFrame);
if (!jpegData.empty()) {
m_Width = croppedFrame->width;
m_Height = croppedFrame->height;
m_pts = m_pts + 1;
isSuccess = true;
}
// Free cropped frame only when distinct from the converted frame.
if (croppedFrame != convertedFrame) {
av_frame_free(&croppedFrame);
croppedFrame = nullptr;
}
}
}
// Free converted frame if allocated.
if (convertedFrame) {
av_frame_free(&convertedFrame);
convertedFrame = nullptr;
}
}
// Free the NV12 converted frame if used (pFrame may alias it — freed last).
if (convertedNV12Frame) {
av_frame_free(&convertedNV12Frame);
convertedNV12Frame = nullptr;
}
return isSuccess ? jpegData : "";
}
catch (const std::exception& e) {
std::cerr << "Exception in avframeYUVJ420PToJpegString: " << e.what() << std::endl;
// Ensure all allocated frames are freed in case of an exception
// (identity checks avoid double-freeing aliased pointers).
if (croppedFrame && croppedFrame != pFrame && croppedFrame != convertedFrame) {
av_frame_free(&croppedFrame);
}
if (convertedFrame) {
av_frame_free(&convertedFrame);
}
if (convertedNV12Frame) {
av_frame_free(&convertedNV12Frame);
}
return "";
}
}
// Direct NV12 → JPEG using TurboJPEG.
// The interleaved NV12 UV plane is deinterleaved (SSE2 fast path + scalar
// tail) into thread-local planar U/V buffers, then compressed with
// tjCompressFromYUVPlanes. Returns the JPEG bytes, or "" on error.
std::string CVideoPlayer::encodeNV12ToJPEG_TurboJPEG(const AVFrame* pFrame, int quality) {
    // NOTE: caller (avframeToJpegString) already holds _mutex — no lock needed here
    try {
        if (!m_bPlaying || !pFrame || !pFrame->data[0] || pFrame->width <= 10 || pFrame->height <= 10) {
            return "";
        }
        // Ensure TurboJPEG instance is valid.
        if (!_tjInstance) {
            std::cerr << "TurboJPEG instance is not initialized." << std::endl;
            return "";
        }
        // Ensure the frame format is NV12.
        if (pFrame->format != AV_PIX_FMT_NV12) {
            std::cerr << "Unsupported format! Expected NV12, got: "
                << av_get_pix_fmt_name((AVPixelFormat)pFrame->format) << std::endl;
            return "";
        }
        int width = pFrame->width;
        int height = pFrame->height;
        // NV12 has interleaved UV, but TurboJPEG requires separate U and V planes.
        unsigned char* yuvPlanes[3];
        int strides[3];
        yuvPlanes[0] = pFrame->data[0]; // Y plane (full resolution)
        strides[0] = pFrame->linesize[0];
        int uvWidth = width / 2;
        int uvHeight = height / 2;
        int uvSize = uvWidth * uvHeight;
        // BUGFIX: these buffers were previously constructed as vector(uvSize),
        // which sizes them ONLY on the first call per thread — a later, larger
        // frame then wrote past the end (heap overflow). Grow on demand.
        static thread_local std::vector<unsigned char> uPlane;
        static thread_local std::vector<unsigned char> vPlane;
        if (uPlane.size() < static_cast<size_t>(uvSize)) uPlane.resize(uvSize);
        if (vPlane.size() < static_cast<size_t>(uvSize)) vPlane.resize(uvSize);
        // Deinterleave NV12 UV plane into separate U and V planes.
        unsigned char* uvData = pFrame->data[1];
        int uvStride = pFrame->linesize[1];
        for (int j = 0; j < uvHeight; j++) {
            const unsigned char* uvRow = uvData + j * uvStride;
            unsigned char* uRow = uPlane.data() + j * uvWidth;
            unsigned char* vRow = vPlane.data() + j * uvWidth;
            int i = 0;
#ifdef HAS_SSE2
            // SSE2: process 16 UV pairs (32 bytes) at a time.
            for (; i + 15 < uvWidth; i += 16) {
                __m128i uv0 = _mm_loadu_si128((__m128i*)(uvRow + i * 2));
                __m128i uv1 = _mm_loadu_si128((__m128i*)(uvRow + i * 2 + 16));
                // Deinterleave: even bytes = U, odd bytes = V.
                __m128i mask = _mm_set1_epi16(0x00FF);
                __m128i u0 = _mm_and_si128(uv0, mask);
                __m128i u1 = _mm_and_si128(uv1, mask);
                __m128i v0 = _mm_srli_epi16(uv0, 8);
                __m128i v1 = _mm_srli_epi16(uv1, 8);
                __m128i uPacked = _mm_packus_epi16(u0, u1);
                __m128i vPacked = _mm_packus_epi16(v0, v1);
                _mm_storeu_si128((__m128i*)(uRow + i), uPacked);
                _mm_storeu_si128((__m128i*)(vRow + i), vPacked);
            }
#endif
            // Scalar fallback for remaining pixels.
            for (; i < uvWidth; i++) {
                uRow[i] = uvRow[i * 2];
                vRow[i] = uvRow[i * 2 + 1];
            }
        }
        // Assign separate planes to TurboJPEG input.
        yuvPlanes[1] = uPlane.data();
        yuvPlanes[2] = vPlane.data();
        strides[1] = uvWidth;
        strides[2] = uvWidth;
        // Thread-local JPEG output buffer to avoid malloc/free overhead.
        static thread_local std::vector<unsigned char> jpegBuffer;
        unsigned long jpegBufferSize = tjBufSize(width, height, TJSAMP_420);
        if (jpegBuffer.size() < jpegBufferSize) {
            jpegBuffer.resize(jpegBufferSize);
        }
        unsigned char* jpegDataPtr = jpegBuffer.data();
        // BUGFIX: *jpegSize must carry the capacity of a caller-supplied
        // buffer on entry; passing 0 made TurboJPEG reallocate every call.
        unsigned long jpegSize = jpegBufferSize;
        // Convert NV12 (separated into YUV420P planes) to JPEG using TurboJPEG.
        int ret = tjCompressFromYUVPlanes(
            _tjInstance,
            (const unsigned char**)yuvPlanes,
            width,
            strides,
            height,
            TJSAMP_420, // Explicitly define subsampling format for NV12
            &jpegDataPtr, // Preallocated buffer
            &jpegSize,
            quality,
            TJFLAG_FASTDCT | TJFLAG_FASTUPSAMPLE
        );
        if (ret < 0) {
            std::cerr << "TurboJPEG compression failed: " << tjGetErrorStr() << std::endl;
            return "";
        }
        // If TurboJPEG allocated a new buffer, free it after copying.
        if (jpegDataPtr != jpegBuffer.data()) {
            std::string jpegString(reinterpret_cast<char*>(jpegDataPtr), jpegSize);
            tjFree(jpegDataPtr);
            return jpegString;
        }
        // Convert to std::string without extra allocations.
        return std::string(reinterpret_cast<char*>(jpegDataPtr), jpegSize);
    }
    catch (const std::exception& e) {
        std::cerr << "Exception in avframeNV12ToJpegStringUsingTurboJPEG: " << e.what() << std::endl;
        return ""; // Return empty string on error
    }
}
// NV12 → JPEG via FFmpeg's MJPEG encoder: deinterleaves NV12 into a fresh
// YUVJ420P frame, then encodes one packet. Returns JPEG bytes or "".
std::string CVideoPlayer::encodeNV12ToJPEG_FFmpeg(const AVFrame* nv12Frame, int quality) {
    // NOTE: caller (avframeToJpegString) already holds _mutex — no lock needed here
    AVCodecContext* codecCtx = nullptr;
    AVFrame* yuvjFrame = nullptr;
    AVPacket pkt;
    // BUGFIX: initialize the packet up front — the catch-path av_packet_unref
    // below previously ran on an uninitialized AVPacket (undefined behavior)
    // when an exception fired before the mid-function av_init_packet.
    av_init_packet(&pkt);
    pkt.data = nullptr;
    pkt.size = 0;
    try {
        if (!m_bPlaying || !nv12Frame || !nv12Frame->data[0] || nv12Frame->width <= 10 || nv12Frame->height <= 10) {
            return "";
        }
        if (nv12Frame->format != AV_PIX_FMT_NV12) {
            std::cerr << "Invalid frame format! Expected NV12." << std::endl;
            return "";
        }
        int width = nv12Frame->width;
        int height = nv12Frame->height;
        // Find and allocate MJPEG encoder.
        AVCodec* jpegCodec = avcodec_find_encoder(AV_CODEC_ID_MJPEG);
        if (!jpegCodec) {
            std::cerr << "MJPEG encoder not found!" << std::endl;
            return "";
        }
        codecCtx = avcodec_alloc_context3(jpegCodec);
        if (!codecCtx) {
            std::cerr << "Failed to allocate codec context!" << std::endl;
            return "";
        }
        // Set encoding parameters.
        codecCtx->pix_fmt = AV_PIX_FMT_YUVJ420P;
        codecCtx->width = width;
        codecCtx->height = height;
        codecCtx->time_base = { 1, 25 };
        codecCtx->gop_size = 1;
        codecCtx->max_b_frames = 0;
        codecCtx->compression_level = 10;
        codecCtx->flags |= AV_CODEC_FLAG_QSCALE;
        codecCtx->global_quality = quality * FF_QP2LAMBDA;
        if (avcodec_open2(codecCtx, jpegCodec, nullptr) < 0) {
            std::cerr << "Failed to open MJPEG encoder!" << std::endl;
            avcodec_free_context(&codecCtx);
            return "";
        }
        // Allocate the intermediate YUVJ420P frame.
        yuvjFrame = av_frame_alloc();
        if (!yuvjFrame) {
            std::cerr << "Failed to allocate YUVJ420P frame!" << std::endl;
            avcodec_free_context(&codecCtx);
            return "";
        }
        yuvjFrame->format = AV_PIX_FMT_YUVJ420P;
        yuvjFrame->width = width;
        yuvjFrame->height = height;
        if (av_frame_get_buffer(yuvjFrame, 32) < 0) {
            std::cerr << "Failed to allocate buffer for YUVJ420P frame!" << std::endl;
            av_frame_free(&yuvjFrame);
            avcodec_free_context(&codecCtx);
            return "";
        }
        // Copy Y plane row by row (strides may differ between frames).
        for (int j = 0; j < height; ++j) {
            memcpy(yuvjFrame->data[0] + j * yuvjFrame->linesize[0],
                nv12Frame->data[0] + j * nv12Frame->linesize[0], width);
        }
        // Deinterleave the NV12 UV plane into planar U and V.
        uint8_t* nv12_uv = nv12Frame->data[1];
        uint8_t* yuvj_u = yuvjFrame->data[1];
        uint8_t* yuvj_v = yuvjFrame->data[2];
        int uvWidth = width / 2;
        int uvHeight = height / 2;
        for (int j = 0; j < uvHeight; ++j) {
            uint8_t* nv12Row = nv12_uv + j * nv12Frame->linesize[1];
            uint8_t* uRow = yuvj_u + j * yuvjFrame->linesize[1];
            uint8_t* vRow = yuvj_v + j * yuvjFrame->linesize[2];
            for (int i = 0; i < uvWidth; ++i) {
                uRow[i] = nv12Row[i * 2]; // Extract U
                vRow[i] = nv12Row[i * 2 + 1]; // Extract V
            }
        }
        // Encode frame to JPEG (pkt already initialized above).
        bool isSuccess = false;
        std::string jpegData;
        if (avcodec_send_frame(codecCtx, yuvjFrame) >= 0) {
            if (avcodec_receive_packet(codecCtx, &pkt) >= 0) {
                jpegData.assign(reinterpret_cast<char*>(pkt.data), pkt.size);
                isSuccess = true;
            }
        }
        // Cleanup.
        av_packet_unref(&pkt);
        av_frame_free(&yuvjFrame);
        avcodec_free_context(&codecCtx);
        return isSuccess ? jpegData : "";
    }
    catch (const std::exception& e) {
        std::cerr << "Exception in encodeNV12ToJPEG_FFmpeg: " << e.what() << std::endl;
    }
    // Ensure memory cleanup in case of exceptions (pkt is always initialized).
    if (yuvjFrame) av_frame_free(&yuvjFrame);
    if (codecCtx) avcodec_free_context(&codecCtx);
    av_packet_unref(&pkt);
    return ""; // Return empty string on error
}
// Top-level "frame → JPEG" path for the NV12 pipeline.
// NV12 frames are (optionally) cropped and compressed directly with
// TurboJPEG; every other format is first converted to NV12 through the
// shared SwsContext. On success updates m_Width/m_Height and bumps m_pts.
// Returns "" on any failure.
// Ownership notes: croppedFrame may alias pFrame/convertedFrame (cropFrame
// returns nullptr on failure and we then reuse the source), so the frees
// below are guarded by pointer-identity checks — preserve their order.
std::string CVideoPlayer::avframeToJpegString(const AVFrame* spFrame) {
std::lock_guard<std::recursive_mutex> lock(_mutex);
AVFrame* croppedFrame = nullptr;
AVFrame* convertedFrame = nullptr;
AVFrame* pFrame = const_cast<AVFrame*>(spFrame);
bool isSuccess = false;
std::string jpegData;
try {
if (!m_bPlaying) {
return "";
}
if (!spFrame || !spFrame->data[0] || spFrame->width <= 10 || spFrame->height <= 10) {
std::cerr << "Invalid or empty frame data, or invalid dimensions." << std::endl;
return "";
}
// Process NV12 frames directly (TurboJPEG path; quality uses the
// encodeNV12ToJPEG_TurboJPEG default).
if (pFrame->format == AV_PIX_FMT_NV12) {
croppedFrame = cropFrame(pFrame, m_Bbox, m_bCrop);
if (!croppedFrame) {
croppedFrame = pFrame; // Use original frame if cropping failed
}
// TurboJPEG handles all resolutions — no need for slow FFmpeg MJPEG path
jpegData = encodeNV12ToJPEG_TurboJPEG(croppedFrame);
if (!jpegData.empty()) {
m_Width = croppedFrame->width;
m_Height = croppedFrame->height;
m_pts = m_pts + 1;
isSuccess = true;
}
// Free cropped frame only when it is a distinct crop view (not an alias).
if (croppedFrame != pFrame) {
av_frame_free(&croppedFrame);
croppedFrame = nullptr;
}
}
else { // Convert other formats to NV12 before processing.
initSwsContext(pFrame->width, pFrame->height, static_cast<AVPixelFormat>(pFrame->format), AV_PIX_FMT_NV12);
convertedFrame = av_frame_alloc();
if (convertedFrame) {
convertedFrame->format = AV_PIX_FMT_NV12;
convertedFrame->width = pFrame->width;
convertedFrame->height = pFrame->height;
convertedFrame->color_range = AVCOL_RANGE_JPEG;
if (av_frame_get_buffer(convertedFrame, 32) >= 0) {
sws_scale(swsCtx, pFrame->data, pFrame->linesize, 0, pFrame->height,
convertedFrame->data, convertedFrame->linesize);
croppedFrame = cropFrame(convertedFrame, m_Bbox, m_bCrop);
if (!croppedFrame) {
croppedFrame = convertedFrame; // Use converted frame if cropping failed
}
// TurboJPEG handles all resolutions
jpegData = encodeNV12ToJPEG_TurboJPEG(croppedFrame);
if (!jpegData.empty()) {
m_Width = croppedFrame->width;
m_Height = croppedFrame->height;
m_pts = m_pts + 1;
isSuccess = true;
}
// Free cropped frame only when distinct from the converted frame.
if (croppedFrame != convertedFrame) {
av_frame_free(&croppedFrame);
croppedFrame = nullptr;
}
}
}
// Free converted frame if allocated.
if (convertedFrame) {
av_frame_free(&convertedFrame);
convertedFrame = nullptr;
}
}
return isSuccess ? jpegData : "";
}
catch (const std::exception& e) {
std::cerr << "Exception in avframeToJpegString: " << e.what() << std::endl;
// Cleanup memory in case of exceptions
// (identity checks avoid double-freeing aliased pointers).
if (croppedFrame && croppedFrame != pFrame && croppedFrame != convertedFrame) {
av_frame_free(&croppedFrame);
}
if (convertedFrame) {
av_frame_free(&convertedFrame);
}
return "";
}
}
// Stride-aware pixel comparison of two frames.
// Returns false on null input or mismatched geometry/format. Compares the
// luma plane always; chroma is compared for NV12, YUVJ420P and YUV420P.
// For other formats only the Y plane is checked (legacy behavior).
bool CVideoPlayer::areFramesIdentical(AVFrame* frame1, AVFrame* frame2) {
    std::lock_guard<std::recursive_mutex> lock(_mutex);
    try {
        if (!frame1 || !frame2) return false;
        // Ensure the frames have the same width, height, and format.
        if (frame1->width != frame2->width || frame1->height != frame2->height || frame1->format != frame2->format) {
            return false;
        }
        int height = frame1->height;
        int width = frame1->width;
        // Compare Y plane (Luma), row by row to respect each frame's stride.
        for (int y = 0; y < height; y++) {
            if (std::memcmp(frame1->data[0] + y * frame1->linesize[0],
                frame2->data[0] + y * frame2->linesize[0],
                width) != 0) {
                return false;
            }
        }
        if (frame1->format == AV_PIX_FMT_NV12) {
            // Interleaved UV plane: each row holds `width` bytes (width/2 UV pairs).
            int chromaHeight = height / 2;
            for (int y = 0; y < chromaHeight; y++) {
                if (std::memcmp(frame1->data[1] + y * frame1->linesize[1],
                    frame2->data[1] + y * frame2->linesize[1],
                    width) != 0) {
                    return false;
                }
            }
        }
        // BUGFIX: YUV420P shares YUVJ420P's plane layout but previously fell
        // through to `return true` with only the Y plane compared, so frames
        // differing only in chroma were falsely reported as identical.
        else if (frame1->format == AV_PIX_FMT_YUVJ420P || frame1->format == AV_PIX_FMT_YUV420P) {
            // Compare U and V planes separately.
            int chromaWidth = width / 2;
            int chromaHeight = height / 2;
            for (int y = 0; y < chromaHeight; y++) {
                if (std::memcmp(frame1->data[1] + y * frame1->linesize[1], // U
                    frame2->data[1] + y * frame2->linesize[1],
                    chromaWidth) != 0) {
                    return false;
                }
                if (std::memcmp(frame1->data[2] + y * frame1->linesize[2], // V
                    frame2->data[2] + y * frame2->linesize[2],
                    chromaWidth) != 0) {
                    return false;
                }
            }
        }
        return true; // All compared planes match.
    }
    catch (const std::exception& e) {
        std::cerr << "Exception in areFramesIdentical: " << e.what() << std::endl;
        return false;
    }
}
// Lazily (re)create the shared SwsContext for a 1:1 pixel-format conversion.
// The context is cached and rebuilt only when the geometry or either pixel
// format changes since the previous call. On failure swsCtx is left null and
// the cached parameters are not updated.
void CVideoPlayer::initSwsContext(int width, int height, AVPixelFormat pixFmt, AVPixelFormat outputPixFmt) {
    std::lock_guard<std::recursive_mutex> guard(_mutex);
    try {
        // Reject degenerate requests up front.
        if (width <= 0 || height <= 0 || pixFmt == AV_PIX_FMT_NONE) {
            std::cerr << "Invalid parameters: width=" << width
                << ", height=" << height
                << ", pixFmt=" << pixFmt << std::endl;
            return;
        }
        // Cache hit: nothing changed since the last successful init.
        const bool upToDate = (swsCtx != nullptr) &&
            width == lastWidth && height == lastHeight &&
            pixFmt == lastPixFmt && outputPixFmt == lastOutPixFmt;
        if (upToDate) {
            return;
        }
        // Drop the stale context before building a new one.
        if (swsCtx) {
            sws_freeContext(swsCtx);
            swsCtx = nullptr;
        }
        // Bilinear: fast scaling — LANCZOS is too slow for real-time.
        swsCtx = sws_getContext(width, height, pixFmt,
            width, height, outputPixFmt,
            SWS_BILINEAR,
            nullptr, nullptr, nullptr);
        if (!swsCtx) {
            std::cerr << "Failed to create SwsContext: width=" << width
                << ", height=" << height
                << ", inputPixFmt=" << pixFmt
                << ", outputPixFmt=" << outputPixFmt << std::endl;
            return;
        }
        // Remember the parameters of the context we just built.
        lastWidth = width;
        lastHeight = height;
        lastPixFmt = pixFmt;
        lastOutPixFmt = outputPixFmt;
    }
    catch (const std::exception& e) {
        std::cerr << "Exception in initSwsContext: " << e.what() << std::endl;
    }
    catch (...) {
        std::cerr << "Unknown exception in initSwsContext." << std::endl;
    }
}
// Convert an arbitrary-format AVFrame to a BGR24 cv::Mat via the cached
// SwsContext. Used as the generic fallback for pixel formats that have no
// dedicated fast path.
// @param frame decoded AVFrame (any sws-supported format); must be >10x10
// @return BGR cv::Mat, or an empty Mat on any failure
cv::Mat CVideoPlayer::avframeAnyToCvmat(const AVFrame* frame) {
    std::lock_guard<std::recursive_mutex> lock(_mutex); // Protect swsCtx + last* cache
    try {
        if (!frame || !frame->data[0] || frame->width <= 10 || frame->height <= 10) {
            std::cerr << "Invalid or empty frame data, or invalid dimensions." << std::endl;
            return cv::Mat();
        }
        // (Re)build the cached context for this geometry/format pair.
        initSwsContext(frame->width, frame->height, static_cast<AVPixelFormat>(frame->format), AV_PIX_FMT_BGR24);
        // BUGFIX: initSwsContext can fail and leave swsCtx == nullptr
        // (e.g. unsupported format). Previously sws_scale was called anyway,
        // dereferencing a null context.
        if (!swsCtx) {
            std::cerr << "SwsContext unavailable; cannot convert frame." << std::endl;
            return cv::Mat();
        }
        // Destination: packed BGR, single plane, OpenCV-owned storage.
        cv::Mat image(frame->height, frame->width, CV_8UC3);
        uint8_t* dst[1] = { image.data };
        int dstStride[1] = { static_cast<int>(image.step[0]) }; // OpenCV's stride
        int result = sws_scale(swsCtx, frame->data, frame->linesize, 0, frame->height, dst, dstStride);
        if (result < 0) {
            std::cerr << "Failed to scale the frame." << std::endl;
            return cv::Mat();
        }
        return image;
    }
    catch (const std::exception& e) {
        std::cerr << "Exception in avframeToCvmat: " << e.what() << std::endl;
        return cv::Mat();
    }
}
// Convert a full-range (JPEG) YUV420P AVFrame to a BGR cv::Mat with a
// scalar per-pixel YUV→RGB transform (BT.601 full-range coefficients).
// @param frame decoded YUVJ420P frame; must be >10x10 with valid planes
// @return BGR cv::Mat, or an empty Mat on failure
cv::Mat CVideoPlayer::avframeYUVJ420PToCvmat(const AVFrame* frame) {
    std::lock_guard<std::recursive_mutex> lock(_mutex);
    try {
        if (!frame || !frame->data[0] || frame->width <= 10 || frame->height <= 10) {
            std::cerr << "Invalid or empty frame data, or invalid dimensions." << std::endl;
            return cv::Mat();
        }
        // Output: 8-bit, 3 channels (OpenCV's default BGR ordering).
        cv::Mat image(frame->height, frame->width, CV_8UC3);
        // Plane pointers and strides from the AVFrame.
        uint8_t* yPlane = frame->data[0]; // Y plane (luminance)
        uint8_t* uPlane = frame->data[1]; // U plane (chrominance)
        uint8_t* vPlane = frame->data[2]; // V plane (chrominance)
        int yStride = frame->linesize[0];
        int uStride = frame->linesize[1];
        int vStride = frame->linesize[2];
        // Walk row by row; U/V are subsampled 2x2, so chroma rows advance
        // every other luma row. Row pointers replace the previous per-pixel
        // Mat::at<> access (bounds-checked in debug builds, slower), and the
        // unused uvWidth/uvHeight locals are gone.
        for (int y = 0; y < frame->height; ++y) {
            const uint8_t* yRow = yPlane + y * yStride;
            const uint8_t* uRow = uPlane + (y / 2) * uStride;
            const uint8_t* vRow = vPlane + (y / 2) * vStride;
            cv::Vec3b* outRow = image.ptr<cv::Vec3b>(y);
            for (int x = 0; x < frame->width; ++x) {
                int yVal = yRow[x];
                int uVal = uRow[x / 2];
                int vVal = vRow[x / 2];
                // Center chroma around zero.
                int uDiff = uVal - 128;
                int vDiff = vVal - 128;
                // Same truncating double arithmetic as before — results are
                // bit-identical to the original per-pixel conversion.
                int r = yVal + (1.402 * vDiff);
                int g = yVal - (0.344136 * uDiff) - (0.714136 * vDiff);
                int b = yVal + (1.772 * uDiff);
                // Clamp to the valid 8-bit range.
                r = std::clamp(r, 0, 255);
                g = std::clamp(g, 0, 255);
                b = std::clamp(b, 0, 255);
                outRow[x] = cv::Vec3b(b, g, r); // OpenCV uses BGR by default
            }
        }
        return image;
    }
    catch (const std::exception& e) {
        std::cerr << "Exception in avframeToCvmatYUVJ420P: " << e.what() << std::endl;
        return cv::Mat();
    }
}
// Initialize a dedicated SwsContext for NV12→BGR with correct color space.
// Picks BT.601/709/2020 coefficients from frame metadata (falling back to a
// resolution heuristic) and configures limited/full range conversion.
void CVideoPlayer::initNV12SwsContext(const AVFrame* frame) {
    int width = frame->width;
    int height = frame->height;
    // Detect color space from frame metadata (BT.709 for HD/4K, BT.601 for SD)
    int colorspace = SWS_CS_ITU709; // Default to BT.709 for HD/4K
    if (frame->colorspace == AVCOL_SPC_BT470BG || frame->colorspace == AVCOL_SPC_SMPTE170M) {
        colorspace = SWS_CS_ITU601;
    }
    else if (frame->colorspace == AVCOL_SPC_BT2020_NCL || frame->colorspace == AVCOL_SPC_BT2020_CL) {
        colorspace = SWS_CS_BT2020;
    }
    else if (frame->colorspace == AVCOL_SPC_BT709) {
        colorspace = SWS_CS_ITU709;
    }
    else if (width >= 1280 || height >= 720) {
        // Auto-detect: HD and above → BT.709 (most common for IP cameras)
        colorspace = SWS_CS_ITU709;
    }
    else {
        colorspace = SWS_CS_ITU601; // SD content
    }
    // Detect color range: limited (16-235) vs full (0-255)
    int srcRange = (frame->color_range == AVCOL_RANGE_JPEG) ? 1 : 0; // 0=limited, 1=full
    int dstRange = 1; // Output always full range (0-255) for display/AI processing
    // Check if reinit needed
    if (m_nv12SwsCtx && width == m_nv12LastWidth && height == m_nv12LastHeight
        && colorspace == m_nv12LastColorspace && srcRange == m_nv12LastRange) {
        return; // Already configured
    }
    // Free old context
    if (m_nv12SwsCtx) {
        sws_freeContext(m_nv12SwsCtx);
        m_nv12SwsCtx = nullptr;
    }
    // Create context: NV12 → BGR24, same dimensions (no scaling)
    // SWS_BILINEAR + SWS_FULL_CHR_H_INT: good quality chroma upsampling (~12ms for 4K)
    // SWS_ACCURATE_RND: better rounding for color precision
    // Note: SWS_LANCZOS gives VLC-matching quality but costs 50-80ms — too slow.
    // VLC achieves its quality via GPU shaders, not CPU processing.
    m_nv12SwsCtx = sws_getContext(width, height, AV_PIX_FMT_NV12,
        width, height, AV_PIX_FMT_BGR24,
        SWS_BILINEAR | SWS_ACCURATE_RND | SWS_FULL_CHR_H_INT,
        nullptr, nullptr, nullptr);
    if (!m_nv12SwsCtx) {
        std::cerr << "Failed to create NV12 SwsContext" << std::endl;
        return;
    }
    // Configure correct color space and range.
    const int* coefficients = sws_getCoefficients(colorspace);
    int* inv_table = nullptr; int* table = nullptr;
    int curSrcRange = 0, curDstRange = 0;
    // BUGFIX: brightness/contrast/saturation were previously left
    // uninitialized when sws_getColorspaceDetails failed (it returns a
    // negative value for non-YUV contexts), and the garbage values were then
    // fed into sws_setColorspaceDetails. Seed neutral defaults (0 brightness,
    // 1.0 contrast/saturation in 16.16 fixed point) and only apply the
    // override when the query succeeded.
    int brightness = 0, contrast = 1 << 16, saturation = 1 << 16;
    if (sws_getColorspaceDetails(m_nv12SwsCtx, &inv_table, &curSrcRange, &table, &curDstRange,
        &brightness, &contrast, &saturation) >= 0) {
        sws_setColorspaceDetails(m_nv12SwsCtx, coefficients, srcRange, coefficients, dstRange,
            brightness, contrast, saturation);
    }
    m_nv12LastWidth = width;
    m_nv12LastHeight = height;
    m_nv12LastColorspace = colorspace;
    m_nv12LastRange = srcRange;
}
// Convert an NV12 AVFrame (HW-decode output) to a BGR cv::Mat using OpenCV's
// two-plane converter. Non-NV12 frames fall through to the generic sws path.
// @param frame decoded frame; NV12 expected, any format tolerated
// @return BGR cv::Mat, or an empty Mat on failure
cv::Mat CVideoPlayer::avframeNV12ToCvMat(const AVFrame* frame)
{
    try {
        if (!frame || frame->width <= 0 || frame->height <= 0) {
            std::cerr << "Invalid frame! Either null, incorrect format, or zero dimensions." << std::endl;
            return cv::Mat();
        }
        // Software decode handler
        if (frame->format != AV_PIX_FMT_NV12) return avframeAnyToCvmat(frame);
        int width = frame->width;
        int height = frame->height;
        // Store original NV12 dimensions for inference coordinate mapping.
        // (Previously assigned twice back-to-back; the duplicate was removed.)
        m_nv12OrigWidth = width;
        m_nv12OrigHeight = height;
        // Return full-resolution BGR image.
        // No forced downscale — LabVIEW manages display resolution via SetDisplayResolution().
        // If the caller needs a specific display size, SetDisplayResolution(w, h) applies
        // resizing in GetImage() at the ANSRTSP/ANS*Client level after this returns.
        cv::Mat yPlane(height, width, CV_8UC1, frame->data[0], frame->linesize[0]);
        cv::Mat uvPlane(height / 2, width / 2, CV_8UC2, frame->data[1], frame->linesize[1]);
        cv::Mat bgrImage;
        cv::cvtColorTwoPlane(yPlane, uvPlane, bgrImage, cv::COLOR_YUV2BGR_NV12);
        // Quality mode 1: expand limited range (16-235) to full range (0-255).
        if (m_nImageQuality == 1) {
            bgrImage.convertTo(bgrImage, -1, 255.0 / 219.0, -16.0 * 255.0 / 219.0);
        }
        return bgrImage;
    }
    catch (const std::exception& e) {
        std::cerr << "Exception in avframeNV12ToCvMat: " << e.what() << std::endl;
        return cv::Mat();
    }
}
// Convert a software-decoded YUV420P/YUVJ420P AVFrame to a BGR cv::Mat.
// Three paths: libyuv SIMD (when built in), zero-copy wrap for contiguous
// I420, and a staged copy + cvtColor for padded/non-adjacent planes.
// @param frame decoded planar YUV frame
// @return BGR cv::Mat, or an empty Mat on failure
cv::Mat CVideoPlayer::avframeYUV420PToCvMat(const AVFrame* frame) {
    try {
        if (!frame || frame->width <= 0 || frame->height <= 0) {
            return cv::Mat();
        }
        const int width = frame->width;
        const int height = frame->height;
        // Debug: confirm this SW-decode conversion is actually hit.
        // Throttled to ~1 log/sec at 30 fps to keep DebugView readable.
        // Gated by ANSCORE_DEBUGVIEW — compiles to nothing in production.
        {
            static std::atomic<uint64_t> s_swCallCount{0};
            uint64_t n = s_swCallCount.fetch_add(1, std::memory_order_relaxed);
            if ((n % 30) == 0) {
                const char* fmtName = av_get_pix_fmt_name((AVPixelFormat)frame->format);
                const bool contig =
                    (frame->linesize[0] == width &&
                     frame->linesize[1] == width / 2 &&
                     frame->linesize[2] == width / 2 &&
                     frame->data[1] == frame->data[0] + width * height &&
                     frame->data[2] == frame->data[1] + (width / 2) * (height / 2));
                // Report the codec's allocated Y-plane height (inferred from
                // the Y/U pointer spacing and Y stride). Lets us see whether
                // our custom get_buffer2 achieved alloc_h == visible_h.
                const int yStrideDbg = frame->linesize[0] > 0 ? frame->linesize[0] : 1;
                const int alloc_h_y = (int)((frame->data[1] - frame->data[0]) / yStrideDbg);
#if defined(ANSCORE_HAS_LIBYUV) && ANSCORE_HAS_LIBYUV
                const char* pathLabel = "LIBYUV/I420ToRGB24";
#else
                const char* pathLabel =
                    contig ? "FAST/zero-copy" :
                    (frame->linesize[0] == width) ? "SLOW/bulk-memcpy" :
                    "SLOW/per-row-copy";
#endif
                (void)contig; // silence unused warning when libyuv is on
                ANS_DBG("MEDIA_SWDec",
                    "avframeYUV420PToCvMat ENTRY call#%llu fmt=%s visible=%dx%d alloc_h_y=%d "
                    "linesize=[%d,%d,%d] path=%s (this=%p)",
                    (unsigned long long)n,
                    fmtName ? fmtName : "?",
                    width, height, alloc_h_y,
                    frame->linesize[0], frame->linesize[1], frame->linesize[2],
                    pathLabel,
                    (void*)this);
            }
        }
#if defined(ANSCORE_HAS_LIBYUV) && ANSCORE_HAS_LIBYUV
        // libyuv path: direct I420 (3 strided planes) → RGB24 (== BGR in memory
        // order for libyuv, matches cv::Mat CV_8UC3 default). No staging buffer,
        // no memcpy, no cv::cvtColor — one SIMD-optimized sweep.
        //
        // libyuv's "RGB24" is B,G,R per pixel in memory (see RGB24ToARGBRow_C
        // in libyuv/source/row_common.cc where src[0]=b, src[1]=g, src[2]=r).
        // That matches OpenCV's BGR layout — safe to wrap in CV_8UC3.
        cv::Mat bgrImage(height, width, CV_8UC3);
        int ret = libyuv::I420ToRGB24(
            frame->data[0], frame->linesize[0],
            frame->data[1], frame->linesize[1],
            frame->data[2], frame->linesize[2],
            bgrImage.data, static_cast<int>(bgrImage.step),
            width, height);
        if (ret != 0) {
            std::cerr << "libyuv::I420ToRGB24 failed with ret=" << ret << std::endl;
            return cv::Mat();
        }
        if (m_nImageQuality == 1) {
            bgrImage.convertTo(bgrImage, -1, 255.0 / 219.0, -16.0 * 255.0 / 219.0);
        }
        return bgrImage;
#else
        // YUV420P has 3 separate planes: Y (full res), U (half), V (half).
        // OpenCV's cvtColor(COLOR_YUV2BGR_I420) expects a single contiguous buffer
        // with Y on top (H rows) and U,V stacked below (H/2 rows total).
        // Layout: [Y: W×H] [U: W/2 × H/2] [V: W/2 × H/2]
        // Total height = H * 3/2, width = W, single channel.
        // If all planes are contiguous with matching strides, wrap directly
        const int yStride = frame->linesize[0];
        const int uStride = frame->linesize[1];
        const int vStride = frame->linesize[2];
        // Fast path: planes are packed contiguously with stride == width
        if (yStride == width && uStride == width / 2 && vStride == width / 2 &&
            frame->data[1] == frame->data[0] + width * height &&
            frame->data[2] == frame->data[1] + (width / 2) * (height / 2)) {
            // Contiguous I420 — wrap directly, zero copy
            cv::Mat yuv(height * 3 / 2, width, CV_8UC1, frame->data[0]);
            cv::Mat bgrImage;
            cv::cvtColor(yuv, bgrImage, cv::COLOR_YUV2BGR_I420);
            if (m_nImageQuality == 1) {
                bgrImage.convertTo(bgrImage, -1, 255.0 / 219.0, -16.0 * 255.0 / 219.0);
            }
            return bgrImage;
        }
        // Slow path: planes have padding (linesize > width) OR Y/U/V live in
        // non-adjacent buffers. Copy into a single I420-layout staging buffer
        // so cvtColor(COLOR_YUV2BGR_I420) can process it in one SIMD sweep.
        // (The previously computed `totalSize` local was unused since the
        // staging-Mat refactor and has been removed.)
        const int uvWidth = width / 2;
        const int uvHeight = height / 2;
        // Thread-local staging Mat — reused across calls to avoid a 12 MB malloc
        // on every 4K frame. Each decoder runs on its own worker thread, so
        // thread_local is the right granularity (no cross-thread sharing, no
        // locking). The Mat reallocates only when dimensions change.
        static thread_local cv::Mat s_yuvStaging;
        if (s_yuvStaging.rows != height * 3 / 2 ||
            s_yuvStaging.cols != width ||
            s_yuvStaging.type() != CV_8UC1 ||
            !s_yuvStaging.isContinuous()) {
            s_yuvStaging.create(height * 3 / 2, width, CV_8UC1);
        }
        cv::Mat& yuv = s_yuvStaging;
        uint8_t* dst = yuv.data;
        // Copy Y plane (line by line if stride != width)
        if (yStride == width) {
            std::memcpy(dst, frame->data[0], width * height);
        } else {
            for (int row = 0; row < height; ++row) {
                std::memcpy(dst + row * width, frame->data[0] + row * yStride, width);
            }
        }
        dst += width * height;
        // Copy U plane
        if (uStride == uvWidth) {
            std::memcpy(dst, frame->data[1], uvWidth * uvHeight);
        } else {
            for (int row = 0; row < uvHeight; ++row) {
                std::memcpy(dst + row * uvWidth, frame->data[1] + row * uStride, uvWidth);
            }
        }
        dst += uvWidth * uvHeight;
        // Copy V plane
        if (vStride == uvWidth) {
            std::memcpy(dst, frame->data[2], uvWidth * uvHeight);
        } else {
            for (int row = 0; row < uvHeight; ++row) {
                std::memcpy(dst + row * uvWidth, frame->data[2] + row * vStride, uvWidth);
            }
        }
        cv::Mat bgrImage;
        cv::cvtColor(yuv, bgrImage, cv::COLOR_YUV2BGR_I420);
        if (m_nImageQuality == 1) {
            bgrImage.convertTo(bgrImage, -1, 255.0 / 219.0, -16.0 * 255.0 / 219.0);
        }
        return bgrImage;
#endif // ANSCORE_HAS_LIBYUV
    }
    catch (const std::exception& e) {
        std::cerr << "Exception in avframeYUV420PToCvMat: " << e.what() << std::endl;
        return cv::Mat();
    }
}
// Dispatch a decoded AVFrame to the appropriate pixel-format-specific
// BGR conversion routine.
// @param pFrame decoded frame; must be >10x10 with a valid data[0]
// @return BGR cv::Mat, or an empty Mat on failure
cv::Mat CVideoPlayer::avframeToCVMat(const AVFrame* pFrame) {
    // No _mutex here: caller (getImage) releases the mutex before invoking this
    // so the expensive NV12/YUV420P→BGR conversion does not block onVideoFrame.
    // NV12/YUV420P paths touch only the caller-owned AVFrame clone and benign
    // member reads. avframeAnyToCvmat() takes its own lock for swsCtx.
    try {
        // 1. Validate input frame
        if (!pFrame || !pFrame->data[0] || pFrame->width <= 10 || pFrame->height <= 10) {
            std::cerr << "Invalid or empty frame data, or invalid dimensions." << std::endl;
            return cv::Mat();
        }
        // One-shot diagnostic: print the pixel format the first time through so
        // we can see which branch of the switch below is taken. Remove after use.
        // BUGFIX: this function runs without _mutex and can be entered from
        // multiple decoder threads concurrently, so the one-shot latch must be
        // atomic — a plain static bool was a data race (and could log twice).
        static std::atomic<bool> s_loggedFmt{false};
        if (!s_loggedFmt.exchange(true, std::memory_order_relaxed)) {
            const char* name = av_get_pix_fmt_name((AVPixelFormat)pFrame->format);
            fprintf(stderr, "[avframeToCVMat] first frame format=%d (%s) %dx%d\n",
                pFrame->format, name ? name : "?", pFrame->width, pFrame->height);
            ANS_DBG("MEDIA_Convert",
                "avframeToCVMat FIRST-FRAME fmt=%d(%s) %dx%d HWDecoding=%d (this=%p)",
                pFrame->format, name ? name : "?",
                pFrame->width, pFrame->height,
                m_nHWDecoding, (void*)this);
        }
        // Per-branch throttled trace so we can see the dispatch at runtime.
        // Gated by ANSCORE_DEBUGVIEW — zero overhead in production.
        static std::atomic<uint64_t> s_dispatchCount{0};
        const uint64_t dispN = s_dispatchCount.fetch_add(1, std::memory_order_relaxed);
        const bool logThis = ((dispN % 30) == 0);
        switch (pFrame->format) {
        case AV_PIX_FMT_NV12:
            if (logThis) {
                ANS_DBG("MEDIA_Convert",
                    "DISPATCH call#%llu fmt=NV12 %dx%d -> avframeNV12ToCvMat (HW-decode path)",
                    (unsigned long long)dispN, pFrame->width, pFrame->height);
            }
            return avframeNV12ToCvMat(pFrame);
        case AV_PIX_FMT_YUV420P:
        case AV_PIX_FMT_YUVJ420P:
            if (logThis) {
                ANS_DBG("MEDIA_Convert",
                    "DISPATCH call#%llu fmt=%s %dx%d -> avframeYUV420PToCvMat (SW-decode path)",
                    (unsigned long long)dispN,
                    (pFrame->format == AV_PIX_FMT_YUVJ420P) ? "YUVJ420P" : "YUV420P",
                    pFrame->width, pFrame->height);
            }
            return avframeYUV420PToCvMat(pFrame);
        default:
            if (logThis) {
                const char* name = av_get_pix_fmt_name((AVPixelFormat)pFrame->format);
                ANS_DBG("MEDIA_Convert",
                    "DISPATCH call#%llu fmt=%d(%s) %dx%d -> avframeAnyToCvmat (sws_scale fallback)",
                    (unsigned long long)dispN,
                    pFrame->format, name ? name : "?",
                    pFrame->width, pFrame->height);
            }
            return avframeAnyToCvmat(pFrame);
        }
    }
    catch (const std::exception& e) {
        std::cerr << "Exception in avframeToCvMat: " << e.what() << std::endl;
        return cv::Mat(); // Return an empty matrix on error
    }
}
// Constructor: establish safe defaults for all playback/recording state.
// Heavy resources (decoders, audio backend, SwsContexts) are created lazily
// in openVideo()/openAudio(); only the record mutex and the TurboJPEG
// compressor handle are allocated here.
CVideoPlayer::CVideoPlayer() :
    m_bVideoInited(FALSE)
    , m_bAudioInited(FALSE)
    , m_bPlaying(FALSE)
    , m_bPaused(FALSE)
    , m_nHWDecoding(HW_DECODING_DISABLE)// Software decode by default — saves VRAM (no NVDEC DPB surfaces)
    , m_bUpdown(FALSE)
    , m_bSnapshot(FALSE)
    , m_nSnapVideoFmt(AV_PIX_FMT_YUVJ420P)
    , m_nVideoCodec(VIDEO_CODEC_NONE)
    , m_nAudioCodec(AUDIO_CODEC_NONE)
    , m_nSampleRate(0)
    , m_nChannel(0)
    , m_nBitPerSample(0)
    , m_pSnapFrame(NULL)
    , m_bRecording(FALSE)
    , m_bNalFlag(FALSE)
    , m_pAviCtx(NULL)
    , m_pAudioListMutex(NULL)
    , m_audioPlayFlag(FALSE)
    //, m_audioPlayThread(0)
    , m_pVideoListMutex(NULL)
    , m_videoPlayFlag(FALSE)
    //, m_videoPlayThread(0)
    , m_nLastAudioPts(AV_NOPTS_VALUE)
    , m_lastAudioTS(0)
{
    // No crop region until the caller configures one.
    m_Bbox.x = 0;
    m_Bbox.y = 0;
    m_Bbox.width = 0;
    m_Bbox.height = 0;
    m_bCrop = false;
    // Serializes recordVideo()/recordAudio() against stopRecord().
    m_pRecordMutex = sys_os_create_mutex();
    m_lastJpegImage = "";
    m_jpegImage = "";
    m_pts = 0;
    // Zero the H.26x parameter-set cache and the A/V sync clocks.
    memset(&m_h26XParamSets, 0, sizeof(H26XParamSets));
    memset(&m_audioClock, 0, sizeof(HTCLOCK));
    memset(&m_videoClock, 0, sizeof(HTCLOCK));
    // TurboJPEG compressor used by the JPEG snapshot/streaming path.
    this->_tjInstance = tjInitCompress();
}
// Destructor: stop the decode pipeline, then release cached conversion
// contexts and the TurboJPEG handle. Teardown order matters — close() must
// run first so no decoder callback can race the frees below.
CVideoPlayer::~CVideoPlayer()
{
    // Lock to ensure no other thread is mid-operation (getImage, getJpegImage, onVideoFrame)
    // before we free resources. close() stops the decoder which prevents new callbacks.
    {
        std::lock_guard<std::recursive_mutex> lock(_mutex);
        close(); // Stop decoder first — prevents new onVideoFrame callbacks
        g_frameQueue.clearQueue();
        if (swsCtx != nullptr) {
            sws_freeContext(swsCtx);
            swsCtx = nullptr;
        }
        if (m_nv12SwsCtx != nullptr) {
            sws_freeContext(m_nv12SwsCtx);
            m_nv12SwsCtx = nullptr;
        }
        if (this->_tjInstance) {
            tjDestroy(this->_tjInstance);
            this->_tjInstance = nullptr;
        }
    }
    // _mutex is destroyed after this block — no other thread should be accessing this object
}
BOOL CVideoPlayer::open(std::string fileName)
{
    // Remember the media source; actual setup happens lazily on first packet.
    m_sFileName = std::move(fileName);
    return TRUE;
}
BOOL CVideoPlayer::open(std::string _username, std::string _password, std::string _url)
{
    // Store credentials and the stream URL for the RTSP session.
    m_acct = std::move(_username);
    m_pass = std::move(_password);
    m_sFileName = std::move(_url);
    return TRUE;
}
AVFrame* CVideoPlayer::getNV12Frame() {
    // Hand out a CLONE so multiple consumers (tasks sharing the same stream)
    // each own an independent reference; m_currentNV12Frame remains valid
    // until the next getImage() call overwrites it.
    // (Previously used ownership transfer — only the first caller got NV12,
    // and the second caller fell back to BGR.)
    std::lock_guard<std::recursive_mutex> lock(_mutex);
    if (!m_currentNV12Frame) {
        return nullptr;
    }
    return av_frame_clone(m_currentNV12Frame);
}
AVFrame* CVideoPlayer::getCudaHWFrame() {
    // Clone (not ownership transfer) the CUDA HW frame captured by
    // onVideoFrame(): multiple callers may ask for it between updates
    // (e.g. during warmup when GetRTSPCVImage outpaces the decode rate).
    // extra_hw_frames=2 in the decoder provides surface pool headroom
    // for the 3 concurrent clones (decoder + player + registry).
    std::lock_guard<std::recursive_mutex> lock(_mutex);
    if (!m_currentCudaHWFrame) {
        return nullptr;
    }
    return av_frame_clone(m_currentCudaHWFrame);
}
bool CVideoPlayer::isCudaHWAccel() const {
    // True only when a decoder exists and it reports CUDA HW acceleration.
    if (!m_pVideoDecoder) {
        return false;
    }
    return m_pVideoDecoder->isCudaHWAccel();
}
void CVideoPlayer::close()
{
    // Tear down the elementary-stream pipelines first so no decode callback
    // can touch the frames we release below.
    closeVideo();
    closeAudio();
    // av_frame_free() nulls the pointer it receives; the explicit resets keep
    // the members unambiguously cleared regardless.
    if (m_currentNV12Frame) {
        av_frame_free(&m_currentNV12Frame);
        m_currentNV12Frame = nullptr;
    }
    if (m_currentCudaHWFrame) {
        av_frame_free(&m_currentCudaHWFrame);
        m_currentCudaHWFrame = nullptr;
    }
    if (m_pSnapFrame) {
        av_frame_free(&m_pSnapFrame);
        m_pSnapFrame = nullptr;
    }
    // Finalize any in-progress AVI recording, then drop its mutex.
    stopRecord();
    if (m_pRecordMutex) {
        sys_os_destroy_sig_mutex(m_pRecordMutex);
        m_pRecordMutex = NULL;
    }
}
void CVideoPlayer::setVolume(int volume)
{
    // Forward to the platform audio backend when one is active.
    if (!m_pAudioPlay)
    {
        return;
    }
    m_pAudioPlay->setVolume(volume);
}
void CVideoPlayer::snapshot(int videofmt)
{
    // Record the requested output format, then arm the snapshot latch;
    // the decode callback consumes it on the next frame.
    m_nSnapVideoFmt = videofmt;
    m_bSnapshot = TRUE;
}
BOOL CVideoPlayer::record(std::string baseName)
{
    // Already recording — treat as success, don't reopen the file.
    if (m_bRecording)
    {
        return TRUE;
    }
    //std::string path = getRecordPath();
    std::string aviPath = baseName;// path + "/" + getTempFile(baseName, ".avi");
    m_sBaseName = baseName;
    // Open the AVI container; failure aborts the whole request.
    m_pAviCtx = avi_write_open(aviPath.c_str());
    if (NULL == m_pAviCtx)
    {
        log_print(HT_LOG_ERR, "%s, avi_write_open failed. %s\r\n",
            __FUNCTION__, aviPath.c_str());
        return FALSE;
    }
    // Let the subclass set up stream parameters; roll back on refusal.
    if (!onRecord())
    {
        avi_write_close(m_pAviCtx);
        m_pAviCtx = NULL;
        return FALSE;
    }
    m_bRecording = TRUE;
    return m_bRecording;
}
void CVideoPlayer::stopRecord()
{
    // Serialize against recordVideo()/recordAudio() running on the rx thread.
    sys_os_mutex_enter(m_pRecordMutex);
    m_bRecording = FALSE;
    m_bNalFlag = FALSE;
    // Forget cached VPS/SPS/PPS; the next session re-extracts them.
    memset(&m_h26XParamSets, 0, sizeof(m_h26XParamSets));
    if (m_pAviCtx != NULL)
    {
        avi_write_close(m_pAviCtx);
        m_pAviCtx = NULL;
    }
    sys_os_mutex_leave(m_pRecordMutex);
}
void CVideoPlayer::recordVideo(uint8* data, int len, uint32 ts, uint16 seq)
{
    // Resolve the codec from the AVI fourcc chosen at record() time.
    int codec = VIDEO_CODEC_NONE;
    if (memcmp(m_pAviCtx->v_fcc, "H264", 4) == 0)
    {
        codec = VIDEO_CODEC_H264;
    }
    else if (memcmp(m_pAviCtx->v_fcc, "H265", 4) == 0)
    {
        codec = VIDEO_CODEC_H265;
    }
    // For H.26x, write VPS/SPS/PPS once before the first payload so the
    // file is decodable from its first frame.
    const bool isH26x = (VIDEO_CODEC_H264 == codec || VIDEO_CODEC_H265 == codec);
    if (isH26x && !m_bNalFlag)
    {
        if (avc_get_h26x_paramsets(data, len, codec, &m_h26XParamSets))
        {
            avi_write_nalu(m_pAviCtx,
                m_h26XParamSets.vps, m_h26XParamSets.vps_size,
                m_h26XParamSets.sps, m_h26XParamSets.sps_size,
                m_h26XParamSets.pps, m_h26XParamSets.pps_size);
            m_bNalFlag = 1;
        }
    }
    recordVideoEx(data, len, ts, seq);
    // Rotate the output file when size/duration limits were crossed.
    if (recordSwitchCheck())
    {
        recordFileSwitch();
    }
}
// Write one video payload into the AVI file: lazily discovers the stream
// dimensions, classifies keyframes per codec, and appends the sample.
// @param data elementary-stream payload (start-code/length-prefixed for H.26x)
// @param len  payload size in bytes
// @param ts   RTP/media timestamp
// @param seq  sequence number (unused here)
void CVideoPlayer::recordVideoEx(uint8* data, int len, uint32 ts, uint16 seq)
{
    AVICTX* p_avictx = m_pAviCtx;
    // The AVI header can only be finalized once width/height are known;
    // parse them out of the bitstream on the first opportunity.
    if (p_avictx->v_width == 0 || p_avictx->v_height == 0)
    {
        int codec = VIDEO_CODEC_NONE;
        if (memcmp(p_avictx->v_fcc, "H264", 4) == 0)
        {
            codec = VIDEO_CODEC_H264;
        }
        else if (memcmp(p_avictx->v_fcc, "H265", 4) == 0)
        {
            codec = VIDEO_CODEC_H265;
        }
        else if (memcmp(p_avictx->v_fcc, "JPEG", 4) == 0)
        {
            codec = VIDEO_CODEC_JPEG;
        }
        else if (memcmp(p_avictx->v_fcc, "MP4V", 4) == 0)
        {
            codec = VIDEO_CODEC_MP4;
        }
        avc_parse_video_size(codec, data, len, &p_avictx->v_width, &p_avictx->v_height);
        if (p_avictx->v_width && p_avictx->v_height)
        {
            avi_update_header(p_avictx);
        }
    }
    int key = 0;
    if (memcmp(p_avictx->v_fcc, "H264", 4) == 0)
    {
        // BUGFIX: the NAL header byte lives at data[4] (after the 4-byte
        // start code); previously it was read without checking that the
        // payload is long enough — an out-of-bounds read on short/malformed
        // packets. Such packets are now simply treated as non-key.
        if (data != NULL && len >= 5)
        {
            uint8 nalu_t = (data[4] & 0x1F);
            key = (nalu_t == 5 || nalu_t == 7 || nalu_t == 8);
        }
    }
    else if (memcmp(p_avictx->v_fcc, "H265", 4) == 0)
    {
        // Same guard as the H.264 branch (see BUGFIX note above).
        if (data != NULL && len >= 5)
        {
            uint8 nalu_t = (data[4] >> 1) & 0x3F;
            key = ((nalu_t >= 16 && nalu_t <= 21) || nalu_t == 32 || nalu_t == 33 || nalu_t == 34);
        }
    }
    else if (memcmp(p_avictx->v_fcc, "MP4V", 4) == 0)
    {
        key = 1; // every MP4V sample treated as a sync point
    }
    else if (memcmp(p_avictx->v_fcc, "JPEG", 4) == 0)
    {
        key = 1; // MJPEG: every frame is independently decodable
    }
    avi_write_video(p_avictx, data, len, ts, key);
}
void CVideoPlayer::recordAudio(uint8* data, int len, uint32 ts, uint16 seq)
{
    // seq is unused here; the signature mirrors recordVideo() for symmetry.
    (void)seq;
    avi_write_audio(m_pAviCtx, data, len, ts);
    // Rotate the output file when size/duration limits were crossed.
    if (recordSwitchCheck())
    {
        recordFileSwitch();
    }
}
BOOL CVideoPlayer::recordSwitchCheck()
{
    // Current file length (bytes) and recorded media time (ms).
    const uint64 tlen = avi_get_file_length(m_pAviCtx);
    const uint32 mtime = avi_get_media_time(m_pAviCtx);
    // Size limit in KB; fall back to 1 GiB when unconfigured.
    uint32 recordSize = 0;// getRecordSize();
    if (recordSize == 0)
    {
        recordSize = 1048576; // max 1G file size
    }
    // Switch according to the recording size
    if (tlen > recordSize * 1024)
    {
        return TRUE;
    }
    // Duration limit in seconds; 0 disables time-based switching.
    uint32 recordTime = 0;// getRecordTime();
    // Switch according to the recording duration
    if (recordTime > 0 && mtime > recordTime * 1000)
    {
        return TRUE;
    }
    return FALSE;
}
// Rotate the AVI recording: open a new file, carry over stream configuration
// from the old context, close the old file, and re-emit the cached H.26x
// parameter sets so the new file starts decodable. The old context stays
// live until the new one is fully configured — statement order is critical.
void CVideoPlayer::recordFileSwitch()
{
    AVICTX* p_ctx;
    AVICTX* p_oldctx = m_pAviCtx;
    //std::string path = getRecordPath();
    std::string file = m_sBaseName;// path + "/" + getTempFile(m_sBaseName, ".avi");
    p_ctx = avi_write_open(file.c_str());
    if (NULL == p_ctx)
    {
        // Could not open the replacement — keep writing into the old file.
        return;
    }
    // Propagate which streams (video/audio) the old context carried.
    p_ctx->ctxf_video = p_oldctx->ctxf_video;
    p_ctx->ctxf_audio = p_oldctx->ctxf_audio;
    if (p_ctx->ctxf_video)
    {
        // Recompute fps from the old file's actual sample timing first.
        avi_calc_fps(p_oldctx);
        avi_set_video_info(p_ctx, p_oldctx->v_fps, p_oldctx->v_width, p_oldctx->v_height, p_oldctx->v_fcc);
        avi_set_video_extra_info(p_ctx, p_oldctx->v_extra, p_oldctx->v_extra_len);
    }
    if (p_ctx->ctxf_audio)
    {
        avi_set_audio_info(p_ctx, p_oldctx->a_chns, p_oldctx->a_rate, p_oldctx->a_fmt);
        avi_set_audio_extra_info(p_ctx, p_oldctx->a_extra, p_oldctx->a_extra_len);
    }
    // Finalize the old file only after its settings have been copied out.
    avi_write_close(p_oldctx);
    avi_update_header(p_ctx);
    m_pAviCtx = p_ctx;
    // Re-emit cached VPS/SPS/PPS so the new file decodes from frame one.
    if (m_h26XParamSets.vps_size > 0 ||
        m_h26XParamSets.sps_size > 0 ||
        m_h26XParamSets.pps_size > 0)
    {
        avi_write_nalu(m_pAviCtx,
            m_h26XParamSets.vps, m_h26XParamSets.vps_size,
            m_h26XParamSets.sps, m_h26XParamSets.sps_size,
            m_h26XParamSets.pps, m_h26XParamSets.pps_size);
    }
}
BOOL CVideoPlayer::openVideo(enum AVCodecID codec, uint8* extradata, int extradata_size)
{
    // Idempotent: a second call while initialized is a no-op success.
    if (m_bVideoInited)
    {
        return TRUE;
    }
    // Initialize the decoder with the configured HW-decode mode and GPU.
    if (m_pVideoDecoder)
    {
        m_bVideoInited = m_pVideoDecoder->init(codec, extradata, extradata_size, m_nHWDecoding, m_nPreferredGpu);
    }
    // On success wire the decoded-frame callback and the play-path state.
    if (m_bVideoInited)
    {
        m_pVideoDecoder->setCallback(VideoDecoderCallback, this);
        m_pVideoListMutex = sys_os_create_mutex();
        m_videoPlayFlag = TRUE;
        //m_videoPlayThread = sys_os_create_thread((void*)VideoPlayThread, this);
    }
    // Record the codec in portable form regardless of the init outcome.
    m_nVideoCodec = to_video_codec(codec);
    return m_bVideoInited;
}
BOOL CVideoPlayer::openVideo(int codec, uint8* extradata, int extradata_size)
{
    // Map the portable codec id onto FFmpeg's enum and delegate.
    const enum AVCodecID avid = to_video_avcodecid(codec);
    return openVideo(avid, extradata, extradata_size);
}
// Shut down the video decode path. The decoder is stopped with _mutex
// RELEASED to avoid the lock-ordering deadlock documented in
// StopVideoDecoder(); resources are then cleaned up under the lock.
void CVideoPlayer::closeVideo()
{
    // Stop decoder outside the player lock to avoid the same lock-ordering
    // deadlock as StopVideoDecoder() (see comment there).
    CVideoDecoder* decoder = nullptr;
    {
        std::lock_guard<std::recursive_mutex> lock(_mutex);
        decoder = m_pVideoDecoder.get();
    }
    if (decoder)
    {
        decoder->Stop();
        decoder->flush();
    }
    // Now clean up resources under the lock
    std::lock_guard<std::recursive_mutex> lock(_mutex);
    m_videoPlayFlag = FALSE;
    if (m_pVideoListMutex)
    {
        sys_os_destroy_sig_mutex(m_pVideoListMutex);
        m_pVideoListMutex = NULL;
    }
    // Drop any decoded frames still queued for display.
    if (!g_frameQueue.isEmpty())g_frameQueue.clearQueue();
    m_bVideoInited = FALSE;
}
void CVideoPlayer::StartVideoDecoder() {
    std::lock_guard<std::recursive_mutex> lock(_mutex);
    // Drop stale queued frames but KEEP m_currentImage — it holds the last
    // good frame, served while the decoder stabilizes after restart.
    g_frameQueue.clearQueue();
    m_lastFrameSeq = 0;
    m_bWaitingForKeyframe = true; // Skip frames until first keyframe
    m_cleanFrameCount = 0;        // Reset settle counter
    if (m_pVideoDecoder != nullptr)
    {
        m_pVideoDecoder->Start();
    }
}
// Stop and fully uninitialize the video decoder, releasing its GPU surfaces,
// while carefully avoiding a cross-lock deadlock with the decode callback.
void CVideoPlayer::StopVideoDecoder() {
    // Get decoder pointer under lock, then release BEFORE calling decoder methods.
    // This avoids a lock-ordering deadlock:
    //   Thread 1 (here): CVideoPlayer::_mutex -> CVideoDecoder::_mutex
    //   Thread 2 (TCP rx decode -> onVideoFrame callback): CVideoDecoder::_mutex -> CVideoPlayer::_mutex
    CVideoDecoder* decoder = nullptr;
    {
        std::lock_guard<std::recursive_mutex> lock(_mutex);
        decoder = m_pVideoDecoder.get();
    }
    if (decoder)
    {
        decoder->Stop();
        // Flush decoder to drain and discard any buffered frames,
        // so stale reference frames don't corrupt the next session
        decoder->flush();
        // Free NVDEC decoder context and all GPU surfaces (DPB buffers).
        // Stopped cameras should not hold VRAM — with 100 cameras created
        // but only 5 running, the 95 idle decoders would consume ~5-10 GB.
        // The decoder will be re-initialized automatically when the next
        // video packet arrives after Start() is called.
        decoder->uninit();
        // NOTE(review): m_bVideoInited is written here without _mutex held —
        // presumably tolerated because it is a simple flag; confirm.
        m_bVideoInited = FALSE;
    }
    // Clear queue but KEEP m_currentImage and m_lastJpegImage —
    // getImage()/getJpegImage() will return the last good frame while decoder stabilizes
    {
        std::lock_guard<std::recursive_mutex> lock(_mutex);
        g_frameQueue.clearQueue();
        m_lastFrameSeq = 0;
    }
}
// Initialize the audio decode + render pipeline.
// @param codec        FFmpeg codec id of the incoming audio stream
// @param samplerate   samples per second
// @param channels     channel count
// @param bitpersample bits per sample
// @return TRUE when the decoder initialized (or already was)
BOOL CVideoPlayer::openAudio(enum AVCodecID codec, int samplerate, int channels, int bitpersample)
{
    // Idempotent: a second call while initialized is a no-op success.
    if (m_bAudioInited)
    {
        return TRUE;
    }
    if (m_pAudioDecoder)
    {
        m_bAudioInited = m_pAudioDecoder->init(codec, samplerate, channels, bitpersample);
    }
    if (m_bAudioInited)
    {
        m_pAudioDecoder->setCallback(AudioDecoderCallback, this);
        // Platform-specific audio render backend.
#if __WINDOWS_OS__
        m_pAudioPlay = std::make_unique<CWAudioPlay>();/// new CWAudioPlay();
#elif defined(IOS)
        m_pAudioPlay = std::make_unique<CMAudioPlay>();
#elif __LINUX_OS__
        m_pAudioPlay = std::make_unique<CQAudioPlay>();
#endif
        if (m_pAudioPlay)
        {
            m_pAudioPlay->startPlay(samplerate, channels);
        }
        m_pAudioListMutex = sys_os_create_mutex();
        m_audioPlayFlag = FALSE;//disable by default
        //m_audioPlayThread = sys_os_create_thread((void*)AudioPlayThread, this);
    }
    // Record stream parameters in portable form regardless of the outcome.
    m_nAudioCodec = to_audio_codec(codec);
    m_nSampleRate = samplerate;
    m_nChannel = channels;
    m_nBitPerSample = bitpersample;
    return m_bAudioInited;
}
void CVideoPlayer::enableAudio(bool status) {
    // Gate audio rendering; decoding continues either way.
    m_audioPlayFlag = status ? TRUE : FALSE;
}
BOOL CVideoPlayer::openAudio(int codec, int samplerate, int channels, int bitpersample)
{
    // Map the portable codec id onto FFmpeg's enum and delegate.
    const enum AVCodecID avid = to_audio_avcodecid(codec);
    return openAudio(avid, samplerate, channels, bitpersample);
}
void CVideoPlayer::closeAudio()
{
    // Stop feeding the render path before tearing down its resources.
    m_audioPlayFlag = FALSE;
    if (m_pAudioListMutex)
    {
        sys_os_destroy_sig_mutex(m_pAudioListMutex);
        m_pAudioListMutex = NULL;
    }
    // Drop any queued-but-unplayed audio frames.
    if (!a_frameQueue.isEmpty())
    {
        a_frameQueue.clearQueue();
    }
    m_bAudioInited = FALSE;
}
int CVideoPlayer::getVideoWidth()
{
    // 0 until a decoder exists and has seen the stream geometry.
    return m_pVideoDecoder ? m_pVideoDecoder->getWidth() : 0;
}
int CVideoPlayer::getVideoHeight()
{
    // 0 until a decoder exists and has seen the stream geometry.
    return m_pVideoDecoder ? m_pVideoDecoder->getHeight() : 0;
}
double CVideoPlayer::getFrameRate()
{
    // 0 until a decoder exists and has measured the stream's frame rate.
    return m_pVideoDecoder ? m_pVideoDecoder->getFrameRate() : 0;
}
void CVideoPlayer::setTargetFPS(double intervalMs)
{
    // Update the delivery throttle interval and restart its timing baseline.
    std::lock_guard<std::recursive_mutex> lock(_mutex);
    m_targetIntervalMs = intervalMs;
    m_targetFPSInitialized = false; // reset timing on change
}
double CVideoPlayer::getLastFrameAgeMs()
{
    // Milliseconds since the decoder last delivered a frame;
    // 0.0 before the first frame arrives.
    std::lock_guard<std::recursive_mutex> lock(_mutex);
    if (!m_lastDecoderFrameTimeSet) {
        return 0.0;
    }
    const auto elapsed = std::chrono::steady_clock::now() - m_lastDecoderFrameTime;
    return std::chrono::duration<double, std::milli>(elapsed).count();
}
void CVideoPlayer::playVideo(uint8* data, int len, uint32 ts, uint16 seq)
{
    // Mirror the elementary stream into the AVI file while recording.
    if (m_bRecording)
    {
        sys_os_mutex_enter(m_pRecordMutex);
        recordVideo(data, len, ts, seq);
        sys_os_mutex_leave(m_pRecordMutex);
    }
    // Advance the RTP-timestamp → wall-clock mapping for this stream.
    updateClock(&m_videoClock, ts, getVideoClock());
    if (!m_bVideoInited || !m_bPlaying)
    {
        return;
    }
    // Present the packet with its synchronized timestamp in microseconds.
    m_pVideoDecoder->decode(data, len, m_videoClock.SyncTime.tv_sec * 1000000 + m_videoClock.SyncTime.tv_usec);
}
void CVideoPlayer::playAudio(uint8* data, int len, uint32 ts, uint16 seq)
{
    // Mirror the elementary stream into the AVI file while recording.
    if (m_bRecording)
    {
        sys_os_mutex_enter(m_pRecordMutex);
        recordAudio(data, len, ts, seq);
        sys_os_mutex_leave(m_pRecordMutex);
    }
    // Advance the RTP-timestamp → wall-clock mapping for this stream.
    updateClock(&m_audioClock, ts, getAudioClock());
    if (!m_bAudioInited)
    {
        return;
    }
    // Decode with the synchronized timestamp in microseconds.
    m_pAudioDecoder->decode(data, len, m_audioClock.SyncTime.tv_sec * 1000000 + m_audioClock.SyncTime.tv_usec);
}
// Advance a stream clock: map RTP timestamp `ts` onto wall-clock time using
// the stream's `frequency` (ticks per second), updating clock->SyncTime /
// SyncTimestamp. Mirrors the RTCP-less sync scheme used by live555-style
// players. The unsigned subtraction below intentionally wraps, so timestamp
// wraparound still yields a small signed delta.
void CVideoPlayer::updateClock(HTCLOCK* clock, uint32 ts, int frequency)
{
    // ts == 0 is used as "no timestamp" — ignore.
    if (ts == 0)
    {
        return;
    }
    // First sample: anchor the clock at "now".
    if (clock->SyncTime.tv_sec == 0 && clock->SyncTime.tv_usec == 0)
    {
        clock->SyncTimestamp = ts;
        gettimeofday(&clock->SyncTime, NULL);
    }
    // uint32 difference converted to int: gives a signed delta that is
    // correct even across timestamp wraparound.
    int timestampDiff = ts - clock->SyncTimestamp;
    // Divide this by the timestamp frequency to get real time:
    double timeDiff = timestampDiff / (double)frequency;
    uint32 const million = 1000000;
    uint32 seconds, uSeconds;
    if (timeDiff >= 0.0)
    {
        // Add whole seconds plus the fractional part in microseconds,
        // normalizing any microsecond carry.
        seconds = clock->SyncTime.tv_sec + (uint32)(timeDiff);
        uSeconds = clock->SyncTime.tv_usec + (uint32)((timeDiff - (uint32)timeDiff) * million);
        if (uSeconds >= million)
        {
            uSeconds -= million;
            ++seconds;
        }
    }
    else
    {
        // Negative delta (out-of-order timestamp): subtract with borrow.
        timeDiff = -timeDiff;
        seconds = clock->SyncTime.tv_sec - (uint32)(timeDiff);
        uSeconds = clock->SyncTime.tv_usec - (uint32)((timeDiff - (uint32)timeDiff) * million);
        if ((int)uSeconds < 0)
        {
            uSeconds += million;
            --seconds;
        }
    }
    // Save these as the new synchronization timestamp & time:
    clock->SyncTimestamp = ts;
    clock->SyncTime.tv_sec = seconds;
    clock->SyncTime.tv_usec = uSeconds;
}
// Lazily (re)allocate `frame` so it matches the requested geometry and
// pixel format. An existing frame with identical width/height/format is
// reused untouched; any mismatch frees it and allocates a fresh buffer.
//
// @param frame   In/out frame pointer; may be NULL on entry.
// @param width   Target width in pixels (0 rejected).
// @param height  Target height in pixels (0 rejected).
// @param pixfmt  Target pixel format (AV_PIX_FMT_NONE rejected).
// @return TRUE when `frame` is valid and matches the request, else FALSE
//         (on failure `frame` is left NULL).
BOOL CVideoPlayer::initFrame(AVFrame*& frame, int width, int height, AVPixelFormat pixfmt)
{
    if (0 == width || 0 == height || AV_PIX_FMT_NONE == pixfmt)
    {
        return FALSE;
    }

    // Fast path: current frame already matches — nothing to do.
    const bool matches = (frame != NULL) &&
                         frame->width == width &&
                         frame->height == height &&
                         frame->format == pixfmt;
    if (matches)
    {
        return TRUE;
    }

    if (frame != NULL)
    {
        av_frame_free(&frame);
    }

    frame = av_frame_alloc();
    if (frame == NULL)
    {
        return FALSE;
    }
    frame->format = pixfmt;
    frame->width = width;
    frame->height = height;

    if (av_frame_get_buffer(frame, 0) != 0)
    {
        av_frame_free(&frame);
        return FALSE;
    }
    av_frame_make_writable(frame);
    return TRUE;
}
// Convert one decoded frame into m_pSnapFrame using the configured
// snapshot pixel format (m_nSnapVideoFmt).
//
// @param frame  Decoded source frame; NULL is rejected (previously it was
//               dereferenced unchecked via frame->width).
// @return TRUE on success, FALSE on allocation/conversion failure.
BOOL CVideoPlayer::doSnapshot(AVFrame* frame)
{
    if (NULL == frame)
    {
        return FALSE;
    }
    // initFrame() already reuses m_pSnapFrame when width/height/format are
    // unchanged and reallocates otherwise. The previous unconditional
    // av_frame_free() here defeated that reuse and forced a fresh buffer
    // allocation on every snapshot.
    if (!initFrame(m_pSnapFrame,
        frame->width,
        frame->height,
        to_avpixelformat(m_nSnapVideoFmt)))
    {
        return FALSE;
    }
    if (NULL == convertFrame(frame, m_pSnapFrame, FALSE))
    {
        return FALSE;
    }
    return TRUE;
}
// Convert srcframe into dstframe's pixel format (same dimensions) via
// libswscale, optionally flipping vertically.
//
// @param srcframe  Source frame (not modified).
// @param dstframe  Pre-allocated destination frame; its format drives the
//                  conversion target.
// @param updown    When TRUE the image is flipped vertically using negative
//                  strides. Plane layout of YUV420-style formats is assumed
//                  (full-height plane 0, half-height planes 1/2) — matches
//                  the formats this player decodes to.
// @return dstframe on success (pts/pkt_dts copied from source), NULL on
//         failure.
AVFrame* CVideoPlayer::convertFrame(AVFrame* srcframe, AVFrame* dstframe, BOOL updown)
{
    if (!srcframe || !dstframe) {
        return NULL;
    }
    SwsContext* swsctx = sws_getContext(srcframe->width,
                                        srcframe->height,
                                        (enum AVPixelFormat)srcframe->format,
                                        srcframe->width,
                                        srcframe->height,
                                        (enum AVPixelFormat)dstframe->format,
                                        SWS_BICUBIC, NULL, NULL, NULL);
    if (!swsctx) {
        return NULL;
    }

    // BUGFIX: build local plane pointers/strides for the vertical flip
    // instead of rewriting srcframe->data[]/linesize[] in place. The old
    // code never restored them, so every updown call permanently corrupted
    // the caller's frame (and compounded if the frame was converted again).
    const uint8_t* srcData[AV_NUM_DATA_POINTERS];
    int srcLinesize[AV_NUM_DATA_POINTERS];
    for (int i = 0; i < AV_NUM_DATA_POINTERS; ++i) {
        srcData[i] = srcframe->data[i];
        srcLinesize[i] = srcframe->linesize[i];
    }
    if (updown) {
        // Point each plane at its last row and walk upward (negative stride).
        srcData[0] = srcframe->data[0] + srcframe->linesize[0] * (srcframe->height - 1);
        srcLinesize[0] = -srcframe->linesize[0];
        srcData[1] = srcframe->data[1] + srcframe->linesize[1] * (srcframe->height / 2 - 1);
        srcLinesize[1] = -srcframe->linesize[1];
        srcData[2] = srcframe->data[2] + srcframe->linesize[2] * (srcframe->height / 2 - 1);
        srcLinesize[2] = -srcframe->linesize[2];
    }

    int ret = sws_scale(swsctx,
                        srcData,
                        srcLinesize, 0,
                        srcframe->height,
                        dstframe->data,
                        dstframe->linesize);
    sws_freeContext(swsctx); // Free context after scaling attempt
    if (ret > 0) {
        dstframe->pts = srcframe->pts;
        dstframe->pkt_dts = srcframe->pkt_dts;
        return dstframe;
    }
    log_print(HT_LOG_ERR, "%s, sws_scale failed\r\n", __FUNCTION__);
    return NULL;
}
// Decoder callback invoked for every successfully decoded video frame.
//
// Runs with player._mutex held for the whole body; per the comments below
// it is also called from inside the decoder's own lock scope, so this is
// the one place where both locks are held together.
//
// Responsibilities, in order:
//   1. Serve a pending snapshot request.
//   2. Drop frames flagged with decode errors.
//   3. After (re)start, drop everything until the first keyframe, then
//      "settle" for SETTLE_FRAME_COUNT frames before frames are delivered.
//   4. Record the frame arrival time for staleness detection.
//   5. Rate-limit post-decode work (clone/queue/CUDA capture) to the
//      configured target FPS — the decode itself still ran for every
//      packet so the reference-frame chain stays intact.
//   6. Publish the frame to g_frameQueue and capture the CUDA HW frame.
//
// @param frame  Decoded frame owned by the decoder; cloned before storage.
void CVideoPlayer::onVideoFrame(AVFrame* frame)
{
    std::lock_guard<std::recursive_mutex> lock(_mutex); // Protect against concurrent access
    if (!frame) return; // Check for null pointer
    // Snapshot is served from the raw decoded frame, before any drop logic,
    // so a pending snapshot succeeds even while frames are rate-limited.
    if (m_bSnapshot)
    {
        if (doSnapshot(frame))
        {
            m_bSnapshot = FALSE;
        }
    }
    if (m_bPlaying && m_videoPlayFlag) {
        // Drop any frame with decode errors (corrupted reference frames, etc.)
        if (frame->decode_error_flags != 0) {
            fprintf(stderr, "[HWDecode] Dropping frame with decode errors (flags=0x%x)\n", frame->decode_error_flags);
            return;
        }
        // After start/restart, skip corrupted frames until first keyframe (IDR) arrives.
        // HEVC/H.264 P/B frames received before the first I-frame will produce visual
        // corruption ("Could not find ref with POC", green/grey artifacts).
        if (m_bWaitingForKeyframe) {
            if (frame->key_frame || frame->pict_type == AV_PICTURE_TYPE_I) {
                m_bWaitingForKeyframe = false;
                m_cleanFrameCount = 0;
                fprintf(stderr, "[HWDecode] First keyframe received, settling for %d frames\n", SETTLE_FRAME_COUNT);
            } else {
                return; // Drop this frame — not yet safe to decode
            }
        }
        // Record wall-clock time of every decoded frame (even rate-limited ones).
        // Used by getLastFrameAgeMs() to detect truly stale cameras.
        m_lastDecoderFrameTime = std::chrono::steady_clock::now();
        m_lastDecoderFrameTimeSet = true;
        // --- Frame rate limiting ---
        // Skip post-decode processing (clone, queue push, CUDA clone) if not enough
        // time has elapsed since the last processed frame. The decode itself still
        // runs for every packet to maintain the H.264/H.265 reference frame chain.
        if (m_targetIntervalMs > 0.0) {
            auto now = std::chrono::steady_clock::now();
            if (!m_targetFPSInitialized) {
                // First frame after SetTargetFPS(): start the interval timer
                // but never drop this frame.
                m_lastProcessedTime = now;
                m_targetFPSInitialized = true;
            } else {
                auto elapsed = std::chrono::duration<double, std::milli>(now - m_lastProcessedTime).count();
                if (elapsed < m_targetIntervalMs) {
                    return; // Skip this frame — too soon
                }
            }
            m_lastProcessedTime = now;
        }
        // --- End frame rate limiting ---
        // Push frame to queue; during settle period getImage() will ignore the queue
        // and keep returning the last good cached image
        g_frameQueue.pushFrame(frame); // pushFrame() clones the frame internally
        // Capture CUDA HW frame for zero-copy inference.
        // We're inside decode()'s lock scope (decoder._mutex held) AND onVideoFrame
        // holds player._mutex — so this is the ONE place where both locks are held
        // and we can safely clone the CUDA frame without deadlock risk.
        // cloneCudaHWFrame_unlocked() is safe because decoder._mutex is already held.
        if (m_pVideoDecoder && m_pVideoDecoder->isCudaHWAccel()) {
            if (m_currentCudaHWFrame) av_frame_free(&m_currentCudaHWFrame);
            m_currentCudaHWFrame = m_pVideoDecoder->cloneCudaHWFrame_unlocked();
        }
        // Track how many clean frames have arrived since keyframe
        if (m_cleanFrameCount < SETTLE_FRAME_COUNT) {
            m_cleanFrameCount++;
            if (m_cleanFrameCount == SETTLE_FRAME_COUNT) {
                fprintf(stderr, "[HWDecode] Settle complete, delivering new frames\n");
            }
        }
    }
}
// Decoder callback invoked for every successfully decoded audio frame.
// Clones the frame into the audio frame queue when playback is active.
//
// NOTE: unlike onVideoFrame(), no snapshot attempt is made here. The old
// code called doSnapshot() on audio frames, which can never succeed (audio
// frames carry no video planes, so width/height are 0 and initFrame()
// rejects them) — its only effect was to needlessly free m_pSnapFrame on
// every audio frame while a snapshot was pending.
//
// @param frame  Decoded audio frame owned by the decoder; cloned by
//               pushFrame() before storage.
void CVideoPlayer::onAudioFrame(AVFrame* frame)
{
    // Support for audio playback
    std::lock_guard<std::recursive_mutex> lock(_mutex); // Protect against concurrent access
    if (!frame) return; // Check for null pointer
    if (m_bPlaying && m_audioPlayFlag) {
        a_frameQueue.pushFrame(frame); // pushFrame() clones the frame internally
    }
}
// Fetch the most recent decoded frame as a BGR cv::Mat.
//
// Two-phase locking protocol (statement order matters — do not reshuffle):
//   Phase 1 (under _mutex): inspect playback/settle state, check the frame
//     queue's sequence number for the fast path, and clone the latest frame.
//   Unlocked: run the expensive YUV->BGR conversion so the decoder callback
//     and CUDA capture path can proceed concurrently.
//   Phase 2 (under _mutex): publish the converted image, sequence number,
//     and the preserved raw YUV/NV12 frame for GPU fast-path inference.
//
// @param[out] width   Width of the returned image (cached value on fallback).
// @param[out] height  Height of the returned image.
// @param[out] pts     Monotonic frame counter (incremented per new frame).
// @return Shallow (refcounted) cv::Mat of the latest frame; the cached
//         image while stopped/settling; an empty Mat when no frame is
//         available or on conversion failure/exception.
cv::Mat CVideoPlayer::getImage(int& width, int& height, int64_t& pts) {
    try {
        AVFrame* frameToProcess = nullptr;
        uint64_t currentSeq = 0;
        // Timing breakdown — gated by ANSCORE_DEBUGVIEW (zero overhead in production).
        // t0 = entry, t1 = after pulling frame from queue, t2 = after YUV->BGR,
        // t3 = after publish. Throttled to every 30 full-path calls (~1/sec @30fps).
        using clk = std::chrono::steady_clock;
        const auto t0 = clk::now();
        // --- Phase 1: short locked section — examine state, pull latest frame ---
        {
            std::lock_guard<std::recursive_mutex> lock(_mutex);
            if (!m_bPlaying) {
                width = m_currentImage.cols;
                height = m_currentImage.rows;
                pts = m_pts;
                return m_currentImage; // Shallow copy (reference counted)
            }
            // While waiting for keyframe or during settle period after restart,
            // return the last good cached image to avoid showing corrupted frames
            if (m_bWaitingForKeyframe || m_cleanFrameCount < SETTLE_FRAME_COUNT) {
                width = m_currentImage.cols;
                height = m_currentImage.rows;
                pts = m_pts;
                return m_currentImage;
            }
            // Fast path: same frame as last call — skip clone + BGR conversion
            currentSeq = g_frameQueue.getSequence();
            if (currentSeq == m_lastFrameSeq && !m_currentImage.empty()) {
                width = m_currentImage.cols;
                height = m_currentImage.rows;
                pts = m_pts;
                return m_currentImage;
            }
            if (g_frameQueue.isEmpty()) {
                width = m_currentImage.cols;
                height = m_currentImage.rows;
                pts = m_pts;
                std::cerr << "No frame available in getImage()" << std::endl;
                return cv::Mat();
            }
            // getLatestFrame() clones the AVFrame — we own it from here
            frameToProcess = g_frameQueue.getLatestFrame();
            if (!frameToProcess) {
                width = m_currentImage.cols;
                height = m_currentImage.rows;
                pts = m_pts;
                return cv::Mat();
            }
        }
        // --- _mutex released here ---
        // At 4K NV12, cvtColorTwoPlane takes ~100300 ms on CPU; during that
        // window the decoder callback (onVideoFrame) is free to push the next
        // frame and the CUDA HW capture path can run in parallel.
        const auto t1 = clk::now();
        cv::Mat converted;
        try {
            converted = avframeToCVMat(frameToProcess);
        }
        catch (const std::exception& e) {
            std::cerr << "Exception while converting AVFrame to cv::Mat: " << e.what() << std::endl;
        }
        const auto t2 = clk::now();
        // --- Phase 2: short locked section — publish new frame state ---
        cv::Mat result; // Snapshot taken under the lock, returned after release.
        {
            std::lock_guard<std::recursive_mutex> lock(_mutex);
            // Only publish on successful conversion; otherwise the previous
            // cached image and sequence number remain authoritative.
            if (!converted.empty()) {
                m_currentImage = converted;
                m_pts++;
                m_lastFrameSeq = currentSeq;
            }
            // Preserve raw YUV/NV12 frame for GPU fast-path inference
            // (NV12 from HW decode, YUV420P/YUVJ420P from SW decode)
            if (frameToProcess &&
                (frameToProcess->format == AV_PIX_FMT_NV12 ||
                 frameToProcess->format == AV_PIX_FMT_YUV420P ||
                 frameToProcess->format == AV_PIX_FMT_YUVJ420P)) {
                if (m_currentNV12Frame) av_frame_free(&m_currentNV12Frame);
                m_currentNV12Frame = av_frame_clone(frameToProcess);
            }
            width = m_currentImage.cols;
            height = m_currentImage.rows;
            pts = m_pts;
            result = m_currentImage; // Shallow copy under lock — refcount keeps buffer alive
        }
        // Free our clone only after phase 2 — m_currentNV12Frame was cloned from it.
        av_frame_free(&frameToProcess);
        // Emit timing breakdown. Throttled so DebugView / stderr stay readable.
        {
            static std::atomic<uint64_t> s_timingCount{0};
            const uint64_t n = s_timingCount.fetch_add(1, std::memory_order_relaxed);
            if ((n % 30) == 0) {
                const auto t3 = clk::now();
                auto ms = [](clk::time_point a, clk::time_point b) {
                    return std::chrono::duration<double, std::milli>(b - a).count();
                };
                ANS_DBG("MEDIA_Timing",
                    "getImage call#%llu pull=%.2fms convert=%.2fms publish=%.2fms total=%.2fms "
                    "size=%dx%d seq=%llu (this=%p)",
                    (unsigned long long)n,
                    ms(t0, t1), ms(t1, t2), ms(t2, t3), ms(t0, t3),
                    width, height,
                    (unsigned long long)currentSeq,
                    (void*)this);
            }
        }
        return result;
    }
    catch (const std::exception& e) {
        std::cerr << "Unexpected exception in getImage(): " << e.what() << std::endl;
        return cv::Mat(); // Return an empty cv::Mat if an exception occurs
    }
    catch (...) {
        std::cerr << "Unknown exception in getImage()" << std::endl;
        return cv::Mat(); // Return an empty cv::Mat if an exception occurs
    }
}
// Fetch the most recent decoded frame encoded as a JPEG byte string.
//
// Unlike getImage(), the whole body runs under _mutex (the recursive mutex
// allows the nested avframeToJpegString path to re-enter it). On any
// failure — pending keyframe/settle, empty queue, or encode exception —
// the last successfully encoded JPEG (m_lastJpegImage) is returned so
// callers always get a displayable image once one has ever been produced.
//
// @param[out] width   Cached stream width (m_Width). NOTE(review): this is
//                     member state, not the pulled frame's width — presumably
//                     kept in sync elsewhere; confirm against the decoder
//                     init path.
// @param[out] height  Cached stream height (m_Height); same caveat as width.
// @param[out] pts     Monotonic counter, incremented per call here and
//                     wrapped to 0 at INT64_MAX.
// @return JPEG bytes of the latest frame, or the last good JPEG (possibly
//         empty before the first successful encode).
std::string CVideoPlayer::getJpegImage(int& width, int& height, int64_t& pts) {
    try {
        // Timing breakdown — gated by ANSCORE_DEBUGVIEW (zero overhead in production).
        using clk = std::chrono::steady_clock;
        const auto t0 = clk::now();
        // Use same _mutex as getImage() to protect shared state consistently
        // recursive_mutex allows nested calls to avframeToJpegString → _mutex
        std::lock_guard<std::recursive_mutex> lock(_mutex);
        const auto t1 = clk::now();
        // While waiting for keyframe or during settle period after restart,
        // return the last good cached JPEG to avoid showing corrupted frames
        if (m_bWaitingForKeyframe || m_cleanFrameCount < SETTLE_FRAME_COUNT) {
            width = m_Width;
            height = m_Height;
            pts = m_pts;
            return m_lastJpegImage; // Last good JPEG (may be empty on first-ever start)
        }
        AVFrame* frameToProcess = g_frameQueue.getLatestFrame(); // Get a safe copy
        if (!frameToProcess) {
            return m_lastJpegImage; // Return the last valid JPEG image if no frame is available
        }
        const auto t2 = clk::now();
        // Capture format/size before the frame may be freed — used in the
        // throttled timing log below.
        const int frameFmt = frameToProcess->format;
        const int frameW = frameToProcess->width;
        const int frameH = frameToProcess->height;
        try {
            // HW decode delivers NV12; SW decode delivers YUV420P/YUVJ420P —
            // each has its own JPEG encode path.
            if (frameToProcess->format == AV_PIX_FMT_NV12) {
                m_jpegImage = avframeToJpegString(frameToProcess); // Convert frame to JPEG from NV12
            }
            else {
                m_jpegImage = avframeYUVJ420PToJpegString(frameToProcess); // Convert frame to JPEG from YUVJ420P
            }
        }
        catch (const std::exception& e) {
            std::cerr << "Exception while converting AVFrame to JPEG string: " << e.what() << std::endl;
            av_frame_free(&frameToProcess);
            return m_lastJpegImage;
        }
        const auto t3 = clk::now();
        av_frame_free(&frameToProcess);
        if (m_pts < INT64_MAX) {
            m_pts++;
        }
        else {
            m_pts = 0; // Reset to zero when max is reached
        }
        // Update the width, height, and pts
        width = m_Width;
        height = m_Height;
        pts = m_pts;
        // Empty encode result leaves the previous good JPEG in place.
        if (!m_jpegImage.empty()) {
            m_lastJpegImage = std::move(m_jpegImage); // Move instead of copy
        }
        // Throttled timing breakdown for the JPEG hot path.
        {
            static std::atomic<uint64_t> s_jpegTimingCount{0};
            const uint64_t n = s_jpegTimingCount.fetch_add(1, std::memory_order_relaxed);
            if ((n % 30) == 0) {
                const auto t4 = clk::now();
                auto ms = [](clk::time_point a, clk::time_point b) {
                    return std::chrono::duration<double, std::milli>(b - a).count();
                };
                const char* fmtName = av_get_pix_fmt_name((AVPixelFormat)frameFmt);
                ANS_DBG("MEDIA_JpegTiming",
                    "getJpegImage call#%llu lock=%.2fms pull=%.2fms encode=%.2fms publish=%.2fms "
                    "total=%.2fms src_fmt=%s %dx%d jpeg_bytes=%zu (this=%p)",
                    (unsigned long long)n,
                    ms(t0, t1), ms(t1, t2), ms(t2, t3), ms(t3, t4), ms(t0, t4),
                    fmtName ? fmtName : "?",
                    frameW, frameH,
                    m_lastJpegImage.size(),
                    (void*)this);
            }
        }
        // Return the most recent valid JPEG image
        return m_lastJpegImage;
    }
    catch (const std::exception& e) {
        std::cerr << "Unexpected exception in getJpegImage(): " << e.what() << std::endl;
    }
    catch (...) {
        std::cerr << "Unknown exception in getJpegImage()" << std::endl;
    }
    // If any exception occurs, return the last valid JPEG image
    return m_lastJpegImage;
}