Files
ANSCORE/modules/ANSODEngine/SCRFDFaceDetector.cpp

1316 lines
44 KiB
C++
Raw Permalink Normal View History

2026-03-28 16:54:11 +11:00
#include "SCRFDFaceDetector.h"
#include "ANSGpuFrameRegistry.h"
#include "NV12PreprocessHelper.h" // tl_currentGpuFrame()
#include "Utility.h"
#include <chrono>
//#define FNS_DEBUG
namespace ANSCENTER {
// Initialize the SCRFD face detector: runs base-class initialization, then
// acquires a pooled TensorRT engine built from scrfdface.onnx. Keeps an
// already-loaded engine alive across re-initialization to avoid a rebuild.
bool ANSSCRFDFD::Initialize(std::string licenseKey, ModelConfig modelConfig, const std::string& modelZipFilePath,
    const std::string& modelZipPassword, std::string& labelMap) {
    // Clean up existing resources before reinitialization — but only when no
    // live engine is held (Destroy() would release the pooled engine slot).
    const bool engineAlreadyLoaded = _isInitialized && m_trtEngine != nullptr;
    if (!engineAlreadyLoaded) Destroy();
    // Call base class Initialize
    bool result = ANSFDBase::Initialize(licenseKey, modelConfig, modelZipFilePath, modelZipPassword, labelMap);
    if (!result) return false;
    labelMap = "Face"; // single-class detector
    _licenseValid = true;
    try {
        // SCRFD uses a fixed 640x640 network input.
        _modelConfig = modelConfig;
        _modelConfig.inpHeight = 640;
        _modelConfig.inpWidth = 640;
        _modelConfig.modelType = ModelType::FACEDETECT;
        _modelConfig.detectionType = DetectionType::FACEDETECTOR;
        std::string onnxfile = CreateFilePath(_modelFolder, "scrfdface.onnx");
        if (!std::filesystem::exists(onnxfile)) {
            this->_logger.LogError("ANSSCRFDFD::Initialize. Model scrfdface.onnx file does not exist", onnxfile, __FILE__, __LINE__);
            return false;
        }
        _modelFilePath = onnxfile;
        // Initialize TensorRT via shared pool (only when no engine is held yet).
        if (!m_trtEngine) {
            m_options.precision = ANSCENTER::Precision::FP16;
            m_options.optBatchSize = _modelConfig.gpuOptBatchSize;
            m_options.maxBatchSize = _modelConfig.gpuMaxBatchSize;
            m_options.deviceIndex = _modelConfig.gpuDeviceIndex;
            m_options.maxInputHeight = _modelConfig.maxInputHeight;
            m_options.minInputHeight = _modelConfig.minInputHeight;
            m_options.optInputHeight = _modelConfig.optInputHeight;
            m_options.maxInputWidth = _modelConfig.maxInputWidth;
            m_options.minInputWidth = _modelConfig.minInputWidth;
            m_options.optInputWidth = _modelConfig.optInputWidth;
            m_options.calibrationBatchSize = 1;
            // Pool key: engines are shared per (model file, precision, max batch).
            m_poolKey = { _modelFilePath,
                          static_cast<int>(m_options.precision),
                          m_options.maxBatchSize };
            m_trtEngine = EnginePoolManager<float>::instance().acquire(
                m_poolKey, m_options, _modelFilePath,
                SUB_VALS, DIV_VALS, NORMALIZE, m_maxSlotsPerGpu);
            m_usingSharedPool = (m_trtEngine != nullptr);
            if (!m_trtEngine) {
                this->_logger.LogError("ANSSCRFDFD::Initialize. Unable to build or load TensorRT engine.", _modelFilePath, __FILE__, __LINE__);
                return false;
            }
        }
        // SCRFD head configuration: 3 FPN levels (strides 8/16/32), 2 anchors
        // per grid cell, 5-point facial keypoints enabled.
        fmc = 3;
        feat_stride_fpn = { 8, 16, 32 };
        num_anchors = 2;
        use_kps = true;
        _movementObjects.clear();
        _retainDetectedFaces = 0;
        _isInitialized = true;
        return true;
    }
    catch (const std::exception& e) {
        this->_logger.LogFatal("ANSSCRFDFD::Initialize", e.what(), __FILE__, __LINE__);
        return false;
    }
}
// Load the SCRFD model from a (password-protected) zip: base class extracts
// the archive and sets _modelFolder, then a pooled TensorRT engine is
// acquired for scrfdface.onnx.
// Fixes: error-log tag previously said "Initialize" in this function;
// removed an unused local (`engineAlreadyLoaded` was computed but never read).
bool ANSSCRFDFD::LoadModel(const std::string& modelZipFilePath, const std::string& modelZipPassword) {
    try {
        // We need the base class to populate _modelFolder.
        bool result = ANSFDBase::LoadModel(modelZipFilePath, modelZipPassword);
        if (!result) return false;
        // SCRFD uses a fixed 640x640 network input.
        _modelConfig.modelType = ModelType::FACEDETECT;
        _modelConfig.detectionType = DetectionType::FACEDETECTOR;
        _modelConfig.inpHeight = 640;
        _modelConfig.inpWidth = 640;
        _movementObjects.clear();
        _retainDetectedFaces = 0;
        std::string onnxfile = CreateFilePath(_modelFolder, "scrfdface.onnx");
        if (!std::filesystem::exists(onnxfile)) {
            this->_logger.LogError("ANSSCRFDFD::LoadModel. Model scrfdface.onnx file does not exist", onnxfile, __FILE__, __LINE__);
            return false;
        }
        _modelFilePath = onnxfile;
        if (!m_trtEngine) {
            m_options.precision = ANSCENTER::Precision::FP16;
            m_options.optBatchSize = _modelConfig.gpuOptBatchSize;
            m_options.maxBatchSize = _modelConfig.gpuMaxBatchSize;
            m_options.deviceIndex = _modelConfig.gpuDeviceIndex;
            m_options.maxInputHeight = _modelConfig.maxInputHeight;
            m_options.minInputHeight = _modelConfig.minInputHeight;
            m_options.optInputHeight = _modelConfig.optInputHeight;
            m_options.maxInputWidth = _modelConfig.maxInputWidth;
            m_options.minInputWidth = _modelConfig.minInputWidth;
            m_options.optInputWidth = _modelConfig.optInputWidth;
            // NOTE(review): Initialize() also sets m_options.calibrationBatchSize = 1
            // before acquiring — confirm whether the omission here is intentional.
            m_poolKey = { _modelFilePath,
                          static_cast<int>(m_options.precision),
                          m_options.maxBatchSize };
            m_trtEngine = EnginePoolManager<float>::instance().acquire(
                m_poolKey, m_options, _modelFilePath,
                SUB_VALS, DIV_VALS, NORMALIZE, m_maxSlotsPerGpu);
            m_usingSharedPool = (m_trtEngine != nullptr);
            if (!m_trtEngine) {
                this->_logger.LogError("ANSSCRFDFD::LoadModel. Unable to build or load TensorRT engine.", _modelFilePath, __FILE__, __LINE__);
                return false;
            }
        }
        // SCRFD head configuration: 3 FPN levels (strides 8/16/32), 2 anchors
        // per grid cell, 5-point facial keypoints enabled.
        fmc = 3;
        feat_stride_fpn = { 8, 16, 32 };
        num_anchors = 2;
        use_kps = true;
        _movementObjects.clear();
        _retainDetectedFaces = 0;
        _isInitialized = true;
        return true;
    }
    catch (std::exception& e) {
        this->_logger.LogFatal("ANSSCRFDFD::LoadModel", e.what(), __FILE__, __LINE__);
        return false;
    }
}
// Load the SCRFD model directly from a folder. Falls back to the default
// model name "scrfdface" when none is supplied; only records the ONNX path
// (the engine itself is built elsewhere).
// Fixes: log tags previously said "LoadModel" for this function; the
// missing-file branch logged the stale _modelFilePath instead of the path
// that was actually probed; grammar in the error message.
bool ANSSCRFDFD::LoadModelFromFolder(std::string licenseKey, ModelConfig modelConfig, std::string modelName, std::string className, const std::string& modelFolder, std::string& labelMap) {
    try {
        // The base class resolves and stores _modelFolder.
        bool result = ANSFDBase::LoadModelFromFolder(licenseKey, modelConfig, modelName, className, modelFolder, labelMap);
        if (!result) return false;
        std::string _modelName = modelName;
        if (_modelName.empty()) {
            _modelName = "scrfdface"; // default model base name
        }
        // SCRFD uses a fixed 640x640 network input.
        _modelConfig.inpHeight = 640;
        _modelConfig.inpWidth = 640;
        _movementObjects.clear();
        _retainDetectedFaces = 0;
        std::string modelFullName = _modelName + ".onnx";
        std::string onnxfile = CreateFilePath(_modelFolder, modelFullName);
        if (std::filesystem::exists(onnxfile)) {
            _modelFilePath = onnxfile;
            this->_logger.LogDebug("ANSSCRFDFD::LoadModelFromFolder. Loading scrfdface weight", _modelFilePath, __FILE__, __LINE__);
        }
        else {
            // Log the path we actually looked for, not the stale member value.
            this->_logger.LogError("ANSSCRFDFD::LoadModelFromFolder. Model scrfdface.onnx file does not exist", onnxfile, __FILE__, __LINE__);
            return false;
        }
        return true;
    }
    catch (std::exception& e) {
        this->_logger.LogFatal("ANSSCRFDFD::LoadModelFromFolder", e.what(), __FILE__, __LINE__);
        return false;
    }
}
// Build (optimize) the TensorRT engine for the current ONNX model and report
// the folder the serialized engine is written to.
// Fix: the `fp16` flag was documented as selecting precision but the code
// unconditionally hard-coded FP16 — it now honours the flag.
bool ANSSCRFDFD::OptimizeModel(bool fp16, std::string& optimizedModelFolder) {
    std::lock_guard<std::recursive_mutex> lock(_mutex);
    if (!FileExist(_modelFilePath)) {
        optimizedModelFolder = "";
        return false;
    }
    optimizedModelFolder = GetParentFolder(_modelFilePath);
    // Check if the engine already exists to avoid reinitializing
    if (!m_trtEngine) {
        m_options.optBatchSize = _modelConfig.gpuOptBatchSize;
        m_options.maxBatchSize = _modelConfig.gpuMaxBatchSize;
        m_options.deviceIndex = _modelConfig.gpuDeviceIndex;
        m_options.maxInputHeight = _modelConfig.maxInputHeight;
        m_options.minInputHeight = _modelConfig.minInputHeight;
        m_options.optInputHeight = _modelConfig.optInputHeight;
        m_options.maxInputWidth = _modelConfig.maxInputWidth;
        m_options.minInputWidth = _modelConfig.minInputWidth;
        m_options.optInputWidth = _modelConfig.optInputWidth;
        m_options.engineFileDir = optimizedModelFolder;
        // Use FP16 or FP32 precision based on the input flag.
        m_options.precision = fp16 ? Precision::FP16 : Precision::FP32;
        // Create the TensorRT inference engine
        m_trtEngine = std::make_shared<Engine<float>>(m_options);
    }
    // Build the TensorRT engine
    auto succ = m_trtEngine->buildWithRetry(_modelFilePath, SUB_VALS, DIV_VALS, NORMALIZE);
    if (!succ) {
        const std::string errMsg =
            "Error: Unable to build the TensorRT engine. "
            "Try increasing TensorRT log severity to kVERBOSE.";
        this->_logger.LogError("ANSSCRFDFD::OptimizeModel", errMsg, __FILE__, __LINE__);
        return false;
    }
    // Also optimize the downstream face-attribute model via the base class.
    std::string optimizedFaceAttributeModelFolder;
    bool result = ANSFDBase::OptimizeModel(fp16, optimizedFaceAttributeModelFolder);
    return result;
}
// Convenience overload without a camera id: forwards to the camera-id
// overload with the default id "CustomCam".
// Fix: removed duplicated detection/liveness logic by delegating.
std::vector<Object> ANSSCRFDFD::RunInference(const cv::Mat& input, bool useDynamicImage, bool validateFace, bool facelivenessCheck) {
    return RunInference(input, "CustomCam", useDynamicImage, validateFace, facelivenessCheck);
}
// Detect faces on a frame; when facelivenessCheck is set, run an additional
// liveness-validation pass over the raw detections before returning.
std::vector<Object> ANSSCRFDFD::RunInference(const cv::Mat& input, const std::string& camera_id, bool useDynamicImage, bool validateFace, bool facelivenessCheck) {
    std::vector<Object> faces = Inference(input, camera_id, useDynamicImage, validateFace);
    if (!facelivenessCheck) {
        return faces;
    }
    return ValidateLivenessFaces(input, faces, camera_id);
}
// Full face-detection pipeline for one frame:
//   Phase 1 (brief lock): validate state/input, optionally replicate-pad a
//     tight face crop so SCRFD has context.
//   Phase 2 (lock released): run Detect() — it manages its own locking
//     around GPU inference.
//   Phase 3 (local data only): filter by score/validity, align each face to
//     112x112 (NV12 GPU affine warp when available, CPU fallback otherwise),
//     and undo the padding offset on boxes/keypoints.
// Fix: removed stray VCS-timestamp lines that had been pasted into the body
// and broke compilation.
std::vector<Object> ANSSCRFDFD::Inference(const cv::Mat& input,
                                          const std::string& camera_id,
                                          bool useDynamicImage,
                                          bool validateFace)
{
    // Bail out immediately while another thread is (re)loading the model.
    if (_modelLoading.load()) return {};

    // Phase 1: Validation + image preprocessing (brief lock)
    cv::Mat im;
    bool croppedFace;
    float scoreThreshold;
    {
        auto lock = TryLockWithTimeout("ANSSCRFDFD::Inference");
        if (!lock.owns_lock()) return {};
        if (!_licenseValid) {
            _logger.LogError("ANSSCRFDFD::Inference", "Invalid license", __FILE__, __LINE__);
            return {};
        }
        if (!_isInitialized) {
            _logger.LogError("ANSSCRFDFD::Inference", "Model is not initialized", __FILE__, __LINE__);
            return {};
        }
        if (input.empty() || input.cols < 10 || input.rows < 10) {
            _logger.LogError("ANSSCRFDFD::Inference", "Invalid input image", __FILE__, __LINE__);
            return {};
        }
        croppedFace = !useDynamicImage;
        scoreThreshold = _modelConfig.detectionScoreThreshold;
        if (croppedFace) {
            // Replicate-pad tight crops so the detector sees surrounding context.
            constexpr int border = 200;
            cv::copyMakeBorder(input, im, border, border, border, border, cv::BORDER_REPLICATE);
            if (im.rows > 1280) {
                // Clamp very tall padded crops to 1280px height, keeping aspect.
                const float aspectRatio = static_cast<float>(im.cols) / static_cast<float>(im.rows);
                constexpr int newHeight = 1280;
                const int newWidth = static_cast<int>(newHeight * aspectRatio);
                cv::resize(im, im, cv::Size(newWidth, newHeight));
            }
        }
        else {
            im = input;
        }
    }
    // Phase 2: Detect faces (mutex released — Detect manages its own brief locks around GPU inference)
    std::vector<Object> detectedFaces;
    try {
        detectedFaces = Detect(im);
    }
    catch (const std::exception& e) {
        _logger.LogFatal("ANSSCRFDFD::Inference", e.what(), __FILE__, __LINE__);
        return {};
    }
    catch (...) {
        _logger.LogFatal("ANSSCRFDFD::Inference", "Unknown exception occurred", __FILE__, __LINE__);
        return {};
    }
    if (detectedFaces.empty()) {
        return {};
    }
    // Phase 3: Process detected faces (operates on per-call local data — no shared state)
    const int originalWidth = croppedFace ? input.cols : 0;
    const int originalHeight = croppedFace ? input.rows : 0;
    constexpr int border = 200;
    constexpr float borderF = 200.0f;
    // NV12 affine warp: precompute scale factors (display-res -> full-res NV12)
    float nv12ScaleX = 1.f, nv12ScaleY = 1.f;
    int nv12FullW = 0, nv12FullH = 0;
    bool nv12AffineAvailable = false;
    const int inferenceGpu = m_trtEngine ? m_trtEngine->getPreferredDeviceIndex() : 0;
    if (!croppedFace && m_nv12Helper.isCudaContextHealthy(_logger, "SCRFD")) {
        auto* gpuData = tl_currentGpuFrame();
        // pixelFormat 23 — presumably NV12; confirm against ANSGpuFrameRegistry.
        if (gpuData && gpuData->pixelFormat == 23 && gpuData->width > 0 && gpuData->height > 0) {
            nv12ScaleX = static_cast<float>(gpuData->width) / im.cols;
            nv12ScaleY = static_cast<float>(gpuData->height) / im.rows;
            nv12FullW = gpuData->width;
            nv12FullH = gpuData->height;
            nv12AffineAvailable = true;
        }
    }
    std::vector<Object> output;
    output.reserve(detectedFaces.size());
    for (auto& face : detectedFaces) {
        if (face.confidence <= scoreThreshold) {
            continue;
        }
        if (validateFace && !isValidFace(face.polygon, face.box, 27)) {
            continue;
        }
        // Get face mask — try NV12 affine warp first, fall back to CPU warpAffine
        cv::Mat mask;
        cv::cuda::GpuMat gpuMask;
        if (nv12AffineAvailable && face.polygon.size() == 5) {
            // Canonical 5-point template of a 112x112 aligned face, scaled once.
            static const std::vector<cv::Point2f> kTemplate112 = []() {
                const std::vector<cv::Point2f> face_template = {
                    {0.34191607f, 0.46157411f}, {0.65653393f, 0.45983393f},
                    {0.50022500f, 0.64050536f}, {0.37097589f, 0.82469196f},
                    {0.63151696f, 0.82325089f}
                };
                std::vector<cv::Point2f> tpl;
                tpl.reserve(5);
                for (const auto& pt : face_template)
                    tpl.emplace_back(pt.x * 112.0f, pt.y * 112.0f);
                return tpl;
            }();
            // Compute affine matrix on CPU (fast ~0.01ms)
            cv::Mat affineMatrix = cv::estimateAffinePartial2D(
                face.polygon, kTemplate112);
            if (!affineMatrix.empty()) {
                auto nv12Face = m_nv12Helper.tryNV12AffineWarp(
                    im, inferenceGpu, affineMatrix, 112, 112,
                    nv12ScaleX, nv12ScaleY, _logger, "SCRFD");
                if (nv12Face.succeeded) {
                    // Log first successful NV12 affine warp (once per process).
                    static bool s_nv12AffineLogged = false;
                    if (!s_nv12AffineLogged) {
                        s_nv12AffineLogged = true;
                        _logger.LogInfo("ANSSCRFDFD::Inference",
                            "NV12 affine warp ACTIVE: face aligned from " +
                            std::to_string(nv12FullW) + "x" + std::to_string(nv12FullH) +
                            " NV12 -> 112x112 BGR (display=" +
                            std::to_string(im.cols) + "x" + std::to_string(im.rows) +
                            " scaleX=" + std::to_string(nv12ScaleX) +
                            " scaleY=" + std::to_string(nv12ScaleY) + ")",
                            __FILE__, __LINE__);
                    }
                    mask = std::move(nv12Face.alignedFaceBGR);
                    gpuMask = std::move(nv12Face.gpuAlignedFace);
                }
            }
        }
        // CPU fallback
        if (mask.empty()) {
            mask = Preprocess(im, face.polygon, im);
        }
        if (mask.empty()) {
            _logger.LogError("ANSSCRFDFD::Inference", "Cannot get mask image", __FILE__, __LINE__);
            continue;
        }
        // Build result object
        Object result;
        result.classId = 0;
        result.className = "Face";
        result.confidence = face.confidence;
        result.cameraId = camera_id;
        // NOTE(review): polygon landmarks are not shifted by the border offset
        // in the croppedFace branch below (kps are) — confirm intentional.
        result.polygon = std::move(face.polygon);
        result.mask = std::move(mask);
        result.gpuMask = std::move(gpuMask);
        if (croppedFace) {
            // Undo the replicate-padding offset and clamp to the original image.
            const int x1_new = std::max(0, face.box.x - border);
            const int y1_new = std::max(0, face.box.y - border);
            const int x2_new = std::min(originalWidth, face.box.x + face.box.width - border);
            const int y2_new = std::min(originalHeight, face.box.y + face.box.height - border);
            result.box = cv::Rect(x1_new, y1_new,
                                  std::max(0, x2_new - x1_new),
                                  std::max(0, y2_new - y1_new));
            result.kps.reserve(face.kps.size());
            for (const auto& pt : face.kps) {
                result.kps.emplace_back(pt - borderF);
            }
        }
        else {
            result.box = face.box;
            result.kps = std::move(face.kps);
        }
        output.push_back(std::move(result));
    }
    return output;
}
// Sliding-window face detection: scans the frame one priority region per call
// (cycling through regions across calls), detects faces in the active region,
// squares-up each box around its centre and attaches a 112px face crop.
// Fix: removed stray VCS-timestamp lines that had been pasted into the body
// and broke compilation.
std::vector<Object> ANSSCRFDFD::InferenceDynamic(const cv::Mat& input, const std::string& camera_id) {
    // Bail out while another thread is (re)loading the model.
    if (_modelLoading.load()) return {};
    auto lock = TryLockWithTimeout("ANSSCRFDFD::InferenceDynamic");
    if (!lock.owns_lock()) return {};
    std::vector<Object> output;
    try {
        if (!_licenseValid) {
            _logger.LogError("ANSSCRFDFD::Inference", "Invalid license", __FILE__, __LINE__);
            return output;
        }
        if (!_isInitialized) {
            _logger.LogError("ANSSCRFDFD::Inference", "Model is not initialized", __FILE__, __LINE__);
            return output;
        }
        if (input.empty() || input.cols < 10 || input.rows < 10) {
            _logger.LogError("ANSSCRFDFD::Inference", "Invalid input image", __FILE__, __LINE__);
            return output;
        }
        // Small inputs are treated as tight face crops and replicate-padded.
        bool croppedFace = (input.cols <= 300 || input.rows <= 300);
        cv::Mat im;
        try {
            if (croppedFace) {
                cv::copyMakeBorder(input, im, 200, 200, 200, 200, cv::BORDER_REPLICATE);
            }
            else {
                im = input.clone();
            }
        }
        catch (const std::exception& e) {
            _logger.LogError("ANSSCRFDFD::Inference", std::string("copyMakeBorder failed: ") + e.what(), __FILE__, __LINE__);
            return output;
        }
        const int originalWidth = input.cols;
        const int originalHeight = input.rows;
        // Advance the region cursor: wrap to the highest-priority region once
        // past the lowest (or on first call).
        std::vector<ImageSection> sections = createSlideScreens(im);
        int lowestPriority = getLowestPriorityRegion();
        if ((_currentPriority > lowestPriority) || (_currentPriority == 0)) {
            _currentPriority = getHighestPriorityRegion();
        }
        else {
            _currentPriority++;
        }
        cv::Rect regionByPriority = getRegionByPriority(_currentPriority);
        _detectedArea = regionByPriority;
#ifdef FNS_DEBUG
        cv::Mat draw = input.clone();
        cv::rectangle(draw, _detectedArea, cv::Scalar(0, 0, 255), 2);
#endif
        std::vector<Object> filteredFaceObjects;
        if (_detectedArea.width > 50 && _detectedArea.height > 50) {
            try {
                // Detect inside the active region, then map boxes back to frame coords.
                cv::Mat activeFrame = im(_detectedArea).clone();
                std::vector<Object> rawDetections = Detect(activeFrame);
                filteredFaceObjects = AdjustDetectedBoundingBoxes(rawDetections, _detectedArea, im.size(), 0.9);
#ifdef FNS_DEBUG
                cv::imshow("Active Area", activeFrame);
                cv::waitKey(1);
#endif
            }
            catch (const std::exception& e) {
                _logger.LogError("ANSSCRFDFD::Inference", std::string("Detect() failed: ") + e.what(), __FILE__, __LINE__);
                return output;
            }
        }
        for (const auto& face : filteredFaceObjects) {
            try {
                if (face.confidence < _modelConfig.detectionScoreThreshold)
                    continue;
#ifdef FNS_DEBUG
                // draw landmarks
                for (cv::Point2f point : face.polygon)
                {
                    cv::circle(draw, cv::Point(point.x + _detectedArea.x, point.y + _detectedArea.y), 2, cv::Scalar(0, 255, 0), -1);
                }
#endif
                if (!isValidFace(face.polygon, face.box, 27, _detectedArea.x, _detectedArea.y))
                    continue;
                Object result;
                int x_min = face.box.x;
                int y_min = face.box.y;
                int x_max = x_min + face.box.width;
                int y_max = y_min + face.box.height;
                if (croppedFace) {
                    // Undo the 200px replicate-padding offset.
                    x_min = std::max(0, x_min - 200);
                    y_min = std::max(0, y_min - 200);
                    x_max = std::min(originalWidth, x_max - 200);
                    y_max = std::min(originalHeight, y_max - 200);
                }
                // Square up the box around its centre, clamped to the image.
                int width_half = std::abs((x_max - x_min) / 2);
                int height_half = std::abs((y_max - y_min) / 2);
                int xc = x_min + width_half;
                int yc = y_min + height_half;
                int c = std::max(width_half, height_half);
                int x1_new = std::max(0, xc - c);
                int y1_new = std::max(0, yc - c);
                int x2_new = std::min(originalWidth, xc + c);
                int y2_new = std::min(originalHeight, yc + c);
                result.classId = 0;
                result.className = "Face";
                result.confidence = face.confidence;
                result.box = cv::Rect(x1_new, y1_new, x2_new - x1_new, y2_new - y1_new);
                result.kps = face.kps;
                result.cameraId = camera_id;
#ifdef FNS_DEBUG
                cv::rectangle(draw, result.box, cv::Scalar(0, 0, 255), 2);
#endif
                try {
                    result.mask = GetCroppedFaceScale(im, x1_new, y1_new, x2_new, y2_new, 112);
                }
                catch (const std::exception& e) {
                    _logger.LogError("ANSSCRFDFD::Inference", std::string("GetCroppedFaceScale failed: ") + e.what(), __FILE__, __LINE__);
                    continue;
                }
                if (!result.mask.empty()) {
                    output.push_back(result);
                }
            }
            catch (const std::exception& e) {
                _logger.LogError("ANSSCRFDFD::Inference", std::string("Processing one face failed: ") + e.what(), __FILE__, __LINE__);
                continue;
            }
        }
#ifdef FNS_DEBUG
        cv::resize(draw, draw, cv::Size(1920, 1080));
        cv::imshow("Detected Areas", draw);
        cv::waitKey(1);
        draw.release();
#endif
        return output;
    }
    catch (const std::exception& e) {
        _logger.LogFatal("ANSSCRFDFD::TensorRTInference", e.what(), __FILE__, __LINE__);
    }
    catch (...) {
        _logger.LogFatal("ANSSCRFDFD::TensorRTInference", "Unknown exception occurred", __FILE__, __LINE__);
    }
    return output;
}
// Core SCRFD detection on one image:
//   Phase 1 (brief lock): validate input/engine, read network input dims.
//   Letterbox math is computed lock-free (pure local arithmetic).
//   Phase 2 (lock released): GPU preprocessing — NV12 fused fast path when a
//     registered GPU frame is available, otherwise CPU letterbox + upload —
//     then TensorRT inference via the shared pool.
//   Phase 3 (brief lock): decode boxes/keypoints and run NMS.
// Fix: removed stray VCS-timestamp lines that had been pasted mid-statement
// into the CPU-fallback path and broke compilation.
std::vector<Object> ANSSCRFDFD::Detect(const cv::Mat& input)
{
    // Phase 1: Validation + engine dims (brief lock)
    int net_h, net_w;
    float imgHeight, imgWidth;
    {
        std::lock_guard<std::recursive_mutex> lock(_mutex);
        if (input.empty() || input.cols < 10 || input.rows < 10) {
            this->_logger.LogError("ANSSCRFDFD::Detect", "Invalid input image", __FILE__, __LINE__);
            return {};
        }
        if (!m_trtEngine) {
            this->_logger.LogFatal("ANSSCRFDFD::Detect", "TensorRT engine not initialized", __FILE__, __LINE__);
            return {};
        }
        imgHeight = static_cast<float>(input.rows);
        imgWidth = static_cast<float>(input.cols);
        // Get and validate expected input dims
        auto inputDims = m_trtEngine->getInputDims();
        if (inputDims.empty() || inputDims[0].nbDims < 3) {
            this->_logger.LogFatal("ANSSCRFDFD::Detect", "Invalid input dimensions", __FILE__, __LINE__);
            return {};
        }
        net_h = inputDims[0].d[1];
        net_w = inputDims[0].d[2];
        // The engine must match the compiled-in INPUT_H/INPUT_W used by the
        // anchor-grid generation in generate_points().
        if (net_h != INPUT_H || net_w != INPUT_W) {
            this->_logger.LogFatal(
                "ANSSCRFDFD::Detect",
                "Engine input dims mismatch with configured INPUT_H/INPUT_W",
                __FILE__, __LINE__
            );
            return {};
        }
    }
    // Compute letterbox scale and symmetric padding (fully local math — no lock needed)
    const float w_r = static_cast<float>(net_w) / imgWidth;
    const float h_r = static_cast<float>(net_h) / imgHeight;
    const float r = std::min(w_r, h_r);
    const int new_unpad_w = static_cast<int>(imgWidth * r);
    const int new_unpad_h = static_cast<int>(imgHeight * r);
    const int pad_w = net_w - new_unpad_w; // >= 0
    const int pad_h = net_h - new_unpad_h; // >= 0
    const int dw = pad_w / 2;
    const int dh = pad_h / 2;
    SCRFDScaleParams scale_params;
    scale_params.ratio = r;
    scale_params.dw = dw;
    scale_params.dh = dh;
    scale_params.flag = true;
    // Phase 2: CUDA preprocessing + inference (mutex released — pool dispatches to idle GPU slot)
    std::vector<std::vector<cv::cuda::GpuMat>> inputs;
    bool usedNV12 = false;
    try {
        // Clear any sticky CUDA error from transient graph-capture failures
        cudaError_t priorErr = cudaGetLastError();
        if (priorErr != cudaSuccess) {
            this->_logger.LogWarn(
                "ANSSCRFDFD::Detect",
                std::string("Cleared prior CUDA error before SCRFD preprocessing: ")
                + cudaGetErrorString(priorErr),
                __FILE__, __LINE__);
        }
        // Try NV12 fast path first (fused NV12->RGB + center-padded letterbox)
        const int inferenceGpu = m_trtEngine ? m_trtEngine->getPreferredDeviceIndex() : 0;
        auto nv12 = m_nv12Helper.tryNV12(input, inferenceGpu, net_w, net_h,
            NV12PreprocessHelper::scrfdCenterLetterboxLauncher(dw, dh),
            _logger, "SCRFD");
        if (nv12.succeeded) {
            inputs = {{ std::move(nv12.gpuRGB) }};
            usedNV12 = true;
        }
        else if (nv12.useBgrFullRes) {
            // BGR full-res path — preprocess the full-res image instead
            // (fall through to standard BGR path with nv12.bgrFullResImg)
            // For simplicity, use the standard BGR path below with the original input
        }
        if (!usedNV12) {
            // CPU center-padded letterbox + BGR->RGB, then upload small image
            cv::Mat srcImg;
            if (input.channels() == 1) {
                cv::cvtColor(input, srcImg, cv::COLOR_GRAY2BGR);
            } else if (input.channels() == 3) {
                srcImg = input;
            } else {
                this->_logger.LogError("ANSSCRFDFD::Detect", "Unsupported channel count", __FILE__, __LINE__);
                return {};
            }
            // CPU resize to unpadded size
            cv::Mat cpuResized;
            if (srcImg.rows != new_unpad_h || srcImg.cols != new_unpad_w) {
                cv::resize(srcImg, cpuResized, cv::Size(new_unpad_w, new_unpad_h), 0, 0, cv::INTER_LINEAR);
            } else {
                cpuResized = srcImg;
            }
            // CPU center-pad to net_w x net_h
            cv::Mat cpuPadded(net_h, net_w, CV_8UC3, cv::Scalar(0, 0, 0));
            cpuResized.copyTo(cpuPadded(cv::Rect(dw, dh, new_unpad_w, new_unpad_h)));
            // CPU BGR -> RGB
            cv::Mat cpuRGB;
            cv::cvtColor(cpuPadded, cpuRGB, cv::COLOR_BGR2RGB);
            // Upload small padded image to GPU
            cv::cuda::Stream stream;
            cv::cuda::GpuMat d_padded;
            d_padded.upload(cpuRGB, stream);
            stream.waitForCompletion();
            std::vector<cv::cuda::GpuMat> inputVec;
            inputVec.emplace_back(std::move(d_padded));
            inputs.emplace_back(std::move(inputVec));
        }
        m_nv12Helper.tickInference();
    }
    catch (const std::exception& e) {
        this->_logger.LogError(
            "ANSSCRFDFD::Detect",
            std::string("CUDA preprocessing failed: ") + e.what(),
            __FILE__, __LINE__
        );
        return {};
    }
    std::vector<std::vector<std::vector<float>>> featureVectors;
    try {
        if (!m_trtEngine->runInference(inputs, featureVectors)) {
            this->_logger.LogFatal("ANSSCRFDFD::Detect", "Inference failed", __FILE__, __LINE__);
            return {};
        }
    }
    catch (const std::exception& e) {
        this->_logger.LogFatal(
            "ANSSCRFDFD::Detect",
            std::string("runInference exception: ") + e.what(),
            __FILE__, __LINE__
        );
        return {};
    }
    // Phase 3: Postprocessing (brief lock — generate_bboxes_kps uses center_points)
    std::vector<Object> filteredFaceObjects;
    {
        std::lock_guard<std::recursive_mutex> lock(_mutex);
        try {
            std::vector<Object> proposedFaceObjects;
            this->generate_bboxes_kps(
                scale_params,
                proposedFaceObjects,
                featureVectors[0],
                _modelConfig.detectionScoreThreshold,
                imgHeight,
                imgWidth
            );
            // NMS with a hard cap of 400 retained detections.
            this->nms_bboxes_kps(
                proposedFaceObjects,
                filteredFaceObjects,
                _modelConfig.modelMNSThreshold,
                400
            );
        }
        catch (const std::exception& e) {
            this->_logger.LogError(
                "ANSSCRFDFD::Detect",
                std::string("Post-processing failed: ") + e.what(),
                __FILE__, __LINE__
            );
            return {};
        }
    }
    return filteredFaceObjects;
}
// Legacy full-frame inference path: optionally derives ROIs from motion
// detection (useDynamicImage), runs Detect() on each ROI, squares up each
// valid face box and attaches a 112px crop.
// Fixes:
//  - The face-validity check used the comma operator
//    (`if (isValidFace(poly, box), 27)`), making it unconditionally true;
//    the max-NME argument 27 is now passed to isValidFace as intended
//    (matching the other call sites).
//  - ROI clamping ignored the x/y offset, so an ROI extending past the image
//    edge would make `im(activeROI)` throw; width/height are now clamped
//    relative to the ROI origin.
//  - Collapsed the redundant `size() <= 0 || empty()` check.
std::vector<Object> ANSSCRFDFD::TensorRTInferene(const cv::Mat& inputImage, const std::string& camera_id, bool useDynamicImage) {
    std::lock_guard<std::recursive_mutex> lock(_mutex);
    std::vector<Object> output;
    if (!_licenseValid) {
        this->_logger.LogError("ANSSCRFDFD::TensorRTInferene", "Invalid license", __FILE__, __LINE__);
        return output;
    }
    if (!_isInitialized) {
        this->_logger.LogError("ANSSCRFDFD::TensorRTInferene", "Model is not initialized", __FILE__, __LINE__);
        return output;
    }
    try
    {
        // 0. Validate and (for small crops) replicate-pad the input
        if (inputImage.empty()) return output;
        if ((inputImage.cols < 10) || (inputImage.rows < 10)) return output;
        bool croppedFace = false; // true when the image is a tight face crop
        cv::Mat im = inputImage.clone();
        int orginalHeight = im.rows;
        int orginalWidth = im.cols;
        if ((inputImage.size[0] <= 300) || (inputImage.size[1] <= 300)) croppedFace = true;
        if (croppedFace) cv::copyMakeBorder(inputImage, im, 200, 200, 200, 200, cv::BORDER_REPLICATE);
        // 1. Build the list of ROIs to scan
        std::vector<cv::Rect> activeROIs;
        if (useDynamicImage) {
            // Merge fresh motion detections with previously retained ones;
            // too many fresh detections (>= 12) are treated as noise.
            std::vector<Object> movementResults = DetectMovement(im, camera_id);
            std::vector<Object> movementObjects;
            if ((!movementResults.empty()) && ((movementResults.size() < 12)))
            {
                movementObjects.insert(movementObjects.end(), movementResults.begin(), movementResults.end());
                if (!_movementObjects.empty()) movementObjects.insert(movementObjects.end(), _movementObjects.begin(), _movementObjects.end());
            }
            else {
                if (!_movementObjects.empty()) movementObjects.insert(movementObjects.end(), _movementObjects.begin(), _movementObjects.end());
            }
            activeROIs.clear();
            if (!movementObjects.empty()) {
                std::vector<cv::Rect> localActiveROIs = GenerateFixedROIs(movementObjects, _modelConfig.inpHeight, _modelConfig.inpWidth, im.cols, im.rows);
                activeROIs.insert(activeROIs.end(), localActiveROIs.begin(), localActiveROIs.end());
            }
            else {
                activeROIs.push_back(cv::Rect(0, 0, im.cols, im.rows)); // Use the original image
            }
            if (activeROIs.empty())
            {
                return output;
            }
            UpdateAndFilterDetectionObjects(_movementObjects, 80);
        }
        else {
            activeROIs.push_back(cv::Rect(0, 0, im.cols, im.rows)); // Use the original image
        }
#ifdef FACEDEBUG
        cv::Mat draw = im.clone();
        for (int i = 0; i < movementObjects.size(); i++) {
            cv::rectangle(draw, movementObjects[i].box, cv::Scalar(0, 255, 255), 2);
        }
        for (int i = 0; i < activeROIs.size(); i++) {
            cv::rectangle(draw, activeROIs[i], cv::Scalar(0, 0, 255), 2);
        }
#endif
        // 2. Detect faces inside each ROI and map results back to full-frame coords
        for (int j = 0; j < activeROIs.size(); j++) {
            cv::Rect activeROI = activeROIs[j];
            activeROI.x = std::max(0, activeROI.x);
            activeROI.y = std::max(0, activeROI.y);
            // Clamp relative to the ROI origin so im(activeROI) never exceeds
            // the image bounds (cv::Mat ROI would throw otherwise).
            activeROI.width = std::min(im.cols - activeROI.x, activeROI.width);
            activeROI.height = std::min(im.rows - activeROI.y, activeROI.height);
            cv::Mat frame = im(activeROI).clone();
            std::vector<Object> filteredFaceObjects = Detect(frame);
            for (int i = 0; i < filteredFaceObjects.size(); i++)
            {
                if (filteredFaceObjects[i].confidence > _modelConfig.detectionScoreThreshold) {
#ifdef FACEDEBUG
                    cv::Rect faceRect;
                    faceRect.x = filteredFaceObjects[i].box.x + activeROI.x;
                    faceRect.y = filteredFaceObjects[i].box.y + activeROI.y;
                    faceRect.width = filteredFaceObjects[i].box.width;
                    faceRect.height = filteredFaceObjects[i].box.height;
                    cv::rectangle(draw, faceRect, cv::Scalar(225, 255, 0), 2);
#endif
                    // Check if the face is valid (fixed: 27 is now an argument,
                    // previously it was sequenced with the comma operator and
                    // the check was a no-op).
                    if (isValidFace(filteredFaceObjects[i].polygon, filteredFaceObjects[i].box, 27))
                    {
                        Object result;
                        // 0. Face bounding box in full-frame coordinates
                        int x_min = filteredFaceObjects[i].box.x + activeROI.x;
                        int y_min = filteredFaceObjects[i].box.y + activeROI.y;
                        int x_max = filteredFaceObjects[i].box.width + filteredFaceObjects[i].box.x + activeROI.x;
                        int y_max = filteredFaceObjects[i].box.height + filteredFaceObjects[i].box.y + activeROI.y;
#ifdef FACEDEBUG
                        // draw landmarks
                        for (cv::Point2f point : filteredFaceObjects[i].polygon)
                        {
                            cv::circle(draw, cv::Point(point.x + activeROI.x, point.y + activeROI.y), 2, cv::Scalar(0, 255, 0), -1);
                        }
#endif
                        if (croppedFace)
                        {
                            // Undo the 200px replicate-padding offset
                            x_min = std::max(0, x_min - 200);
                            y_min = std::max(0, y_min - 200);
                            x_max = std::min(orginalWidth, x_max - 200);
                            y_max = std::min(orginalHeight, y_max - 200);
                        }
                        // 1. Centered coordinates and half-dimensions
                        int width_half = abs((x_max - x_min) / 2);
                        int height_half = abs((y_max - y_min) / 2);
                        int xc = x_min + width_half;
                        int yc = y_min + height_half;
                        int c = std::max(width_half, height_half);
                        // 2. Square box centred at (xc, yc), clamped to the image
                        int x1_new = std::max(0, xc - c);
                        int y1_new = std::max(0, yc - c);
                        int x2_new = std::min(orginalWidth, xc + c);
                        int y2_new = std::min(orginalHeight, yc + c);
                        // 3. Fill in the result object
                        result.classId = 0;
                        result.className = "Face";
                        result.confidence = filteredFaceObjects[i].confidence;
                        result.box.x = x1_new;
                        result.box.y = y1_new;
                        result.box.width = x2_new - x1_new;
                        result.box.height = y2_new - y1_new;
                        result.mask = GetCroppedFaceScale(inputImage, x1_new, y1_new, x2_new, y2_new, 112);
                        result.kps = filteredFaceObjects[i].kps; // landmarks as array of x,y,x,y...
                        result.cameraId = camera_id;
                        if (!result.mask.empty())
                        {
                            output.push_back(result);
                            if (useDynamicImage) {
                                // Retain the detection so motion ROIs keep covering it
                                result.extraInfo = "0";
                                // NOTE(review): this predicate ignores existingObj and
                                // re-checks the whole list each time, so any intersection
                                // matches the FIRST element — confirm intended semantics.
                                auto it = std::find_if(_movementObjects.begin(), _movementObjects.end(),
                                    [&](Object& existingObj) {
                                        return ContainsIntersectingObject(_movementObjects, result);
                                    });
                                if (it != _movementObjects.end()) {
                                    *it = result; // Replace existing object with the new one
                                }
                                else {
                                    // If not found, add the new object to the list
                                    _movementObjects.push_back(result);
                                }
                            }
                        }
                    }
                }
            }
            frame.release();
        }
        im.release();
#ifdef FACEDEBUG
        cv::imshow("Combined Detected Areas", draw);
        cv::waitKey(1);
        draw.release();
#endif
        return output;
    }
    catch (std::exception& e)
    {
        this->_logger.LogFatal("ANSSCRFDFD::TensorRTInferene", e.what(), __FILE__, __LINE__);
        return output;
    }
}
// Destructor: release all detector resources; never let an exception escape.
ANSSCRFDFD::~ANSSCRFDFD() {
    try {
        Destroy();
    }
    catch (std::exception& e) {
        this->_logger.LogFatal("ANSSCRFDFD::Destroy", e.what(), __FILE__, __LINE__);
    }
}
// Tear down detector state: clears flags and the model path, destroys the
// NV12 helper, and returns the TensorRT engine — either back to the shared
// pool (when pooled) or by dropping the locally owned reference.
bool ANSSCRFDFD::Destroy() {
    try {
        _isInitialized = false;
        _licenseValid = false;
        _modelFilePath.clear();
        m_nv12Helper.destroy();
        if (m_usingSharedPool) {
            // Hand the slot back to the pool before dropping our reference.
            EnginePoolManager<float>::instance().release(m_poolKey);
            m_trtEngine.reset();
            m_usingSharedPool = false;
        }
        else if (m_trtEngine) {
            m_trtEngine.reset();
        }
        return true;
    }
    catch (std::exception& e) {
        this->_logger.LogFatal("ANSSCRFDFD::Destroy", e.what(), __FILE__, __LINE__);
        return false;
    }
}
// SCRFDFD implementation (private)
// Letterbox resize: scale `mat` uniformly to fit inside target_width x
// target_height, centre it on a black canvas, and record the scale ratio and
// padding offsets in `scale_params` for later coordinate de-mapping.
void ANSSCRFDFD::resize_unscale(const cv::Mat& mat, cv::Mat& mat_rs,
                                int target_height, int target_width,
                                SCRFDScaleParams& scale_params)
{
    if (mat.empty()) return;

    const int srcH = mat.rows;
    const int srcW = mat.cols;

    // Start from a black canvas of the target size.
    mat_rs = cv::Mat(target_height, target_width, CV_8UC3, cv::Scalar(0, 0, 0));

    // Uniform scale (new / old) that fits the source inside the target.
    const float scale = std::min(
        static_cast<float>(target_width) / static_cast<float>(srcW),
        static_cast<float>(target_height) / static_cast<float>(srcH));

    // Scaled (unpadded) size — truncation matches the original floor behavior.
    const int unpadW = static_cast<int>(static_cast<float>(srcW) * scale);
    const int unpadH = static_cast<int>(static_cast<float>(srcH) * scale);

    // Symmetric padding offsets (both >= 0).
    const int offX = (target_width - unpadW) / 2;
    const int offY = (target_height - unpadH) / 2;

    // Resize and paste into the centre of the canvas.
    cv::Mat scaled;
    cv::resize(mat, scaled, cv::Size(unpadW, unpadH));
    scaled.copyTo(mat_rs(cv::Rect(offX, offY, unpadW, unpadH)));

    // Record the parameters needed to map detections back to source coords.
    scale_params.ratio = scale;
    scale_params.dw = offX;
    scale_params.dh = offY;
    scale_params.flag = true;
}
void ANSSCRFDFD::generate_points(const int target_height, const int target_width)
{
if (center_points_is_update) return;
// 8, 16, 32
for (auto stride : feat_stride_fpn)
{
unsigned int num_grid_w = target_width / stride;
unsigned int num_grid_h = target_height / stride;
// y
for (unsigned int i = 0; i < num_grid_h; ++i)
{
// x
for (unsigned int j = 0; j < num_grid_w; ++j)
{
// num_anchors, col major
for (unsigned int k = 0; k < num_anchors; ++k)
{
SCRFDPoint point;
point.cx = (float)j;
point.cy = (float)i;
point.stride = (float)stride;
center_points[stride].push_back(point);
}
}
}
}
center_points_is_update = true;
}
void ANSSCRFDFD::generate_bboxes_kps(const SCRFDScaleParams& scale_params,
	std::vector<Object>& bbox_kps_collection,
	std::vector<std::vector<float>>& output_tensors,
	float score_threshold,
	float img_height,
	float img_width)
{
	// Decodes the raw SCRFD output tensors into face detections (with
	// landmarks when use_kps is set). Expected tensor order:
	// score_8, score_16, score_32, bbox_8, bbox_16, bbox_32
	// [, kps_8, kps_16, kps_32].
	// Bind by reference: these tensors are large (e.g. [1,12800,4]) and the
	// previous by-value initialisation copied every one of them per frame.
	auto& score_8 = output_tensors.at(0);  // e.g [1,12800,1]
	auto& score_16 = output_tensors.at(1); // e.g [1,3200,1]
	auto& score_32 = output_tensors.at(2); // e.g [1,800,1]
	auto& bbox_8 = output_tensors.at(3);   // e.g [1,12800,4]
	auto& bbox_16 = output_tensors.at(4);  // e.g [1,3200,4]
	auto& bbox_32 = output_tensors.at(5);  // e.g [1,800,4]
	// Make sure the cached anchor-centre grid exists for the fixed input size.
	const float input_height = INPUT_H; // e.g 640
	const float input_width = INPUT_W;  // e.g 640
	this->generate_points(input_height, input_width);
	bbox_kps_collection.clear();
	if (use_kps)
	{
		auto& kps_8 = output_tensors.at(6);  // e.g [1,12800,10]
		auto& kps_16 = output_tensors.at(7); // e.g [1,3200,10]
		auto& kps_32 = output_tensors.at(8); // e.g [1,800,10]
		// Decode each FPN level with landmarks.
		this->generate_bboxes_kps_single_stride(scale_params, score_8, bbox_8, kps_8, 8, score_threshold,
			img_height, img_width, bbox_kps_collection);
		this->generate_bboxes_kps_single_stride(scale_params, score_16, bbox_16, kps_16, 16, score_threshold,
			img_height, img_width, bbox_kps_collection);
		this->generate_bboxes_kps_single_stride(scale_params, score_32, bbox_32, kps_32, 32, score_threshold,
			img_height, img_width, bbox_kps_collection);
	}
	else
	{
		// Decode each FPN level, boxes only.
		this->generate_bboxes_single_stride(scale_params, score_8, bbox_8, 8, score_threshold,
			img_height, img_width, bbox_kps_collection);
		this->generate_bboxes_single_stride(scale_params, score_16, bbox_16, 16, score_threshold,
			img_height, img_width, bbox_kps_collection);
		this->generate_bboxes_single_stride(scale_params, score_32, bbox_32, 32, score_threshold,
			img_height, img_width, bbox_kps_collection);
	}
}
void ANSSCRFDFD::generate_bboxes_single_stride(
const SCRFDScaleParams& scale_params, std::vector<float>& score_pred, std::vector<float>& bbox_pred,
unsigned int stride, float score_threshold, float img_height, float img_width,
std::vector<Object>& bbox_kps_collection)
{
unsigned int nms_pre_ = (stride / 8) * nms_pre; // 1 * 1000,2*1000,...
nms_pre_ = nms_pre_ >= nms_pre ? nms_pre_ : nms_pre;
const unsigned int num_points = score_pred.size();// stride_dims.at(1); // 12800
const float* score_ptr = score_pred.data(); // [1,12800,1]
const float* bbox_ptr = bbox_pred.data(); // [1,12800,4]
float ratio = scale_params.ratio;
int dw = scale_params.dw;
int dh = scale_params.dh;
unsigned int count = 0;
auto& stride_points = center_points[stride];
for (unsigned int i = 0; i < num_points; ++i)
{
const float cls_conf = score_ptr[i];
if (cls_conf < score_threshold) continue; // filter
auto& point = stride_points.at(i);
const float cx = point.cx; // cx
const float cy = point.cy; // cy
const float s = point.stride; // stride
// bbox
const float* offsets = bbox_ptr + i * 4;
float l = offsets[0]; // left
float t = offsets[1]; // top
float r = offsets[2]; // right
float b = offsets[3]; // bottom
Object box_kps;
float x1 = ((cx - l) * s - (float)dw) / ratio; // cx - l x1
float y1 = ((cy - t) * s - (float)dh) / ratio; // cy - t y1
float x2 = ((cx + r) * s - (float)dw) / ratio; // cx + r x2
float y2 = ((cy + b) * s - (float)dh) / ratio; // cy + b y2
box_kps.box.x = std::max(0.f, x1);
box_kps.box.y = std::max(0.f, y1);
box_kps.box.width = std::min(img_width - 1.f, x2 - x1);
box_kps.box.height = std::min(img_height - 1.f, y2 - y1);
box_kps.confidence = cls_conf;
box_kps.classId = 0;
box_kps.className = "face";
bbox_kps_collection.push_back(box_kps);
count += 1; // limit boxes for nms.
if (count > max_nms)
break;
}
if (bbox_kps_collection.size() > nms_pre_)
{
std::sort(
bbox_kps_collection.begin(), bbox_kps_collection.end(),
[](const Object& a, const Object& b)
{ return a.confidence > b.confidence; }
); // sort inplace
// trunc
bbox_kps_collection.resize(nms_pre_);
}
}
void ANSSCRFDFD::generate_bboxes_kps_single_stride(
	const SCRFDScaleParams& scale_params, std::vector<float>& score_pred, std::vector<float>& bbox_pred,
	std::vector<float>& kps_pred, unsigned int stride, float score_threshold, float img_height,
	float img_width, std::vector<Object>& bbox_kps_collection)
{
	// Decodes one FPN level WITH 5-point landmarks: converts per-anchor
	// distance offsets into image-space boxes and keypoints (undoing the
	// letterbox pad/scale) and appends detections above score_threshold
	// to bbox_kps_collection.
	// Per-level pre-NMS cap scales with the stride but never drops below nms_pre.
	unsigned int nms_pre_ = (stride / 8) * nms_pre; // 1 * 1000,2*1000,...
	nms_pre_ = nms_pre_ >= nms_pre ? nms_pre_ : nms_pre;
	const unsigned int num_points = score_pred.size(); // anchors at this stride, e.g. 12800
	const float* score_ptr = score_pred.data(); // [1,12800,1]
	const float* bbox_ptr = bbox_pred.data(); // [1,12800,4]
	const float* kps_ptr = kps_pred.data(); // [1,12800,10] = 5 (x,y) landmark pairs
	// Letterbox parameters recorded by resize_unscale.
	float ratio = scale_params.ratio;
	int dw = scale_params.dw;
	int dh = scale_params.dh;
	unsigned int count = 0;
	auto& stride_points = center_points[stride];
	for (unsigned int i = 0; i < num_points; ++i)
	{
		const float cls_conf = score_ptr[i];
		if (cls_conf < score_threshold) continue; // filter out low-confidence anchors
		auto& point = stride_points.at(i);
		const float cx = point.cx; // anchor grid x
		const float cy = point.cy; // anchor grid y
		const float s = point.stride; // stride in pixels
		// bbox offsets are distances (in stride units) from the anchor centre.
		const float* offsets = bbox_ptr + i * 4;
		float l = offsets[0]; // left
		float t = offsets[1]; // top
		float r = offsets[2]; // right
		float b = offsets[3]; // bottom
		Object box_kps;
		// Back-project to original image coordinates: scale by stride,
		// subtract the letterbox padding, divide by the resize ratio.
		float x1 = ((cx - l) * s - (float)dw) / ratio; // cx - l x1
		float y1 = ((cy - t) * s - (float)dh) / ratio; // cy - t y1
		float x2 = ((cx + r) * s - (float)dw) / ratio; // cx + r x2
		float y2 = ((cy + b) * s - (float)dh) / ratio; // cy + b y2
		box_kps.box.x = (int)std::max(0.f, x1);
		box_kps.box.y = (int)std::max(0.f, y1);
		box_kps.box.width = (int)std::min(img_width - 1.f, x2 - x1);
		box_kps.box.height = (int)std::min(img_height - 1.f, y2 - y1);
		box_kps.confidence = cls_conf;
		box_kps.classId = 0;
		box_kps.className = "face";
		// Landmarks: 5 (x,y) pairs, decoded the same way as the box corners
		// and clamped to the image bounds.
		const float* kps_offsets = kps_ptr + i * 10;
		for (unsigned int j = 0; j < 10; j += 2)
		{
			cv::Point2f kps;
			float kps_l = kps_offsets[j];
			float kps_t = kps_offsets[j + 1];
			float kps_x = ((cx + kps_l) * s - (float)dw) / ratio; // cx + l x
			float kps_y = ((cy + kps_t) * s - (float)dh) / ratio; // cy + t y
			kps.x = std::min(std::max(0.f, kps_x), img_width - 1.f);
			kps.y = std::min(std::max(0.f, kps_y), img_height - 1.f);
			box_kps.kps.push_back(kps.x);
			box_kps.kps.push_back(kps.y);
			box_kps.polygon.push_back(kps); // landmarks also exposed as polygon points
		}
		bbox_kps_collection.push_back(box_kps);
		// Hard cap on the number of candidates handed to NMS.
		count += 1;
		if (count > max_nms)
			break;
	}
	if (bbox_kps_collection.size() > nms_pre_)
	{
		// Keep only the highest-confidence candidates for NMS.
		std::sort(
			bbox_kps_collection.begin(), bbox_kps_collection.end(),
			[](const Object& a, const Object& b)
			{ return a.confidence > b.confidence; }
		); // sort inplace
		// trunc
		bbox_kps_collection.resize(nms_pre_);
	}
}
float ANSSCRFDFD::getIouOfObjects(const Object& a, const Object& b) {
// Retrieve the bounding boxes
const cv::Rect& boxA = a.box;
const cv::Rect& boxB = b.box;
// Compute the coordinates of the intersection rectangle
int inner_x1 = std::max(boxA.x, boxB.x);
int inner_y1 = std::max(boxA.y, boxB.y);
int inner_x2 = std::min(boxA.x + boxA.width, boxB.x + boxB.width);
int inner_y2 = std::min(boxA.y + boxA.height, boxB.y + boxB.height);
// Calculate width and height of the intersection
int inner_w = inner_x2 - inner_x1;
int inner_h = inner_y2 - inner_y1;
// If there's no overlap, return 0
if (inner_w <= 0 || inner_h <= 0) {
return 0.0f;
}
// Calculate the area of the intersection
float inner_area = static_cast<float>(inner_w * inner_h);
// Calculate the areas of the two boxes
float areaA = static_cast<float>(boxA.width * boxA.height);
float areaB = static_cast<float>(boxB.width * boxB.height);
// Calculate the union area
float union_area = areaA + areaB - inner_area;
// Avoid division by zero and return IoU
if (union_area <= 0.0f) {
return 0.0f;
}
return inner_area / union_area;
}
void ANSSCRFDFD::nms_bboxes_kps(std::vector<Object>& input,
	std::vector<Object>& output,
	float iou_threshold, unsigned int topk)
{
	// Greedy hard-NMS: sort candidates by confidence (in place), keep the
	// best box of each overlapping cluster, and stop after topk survivors.
	// The previous version copied every suppressed box into a scratch
	// vector per kept detection but only ever used the representative;
	// that buffer (and its allocations) is removed here.
	if (input.empty()) return;
	std::sort(
		input.begin(), input.end(),
		[](const Object& a, const Object& b)
		{ return a.confidence > b.confidence; }
	);
	const unsigned int box_num = static_cast<unsigned int>(input.size());
	std::vector<char> suppressed(box_num, 0);
	unsigned int kept = 0;
	for (unsigned int i = 0; i < box_num; ++i)
	{
		if (suppressed[i]) continue;
		suppressed[i] = 1;
		// Suppress every remaining box that overlaps this one too much.
		for (unsigned int j = i + 1; j < box_num; ++j)
		{
			if (suppressed[j]) continue;
			if (getIouOfObjects(input[i], input[j]) > iou_threshold)
				suppressed[j] = 1;
		}
		output.push_back(input[i]);
		// Keep at most topk detections.
		if (++kept >= topk)
			break;
	}
}
}