// ANSCORE/ANSODEngine/ANSOVFaceDetector.cpp
// OpenVINO / SCRFD based face detector implementation (741 lines, ~34 KiB).
// Last recorded revision: 2026-03-28 16:54:11 +11:00
#include "ANSOVFaceDetector.h"
#include "ANSGpuFrameRegistry.h"
#include "NV12PreprocessHelper.h" // tl_currentGpuFrame()

#include <algorithm>
#include <cmath>
#include <string>

#include <models/detection_model_ssd.h>
#include <models/input_data.h>
#include <models/results.h>
#include <pipelines/metadata.h>

#include "Utility.h"
//#define FNS_DEBUG
namespace ANSCENTER {
/// Initializes the SCRFD face detector from a zipped model package.
/// Forces face-detector model settings, sanitises thresholds, and
/// (re)creates the SCRFD network from "<modelFolder>/scrfd.onnx".
/// @param labelMap  [out] always set to "Face" (single-class detector).
/// @return true on success, false on any exception during setup.
bool ANSOVFD::Initialize(std::string licenseKey, ModelConfig modelConfig, const std::string& modelZipFilePath, const std::string& modelZipPassword, std::string& labelMap) {
    // Base-class initialisation extracts the package and fills _modelFolder.
    // NOTE(review): its result is deliberately ignored and the licence flag is
    // forced on below, which makes the licence check dead code — confirm this
    // is intentional before tightening it.
    ANSFDBase::Initialize(licenseKey, modelConfig, modelZipFilePath, modelZipPassword, labelMap);
    labelMap = "Face"; // this detector only reports one class
    _licenseValid = true;
    if (!_licenseValid) return false;
    try {
        _modelConfig = modelConfig;
        _modelConfig.modelType = ModelType::FACEDETECT;
        _modelConfig.detectionType = DetectionType::FACEDETECTOR;
        // SCRFD is exported with a fixed 640x640 input.
        _modelConfig.inpHeight = 640;
        _modelConfig.inpWidth = 640;
        // Guard against missing/implausible thresholds in the supplied config.
        if (_modelConfig.modelMNSThreshold < 0.2)
            _modelConfig.modelMNSThreshold = 0.5;
        if (_modelConfig.modelConfThreshold < 0.2)
            _modelConfig.modelConfThreshold = 0.5;
        if (_isInitialized) {
            _face_detector.reset(); // release a previously loaded detector
            _isInitialized = false;
        }
        std::string onnxModel = CreateFilePath(_modelFolder, "scrfd.onnx");
        this->_face_detector = std::make_unique<SCRFD>(onnxModel);
        _isInitialized = true;
        _movementObjects.clear();
        _retainDetectedFaces = 0;
        return true;
    }
    catch (const std::exception& e) {
        this->_logger.LogFatal("ANSOVFD::Initialize", e.what(), __FILE__, __LINE__);
        return false;
    }
}
/// Extracts the model package via the base class (which populates
/// _modelFolder) and marks the detector ready. The SCRFD network itself
/// is not constructed here.
/// @return true when the base extraction succeeded.
bool ANSOVFD::LoadModel(const std::string& modelZipFilePath, const std::string& modelZipPassword) {
    try {
        // Delegate extraction so _modelFolder gets populated.
        if (!ANSFDBase::LoadModel(modelZipFilePath, modelZipPassword)) {
            return false;
        }
        // Resolve the expected network location inside the extracted folder;
        // the detector instance is created elsewhere.
        const std::string onnxModel = CreateFilePath(_modelFolder, "scrfd.onnx");
        //this->_face_detector = std::make_unique<SCRFD>(onnxModel);
        _isInitialized = true;
        return _isInitialized;
    }
    catch (std::exception& e) {
        this->_logger.LogFatal("ANSOVFD::LoadModel", e.what(), __FILE__, __LINE__);
        return false;
    }
}
bool ANSOVFD::LoadModelFromFolder(std::string licenseKey, ModelConfig modelConfig, std::string modelName,std::string className, const std::string& modelFolder, std::string& labelMap) {
try {
bool result = ANSFDBase::LoadModelFromFolder(licenseKey, modelConfig,modelName, className, modelFolder, labelMap);
if (!result) return false;
std::string _modelName = modelName;
if (_modelName.empty()) {
_modelName = "scrfd";
}
std::string modelFullName = _modelName +".onnx";
// We need to get the modelfolder from here
_modelConfig = modelConfig;
_modelConfig.modelType = ModelType::FACEDETECT;
_modelConfig.detectionType = DetectionType::FACEDETECTOR;
_modelConfig.inpHeight = 640; // Only for Yolo model
_modelConfig.inpWidth = 640;
if (_modelConfig.modelMNSThreshold < 0.2)
_modelConfig.modelMNSThreshold = 0.5;
if (_modelConfig.modelConfThreshold < 0.2)
_modelConfig.modelConfThreshold = 0.5;
if (_isInitialized) {
_face_detector.reset(); // Releases previously allocated memory for face detection
_isInitialized = false; // Reset initialization flag
}
this->_face_detector = std::make_unique<SCRFD>(modelFullName);
_isInitialized = true;
return _isInitialized;
}
catch (std::exception& e) {
this->_logger.LogFatal("ANSOVFD::LoadModel", e.what(), __FILE__, __LINE__);
return false;
}
}
/// Optimizes the loaded model (optionally to FP16) via the base class.
/// @param optimizedModelFolder  [out] cleared when no source model exists.
/// @return false when the model file is missing; otherwise the base result.
bool ANSOVFD::OptimizeModel(bool fp16, std::string& optimizedModelFolder) {
    // Without a source model on disk there is nothing to optimise.
    if (!FileExist(_modelFilePath)) {
        optimizedModelFolder = "";
        return false;
    }
    return ANSFDBase::OptimizeModel(fp16, optimizedModelFolder);
}
/// Runs face detection on one frame; when facelivenessCheck is set the raw
/// detections are additionally filtered/annotated by the liveness validator.
std::vector<Object> ANSOVFD::RunInference(const cv::Mat& input, const std::string& camera_id, bool useDynamicImage, bool validateFace, bool facelivenessCheck) {
    // Always run the plain detector first.
    std::vector<Object> faces = Inference(input, camera_id, useDynamicImage, validateFace);
    if (!facelivenessCheck) {
        return faces;
    }
    // Optional post-pass: liveness validation over the raw detections.
    return ValidateLivenessFaces(input, faces, camera_id);
}
/// Convenience overload: identical behaviour with the fixed camera id
/// "CustomCam".
std::vector<Object> ANSOVFD::RunInference(const cv::Mat& input, bool useDynamicImage, bool validateFace, bool facelivenessCheck) {
    return RunInference(input, "CustomCam", useDynamicImage, validateFace, facelivenessCheck);
}
// Core SCRFD inference pass for one frame.
// - input:          BGR (3ch) or grayscale (1ch) image, at least 5x5 pixels;
//                   other channel counts are rejected.
// - camera_id:      tag copied onto every returned Object.
// - useDynamicImage: when false the frame is treated as an already-cropped
//                   face and is padded (and possibly downscaled) before
//                   detection — the "croppedFace" path below.
// - validateFace:   when true, detections failing isValidFace() are dropped.
// Returns detected faces (box, 5-point landmarks, aligned face "mask" image),
// or an empty vector on any validation/initialisation failure or exception.
// Thread-safety: the entire pass is serialised on _mtx.
std::vector<Object> ANSOVFD::Inference(const cv::Mat& input,
const std::string& camera_id,
bool useDynamicImage,
bool validateFace)
{
std::lock_guard<std::mutex> guard(_mtx);
try {
// Step 1: Validate license and initialization
if (!_licenseValid) {
_logger.LogError("ANSOVFD::RunInference", "Invalid license",
__FILE__, __LINE__);
return {};
}
if (!_isInitialized || !_face_detector) {
_logger.LogError("ANSOVFD::RunInference",
"Model or face detector not initialized",
__FILE__, __LINE__);
return {};
}
// Step 2: Validate input image
if (input.empty() || input.cols < 5 || input.rows < 5) {
_logger.LogError("ANSOVFD::RunInference", "Invalid input image",
__FILE__, __LINE__);
return {};
}
// Step 3: Convert to BGR if needed
cv::Mat processedImage;
if (input.channels() == 1) {
cv::cvtColor(input, processedImage, cv::COLOR_GRAY2BGR);
}
else if (input.channels() == 3) {
processedImage = input; // Shallow copy - safe
}
else {
_logger.LogError("ANSOVFD::RunInference",
"Unsupported number of image channels",
__FILE__, __LINE__);
return {};
}
// Step 4: Prepare image for detection
const bool croppedFace = !useDynamicImage;
const int originalWidth = input.cols;
const int originalHeight = input.rows;
cv::Mat im;
int offsetX = 0, offsetY = 0; // Track padding offset
if (croppedFace) {
// Add padding so the detector has context around a tight face crop.
cv::copyMakeBorder(processedImage, im, 200, 200, 200, 200,
cv::BORDER_REPLICATE);
offsetX = 200;
offsetY = 200;
// Resize if too large
if (im.rows > 1280) {
const float aspectRatio = static_cast<float>(im.cols) / im.rows;
const int newHeight = 1280;
const int newWidth = static_cast<int>(newHeight * aspectRatio);
// Update offsets for resize
// NOTE(review): only the padding offsets are rescaled here. The boxes
// and landmarks detected on the resized image are later shifted by
// these offsets but never divided by `scale`, so when this resize
// triggers they may stay in resized coordinates — verify the
// cropped-face coordinate mapping.
const float scale = static_cast<float>(newHeight) / im.rows;
offsetX = static_cast<int>(offsetX * scale);
offsetY = static_cast<int>(offsetY * scale);
cv::resize(im, im, cv::Size(newWidth, newHeight));
}
}
else {
im = processedImage; // Shallow copy
}
// Step 5: Run face detection
std::vector<ANSCENTER::types::BoxfWithLandmarks> detectionResults;
_face_detector->detect(im, detectionResults);
// Step 6: Process detections
std::vector<Object> output;
output.reserve(detectionResults.size()); // Pre-allocate
// Try to get full-res image from registry for higher quality face alignment.
// On Intel-only systems (no NVIDIA GPU), NV12 may be in CPU memory from
// D3D11VA decode. Convert to BGR once and use for all face alignments.
cv::Mat fullResImg;
float landmarkScaleX = 1.f, landmarkScaleY = 1.f;
if (!croppedFace && !detectionResults.empty()) {
// Thread-local frame registered by the preprocessing stage; may be null.
auto* gpuData = tl_currentGpuFrame();
if (gpuData && gpuData->width > im.cols && gpuData->height > im.rows) {
// pixelFormat 1000: presumably an internal "packed BGR" marker — confirm.
if (gpuData->pixelFormat == 1000 && gpuData->yPlane) {
// BGR full-res from ANSVideoPlayer/ANSFilePlayer
fullResImg = cv::Mat(gpuData->height, gpuData->width, CV_8UC3,
gpuData->yPlane, gpuData->yLinesize).clone();
landmarkScaleX = static_cast<float>(gpuData->width) / im.cols;
landmarkScaleY = static_cast<float>(gpuData->height) / im.rows;
}
// pixelFormat 23: presumably FFmpeg AV_PIX_FMT_NV12 — confirm.
else if (gpuData->pixelFormat == 23 && !gpuData->isCudaDevicePtr &&
gpuData->yPlane && gpuData->uvPlane &&
(gpuData->width % 2) == 0 && (gpuData->height % 2) == 0) {
// CPU NV12 — convert to BGR for face alignment
try {
cv::Mat yPlane(gpuData->height, gpuData->width, CV_8UC1,
gpuData->yPlane, gpuData->yLinesize);
cv::Mat uvPlane(gpuData->height / 2, gpuData->width / 2, CV_8UC2,
gpuData->uvPlane, gpuData->uvLinesize);
cv::cvtColorTwoPlane(yPlane, uvPlane, fullResImg, cv::COLOR_YUV2BGR_NV12);
landmarkScaleX = static_cast<float>(gpuData->width) / im.cols;
landmarkScaleY = static_cast<float>(gpuData->height) / im.rows;
} catch (...) { /* NV12 conversion failed — skip full-res */ }
}
else if (gpuData->pixelFormat == 23 && gpuData->isCudaDevicePtr &&
gpuData->cpuYPlane && gpuData->cpuUvPlane &&
(gpuData->width % 2) == 0 && (gpuData->height % 2) == 0) {
// CUDA NV12 with CPU fallback planes
try {
cv::Mat yPlane(gpuData->height, gpuData->width, CV_8UC1,
gpuData->cpuYPlane, gpuData->cpuYLinesize);
cv::Mat uvPlane(gpuData->height / 2, gpuData->width / 2, CV_8UC2,
gpuData->cpuUvPlane, gpuData->cpuUvLinesize);
cv::cvtColorTwoPlane(yPlane, uvPlane, fullResImg, cv::COLOR_YUV2BGR_NV12);
landmarkScaleX = static_cast<float>(gpuData->width) / im.cols;
landmarkScaleY = static_cast<float>(gpuData->height) / im.rows;
} catch (...) { /* NV12 conversion failed — skip full-res */ }
}
}
}
for (const auto& obj : detectionResults) {
// Check confidence threshold
const float confidence = obj.box.score;
if (confidence < _modelConfig.detectionScoreThreshold) {
continue;
}
// Validate face landmarks (27 is a geometry tolerance passed through to
// isValidFace — semantics defined there).
if (validateFace && !isValidFace(obj.landmarks.points,obj.box.rect(), 27)) {
continue;
}
Object result;
result.classId = 0;
result.className = "Face";
result.confidence = confidence;
result.cameraId = camera_id;
// Get bounding box
cv::Rect bbox = obj.box.rect();
// Process face for mask — use full-res image when available
// for higher quality 112x112 face alignment
cv::Mat alignImage;
std::vector<cv::Point2f> face_landmark_5 = obj.landmarks.points;
if (!fullResImg.empty()) {
// Scale landmarks from display-res to full-res
for (auto& pt : face_landmark_5) {
pt.x *= landmarkScaleX;
pt.y *= landmarkScaleY;
}
alignImage = fullResImg;
} else {
alignImage = im;
}
// NOTE(review): alignImage is passed as both source and destination —
// confirm Preprocess() supports in-place/aliased use.
result.mask = Preprocess(alignImage, face_landmark_5, alignImage);
if (result.mask.empty()) {
_logger.LogError("ANSOVFD::RunInference", "Cannot get mask image",
__FILE__, __LINE__);
continue;
}
// Adjust coordinates back to original image space
if (croppedFace) {
// Remove padding offset
bbox.x = std::max(0, bbox.x - offsetX);
bbox.y = std::max(0, bbox.y - offsetY);
// Clamp to original image bounds
// NOTE(review): clamping x/y to width/height (not width-1/height-1)
// can leave a zero-area box exactly on the border.
bbox.x = std::min(bbox.x, originalWidth);
bbox.y = std::min(bbox.y, originalHeight);
bbox.width = std::min(bbox.width, originalWidth - bbox.x);
bbox.height = std::min(bbox.height, originalHeight - bbox.y);
result.box = bbox;
// Adjust landmarks (kps is a flat [x0,y0,x1,y1,...] list)
result.kps.reserve(obj.landmarks.points.size() * 2);
for (const auto& pt : obj.landmarks.points) {
result.kps.push_back(std::max(0.0f, pt.x - offsetX));
result.kps.push_back(std::max(0.0f, pt.y - offsetY));
}
}
else {
result.box = bbox;
// Copy landmarks directly
result.kps.reserve(obj.landmarks.points.size() * 2);
for (const auto& pt : obj.landmarks.points) {
result.kps.push_back(pt.x);
result.kps.push_back(pt.y);
}
}
output.push_back(std::move(result));
}
return output;
}
catch (const std::exception& e) {
_logger.LogFatal("ANSOVFD::RunInference",
"Exception: " + std::string(e.what()),
__FILE__, __LINE__);
return {};
}
catch (...) {
_logger.LogFatal("ANSOVFD::RunInference", "Unknown fatal exception",
__FILE__, __LINE__);
return {};
}
}
/// Destructor: releases all detector resources. Destruction must never
/// throw, so any cleanup failure is routed to the logger instead.
ANSOVFD::~ANSOVFD() {
    try {
        Destroy();
    }
    catch (std::exception& e) {
        _logger.LogFatal("ANSOVFD::Destroy", e.what(), __FILE__, __LINE__);
    }
}
bool ANSOVFD::Destroy() {
try {
#ifdef USEOVFACEDETECTOR
_faceLandmark.reset(); // Releases previously allocated memory for face landmark
#endif
_face_detector.reset(); // Releases previously allocated memory for face detection
_isInitialized = false; // Reset initialization flag
if (FolderExist(_modelFolder)) {
if (!DeleteFolder(_modelFolder)) {
this->_logger.LogError("ANSOVFD::Destroy", "Failed to release ANSOVFD Models", __FILE__, __LINE__);
}
}
return true;
}
catch (std::exception& e) {
this->_logger.LogFatal("ANSOVFD::Destroy", e.what(), __FILE__, __LINE__);
return false;
}
}
#ifdef USEOVFACEDETECTOR
// Reads and validates the face-detection IR model, records its input size and
// output tensor names, and attaches a u8/NHWC -> NCHW preprocessing graph.
// Two output layouts are accepted:
//   (a) a single 4-D DetectionOutput [1,1,ndetections,7]  -> `output` set,
//       `labels_output` left empty;
//   (b) a box tensor [ndetections,5] plus an i32 labels tensor
//       -> both `output` and `labels_output` set.
// Validation failures are logged as fatal but the (possibly unusable) model
// is still returned.
std::shared_ptr<ov::Model> ANSOVFD::read(const ov::Core& core, std::string pathToModel) {
std::shared_ptr<ov::Model> model = core.read_model(pathToModel);
// NOTE(review): for an NCHW input, shape index 2 is height and index 3 is
// width — these two assignments look swapped. The PrePostProcessor below
// declares the *tensor* as NHWC, but get_shape() here reflects the original
// model layout; confirm which layout the deployed IR actually uses.
model_input_height = model->input().get_shape()[3];
model_input_width = model->input().get_shape()[2];
ov::OutputVector outputs = model->outputs();
if (outputs.size() == 1) {
// Layout (a): classic SSD DetectionOutput.
output = outputs.front().get_any_name();
const auto& outShape = outputs.front().get_shape();
if (outShape.size() != 4) {
this->_logger.LogFatal("ANSOVFD::read.", "Face Detection model output should have 4 dimentions", __FILE__, __LINE__);
return model;
}
// Last dimension must be the 7-float detection record
// [image_id, label, conf, x1, y1, x2, y2].
objectSize = outputs.front().get_shape()[3];
if (objectSize != 7) {
this->_logger.LogFatal("ANSOVFD::read. ", "Face Detection model output layer should have 7 as a last dimension", __FILE__, __LINE__);
return model;
}
if (outShape[2] != ndetections) {
this->_logger.LogFatal("ANSOVFD::read. ", "Face Detection model output must contain 200 detections", __FILE__, __LINE__);
return model;
}
}
else {
// Layout (b): separate boxes + labels outputs.
for (const auto& out : outputs) {
const auto& outShape = out.get_shape();
if (outShape.size() == 2 && outShape.back() == 5) {
output = out.get_any_name();
if (outShape[0] != ndetections) {
this->_logger.LogFatal("ANSOVFD::read. ", "Face Detection model output must contain 200 detections", __FILE__, __LINE__);
return model;
}
objectSize = outShape.back();
}
else if (outShape.size() == 1 && out.get_element_type() == ov::element::i32) {
labels_output = out.get_any_name();
}
}
if (output.empty() || labels_output.empty()) {
this->_logger.LogFatal("ANSOVFD::read. ", "Face Detection model must contain either single DetectionOutput", __FILE__, __LINE__);
return model;
}
}
// Declare the runtime input as u8 NHWC (matching cv::Mat) and let OpenVINO
// convert to the model's NCHW layout; force f32 detection output.
ov::preprocess::PrePostProcessor ppp(model);
ppp.input().tensor().
set_element_type(ov::element::u8).
set_layout("NHWC");
ppp.input().preprocess().convert_layout("NCHW");
ppp.output(output).tensor().set_element_type(ov::element::f32);
model = ppp.build();
ov::set_batch(model, 1);
// Cache the final input shape for sizing the inference tensor later.
inShape = model->input().get_shape();
return model;
}
// Initializes the OpenVINO face-detection pipeline from _modelFolder.
// Prefers a YOLO-style model ("ansylfd.xml") and falls back to the SSD model
// ("ansovfd.xml"); also loads the optional facial-landmark regression network
// ("landmarks.xml") when present. The target device comes from
// GetOpenVINODevice(); "NPU" is mapped to an AUTO:NPU,GPU latency setup.
// @return true once a detector model was compiled; false when no detector
//         model file exists on disk.
bool ANSOVFD::InitFaceDetector() {
if (_isInitialized) {
_faceLandmark.reset(); // Releases previously allocated memory for face landmark
_isInitialized = false; // Reset initialization flag
}
// Check if the Yolo model is available
std::string yoloModel = CreateFilePath(_modelFolder, "ansylfd.xml");
if (std::filesystem::exists(yoloModel)) {
_modelFilePath = yoloModel;
_useYoloFaceDetector = true;
}
else {
// Fall back to the SSD-style detector.
_useYoloFaceDetector = false;
std::string xmlfile = CreateFilePath(_modelFolder, "ansovfd.xml");
if (std::filesystem::exists(xmlfile)) {
_modelFilePath = xmlfile;
}
else {
this->_logger.LogError("ANSOVFD::Initialize. Face detector model file is not exist", _modelFilePath, __FILE__, __LINE__);
return false;
}
}
std::string deviceName = GetOpenVINODevice();
// Landmark model is optional: failure to find it only logs an error.
std::string landmarkModel = CreateFilePath(_modelFolder, "landmarks.xml");
if (std::filesystem::exists(landmarkModel)) {
_landmarkModelFilePath = landmarkModel;
this->_logger.LogDebug("ANSOVFD::Initialize. Loading landmarks weight", _landmarkModelFilePath, __FILE__, __LINE__);
ov::Core lmcore;
CnnConfig landmarks_config(_landmarkModelFilePath, "Facial Landmarks Regression");
if (deviceName == "NPU") {
// Latency hint on both candidate devices; AUTO arbitrates at runtime.
lmcore.set_property("NPU", ov::hint::performance_mode(ov::hint::PerformanceMode::LATENCY));
lmcore.set_property("GPU", ov::hint::performance_mode(ov::hint::PerformanceMode::LATENCY));
landmarks_config.m_deviceName = "AUTO:NPU,GPU";
}
else {
//lmcore.set_property(deviceName, ov::hint::performance_mode(ov::hint::PerformanceMode::LATENCY));
landmarks_config.m_deviceName = deviceName;
}
landmarks_config.m_max_batch_size = 1;
landmarks_config.m_core = lmcore;
this->_faceLandmark = std::make_unique<VectorCNN>(landmarks_config);
if (!_faceLandmark) {
this->_logger.LogFatal("ANSOVFD::Initialize", "Failed to initialize face recognizer model", __FILE__, __LINE__);
}
}
else {
this->_logger.LogError("ANSOVFD::LoadModel. Model landmarks.xml file is not exist", _landmarkModelFilePath, __FILE__, __LINE__);
}
// Compile whichever detector variant was found above.
if (_useYoloFaceDetector) InitialYoloModel(deviceName);
else InitialSSDModel(deviceName);
_isInitialized = true;
return _isInitialized;
}
/// Compiles the SSD-style face model for the requested device and prepares
/// the shared inference request and input tensor. Exceptions are logged,
/// not rethrown.
void ANSOVFD::InitialSSDModel(const std::string& deviceName) {
    try {
        // Default bounding-box post-processing coefficients (no enlargement).
        bb_enlarge_coefficient = 1.0;
        bb_dx_coefficient = 1;
        bb_dy_coefficient = 1;
        ov::Core core;
        std::string target = deviceName;
        if (deviceName == "NPU") {
            // Latency hint on both candidates; AUTO arbitrates between them.
            core.set_property("NPU", ov::hint::performance_mode(ov::hint::PerformanceMode::LATENCY));
            core.set_property("GPU", ov::hint::performance_mode(ov::hint::PerformanceMode::LATENCY));
            target = "AUTO:NPU,GPU";
        }
        ov::CompiledModel cml = core.compile_model(read(core, _modelFilePath), target);
        request = cml.create_infer_request();
        inTensor = request.get_input_tensor();
        inTensor.set_shape(inShape);
    }
    catch (const std::exception& e) {
        this->_logger.LogFatal("ANSOVFD::InitialSSDModel", e.what(), __FILE__, __LINE__);
    }
}
/// Compiles the YOLO-style face model for the requested device and creates
/// the shared inference request. Exceptions are logged, not rethrown.
void ANSOVFD::InitialYoloModel(const std::string& deviceName) {
    try {
        ov::Core core;
        std::string target = deviceName;
        if (deviceName == "NPU") {
            // Latency hint on both candidates; AUTO arbitrates between them.
            core.set_property("NPU", ov::hint::performance_mode(ov::hint::PerformanceMode::LATENCY));
            core.set_property("GPU", ov::hint::performance_mode(ov::hint::PerformanceMode::LATENCY));
            target = "AUTO:NPU,GPU";
        }
        compiled_model_ = core.compile_model(_modelFilePath, target);
        request = compiled_model_.create_infer_request();
    }
    catch (const std::exception& e) {
        // BUGFIX: log under this class/method (was stale "OPENVINOOD::InitialModel").
        this->_logger.LogFatal("ANSOVFD::InitialYoloModel", e.what(), __FILE__, __LINE__);
    }
}
/// Runs the compiled SSD-style OpenVINO model on one frame and converts its
/// raw output into Object detections. Two mutually exclusive output layouts
/// are supported (detected in read()):
///   - labels_output non-empty: boxes tensor [N,5] + separate i32 labels;
///   - labels_output empty:     single DetectionOutput records
///                              [image_id, label, conf, x1, y1, x2, y2].
/// Boxes are squared, enlarged by the bb_* coefficients, and clamped to the
/// frame. Detections at or below detectionScoreThreshold are discarded.
std::vector<Object> ANSOVFD::runOVInference(const cv::Mat& frame, const std::string& cameraId) {
    std::vector<Object> detectionOutputs;
    if (frame.empty()) {
        this->_logger.LogError("ANSOVFD::runOVInference", "Input image is empty", __FILE__, __LINE__);
        return detectionOutputs;
    }
    if ((frame.cols < 10) || (frame.rows < 10)) return detectionOutputs;
    try {
        width = static_cast<float>(frame.cols);
        height = static_cast<float>(frame.rows);
        resize2tensor(frame, inTensor);
        request.infer();
        float* detections = request.get_tensor(output).data<float>();
        // Square-up, enlarge, and clamp a raw face box; shared by both
        // output-layout branches below.
        auto refineBox = [&](Object& r) {
            const int bb_width = r.box.width;
            const int bb_height = r.box.height;
            const int bb_center_x = r.box.x + bb_width / 2;
            const int bb_center_y = r.box.y + bb_height / 2;
            const int max_of_sizes = std::max(bb_width, bb_height);
            const int bb_new_width = static_cast<int>(bb_enlarge_coefficient * max_of_sizes);
            const int bb_new_height = static_cast<int>(bb_enlarge_coefficient * max_of_sizes);
            r.box.x = bb_center_x - static_cast<int>(std::floor(bb_dx_coefficient * bb_new_width / 2));
            r.box.y = bb_center_y - static_cast<int>(std::floor(bb_dy_coefficient * bb_new_height / 2));
            r.box.width = bb_new_width;
            r.box.height = bb_new_height;
            r.box.y = std::max(0, r.box.y);
            r.box.x = std::max(0, r.box.x);
            r.box.width = std::min(r.box.width, frame.cols - r.box.x);
            r.box.height = std::min(r.box.height, frame.rows - r.box.y);
        };
        if (!labels_output.empty()) {
            // Layout (b): boxes in `output` ([N,5]), class ids in `labels_output`.
            const int32_t* labels = request.get_tensor(labels_output).data<int32_t>();
            for (size_t i = 0; i < ndetections; i++) {
                Object r;
                // BUGFIX: the int32 label used to be assigned directly to the
                // string className, silently narrowing it to a single char.
                r.className = std::to_string(labels[i]);
                r.confidence = detections[i * objectSize + 4];
                if (r.confidence <= _modelConfig.detectionScoreThreshold) {
                    continue;
                }
                // Box is in model-input coordinates; rescale to the frame.
                r.box.x = static_cast<int>(detections[i * objectSize] / model_input_width * width);
                r.box.y = static_cast<int>(detections[i * objectSize + 1] / model_input_height * height);
                r.box.width = static_cast<int>(detections[i * objectSize + 2] / model_input_width * width - r.box.x);
                r.box.height = static_cast<int>(detections[i * objectSize + 3] / model_input_height * height - r.box.y);
                refineBox(r);
                r.cameraId = cameraId;
                detectionOutputs.push_back(r);
            }
        }
        else {
            // Layout (a): classic DetectionOutput with normalised coordinates.
            // BUGFIX: this loop used to run even when the two-output layout
            // above had already been parsed, misreading the 5-float records
            // and emitting garbage detections.
            for (size_t i = 0; i < ndetections; i++) {
                float image_id = detections[i * objectSize];
                if (image_id < 0) {
                    break; // negative image_id marks the end of valid records
                }
                Object r;
                // BUGFIX: same char-narrowing issue as the branch above.
                r.className = std::to_string(static_cast<int>(detections[i * objectSize + 1]));
                r.confidence = detections[i * objectSize + 2];
                if (r.confidence <= _modelConfig.detectionScoreThreshold) {
                    continue;
                }
                r.box.x = static_cast<int>(detections[i * objectSize + 3] * width);
                r.box.y = static_cast<int>(detections[i * objectSize + 4] * height);
                r.box.width = static_cast<int>(detections[i * objectSize + 5] * width - r.box.x);
                r.box.height = static_cast<int>(detections[i * objectSize + 6] * height - r.box.y);
                refineBox(r);
                r.cameraId = cameraId;
                detectionOutputs.push_back(r);
            }
        }
        return detectionOutputs;
    }
    catch (std::exception& e) {
        this->_logger.LogFatal("ANSOVFD::runOVInference", e.what(), __FILE__, __LINE__);
        return detectionOutputs;
    }
}
/// Letterboxes the image into a square canvas (top-left anchored, zero
/// padded) so the aspect ratio is preserved for the fixed-size model input.
/// Returns the source unchanged if an exception occurs.
cv::Mat ANSOVFD::PreProcessing(const cv::Mat& source) {
    try {
        const int cols = source.cols;
        const int rows = source.rows;
        const int side = std::max(cols, rows);
        cv::Mat squared = cv::Mat::zeros(side, side, CV_8UC3);
        if (cols > 0 && rows > 0) {
            source.copyTo(squared(cv::Rect(0, 0, cols, rows)));
        }
        return squared;
    }
    catch (std::exception& e) {
        this->_logger.LogFatal("ANSOVFD::PreProcessing", e.what(), __FILE__, __LINE__);
        return source;
    }
}
/// Runs the YOLOv8-style compiled model on one frame: letterbox to square,
/// build a normalised 640x640 blob, infer, decode [anchors, 4+classes]
/// output, apply NMS, and return the surviving faces.
/// @return detections with classId, className "Face", confidence, box, and
///         camera id; empty on invalid input or exception.
std::vector<Object> ANSOVFD::runYoloInference(const cv::Mat& input, const std::string& cameraId) {
    std::vector<Object> outputs;
    try {
        if (input.empty()) {
            this->_logger.LogError("ANSOVFD::RunInference", "Input image is empty", __FILE__, __LINE__);
            return outputs;
        }
        if ((input.cols < 10) || (input.rows < 10)) return outputs;
        // Step 0: letterbox to a square; no clone of the input is needed since
        // PreProcessing() copies into a fresh canvas.
        cv::Mat letterbox_img = PreProcessing(input);
        // Scale factor from 640x640 model space back to letterboxed space.
        float scale = letterbox_img.size[0] / 640.0;
        cv::Mat blob = cv::dnn::blobFromImage(letterbox_img, 1.0 / 255.0, cv::Size(640, 640), cv::Scalar(), true);
        // Step 1: wrap the blob in a tensor (zero-copy) and feed the network.
        auto input_port = compiled_model_.input();
        ov::Tensor input_tensor(input_port.get_element_type(), input_port.get_shape(), blob.ptr(0));
        request.set_input_tensor(input_tensor);
        request.infer();
        // Step 2: output is [1, 4+classes, anchors] (e.g. [1,84,8400]);
        // transpose to [anchors, 4+classes] for row-wise decoding.
        auto output = request.get_output_tensor(0);
        auto output_shape = output.get_shape();
        float* data = output.data<float>();
        cv::Mat output_buffer(output_shape[1], output_shape[2], CV_32F, data);
        cv::transpose(output_buffer, output_buffer);
        std::vector<int> class_ids;
        std::vector<float> class_scores;
        std::vector<cv::Rect> boxes;
        // Step 3: per-anchor best class and box decode (cx,cy,w,h -> ltwh).
        for (int i = 0; i < output_buffer.rows; i++) {
            cv::Mat classes_scores = output_buffer.row(i).colRange(4, output_shape[1]);
            cv::Point class_id;
            double maxClassScore;
            cv::minMaxLoc(classes_scores, 0, &maxClassScore, 0, &class_id);
            if (maxClassScore > _modelConfig.detectionScoreThreshold) {
                class_scores.push_back(static_cast<float>(maxClassScore));
                class_ids.push_back(class_id.x);
                float cx = output_buffer.at<float>(i, 0);
                float cy = output_buffer.at<float>(i, 1);
                float w = output_buffer.at<float>(i, 2);
                float h = output_buffer.at<float>(i, 3);
                int left = std::max(0, int((cx - 0.5 * w) * scale));
                int top = std::max(0, int((cy - 0.5 * h) * scale));
                int boxWidth = std::min(int(w * scale), input.cols - left);
                int boxHeight = std::min(int(h * scale), input.rows - top);
                boxes.push_back(cv::Rect(left, top, boxWidth, boxHeight));
            }
        }
        // Step 4: non-maximum suppression, then package surviving boxes.
        std::vector<int> indices;
        cv::dnn::NMSBoxes(boxes, class_scores, _modelConfig.modelConfThreshold, _modelConfig.modelMNSThreshold, indices);
        for (size_t i = 0; i < indices.size(); i++) {
            const int id = indices[i];
            // Every retained score already exceeds detectionScoreThreshold
            // (filtered above), so no re-check is needed here.
            Object result;
            result.classId = class_ids[id];
            result.className = "Face";
            result.confidence = class_scores[id];
            result.box = boxes[id];
            // BUGFIX: the cameraId parameter was never copied onto the
            // results (unlike runOVInference); tag each detection now.
            result.cameraId = cameraId;
            outputs.push_back(result);
        }
        return outputs;
    }
    catch (std::exception& e) {
        this->_logger.LogFatal("OPENVINOOD::runYoloInference", e.what(), __FILE__, __LINE__);
        return outputs;
    }
}
#endif
}