Initial setup for CLion
This commit is contained in:
418
engines/OpenVINOEngine/src/faceapp/action_detector.cpp
Normal file
418
engines/OpenVINOEngine/src/faceapp/action_detector.cpp
Normal file
@@ -0,0 +1,418 @@
|
||||
// Copyright (C) 2018-2024 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include <algorithm>
|
||||
#include <limits>
|
||||
#include <numeric>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include <opencv2/imgproc/imgproc.hpp>
|
||||
|
||||
#include "openvino/openvino.hpp"
|
||||
|
||||
#include "action_detector.hpp"
|
||||
|
||||
#define SSD_LOCATION_RECORD_SIZE 4
|
||||
#define SSD_PRIORBOX_RECORD_SIZE 4
|
||||
#define NUM_DETECTION_CLASSES 2
|
||||
#define POSITIVE_DETECTION_IDX 1
|
||||
|
||||
// Comparator placing higher-scored (score, payload) pairs first.
// Suitable for std::sort / std::stable_sort on SSD candidate lists.
template <typename T>
bool SortScorePairDescend(const std::pair<float, T>& pair1,
                          const std::pair<float, T>& pair2) {
    return pair2.first < pair1.first;
}
|
||||
|
||||
// Loads the person/action detection SSD model, attaches u8/NHWC preprocessing,
// compiles it for the configured device and precomputes the per-head anchor
// bookkeeping (m_head_ranges, m_glob_anchor_map, m_head_step_sizes, ...) that
// GetDetections() uses to decode the raw SSD outputs.
ActionDetection::ActionDetection(const ActionDetectorConfig& config) :
    BaseCnnDetection(config.is_async), m_config(config) {
    m_detectorName = "action detector";

    // Frames are fed as u8 HWC images; conversion to the model's layout / f32
    // is folded into the compiled graph by the PrePostProcessor below.
    ov::Layout desiredLayout = {"NHWC"};

    slog::info << "Reading model: " << m_config.m_path_to_model << slog::endl;
    std::shared_ptr<ov::Model> model = m_config.m_core.read_model(m_config.m_path_to_model);
    logBasicModelInfo(model);

    ov::OutputVector inputInfo = model->inputs();
    if (inputInfo.size() != 1) {
        throw std::runtime_error("Action Detection network should have only one input");
    }

    m_input_name = model->input().get_any_name();

    // Fall back to NCHW when the IR carries no layout information.
    m_modelLayout = ov::layout::get_layout(model->input());
    if (m_modelLayout.empty())
        m_modelLayout = {"NCHW"};

    m_network_input_size.height = model->input().get_shape()[ov::layout::height_idx(m_modelLayout)];
    m_network_input_size.width = model->input().get_shape()[ov::layout::width_idx(m_modelLayout)];

    // Distinguish the two supported model generations by probing for the
    // "new" localization output name; all decoding below branches on this.
    m_new_model = false;

    ov::OutputVector outputs = model->outputs();
    auto cmp = [&](const ov::Output<ov::Node>& output) { return output.get_any_name() == m_config.new_loc_blob_name; };
    auto it = std::find_if(outputs.begin(), outputs.end(), cmp);
    if (it != outputs.end())
        m_new_model = true;

    ov::preprocess::PrePostProcessor ppp(model);

    ppp.input().tensor()
        .set_element_type(ov::element::u8)
        .set_layout(desiredLayout);
    ppp.input().preprocess()
        .convert_layout(m_modelLayout)
        .convert_element_type(ov::element::f32);
    ppp.input().model().set_layout(m_modelLayout);

    model = ppp.build();

    slog::info << "PrePostProcessor configuration:" << slog::endl;
    slog::info << ppp << slog::endl;

    ov::set_batch(model, m_config.m_max_batch_size);

    m_model = m_config.m_core.compile_model(model, m_config.m_deviceName);
    logCompiledModelInfo(m_model, m_config.m_path_to_model, m_config.m_deviceName, m_config.m_model_type);

    // Anchor bookkeeping: each detection head contributes a contiguous range
    // of candidate positions. m_head_ranges[i] is the first candidate index
    // of head i; m_head_ranges[num_heads] equals the total candidate count.
    const auto& head_anchors = m_new_model ? m_config.new_anchors : m_config.old_anchors;
    const int num_heads = head_anchors.size();

    m_head_ranges.resize(num_heads + 1);
    m_glob_anchor_map.resize(num_heads);
    m_head_step_sizes.resize(num_heads);

    m_head_ranges[0] = 0;
    int head_shift = 0;

    for (int head_id = 0; head_id < num_heads; ++head_id) {
        m_glob_anchor_map[head_id].resize(head_anchors[head_id]);
        // NOTE(review): deliberately assigned inside the anchor loop; if
        // head_anchors[head_id] were 0 the emplace_back below would read
        // indeterminate values — presumably every head has >= 1 anchor.
        // TODO confirm against the config defaults.
        int anchor_height, anchor_width;
        for (int anchor_id = 0; anchor_id < head_anchors[head_id]; ++anchor_id) {
            // Per-anchor action-confidence output names differ between the
            // two model generations.
            const auto glob_anchor_name = m_new_model ?
                m_config.new_action_conf_blob_name_prefix + std::to_string(head_id + 1) +
                m_config.new_action_conf_blob_name_suffix + std::to_string(anchor_id + 1) :
                m_config.old_action_conf_blob_name_prefix + std::to_string(anchor_id + 1);
            m_glob_anchor_names.push_back(glob_anchor_name);

            const auto anchor_dims = m_model.output(glob_anchor_name).get_shape();

            anchor_height = m_new_model ? anchor_dims[ov::layout::height_idx(m_modelLayout)] : anchor_dims[1];
            anchor_width = m_new_model ? anchor_dims[ov::layout::width_idx(m_modelLayout)] : anchor_dims[2];
            std::size_t num_action_classes = m_new_model ? anchor_dims[ov::layout::channels_idx(m_modelLayout)] : anchor_dims[3];
            if (num_action_classes != m_config.num_action_classes) {
                throw std::logic_error("The number of specified actions and the number of actions predicted by "
                                       "the Person/Action Detection Retail model must match");
            }

            const int anchor_size = anchor_height * anchor_width;
            head_shift += anchor_size;

            // New models store class scores plane-wise (stride = plane size);
            // old models store them interleaved (stride = 1).
            m_head_step_sizes[head_id] = m_new_model ? anchor_size : 1;
            m_glob_anchor_map[head_id][anchor_id] = m_num_glob_anchors++;
        }

        m_head_ranges[head_id + 1] = head_shift;
        m_head_blob_sizes.emplace_back(anchor_width, anchor_height);
    }

    m_num_candidates = head_shift;

    // With exactly two action classes the task degenerates to a binary one
    // and class 0 is skipped during the argmax in GetDetections().
    m_binary_task = m_config.num_action_classes == 2;
}
|
||||
|
||||
// Kicks off inference for the frame staged by enqueue(); does nothing when
// no frame is pending.
void ActionDetection::submitRequest() {
    if (m_enqueued_frames == 0)
        return;
    m_enqueued_frames = 0;

    BaseCnnDetection::submitRequest();
}
|
||||
|
||||
// Stages one frame for inference: lazily creates the infer request, records
// the frame size for later coordinate scaling, and resizes the image
// directly into the network's input tensor.
void ActionDetection::enqueue(const cv::Mat& frame) {
    if (!m_request)
        m_request = std::make_shared<ov::InferRequest>(m_model.create_infer_request());

    m_width = static_cast<float>(frame.cols);
    m_height = static_cast<float>(frame.rows);

    auto input_tensor = m_request->get_tensor(m_input_name);
    resize2tensor(frame, input_tensor);

    m_enqueued_frames = 1;
}
|
||||
|
||||
// Wraps the raw output tensors of the finished request in cv::Mat headers
// (zero-copy views over tensor memory) and decodes them into detections.
DetectedActions ActionDetection::fetchResults() {
    const auto loc_blob_name = m_new_model ? m_config.new_loc_blob_name : m_config.old_loc_blob_name;
    const auto det_conf_blob_name = m_new_model ? m_config.new_det_conf_blob_name : m_config.old_det_conf_blob_name;

    // Bbox regression output.
    ov::Shape loc_out_shape = m_model.output(loc_blob_name).get_shape();
    const cv::Mat loc_out(loc_out_shape[0],
                          loc_out_shape[1],
                          CV_32F,
                          m_request->get_tensor(loc_blob_name).data<float>());

    // Person/background detection-confidence output.
    ov::Shape conf_out_shape = m_model.output(det_conf_blob_name).get_shape();
    const cv::Mat main_conf_out(conf_out_shape[0],
                                conf_out_shape[1],
                                CV_32F,
                                m_request->get_tensor(det_conf_blob_name).data<float>());

    // Per-anchor action-confidence outputs, in m_glob_anchor_names order
    // (matches m_glob_anchor_map built in the constructor).
    std::vector<cv::Mat> add_conf_out;
    for (int glob_anchor_id = 0; glob_anchor_id < m_num_glob_anchors; ++glob_anchor_id) {
        const auto& blob_name = m_glob_anchor_names[glob_anchor_id];
        ov::Shape shape = m_request->get_tensor(blob_name).get_shape();
        add_conf_out.emplace_back(shape[ov::layout::height_idx(m_modelLayout)],
                                  shape[ov::layout::width_idx(m_modelLayout)],
                                  CV_32F,
                                  m_request->get_tensor(blob_name).data());
    }

    // Parse detections
    DetectedActions result;
    if (m_new_model) {
        // New models generate prior boxes on the fly (GeneratePriorBox), so
        // an empty priorbox Mat is passed through.
        const cv::Mat priorbox_out;
        result = GetDetections(loc_out, main_conf_out, priorbox_out, add_conf_out,
                               cv::Size(static_cast<int>(m_width), static_cast<int>(m_height)));
    } else {
        ov::Shape old_priorbox_shape = m_model.output(m_config.old_priorbox_blob_name).get_shape();
        const cv::Mat priorbox_out((int)old_priorbox_shape[0], (int)old_priorbox_shape[1],
                                   CV_32F,
                                   m_request->get_tensor(m_config.old_priorbox_blob_name).data());

        result = GetDetections(loc_out, main_conf_out, priorbox_out, add_conf_out,
                               cv::Size(static_cast<int>(m_width), static_cast<int>(m_height)));
    }

    return result;
}
|
||||
|
||||
// Reads one 4-float SSD record [xmin, ymin, xmax, ymax] into a NormalizedBBox.
// When 'inverse' is set, the record's x/y components are swapped
// ([ymin, xmin, ymax, xmax]).
inline ActionDetection::NormalizedBBox
ActionDetection::ParseBBoxRecord(const float* data, bool inverse) const {
    const int x_idx = inverse ? 1 : 0;
    const int y_idx = inverse ? 0 : 1;

    NormalizedBBox bbox;
    bbox.xmin = data[x_idx];
    bbox.ymin = data[y_idx];
    bbox.xmax = data[x_idx + 2];
    bbox.ymax = data[y_idx + 2];
    return bbox;
}
|
||||
|
||||
// Reconstructs the SSD prior box for candidate position 'pos' of a detection
// head: an anchor of size 'anchor' (pixels) centered on feature-map cell
// (pos % width, pos / width) scaled by 'step' pixels per cell, with
// coordinates normalized by the network input resolution.
inline ActionDetection::NormalizedBBox
ActionDetection::GeneratePriorBox(int pos, int step, const cv::Size2f& anchor,
                                  const cv::Size& blob_size) const {
    const int row = pos / blob_size.width;
    const int col = pos % blob_size.width;

    // Cell center in network-input pixel coordinates.
    const float center_x = (col + 0.5f) * static_cast<float>(step);
    const float center_y = (row + 0.5f) * static_cast<float>(step);

    NormalizedBBox bbox;
    bbox.xmin = (center_x - 0.5f * anchor.width) / static_cast<float>(m_network_input_size.width);
    bbox.ymin = (center_y - 0.5f * anchor.height) / static_cast<float>(m_network_input_size.height);
    bbox.xmax = (center_x + 0.5f * anchor.width) / static_cast<float>(m_network_input_size.width);
    bbox.ymax = (center_y + 0.5f * anchor.height) / static_cast<float>(m_network_input_size.height);

    return bbox;
}
|
||||
|
||||
// Decodes an SSD-encoded bbox: applies the variance-scaled center offsets and
// log-space size deltas in 'encoded_bbox' to the prior box, then converts the
// result from normalized [0,1] coordinates to a pixel-space cv::Rect sized
// against 'frame_size'.
cv::Rect ActionDetection::ConvertToRect(
        const NormalizedBBox& prior_bbox, const NormalizedBBox& variances,
        const NormalizedBBox& encoded_bbox, const cv::Size& frame_size) const {
    // Convert prior bbox to center/size form
    const float prior_width = prior_bbox.xmax - prior_bbox.xmin;
    const float prior_height = prior_bbox.ymax - prior_bbox.ymin;
    const float prior_center_x = 0.5f * (prior_bbox.xmin + prior_bbox.xmax);
    const float prior_center_y = 0.5f * (prior_bbox.ymin + prior_bbox.ymax);

    // Decode bbox coordinates from the SSD format: offsets are relative to
    // the prior size, sizes are exponentiated deltas.
    const float decoded_bbox_center_x =
        variances.xmin * encoded_bbox.xmin * prior_width + prior_center_x;
    const float decoded_bbox_center_y =
        variances.ymin * encoded_bbox.ymin * prior_height + prior_center_y;
    const float decoded_bbox_width =
        static_cast<float>(exp(static_cast<float>(variances.xmax * encoded_bbox.xmax))) * prior_width;
    const float decoded_bbox_height =
        static_cast<float>(exp(static_cast<float>(variances.ymax * encoded_bbox.ymax))) * prior_height;

    // Create decoded bbox corners
    const float decoded_bbox_xmin = decoded_bbox_center_x - 0.5f * decoded_bbox_width;
    const float decoded_bbox_ymin = decoded_bbox_center_y - 0.5f * decoded_bbox_height;
    const float decoded_bbox_xmax = decoded_bbox_center_x + 0.5f * decoded_bbox_width;
    const float decoded_bbox_ymax = decoded_bbox_center_y + 0.5f * decoded_bbox_height;

    // Convert decoded bbox to a pixel-space CV_Rect
    return cv::Rect(static_cast<int>(decoded_bbox_xmin * frame_size.width),
                    static_cast<int>(decoded_bbox_ymin * frame_size.height),
                    static_cast<int>((decoded_bbox_xmax - decoded_bbox_xmin) * frame_size.width),
                    static_cast<int>((decoded_bbox_ymax - decoded_bbox_ymin) * frame_size.height));
}
|
||||
|
||||
// Decodes raw SSD outputs into detections: for every candidate position it
// thresholds the person confidence, soft-maxes the per-anchor action scores
// to pick an action label, reconstructs (new model) or reads (old model) the
// prior box, decodes the regressed bbox, and finally prunes overlapping
// candidates with Soft-NMS.
DetectedActions ActionDetection::GetDetections(const cv::Mat& loc, const cv::Mat& main_conf,
                                               const cv::Mat& priorboxes, const std::vector<cv::Mat>& add_conf,
                                               const cv::Size& frame_size) const {
    // Prepare input data buffers (raw views over the cv::Mat wrappers).
    const float* loc_data = reinterpret_cast<float*>(loc.data);
    const float* det_conf_data = reinterpret_cast<float*>(main_conf.data);
    // New models carry no priorbox tensor — boxes are generated on the fly.
    const float* prior_data = m_new_model ? NULL : reinterpret_cast<float*>(priorboxes.data);

    const int total_num_anchors = add_conf.size();
    std::vector<float*> action_conf_data(total_num_anchors);
    for (int i = 0; i < total_num_anchors; ++i) {
        action_conf_data[i] = reinterpret_cast<float*>(add_conf[i].data);
    }

    // Variable to store all detection candidates
    DetectedActions valid_detections;

    // Iterate over all candidate bboxes
    for (int p = 0; p < m_num_candidates; ++p) {
        // Parse detection confidence from the SSD Detection output
        // (two classes per candidate; index 1 is the "person" class).
        const float detection_conf =
            det_conf_data[p * NUM_DETECTION_CLASSES + POSITIVE_DETECTION_IDX];

        // Skip low-confidence detections
        if (detection_conf < m_config.detection_confidence_threshold) {
            continue;
        }

        // Estimate the action head ID by locating p in the per-head ranges
        // precomputed in the constructor.
        int head_id = 0;
        while (p >= m_head_ranges[head_id + 1]) {
            ++head_id;
        }
        const int head_p = p - m_head_ranges[head_id];

        // Estimate the action anchor ID
        const int head_num_anchors =
            m_new_model ? m_config.new_anchors[head_id] : m_config.old_anchors[head_id];
        const int anchor_id = head_p % head_num_anchors;

        // Estimate the action label: softmax over the per-class scores of the
        // matching anchor blob. The index stride differs between model
        // generations (plane-wise vs interleaved storage).
        const int glob_anchor_id = m_glob_anchor_map[head_id][anchor_id];
        const float* anchor_conf_data = action_conf_data[glob_anchor_id];
        const int action_conf_idx_shift = m_new_model ?
            head_p / head_num_anchors :
            head_p / head_num_anchors * m_config.num_action_classes;
        const int action_conf_step = m_head_step_sizes[head_id];
        const float scale = m_new_model ? m_config.new_action_scale : m_config.old_action_scale;
        int action_label = -1;
        float action_max_exp_value = 0.f;
        float action_sum_exp_values = 0.f;
        for (size_t c = 0; c < m_config.num_action_classes; ++c) {
            float action_exp_value =
                std::exp(scale * anchor_conf_data[action_conf_idx_shift + c * action_conf_step]);
            action_sum_exp_values += action_exp_value;
            // In the binary task class 0 is excluded from the argmax (it still
            // contributes to the softmax denominator).
            if (action_exp_value > action_max_exp_value && ((c > 0 && m_binary_task) || !m_binary_task)) {
                action_max_exp_value = action_exp_value;
                action_label = c;
            }
        }

        if (std::fabs(action_sum_exp_values) < std::numeric_limits<float>::epsilon()) {
            throw std::logic_error("action_sum_exp_values can't be equal to 0");
        }
        // Estimate the action confidence (softmax probability of the argmax).
        float action_conf = action_max_exp_value / action_sum_exp_values;

        // Low-confidence actions fall back to the configured default action.
        if (action_label < 0 || action_conf < m_config.action_confidence_threshold) {
            action_label = m_config.default_action_id;
            action_conf = 0.f;
        }

        // Parse bbox from the SSD Detection output
        const auto priorbox = m_new_model ?
            GeneratePriorBox(head_p / head_num_anchors,
                             m_config.new_det_heads[head_id].step,
                             m_config.new_det_heads[head_id].anchors[anchor_id],
                             m_head_blob_sizes[head_id]) :
            ParseBBoxRecord(prior_data + p * SSD_PRIORBOX_RECORD_SIZE, false);
        // Old models append variance records after the priorbox records.
        const auto variance =
            ParseBBoxRecord(m_new_model ?
                            m_config.variances :
                            prior_data + (m_num_candidates + p) * SSD_PRIORBOX_RECORD_SIZE, false);
        // New models store the location record with x/y swapped.
        const auto encoded_bbox =
            ParseBBoxRecord(loc_data + p * SSD_LOCATION_RECORD_SIZE, m_new_model);

        const auto det_rect = ConvertToRect(priorbox, variance, encoded_bbox, frame_size);

        // Store detected action
        valid_detections.emplace_back(det_rect, action_label, detection_conf, action_conf);
    }

    // Merge most overlapped detections
    std::vector<int> out_det_indices;
    SoftNonMaxSuppression(valid_detections, m_config.nms_sigma, m_config.keep_top_k,
                          m_config.detection_confidence_threshold, &out_det_indices);

    DetectedActions detections;
    for (size_t i = 0; i < out_det_indices.size(); ++i) {
        detections.emplace_back(valid_detections[out_det_indices[i]]);
    }
    return detections;
}
|
||||
|
||||
// Greedy Soft-NMS with Gaussian rescoring: repeatedly selects the
// highest-scoring remaining bbox among the top_k candidates and decays the
// scores of the rest by exp(-IoU^2 / sigma). Indices of the kept detections
// (into 'detections') are appended to 'out_indices' in selection order;
// selection stops when the best remaining score drops below min_det_conf.
void ActionDetection::SoftNonMaxSuppression(const DetectedActions& detections,
        const float sigma, size_t top_k, const float min_det_conf,
        std::vector<int>* out_indices) const {
    // Store input bbox scores
    std::vector<float> scores(detections.size());
    for (size_t i = 0; i < detections.size(); ++i) {
        scores[i] = detections[i].detection_conf;
    }

    top_k = std::min(top_k, scores.size());

    // Select top-k score indices: after nth_element, score_idx[0..top_k) hold
    // the indices of the top_k scores (in unspecified order).
    std::vector<size_t> score_idx(scores.size());
    std::iota(score_idx.begin(), score_idx.end(), 0);
    std::nth_element(score_idx.begin(), score_idx.begin() + top_k, score_idx.end(),
                     [&scores](size_t i1, size_t i2) {return scores[i1] > scores[i2];});

    // Extract top-k score values; these are mutated during suppression while
    // 'scores' keeps the originals.
    std::vector<float> top_scores(top_k);
    for (size_t i = 0; i < top_scores.size(); ++i) {
        top_scores[i] = scores[score_idx[i]];
    }

    // Carry out Soft Non-Maximum Suppression algorithm
    out_indices->clear();
    for (size_t step = 0; step < top_scores.size(); ++step) {
        auto best_score_itr = std::max_element(top_scores.begin(), top_scores.end());
        if (*best_score_itr < min_det_conf) {
            break;
        }

        // Add current bbox to output list; zero its score so it is never
        // selected again.
        const size_t local_anchor_idx = std::distance(top_scores.begin(), best_score_itr);
        const int anchor_idx = score_idx[local_anchor_idx];
        out_indices->emplace_back(anchor_idx);
        *best_score_itr = 0.f;

        // Update top_scores of the rest bboxes
        for (size_t local_reference_idx = 0; local_reference_idx < top_scores.size(); ++local_reference_idx) {
            // Skip updating step for the low-confidence bbox
            if (top_scores[local_reference_idx] < min_det_conf) {
                continue;
            }

            // Calculate the Intersection over Union metric between two bboxes
            const size_t reference_idx = score_idx[local_reference_idx];
            const auto& rect1 = detections[anchor_idx].rect;
            const auto& rect2 = detections[reference_idx].rect;
            const auto intersection = rect1 & rect2;
            float overlap = 0.f;
            if (intersection.width > 0 && intersection.height > 0) {
                const int intersection_area = intersection.area();
                overlap = static_cast<float>(intersection_area) / static_cast<float>(rect1.area() + rect2.area() - intersection_area);
            }

            // Scale bbox score using the exponential (Gaussian) rule
            top_scores[local_reference_idx] *= std::exp(-overlap * overlap / sigma);
        }
    }
}
|
||||
72
engines/OpenVINOEngine/src/faceapp/align_transform.cpp
Normal file
72
engines/OpenVINOEngine/src/faceapp/align_transform.cpp
Normal file
@@ -0,0 +1,72 @@
|
||||
// Copyright (C) 2018-2024 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include <algorithm>
|
||||
#include <vector>
|
||||
#include <limits>
|
||||
|
||||
#include <opencv2/imgproc.hpp>
|
||||
|
||||
#include "face_reid.hpp"
|
||||
|
||||
// Canonical aligned-face template resolution (96x112) used to express the
// reference landmark positions.
static const float h = 112.;
static const float w = 96.;
// Reference 5-point landmark positions (presumably eyes, nose tip, mouth
// corners — TODO confirm ordering against the landmark model), normalized to
// the unit square [0,1]x[0,1] as interleaved (x, y) pairs.
static const float ref_landmarks_normalized[] = {
    30.2946f / w, 51.6963f / h, 65.5318f / w, 51.5014f / h, 48.0252f / w,
    71.7366f / h, 33.5493f / w, 92.3655f / h, 62.7299f / w, 92.2041f / h};
|
||||
|
||||
// Computes the 2x3 affine similarity transform mapping 'src' points onto
// 'dst' points (Umeyama-style): demean both sets, normalize by the scalar
// stddev, take the rotation from the SVD of the cross-covariance, then
// recombine scale and translation.
// NOTE: 'src' and 'dst' are modified in place (demeaned and rescaled).
cv::Mat GetTransform(cv::Mat* src, cv::Mat* dst) {
    // Demean source points (column-wise mean).
    cv::Mat col_mean_src;
    reduce(*src, col_mean_src, 0, cv::REDUCE_AVG);
    for (int i = 0; i < src->rows; i++) {
        src->row(i) -= col_mean_src;
    }

    // Demean destination points.
    cv::Mat col_mean_dst;
    reduce(*dst, col_mean_dst, 0, cv::REDUCE_AVG);
    for (int i = 0; i < dst->rows; i++) {
        dst->row(i) -= col_mean_dst;
    }

    // Normalize both sets by their stddev, clamped away from zero to avoid
    // division by ~0 for degenerate (collapsed) point sets.
    cv::Scalar mean, dev_src, dev_dst;
    cv::meanStdDev(*src, mean, dev_src);
    dev_src(0) =
        std::max(static_cast<double>(std::numeric_limits<float>::epsilon()), dev_src(0));
    *src /= dev_src(0);
    cv::meanStdDev(*dst, mean, dev_dst);
    dev_dst(0) =
        std::max(static_cast<double>(std::numeric_limits<float>::epsilon()), dev_dst(0));
    *dst /= dev_dst(0);

    // Optimal rotation between the normalized sets via SVD of src^T * dst.
    // (Local 'w' here shadows the file-scope template-width constant.)
    cv::Mat w, u, vt;
    cv::SVD::compute((*src).t() * (*dst), w, u, vt);
    cv::Mat r = (u * vt).t();
    cv::Mat m(2, 3, CV_32F);
    // Linear part: rotation times the dst/src scale ratio.
    m.colRange(0, 2) = r * (dev_dst(0) / dev_src(0));
    // Translation: aligns the (pre-demeaning) centroids.
    m.col(2) = (col_mean_dst.t() - m.colRange(0, 2) * col_mean_src.t());
    return m;
}
|
||||
|
||||
// Warps each face crop so that its detected 5-point landmarks line up with
// the canonical reference landmark positions (ref_landmarks_normalized).
// Both vectors are modified in place: landmarks are converted from
// normalized [0,1] to pixel coordinates, images are warped in-place.
void AlignFaces(std::vector<cv::Mat>* face_images, std::vector<cv::Mat>* landmarks_vec) {
    if (landmarks_vec->size() == 0) {
        return;
    }
    CV_Assert(face_images->size() == landmarks_vec->size());
    cv::Mat ref_landmarks = cv::Mat(5, 2, CV_32F);

    for (size_t j = 0; j < face_images->size(); j++) {
        for (int i = 0; i < ref_landmarks.rows; i++) {
            // Reference points scaled to this crop's resolution.
            ref_landmarks.at<float>(i, 0) =
                ref_landmarks_normalized[2 * i] * face_images->at(j).cols;
            ref_landmarks.at<float>(i, 1) =
                ref_landmarks_normalized[2 * i + 1] * face_images->at(j).rows;
            // Detected landmarks arrive normalized; convert to pixels.
            landmarks_vec->at(j).at<float>(i, 0) *= face_images->at(j).cols;
            landmarks_vec->at(j).at<float>(i, 1) *= face_images->at(j).rows;
        }
        cv::Mat m = GetTransform(&ref_landmarks, &landmarks_vec->at(j));
        // 'm' maps reference -> detected, hence WARP_INVERSE_MAP to bring the
        // detected face onto the reference geometry.
        cv::warpAffine(face_images->at(j), face_images->at(j), m,
                       face_images->at(j).size(), cv::WARP_INVERSE_MAP);
    }
}
|
||||
111
engines/OpenVINOEngine/src/faceapp/cnn.cpp
Normal file
111
engines/OpenVINOEngine/src/faceapp/cnn.cpp
Normal file
@@ -0,0 +1,111 @@
|
||||
// Copyright (C) 2018-2024 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include <algorithm>
|
||||
#include <map>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <opencv2/core/core.hpp>
|
||||
#include <opencv2/imgproc/imgproc.hpp>
|
||||
#include "openvino/openvino.hpp"
|
||||
#include "cnn.hpp"
|
||||
|
||||
// Stores the configuration only; the model is loaded later via Load().
CnnDLSDKBase::CnnDLSDKBase(const Config& config) : m_config(config) {}
|
||||
|
||||
// Reads the model, attaches u8/NHWC preprocessing, compiles it with a
// dynamic batch in [1, max_batch_size], and creates a single reusable infer
// request whose input tensor is pre-shaped to the maximum batch.
void CnnDLSDKBase::Load() {
    std::shared_ptr<ov::Model> model = m_config.m_core.read_model(m_config.m_path_to_model);
    ov::OutputVector inputs = model->inputs();
    // NOTE(review): multi-input models are silently rejected here (the object
    // is left unloaded) instead of throwing like the detector classes do —
    // confirm callers tolerate an unloaded instance.
    if (inputs.size() != 1) {
        return;
    }
    m_desired_layout = {"NHWC"};
    // Fall back to NCHW when the IR carries no layout information.
    ov::Layout model_layout = ov::layout::get_layout(model->input());
    if (model_layout.empty()) {
        model_layout = {"NCHW"};
    }
    // Remember all output tensor names for later retrieval in InferBatch().
    ov::OutputVector outputs = model->outputs();
    for (auto& item : outputs) {
        const std::string name = item.get_any_name();
        m_output_tensors_names.push_back(name);
    }
    // Fold u8->f32 and layout conversion into the compiled graph.
    ov::preprocess::PrePostProcessor ppp(model);
    ppp.input().tensor()
        .set_element_type(ov::element::u8)
        .set_layout(m_desired_layout);
    ppp.input().preprocess()
        .convert_layout(model_layout)
        .convert_element_type(ov::element::f32);
    ppp.input().model().set_layout(model_layout);

    model = ppp.build();
    // Allocate the input tensor for the maximum batch; InferBatch() slices it
    // per call to the actual number of images.
    m_modelShape = model->input().get_shape();
    m_modelShape[ov::layout::batch_idx(m_desired_layout)] = m_config.m_max_batch_size;
    ov::set_batch(model, {1, m_config.m_max_batch_size});
    m_compiled_model = m_config.m_core.compile_model(model, m_config.m_deviceName, { ov::hint::num_requests(1) });
    m_infer_request = m_compiled_model.create_infer_request();
    m_in_tensor = m_infer_request.get_input_tensor();
    m_in_tensor.set_shape(m_modelShape);
}
|
||||
|
||||
// Runs one inference over a batch of images: each frame is resized directly
// into its slice of the preallocated input tensor, the tensor is narrowed to
// the actual batch size, and the named output tensors are handed to
// 'fetch_results' together with that batch size.
// NOTE(review): assumes frames.size() <= m_config.m_max_batch_size — the ROI
// slicing below would exceed the tensor otherwise; confirm callers enforce it.
void CnnDLSDKBase::InferBatch(const std::vector<cv::Mat>& frames,const std::function<void(const std::map<std::string, ov::Tensor>&, size_t)>& fetch_results)
{
    size_t num_imgs = frames.size();
    size_t h = m_modelShape[ov::layout::height_idx(m_desired_layout)];
    size_t w = m_modelShape[ov::layout::width_idx(m_desired_layout)];
    size_t c = m_modelShape[ov::layout::channels_idx(m_desired_layout)];
    for (size_t i = 0; i < num_imgs; i++) {
        // ROI tensor: a no-copy view over batch element i.
        auto slice = ov::Tensor{ m_in_tensor, {i, 0, 0, 0}, {i + 1, h, w, c} };
        resize2tensor(frames[i], slice);
    }
    // Narrow the input to the actual batch size before inferring.
    m_infer_request.set_input_tensor(ov::Tensor{ m_in_tensor, { 0, 0, 0, 0 }, {num_imgs, h, w, c} });
    m_infer_request.infer();

    std::map<std::string, ov::Tensor> output_tensors;
    for (const auto& output_tensor_name : m_output_tensors_names) {
        output_tensors[output_tensor_name] = m_infer_request.get_tensor(output_tensor_name);
    }

    fetch_results(output_tensors, num_imgs);
}
|
||||
|
||||
// Eagerly loads and compiles the embedding model on construction.
VectorCNN::VectorCNN(const Config& config) : CnnDLSDKBase(config) {
    Load();
}
|
||||
|
||||
// Single-image convenience overload: delegates to the batched Compute()
// with a one-element batch and unwraps the single result.
void VectorCNN::Compute(const cv::Mat& frame, cv::Mat* vector, cv::Size outp_shape) {
    std::vector<cv::Mat> results;
    Compute(std::vector<cv::Mat>{frame}, &results, outp_shape);
    *vector = results.front();
}
|
||||
|
||||
// Computes embedding vectors for a batch of images. For each output tensor,
// each batch row (size[1] floats) is copied out into its own cv::Mat; when
// 'outp_shape' is non-empty the vector is reshaped to that height first.
void VectorCNN::Compute(const std::vector<cv::Mat>& images, std::vector<cv::Mat>* vectors,
                        cv::Size outp_shape) {
    if (images.empty()) {
        return;
    }
    vectors->clear();
    auto results_fetcher = [vectors, outp_shape](const std::map<std::string, ov::Tensor>& outputs, size_t batch_size) {
        for (auto& output : outputs) {
            ov::Tensor tensor = output.second;
            ov::Shape shape = tensor.get_shape();
            // Convert ov::Shape to the int sizes cv::Mat expects.
            std::vector<int> tensor_sizes(shape.size(), 0);
            for (size_t i = 0; i < tensor_sizes.size(); ++i) {
                tensor_sizes[i] = shape[i];
            }
            cv::Mat out_tensor(tensor_sizes, CV_32F, tensor.data<float>());
            for (size_t b = 0; b < batch_size; b++) {
                // Zero-copy wrapper over row b; copyTo below materializes it.
                cv::Mat tensor_wrapper(out_tensor.size[1], 1, CV_32F,
                    reinterpret_cast<void*>((out_tensor.ptr<float>(0) + b * out_tensor.size[1])));
                vectors->emplace_back();
                if (!isSizeEmpty(outp_shape))
                    tensor_wrapper = tensor_wrapper.reshape(1, outp_shape.height);
                tensor_wrapper.copyTo(vectors->back());
            }
        }
    };
    InferBatch(images, results_fetcher);
}
|
||||
// Maximum number of images a single InferBatch() call can process.
int VectorCNN::maxBatchSize() const {
    return m_config.m_max_batch_size;
}
|
||||
176
engines/OpenVINOEngine/src/faceapp/detector.cpp
Normal file
176
engines/OpenVINOEngine/src/faceapp/detector.cpp
Normal file
@@ -0,0 +1,176 @@
|
||||
// Copyright (C) 2018-2024 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include <algorithm>
|
||||
#include <string>
|
||||
#include <map>
|
||||
|
||||
#include <opencv2/core/core.hpp>
|
||||
#include <opencv2/imgproc/imgproc.hpp>
|
||||
|
||||
#include "openvino/openvino.hpp"
|
||||
|
||||
#include "detector.hpp"
|
||||
|
||||
#define SSD_EMPTY_DETECTIONS_INDICATOR -1.0
|
||||
|
||||
using namespace detection;
|
||||
|
||||
namespace {
|
||||
// Clamps a rect to lie inside a frame of the given size, returning an empty
// (zero-area) rect when the input lies fully outside.
cv::Rect TruncateToValidRect(const cv::Rect& rect, const cv::Size& size) {
    cv::Point top_left = rect.tl();
    cv::Point bottom_right = rect.br();

    // Clamp each corner into the frame (top-left stays strictly inside).
    top_left.x = std::max(0, std::min(size.width - 1, top_left.x));
    top_left.y = std::max(0, std::min(size.height - 1, top_left.y));
    bottom_right.x = std::max(0, std::min(size.width, bottom_right.x));
    bottom_right.y = std::max(0, std::min(size.height, bottom_right.y));

    // Width/height never go negative.
    const int width = std::max(0, bottom_right.x - top_left.x);
    const int height = std::max(0, bottom_right.y - top_left.y);
    return cv::Rect(top_left.x, top_left.y, width, height);
}
|
||||
|
||||
// Scales a rect about its center by (coeff_x, coeff_y), rounding the result
// outward (floor for the top-left, ceil for the bottom-right).
cv::Rect IncreaseRect(const cv::Rect& r, float coeff_x, float coeff_y) {
    const cv::Point2f top_left = r.tl();
    const cv::Point2f bottom_right = r.br();
    const cv::Point2f center = (top_left * 0.5f) + (bottom_right * 0.5f);

    const cv::Point2f half_extent = center - top_left;
    const cv::Point2f scaled_half{half_extent.x * coeff_x, half_extent.y * coeff_y};

    const cv::Point2f grown_tl = center - scaled_half;
    const cv::Point2f grown_br = center + scaled_half;

    const cv::Point tl_int{static_cast<int>(std::floor(grown_tl.x)), static_cast<int>(std::floor(grown_tl.y))};
    const cv::Point br_int{static_cast<int>(std::ceil(grown_br.x)), static_cast<int>(std::ceil(grown_br.y))};

    return cv::Rect(tl_int, br_int);
}
|
||||
} // namespace
|
||||
|
||||
// Loads the face detection SSD model, validates its single-input /
// single-output topology and output rank, reshapes the input to the
// configured resolution, attaches u8/NHWC preprocessing and compiles the
// model for the target device.
// Throws std::runtime_error when the model does not match the expected
// SSD topology.
FaceDetection::FaceDetection(const DetectorConfig& config) :
    BaseCnnDetection(config.is_async), m_config(config) {
    m_detectorName = "Face Detection";

    slog::info << "Reading model: " << m_config.m_path_to_model << slog::endl;
    std::shared_ptr<ov::Model> model = m_config.m_core.read_model(m_config.m_path_to_model);
    logBasicModelInfo(model);

    // Input frames are fed as u8 HWC images; conversion happens in-graph.
    ov::Layout desiredLayout = {"NHWC"};

    ov::OutputVector inputs_info = model->inputs();
    if (inputs_info.size() != 1) {
        throw std::runtime_error("Face Detection network should have only one input");
    }

    ov::OutputVector outputs_info = model->outputs();
    if (outputs_info.size() != 1) {
        throw std::runtime_error("Face Detection network should have only one output");
    }

    ov::Output<ov::Node> input = model->input();
    m_input_name = input.get_any_name();

    // Fall back to NCHW when the IR carries no layout information.
    ov::Layout modelLayout = ov::layout::get_layout(input);
    if (modelLayout.empty())
        modelLayout = {"NCHW"};

    // Target input resolution comes from the config.
    ov::Shape shape = input.get_shape();
    shape[ov::layout::height_idx(modelLayout)] = m_config.input_h;
    shape[ov::layout::width_idx(modelLayout)] = m_config.input_w;

    ov::Output<ov::Node> output = model->output();
    m_output_name = output.get_any_name();

    ov::Shape outputDims = output.get_shape();
    // BUGFIX: validate the output rank BEFORE indexing dimensions 2 and 3.
    // The original code read outputDims[2]/[3] first, which is an
    // out-of-bounds access when the output has fewer than 4 dimensions.
    if (outputDims.size() != 4) {
        throw std::runtime_error("Face Detection network output should have 4 dimensions, but had " +
                                 std::to_string(outputDims.size()));
    }
    m_max_detections_count = outputDims[2];
    m_object_size = outputDims[3];
    if (m_object_size != 7) {
        throw std::runtime_error("Face Detection network output layer should have 7 as a last dimension");
    }

    std::map<std::string, ov::PartialShape> input_shapes;
    input_shapes[input.get_any_name()] = shape;
    model->reshape(input_shapes);

    ov::preprocess::PrePostProcessor ppp(model);

    ppp.input().tensor()
        .set_element_type(ov::element::u8)
        .set_layout(desiredLayout);
    ppp.input().preprocess()
        .convert_layout(modelLayout)
        .convert_element_type(ov::element::f32);
    ppp.input().model().set_layout(modelLayout);

    model = ppp.build();

    slog::info << "PrePostProcessor configuration:" << slog::endl;
    slog::info << ppp << slog::endl;

    m_model = m_config.m_core.compile_model(model, m_config.m_deviceName);
    logCompiledModelInfo(m_model, m_config.m_path_to_model, m_config.m_deviceName, m_detectorName);
}
|
||||
|
||||
// Submits the previously enqueued frame for inference; a no-op when no
// frame is pending.
void FaceDetection::submitRequest() {
    if (m_enqueued_frames == 0)
        return;
    m_enqueued_frames = 0;

    BaseCnnDetection::submitRequest();
}
|
||||
|
||||
// Stages one frame for inference: lazily creates the infer request, records
// the frame size for converting normalized detections back to pixels, and
// resizes the image directly into the network input tensor.
void FaceDetection::enqueue(const cv::Mat& frame) {
    if (!m_request)
        m_request = std::make_shared<ov::InferRequest>(m_model.create_infer_request());

    m_width = static_cast<float>(frame.cols);
    m_height = static_cast<float>(frame.rows);

    auto inputTensor = m_request->get_tensor(m_input_name);
    resize2tensor(frame, inputTensor);

    m_enqueued_frames = 1;
}
|
||||
|
||||
// Parses the SSD output (records of 7 floats:
// [image_id, label, conf, x0, y0, x1, y1] with normalized coordinates) into
// pixel-space detections. Each box is enlarged by the configured scale
// factors, clipped to the frame, and kept only if it passes the confidence
// threshold and has positive area.
DetectedObjects FaceDetection::fetchResults() {
    DetectedObjects results;
    const float* data = m_request->get_tensor(m_output_name).data<float>();

    for (int det_id = 0; det_id < m_max_detections_count; ++det_id) {
        const int start_pos = det_id * m_object_size;

        // image_id == -1 marks the end of valid detections.
        const float batchID = data[start_pos];
        if (batchID == SSD_EMPTY_DETECTIONS_INDICATOR) {
            break;
        }

        // Clamp to [0,1] before scaling to frame pixels.
        const float score = std::min(std::max(0.0f, data[start_pos + 2]), 1.0f);
        const float x0 = std::min(std::max(0.0f, data[start_pos + 3]), 1.0f) * m_width;
        const float y0 = std::min(std::max(0.0f, data[start_pos + 4]), 1.0f) * m_height;
        const float x1 = std::min(std::max(0.0f, data[start_pos + 5]), 1.0f) * m_width;
        const float y1 = std::min(std::max(0.0f, data[start_pos + 6]), 1.0f) * m_height;

        DetectedObject object;
        object.confidence = score;
        object.rect = cv::Rect(cv::Point(static_cast<int>(round(static_cast<double>(x0))),
                                         static_cast<int>(round(static_cast<double>(y0)))),
                               cv::Point(static_cast<int>(round(static_cast<double>(x1))),
                                         static_cast<int>(round(static_cast<double>(y1)))));

        // Enlarge the detected box and clip it back into the frame.
        object.rect = TruncateToValidRect(IncreaseRect(object.rect,
                                                       m_config.increase_scale_x,
                                                       m_config.increase_scale_y),
                                          cv::Size(static_cast<int>(m_width), static_cast<int>(m_height)));

        if (object.confidence > m_config.confidence_threshold && object.rect.area() > 0) {
            results.emplace_back(object);
        }
    }

    return results;
}
|
||||
166
engines/OpenVINOEngine/src/faceapp/logger.cpp
Normal file
166
engines/OpenVINOEngine/src/faceapp/logger.cpp
Normal file
@@ -0,0 +1,166 @@
|
||||
// Copyright (C) 2018-2024 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include <iomanip>
|
||||
#include <string>
|
||||
#include <map>
|
||||
#include <set>
|
||||
#include <vector>
|
||||
#include <fstream>
|
||||
|
||||
#include "logger.hpp"
|
||||
|
||||
namespace {

// Placeholder used whenever an index has no known identity/action.
const char unknown_label[] = "Unknown";

// Returns labels[idx], or the "Unknown" placeholder for a negative index.
std::string GetUnknownOrLabel(const std::vector<std::string>& labels, int idx) {
    if (idx < 0) {
        return unknown_label;
    }
    return labels.at(idx);
}

// Builds "<file name>@<zero-padded 6-digit frame index>" from a video path.
std::string FrameIdxToString(const std::string& path, int frame_idx) {
    std::ostringstream oss;
    oss << std::setw(6) << std::setfill('0') << frame_idx;
    const std::string file_name = path.substr(path.rfind("/") + 1);
    return file_name + "@" + oss.str();
}

}  // anonymous namespace
|
||||
|
||||
// Constructs the logger.
// @param stream            destination for human-readable per-frame logs
// @param enabled           when false, per-frame text logging is suppressed
// @param act_stat_log_file CSV file receiving per-frame action statistics
// @param act_det_log_file  optional FileStorage file for raw detections
DetectionsLogger::DetectionsLogger(slog::LogStream& stream, bool enabled,
                                   const std::string& act_stat_log_file,
                                   const std::string& act_det_log_file) :
        m_log_stream(stream) {
    m_write_logs = enabled;

    // Only open streams for files that were actually requested; opening an
    // empty path would just leave the stream in a failed state.
    if (!act_stat_log_file.empty()) {
        m_act_stat_log_stream.open(act_stat_log_file, std::fstream::out);
    }

    if (!act_det_log_file.empty()) {
        m_act_det_log_stream.open(act_det_log_file, cv::FileStorage::WRITE);
        // Open the "data" sequence; it is closed in the destructor.
        m_act_det_log_stream << "data" << "[";
    }
}
|
||||
|
||||
void DetectionsLogger::CreateNextFrameRecord(
|
||||
const std::string& path, const int frame_idx, const size_t width, const size_t height) {
|
||||
if (m_write_logs)
|
||||
m_log_stream << "Frame_name: " << path << "@" << frame_idx << " width: "
|
||||
<< width << " height: " << height << slog::endl;
|
||||
}
|
||||
|
||||
// Logs one face detection line; the action is appended only when known.
void DetectionsLogger::AddFaceToFrame(const cv::Rect& rect, const std::string& id, const std::string& action) {
    if (!m_write_logs) {
        return;
    }
    m_log_stream << "Object type: face. Box: " << rect << " id: " << id;
    if (!action.empty()) {
        m_log_stream << " action: " << action;
    }
    m_log_stream << slog::endl;
}
|
||||
|
||||
// Logs one person detection line; the identity is appended only when known.
void DetectionsLogger::AddPersonToFrame(const cv::Rect& rect, const std::string& action, const std::string& id) {
    if (!m_write_logs) {
        return;
    }
    m_log_stream << "Object type: person. Box: " << rect << " action: " << action;
    if (!id.empty()) {
        m_log_stream << " id: " << id;
    }
    m_log_stream << slog::endl;
}
|
||||
|
||||
void DetectionsLogger::AddDetectionToFrame(const TrackedObject& object, const int frame_idx) {
|
||||
if (m_act_det_log_stream.isOpened()) {
|
||||
m_act_det_log_stream << "{" << "frame_id" << frame_idx
|
||||
<< "det_conf" << object.confidence
|
||||
<< "label" << object.label
|
||||
<< "rect" << object.rect << "}";
|
||||
}
|
||||
}
|
||||
|
||||
void DetectionsLogger::FinalizeFrameRecord() {
|
||||
if (m_write_logs) {
|
||||
m_log_stream << slog::endl;
|
||||
}
|
||||
}
|
||||
|
||||
void DetectionsLogger::DumpDetections(const std::string& video_path,
|
||||
const cv::Size frame_size,
|
||||
const size_t num_frames,
|
||||
const std::vector<Track>& face_tracks,
|
||||
const std::map<int, int>& track_id_to_label_faces,
|
||||
const std::vector<std::string>& action_idx_to_label,
|
||||
const std::vector<std::string>& person_id_to_label,
|
||||
const std::vector<std::map<int, int>>& frame_face_obj_id_to_action_maps) {
|
||||
std::map<int, std::vector<const TrackedObject*>> frame_idx_to_face_track_objs;
|
||||
|
||||
for (const auto& tr : face_tracks) {
|
||||
for (const auto& obj : tr.objects) {
|
||||
frame_idx_to_face_track_objs[obj.frame_idx].emplace_back(&obj);
|
||||
}
|
||||
}
|
||||
|
||||
std::map<std::string, std::string> face_label_to_action;
|
||||
for (const auto& label : person_id_to_label) {
|
||||
face_label_to_action[label] = unknown_label;
|
||||
}
|
||||
m_act_stat_log_stream << "frame_idx";
|
||||
for (const auto& label : person_id_to_label) {
|
||||
m_act_stat_log_stream << "," << label;
|
||||
}
|
||||
m_act_stat_log_stream << std::endl;
|
||||
|
||||
for (size_t i = 0; i < num_frames; i++) {
|
||||
CreateNextFrameRecord(video_path, i, frame_size.width, frame_size.height);
|
||||
const auto& frame_face_obj_id_to_action = frame_face_obj_id_to_action_maps.at(i);
|
||||
for (auto& kv : face_label_to_action) {
|
||||
kv.second = unknown_label;
|
||||
}
|
||||
|
||||
for (const auto& p_obj : frame_idx_to_face_track_objs[i]) {
|
||||
const auto& obj = *p_obj;
|
||||
std::string action_label = unknown_label;
|
||||
if (frame_face_obj_id_to_action.count(obj.object_id) > 0) {
|
||||
action_label = GetUnknownOrLabel(action_idx_to_label, frame_face_obj_id_to_action.at(obj.object_id));
|
||||
}
|
||||
std::string face_label = GetUnknownOrLabel(person_id_to_label, track_id_to_label_faces.at(obj.object_id));
|
||||
face_label_to_action[face_label] = action_label;
|
||||
AddFaceToFrame(obj.rect, face_label, action_label);
|
||||
}
|
||||
|
||||
m_act_stat_log_stream << FrameIdxToString(video_path, i);
|
||||
for (const auto& label : person_id_to_label) {
|
||||
m_act_stat_log_stream << "," << face_label_to_action[label];
|
||||
}
|
||||
m_act_stat_log_stream << std::endl;
|
||||
|
||||
FinalizeFrameRecord();
|
||||
}
|
||||
}
|
||||
|
||||
void DetectionsLogger::DumpTracks(const std::map<int, RangeEventsTrack>& obj_id_to_events,
|
||||
const std::vector<std::string>& action_idx_to_label,
|
||||
const std::map<int, int>& track_id_to_label_faces,
|
||||
const std::vector<std::string>& person_id_to_label) {
|
||||
for (const auto& tup : obj_id_to_events) {
|
||||
const int obj_id = tup.first;
|
||||
if (track_id_to_label_faces.count(obj_id) > 0) {
|
||||
const auto& events = tup.second;
|
||||
|
||||
std::string face_label = GetUnknownOrLabel(person_id_to_label, track_id_to_label_faces.at(obj_id));
|
||||
m_log_stream << "Person: " << face_label << slog::endl;
|
||||
|
||||
for (const auto& event : events) {
|
||||
std::string action_label = GetUnknownOrLabel(action_idx_to_label, event.action);
|
||||
m_log_stream << " - " << action_label
|
||||
<< ": from " << event.begin_frame_id
|
||||
<< " to " << event.end_frame_id
|
||||
<< " frames" << slog::endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Closes the "data" sequence in the detections FileStorage, if it was opened.
DetectionsLogger::~DetectionsLogger() {
    if (m_act_det_log_stream.isOpened())
        m_act_det_log_stream << "]";
}
|
||||
186
engines/OpenVINOEngine/src/faceapp/reid_gallery.cpp
Normal file
186
engines/OpenVINOEngine/src/faceapp/reid_gallery.cpp
Normal file
@@ -0,0 +1,186 @@
|
||||
// Copyright (C) 2018-2024 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include <cmath>
#include <fstream>
#include <iostream>
#include <limits>
#include <string>
#include <vector>

#include <opencv2/opencv.hpp>

#include "face_reid.hpp"
#include "tracker/tracker.hpp"
|
||||
|
||||
namespace {
|
||||
float ComputeReidDistance(const cv::Mat& descr1, const cv::Mat& descr2) {
|
||||
float xy = static_cast<float>(descr1.dot(descr2));
|
||||
float xx = static_cast<float>(descr1.dot(descr1));
|
||||
float yy = static_cast<float>(descr2.dot(descr2));
|
||||
float norm = sqrt(xx * yy) + 1e-6f;
|
||||
return 1.0f - xy / norm;
|
||||
}
|
||||
|
||||
bool file_exists(const std::string& name) {
|
||||
std::ifstream f(name.c_str());
|
||||
return f.good();
|
||||
}
|
||||
|
||||
inline char separator() {
|
||||
#ifdef _WIN32
|
||||
return '\\';
|
||||
#else
|
||||
return '/';
|
||||
#endif
|
||||
}
|
||||
|
||||
std::string folder_name(const std::string& path) {
|
||||
size_t found_pos;
|
||||
found_pos = path.find_last_of(separator());
|
||||
if (found_pos != std::string::npos)
|
||||
return path.substr(0, found_pos);
|
||||
return std::string(".") + separator();
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
// Label and id returned for faces that match no gallery identity.
const char EmbeddingsGallery::unknown_label[] = "Unknown";
const int EmbeddingsGallery::unknown_id = TrackedObject::UNKNOWN_LABEL_IDX;
|
||||
|
||||
// Builds the gallery of known identities from a FileStorage list whose root
// maps person labels to image paths. Each image is optionally cropped to the
// detected face, checked against the minimal size, and converted into a
// re-identification embedding; successful registrations become identities.
//
// @param ids_list         path to the gallery list file; empty disables the gallery
// @param threshold        re-id distance threshold used later for matching
// @param min_size_fr      minimal face size (pixels) accepted for registration
// @param crop_gallery     when true, run the face detector to crop each image
// @param detector_config  configuration for the gallery face detector
// @param landmarks_det    facial-landmarks regression network
// @param image_reid       face re-identification network
// @param use_greedy_matcher  use greedy matching instead of Kuhn-Munkres
EmbeddingsGallery::EmbeddingsGallery(const std::string& ids_list,
                                     double threshold, int min_size_fr,
                                     bool crop_gallery, const detection::DetectorConfig& detector_config,
                                     VectorCNN& landmarks_det,
                                     VectorCNN& image_reid,
                                     bool use_greedy_matcher) :
        reid_threshold(threshold), use_greedy_matcher(use_greedy_matcher) {
    if (ids_list.empty()) {
        return;
    }

    detection::FaceDetection detector(detector_config);

    cv::FileStorage fs(ids_list, cv::FileStorage::Mode::READ);
    cv::FileNode fn = fs.root();
    int id = 0;
    for (cv::FileNodeIterator fit = fn.begin(); fit != fn.end(); ++fit) {
        cv::FileNode item = *fit;
        std::string label = item.name();
        std::vector<cv::Mat> embeddings;

        // Please, note that the case when there are more than one image in gallery
        // for a person might not work properly with the current implementation
        // of the demo.
        // Remove this assert by your own risk.
        CV_Assert(item.size() == 1);

        for (size_t i = 0; i < item.size(); i++) {
            // Image paths may be absolute or relative to the list's folder.
            std::string path;
            if (file_exists(item[i].string())) {
                path = item[i].string();
            } else {
                path = folder_name(ids_list) + separator() + item[i].string();
            }

            cv::Mat image = cv::imread(path);
            CV_Assert(!image.empty());
            cv::Mat emb;
            RegistrationStatus status = RegisterIdentity(label, image, min_size_fr, crop_gallery, detector, landmarks_det, image_reid, emb);
            // Only successfully registered images become gallery identities;
            // failed registrations are silently skipped.
            if (status == RegistrationStatus::SUCCESS) {
                embeddings.push_back(emb);
                idx_to_id.push_back(id);
                identities.emplace_back(embeddings, label, id);
                ++id;
            }
        }
    }
}
|
||||
|
||||
// Matches the query embeddings against the gallery and returns one gallery
// id per query (unknown_id for unmatched or over-threshold queries), using
// Kuhn-Munkres (or greedy) assignment on the distance matrix.
std::vector<int> EmbeddingsGallery::GetIDsByEmbeddings(const std::vector<cv::Mat>& embeddings) const {
    if (embeddings.empty() || idx_to_id.empty())
        return std::vector<int>(embeddings.size(), unknown_id);

    // Distance matrix: rows are queries, columns are gallery embeddings.
    cv::Mat distances(static_cast<int>(embeddings.size()), static_cast<int>(idx_to_id.size()), CV_32F);
    for (int row = 0; row < distances.rows; row++) {
        int col = 0;
        for (size_t j = 0; j < identities.size(); j++) {
            for (const auto& reference_emb : identities[j].embeddings) {
                distances.at<float>(row, col) = ComputeReidDistance(embeddings[row], reference_emb);
                ++col;
            }
        }
    }

    KuhnMunkres matcher(use_greedy_matcher);
    auto matched_idx = matcher.Solve(distances);

    std::vector<int> output_ids;
    int row = 0;
    for (auto col_idx : matched_idx) {
        if (static_cast<int>(col_idx) == -1) {
            // No column assigned to this query at all.
            output_ids.push_back(unknown_id);
            ++row;
            continue;
        }
        // Reject assignments whose distance exceeds the threshold.
        if (distances.at<float>(row, col_idx) > reid_threshold)
            output_ids.push_back(unknown_id);
        else
            output_ids.push_back(idx_to_id[col_idx]);
        ++row;
    }
    return output_ids;
}
|
||||
|
||||
std::string EmbeddingsGallery::GetLabelByID(int id) const {
|
||||
if (id >= 0 && id < static_cast<int>(identities.size()))
|
||||
return identities[id].label;
|
||||
else
|
||||
return unknown_label;
|
||||
}
|
||||
|
||||
// Returns the number of registered gallery identities.
size_t EmbeddingsGallery::size() const {
    return identities.size();
}
|
||||
|
||||
// Returns the labels of all identities, indexed by gallery id.
std::vector<std::string> EmbeddingsGallery::GetIDToLabelMap() const {
    std::vector<std::string> labels;
    labels.reserve(identities.size());
    for (const auto& identity : identities) {
        labels.push_back(identity.label);
    }
    return labels;
}
|
||||
|
||||
bool EmbeddingsGallery::LabelExists(const std::string& label) const {
|
||||
return identities.end() != std::find_if(identities.begin(), identities.end(),
|
||||
[label](const GalleryObject& o){return o.label == label;});
|
||||
}
|
||||
|
||||
// Computes the re-id descriptor of a gallery image into `embedding`.
// When crop_gallery is set, the image is first cropped to the single face
// found by `detector`. Fails when no face is detected or when the target
// is smaller than min_size_fr in both dimensions.
// Note: identity_label is currently not referenced in the body; it is kept
// for interface stability.
RegistrationStatus EmbeddingsGallery::RegisterIdentity(const std::string& identity_label,
                                                       const cv::Mat& image,
                                                       int min_size_fr, bool crop_gallery,
                                                       detection::FaceDetection& detector,
                                                       VectorCNN& landmarks_det,
                                                       VectorCNN& image_reid,
                                                       cv::Mat& embedding) {
    cv::Mat target = image;
    if (crop_gallery) {
        // Run the face detector synchronously on the gallery image.
        detector.enqueue(image);
        detector.submitRequest();
        detector.wait();
        detection::DetectedObjects faces = detector.fetchResults();
        if (faces.empty()) {
            return RegistrationStatus::FAILURE_NOT_DETECTED;
        }
        // Exactly one face per gallery image is expected.
        CV_Assert(faces.size() == 1);
        target = image(faces[0].rect);
    }
    if ((target.rows < min_size_fr) && (target.cols < min_size_fr)) {
        return RegistrationStatus::FAILURE_LOW_QUALITY;
    }
    // Align the face by 5 landmark points before computing the descriptor.
    cv::Mat landmarks;
    landmarks_det.Compute(target, &landmarks, cv::Size(2, 5));
    std::vector<cv::Mat> images = { target };
    std::vector<cv::Mat> landmarks_vec = { landmarks };
    AlignFaces(&images, &landmarks_vec);
    image_reid.Compute(images[0], &embedding);
    return RegistrationStatus::SUCCESS;
}
|
||||
Reference in New Issue
Block a user