ANSCORE/engines/OpenVINOEngine/src/models/detection_model_yolo.cpp

/*
// Copyright (C) 2020-2024 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
*/

#include "models/detection_model_yolo.h"

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>

#include <openvino/openvino.hpp>

#include <utils/common.hpp>
#include <utils/slog.hpp>

#include "models/internal_model_data.h"
#include "models/results.h"

// Built-in anchor tables, one row per YOLO flavour. Each row is a flat list of
// (width, height) pairs: element 2*k is the width and 2*k+1 the height of anchor k.
// Rows are selected by indexing with the yoloVersion value, so the row order has to
// match the order of the version enumerators (declared in the header — TODO confirm).
std::vector<float> defaultAnchors[] = {
    // YOLOv1v2
    {0.57273f, 0.677385f,
     1.87446f, 2.06253f,
     3.33843f, 5.47434f,
     7.88282f, 3.52778f,
     9.77052f, 9.16828f},
    // YOLOv3
    {10.0f, 13.0f,
     16.0f, 30.0f,
     33.0f, 23.0f,
     30.0f, 61.0f,
     62.0f, 45.0f,
     59.0f, 119.0f,
     116.0f, 90.0f,
     156.0f, 198.0f,
     373.0f, 326.0f},
    // YOLOv4
    {12.0f, 16.0f,
     19.0f, 36.0f,
     40.0f, 28.0f,
     36.0f, 75.0f,
     76.0f, 55.0f,
     72.0f, 146.0f,
     142.0f, 110.0f,
     192.0f, 243.0f,
     459.0f, 401.0f},
    // YOLOv4_Tiny
    {10.0f, 14.0f,
     23.0f, 27.0f,
     37.0f, 58.0f,
     81.0f, 82.0f,
     135.0f, 169.0f,
     344.0f, 319.0f},
    // YOLOF
    {16.0f, 16.0f,
     32.0f, 32.0f,
     64.0f, 64.0f,
     128.0f, 128.0f,
     256.0f, 256.0f,
     512.0f, 512.0f}};

// Built-in anchor masks, one row per YOLO flavour (same row order as defaultAnchors).
// A mask value k selects anchor pair k of the matching anchors row. The row is cut
// into equally sized consecutive groups, one group per network output.
// Empty rows mean that no default masks are provided for that flavour.
const std::vector<int64_t> defaultMasks[] = {
    {},                            // YOLOv1v2
    {},                            // YOLOv3
    {0, 1, 2, 3, 4, 5, 6, 7, 8},   // YOLOv4
    {1, 2, 3, 3, 4, 5},            // YOLOv4_Tiny
    {0, 1, 2, 3, 4, 5}};           // YOLOF

// Logistic function 1 / (1 + e^-x); maps any real input into the (0, 1) range.
static inline float sigmoid(float x) {
    const auto denominator = 1.f + exp(-x);
    return 1.f / denominator;
}

// Identity activation: hands the raw value back untouched. It shares sigmoid's
// signature so that either one can be picked as the raw-data post-processing function.
static inline float linear(float x) {
    const float unchanged = x;
    return unchanged;
}

// Creates the YOLO model wrapper.
//
// modelFileName, confidenceThreshold, useAutoResize, labels and layout are forwarded
// unchanged to the DetectionModel base class. The remaining arguments are stored:
//   useAdvancedPostprocessing - selects which of the two overlap-suppression strategies
//                               postprocess() runs
//   boxIOUThreshold           - overlap value at or above which a box is suppressed
//   anchors, masks            - optional presets; when empty, the built-in defaults for the
//                               detected YOLO version are used in prepareInputsOutputs()
// yoloVersion starts as YOLO_V3 and is re-evaluated in prepareInputsOutputs().
ModelYolo::ModelYolo(const std::string& modelFileName,
                     float confidenceThreshold,
                     bool useAutoResize,
                     bool useAdvancedPostprocessing,
                     float boxIOUThreshold,
                     const std::vector<std::string>& labels,
                     const std::vector<float>& anchors,
                     const std::vector<int64_t>& masks,
                     const std::string& layout)
    : DetectionModel(modelFileName, confidenceThreshold, useAutoResize, labels, layout),
      boxIOUThreshold(boxIOUThreshold),
      useAdvancedPostprocessing(useAdvancedPostprocessing),
      yoloVersion(YOLO_V3),
      presetAnchors(anchors),
      presetMasks(masks) {}

// Configures pre/post-processing for the network and builds the per-output region table.
//
// Steps:
//   1. Checks that the model has exactly one 3-channel input, declares the input tensor
//      as NHWC and (when useAutoResize is set) adds a linear resize step.
//   2. Requests f32 for every output and, for 4-D outputs, an NCHW tensor layout.
//   3. Rebuilds the model through the PrePostProcessor (the `model` argument is replaced).
//   4. Fills `regions`: from RegionYolo operations when the graph contains them, otherwise
//      from anchors/masks (presets or built-in defaults) with the YOLO flavour guessed
//      from the number of outputs (1 -> YOLOF, 2 -> YOLOv4-tiny, 3 -> YOLOv4).
//
// Throws std::logic_error when the input count or channel count is unexpected, or when an
// output's channel dimension does not fit the anchor layout; throws std::runtime_error
// when the number of masks does not match the number of outputs.
void ModelYolo::prepareInputsOutputs(std::shared_ptr<ov::Model>& model) {
    // --------------------------- Configure input & output -------------------------------------------------
    // --------------------------- Prepare input  ------------------------------------------------------
    if (model->inputs().size() != 1) {
        throw std::logic_error("YOLO model wrapper accepts models that have only 1 input");
    }

    const auto& input = model->input();
    const ov::Shape& inputShape = model->input().get_shape();
    ov::Layout inputLayout = getInputLayout(input);

    if (inputShape[ov::layout::channels_idx(inputLayout)] != 3) {
        throw std::logic_error("Expected 3-channel input");
    }

    ov::preprocess::PrePostProcessor ppp(model);
    inputTransform.setPrecision(ppp, input.get_any_name());
    ppp.input().tensor().set_layout({"NHWC"});

    if (useAutoResize) {
        ppp.input().tensor().set_spatial_dynamic_shape();

        ppp.input()
            .preprocess()
            .convert_element_type(ov::element::f32)
            .resize(ov::preprocess::ResizeAlgorithm::RESIZE_LINEAR);
    }

    ppp.input().model().set_layout(inputLayout);

    //--- Reading image input parameters
    inputsNames.push_back(model->input().get_any_name());
    netInputWidth = inputShape[ov::layout::width_idx(inputLayout)];
    netInputHeight = inputShape[ov::layout::height_idx(inputLayout)];

    // --------------------------- Prepare output  -----------------------------------------------------
    const ov::OutputVector& outputs = model->outputs();
    std::map<std::string, ov::Shape> outShapes;
    for (auto& out : outputs) {
        ppp.output(out.get_any_name()).tensor().set_element_type(ov::element::f32);
        if (out.get_shape().size() == 4) {
            // Heuristic: a tensor whose dims 1 and 2 are equal while dims 2 and 3 differ is treated as NHWC
            if (out.get_shape()[ov::layout::height_idx("NCHW")] != out.get_shape()[ov::layout::width_idx("NCHW")] &&
                out.get_shape()[ov::layout::height_idx("NHWC")] == out.get_shape()[ov::layout::width_idx("NHWC")]) {
                ppp.output(out.get_any_name()).model().set_layout("NHWC");
                // outShapes are saved before ppp.build() thus set yoloRegionLayout as it is in model before ppp.build()
                yoloRegionLayout = "NHWC";
            }
            // yolo-v1-tiny-tf out shape is [1, 21125] thus set layout only for 4 dim tensors
            ppp.output(out.get_any_name()).tensor().set_layout("NCHW");
        }
        outputsNames.push_back(out.get_any_name());
        outShapes[out.get_any_name()] = out.get_shape();
    }
    model = ppp.build();

    yoloVersion = YOLO_V3;
    bool isRegionFound = false;
    for (const auto& op : model->get_ordered_ops()) {
        if (std::string("RegionYolo") == op->get_type_name()) {
            auto regionYolo = std::dynamic_pointer_cast<ov::op::v0::RegionYolo>(op);

            if (regionYolo) {
                // A RegionYolo without masks marks the v1/v2 family
                if (!regionYolo->get_mask().size()) {
                    yoloVersion = YOLO_V1V2;
                }

                // Attach the region to the output that is this op, or whose first input is this op
                const auto& opName = op->get_friendly_name();
                for (const auto& out : outputs) {
                    if (out.get_node()->get_friendly_name() == opName ||
                        out.get_node()->get_input_node_ptr(0)->get_friendly_name() == opName) {
                        isRegionFound = true;
                        regions.emplace(out.get_any_name(), Region(regionYolo));
                    }
                }
            }
        }
    }

    if (!isRegionFound) {
        // No RegionYolo in the graph: deduce the flavour from the number of outputs
        switch (outputsNames.size()) {
            case 1:
                yoloVersion = YOLOF;
                break;
            case 2:
                yoloVersion = YOLO_V4_TINY;
                break;
            case 3:
                yoloVersion = YOLO_V4;
                break;
        }

        // Number of anchors handled by each output
        const int num = yoloVersion == YOLOF ? 6 : 3;
        isObjConf = yoloVersion == YOLOF ? 0 : 1;
        int i = 0;

        // Bind by reference: the masks are only read here, no copy is needed
        const auto& chosenMasks = presetMasks.size() ? presetMasks : defaultMasks[yoloVersion];
        const size_t expectedMasks = num * outputs.size();
        if (chosenMasks.size() != expectedMasks) {
            // Report the size of the array that was actually checked (presets or defaults)
            throw std::runtime_error(std::string("Invalid size of masks array, got ") +
                                     std::to_string(chosenMasks.size()) + ", should be " +
                                     std::to_string(expectedMasks));
        }

        // Process outputs from the largest grid to the smallest so mask groups line up with them
        std::sort(outputsNames.begin(),
                  outputsNames.end(),
                  [&outShapes, this](const std::string& x, const std::string& y) {
                      return outShapes[x][ov::layout::height_idx(yoloRegionLayout)] >
                             outShapes[y][ov::layout::height_idx(yoloRegionLayout)];
                  });

        // Every anchor carries 4 box coordinates plus, when present, one objectness entry
        const size_t fixedEntries = 4 + (isObjConf ? 1 : 0);
        for (const auto& name : outputsNames) {
            const auto& shape = outShapes[name];
            const size_t channels = shape[ov::layout::channels_idx(yoloRegionLayout)];
            // The second condition prevents the unsigned subtraction below from wrapping around
            if (channels % num != 0 || channels / num < fixedEntries) {
                throw std::logic_error(std::string("Output tensor ") + name + " has wrong channel dimension");
            }
            regions.emplace(
                name,
                Region(channels / num - fixedEntries,
                       4,
                       presetAnchors.size() ? presetAnchors : defaultAnchors[yoloVersion],
                       std::vector<int64_t>(chosenMasks.begin() + i * num, chosenMasks.begin() + (i + 1) * num),
                       shape[ov::layout::width_idx(yoloRegionLayout)],
                       shape[ov::layout::height_idx(yoloRegionLayout)]));
            i++;
        }
    } else {
        // Currently externally set anchors and masks are supported only for YoloV4
        if (presetAnchors.size() || presetMasks.size()) {
            slog::warn << "Preset anchors and mask can be set for YoloV4 model only. "
                          "This model is not YoloV4, so these options will be ignored."
                       << slog::endl;
        }
    }
}

// Turns raw inference outputs into a DetectionResult.
//
// Every output tensor is decoded by parseYOLOOutput() into candidate boxes, then
// overlapping candidates are filtered with one of two strategies:
//   - advanced: a box is dropped only if a box of the SAME label with strictly higher
//               confidence overlaps it by at least boxIOUThreshold;
//   - classic:  boxes are visited in descending confidence order and every later box
//               overlapping a kept one by at least boxIOUThreshold is dropped,
//               regardless of its label.
//
// Returns the result as a ResultBase owner. Exceptions from parseYOLOOutput() propagate.
std::unique_ptr<ResultBase> ModelYolo::postprocess(InferenceResult& infResult) {
    // Owned by a smart pointer from the start so that an exception thrown while parsing
    // the outputs does not leak the result object.
    std::unique_ptr<DetectionResult> result(new DetectionResult(infResult.frameId, infResult.metaData));
    std::vector<DetectedObject> objects;

    // Parsing outputs
    const auto& internalData = infResult.internalModelData->asRef<InternalImageModelData>();

    for (auto& output : infResult.outputsData) {
        this->parseYOLOOutput(output.first,
                              output.second,
                              netInputHeight,
                              netInputWidth,
                              internalData.inputImgHeight,
                              internalData.inputImgWidth,
                              objects);
    }

    if (useAdvancedPostprocessing) {
        // Advanced postprocessing
        // Checking IOU threshold conformance
        // For every i-th object we're finding all objects it intersects with, and comparing confidence
        // If i-th object has greater confidence than all others, we include it into result
        for (const auto& obj1 : objects) {
            bool isGoodResult = true;
            for (const auto& obj2 : objects) {
                if (obj1.labelID == obj2.labelID && obj1.confidence < obj2.confidence &&
                    intersectionOverUnion(obj1, obj2) >= boxIOUThreshold) {  // if obj1 is the same as obj2, condition
                                                                             // expression will evaluate to false anyway
                    isGoodResult = false;
                    break;
                }
            }
            if (isGoodResult) {
                result->objects.push_back(obj1);
            }
        }
    } else {
        // Classic postprocessing
        std::sort(objects.begin(), objects.end(), [](const DetectedObject& x, const DetectedObject& y) {
            return x.confidence > y.confidence;
        });
        for (size_t i = 0; i < objects.size(); ++i) {
            // Confidence 0 is the marker for "already suppressed by a stronger box"
            if (objects[i].confidence == 0)
                continue;
            for (size_t j = i + 1; j < objects.size(); ++j)
                if (intersectionOverUnion(objects[i], objects[j]) >= boxIOUThreshold)
                    objects[j].confidence = 0;
            result->objects.push_back(objects[i]);
        }
    }

    return std::unique_ptr<ResultBase>(std::move(result));
}

// Decodes one YOLO output tensor into detections and appends them to `objects`.
//
//   output_name    - name of the output; must be a key of `regions`
//   tensor         - output data, read as float
//   resized_im_h/w - network input size; used as the anchor scale for all versions
//                    except YOLO_V1V2
//   original_im_h/w- size of the source image; resulting boxes are expressed in it
//   objects        - receives one entry per (cell, anchor, class) whose score reaches
//                    confidenceThreshold
//
// Throws std::runtime_error when no region is registered for output_name.
void ModelYolo::parseYOLOOutput(const std::string& output_name,
                                const ov::Tensor& tensor,
                                const unsigned long resized_im_h,
                                const unsigned long resized_im_w,
                                const unsigned long original_im_h,
                                const unsigned long original_im_w,
                                std::vector<DetectedObject>& objects) {
    // --------------------------- Extracting layer parameters -------------------------------------
    auto it = regions.find(output_name);
    if (it == regions.end()) {
        throw std::runtime_error(std::string("Can't find output layer with name ") + output_name);
    }
    auto& region = it->second;

    // sideW/sideH: grid size in cells; scaleW/scaleH: divisor applied to the anchor sizes
    int sideW = 0;
    int sideH = 0;
    unsigned long scaleH;
    unsigned long scaleW;
    switch (yoloVersion) {
        case YOLO_V1V2:
            // Grid size and anchor scale both come from the stored region dimensions
            sideH = region.outputHeight;
            sideW = region.outputWidth;
            scaleW = region.outputWidth;
            scaleH = region.outputHeight;
            break;
        case YOLO_V3:
        case YOLO_V4:
        case YOLO_V4_TINY:
        case YOLOF:
            // Grid size is read from the tensor itself (NCHW indices); anchors are scaled by the network input size
            sideH = static_cast<int>(tensor.get_shape()[ov::layout::height_idx("NCHW")]);
            sideW = static_cast<int>(tensor.get_shape()[ov::layout::width_idx("NCHW")]);
            scaleW = resized_im_w;
            scaleH = resized_im_h;
            break;
    }

    auto entriesNum = sideW * sideH;
    const float* outData = tensor.data<float>();

    // Raw values go through sigmoid for YOLOv4 / YOLOv4-tiny / YOLOF and are used as-is otherwise
    auto postprocessRawData =
        (yoloVersion == YOLO_V4 || yoloVersion == YOLO_V4_TINY || yoloVersion == YOLOF) ? sigmoid : linear;

    // --------------------------- Parsing YOLO Region output -------------------------------------
    for (int i = 0; i < entriesNum; ++i) {
        // Cell position inside the grid
        int row = i / sideW;
        int col = i % sideW;
        for (int n = 0; n < region.num; ++n) {
            //--- Getting region data
            // obj_index: entry that follows the box coordinates (objectness when isObjConf is set)
            int obj_index = calculateEntryIndex(entriesNum,
                                                region.coords,
                                                region.classes + isObjConf,
                                                n * entriesNum + i,
                                                region.coords);
            // box_index: first box entry; the other box entries are entriesNum apart
            int box_index =
                calculateEntryIndex(entriesNum, region.coords, region.classes + isObjConf, n * entriesNum + i, 0);
            // Without an objectness entry every anchor passes the preliminary check with scale 1
            float scale = isObjConf ? postprocessRawData(outData[obj_index]) : 1;

            //--- Preliminary check for confidence threshold conformance
            if (scale >= confidenceThreshold) {
                //--- Calculating scaled region's coordinates
                // x, y: box centre in original-image pixels
                float x, y;
                if (yoloVersion == YOLOF) {
                    // YOLOF: centre = cell position plus a raw offset scaled by the anchor size
                    x = (static_cast<float>(col) / sideW +
                         outData[box_index + 0 * entriesNum] * region.anchors[2 * n] / scaleW) *
                        original_im_w;
                    y = (static_cast<float>(row) / sideH +
                         outData[box_index + 1 * entriesNum] * region.anchors[2 * n + 1] / scaleH) *
                        original_im_h;
                } else {
                    // Other versions: centre = cell index plus the post-processed offset, in grid units
                    x = static_cast<float>((col + postprocessRawData(outData[box_index + 0 * entriesNum])) / sideW *
                                           original_im_w);
                    y = static_cast<float>((row + postprocessRawData(outData[box_index + 1 * entriesNum])) / sideH *
                                           original_im_h);
                }
                // Box size: exp(raw) times the anchor size, rescaled to the original image
                float height = static_cast<float>(std::exp(outData[box_index + 3 * entriesNum]) *
                                                  region.anchors[2 * n + 1] * original_im_h / scaleH);
                float width = static_cast<float>(std::exp(outData[box_index + 2 * entriesNum]) * region.anchors[2 * n] *
                                                 original_im_w / scaleW);

                // Convert centre/size to top-left/size and keep the box inside the image
                DetectedObject obj;
                obj.x = clamp(x - width / 2, 0.f, static_cast<float>(original_im_w));
                obj.y = clamp(y - height / 2, 0.f, static_cast<float>(original_im_h));
                obj.width = clamp(width, 0.f, static_cast<float>(original_im_w - obj.x));
                obj.height = clamp(height, 0.f, static_cast<float>(original_im_h - obj.y));

                for (size_t j = 0; j < region.classes; ++j) {
                    int class_index = calculateEntryIndex(entriesNum,
                                                          region.coords,
                                                          region.classes + isObjConf,
                                                          n * entriesNum + i,
                                                          region.coords + isObjConf + j);
                    // Final score = objectness (or 1) times the class score
                    float prob = scale * postprocessRawData(outData[class_index]);

                    //--- Checking confidence threshold conformance and adding region to the list
                    // The same box may be emitted several times, once per class that passes
                    if (prob >= confidenceThreshold) {
                        obj.confidence = prob;
                        obj.labelID = j;
                        obj.label = getLabelName(obj.labelID);
                        objects.push_back(obj);
                    }
                }
            }
        }
    }
}

// Computes the flat offset of one value inside a YOLO output buffer.
//
// The buffer is treated as a sequence of planes of `totalCells` values each; every anchor
// owns (lcoords + lclasses) consecutive planes.
//   totalCells - number of grid cells in one plane
//   lcoords    - number of box-coordinate planes per anchor
//   lclasses   - number of remaining planes per anchor
//   location   - anchorIndex * totalCells + cellIndex
//   entry      - plane to address within the anchor
// Returns the offset of that value.
int ModelYolo::calculateEntryIndex(int totalCells, int lcoords, size_t lclasses, int location, int entry) {
    const int anchorIdx = location / totalCells;
    const int cellIdx = location % totalCells;
    const auto planesPerAnchor = lcoords + lclasses;
    const auto planeIdx = anchorIdx * planesPerAnchor + entry;
    return planeIdx * totalCells + cellIdx;
}

// Intersection-over-union of two boxes given as top-left corner (x, y) plus width/height.
//
// Returns intersection area divided by union area. Boxes that do not overlap yield 0.
// When the union area is not positive (e.g. both boxes have zero area) the ratio is
// undefined; 0 is returned instead of NaN so that callers see "no overlap".
double ModelYolo::intersectionOverUnion(const DetectedObject& o1, const DetectedObject& o2) {
    double overlappingWidth = fmin(o1.x + o1.width, o2.x + o2.width) - fmax(o1.x, o2.x);
    double overlappingHeight = fmin(o1.y + o1.height, o2.y + o2.height) - fmax(o1.y, o2.y);
    // A negative extent on either axis means the boxes are disjoint
    double intersectionArea =
        (overlappingWidth < 0 || overlappingHeight < 0) ? 0 : overlappingHeight * overlappingWidth;
    double unionArea = o1.width * o1.height + o2.width * o2.height - intersectionArea;
    // Written as a negated comparison so that a NaN area is rejected as well
    if (!(unionArea > 0)) {
        return 0.0;
    }
    return intersectionArea / unionArea;
}

// Builds a region description from a RegionYolo operation of the model graph.
//
// Coordinates/classes counts and the grid size (dims 3 and 2 of the op's first input)
// are copied from the operation. With a non-empty mask only the anchor pairs selected
// by the mask are kept (YoloV3 style). With an empty mask all anchors of the operation
// are used (YoloV2 style); if the operation has none, the built-in YOLOv1v2 defaults
// with 5 anchors are used.
ModelYolo::Region::Region(const std::shared_ptr<ov::op::v0::RegionYolo>& regionYolo) {
    coords = regionYolo->get_num_coords();
    classes = regionYolo->get_num_classes();

    const auto& inputShape = regionYolo->get_input_shape(0);
    outputHeight = inputShape[2];
    outputWidth = inputShape[3];

    const auto& maskIndices = regionYolo->get_mask();
    num = maskIndices.size();

    if (!num) {
        // Parsing YoloV2 parameters
        num = regionYolo->get_num_regions();
        anchors = regionYolo->get_anchors();
        if (anchors.empty()) {
            anchors = defaultAnchors[YOLO_V1V2];
            num = 5;
        }
        return;
    }

    // Parsing YoloV3 parameters
    const auto& allAnchors = regionYolo->get_anchors();
    anchors.resize(num * 2);
    for (int dst = 0; dst < num; ++dst) {
        const auto src = maskIndices[dst] * 2;
        anchors[dst * 2] = allAnchors[src];
        anchors[dst * 2 + 1] = allAnchors[src + 1];
    }
}

// Builds a region description from explicitly supplied parameters.
//
//   classes      - number of classes predicted per anchor
//   coords       - number of box coordinates per anchor
//   anchors      - flat list of (width, height) pairs; must be non-empty and even-sized
//   masks        - indices of the anchor pairs this region uses; when empty, every
//                  pair of `anchors` is used
//   outputWidth  - grid width of the output
//   outputHeight - grid height of the output
//
// Throws std::runtime_error when `anchors` is empty or odd-sized, or when a mask value
// does not refer to an existing anchor pair.
ModelYolo::Region::Region(size_t classes,
                          int coords,
                          const std::vector<float>& anchors,
                          const std::vector<int64_t>& masks,
                          size_t outputWidth,
                          size_t outputHeight)
    : classes(classes),
      coords(coords),
      outputWidth(outputWidth),
      outputHeight(outputHeight) {
    num = masks.size();

    if (anchors.size() == 0 || anchors.size() % 2 != 0) {
        throw std::runtime_error("Explicitly initialized region should have non-empty even-sized regions vector");
    }

    if (num) {
        const size_t anchorPairs = anchors.size() / 2;
        this->anchors.resize(num * 2);

        for (int i = 0; i < num; ++i) {
            // Masks may come from user configuration: reject values that would read outside `anchors`
            const int64_t maskValue = masks[i];
            if (maskValue < 0 || static_cast<uint64_t>(maskValue) >= anchorPairs) {
                throw std::runtime_error(std::string("Mask value ") + std::to_string(maskValue) +
                                         " is out of range, the anchors array has only " +
                                         std::to_string(anchorPairs) + " pairs");
            }
            this->anchors[i * 2] = anchors[maskValue * 2];
            this->anchors[i * 2 + 1] = anchors[maskValue * 2 + 1];
        }
    } else {
        this->anchors = anchors;
        num = anchors.size() / 2;
    }
}