483 lines
19 KiB
C++
483 lines
19 KiB
C++
/*
|
|
// Copyright (C) 2020-2024 Intel Corporation
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
*/
|
|
|
|
#include "models/detection_model_yolo.h"
|
|
|
|
#include <algorithm>
|
|
#include <cmath>
|
|
#include <cstdint>
|
|
#include <stdexcept>
|
|
#include <string>
|
|
#include <utility>
|
|
#include <vector>
|
|
|
|
#include <openvino/openvino.hpp>
|
|
|
|
#include <utils/common.hpp>
|
|
#include <utils/slog.hpp>
|
|
|
|
#include "models/internal_model_data.h"
|
|
#include "models/results.h"
|
|
|
|
std::vector<float> defaultAnchors[] = {
|
|
// YOLOv1v2
|
|
{0.57273f, 0.677385f, 1.87446f, 2.06253f, 3.33843f, 5.47434f, 7.88282f, 3.52778f, 9.77052f, 9.16828f},
|
|
// YOLOv3
|
|
{10.0f,
|
|
13.0f,
|
|
16.0f,
|
|
30.0f,
|
|
33.0f,
|
|
23.0f,
|
|
30.0f,
|
|
61.0f,
|
|
62.0f,
|
|
45.0f,
|
|
59.0f,
|
|
119.0f,
|
|
116.0f,
|
|
90.0f,
|
|
156.0f,
|
|
198.0f,
|
|
373.0f,
|
|
326.0f},
|
|
// YOLOv4
|
|
{12.0f,
|
|
16.0f,
|
|
19.0f,
|
|
36.0f,
|
|
40.0f,
|
|
28.0f,
|
|
36.0f,
|
|
75.0f,
|
|
76.0f,
|
|
55.0f,
|
|
72.0f,
|
|
146.0f,
|
|
142.0f,
|
|
110.0f,
|
|
192.0f,
|
|
243.0f,
|
|
459.0f,
|
|
401.0f},
|
|
// YOLOv4_Tiny
|
|
{10.0f, 14.0f, 23.0f, 27.0f, 37.0f, 58.0f, 81.0f, 82.0f, 135.0f, 169.0f, 344.0f, 319.0f},
|
|
// YOLOF
|
|
{16.0f, 16.0f, 32.0f, 32.0f, 64.0f, 64.0f, 128.0f, 128.0f, 256.0f, 256.0f, 512.0f, 512.0f}};
|
|
|
|
const std::vector<int64_t> defaultMasks[] = {
|
|
// YOLOv1v2
|
|
{},
|
|
// YOLOv3
|
|
{},
|
|
// YOLOv4
|
|
{0, 1, 2, 3, 4, 5, 6, 7, 8},
|
|
// YOLOv4_Tiny
|
|
{1, 2, 3, 3, 4, 5},
|
|
// YOLOF
|
|
{0, 1, 2, 3, 4, 5}};
|
|
|
|
static inline float sigmoid(float x) {
|
|
return 1.f / (1.f + exp(-x));
|
|
}
|
|
|
|
static inline float linear(float x) {
|
|
return x;
|
|
}
|
|
|
|
ModelYolo::ModelYolo(const std::string& modelFileName,
|
|
float confidenceThreshold,
|
|
bool useAutoResize,
|
|
bool useAdvancedPostprocessing,
|
|
float boxIOUThreshold,
|
|
const std::vector<std::string>& labels,
|
|
const std::vector<float>& anchors,
|
|
const std::vector<int64_t>& masks,
|
|
const std::string& layout)
|
|
: DetectionModel(modelFileName, confidenceThreshold, useAutoResize, labels, layout),
|
|
boxIOUThreshold(boxIOUThreshold),
|
|
useAdvancedPostprocessing(useAdvancedPostprocessing),
|
|
yoloVersion(YOLO_V3),
|
|
presetAnchors(anchors),
|
|
presetMasks(masks) {}
|
|
|
|
void ModelYolo::prepareInputsOutputs(std::shared_ptr<ov::Model>& model) {
|
|
// --------------------------- Configure input & output -------------------------------------------------
|
|
// --------------------------- Prepare input ------------------------------------------------------
|
|
if (model->inputs().size() != 1) {
|
|
throw std::logic_error("YOLO model wrapper accepts models that have only 1 input");
|
|
}
|
|
|
|
const auto& input = model->input();
|
|
const ov::Shape& inputShape = model->input().get_shape();
|
|
ov::Layout inputLayout = getInputLayout(input);
|
|
|
|
if (inputShape[ov::layout::channels_idx(inputLayout)] != 3) {
|
|
throw std::logic_error("Expected 3-channel input");
|
|
}
|
|
|
|
ov::preprocess::PrePostProcessor ppp(model);
|
|
inputTransform.setPrecision(ppp, input.get_any_name());
|
|
ppp.input().tensor().set_layout({"NHWC"});
|
|
|
|
if (useAutoResize) {
|
|
ppp.input().tensor().set_spatial_dynamic_shape();
|
|
|
|
ppp.input()
|
|
.preprocess()
|
|
.convert_element_type(ov::element::f32)
|
|
.resize(ov::preprocess::ResizeAlgorithm::RESIZE_LINEAR);
|
|
}
|
|
|
|
ppp.input().model().set_layout(inputLayout);
|
|
|
|
//--- Reading image input parameters
|
|
inputsNames.push_back(model->input().get_any_name());
|
|
netInputWidth = inputShape[ov::layout::width_idx(inputLayout)];
|
|
netInputHeight = inputShape[ov::layout::height_idx(inputLayout)];
|
|
|
|
// --------------------------- Prepare output -----------------------------------------------------
|
|
const ov::OutputVector& outputs = model->outputs();
|
|
std::map<std::string, ov::Shape> outShapes;
|
|
for (auto& out : outputs) {
|
|
ppp.output(out.get_any_name()).tensor().set_element_type(ov::element::f32);
|
|
if (out.get_shape().size() == 4) {
|
|
if (out.get_shape()[ov::layout::height_idx("NCHW")] != out.get_shape()[ov::layout::width_idx("NCHW")] &&
|
|
out.get_shape()[ov::layout::height_idx("NHWC")] == out.get_shape()[ov::layout::width_idx("NHWC")]) {
|
|
ppp.output(out.get_any_name()).model().set_layout("NHWC");
|
|
// outShapes are saved before ppp.build() thus set yoloRegionLayout as it is in model before ppp.build()
|
|
yoloRegionLayout = "NHWC";
|
|
}
|
|
// yolo-v1-tiny-tf out shape is [1, 21125] thus set layout only for 4 dim tensors
|
|
ppp.output(out.get_any_name()).tensor().set_layout("NCHW");
|
|
}
|
|
outputsNames.push_back(out.get_any_name());
|
|
outShapes[out.get_any_name()] = out.get_shape();
|
|
}
|
|
model = ppp.build();
|
|
|
|
yoloVersion = YOLO_V3;
|
|
bool isRegionFound = false;
|
|
for (const auto& op : model->get_ordered_ops()) {
|
|
if (std::string("RegionYolo") == op->get_type_name()) {
|
|
auto regionYolo = std::dynamic_pointer_cast<ov::op::v0::RegionYolo>(op);
|
|
|
|
if (regionYolo) {
|
|
if (!regionYolo->get_mask().size()) {
|
|
yoloVersion = YOLO_V1V2;
|
|
}
|
|
|
|
const auto& opName = op->get_friendly_name();
|
|
for (const auto& out : outputs) {
|
|
if (out.get_node()->get_friendly_name() == opName ||
|
|
out.get_node()->get_input_node_ptr(0)->get_friendly_name() == opName) {
|
|
isRegionFound = true;
|
|
regions.emplace(out.get_any_name(), Region(regionYolo));
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
if (!isRegionFound) {
|
|
switch (outputsNames.size()) {
|
|
case 1:
|
|
yoloVersion = YOLOF;
|
|
break;
|
|
case 2:
|
|
yoloVersion = YOLO_V4_TINY;
|
|
break;
|
|
case 3:
|
|
yoloVersion = YOLO_V4;
|
|
break;
|
|
}
|
|
|
|
int num = yoloVersion == YOLOF ? 6 : 3;
|
|
isObjConf = yoloVersion == YOLOF ? 0 : 1;
|
|
int i = 0;
|
|
|
|
auto chosenMasks = presetMasks.size() ? presetMasks : defaultMasks[yoloVersion];
|
|
if (chosenMasks.size() != num * outputs.size()) {
|
|
throw std::runtime_error(std::string("Invalid size of masks array, got ") +
|
|
std::to_string(presetMasks.size()) + ", should be " +
|
|
std::to_string(num * outputs.size()));
|
|
}
|
|
|
|
std::sort(outputsNames.begin(),
|
|
outputsNames.end(),
|
|
[&outShapes, this](const std::string& x, const std::string& y) {
|
|
return outShapes[x][ov::layout::height_idx(yoloRegionLayout)] >
|
|
outShapes[y][ov::layout::height_idx(yoloRegionLayout)];
|
|
});
|
|
|
|
for (const auto& name : outputsNames) {
|
|
const auto& shape = outShapes[name];
|
|
if (shape[ov::layout::channels_idx(yoloRegionLayout)] % num != 0) {
|
|
throw std::logic_error(std::string("Output tensor ") + name + " has wrong channel dimension");
|
|
}
|
|
regions.emplace(
|
|
name,
|
|
Region(shape[ov::layout::channels_idx(yoloRegionLayout)] / num - 4 - (isObjConf ? 1 : 0),
|
|
4,
|
|
presetAnchors.size() ? presetAnchors : defaultAnchors[yoloVersion],
|
|
std::vector<int64_t>(chosenMasks.begin() + i * num, chosenMasks.begin() + (i + 1) * num),
|
|
shape[ov::layout::width_idx(yoloRegionLayout)],
|
|
shape[ov::layout::height_idx(yoloRegionLayout)]));
|
|
i++;
|
|
}
|
|
} else {
|
|
// Currently externally set anchors and masks are supported only for YoloV4
|
|
if (presetAnchors.size() || presetMasks.size()) {
|
|
slog::warn << "Preset anchors and mask can be set for YoloV4 model only. "
|
|
"This model is not YoloV4, so these options will be ignored."
|
|
<< slog::endl;
|
|
}
|
|
}
|
|
}
|
|
|
|
std::unique_ptr<ResultBase> ModelYolo::postprocess(InferenceResult& infResult) {
|
|
DetectionResult* result = new DetectionResult(infResult.frameId, infResult.metaData);
|
|
std::vector<DetectedObject> objects;
|
|
|
|
// Parsing outputs
|
|
const auto& internalData = infResult.internalModelData->asRef<InternalImageModelData>();
|
|
|
|
for (auto& output : infResult.outputsData) {
|
|
this->parseYOLOOutput(output.first,
|
|
output.second,
|
|
netInputHeight,
|
|
netInputWidth,
|
|
internalData.inputImgHeight,
|
|
internalData.inputImgWidth,
|
|
objects);
|
|
}
|
|
|
|
if (useAdvancedPostprocessing) {
|
|
// Advanced postprocessing
|
|
// Checking IOU threshold conformance
|
|
// For every i-th object we're finding all objects it intersects with, and comparing confidence
|
|
// If i-th object has greater confidence than all others, we include it into result
|
|
for (const auto& obj1 : objects) {
|
|
bool isGoodResult = true;
|
|
for (const auto& obj2 : objects) {
|
|
if (obj1.labelID == obj2.labelID && obj1.confidence < obj2.confidence &&
|
|
intersectionOverUnion(obj1, obj2) >= boxIOUThreshold) { // if obj1 is the same as obj2, condition
|
|
// expression will evaluate to false anyway
|
|
isGoodResult = false;
|
|
break;
|
|
}
|
|
}
|
|
if (isGoodResult) {
|
|
result->objects.push_back(obj1);
|
|
}
|
|
}
|
|
} else {
|
|
// Classic postprocessing
|
|
std::sort(objects.begin(), objects.end(), [](const DetectedObject& x, const DetectedObject& y) {
|
|
return x.confidence > y.confidence;
|
|
});
|
|
for (size_t i = 0; i < objects.size(); ++i) {
|
|
if (objects[i].confidence == 0)
|
|
continue;
|
|
for (size_t j = i + 1; j < objects.size(); ++j)
|
|
if (intersectionOverUnion(objects[i], objects[j]) >= boxIOUThreshold)
|
|
objects[j].confidence = 0;
|
|
result->objects.push_back(objects[i]);
|
|
}
|
|
}
|
|
|
|
return std::unique_ptr<ResultBase>(result);
|
|
}
|
|
|
|
void ModelYolo::parseYOLOOutput(const std::string& output_name,
|
|
const ov::Tensor& tensor,
|
|
const unsigned long resized_im_h,
|
|
const unsigned long resized_im_w,
|
|
const unsigned long original_im_h,
|
|
const unsigned long original_im_w,
|
|
std::vector<DetectedObject>& objects) {
|
|
// --------------------------- Extracting layer parameters -------------------------------------
|
|
auto it = regions.find(output_name);
|
|
if (it == regions.end()) {
|
|
throw std::runtime_error(std::string("Can't find output layer with name ") + output_name);
|
|
}
|
|
auto& region = it->second;
|
|
|
|
int sideW = 0;
|
|
int sideH = 0;
|
|
unsigned long scaleH;
|
|
unsigned long scaleW;
|
|
switch (yoloVersion) {
|
|
case YOLO_V1V2:
|
|
sideH = region.outputHeight;
|
|
sideW = region.outputWidth;
|
|
scaleW = region.outputWidth;
|
|
scaleH = region.outputHeight;
|
|
break;
|
|
case YOLO_V3:
|
|
case YOLO_V4:
|
|
case YOLO_V4_TINY:
|
|
case YOLOF:
|
|
sideH = static_cast<int>(tensor.get_shape()[ov::layout::height_idx("NCHW")]);
|
|
sideW = static_cast<int>(tensor.get_shape()[ov::layout::width_idx("NCHW")]);
|
|
scaleW = resized_im_w;
|
|
scaleH = resized_im_h;
|
|
break;
|
|
}
|
|
|
|
auto entriesNum = sideW * sideH;
|
|
const float* outData = tensor.data<float>();
|
|
|
|
auto postprocessRawData =
|
|
(yoloVersion == YOLO_V4 || yoloVersion == YOLO_V4_TINY || yoloVersion == YOLOF) ? sigmoid : linear;
|
|
|
|
// --------------------------- Parsing YOLO Region output -------------------------------------
|
|
for (int i = 0; i < entriesNum; ++i) {
|
|
int row = i / sideW;
|
|
int col = i % sideW;
|
|
for (int n = 0; n < region.num; ++n) {
|
|
//--- Getting region data
|
|
int obj_index = calculateEntryIndex(entriesNum,
|
|
region.coords,
|
|
region.classes + isObjConf,
|
|
n * entriesNum + i,
|
|
region.coords);
|
|
int box_index =
|
|
calculateEntryIndex(entriesNum, region.coords, region.classes + isObjConf, n * entriesNum + i, 0);
|
|
float scale = isObjConf ? postprocessRawData(outData[obj_index]) : 1;
|
|
|
|
//--- Preliminary check for confidence threshold conformance
|
|
if (scale >= confidenceThreshold) {
|
|
//--- Calculating scaled region's coordinates
|
|
float x, y;
|
|
if (yoloVersion == YOLOF) {
|
|
x = (static_cast<float>(col) / sideW +
|
|
outData[box_index + 0 * entriesNum] * region.anchors[2 * n] / scaleW) *
|
|
original_im_w;
|
|
y = (static_cast<float>(row) / sideH +
|
|
outData[box_index + 1 * entriesNum] * region.anchors[2 * n + 1] / scaleH) *
|
|
original_im_h;
|
|
} else {
|
|
x = static_cast<float>((col + postprocessRawData(outData[box_index + 0 * entriesNum])) / sideW *
|
|
original_im_w);
|
|
y = static_cast<float>((row + postprocessRawData(outData[box_index + 1 * entriesNum])) / sideH *
|
|
original_im_h);
|
|
}
|
|
float height = static_cast<float>(std::exp(outData[box_index + 3 * entriesNum]) *
|
|
region.anchors[2 * n + 1] * original_im_h / scaleH);
|
|
float width = static_cast<float>(std::exp(outData[box_index + 2 * entriesNum]) * region.anchors[2 * n] *
|
|
original_im_w / scaleW);
|
|
|
|
DetectedObject obj;
|
|
obj.x = clamp(x - width / 2, 0.f, static_cast<float>(original_im_w));
|
|
obj.y = clamp(y - height / 2, 0.f, static_cast<float>(original_im_h));
|
|
obj.width = clamp(width, 0.f, static_cast<float>(original_im_w - obj.x));
|
|
obj.height = clamp(height, 0.f, static_cast<float>(original_im_h - obj.y));
|
|
|
|
for (size_t j = 0; j < region.classes; ++j) {
|
|
int class_index = calculateEntryIndex(entriesNum,
|
|
region.coords,
|
|
region.classes + isObjConf,
|
|
n * entriesNum + i,
|
|
region.coords + isObjConf + j);
|
|
float prob = scale * postprocessRawData(outData[class_index]);
|
|
|
|
//--- Checking confidence threshold conformance and adding region to the list
|
|
if (prob >= confidenceThreshold) {
|
|
obj.confidence = prob;
|
|
obj.labelID = j;
|
|
obj.label = getLabelName(obj.labelID);
|
|
objects.push_back(obj);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
int ModelYolo::calculateEntryIndex(int totalCells, int lcoords, size_t lclasses, int location, int entry) {
|
|
int n = location / totalCells;
|
|
int loc = location % totalCells;
|
|
return (n * (lcoords + lclasses) + entry) * totalCells + loc;
|
|
}
|
|
|
|
double ModelYolo::intersectionOverUnion(const DetectedObject& o1, const DetectedObject& o2) {
|
|
double overlappingWidth = fmin(o1.x + o1.width, o2.x + o2.width) - fmax(o1.x, o2.x);
|
|
double overlappingHeight = fmin(o1.y + o1.height, o2.y + o2.height) - fmax(o1.y, o2.y);
|
|
double intersectionArea =
|
|
(overlappingWidth < 0 || overlappingHeight < 0) ? 0 : overlappingHeight * overlappingWidth;
|
|
double unionArea = o1.width * o1.height + o2.width * o2.height - intersectionArea;
|
|
return intersectionArea / unionArea;
|
|
}
|
|
|
|
ModelYolo::Region::Region(const std::shared_ptr<ov::op::v0::RegionYolo>& regionYolo) {
|
|
coords = regionYolo->get_num_coords();
|
|
classes = regionYolo->get_num_classes();
|
|
auto mask = regionYolo->get_mask();
|
|
num = mask.size();
|
|
|
|
auto shape = regionYolo->get_input_shape(0);
|
|
outputWidth = shape[3];
|
|
outputHeight = shape[2];
|
|
|
|
if (num) {
|
|
// Parsing YoloV3 parameters
|
|
anchors.resize(num * 2);
|
|
|
|
for (int i = 0; i < num; ++i) {
|
|
anchors[i * 2] = regionYolo->get_anchors()[mask[i] * 2];
|
|
anchors[i * 2 + 1] = regionYolo->get_anchors()[mask[i] * 2 + 1];
|
|
}
|
|
} else {
|
|
// Parsing YoloV2 parameters
|
|
num = regionYolo->get_num_regions();
|
|
anchors = regionYolo->get_anchors();
|
|
if (anchors.empty()) {
|
|
anchors = defaultAnchors[YOLO_V1V2];
|
|
num = 5;
|
|
}
|
|
}
|
|
}
|
|
|
|
ModelYolo::Region::Region(size_t classes,
|
|
int coords,
|
|
const std::vector<float>& anchors,
|
|
const std::vector<int64_t>& masks,
|
|
size_t outputWidth,
|
|
size_t outputHeight)
|
|
: classes(classes),
|
|
coords(coords),
|
|
outputWidth(outputWidth),
|
|
outputHeight(outputHeight) {
|
|
num = masks.size();
|
|
|
|
if (anchors.size() == 0 || anchors.size() % 2 != 0) {
|
|
throw std::runtime_error("Explicitly initialized region should have non-empty even-sized regions vector");
|
|
}
|
|
|
|
if (num) {
|
|
this->anchors.resize(num * 2);
|
|
|
|
for (int i = 0; i < num; ++i) {
|
|
this->anchors[i * 2] = anchors[masks[i] * 2];
|
|
this->anchors[i * 2 + 1] = anchors[masks[i] * 2 + 1];
|
|
}
|
|
} else {
|
|
this->anchors = anchors;
|
|
num = anchors.size() / 2;
|
|
}
|
|
}
|