// Source: ANSCORE/modules/ANSFR/RetinaFaceTRT.cpp (366 lines, 16 KiB, C++)
#include "RetinaFaceTRT.h"
// This is standalone Retina Face detector using TensorRT (We will not use as it is not inherit the ANSFD class)
namespace ANSCENTER {
RetinaFaceTRT::RetinaFaceTRT()
{
    // Start from an empty detection list.
    m_outputBbox.clear();
    // BUG FIX: null the host output buffers so the destructor can safely
    // delete[] them even when Initialize() was never called (they are only
    // allocated inside Initialize()).
    m_output0 = nullptr;
    m_output1 = nullptr;
}
bool RetinaFaceTRT::Initialize(const std::string engineFile,
int frameWidth,
int frameHeight,
std::string inputName,
std::vector<std::string> outputNames,
std::vector<int> inputShape,
int maxBatchSize,
int maxFacesPerScene,
float nms_threshold,
float bbox_threshold)
{
try
{
assert(inputShape.size() == 3);
m_INPUT_C = static_cast<const int>(inputShape[0]);
m_INPUT_H = static_cast<const int>(inputShape[1]);
m_INPUT_W = static_cast<const int>(inputShape[2]);
m_INPUT_SIZE = static_cast<const int>(m_INPUT_C * m_INPUT_H * m_INPUT_W * sizeof(float));
m_OUTPUT_SIZE_BASE = static_cast<const int>((m_INPUT_H / 8 * m_INPUT_W / 8 + m_INPUT_H / 16 * m_INPUT_W / 16 + m_INPUT_H / 32 * m_INPUT_W / 32) * 2);
m_output0 = new float[m_OUTPUT_SIZE_BASE * 4];
m_output1 = new float[m_OUTPUT_SIZE_BASE * 2];
m_maxBatchSize = static_cast<const int>(maxBatchSize);
m_maxFacesPerScene = static_cast<const int>(maxFacesPerScene);
m_nms_threshold = static_cast<const float>(nms_threshold);
m_bbox_threshold = static_cast<const float>(bbox_threshold);
// load engine from .engine file
LoadEngine(engineFile);
// create stream and pre-allocate GPU buffers memory
PreInference(inputName, outputNames);
return true;
}
catch (std::exception& e) {
this->_logger.LogFatal("RetinaFace::Initialize", e.what(), __FILE__, __LINE__);
return false;
}
}
void RetinaFaceTRT::LoadEngine(const std::string engineFile)
{
    // Deserializes a TensorRT engine from disk into m_engine and creates the
    // execution context m_context. On failure an error is logged and the
    // members are left untouched.
    try {
        if (!FileExist(engineFile)) {
            this->_logger.LogError("RetinaFace::LoadEngine", "Cant find engine file", __FILE__, __LINE__);
            return;
        }
        this->_logger.LogDebug("RetinaFace::LoadEngine", "Loading RetinaFace Engine...", __FILE__, __LINE__);
        std::ifstream file(engineFile, std::ios::binary);
        if (!file.good()) {
            // BUG FIX: previously fell through and deserialized an empty buffer
            // when the file could not be opened (asserts are no-ops in release).
            this->_logger.LogError("RetinaFace::LoadEngine", "Cant find engine file", __FILE__, __LINE__);
            return;
        }
        file.seekg(0, file.end);
        const size_t size = static_cast<size_t>(file.tellg());
        file.seekg(0, file.beg);
        std::vector<char> trtModelStream_(size);
        file.read(trtModelStream_.data(), size);
        if (!file) {
            // BUG FIX: a short/failed read previously went undetected.
            this->_logger.LogError("RetinaFace::LoadEngine", "Cant find engine file", __FILE__, __LINE__);
            return;
        }
        file.close();
        // NOTE(review): the runtime is never destroyed. On TensorRT 10+ the
        // runtime must outlive the engine, so it is deliberately kept alive;
        // consider storing it as a member so it can be released in ~RetinaFaceTRT.
        nvinfer1::IRuntime* runtime = nvinfer1::createInferRuntime(m_logger);
        assert(runtime != nullptr);
        m_engine = runtime->deserializeCudaEngine(trtModelStream_.data(), size);
        assert(m_engine != nullptr);
        m_context = m_engine->createExecutionContext();
        assert(m_context != nullptr);
    }
    catch (std::exception& e) {
        // BUG FIX: log context previously said "RetinaFace::Initialize".
        this->_logger.LogFatal("RetinaFace::LoadEngine", e.what(), __FILE__, __LINE__);
    }
}
void RetinaFaceTRT::PreInference(std::string inputName, std::vector<std::string> outputNames) {
    // Resolves the engine's I/O tensor indices, allocates the device buffers
    // and creates the CUDA stream used by RunInference().
    // The landmark head is not bound because face alignment is not used.
    try {
        assert(outputNames.size() == 2);
        // Reset indices so a failed name lookup is detectable below.
        inputIndex = -1;
        outputIndex0 = -1;
        outputIndex1 = -1;
#if NV_TENSORRT_MAJOR >= 10
        // TRT 10+: use the named-tensor API.
        assert(m_engine->getNbIOTensors() == 3);
        // Look up tensor indices by name.
        for (int i = 0; i < m_engine->getNbIOTensors(); ++i) {
            const char* name = m_engine->getIOTensorName(i);
            if (name == inputName) inputIndex = i;
            else if (name == outputNames[0]) outputIndex0 = i;
            else if (name == outputNames[1]) outputIndex1 = i;
        }
#else
        // TRT 8.x: use the binding API (getBindingIndex returns -1 for unknown names).
        assert(m_engine->getNbBindings() == 3);
        inputIndex = m_engine->getBindingIndex(inputName.c_str());
        outputIndex0 = m_engine->getBindingIndex(outputNames[0].c_str());
        outputIndex1 = m_engine->getBindingIndex(outputNames[1].c_str());
#endif
        // BUG FIX: an unknown tensor name previously led to indexing buffers[]
        // with a negative/stale index (undefined behaviour) in cudaMalloc below.
        if (inputIndex < 0 || outputIndex0 < 0 || outputIndex1 < 0) {
            this->_logger.LogError("RetinaFace::PreInference", "Tensor name not found in engine", __FILE__, __LINE__);
            return;
        }
        // Create GPU buffers on the device.
        ANSFRHelper::CheckCudaStatus(cudaMalloc(&buffers[inputIndex], m_maxBatchSize * m_INPUT_SIZE));
        ANSFRHelper::CheckCudaStatus(cudaMalloc(&buffers[outputIndex0], m_maxBatchSize * m_OUTPUT_SIZE_BASE * 4 * sizeof(float)));
        ANSFRHelper::CheckCudaStatus(cudaMalloc(&buffers[outputIndex1], m_maxBatchSize * m_OUTPUT_SIZE_BASE * 2 * sizeof(float)));
#if NV_TENSORRT_MAJOR >= 10
        // TRT 10+: bind tensor addresses once; enqueueV3 uses them.
        m_context->setTensorAddress(inputName.c_str(), buffers[inputIndex]);
        m_context->setTensorAddress(outputNames[0].c_str(), buffers[outputIndex0]);
        m_context->setTensorAddress(outputNames[1].c_str(), buffers[outputIndex1]);
#endif
        // Create the stream shared by all async copies and inference.
        ANSFRHelper::CheckCudaStatus(cudaStreamCreate(&stream));
    }
    catch (std::exception& e) {
        this->_logger.LogFatal("RetinaFace::PreInference", e.what(), __FILE__, __LINE__);
    }
}
void RetinaFaceTRT::PreProcess(cv::Mat& img) {
    // Letterboxes the frame onto the network canvas, subtracts the per-channel
    // means and stores the result as planar (CHW) float data in m_input.
    try {
        // Drop data from the previous frame.
        m_input.release();
        // Scale by the smaller factor and centre the image; the border is
        // padded with grey (128,128,128).
        float resizedW, resizedH, offsetX, offsetY;
        if (m_scale_h > m_scale_w) {
            resizedW = float(m_INPUT_W);
            resizedH = float(m_scale_w * img.rows);
            offsetX = 0;
            offsetY = float((m_INPUT_H - resizedH) / 2);
        }
        else {
            resizedW = float(m_scale_h * img.cols);
            resizedH = float(m_INPUT_H);
            offsetX = float((m_INPUT_W - resizedW) / 2);
            offsetY = 0;
        }
        cv::Mat resized((int)resizedH, (int)resizedW, CV_8UC3);
        cv::resize(img, resized, resized.size(), 0, 0, cv::INTER_LINEAR);
        cv::Mat canvas((int)m_INPUT_H, (int)m_INPUT_W, CV_8UC3, cv::Scalar(128, 128, 128));
        resized.copyTo(canvas(cv::Rect(int(offsetX), int(offsetY), resized.cols, resized.rows)));
        // Mean subtraction (BGR means 104/117/123), then split into planes.
        canvas.convertTo(canvas, CV_32F);
        canvas = canvas - cv::Scalar(104, 117, 123);
        std::vector<cv::Mat> planes;
        cv::split(canvas, planes);
        for (const cv::Mat& plane : planes) {
            m_input.push_back(plane);
        }
    }
    catch (std::exception& e) {
        this->_logger.LogFatal("RetinaFace::PreProcess", e.what(), __FILE__, __LINE__);
    }
}
void RetinaFaceTRT::RunInference(float* input, float* output0, float* output1) {
    // Copies the preprocessed batch to the device, runs the network on the
    // stream and copies both output tensors back to the host buffers.
    // Blocks until all work on the stream has completed.
    try {
        // DMA input batch data to device, infer on the batch asynchronously,
        // and DMA output back to host.
        ANSCENTER::ANSFRHelper::CheckCudaStatus(cudaMemcpyAsync(buffers[inputIndex], input, m_maxBatchSize * m_INPUT_SIZE, cudaMemcpyHostToDevice, stream));
#if NV_TENSORRT_MAJOR >= 10
        // Tensor addresses were bound in PreInference via setTensorAddress.
        m_context->enqueueV3(stream);
#else
        m_context->enqueueV2(buffers, stream, nullptr);
#endif
        ANSFRHelper::CheckCudaStatus(cudaMemcpyAsync(output0, buffers[outputIndex0], m_maxBatchSize * m_OUTPUT_SIZE_BASE * 4 * sizeof(float), cudaMemcpyDeviceToHost, stream));
        ANSFRHelper::CheckCudaStatus(cudaMemcpyAsync(output1, buffers[outputIndex1], m_maxBatchSize * m_OUTPUT_SIZE_BASE * 2 * sizeof(float), cudaMemcpyDeviceToHost, stream));
        // BUG FIX: synchronization errors were previously silently ignored;
        // check the status like every other CUDA call in this file.
        ANSFRHelper::CheckCudaStatus(cudaStreamSynchronize(stream));
    }
    catch (std::exception& e) {
        this->_logger.LogFatal("RetinaFace::RunInference", e.what(), __FILE__, __LINE__);
    }
}
std::vector<struct Bbox> RetinaFaceTRT::FindFace(cv::Mat& img) {
    // Full detection pipeline: preprocess -> inference -> postprocess.
    // Returns the detected face boxes (empty on failure).
    try {
        m_outputBbox.clear();
        // Remember the source resolution and the per-axis scale factors used
        // by both pre- and post-processing.
        m_frameWidth = img.cols;
        m_frameHeight = img.rows;
        m_scale_h = float(m_INPUT_H) / float(m_frameHeight);
        m_scale_w = float(m_INPUT_W) / float(m_frameWidth);
        PreProcess(img);
        RunInference((float*)m_input.ptr<float>(0), m_output0, m_output1);
        PostProcessing(m_output0, m_output1);
    }
    catch (std::exception& e) {
        m_outputBbox.clear();
        this->_logger.LogFatal("RetinaFace::FindFace", e.what(), __FILE__, __LINE__);
    }
    return m_outputBbox;
}
void RetinaFaceTRT::PostProcessing(float* bbox, float* conf) {
// Decodes raw network outputs into face boxes in source-image coordinates.
// bbox: 4 regression deltas per anchor; conf: 2 scores per anchor, of which
// only the face score *(conf + 1) is used. Results are written to
// m_outputBbox, sorted by score, NMS-filtered and truncated to
// m_maxFacesPerScene.
// NOTE(review): throughout this function x denotes the vertical axis and y
// the horizontal one (see the "(y - W; x - H)" comment below) — confirm
// against the Bbox consumers.
try {
m_outputBbox.clear();
std::vector<AnchorBox> anchor;
// One prior per output slot, generated in the same order as the network output.
CreateAnchorRetinaface(anchor, m_INPUT_W, m_INPUT_H);
for (int i = 0; i < anchor.size(); ++i) {
// *(conf + 1) is the face-class score of the current anchor.
if (*(conf + 1) > m_bbox_threshold) {
AnchorBox tmp = anchor[i];
AnchorBox tmp1;
Bbox result{};
// decode bbox (y - W; x - H)
// Standard RetinaFace decoding: centre offset scaled by 0.1, size by exp(0.2*delta).
tmp1.cx = float(tmp.cx + *bbox * 0.1 * tmp.sx);
tmp1.cy = float(tmp.cy + *(bbox + 1) * 0.1 * tmp.sy);
tmp1.sx = float(tmp.sx * exp(*(bbox + 2) * 0.2));
tmp1.sy = float(tmp.sy * exp(*(bbox + 3) * 0.2));
// Convert normalized centre/size to network-resolution corner coordinates.
result.y1 = int((tmp1.cx - tmp1.sx / 2) * m_INPUT_W);
result.x1 = int((tmp1.cy - tmp1.sy / 2) * m_INPUT_H);
result.y2 = int((tmp1.cx + tmp1.sx / 2) * m_INPUT_W);
result.x2 = int((tmp1.cy + tmp1.sy / 2) * m_INPUT_H);
// rescale to original size
// Undo the letterbox applied in PreProcess: remove the centring offset on
// the padded axis, then divide by the scale factor actually used.
if (m_scale_h > m_scale_w) {
result.y1 = int(result.y1 / m_scale_w);
result.y2 = int(result.y2 / m_scale_w);
result.x1 = int((result.x1 - (m_INPUT_H - m_scale_w * m_frameHeight) / 2) / m_scale_w);
result.x2 = int((result.x2 - (m_INPUT_H - m_scale_w * m_frameHeight) / 2) / m_scale_w);
}
else {
result.y1 = int((result.y1 - (m_INPUT_W - m_scale_h * m_frameWidth) / 2) / m_scale_h);
result.y2 = int((result.y2 - (m_INPUT_W - m_scale_h * m_frameWidth) / 2) / m_scale_h);
result.x1 = int(result.x1 / m_scale_h);
result.x2 = int(result.x2 / m_scale_h);
}
// Clip object box coordinates to network resolution
m_outputBbox.push_back(result);
result.y1 = CLIP(result.y1, 0, m_frameWidth - 1);
result.x1 = CLIP(result.x1, 0, m_frameHeight - 1);
result.y2 = CLIP(result.y2, 0, m_frameWidth - 1);
result.x2 = CLIP(result.x2, 0, m_frameHeight - 1);
// Get confidence
result.score = *(conf + 1);
// Push to result vector
m_outputBbox.push_back(result);
}
// Advance to the next anchor's outputs whether or not this one was kept;
// these increments must stay outside the if-block above.
bbox += 4;
conf += 2;
}
// Highest score first, then greedy IoU suppression, then cap the count.
std::sort(m_outputBbox.begin(), m_outputBbox.end(), MCMP);
NMS(m_outputBbox, m_nms_threshold);
if (m_outputBbox.size() > m_maxFacesPerScene)
m_outputBbox.resize(m_maxFacesPerScene);
}
catch (std::exception& e) {
m_outputBbox.clear();
this->_logger.LogFatal("RetinaFace::PostProcessing", e.what(), __FILE__, __LINE__);
}
}
void RetinaFaceTRT::CreateAnchorRetinaface(std::vector<AnchorBox>& anchor, int w, int h) {
try {
anchor.clear();
std::vector<std::vector<int>> feature_map(3), min_sizes(3);
float steps[] = { 8, 16, 32 };
for (int i = 0; i < feature_map.size(); ++i) {
feature_map[i].push_back(int(ceil(h / steps[i])));
feature_map[i].push_back(int(ceil(w / steps[i])));
}
std::vector<int> minsize1 = { 10, 20 };
min_sizes[0] = minsize1;
std::vector<int> minsize2 = { 32, 64 };
min_sizes[1] = minsize2;
std::vector<int> minsize3 = { 128, 256 };
min_sizes[2] = minsize3;
for (int k = 0; k < feature_map.size(); ++k) {
std::vector<int> min_size = min_sizes[k];
for (int i = 0; i < feature_map[k][0]; ++i) {
for (int j = 0; j < feature_map[k][1]; ++j) {
for (int l = 0; l < min_size.size(); ++l) {
float s_kx = float(min_size[l] * 1.0 / w);
float s_ky = float(min_size[l] * 1.0 / h);
float cx = float((j + 0.5) * steps[k] / w);
float cy = float((i + 0.5) * steps[k] / h);
AnchorBox axil = { cx, cy, s_kx, s_ky };
anchor.push_back(axil);
}
}
}
}
}
catch (std::exception& e) {
m_outputBbox.clear();
this->_logger.LogFatal("RetinaFace::CreateAnchorRetinaface", e.what(), __FILE__, __LINE__);
}
}
inline bool RetinaFaceTRT::MCMP(Bbox a, Bbox b) {
    // Comparator for std::sort: order detections by descending score.
    return a.score > b.score;
}
void RetinaFaceTRT::NMS(std::vector<Bbox>& input_boxes, float NMS_THRESH) {
// Greedy non-maximum suppression in place. input_boxes must already be
// sorted by descending score (PostProcessing sorts before calling); any box
// whose IoU with an earlier (higher-scored) box reaches NMS_THRESH is erased.
try {
// Pre-compute each box's pixel area (inclusive coordinates, hence +1).
std::vector<float> vArea(input_boxes.size());
for (int i = 0; i < int(input_boxes.size()); ++i) {
vArea[i] = float((input_boxes.at(i).x2 - input_boxes.at(i).x1 + 1) * (input_boxes.at(i).y2 - input_boxes.at(i).y1 + 1));
}
for (int i = 0; i < int(input_boxes.size()); ++i) {
// j is only advanced when box j survives; erasing shifts the next
// candidate into position j, and vArea is erased in lockstep so the
// two vectors stay index-aligned.
for (int j = i + 1; j < int(input_boxes.size());) {
// Intersection rectangle of boxes i and j.
float xx1 = (float)(std::max(input_boxes[i].x1, input_boxes[j].x1));
float yy1 = (float)(std::max(input_boxes[i].y1, input_boxes[j].y1));
float xx2 = (float)(std::min(input_boxes[i].x2, input_boxes[j].x2));
float yy2 = (float)(std::min(input_boxes[i].y2, input_boxes[j].y2));
float w = (float)(std::max(float(0), xx2 - xx1 + 1));
float h = (float)(std::max(float(0), yy2 - yy1 + 1));
float inter = (float)(w * h);
// Intersection-over-union.
float ovr = (float)(inter / (vArea[i] + vArea[j] - inter));
if (ovr >= NMS_THRESH) {
input_boxes.erase(input_boxes.begin() + j);
vArea.erase(vArea.begin() + j);
}
else {
j++;
}
}
}
}
catch (std::exception& e) {
m_outputBbox.clear();
this->_logger.LogFatal("RetinaFace::NMS", e.what(), __FILE__, __LINE__);
}
}
RetinaFaceTRT::~RetinaFaceTRT() {
    // Releases the CUDA stream, the device buffers allocated in PreInference()
    // and the host/TensorRT resources allocated in Initialize()/LoadEngine().
    try {
        // Release stream and device buffers.
        ANSFRHelper::CheckCudaStatus(cudaStreamDestroy(stream));
        ANSFRHelper::CheckCudaStatus(cudaFree(buffers[inputIndex]));
        ANSFRHelper::CheckCudaStatus(cudaFree(buffers[outputIndex0]));
        ANSFRHelper::CheckCudaStatus(cudaFree(buffers[outputIndex1]));
        // checkCudaStatus(cudaFree(buffers[outputIndex2]));
        // BUG FIX: the host output buffers (new[] in Initialize) and the
        // TensorRT context/engine were never freed. delete on nullptr is a
        // no-op; since TRT 8, plain delete is the supported way to destroy
        // engine and context (destroy() is deprecated).
        // NOTE(review): assumes these members are null-initialised when
        // Initialize() was never called — confirm in the header.
        delete[] m_output0;
        m_output0 = nullptr;
        delete[] m_output1;
        m_output1 = nullptr;
        delete m_context;
        m_context = nullptr;
        delete m_engine;
        m_engine = nullptr;
    }
    catch (std::exception& e) {
        this->_logger.LogFatal("RetinaFace::~RetinaFace", e.what(), __FILE__, __LINE__);
    }
}
}