#include "RetinaFaceTRT.h"
// Standalone RetinaFace detector built directly on the TensorRT runtime.
// NOTE: not used in production because it does not inherit from the ANSFD class.
namespace ANSCENTER {

    RetinaFaceTRT::RetinaFaceTRT() {
        m_outputBbox.clear();
        // Null out the raw resources so the destructor can release them
        // unconditionally even when Initialize() was never called or failed.
        m_output0 = nullptr;
        m_output1 = nullptr;
        m_engine = nullptr;
        m_context = nullptr;
    }

    // Initialise the detector.
    //  engineFile        - path to the serialized TensorRT .engine file
    //  frameWidth/Height - unused here; the actual frame size is taken from the
    //                      image passed to each FindFace() call
    //  inputName         - name of the network input tensor
    //  outputNames       - exactly two names: {bbox tensor, confidence tensor}
    //  inputShape        - network input as {C, H, W}
    //  maxBatchSize      - batch size used to size the device buffers
    //  maxFacesPerScene  - cap on the number of faces returned per frame
    //  nms_threshold     - IoU threshold used by NMS
    //  bbox_threshold    - confidence threshold for keeping a detection
    // Returns true on success; logs and returns false on any failure.
    bool RetinaFaceTRT::Initialize(const std::string engineFile, int frameWidth, int frameHeight,
        std::string inputName, std::vector<std::string> outputNames, std::vector<int> inputShape,
        int maxBatchSize, int maxFacesPerScene, float nms_threshold, float bbox_threshold) {
        try {
            assert(inputShape.size() == 3);
            m_INPUT_C = static_cast<int>(inputShape[0]);
            m_INPUT_H = static_cast<int>(inputShape[1]);
            m_INPUT_W = static_cast<int>(inputShape[2]);
            m_INPUT_SIZE = static_cast<int>(m_INPUT_C * m_INPUT_H * m_INPUT_W * sizeof(float));
            // Total anchor count: two anchors per cell over the three strides (8, 16, 32).
            m_OUTPUT_SIZE_BASE = static_cast<int>(
                (m_INPUT_H / 8 * m_INPUT_W / 8 +
                 m_INPUT_H / 16 * m_INPUT_W / 16 +
                 m_INPUT_H / 32 * m_INPUT_W / 32) * 2);
            // Host-side output buffers: 4 floats per anchor (bbox deltas) and
            // 2 floats per anchor (background/face confidence). Released in ~RetinaFaceTRT().
            m_output0 = new float[m_OUTPUT_SIZE_BASE * 4];
            m_output1 = new float[m_OUTPUT_SIZE_BASE * 2];
            m_maxBatchSize = static_cast<int>(maxBatchSize);
            m_maxFacesPerScene = static_cast<int>(maxFacesPerScene);
            m_nms_threshold = static_cast<float>(nms_threshold);
            m_bbox_threshold = static_cast<float>(bbox_threshold);
            // Load engine from the .engine file.
            LoadEngine(engineFile);
            // Create stream and pre-allocate GPU buffer memory.
            PreInference(inputName, outputNames);
            return true;
        }
        catch (std::exception& e) {
            this->_logger.LogFatal("RetinaFace::Initialize", e.what(), __FILE__, __LINE__);
            return false;
        }
    }

    // Deserialize the serialized engine from disk into m_engine and create the
    // execution context m_context. Logs an error when the file is missing.
    void RetinaFaceTRT::LoadEngine(const std::string engineFile) {
        try {
            if (FileExist(engineFile)) {
                this->_logger.LogDebug("RetinaFace::LoadEngine", "Loading RetinaFace Engine...", __FILE__, __LINE__);
                std::vector<char> trtModelStream_;
                size_t size{ 0 };
                std::ifstream file(engineFile, std::ios::binary);
                if (file.good()) {
                    // Read the whole file into memory.
                    file.seekg(0, file.end);
                    size = file.tellg();
                    file.seekg(0, file.beg);
                    trtModelStream_.resize(size);
                    file.read(trtModelStream_.data(), size);
                    file.close();
                }
                nvinfer1::IRuntime* runtime = nvinfer1::createInferRuntime(m_logger);
                assert(runtime != nullptr);
                m_engine = runtime->deserializeCudaEngine(trtModelStream_.data(), size);
                assert(m_engine != nullptr);
                m_context = m_engine->createExecutionContext();
                assert(m_context != nullptr);
                // NOTE(review): `runtime` is deliberately not destroyed here -
                // recent TensorRT versions require the runtime to outlive the
                // engine it deserialized. Storing it as a member and releasing
                // it in the destructor would close the leak; needs a header change.
            }
            else {
                this->_logger.LogError("RetinaFace::LoadEngine", "Cant find engine file", __FILE__, __LINE__);
            }
        }
        catch (std::exception& e) {
            // Log tag fixed (was "RetinaFace::Initialize" - copy/paste error).
            this->_logger.LogFatal("RetinaFace::LoadEngine", e.what(), __FILE__, __LINE__);
        }
    }

    // Resolve the input/output tensor indices, allocate the device buffers and
    // create the CUDA stream used by RunInference().
    void RetinaFaceTRT::PreInference(std::string inputName, std::vector<std::string> outputNames) {
        try {
            /* Does not make landmark head as we do not use face alignment. */
            // Exactly two outputs expected: bbox deltas and confidences.
            assert(outputNames.size() == 2);
#if NV_TENSORRT_MAJOR >= 10
            // TRT 10+: use named tensor API
            assert(m_engine->getNbIOTensors() == 3);
            // Look up tensor indices by name
            for (int i = 0; i < m_engine->getNbIOTensors(); ++i) {
                const char* name = m_engine->getIOTensorName(i);
                if (name == inputName) inputIndex = i;
                else if (name == outputNames[0]) outputIndex0 = i;
                else if (name == outputNames[1]) outputIndex1 = i;
            }
#else
            // TRT 8.x: use binding API
            assert(m_engine->getNbBindings() == 3);
            inputIndex = m_engine->getBindingIndex(inputName.c_str());
            outputIndex0 = m_engine->getBindingIndex(outputNames[0].c_str());
            outputIndex1 = m_engine->getBindingIndex(outputNames[1].c_str());
#endif
            // Create GPU buffers on device
            ANSFRHelper::CheckCudaStatus(cudaMalloc(&buffers[inputIndex], m_maxBatchSize * m_INPUT_SIZE));
            ANSFRHelper::CheckCudaStatus(cudaMalloc(&buffers[outputIndex0], m_maxBatchSize * m_OUTPUT_SIZE_BASE * 4 * sizeof(float)));
            ANSFRHelper::CheckCudaStatus(cudaMalloc(&buffers[outputIndex1], m_maxBatchSize * m_OUTPUT_SIZE_BASE * 2 * sizeof(float)));
#if NV_TENSORRT_MAJOR >= 10
            // TRT 10+: bind tensor addresses once; enqueueV3 reuses them.
            m_context->setTensorAddress(inputName.c_str(), buffers[inputIndex]);
            m_context->setTensorAddress(outputNames[0].c_str(), buffers[outputIndex0]);
            m_context->setTensorAddress(outputNames[1].c_str(), buffers[outputIndex1]);
#endif
            // Create stream
            ANSFRHelper::CheckCudaStatus(cudaStreamCreate(&stream));
        }
        catch (std::exception& e) {
            this->_logger.LogFatal("RetinaFace::PreInference", e.what(), __FILE__, __LINE__);
        }
    }

    // Letterbox-resize `img` into the network resolution, subtract the BGR mean
    // (104, 117, 123), and split HWC into planar CHW rows appended to m_input.
    // Relies on m_scale_h/m_scale_w being set by the caller (FindFace).
    void RetinaFaceTRT::PreProcess(cv::Mat& img) {
        try {
            // Release input vector
            m_input.release();
            // Letterbox: fit the limiting dimension, pad the other with gray (128).
            float w, h, x, y;
            if (m_scale_h > m_scale_w) {
                // Width is the limiting dimension; pad vertically.
                w = float(m_INPUT_W);
                h = float(m_scale_w * img.rows);
                x = 0;
                y = float((m_INPUT_H - h) / 2);
            }
            else {
                // Height is the limiting dimension; pad horizontally.
                w = float(m_scale_h * img.cols);
                h = float(m_INPUT_H);
                x = float((m_INPUT_W - w) / 2);
                y = 0;
            }
            cv::Mat re((int)h, (int)w, CV_8UC3);
            cv::resize(img, re, re.size(), 0, 0, cv::INTER_LINEAR);
            cv::Mat out((int)m_INPUT_H, (int)m_INPUT_W, CV_8UC3, cv::Scalar(128, 128, 128));
            re.copyTo(out(cv::Rect(int(x), int(y), re.cols, re.rows)));
            // Normalize: float conversion + RetinaFace channel means.
            out.convertTo(out, CV_32F);
            out = out - cv::Scalar(104, 117, 123);
            std::vector<cv::Mat> temp;
            cv::split(out, temp);
            // cv::Mat::push_back appends each plane as extra rows -> planar CHW layout.
            for (size_t i = 0; i < temp.size(); i++) {
                m_input.push_back(temp[i]);
            }
        }
        catch (std::exception& e) {
            this->_logger.LogFatal("RetinaFace::PreProcess", e.what(), __FILE__, __LINE__);
        }
    }

    // Copy `input` to the device, run the engine asynchronously on `stream`,
    // and copy the two output tensors back into output0 (bbox) / output1 (conf).
    // Blocks until the stream has drained.
    void RetinaFaceTRT::RunInference(float* input, float* output0, float* output1) {
        try {
            // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
            ANSCENTER::ANSFRHelper::CheckCudaStatus(cudaMemcpyAsync(buffers[inputIndex], input, m_maxBatchSize * m_INPUT_SIZE, cudaMemcpyHostToDevice, stream));
#if NV_TENSORRT_MAJOR >= 10
            m_context->enqueueV3(stream);
#else
            m_context->enqueueV2(buffers, stream, nullptr);
#endif
            ANSFRHelper::CheckCudaStatus(cudaMemcpyAsync(output0, buffers[outputIndex0], m_maxBatchSize * m_OUTPUT_SIZE_BASE * 4 * sizeof(float), cudaMemcpyDeviceToHost, stream));
            ANSFRHelper::CheckCudaStatus(cudaMemcpyAsync(output1, buffers[outputIndex1], m_maxBatchSize * m_OUTPUT_SIZE_BASE * 2 * sizeof(float), cudaMemcpyDeviceToHost, stream));
            // Checked for consistency with every other CUDA call in this file
            // (the original ignored the return status here).
            ANSFRHelper::CheckCudaStatus(cudaStreamSynchronize(stream));
        }
        catch (std::exception& e) {
            this->_logger.LogFatal("RetinaFace::RunInference", e.what(), __FILE__, __LINE__);
        }
    }

    // Detect faces in `img`: preprocess, run the engine, decode/NMS the raw
    // outputs. Returns the detections (empty on failure).
    std::vector<Bbox> RetinaFaceTRT::FindFace(cv::Mat& img) {
        try {
            m_outputBbox.clear();
            int out_rows = img.rows;
            int out_cols = img.cols;
            m_frameWidth = static_cast<int>(out_cols);
            m_frameHeight = static_cast<int>(out_rows);
            m_scale_h = (float)(m_INPUT_H) / float(m_frameHeight);
            m_scale_w = (float)(m_INPUT_W) / float(m_frameWidth);
            PreProcess(img);
            RunInference((float*)m_input.ptr(0), m_output0, m_output1);
            PostProcessing(m_output0, m_output1);
            return m_outputBbox;
        }
        catch (std::exception& e) {
            m_outputBbox.clear();
            this->_logger.LogFatal("RetinaFace::FindFace", e.what(), __FILE__, __LINE__);
            return m_outputBbox;
        }
    }

    // Decode the raw network outputs into m_outputBbox: apply the anchor deltas,
    // undo the letterbox, clip to the frame, threshold, sort by score, NMS, and
    // cap at m_maxFacesPerScene.
    // NOTE(review): the x/y member names are swapped relative to convention
    // throughout (y* holds the horizontal coordinate, x* the vertical one) -
    // kept as-is since PreProcess and the clipping below use the same convention.
    void RetinaFaceTRT::PostProcessing(float* bbox, float* conf) {
        try {
            m_outputBbox.clear();
            std::vector<AnchorBox> anchor;
            CreateAnchorRetinaface(anchor, m_INPUT_W, m_INPUT_H);
            for (size_t i = 0; i < anchor.size(); ++i) {
                // conf pairs are {background, face}; keep only confident faces.
                if (*(conf + 1) > m_bbox_threshold) {
                    AnchorBox tmp = anchor[i];
                    AnchorBox tmp1;
                    Bbox result{};
                    // decode bbox (y - W; x - H)
                    tmp1.cx = float(tmp.cx + *bbox * 0.1 * tmp.sx);
                    tmp1.cy = float(tmp.cy + *(bbox + 1) * 0.1 * tmp.sy);
                    tmp1.sx = float(tmp.sx * exp(*(bbox + 2) * 0.2));
                    tmp1.sy = float(tmp.sy * exp(*(bbox + 3) * 0.2));
                    result.y1 = int((tmp1.cx - tmp1.sx / 2) * m_INPUT_W);
                    result.x1 = int((tmp1.cy - tmp1.sy / 2) * m_INPUT_H);
                    result.y2 = int((tmp1.cx + tmp1.sx / 2) * m_INPUT_W);
                    result.x2 = int((tmp1.cy + tmp1.sy / 2) * m_INPUT_H);
                    // Rescale to the original frame, removing the letterbox padding.
                    if (m_scale_h > m_scale_w) {
                        result.y1 = int(result.y1 / m_scale_w);
                        result.y2 = int(result.y2 / m_scale_w);
                        result.x1 = int((result.x1 - (m_INPUT_H - m_scale_w * m_frameHeight) / 2) / m_scale_w);
                        result.x2 = int((result.x2 - (m_INPUT_H - m_scale_w * m_frameHeight) / 2) / m_scale_w);
                    }
                    else {
                        result.y1 = int((result.y1 - (m_INPUT_W - m_scale_h * m_frameWidth) / 2) / m_scale_h);
                        result.y2 = int((result.y2 - (m_INPUT_W - m_scale_h * m_frameWidth) / 2) / m_scale_h);
                        result.x1 = int(result.x1 / m_scale_h);
                        result.x2 = int(result.x2 / m_scale_h);
                    }
                    // Clip object box coordinates to the frame.
                    result.y1 = CLIP(result.y1, 0, m_frameWidth - 1);
                    result.x1 = CLIP(result.x1, 0, m_frameHeight - 1);
                    result.y2 = CLIP(result.y2, 0, m_frameWidth - 1);
                    result.x2 = CLIP(result.x2, 0, m_frameHeight - 1);
                    // Get confidence
                    result.score = *(conf + 1);
                    // Push to result vector
                    m_outputBbox.push_back(result);
                }
                // Advance to the next anchor's outputs: 4 bbox floats, 2 conf floats.
                bbox += 4;
                conf += 2;
            }
            std::sort(m_outputBbox.begin(), m_outputBbox.end(), MCMP);
            NMS(m_outputBbox, m_nms_threshold);
            if (m_outputBbox.size() > static_cast<size_t>(m_maxFacesPerScene))
                m_outputBbox.resize(m_maxFacesPerScene);
        }
        catch (std::exception& e) {
            m_outputBbox.clear();
            this->_logger.LogFatal("RetinaFace::PostProcessing", e.what(), __FILE__, __LINE__);
        }
    }

    // Build the RetinaFace prior/anchor grid for a w x h input: two anchor sizes
    // per cell at each of the three strides (8, 16, 32), all in [0, 1] units.
    void RetinaFaceTRT::CreateAnchorRetinaface(std::vector<AnchorBox>& anchor, int w, int h) {
        try {
            anchor.clear();
            std::vector<std::vector<int>> feature_map(3), min_sizes(3);
            float steps[] = { 8, 16, 32 };
            for (size_t i = 0; i < feature_map.size(); ++i) {
                feature_map[i].push_back(int(ceil(h / steps[i])));
                feature_map[i].push_back(int(ceil(w / steps[i])));
            }
            // Standard RetinaFace anchor sizes per stride.
            std::vector<int> minsize1 = { 10, 20 };
            min_sizes[0] = minsize1;
            std::vector<int> minsize2 = { 32, 64 };
            min_sizes[1] = minsize2;
            std::vector<int> minsize3 = { 128, 256 };
            min_sizes[2] = minsize3;
            for (size_t k = 0; k < feature_map.size(); ++k) {
                std::vector<int> min_size = min_sizes[k];
                for (int i = 0; i < feature_map[k][0]; ++i) {
                    for (int j = 0; j < feature_map[k][1]; ++j) {
                        for (size_t l = 0; l < min_size.size(); ++l) {
                            float s_kx = float(min_size[l] * 1.0 / w);
                            float s_ky = float(min_size[l] * 1.0 / h);
                            float cx = float((j + 0.5) * steps[k] / w);
                            float cy = float((i + 0.5) * steps[k] / h);
                            AnchorBox axil = { cx, cy, s_kx, s_ky };
                            anchor.push_back(axil);
                        }
                    }
                }
            }
        }
        catch (std::exception& e) {
            m_outputBbox.clear();
            this->_logger.LogFatal("RetinaFace::CreateAnchorRetinaface", e.what(), __FILE__, __LINE__);
        }
    }

    // Comparator for std::sort: descending by detection score.
    inline bool RetinaFaceTRT::MCMP(Bbox a, Bbox b) {
        if (a.score > b.score) return true;
        return false;
    }

    // Greedy non-maximum suppression on score-sorted boxes: removes any box
    // whose IoU with a higher-scoring box is >= NMS_THRESH (in place).
    void RetinaFaceTRT::NMS(std::vector<Bbox>& input_boxes, float NMS_THRESH) {
        try {
            std::vector<float> vArea(input_boxes.size());
            for (int i = 0; i < int(input_boxes.size()); ++i) {
                vArea[i] = float((input_boxes.at(i).x2 - input_boxes.at(i).x1 + 1)
                    * (input_boxes.at(i).y2 - input_boxes.at(i).y1 + 1));
            }
            for (int i = 0; i < int(input_boxes.size()); ++i) {
                for (int j = i + 1; j < int(input_boxes.size());) {
                    // Intersection rectangle of boxes i and j.
                    float xx1 = (float)(std::max(input_boxes[i].x1, input_boxes[j].x1));
                    float yy1 = (float)(std::max(input_boxes[i].y1, input_boxes[j].y1));
                    float xx2 = (float)(std::min(input_boxes[i].x2, input_boxes[j].x2));
                    float yy2 = (float)(std::min(input_boxes[i].y2, input_boxes[j].y2));
                    float w = (float)(std::max(float(0), xx2 - xx1 + 1));
                    float h = (float)(std::max(float(0), yy2 - yy1 + 1));
                    float inter = (float)(w * h);
                    float ovr = (float)(inter / (vArea[i] + vArea[j] - inter));
                    if (ovr >= NMS_THRESH) {
                        // j overlaps a higher-scored box; drop it and keep j pointing
                        // at the element that shifted into this slot.
                        input_boxes.erase(input_boxes.begin() + j);
                        vArea.erase(vArea.begin() + j);
                    }
                    else {
                        j++;
                    }
                }
            }
        }
        catch (std::exception& e) {
            m_outputBbox.clear();
            this->_logger.LogFatal("RetinaFace::NMS", e.what(), __FILE__, __LINE__);
        }
    }

    // Release the CUDA stream/buffers, the TensorRT context/engine, and the
    // host output buffers allocated in Initialize().
    RetinaFaceTRT::~RetinaFaceTRT() {
        try {
            // Release stream and buffers
            ANSFRHelper::CheckCudaStatus(cudaStreamDestroy(stream));
            ANSFRHelper::CheckCudaStatus(cudaFree(buffers[inputIndex]));
            ANSFRHelper::CheckCudaStatus(cudaFree(buffers[outputIndex0]));
            ANSFRHelper::CheckCudaStatus(cudaFree(buffers[outputIndex1]));
            // checkCudaStatus(cudaFree(buffers[outputIndex2]));
            // Destroy TensorRT objects (context before engine; TRT 8+ supports
            // plain delete). Pointers are nulled in the constructor so this is
            // safe even when Initialize() never ran.
            delete m_context;
            m_context = nullptr;
            delete m_engine;
            m_engine = nullptr;
            // Release host output buffers (leaked in the original).
            delete[] m_output0;
            m_output0 = nullptr;
            delete[] m_output1;
            m_output1 = nullptr;
        }
        catch (std::exception& e) {
            m_outputBbox.clear();
            this->_logger.LogFatal("RetinaFace::~RetinaFace", e.what(), __FILE__, __LINE__);
        }
    }
}