ANSCORE/modules/ANSFR/RetinaFaceTRT.cpp

#include "RetinaFaceTRT.h"
// This is standalone Retina Face detector using TensorRT (We will not use as it is not inherit the ANSFD class)
namespace ANSCENTER {
    RetinaFaceTRT::RetinaFaceTRT()
    {
        m_outputBbox.clear();
    }
    bool RetinaFaceTRT::Initialize(const std::string engineFile,
        int frameWidth,
        int frameHeight,
        std::string inputName,
        std::vector<std::string> outputNames,
        std::vector<int> inputShape,
        int maxBatchSize,
        int maxFacesPerScene,
        float nms_threshold,
        float bbox_threshold)
    {
        try
        {
            assert(inputShape.size() == 3);
            m_INPUT_C = static_cast<const int>(inputShape[0]);
            m_INPUT_H = static_cast<const int>(inputShape[1]);
            m_INPUT_W = static_cast<const int>(inputShape[2]);
            m_INPUT_SIZE = static_cast<const int>(m_INPUT_C * m_INPUT_H * m_INPUT_W * sizeof(float));
            m_OUTPUT_SIZE_BASE = static_cast<const int>((m_INPUT_H / 8 * m_INPUT_W / 8 + m_INPUT_H / 16 * m_INPUT_W / 16 + m_INPUT_H / 32 * m_INPUT_W / 32) * 2);
            m_output0 = new float[m_OUTPUT_SIZE_BASE * 4];
            m_output1 = new float[m_OUTPUT_SIZE_BASE * 2];
            m_maxBatchSize = static_cast<const int>(maxBatchSize);
            m_maxFacesPerScene = static_cast<const int>(maxFacesPerScene);
            m_nms_threshold = static_cast<const float>(nms_threshold);
            m_bbox_threshold = static_cast<const float>(bbox_threshold);
            // load engine from .engine file
            LoadEngine(engineFile);

            // create stream and pre-allocate GPU buffers memory
            PreInference(inputName, outputNames);
            return true;
        }
        catch (std::exception& e) {
           this->_logger.LogFatal("RetinaFace::Initialize", e.what(), __FILE__, __LINE__);
            return false;
        }
    }
    void RetinaFaceTRT::LoadEngine(const std::string engineFile)
    {
        try {
            if (FileExist(engineFile)) {
               this->_logger.LogDebug("RetinaFace::LoadEngine", "Loading RetinaFace Engine...", __FILE__, __LINE__);
                std::vector<char> trtModelStream_;
                size_t size{ 0 };

                std::ifstream file(engineFile, std::ios::binary);
                if (file.good()) {
                    file.seekg(0, file.end);
                    size = file.tellg();
                    file.seekg(0, file.beg);
                    trtModelStream_.resize(size);
                    file.read(trtModelStream_.data(), size);
                    file.close();
                }
                nvinfer1::IRuntime* runtime = nvinfer1::createInferRuntime(m_logger);
                assert(runtime != nullptr);
                m_engine = runtime->deserializeCudaEngine(trtModelStream_.data(), size);
                assert(m_engine != nullptr);
                m_context = m_engine->createExecutionContext();
                assert(m_context != nullptr);
            }
            else {
               this->_logger.LogError("RetinaFace::LoadEngine", "Cant find engine file", __FILE__, __LINE__);
            }
        }
        catch (std::exception& e) {
           this->_logger.LogFatal("RetinaFace::Initialize", e.what(), __FILE__, __LINE__);
        }
    }


    void RetinaFaceTRT::PreInference(std::string inputName, std::vector<std::string> outputNames) {
        try {
            /*
            Does not make landmark head as we do not use face alignment.
            */
            // Assert
            assert(outputNames.size() == 2);

#if NV_TENSORRT_MAJOR >= 10
            // TRT 10+: use named tensor API
            assert(m_engine->getNbIOTensors() == 3);

            // Look up tensor indices by name
            for (int i = 0; i < m_engine->getNbIOTensors(); ++i) {
                const char* name = m_engine->getIOTensorName(i);
                if (name == inputName)        inputIndex = i;
                else if (name == outputNames[0]) outputIndex0 = i;
                else if (name == outputNames[1]) outputIndex1 = i;
            }
#else
            // TRT 8.x: use binding API
            assert(m_engine->getNbBindings() == 3);
            inputIndex = m_engine->getBindingIndex(inputName.c_str());
            outputIndex0 = m_engine->getBindingIndex(outputNames[0].c_str());
            outputIndex1 = m_engine->getBindingIndex(outputNames[1].c_str());
#endif

            // Create GPU buffers on device
            ANSFRHelper::CheckCudaStatus(cudaMalloc(&buffers[inputIndex], m_maxBatchSize * m_INPUT_SIZE));
            ANSFRHelper::CheckCudaStatus(cudaMalloc(&buffers[outputIndex0], m_maxBatchSize * m_OUTPUT_SIZE_BASE * 4 * sizeof(float)));
            ANSFRHelper::CheckCudaStatus(cudaMalloc(&buffers[outputIndex1], m_maxBatchSize * m_OUTPUT_SIZE_BASE * 2 * sizeof(float)));

#if NV_TENSORRT_MAJOR >= 10
            // TRT 10+: bind tensor addresses
            m_context->setTensorAddress(inputName.c_str(), buffers[inputIndex]);
            m_context->setTensorAddress(outputNames[0].c_str(), buffers[outputIndex0]);
            m_context->setTensorAddress(outputNames[1].c_str(), buffers[outputIndex1]);
#endif

            // Create stream
            ANSFRHelper::CheckCudaStatus(cudaStreamCreate(&stream));
        }
        catch (std::exception& e) {
           this->_logger.LogFatal("RetinaFace::PreInference", e.what(), __FILE__, __LINE__);
        }

    }

    void RetinaFaceTRT::PreProcess(cv::Mat& img) {
        try {
            // Release input vector
            m_input.release();

            // Resize
            float w, h, x, y;
            if (m_scale_h > m_scale_w) {
                w = float(m_INPUT_W);
                h = float(m_scale_w * img.rows);
                x = 0;
                y = float((m_INPUT_H - h) / 2);
            }
            else {
                w = float(m_scale_h * img.cols);
                h = float(m_INPUT_H);
                x = float((m_INPUT_W - w) / 2);
                y = 0;
            }
            cv::Mat re((int)h, (int)w, CV_8UC3);
            cv::resize(img, re, re.size(), 0, 0, cv::INTER_LINEAR);
            cv::Mat out((int)m_INPUT_H, (int)m_INPUT_W, CV_8UC3, cv::Scalar(128, 128, 128));
            re.copyTo(out(cv::Rect(int(x), int(y), re.cols, re.rows)));


            // Normalize
            out.convertTo(out, CV_32F);
            out = out - cv::Scalar(104, 117, 123);
            std::vector<cv::Mat> temp;
            cv::split(out, temp);
            for (int i = 0; i < temp.size(); i++) {
                m_input.push_back(temp[i]);
            }
        }
        catch (std::exception& e) {
           this->_logger.LogFatal("RetinaFace::PreProcess", e.what(), __FILE__, __LINE__);
        }

    }

    void RetinaFaceTRT::RunInference(float* input, float* output0, float* output1) {
        try {
            // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
            ANSCENTER::ANSFRHelper::CheckCudaStatus(cudaMemcpyAsync(buffers[inputIndex], input, m_maxBatchSize * m_INPUT_SIZE, cudaMemcpyHostToDevice, stream));
#if NV_TENSORRT_MAJOR >= 10
            m_context->enqueueV3(stream);
#else
            m_context->enqueueV2(buffers, stream, nullptr);
#endif
            ANSFRHelper::CheckCudaStatus(cudaMemcpyAsync(output0, buffers[outputIndex0], m_maxBatchSize * m_OUTPUT_SIZE_BASE * 4 * sizeof(float), cudaMemcpyDeviceToHost, stream));
            ANSFRHelper::CheckCudaStatus(cudaMemcpyAsync(output1, buffers[outputIndex1], m_maxBatchSize * m_OUTPUT_SIZE_BASE * 2 * sizeof(float), cudaMemcpyDeviceToHost, stream));
            cudaStreamSynchronize(stream);
        }
        catch (std::exception& e) {
           this->_logger.LogFatal("RetinaFace::RunInference", e.what(), __FILE__, __LINE__);
        }
    }

    std::vector<struct Bbox> RetinaFaceTRT::FindFace(cv::Mat& img) {
        try {
            m_outputBbox.clear();
            int out_rows = img.rows;
            int out_cols = img.cols;
            m_frameWidth = static_cast<const int>(out_cols);
            m_frameHeight = static_cast<const int>(out_rows);
            m_scale_h = (float)(m_INPUT_H) / float(m_frameHeight);
            m_scale_w = (float)(m_INPUT_W) / float(m_frameWidth);
            PreProcess(img);
            RunInference((float*)m_input.ptr<float>(0), m_output0, m_output1);
            PostProcessing(m_output0, m_output1);
            return m_outputBbox;
        }
        catch (std::exception& e) {
            m_outputBbox.clear();
           this->_logger.LogFatal("RetinaFace::FindFace", e.what(), __FILE__, __LINE__);
            return m_outputBbox;

        }

    }

    void RetinaFaceTRT::PostProcessing(float* bbox, float* conf) {
        try {
            m_outputBbox.clear();
            std::vector<AnchorBox> anchor;
            CreateAnchorRetinaface(anchor, m_INPUT_W, m_INPUT_H);

            for (int i = 0; i < anchor.size(); ++i) {
                if (*(conf + 1) > m_bbox_threshold) {
                    AnchorBox tmp = anchor[i];
                    AnchorBox tmp1;
                    Bbox result{};

                    // decode bbox (y - W; x - H)
                    tmp1.cx = float(tmp.cx + *bbox * 0.1 * tmp.sx);
                    tmp1.cy = float(tmp.cy + *(bbox + 1) * 0.1 * tmp.sy);
                    tmp1.sx = float(tmp.sx * exp(*(bbox + 2) * 0.2));
                    tmp1.sy = float(tmp.sy * exp(*(bbox + 3) * 0.2));

                    result.y1 = int((tmp1.cx - tmp1.sx / 2) * m_INPUT_W);
                    result.x1 = int((tmp1.cy - tmp1.sy / 2) * m_INPUT_H);
                    result.y2 = int((tmp1.cx + tmp1.sx / 2) * m_INPUT_W);
                    result.x2 = int((tmp1.cy + tmp1.sy / 2) * m_INPUT_H);

                    // rescale to original size
                    if (m_scale_h > m_scale_w) {
                        result.y1 = int(result.y1 / m_scale_w);
                        result.y2 = int(result.y2 / m_scale_w);
                        result.x1 = int((result.x1 - (m_INPUT_H - m_scale_w * m_frameHeight) / 2) / m_scale_w);
                        result.x2 = int((result.x2 - (m_INPUT_H - m_scale_w * m_frameHeight) / 2) / m_scale_w);
                    }
                    else {
                        result.y1 = int((result.y1 - (m_INPUT_W - m_scale_h * m_frameWidth) / 2) / m_scale_h);
                        result.y2 = int((result.y2 - (m_INPUT_W - m_scale_h * m_frameWidth) / 2) / m_scale_h);
                        result.x1 = int(result.x1 / m_scale_h);
                        result.x2 = int(result.x2 / m_scale_h);
                    }

                    // Clip object box coordinates to network resolution
                    result.y1 = CLIP(result.y1, 0, m_frameWidth - 1);
                    result.x1 = CLIP(result.x1, 0, m_frameHeight - 1);
                    result.y2 = CLIP(result.y2, 0, m_frameWidth - 1);
                    result.x2 = CLIP(result.x2, 0, m_frameHeight - 1);

                    // Get confidence
                    result.score = *(conf + 1);

                    // Push to result vector
                    m_outputBbox.push_back(result);
                }
                bbox += 4;
                conf += 2;
            }
            std::sort(m_outputBbox.begin(), m_outputBbox.end(), MCMP);
            NMS(m_outputBbox, m_nms_threshold);
            if (m_outputBbox.size() > m_maxFacesPerScene)
                m_outputBbox.resize(m_maxFacesPerScene);
        }
        catch (std::exception& e) {
            m_outputBbox.clear();
           this->_logger.LogFatal("RetinaFace::PostProcessing", e.what(), __FILE__, __LINE__);
        }
    }

    void RetinaFaceTRT::CreateAnchorRetinaface(std::vector<AnchorBox>& anchor, int w, int h) {
        try {
            anchor.clear();
            std::vector<std::vector<int>> feature_map(3), min_sizes(3);
            float steps[] = { 8, 16, 32 };
            for (int i = 0; i < feature_map.size(); ++i) {
                feature_map[i].push_back(int(ceil(h / steps[i])));
                feature_map[i].push_back(int(ceil(w / steps[i])));
            }
            std::vector<int> minsize1 = { 10, 20 };
            min_sizes[0] = minsize1;
            std::vector<int> minsize2 = { 32, 64 };
            min_sizes[1] = minsize2;
            std::vector<int> minsize3 = { 128, 256 };
            min_sizes[2] = minsize3;

            for (int k = 0; k < feature_map.size(); ++k) {
                std::vector<int> min_size = min_sizes[k];
                for (int i = 0; i < feature_map[k][0]; ++i) {
                    for (int j = 0; j < feature_map[k][1]; ++j) {
                        for (int l = 0; l < min_size.size(); ++l) {
                            float s_kx = float(min_size[l] * 1.0 / w);
                            float s_ky = float(min_size[l] * 1.0 / h);
                            float cx = float((j + 0.5) * steps[k] / w);
                            float cy = float((i + 0.5) * steps[k] / h);
                            AnchorBox axil = { cx, cy, s_kx, s_ky };
                            anchor.push_back(axil);
                        }
                    }
                }
            }
        }
        catch (std::exception& e) {
            m_outputBbox.clear();
           this->_logger.LogFatal("RetinaFace::CreateAnchorRetinaface", e.what(), __FILE__, __LINE__);
        }

    }

    inline bool RetinaFaceTRT::MCMP(Bbox a, Bbox b) {
        if (a.score > b.score)
            return true;
        return false;
    }

    void RetinaFaceTRT::NMS(std::vector<Bbox>& input_boxes, float NMS_THRESH) {
        try {
            std::vector<float> vArea(input_boxes.size());
            for (int i = 0; i < int(input_boxes.size()); ++i) {
                vArea[i] = float((input_boxes.at(i).x2 - input_boxes.at(i).x1 + 1) * (input_boxes.at(i).y2 - input_boxes.at(i).y1 + 1));
            }
            for (int i = 0; i < int(input_boxes.size()); ++i) {
                for (int j = i + 1; j < int(input_boxes.size());) {
                    float xx1 = (float)(std::max(input_boxes[i].x1, input_boxes[j].x1));
                    float yy1 = (float)(std::max(input_boxes[i].y1, input_boxes[j].y1));
                    float xx2 = (float)(std::min(input_boxes[i].x2, input_boxes[j].x2));
                    float yy2 = (float)(std::min(input_boxes[i].y2, input_boxes[j].y2));
                    float w = (float)(std::max(float(0), xx2 - xx1 + 1));
                    float h = (float)(std::max(float(0), yy2 - yy1 + 1));
                    float inter = (float)(w * h);
                    float ovr = (float)(inter / (vArea[i] + vArea[j] - inter));
                    if (ovr >= NMS_THRESH) {
                        input_boxes.erase(input_boxes.begin() + j);
                        vArea.erase(vArea.begin() + j);
                    }
                    else {
                        j++;
                    }
                }
            }
        }
        catch (std::exception& e) {
            m_outputBbox.clear();
           this->_logger.LogFatal("RetinaFace::NMS", e.what(), __FILE__, __LINE__);
        }
    }

    RetinaFaceTRT::~RetinaFaceTRT() {
        try {
            // Release stream and buffers
            ANSFRHelper::CheckCudaStatus(cudaStreamDestroy(stream));
            ANSFRHelper::CheckCudaStatus(cudaFree(buffers[inputIndex]));
            ANSFRHelper::CheckCudaStatus(cudaFree(buffers[outputIndex0]));
            ANSFRHelper::CheckCudaStatus(cudaFree(buffers[outputIndex1]));
            // checkCudaStatus(cudaFree(buffers[outputIndex2]));
        }
        catch (std::exception& e) {
            m_outputBbox.clear();
           this->_logger.LogFatal("RetinaFace::~RetinaFace", e.what(), __FILE__, __LINE__);
        }


    }

}