modules/ANSODEngine/CNNFaceDetector.cpp

#include "CNNFaceDetector.h"
#include "Utility.h"
/// Allocates `size` usable bytes starting at an address aligned to
/// _MALLOC_ALIGN (the mask arithmetic below assumes _MALLOC_ALIGN is a power
/// of two).
///
/// The memory comes from malloc() with extra head-room. The pointer malloc()
/// returned is stored in the sizeof(char*) bytes immediately before the
/// aligned address, which is where FreeMemory_() reads it back from.
///
/// @param size  Number of usable bytes requested.
/// @return The aligned pointer, or nullptr when the padded request would
///         overflow size_t or malloc() fails.
void* AllocMemory(size_t size)
{
    // Head-room: one alignment unit (two for requests of 4096 bytes or more)
    // plus the slot that remembers the original malloc() pointer.
    const size_t overhead =
        static_cast<size_t>(_MALLOC_ALIGN * ((size >= 4096) + 1L)) + sizeof(char*);
    const size_t total = size + overhead;

    // Unsigned wrap-around: without this check malloc() would be asked for a
    // tiny block and the caller would write far past its end.
    if (total < size)
        return nullptr;

    char* const raw = static_cast<char*>(malloc(total));
    if (!raw)
        return nullptr;

    // Round up to the next aligned address that still leaves room for the
    // bookkeeping slot in front of it.
    char* const aligned = reinterpret_cast<char*>(
        (reinterpret_cast<size_t>(raw + sizeof(char*) + 1) + _MALLOC_ALIGN - 1)
        & ~static_cast<size_t>(_MALLOC_ALIGN - 1));
    *reinterpret_cast<char**>(aligned - sizeof(char*)) = raw;

    return aligned;
}
/// Releases a block whose original malloc() pointer is stored in the
/// sizeof(char*) bytes directly in front of `ptr`.
///
/// Null pointers are ignored. Pointers that are not _MALLOC_ALIGN-aligned are
/// also ignored (nothing is freed for them).
void FreeMemory_(void* ptr)
{
    try {
        const bool isAligned = (reinterpret_cast<size_t>(ptr) & (_MALLOC_ALIGN - 1)) == 0;
        if (ptr != nullptr && isAligned)
        {
            // The slot just before the aligned address holds the real allocation.
            char** const slot = static_cast<char**>(ptr) - 1;
            free(*slot);
        }
    }
    catch (std::exception& e) {
        std::cout << "ANSCENTER::FreeMemory:" << e.what();
    }
}
namespace ANSCENTER {
    /// Comparator on (score, box) pairs: true when pair1's score is strictly
    /// greater than pair2's, i.e. it orders pairs by descending score.
    /// The boxes themselves are not inspected.
    bool SortScoreBBoxPairDescend(const std::pair<float, NormalizedBBox>& pair1, const std::pair<float, NormalizedBBox>& pair2)
    {
        const float lhsScore = pair1.first;
        const float rhsScore = pair2.first;
        return rhsScore < lhsScore;
    }

    /// Stub: performs no optimization and always reports success.
    /// Neither `fp16` nor `optimizedModelFolder` is read or modified.
    bool ANSCNNFD::OptimizeModel(bool fp16, std::string& optimizedModelFolder)
    {
        const bool succeeded = true;
        return succeeded;
    }
    /// Stub: no model file is opened; the call always returns true.
    /// Both arguments are currently unused. The exception handler logs through
    /// the class logger and returns false.
    bool ANSCNNFD::LoadModel(const std::string& modelZipFilePath, const std::string& modelZipPassword) {
        bool loaded = false;
        try {
            loaded = true;
        }
        catch (const std::exception& ex) {
            this->_logger.LogFatal("ANSCNNFD::LoadModel", ex.what(), __FILE__, __LINE__);
            loaded = false;
        }
        return loaded;
    }

    /// Prepares the detector for inference.
    ///
    /// Stores `modelConfig` (forcing modelType to FACEDETECT and detectionType
    /// to FACEDETECTOR), calls InitParameters(), and marks the instance as
    /// initialized.
    ///
    /// @param licenseKey        Not used by this implementation; the license
    ///                          flag is set to valid unconditionally.
    /// @param modelConfig       Configuration copied into the instance.
    /// @param modelZipFilePath  Not used by this implementation.
    /// @param modelZipPassword  Not used by this implementation.
    /// @param labelMap          [out] Set to "Face".
    /// @return true on success; false if a std::exception was caught (it is
    ///         logged through the class logger).
    bool ANSCNNFD::Initialize(std::string licenseKey, ModelConfig modelConfig, const std::string& modelZipFilePath, const std::string& modelZipPassword, std::string& labelMap) {
        // No license validation happens here, so there is no failure path
        // before the try block.
        _licenseValid = true;
        try {
            _modelConfig = modelConfig;
            _modelConfig.modelType = ModelType::FACEDETECT;
            _modelConfig.detectionType = DetectionType::FACEDETECTOR;
            InitParameters();
            labelMap = "Face";
            _isInitialized = true;
            return true;
        }
        catch (const std::exception& e) {
            this->_logger.LogFatal("ANSCNNFD::Initialize", e.what(), __FILE__, __LINE__);
            return false;
        }
    }
    /// Detects faces in an 8-bit, 3-channel image.
    ///
    /// Images with either side <= 300 px are treated as pre-cropped faces: they
    /// are padded by 200 px on every side (replicated border) before detection
    /// and the reported box origin is shifted back by that padding.
    ///
    /// For every detection whose confidence is at least
    /// `_modelConfig.detectionScoreThreshold` an Object is produced with
    /// classId 0, className "Face", the box, and `mask` set to a copy of the
    /// face region taken from the (possibly padded) frame. The region is clipped
    /// to the frame; detections lying entirely outside the frame are dropped.
    ///
    /// @param input  Image to analyse; must be non-empty and of type CV_8UC3.
    /// @return Detected faces. Empty when the license flag is invalid, the
    ///         detector is not initialized, the input is unusable, or nothing
    ///         was detected. If a std::exception is caught, the detections
    ///         collected so far are returned and the error is logged.
    std::vector<Object> ANSCNNFD::RunInference(const cv::Mat& input) {
        std::vector<Object> output;
        if (!_licenseValid) {
            if (_modelLoading.load()) return {};
            this->_logger.LogError("ANSCNNFD::RunInference", "Invalid license", __FILE__, __LINE__);
            return output;
        }
        if (!_isInitialized) {
            this->_logger.LogError("ANSCNNFD::RunInference", "Invalid model", __FILE__, __LINE__);
            return output;
        }
        // The detector indexes the pixel buffer as 3 bytes per pixel; any other
        // layout would be read out of bounds or misinterpreted.
        if (input.empty() || input.type() != CV_8UC3) {
            this->_logger.LogError("ANSCNNFD::RunInference", "Input image must be a non-empty 8-bit 3-channel image", __FILE__, __LINE__);
            return output;
        }
        try {
            const int cropBorder = 200;     // padding added around small (cropped) images
            const int croppedMaxSide = 300; // images with a side <= this are treated as cropped faces

            // We know that the image sizes <=300 px, it is likely that image is cropped for face only
            const bool croppedFace = (input.rows <= croppedMaxSide) || (input.cols <= croppedMaxSide);

            cv::Mat frame;
            if (croppedFace)
                cv::copyMakeBorder(input, frame, cropBorder, cropBorder, cropBorder, cropBorder, cv::BORDER_REPLICATE);
            else
                frame = input.clone();

            // Owned by a vector so it is released on every exit path,
            // including when an exception propagates to the handler below.
            std::vector<unsigned char> resultBuffer(DETECT_BUFFER_SIZE);

            const int* pResults = FaceDetectCNN(resultBuffer.data(), static_cast<unsigned char*>(frame.ptr(0)), frame.cols, frame.rows, static_cast<int>(frame.step));
            const int faceCount = pResults ? *pResults : 0;
            const cv::Rect frameBounds(0, 0, frame.cols, frame.rows);

            for (int i = 0; i < faceCount; i++)
            {
                // Each record is 16 shorts: [0] score*100, [1..4] x, y, w, h.
                const short* p = reinterpret_cast<const short*>(pResults + 1) + 16 * i;
                const float confidence = static_cast<float>(p[0]) / 100;
                if (!(confidence >= _modelConfig.detectionScoreThreshold))
                    continue;

                int x = p[1];
                int y = p[2];
                const int w = p[3];
                const int h = p[4];

                Object result;
                result.classId = 0;
                result.className = "Face";
                result.confidence = confidence;

                if (croppedFace) {
                    // Keep the origin inside the original image, then undo the padding offset.
                    if (x <= cropBorder) x = cropBorder;
                    if (y <= cropBorder) y = cropBorder;
                    result.box.x = x - cropBorder;
                    result.box.y = y - cropBorder;
                }
                else {
                    result.box.x = x;
                    result.box.y = y;
                }
                result.box.width = w;
                result.box.height = h;

                // Clip to the frame: an ROI that reaches past the border makes
                // cv::Mat::operator() throw, which would discard all remaining faces.
                const cv::Rect facePos = cv::Rect(cv::Point(x, y), cv::Point(x + w, y + h)) & frameBounds;
                if (facePos.width <= 0 || facePos.height <= 0)
                    continue;

                result.mask = frame(facePos).clone();
                output.push_back(result);
            }
            return output;
        }
        catch (const std::exception& e) {
            this->_logger.LogFatal("ANSCNNFD::RunInference", e.what(), __FILE__, __LINE__);
            return output;
        }
    }
   
    /// Destructor: writes an informational log entry. Any std::exception raised
    /// while logging is reported on stdout instead of escaping the destructor.
    ANSCNNFD::~ANSCNNFD()
    {
        try
        {
            this->_logger.LogInfo("ANSCNNFD::~ANSCNNFD()", "Release ANSCNNFD ", __FILE__, __LINE__);
        }
        catch (const std::exception& ex)
        {
            std::cout << "ANSCNNFD::~ANSCNNFD()" << ex.what() << std::endl;
        }
    }
    /// Logs that the detector is being released.
    /// @return true when the log call completed; false if it raised a
    ///         std::exception (reported on stdout).
    bool ANSCNNFD::Destroy()
    {
        bool released = false;
        try
        {
            this->_logger.LogInfo("ANSCNNFD::Destroy()", "Release ANSCNNFD ", __FILE__, __LINE__);
            released = true;
        }
        catch (const std::exception& ex)
        {
            std::cout << "ANSCNNFD::Destroy()" << ex.what() << std::endl;
            released = false;
        }
        return released;
    }
    // Private
    int* ANSCNNFD::FaceDetectCNN(unsigned char* result_buffer, unsigned char* rgb_image_data, int width, int height, int step) //input image, it must be BGR (three-channel) image!
    {
        try {
            if (!result_buffer)
            {
                this->_logger.LogError("ANSCNNFD::FaceDetectCNN", "Null buffer memory", __FILE__, __LINE__);
                return nullptr;
            }
            //clear memory
            result_buffer[0] = 0;
            result_buffer[1] = 0;
            result_buffer[2] = 0;
            result_buffer[3] = 0;

            std::vector<FaceRect> faces = ObjectDetectCNN(rgb_image_data, width, height, step);

            int num_faces = static_cast<int>(faces.size());
            num_faces = MIN(num_faces, 1024); //1024 = 0x9000 / (16 * 2 + 4)

            int* pCount = reinterpret_cast<int*>(result_buffer);
            pCount[0] = num_faces;

            for (int i = 0; i < num_faces; i++)
            {
                //copy data
                short* p = reinterpret_cast<short*>(result_buffer + 4) + 16 * size_t(i);
                p[0] = static_cast<short>(faces[i].score * 100);
                p[1] = static_cast<short>(faces[i].x);
                p[2] = static_cast<short>(faces[i].y);
                p[3] = static_cast<short>(faces[i].w);
                p[4] = static_cast<short>(faces[i].h);
                //copy landmarks
                for (int lmidx = 0; lmidx < 10; lmidx++)
                {
                    p[5 + lmidx] = static_cast<short>(faces[i].lm[lmidx]);
                }
            }

            return pCount;
        }
        catch (std::exception& e) {
            this->_logger.LogFatal("ANSCNNFD::FaceDetectCNN", e.what(), __FILE__, __LINE__);
            return nullptr;
        }

    }
    /// Fills g_pFilters[0 .. NUM_CONV_LAYER) from the matching entries of
    /// param_pConvInfo and records that the parameters are ready.
    void ANSCNNFD::InitParameters()
    {
        int layer = 0;
        while (layer < NUM_CONV_LAYER)
        {
            g_pFilters[layer] = param_pConvInfo[layer];
            ++layer;
        }
        _paramInitialized = true;
    }
    /// Runs the complete network forward pass on one image and returns the
    /// detections produced by DetectionOutput().
    ///
    /// Pipeline, in order: lazy parameter initialisation, image-to-blob
    /// conversion, a convolutional backbone with 2x2 max-pooling between stages,
    /// three prediction branches (each emitting cls / reg / kps / obj blobs) fed
    /// coarse-to-fine through UpsampleX2 + ElementAdd, decoding of boxes and
    /// keypoints against per-branch MeshGrid priors, sigmoid on the cls and obj
    /// scores, and finally DetectionOutput().
    ///
    /// TIME_START / TIME_END are macros defined elsewhere (presumably optional
    /// timing instrumentation — confirm). Note that two TIME_END uses near the
    /// end have no trailing semicolon.
    ///
    /// @param rgbImageData  Pointer to the first pixel; forwarded with
    ///                      imgChannels = 3.
    /// @param width         Image width in pixels.
    /// @param height        Image height in pixels.
    /// @param step          Row stride, forwarded as imgWidthStep.
    /// @return Detected faces; an empty vector if a std::exception was caught
    ///         (the error is logged via LogFatal).
    std::vector<FaceRect> ANSCNNFD::ObjectDetectCNN(const unsigned char* rgbImageData, int width, int height, int step) {
        try {
            // Make sure g_pFilters has been populated before it is used below.
            TIME_START;
            if (!_paramInitialized)
            {
                InitParameters();
            }
            TIME_END("init");


            // Convert the image into the blob consumed by the first convolution.
            TIME_START;
            auto fx = SetDataFrom3x3S2P1To1x1S1P0FromImage(rgbImageData, width, height, 3, step);
            TIME_END("convert data");

            /***************CONV0 (filters 0-2) + pool*********************/
            TIME_START;
            fx = Convolution(fx, g_pFilters[0]);
            TIME_END("conv_head");

            TIME_START;
            fx = ConvolutionDP(fx, g_pFilters[1], g_pFilters[2]);
            TIME_END("conv0");

            TIME_START;
            fx = MaxPooling2x2S2(fx);
            TIME_END("pool0");

            /***************CONV1 (filters 3-6)*********************/
            TIME_START;
            fx = Convolution4LayerUnit(fx, g_pFilters[3], g_pFilters[4], g_pFilters[5], g_pFilters[6]);
            TIME_END("conv1");

            /***************CONV2 (filters 7-10)*********************/
            TIME_START;
            fx = Convolution4LayerUnit(fx, g_pFilters[7], g_pFilters[8], g_pFilters[9], g_pFilters[10]);
            TIME_END("conv2");

            /***************pool + CONV3 (filters 11-14) -> fb1*********************/
            TIME_START;
            fx = MaxPooling2x2S2(fx);
            TIME_END("pool3");

            TIME_START;
            auto fb1 = Convolution4LayerUnit(fx, g_pFilters[11], g_pFilters[12], g_pFilters[13], g_pFilters[14]);
            TIME_END("conv3");

            /***************pool + CONV4 (filters 15-18) -> fb2*********************/
            TIME_START;
            fx = MaxPooling2x2S2(fb1);
            TIME_END("pool4");

            TIME_START;
            auto fb2 = Convolution4LayerUnit(fx, g_pFilters[15], g_pFilters[16], g_pFilters[17], g_pFilters[18]);
            TIME_END("conv4");

            /***************pool + CONV5 (filters 19-22) -> fb3*********************/
            TIME_START;
            fx = MaxPooling2x2S2(fb2);
            TIME_END("pool5");

            TIME_START;
            auto fb3 = Convolution4LayerUnit(fx, g_pFilters[19], g_pFilters[20], g_pFilters[21], g_pFilters[22]);
            TIME_END("conv5");

            // Per-branch outputs; index 0 <- fb1, 1 <- fb2, 2 <- fb3.
            CDataBlob<float> pred_reg[3], pred_cls[3], pred_kps[3], pred_obj[3];
            /***************branch5 (heads on fb3)*********************/
            // The head convolutions are called with do_relu = false.
            TIME_START;
            fb3 = ConvolutionDP(fb3, g_pFilters[27], g_pFilters[28]);
            pred_cls[2] = ConvolutionDP(fb3, g_pFilters[33], g_pFilters[34], false);
            pred_reg[2] = ConvolutionDP(fb3, g_pFilters[39], g_pFilters[40], false);
            pred_kps[2] = ConvolutionDP(fb3, g_pFilters[51], g_pFilters[52], false);
            pred_obj[2] = ConvolutionDP(fb3, g_pFilters[45], g_pFilters[46], false);
            TIME_END("branch5");

            /*****************add5: fb2 += upsampled fb3*********************/
            TIME_START;
            fb2 = ElementAdd(UpsampleX2(fb3), fb2);
            TIME_END("add5");

            /*****************branch4 (heads on fb2)*********************/
            TIME_START;
            fb2 = ConvolutionDP(fb2, g_pFilters[25], g_pFilters[26]);
            pred_cls[1] = ConvolutionDP(fb2, g_pFilters[31], g_pFilters[32], false);
            pred_reg[1] = ConvolutionDP(fb2, g_pFilters[37], g_pFilters[38], false);
            pred_kps[1] = ConvolutionDP(fb2, g_pFilters[49], g_pFilters[50], false);
            pred_obj[1] = ConvolutionDP(fb2, g_pFilters[43], g_pFilters[44], false);
            TIME_END("branch4");

            /*****************add4: fb1 += upsampled fb2*********************/
            TIME_START;
            fb1 = ElementAdd(UpsampleX2(fb2), fb1);
            TIME_END("add4");

            /***************branch3 (heads on fb1)*********************/
            TIME_START;
            fb1 = ConvolutionDP(fb1, g_pFilters[23], g_pFilters[24]);
            pred_cls[0] = ConvolutionDP(fb1, g_pFilters[29], g_pFilters[30], false);
            pred_reg[0] = ConvolutionDP(fb1, g_pFilters[35], g_pFilters[36], false);
            pred_kps[0] = ConvolutionDP(fb1, g_pFilters[47], g_pFilters[48], false);
            pred_obj[0] = ConvolutionDP(fb1, g_pFilters[41], g_pFilters[42], false);
            TIME_END("branch3");

            /***************PRIORBOX*********************/
            // One prior grid per branch, sized from that branch's feature map.
            // The last argument (8 / 16 / 32) is presumably the branch stride
            // relative to the input image — confirm against MeshGrid.
            TIME_START;
            auto prior3 = MeshGrid(fb1.cols, fb1.rows, 8);
            auto prior4 = MeshGrid(fb2.cols, fb2.rows, 16);
            auto prior5 = MeshGrid(fb3.cols, fb3.rows, 32);
            TIME_END("prior");
            /***************PRIORBOX*********************/

            // Decode box regressions and keypoints against the matching priors
            // (BboxDecode / KPSDecode return nothing that is used here; they
            // presumably update the pred_* blobs in place — confirm).
            TIME_START;
            BboxDecode(pred_reg[0], prior3, 8);
            BboxDecode(pred_reg[1], prior4, 16);
            BboxDecode(pred_reg[2], prior5, 32);

            KPSDecode(pred_kps[0], prior3, 8);
            KPSDecode(pred_kps[1], prior4, 16);
            KPSDecode(pred_kps[2], prior5, 32);

            // Flatten each branch and concatenate in branch order 0, 1, 2.
            auto cls = Concat3(Blob2Vector(pred_cls[0]), Blob2Vector(pred_cls[1]), Blob2Vector(pred_cls[2]));
            auto reg = Concat3(Blob2Vector(pred_reg[0]), Blob2Vector(pred_reg[1]), Blob2Vector(pred_reg[2]));
            auto kps = Concat3(Blob2Vector(pred_kps[0]), Blob2Vector(pred_kps[1]), Blob2Vector(pred_kps[2]));
            auto obj = Concat3(Blob2Vector(pred_obj[0]), Blob2Vector(pred_obj[1]), Blob2Vector(pred_obj[2]));

            // Sigmoid is applied to the class and objectness scores only.
            Sigmoid(cls);
            Sigmoid(obj);
            TIME_END("decode")

                TIME_START;
            // NOTE(review): 0.45f, 0.2f, 1000, 512 are presumably overlap/confidence
            // thresholds and candidate limits — confirm against DetectionOutput's declaration.
            std::vector<FaceRect> facesInfo = DetectionOutput(cls, reg, kps, obj, 0.45f, 0.2f, 1000, 512);
            TIME_END("detection output")
                return facesInfo;
        }
        catch (std::exception& e) {
            // Any failure yields an empty result plus a fatal log entry.
            std::vector<FaceRect> facesInfo;
            facesInfo.clear();
            this->_logger.LogFatal("ANSCNNFD::ObjectDetectCNN", e.what(), __FILE__, __LINE__);
            return facesInfo;
        }

    }
    /// Converts an interleaved 3-channel 8-bit image into a 32-channel float
    /// blob in which every output pixel already contains its 3x3 neighbourhood.
    ///
    /// Output size: the image height/width rounded up to a multiple of
    /// `padDivisor`, then halved. Output pixel (r, c) gathers the source pixels
    /// at rows 2r-1..2r+1 and columns 2c-1..2c+1; tap (fy, fx) with channel k is
    /// stored at index ((fy + 1) * 3 + (fx + 1)) * 3 + k, i.e. indices 0..26.
    /// Taps that fall outside the image, and channels 27..31, stay zero.
    ///
    /// @param inputData     Pointer to the first pixel of the image.
    /// @param imgWidth      Image width in pixels.
    /// @param imgHeight     Image height in pixels.
    /// @param imgChannels   Must be 3.
    /// @param imgWidthStep  Row stride in bytes.
    /// @param padDivisor    Must be 32.
    /// @return The populated blob.
    /// @note Terminates the process with exit(1) when imgChannels != 3 or
    ///       padDivisor != 32.
    CDataBlob<float> ANSCNNFD::SetDataFrom3x3S2P1To1x1S1P0FromImage(const unsigned char* inputData, int imgWidth, int imgHeight, int imgChannels, int imgWidthStep, int padDivisor) {
        if (imgChannels != 3) {
            this->_logger.LogError("ANSCNNFD::SetDataFrom3x3S2P1To1x1S1P0FromImage", "The input image must be a 3-channel RGB image", __FILE__, __LINE__);

            exit(1);
        }
        if (padDivisor != 32) {
            this->_logger.LogError("ANSCNNFD::SetDataFrom3x3S2P1To1x1S1P0FromImage", "This version need pad of 32", __FILE__, __LINE__);
            exit(1);
        }
        int rows = ((imgHeight - 1) / padDivisor + 1) * padDivisor / 2;
        int cols = ((imgWidth - 1) / padDivisor + 1) * padDivisor / 2;
        int channels = 32;
        CDataBlob<float> outBlob(rows, cols, channels);

        // The loops below skip out-of-image taps and never touch channels
        // 27..31, so those elements must start at zero. Other code in this file
        // calls setZero() on freshly constructed blobs, which suggests the
        // constructor does not clear the data itself.
        outBlob.setZero();

#if defined(_OPENMP)
#pragma omp parallel for
#endif
        for (int r = 0; r < rows; r++) {
            for (int c = 0; c < cols; c++) {
                float* pData = outBlob.ptr(r, c);
                for (int fy = -1; fy <= 1; fy++) {
                    int srcy = r * 2 + fy;

                    if (srcy < 0 || srcy >= imgHeight) //out of the range of the image
                        continue;

                    for (int fx = -1; fx <= 1; fx++) {
                        int srcx = c * 2 + fx;

                        if (srcx < 0 || srcx >= imgWidth) //out of the range of the image
                            continue;

                        const unsigned char* pImgData = inputData + size_t(imgWidthStep) * srcy + imgChannels * srcx;

                        int output_channel_offset = ((fy + 1) * 3 + fx + 1); //3x3 filters, 3-channel image
                        pData[output_channel_offset * imgChannels] = pImgData[0];
                        pData[output_channel_offset * imgChannels + 1] = pImgData[1];
                        pData[output_channel_offset * imgChannels + 2] = pImgData[2];
                    }
                }
            }
        }
        return outBlob;
    }

    // Returns the sum of p1[i] * p2[i] for i in [0, num).
    // In the SIMD builds whole vectors are consumed per iteration (16 / 8 / 4
    // floats), so the buffers must cover num rounded up to that width, and the
    // AVX paths use aligned loads: p1 and p2 must be 512-bit aligned (16 floats).
    inline float dotProduct(const float* p1, const float* p2, int num)
    {
        float sum = 0.f;

#if defined(_ENABLE_AVX512)
        __m512 acc16 = _mm512_setzero_ps();
        for (int k = 0; k < num; k += 16)
        {
            const __m512 lhs = _mm512_load_ps(p1 + k);
            const __m512 rhs = _mm512_load_ps(p2 + k);
            acc16 = _mm512_add_ps(acc16, _mm512_mul_ps(lhs, rhs));
        }
        sum = _mm512_reduce_add_ps(acc16);
#elif defined(_ENABLE_AVX2)
        __m256 acc8 = _mm256_setzero_ps();
        for (int k = 0; k < num; k += 8)
        {
            const __m256 lhs = _mm256_load_ps(p1 + k);
            const __m256 rhs = _mm256_load_ps(p2 + k);
            acc8 = _mm256_add_ps(acc8, _mm256_mul_ps(lhs, rhs));
        }
        // Two horizontal adds leave each 128-bit half's total in its lane 0.
        acc8 = _mm256_hadd_ps(acc8, acc8);
        acc8 = _mm256_hadd_ps(acc8, acc8);
        const float* const lanes = reinterpret_cast<const float*>(&acc8);
        sum = lanes[0] + lanes[4];
#elif defined(_ENABLE_NEON)
        float32x4_t acc4 = vdupq_n_f32(0);
        for (int k = 0; k < num; k += 4)
        {
            const float32x4_t lhs = vld1q_f32(p1 + k);
            const float32x4_t rhs = vld1q_f32(p2 + k);
            acc4 = vaddq_f32(acc4, vmulq_f32(lhs, rhs));
        }
        sum += vgetq_lane_f32(acc4, 0);
        sum += vgetq_lane_f32(acc4, 1);
        sum += vgetq_lane_f32(acc4, 2);
        sum += vgetq_lane_f32(acc4, 3);
#else
        int k = 0;
        while (k < num)
        {
            sum += (p1[k] * p2[k]);
            ++k;
        }
#endif

        return sum;
    }
    // Multiply-accumulate: p3[i] += p1[i] * p2[i] for i in [0, num).
    // The SIMD builds process 16 / 8 / 4 floats per step (aligned access in the
    // AVX paths), so buffers must cover num rounded up to that width.
    // Always returns true.
    inline bool vecMulAdd(const float* p1, const float* p2, float* p3, int num)
    {
#if defined(_ENABLE_AVX512)
        for (int k = 0; k < num; k += 16)
        {
            const __m512 lhs = _mm512_load_ps(p1 + k);
            const __m512 rhs = _mm512_load_ps(p2 + k);
            __m512 acc = _mm512_load_ps(p3 + k);
            acc = _mm512_add_ps(acc, _mm512_mul_ps(lhs, rhs));
            _mm512_store_ps(p3 + k, acc);
        }
#elif defined(_ENABLE_AVX2)
        for (int k = 0; k < num; k += 8)
        {
            const __m256 lhs = _mm256_load_ps(p1 + k);
            const __m256 rhs = _mm256_load_ps(p2 + k);
            __m256 acc = _mm256_load_ps(p3 + k);
            acc = _mm256_add_ps(acc, _mm256_mul_ps(lhs, rhs));
            _mm256_store_ps(p3 + k, acc);
        }
#elif defined(_ENABLE_NEON)
        for (int k = 0; k < num; k += 4)
        {
            const float32x4_t lhs = vld1q_f32(p1 + k);
            const float32x4_t rhs = vld1q_f32(p2 + k);
            float32x4_t acc = vld1q_f32(p3 + k);
            acc = vaddq_f32(acc, vmulq_f32(lhs, rhs));
            vst1q_f32(p3 + k, acc);
        }
#else
        int k = 0;
        while (k < num)
        {
            p3[k] += (p1[k] * p2[k]);
            ++k;
        }
#endif

        return true;
    }
    // In-place add: p2[i] += p1[i] for i in [0, num).
    // The SIMD builds process 16 / 8 / 4 floats per step (aligned access in the
    // AVX paths), so buffers must cover num rounded up to that width.
    // Always returns true.
    inline bool vecAdd(const float* p1, float* p2, int num)
    {
#if defined(_ENABLE_AVX512)
        for (int k = 0; k < num; k += 16)
        {
            const __m512 addend = _mm512_load_ps(p1 + k);
            const __m512 current = _mm512_load_ps(p2 + k);
            _mm512_store_ps(p2 + k, _mm512_add_ps(addend, current));
        }
#elif defined(_ENABLE_AVX2)
        for (int k = 0; k < num; k += 8)
        {
            const __m256 addend = _mm256_load_ps(p1 + k);
            const __m256 current = _mm256_load_ps(p2 + k);
            _mm256_store_ps(p2 + k, _mm256_add_ps(addend, current));
        }
#elif defined(_ENABLE_NEON)
        for (int k = 0; k < num; k += 4)
        {
            const float32x4_t addend = vld1q_f32(p1 + k);
            const float32x4_t current = vld1q_f32(p2 + k);
            vst1q_f32(p2 + k, vaddq_f32(addend, current));
        }
#else
        int k = 0;
        while (k < num)
        {
            p2[k] += p1[k];
            ++k;
        }
#endif
        return true;
    }
    // Three-operand add: p3[i] = p1[i] + p2[i] for i in [0, num).
    // The SIMD builds process 16 / 8 / 4 floats per step (aligned access in the
    // AVX paths), so buffers must cover num rounded up to that width.
    // Always returns true.
    inline bool vecAdd(const float* p1, const float* p2, float* p3, int num)
    {
#if defined(_ENABLE_AVX512)
        for (int k = 0; k < num; k += 16)
        {
            const __m512 lhs = _mm512_load_ps(p1 + k);
            const __m512 rhs = _mm512_load_ps(p2 + k);
            _mm512_store_ps(p3 + k, _mm512_add_ps(lhs, rhs));
        }
#elif defined(_ENABLE_AVX2)
        for (int k = 0; k < num; k += 8)
        {
            const __m256 lhs = _mm256_load_ps(p1 + k);
            const __m256 rhs = _mm256_load_ps(p2 + k);
            _mm256_store_ps(p3 + k, _mm256_add_ps(lhs, rhs));
        }
#elif defined(_ENABLE_NEON)
        for (int k = 0; k < num; k += 4)
        {
            const float32x4_t lhs = vld1q_f32(p1 + k);
            const float32x4_t rhs = vld1q_f32(p2 + k);
            vst1q_f32(p3 + k, vaddq_f32(lhs, rhs));
        }
#else
        int k = 0;
        while (k < num)
        {
            p3[k] = p1[k] + p2[k];
            ++k;
        }
#endif
        return true;
    }
    /// 1x1 (point-wise) convolution.
    ///
    /// For every output pixel and output channel, the result is the dot product
    /// of the input pixel's channel vector with that channel's weight vector
    /// (`filters.weights.ptr(0, ch)`), plus `filters.biases.data[ch]`.
    /// `outputData` must already be allocated; its rows/cols/channels drive the
    /// loops. Always returns true.
    bool ANSCNNFD::Convolution1x1PointWise(const CDataBlob<float>& inputData, const Filters<float>& filters, CDataBlob<float>& outputData)
    {
#if defined(_OPENMP)
#pragma omp parallel for
#endif
        for (int y = 0; y < outputData.rows; y++)
        {
            for (int x = 0; x < outputData.cols; x++)
            {
                float* const dst = outputData.ptr(y, x);
                const float* const src = inputData.ptr(y, x);
                for (int oc = 0; oc < outputData.channels; oc++)
                {
                    const float* const kernel = filters.weights.ptr(0, oc);
                    const float response = dotProduct(src, kernel, inputData.channels);
                    dst[oc] = response + filters.biases.data[oc];
                }
            }
        }
        return true;
    }
    /// 3x3 depth-wise convolution with the window clipped at the blob borders.
    ///
    /// The output is zeroed first. For each output pixel, every in-range
    /// neighbour (row-1..row+1, col-1..col+1) is multiplied channel-by-channel
    /// with the weight vector for its window position (`row_offset * 3 +
    /// col_offset`) and accumulated; the bias vector is added afterwards.
    /// Always returns true.
    bool ANSCNNFD::Convolution3x3DepthWise(const CDataBlob<float>& inputData, const Filters<float>& filters, CDataBlob<float>& outputData)
    {
        // Accumulation below relies on a cleared destination.
        outputData.setZero();
#if defined(_OPENMP)
#pragma omp parallel for
#endif
        for (int y = 0; y < outputData.rows; y++)
        {
            // Source rows [y-1, y+2) clipped to the input.
            const int rowBegin = MAX(0, y - 1);
            const int rowEnd = MIN(y + 2, inputData.rows);

            for (int x = 0; x < outputData.cols; x++)
            {
                float* const dst = outputData.ptr(y, x);

                // Source columns [x-1, x+2) clipped to the input.
                const int colBegin = MAX(0, x - 1);
                const int colEnd = MIN(x + 2, inputData.cols);

                for (int sy = rowBegin; sy < rowEnd; sy++)
                {
                    const int tapRowBase = (sy - y + 1) * 3;
                    for (int sx = colBegin; sx < colEnd; sx++)
                    {
                        const int tap = tapRowBase + (sx - x + 1);
                        vecMulAdd(inputData.ptr(sy, sx), filters.weights.ptr(0, tap), dst, filters.num_filters);
                    }
                }
                vecAdd(filters.biases.ptr(0, 0), dst, filters.num_filters);
            }
        }
        return true;
    }
    /// Applies ReLU in place: every element that is not strictly positive
    /// becomes +0.0f.
    ///
    /// The whole buffer is processed, padding included
    /// (cols * rows * channelStep / sizeof(float) elements).
    ///
    /// @param inputoutputData  Blob to rectify in place.
    /// @return false (after logging) when the blob is empty, true otherwise.
    bool ANSCNNFD::Relu(CDataBlob<float>& inputoutputData)
    {
        if (inputoutputData.isEmpty())
        {
            this->_logger.LogError("ANSCNNFD::Relu", "The input data is empty", __FILE__, __LINE__);
            return false;
        }

        // Computed in size_t so large blobs cannot overflow an int product.
        const size_t len = static_cast<size_t>(inputoutputData.cols) * inputoutputData.rows * inputoutputData.channelStep / sizeof(float);


#if defined(_ENABLE_AVX512)
        const __m512 zeros16 = _mm512_setzero_ps();
        for (size_t i = 0; i < len; i += 16)
        {
            __m512 a = _mm512_load_ps(inputoutputData.data + i);
            a = _mm512_max_ps(a, zeros16);
            _mm512_store_ps(inputoutputData.data + i, a);
        }
#elif defined(_ENABLE_AVX2)
        const __m256 zeros8 = _mm256_setzero_ps();
        for (size_t i = 0; i < len; i += 8)
        {
            __m256 a = _mm256_load_ps(inputoutputData.data + i);
            a = _mm256_max_ps(a, zeros8);
            _mm256_store_ps(inputoutputData.data + i, a);
        }
#else
        // Assign rather than multiply by (x > 0): the product yields -0.0f for
        // negative inputs and NaN for -inf, whereas the SIMD max() paths above
        // produce +0.0f in those cases.
        float* const values = inputoutputData.data;
        for (size_t i = 0; i < len; i++)
        {
            if (!(values[i] > 0))
                values[i] = 0.f;
        }
#endif

        return true;
    }
    /// Writes the intersection of two boxes into `*intersect_bbox`.
    /// When the boxes do not overlap on either axis the result is [0, 0, 0, 0];
    /// boxes that merely touch count as overlapping and give a zero-size
    /// intersection at the shared edge.
    void ANSCNNFD::IntersectBBox(const NormalizedBBox& bbox1, const NormalizedBBox& bbox2,
        NormalizedBBox* intersect_bbox)
    {
        const bool disjoint =
            bbox2.xmin > bbox1.xmax || bbox2.xmax < bbox1.xmin ||
            bbox2.ymin > bbox1.ymax || bbox2.ymax < bbox1.ymin;

        if (!disjoint)
        {
            // Largest lower bounds and smallest upper bounds on each axis.
            intersect_bbox->xmin = (std::max(bbox1.xmin, bbox2.xmin));
            intersect_bbox->ymin = (std::max(bbox1.ymin, bbox2.ymin));
            intersect_bbox->xmax = (std::min(bbox1.xmax, bbox2.xmax));
            intersect_bbox->ymax = (std::min(bbox1.ymax, bbox2.ymax));
            return;
        }

        // No intersection: report an all-zero box.
        intersect_bbox->xmin = 0;
        intersect_bbox->ymin = 0;
        intersect_bbox->xmax = 0;
        intersect_bbox->ymax = 0;
    }
    float ANSCNNFD::JaccardOverlap(const NormalizedBBox& bbox1, const NormalizedBBox& bbox2)
    {
        NormalizedBBox intersect_bbox;
        IntersectBBox(bbox1, bbox2, &intersect_bbox);
        float intersect_width, intersect_height;
        intersect_width = intersect_bbox.xmax - intersect_bbox.xmin;
        intersect_height = intersect_bbox.ymax - intersect_bbox.ymin;

        if (intersect_width > 0 && intersect_height > 0)
        {
            float intersect_size = intersect_width * intersect_height;
            float bsize1 = (bbox1.xmax - bbox1.xmin) * (bbox1.ymax - bbox1.ymin);
            float bsize2 = (bbox2.xmax - bbox2.xmin) * (bbox2.ymax - bbox2.ymin);
            return intersect_size / (bsize1 + bsize2 - intersect_size);
        }
        else
        {
            return 0.f;
        }
    }
    /// Doubles a blob's height and width by copying each input pixel into the
    /// corresponding 2x2 block of the output (all channels).
    /// Logs and terminates the process with exit(1) when the input is empty.
    CDataBlob<float> ANSCNNFD::UpsampleX2(const CDataBlob<float>& inputData) {
        if (inputData.isEmpty()) {
            this->_logger.LogError("ANSCNNFD::UpsampleX2", "The input data is empty", __FILE__, __LINE__);
            exit(1);
        }

        CDataBlob<float> outData(inputData.rows * 2, inputData.cols * 2, inputData.channels);

        for (int r = 0; r < inputData.rows; r++) {
            const int top = r * 2;
            for (int c = 0; c < inputData.cols; c++) {
                const int left = c * 2;
                const float* const src = inputData.ptr(r, c);

                // The four destination pixels that replicate this source pixel.
                float* const topLeft = outData.ptr(top, left);
                float* const topRight = outData.ptr(top, left + 1);
                float* const bottomLeft = outData.ptr(top + 1, left);
                float* const bottomRight = outData.ptr(top + 1, left + 1);

                for (int ch = 0; ch < inputData.channels; ++ch) {
                    const float value = src[ch];
                    topLeft[ch] = value;
                    topRight[ch] = value;
                    bottomLeft[ch] = value;
                    bottomRight[ch] = value;
                }
            }
        }
        return outData;
    }

    /// Element-wise sum of two blobs of identical shape, returned as a new blob.
    /// Logs and terminates the process with exit(1) when rows, cols or channels
    /// differ.
    CDataBlob<float> ANSCNNFD::ElementAdd(const CDataBlob<float>& inputData1, const CDataBlob<float>& inputData2) {
        const bool sameShape =
            inputData1.rows == inputData2.rows &&
            inputData1.cols == inputData2.cols &&
            inputData1.channels == inputData2.channels;
        if (!sameShape) {
            this->_logger.LogError("ANSCNNFD::ElementAdd", "The two input datas must be in the same shape.", __FILE__, __LINE__);
            exit(1);
        }

        CDataBlob<float> outData(inputData1.rows, inputData1.cols, inputData1.channels);
        for (int r = 0; r < inputData1.rows; r++) {
            for (int c = 0; c < inputData1.cols; c++) {
                // Add the two channel vectors of this pixel.
                vecAdd(inputData1.ptr(r, c), inputData2.ptr(r, c), outData.ptr(r, c), inputData1.channels);
            }
        }
        return outData;
    }
    /// Applies one convolution layer and, optionally, ReLU.
    ///
    /// Dispatches to Convolution1x1PointWise when the filter is flagged
    /// point-wise only, or to Convolution3x3DepthWise when flagged depth-wise
    /// only. The output has the input's rows/cols and `filters.num_filters`
    /// channels.
    ///
    /// Logs and terminates the process with exit(1) when the input, weights or
    /// biases are empty, when the channel counts disagree, or when the filter
    /// flags select neither (or both) kinds.
    CDataBlob<float> ANSCNNFD::Convolution(const CDataBlob<float>& inputData, const Filters<float>& filters, bool do_relu)
    {
        const bool missingData = inputData.isEmpty() || filters.weights.isEmpty() || filters.biases.isEmpty();
        if (missingData)
        {
            this->_logger.LogError("ANSCNNFD::Convolution", "The input data or filter data is empty.", __FILE__, __LINE__);
            exit(1);
        }

        const bool channelsMatch = (inputData.channels == filters.channels);
        if (!channelsMatch)
        {
            this->_logger.LogError("ANSCNNFD::Convolution", "The input data dimension cannot meet filters.", __FILE__, __LINE__);
            exit(1);
        }

        CDataBlob<float> outputData(inputData.rows, inputData.cols, filters.num_filters);

        const bool pointwiseOnly = filters.is_pointwise && !filters.is_depthwise;
        const bool depthwiseOnly = !filters.is_pointwise && filters.is_depthwise;
        if (pointwiseOnly)
        {
            Convolution1x1PointWise(inputData, filters, outputData);
        }
        else if (depthwiseOnly)
        {
            Convolution3x3DepthWise(inputData, filters, outputData);
        }
        else
        {
            this->_logger.LogError("ANSCNNFD::Convolution", "Unsupported filter type.", __FILE__, __LINE__);
            exit(1);
        }

        if (do_relu)
        {
            Relu(outputData);
        }
        return outputData;
    }
    /// Two chained convolutions: `filtersP` first (never followed by ReLU),
    /// then `filtersD`, with ReLU applied to the final result when `do_relu`
    /// is true.
    CDataBlob<float> ANSCNNFD::ConvolutionDP(const CDataBlob<float>& inputData,
        const Filters<float>& filtersP, const Filters<float>& filtersD, bool do_relu)
    {
        const bool noRelu = false;
        CDataBlob<float> firstStage = Convolution(inputData, filtersP, noRelu);
        CDataBlob<float> secondStage = Convolution(firstStage, filtersD, do_relu);
        return secondStage;
    }

    /// Two chained ConvolutionDP units (four convolutions in total).
    /// The first unit (`filtersP1`, `filtersD1`) always ends with ReLU; the
    /// second (`filtersP2`, `filtersD2`) ends with ReLU only when `do_relu` is
    /// true.
    CDataBlob<float> ANSCNNFD::Convolution4LayerUnit(const CDataBlob<float>& inputData,
        const Filters<float>& filtersP1, const Filters<float>& filtersD1,
        const Filters<float>& filtersP2, const Filters<float>& filtersD2, bool do_relu)
    {
        const bool alwaysRelu = true;
        CDataBlob<float> firstUnit = ConvolutionDP(inputData, filtersP1, filtersD1, alwaysRelu);
        CDataBlob<float> secondUnit = ConvolutionDP(firstUnit, filtersP2, filtersD2, do_relu);
        return secondUnit;
    }


    // Max pooling with a fixed 2x2 window and stride 2 (no other window/stride
    // is supported). For every output pixel the maximum is taken per channel
    // over the input pixels covered by the window.
    //
    // Output size: rows and cols are each computed as ceil((n - 3) / 2) + 1,
    // which works out to n / 2 (integer division), so a trailing odd row or
    // column of the input is not pooled. The channel count is unchanged.
    //
    // Error handling: an empty input, or an output with fewer than one row or
    // column, is logged and then terminates the process via exit(1).
    CDataBlob<float> ANSCNNFD::MaxPooling2x2S2(const CDataBlob<float>& inputData)
    {
        if (inputData.isEmpty())
        {
            this->_logger.LogError("ANSCNNFD::MaxPooling2x2S2", "The input data is empty.", __FILE__, __LINE__);
            exit(1);
        }
        int outputR = static_cast<int>(ceil((inputData.rows - 3.0f) / 2)) + 1;
        int outputC = static_cast<int>(ceil((inputData.cols - 3.0f) / 2)) + 1;
        int outputCH = inputData.channels;

        if (outputR < 1 || outputC < 1)
        {
            this->_logger.LogError("ANSCNNFD::MaxPooling2x2S2", "The size of the output is not correct.", __FILE__, __LINE__);
            exit(1);
        }

        CDataBlob<float> outputData(outputR, outputC, outputCH);
        outputData.setZero();

        for (int row = 0; row < outputData.rows; row++)
        {
            for (int col = 0; col < outputData.cols; col++)
            {
                // Offsets (in float elements, relative to inputData.data) of the
                // up-to-four input pixels that fall inside this output pixel's window.
                size_t inputMatOffsetsInElement[4];
                int elementCount = 0;

                // Window origin in the input, with the far edges clamped to the
                // input bounds.
                int rstart = row * 2;
                int cstart = col * 2;
                int rend = MIN(rstart + 2, inputData.rows);
                int cend = MIN(cstart + 2, inputData.cols);

                for (int fr = rstart; fr < rend; fr++)
                {
                    for (int fc = cstart; fc < cend; fc++)
                    {
                        // channelStep is divided by sizeof(float) to turn the
                        // per-pixel step into an element offset.
                        inputMatOffsetsInElement[elementCount++] = (size_t(fr) * inputData.cols + fc) * inputData.channelStep / sizeof(float);
                    }
                }

                float* pOut = outputData.ptr(row, col);
                float* pIn = inputData.data;

                // The SIMD branches below process 4 (NEON), 16 (AVX512) or 8 (AVX2)
                // channels per iteration and the AVX branches use the aligned
                // load/store intrinsics. When `channels` is not a multiple of the
                // vector width the last iteration touches elements past `channels`.
                // NOTE(review): presumably CDataBlob pads and aligns each pixel's
                // channel storage so this is safe - confirm against CDataBlob.
#if defined(_ENABLE_NEON)
                for (int ch = 0; ch < outputData.channels; ch += 4)
                {
                    float32x4_t tmp;
                    float32x4_t maxVal = vld1q_f32(pIn + ch + inputMatOffsetsInElement[0]);
                    for (int ec = 1; ec < elementCount; ec++)
                    {
                        tmp = vld1q_f32(pIn + ch + inputMatOffsetsInElement[ec]);
                        maxVal = vmaxq_f32(maxVal, tmp);
                    }
                    vst1q_f32(pOut + ch, maxVal);
                }
#elif defined(_ENABLE_AVX512)
                for (int ch = 0; ch < outputData.channels; ch += 16)
                {
                    __m512 tmp;
                    __m512 maxVal = _mm512_load_ps((__m512 const*)(pIn + ch + inputMatOffsetsInElement[0]));
                    for (int ec = 1; ec < elementCount; ec++)
                    {
                        tmp = _mm512_load_ps((__m512 const*)(pIn + ch + inputMatOffsetsInElement[ec]));
                        maxVal = _mm512_max_ps(maxVal, tmp);
                    }
                    _mm512_store_ps((__m512*)(pOut + ch), maxVal);
                }
#elif defined(_ENABLE_AVX2)
                for (int ch = 0; ch < outputData.channels; ch += 8)
                {
                    __m256 tmp;
                    __m256 maxVal = _mm256_load_ps((float const*)(pIn + ch + inputMatOffsetsInElement[0]));
                    for (int ec = 1; ec < elementCount; ec++)
                    {
                        tmp = _mm256_load_ps((float const*)(pIn + ch + inputMatOffsetsInElement[ec]));
                        maxVal = _mm256_max_ps(maxVal, tmp);
                    }
                    _mm256_store_ps(pOut + ch, maxVal);
                }
#else
                // Scalar fallback: one channel at a time.
                for (int ch = 0; ch < outputData.channels; ch++)
                {
                    float maxVal = pIn[ch + inputMatOffsetsInElement[0]];
                    for (int ec = 1; ec < elementCount; ec++)
                    {
                        maxVal = MAX(maxVal, pIn[ch + inputMatOffsetsInElement[ec]]);
                    }
                    pOut[ch] = maxVal;
                }
#endif
            }
        }
        return outputData;
    }

    // Builds a feature_height x feature_width blob with two channels per cell:
    // channel 0 holds (column * stride + offset) and channel 1 holds
    // (row * stride + offset).
    CDataBlob<float> ANSCNNFD::MeshGrid(int feature_width, int feature_height, int stride, float offset)
    {
        CDataBlob<float> grid(feature_height, feature_width, 2);

        for (int row = 0; row < feature_height; ++row)
        {
            // The row coordinate is the same for every cell of this row.
            const float rowCoord = static_cast<float>(row * stride) + offset;

            for (int col = 0; col < feature_width; ++col)
            {
                float* cell = grid.ptr(row, col);
                cell[0] = static_cast<float>(col * stride) + offset;
                cell[1] = rowCoord;
            }
        }

        return grid;
    }

    // Decodes box predictions in place against their priors.
    //
    // For every cell, channels 0/1 of bbox_pred are scaled by `stride` and
    // shifted by channels 0/1 of the matching prior to give the box centre;
    // channels 2/3 are passed through exp() and scaled by `stride` to give the
    // width/height. The cell is then overwritten with the corner form
    // (xmin, ymin, xmax, ymax).
    //
    // bbox_pred : blob with exactly 4 channels, modified in place.
    // priors    : blob with the same rows/cols as bbox_pred; channels 0/1 are read.
    // stride    : scale factor applied to the predictions.
    //
    // If the grids do not match or bbox_pred does not have 4 channels, the
    // error is logged and the function returns without touching bbox_pred:
    // decoding anyway would index cells/channels that may not exist.
    void ANSCNNFD::BboxDecode(CDataBlob<float>& bbox_pred, const CDataBlob<float>& priors, int stride) {
        if (bbox_pred.cols != priors.cols || bbox_pred.rows != priors.rows) {
            this->_logger.LogError("ANSCNNFD::BboxDecode", "Mismatch between feature map and anchor size.", __FILE__, __LINE__);
            return;
        }
        if (bbox_pred.channels != 4) {
            this->_logger.LogError("ANSCNNFD::BboxDecode", "The bbox dim must be 4.", __FILE__, __LINE__);
            return;
        }
        const float fstride = static_cast<float>(stride);
        for (int r = 0; r < bbox_pred.rows; ++r) {
            for (int c = 0; c < bbox_pred.cols; ++c) {
                float* pb = bbox_pred.ptr(r, c);
                const float* pp = priors.ptr(r, c);
                // Centre and size are computed before any write because the
                // outputs overwrite the same four slots they are derived from.
                const float cx = pb[0] * fstride + pp[0];
                const float cy = pb[1] * fstride + pp[1];
                const float w = std::exp(pb[2]) * fstride;
                const float h = std::exp(pb[3]) * fstride;
                pb[0] = cx - w / 2.f;
                pb[1] = cy - h / 2.f;
                pb[2] = cx + w / 2.f;
                pb[3] = cy + h / 2.f;
            }
        }
    }

    // Decodes keypoint predictions in place against their priors.
    //
    // The channels of kps_pred are treated as consecutive (x, y) pairs. Each
    // pair is scaled by `stride` and shifted by channels 0/1 of the prior at
    // the same cell. A grid-size mismatch or an odd channel count is logged
    // and terminates the process via exit(1).
    void ANSCNNFD::KPSDecode(CDataBlob<float>& kps_pred, const CDataBlob<float>& priors, int stride)
    {
        const bool sameGrid = (kps_pred.cols == priors.cols) && (kps_pred.rows == priors.rows);
        if (!sameGrid)
        {
            this->_logger.LogError("ANSCNNFD::KPSDecode", "Mismatch between feature map and anchor size.", __FILE__, __LINE__);
            exit(1);
        }

        const bool oddChannelCount = (kps_pred.channels & 1) != 0;
        if (oddChannelCount)
        {
            this->_logger.LogError("ANSCNNFD::KPSDecode", "The kps dim must be even.", __FILE__, __LINE__);
            exit(1);
        }

        const float scale = static_cast<float>(stride);
        const int pairCount = kps_pred.channels >> 1;

        for (int row = 0; row < kps_pred.rows; ++row)
        {
            for (int col = 0; col < kps_pred.cols; ++col)
            {
                float* points = kps_pred.ptr(row, col);
                const float* anchor = priors.ptr(row, col);

                for (int k = 0; k < pairCount; ++k)
                {
                    float* xy = points + 2 * k;
                    xy[0] = xy[0] * scale + anchor[0];
                    xy[1] = xy[1] * scale + anchor[1];
                }
            }
        }
    }

    // Concatenates three blobs along the channel axis.
    //
    // All three inputs must be non-empty and share the same rows/cols. The
    // result has the same rows/cols and, per cell, the channels of inputData1
    // followed by those of inputData2 and then inputData3. Any violated
    // precondition is logged and terminates the process via exit(1).
    template<typename T>
    CDataBlob<T> ANSCNNFD::Concat3(const CDataBlob<T>& inputData1, const CDataBlob<T>& inputData2, const CDataBlob<T>& inputData3)
    {
        const bool anyEmpty = inputData1.isEmpty() || inputData2.isEmpty() || inputData3.isEmpty();
        if (anyEmpty)
        {
            this->_logger.LogError("ANSCNNFD::Concat3", "The input data is empty.", __FILE__, __LINE__);
            exit(1);
        }

        const bool sameSize =
            (inputData1.cols == inputData2.cols) &&
            (inputData1.rows == inputData2.rows) &&
            (inputData1.cols == inputData3.cols) &&
            (inputData1.rows == inputData3.rows);
        if (!sameSize)
        {
            this->_logger.LogError("ANSCNNFD::Concat3", "The three inputs must have the same size.", __FILE__, __LINE__);
            exit(1);
        }

        const int rowCount = inputData1.rows;
        const int colCount = inputData1.cols;
        const int totalChannels = inputData1.channels + inputData2.channels + inputData3.channels;

        if (rowCount < 1 || colCount < 1 || totalChannels < 1)
        {
            this->_logger.LogError("ANSCNNFD::Concat3", "The size of the output is not correct.", __FILE__, __LINE__);
            exit(1);
        }

        CDataBlob<T> outputData(rowCount, colCount, totalChannels);

        // Byte counts of each input's per-cell channel run.
        const size_t bytes1 = sizeof(T) * inputData1.channels;
        const size_t bytes2 = sizeof(T) * inputData2.channels;
        const size_t bytes3 = sizeof(T) * inputData3.channels;

        for (int row = 0; row < outputData.rows; row++)
        {
            for (int col = 0; col < outputData.cols; col++)
            {
                // Write cursor advances through the output cell as each
                // input's channels are appended.
                T* dst = outputData.ptr(row, col);

                memcpy(dst, inputData1.ptr(row, col), bytes1);
                dst += inputData1.channels;

                memcpy(dst, inputData2.ptr(row, col), bytes2);
                dst += inputData2.channels;

                memcpy(dst, inputData3.ptr(row, col), bytes3);
            }
        }
        return outputData;
    }
    template CDataBlob<float> ANSCNNFD::Concat3(const CDataBlob<float>& inputData1, const CDataBlob<float>& inputData2, const CDataBlob<float>& inputData3);

    // Flattens a blob into a 1 x 1 blob whose channel count is
    // rows * cols * channels of the input. Cells are visited row by row, column
    // by column, and each cell's channels are copied contiguously in that
    // order. An empty input is logged and terminates the process via exit(1).
    template<typename T>
    CDataBlob<T> ANSCNNFD::Blob2Vector(const CDataBlob<T>& inputData)
    {
        if (inputData.isEmpty())
        {
            this->_logger.LogError("ANSCNNFD::Blob2Vector", "The input data is empty.", __FILE__, __LINE__);
            exit(1);
        }

        CDataBlob<T> outputData(1, 1, inputData.cols * inputData.rows * inputData.channels);

        // Size in bytes of one cell's channel run.
        const int cellBytes = static_cast<int>(inputData.channels * sizeof(T));

        T* const flat = outputData.ptr(0, 0);
        int written = 0; // elements already copied into the flat buffer

        for (int row = 0; row < inputData.rows; row++)
        {
            for (int col = 0; col < inputData.cols; col++)
            {
                memcpy(flat + written, inputData.ptr(row, col), cellBytes);
                written += inputData.channels;
            }
        }

        return outputData;
    }
    template CDataBlob<float> ANSCNNFD::Blob2Vector(const CDataBlob<float>& inputData);

    // Applies the logistic function 1 / (1 + exp(-x)) in place to every
    // channel of every cell. Each value is first clamped to
    // [-88.3762626647949, 88.3762626647949] (presumably to keep exp() within
    // float range).
    void ANSCNNFD::Sigmoid(CDataBlob<float>& inputData)
    {
        const float limit = 88.3762626647949f;

        for (int row = 0; row < inputData.rows; ++row)
        {
            for (int col = 0; col < inputData.cols; ++col)
            {
                float* cell = inputData.ptr(row, col);

                for (int k = 0; k < inputData.channels; ++k)
                {
                    float x = cell[k];
                    if (limit < x)
                    {
                        x = limit;
                    }
                    if (x < -limit)
                    {
                        x = -limit;
                    }
                    cell[k] = static_cast<float>(1.f / (1.f + exp(-x)));
                }
            }
        }
    }

    std::vector<FaceRect> ANSCNNFD::DetectionOutput(const CDataBlob<float>& cls,
        const CDataBlob<float>& reg,
        const CDataBlob<float>& kps,
        const CDataBlob<float>& obj,
        float overlap_threshold,
        float confidence_threshold,
        int top_k,
        int keep_top_k)
    {
        if (reg.isEmpty() || cls.isEmpty() || kps.isEmpty() || obj.isEmpty())//|| iou.isEmpty())
        {
            this->_logger.LogError("ANSCNNFD::DetectionOutput", "The input data is null.", __FILE__, __LINE__);
            exit(1);
        }
        if (reg.cols != 1 || reg.rows != 1 || cls.cols != 1 || cls.rows != 1 || kps.cols != 1 || kps.rows != 1 || obj.cols != 1 || obj.rows != 1) {
            this->_logger.LogError("ANSCNNFD::DetectionOutput", "Only support vector format.", __FILE__, __LINE__);

            exit(1);
        }

        if ((int)(kps.channels / obj.channels) != 10) {
            this->_logger.LogError("ANSCNNFD::DetectionOutput", "Only support 5 keypoints.", __FILE__, __LINE__);
            exit(1);
        }

        const float* pCls = cls.ptr(0, 0);
        const float* pReg = reg.ptr(0, 0);
        const float* pObj = obj.ptr(0, 0);
        const float* pKps = kps.ptr(0, 0);

        std::vector<std::pair<float, NormalizedBBox> > score_bbox_vec;
        std::vector<std::pair<float, NormalizedBBox> > final_score_bbox_vec;

        //get the candidates those are > confidence_threshold
        for (int i = 0; i < cls.channels; ++i)
        {
            float conf = std::sqrt(pCls[i] * pObj[i]);
            // float conf = pCls[i] * pObj[i];

            if (conf >= confidence_threshold)
            {
                NormalizedBBox bb;
                bb.xmin = pReg[4 * i];
                bb.ymin = pReg[4 * i + 1];
                bb.xmax = pReg[4 * i + 2];
                bb.ymax = pReg[4 * i + 3];

                //store the five landmarks
                memcpy(bb.lm, pKps + 10 * i, 10 * sizeof(float));
                score_bbox_vec.push_back(std::make_pair(conf, bb));
            }
        }

        //Sort the score pair according to the scores in descending order
        std::stable_sort(score_bbox_vec.begin(), score_bbox_vec.end(), SortScoreBBoxPairDescend);

        // Keep top_k scores if needed.
        if (top_k > -1 && size_t(top_k) < score_bbox_vec.size()) {
            score_bbox_vec.resize(top_k);
        }

        //Do NMS
        final_score_bbox_vec.clear();
        while (score_bbox_vec.size() != 0) {
            const NormalizedBBox bb1 = score_bbox_vec.front().second;
            bool keep = true;
            for (size_t k = 0; k < final_score_bbox_vec.size(); k++)
            {
                if (keep)
                {
                    const NormalizedBBox bb2 = final_score_bbox_vec[k].second;
                    float overlap = JaccardOverlap(bb1, bb2);
                    keep = (overlap <= overlap_threshold);
                }
                else
                {
                    break;
                }
            }
            if (keep) {
                final_score_bbox_vec.push_back(score_bbox_vec.front());
            }
            score_bbox_vec.erase(score_bbox_vec.begin());
        }
        if (keep_top_k > -1 && size_t(keep_top_k) < final_score_bbox_vec.size()) {
            final_score_bbox_vec.resize(keep_top_k);
        }

        //copy the results to the output blob
        int num_faces = (int)final_score_bbox_vec.size();

        std::vector<FaceRect> facesInfo;
        for (int fi = 0; fi < num_faces; fi++)
        {
            std::pair<float, NormalizedBBox> pp = final_score_bbox_vec[fi];

            FaceRect r;
            r.score = pp.first;
            r.x = int(pp.second.xmin);
            r.y = int(pp.second.ymin);
            r.w = int(pp.second.xmax - pp.second.xmin);
            r.h = int(pp.second.ymax - pp.second.ymin);
            //copy landmark data
            for (int i = 0; i < 10; ++i) {
                r.lm[i] = int(pp.second.lm[i]);
            }
            facesInfo.emplace_back(r);
        }

        return facesInfo;
    }


}