#include "CNNFaceDetector.h"
#include "Utility.h"

// Allocates `size` bytes whose address is a multiple of _MALLOC_ALIGN.
// The block is over-allocated with malloc(); the pointer returned by malloc()
// is stored in the sizeof(char*) bytes immediately preceding the aligned
// address so that FreeMemory_() can recover it.
// Returns a null pointer when malloc() fails.
void* AllocMemory(size_t size)
{
    char* ptr, * ptr0;
    ptr0 = (char*)malloc(
        (size_t)(size + _MALLOC_ALIGN * ((size >= 4096) + 1L) + sizeof(char*)));
    if (!ptr0)
        return 0;

    // align the pointer
    ptr = (char*)(((size_t)(ptr0 + sizeof(char*) + 1) + _MALLOC_ALIGN - 1) & ~(size_t)(_MALLOC_ALIGN - 1));
    // remember the raw malloc() pointer right in front of the aligned block
    *(char**)(ptr - sizeof(char*)) = ptr0;
    return ptr;
}

// Releases a block obtained from AllocMemory().
// A null pointer is ignored. A pointer that is not _MALLOC_ALIGN-aligned is
// also ignored (it cannot have been produced by AllocMemory()).
void FreeMemory_(void* ptr)
{
    try {
        if (ptr) {
            if (((size_t)ptr & (_MALLOC_ALIGN - 1)) != 0)
                return;
            // the raw malloc() pointer was stashed just before the aligned block
            free(*((char**)ptr - 1));
        }
    }
    catch (std::exception& e) {
        std::cout << "ANSCENTER::FreeMemory:" << e.what();
    }
}

namespace ANSCENTER {

    // Comparator ordering (score, box) pairs by descending score.
    bool SortScoreBBoxPairDescend(const std::pair<float, NormalizedBBox>& pair1, const std::pair<float, NormalizedBBox>& pair2)
    {
        return pair1.first > pair2.first;
    }

    // No optimisation step is performed for this detector; always returns true.
    bool ANSCNNFD::OptimizeModel(bool fp16, std::string& optimizedModelFolder)
    {
        return true;
    }

    // Nothing is loaded from the model archive here; always returns true.
    bool ANSCNNFD::LoadModel(const std::string& modelZipFilePath, const std::string& modelZipPassword)
    {
        try {
            return true;
        }
        catch (std::exception& e) {
            this->_logger.LogFatal("ANSCNNFD::LoadModel", e.what(), __FILE__, __LINE__);
            return false;
        }
    }

    // Stores the model configuration (forcing the FACEDETECT / FACEDETECTOR
    // types), copies the filter parameters via InitParameters(), reports the
    // label map "Face" through `labelMap` and marks the detector initialised.
    // The license flag is set to valid unconditionally.
    // Returns false only if an exception is raised while doing so.
    bool ANSCNNFD::Initialize(std::string licenseKey, ModelConfig modelConfig, const std::string& modelZipFilePath, const std::string& modelZipPassword, std::string& labelMap)
    {
        bool result = true;
        _licenseValid = true;
        if (!result)
            return false;
        try {
            _modelConfig = modelConfig;
            _modelConfig.modelType = ModelType::FACEDETECT;
            _modelConfig.detectionType = DetectionType::FACEDETECTOR;
            InitParameters();
            labelMap = "Face";
            _isInitialized = true;
            return true;
        }
        catch (std::exception& e) {
            this->_logger.LogFatal("ANSCNNFD::Initialize", e.what(), __FILE__, __LINE__);
            return false;
        }
    }

    // Detects faces in `input` and returns one Object per detection whose
    // confidence is at least _modelConfig.detectionScoreThreshold.
    //
    // Each Object carries classId 0, className "Face", the confidence, the
    // bounding box and, in `mask`, a deep copy of the face pixels.
    //
    // When either image dimension is <= 300 px the image is treated as an
    // already-cropped face: it is padded by 200 px on every side (replicated
    // border) before detection and the reported box origin is shifted back by
    // that padding.
    //
    // Returns an empty vector when the license is invalid, the detector is not
    // initialised, or the input is not a non-empty 8-bit 3-channel image.
    // If an exception is raised mid-way, the detections gathered so far are returned.
    std::vector<Object> ANSCNNFD::RunInference(const cv::Mat& input)
    {
        std::vector<Object> output;
        if (!_licenseValid) {
            if (_modelLoading.load())
                return {};
            this->_logger.LogError("ANSCNNFD::RunInference", "Invalid license", __FILE__, __LINE__);
            return output;
        }
        if (!_isInitialized) {
            this->_logger.LogError("ANSCNNFD::RunInference", "Invalid model", __FILE__, __LINE__);
            return output;
        }
        try {
            // The network input reader consumes exactly 3 bytes per pixel, so any
            // other layout would be read out of bounds or misinterpreted.
            if (input.empty() || input.type() != CV_8UC3) {
                this->_logger.LogError("ANSCNNFD::RunInference", "Input must be a non-empty 8-bit 3-channel (BGR) image", __FILE__, __LINE__);
                return output;
            }

            const int border = 200; // padding added around small (cropped-face) images

            // We know that the image sizes <=300 px, it is likely that image is cropped for face only
            const bool croppedFace = (input.size[0] <= 300) || (input.size[1] <= 300);

            cv::Mat frame;
            if (croppedFace)
                cv::copyMakeBorder(input, frame, border, border, border, border, cv::BORDER_REPLICATE);
            else
                frame = input; // header-only copy: pixels are only read, never written

            // Result buffer owned by a vector so it is released on every exit path,
            // including when an exception is thrown below.
            std::vector<unsigned char> buffer(DETECT_BUFFER_SIZE);

            int* pResults = FaceDetectCNN(buffer.data(), frame.ptr<unsigned char>(0), frame.cols, frame.rows, static_cast<int>(frame.step));
            const int numFaces = pResults ? *pResults : 0;
            const cv::Rect frameBounds(0, 0, frame.cols, frame.rows);

            for (int i = 0; i < numFaces; i++) {
                // 16 shorts per face, starting right after the int face counter
                const short* p = reinterpret_cast<const short*>(pResults + 1) + 16 * i;
                const float confidence = static_cast<float>(p[0]) / 100;
                if (confidence < _modelConfig.detectionScoreThreshold)
                    continue;

                int x = p[1];
                int y = p[2];
                const int w = p[3];
                const int h = p[4];

                Object result;
                result.classId = 0;
                result.className = "Face";
                result.confidence = confidence;
                result.box.x = x;
                result.box.y = y;
                if (croppedFace) {
                    // keep the origin inside the original image, then undo the padding
                    if (x <= border) x = border;
                    if (y <= border) y = border;
                    result.box.x = x - border;
                    result.box.y = y - border;
                }
                result.box.width = w;
                result.box.height = h;

                // Clip the crop rectangle to the frame: cv::Mat::operator() throws for a
                // rectangle that leaves the image, which previously aborted the whole loop.
                cv::Rect facePos(cv::Point(x, y), cv::Point(x + w, y + h));
                facePos &= frameBounds;
                if (facePos.area() > 0)
                    result.mask = frame(facePos).clone();

                output.push_back(result);
            }
            return output;
        }
        catch (const std::exception& e) {
            this->_logger.LogFatal("ANSCNNFD::RunInference", e.what(), __FILE__, __LINE__);
            return output;
        }
    }

    // Only logs the release; no resources are freed here.
    ANSCNNFD::~ANSCNNFD()
    {
        try {
            this->_logger.LogInfo("ANSCNNFD::~ANSCNNFD()", "Release ANSCNNFD ", __FILE__, __LINE__);
        }
        catch (std::exception& e) {
            std::cout << "ANSCNNFD::~ANSCNNFD()" << e.what() << std::endl;
        }
    }

    // Only logs the release. Returns false if logging throws.
    bool ANSCNNFD::Destroy()
    {
        try {
            this->_logger.LogInfo("ANSCNNFD::Destroy()", "Release ANSCNNFD ", __FILE__, __LINE__);
            return true;
        }
        catch (std::exception& e) {
            std::cout << "ANSCNNFD::Destroy()" << e.what() << std::endl;
            return false;
        }
    }

    // Private

    // Runs the detector on a raw image and serialises the faces into `result_buffer`.
    //
    // Buffer layout written here: a 4-byte int face count, followed by 16 shorts
    // per face: [0] score * 100, [1] x, [2] y, [3] w, [4] h, [5..14] the ten
    // landmark values; slot [15] is not written.
    // At most 1024 faces are stored.
    //
    // Returns `result_buffer` viewed as int* (pointing at the face count), or
    // nullptr when `result_buffer` is null or an exception is raised.
    int* ANSCNNFD::FaceDetectCNN(unsigned char* result_buffer, unsigned char* rgb_image_data, int width, int height, int step) //input image, it must be BGR (three-channel) image!
    {
        try {
            if (!result_buffer) {
                this->_logger.LogError("ANSCNNFD::FaceDetectCNN", "Null buffer memory", __FILE__, __LINE__);
                return nullptr;
            }
            //clear memory
            result_buffer[0] = 0;
            result_buffer[1] = 0;
            result_buffer[2] = 0;
            result_buffer[3] = 0;

            std::vector<FaceRect> faces = ObjectDetectCNN(rgb_image_data, width, height, step);

            int num_faces = static_cast<int>(faces.size());
            num_faces = MIN(num_faces, 1024); //1024 = 0x9000 / (16 * 2 + 4)

            int* pCount = reinterpret_cast<int*>(result_buffer);
            pCount[0] = num_faces;

            for (int i = 0; i < num_faces; i++) {
                //copy data
                short* p = reinterpret_cast<short*>(result_buffer + 4) + 16 * size_t(i);
                p[0] = static_cast<short>(faces[i].score * 100);
                p[1] = static_cast<short>(faces[i].x);
                p[2] = static_cast<short>(faces[i].y);
                p[3] = static_cast<short>(faces[i].w);
                p[4] = static_cast<short>(faces[i].h);
                //copy landmarks
                for (int lmidx = 0; lmidx < 10; lmidx++) {
                    p[5 + lmidx] = static_cast<short>(faces[i].lm[lmidx]);
                }
            }
            return pCount;
        }
        catch (std::exception& e) {
            this->_logger.LogFatal("ANSCNNFD::FaceDetectCNN", e.what(), __FILE__, __LINE__);
            return nullptr;
        }
    }

    // Copies the NUM_CONV_LAYER entries of param_pConvInfo into g_pFilters and
    // records that the parameters are ready.
    void ANSCNNFD::InitParameters()
    {
        for (int i = 0; i < NUM_CONV_LAYER; i++)
            g_pFilters[i] = param_pConvInfo[i];
        _paramInitialized = true;
    }

    // Runs the full network on a 3-channel 8-bit image (`step` bytes per row) and
    // returns the detected faces after decoding and non-maximum suppression.
    // Three prediction branches are produced, at strides 8, 16 and 32.
    // Returns an empty vector if an exception is raised.
    std::vector<FaceRect> ANSCNNFD::ObjectDetectCNN(const unsigned char* rgbImageData, int width, int height, int step)
    {
        try {
            TIME_START;
            if (!_paramInitialized) {
                InitParameters();
            }
            TIME_END("init");

            TIME_START;
            auto fx = SetDataFrom3x3S2P1To1x1S1P0FromImage(rgbImageData, width, height, 3, step);
            TIME_END("convert data");

            /***************CONV0*********************/
            TIME_START;
            fx = Convolution(fx, g_pFilters[0]);
            TIME_END("conv_head");

            TIME_START;
            fx = ConvolutionDP(fx, g_pFilters[1], g_pFilters[2]);
            TIME_END("conv0");

            TIME_START;
            fx = MaxPooling2x2S2(fx);
            TIME_END("pool0");

            /***************CONV1*********************/
            TIME_START;
            fx = Convolution4LayerUnit(fx, g_pFilters[3], g_pFilters[4], g_pFilters[5], g_pFilters[6]);
            TIME_END("conv1");

            /***************CONV2*********************/
            TIME_START;
            fx = Convolution4LayerUnit(fx, g_pFilters[7], g_pFilters[8], g_pFilters[9], g_pFilters[10]);
            TIME_END("conv2");

            /***************CONV3*********************/
            TIME_START;
            fx = MaxPooling2x2S2(fx);
            TIME_END("pool3");

            TIME_START;
            auto fb1 = Convolution4LayerUnit(fx, g_pFilters[11], g_pFilters[12], g_pFilters[13], g_pFilters[14]);
            TIME_END("conv3");

            /***************CONV4*********************/
            TIME_START;
            fx = MaxPooling2x2S2(fb1);
            TIME_END("pool4");

            TIME_START;
            auto fb2 = Convolution4LayerUnit(fx, g_pFilters[15], g_pFilters[16], g_pFilters[17], g_pFilters[18]);
            TIME_END("conv4");

            /***************CONV5*********************/
            TIME_START;
            fx = MaxPooling2x2S2(fb2);
            TIME_END("pool5");

            TIME_START;
            auto fb3 = Convolution4LayerUnit(fx, g_pFilters[19], g_pFilters[20], g_pFilters[21], g_pFilters[22]);
            TIME_END("conv5");

            CDataBlob<float> pred_reg[3], pred_cls[3], pred_kps[3], pred_obj[3];

            /***************branch5*********************/
            TIME_START;
            fb3 = ConvolutionDP(fb3, g_pFilters[27], g_pFilters[28]);
            pred_cls[2] = ConvolutionDP(fb3, g_pFilters[33], g_pFilters[34], false);
            pred_reg[2] = ConvolutionDP(fb3, g_pFilters[39], g_pFilters[40], false);
            pred_kps[2] = ConvolutionDP(fb3, g_pFilters[51], g_pFilters[52], false);
            pred_obj[2] = ConvolutionDP(fb3, g_pFilters[45], g_pFilters[46], false);
            TIME_END("branch5");

            /*****************add5*********************/
            TIME_START;
            fb2 = ElementAdd(UpsampleX2(fb3), fb2);
            TIME_END("add5");

            /***************branch4*********************/
            TIME_START;
            fb2 = ConvolutionDP(fb2, g_pFilters[25], g_pFilters[26]);
            pred_cls[1] = ConvolutionDP(fb2, g_pFilters[31], g_pFilters[32], false);
            pred_reg[1] = ConvolutionDP(fb2, g_pFilters[37], g_pFilters[38], false);
            pred_kps[1] = ConvolutionDP(fb2, g_pFilters[49], g_pFilters[50], false);
            pred_obj[1] = ConvolutionDP(fb2, g_pFilters[43], g_pFilters[44], false);
            TIME_END("branch4");

            /*****************add4*********************/
            TIME_START;
            fb1 = ElementAdd(UpsampleX2(fb2), fb1);
            TIME_END("add4");

            /***************branch3*********************/
            TIME_START;
            fb1 = ConvolutionDP(fb1, g_pFilters[23], g_pFilters[24]);
            pred_cls[0] = ConvolutionDP(fb1, g_pFilters[29], g_pFilters[30], false);
            pred_reg[0] = ConvolutionDP(fb1, g_pFilters[35], g_pFilters[36], false);
            pred_kps[0] = ConvolutionDP(fb1, g_pFilters[47], g_pFilters[48], false);
            pred_obj[0] = ConvolutionDP(fb1, g_pFilters[41], g_pFilters[42], false);
            TIME_END("branch3");

            /***************PRIORBOX*********************/
            TIME_START;
            auto prior3 = MeshGrid(fb1.cols, fb1.rows, 8);
            auto prior4 = MeshGrid(fb2.cols, fb2.rows, 16);
            auto prior5 = MeshGrid(fb3.cols, fb3.rows, 32);
            TIME_END("prior");

            /***************DECODE*********************/
            TIME_START;
            BboxDecode(pred_reg[0], prior3, 8);
            BboxDecode(pred_reg[1], prior4, 16);
            BboxDecode(pred_reg[2], prior5, 32);
            KPSDecode(pred_kps[0], prior3, 8);
            KPSDecode(pred_kps[1], prior4, 16);
            KPSDecode(pred_kps[2], prior5, 32);

            // flatten every branch and join the three strides into single vectors
            auto cls = Concat3(Blob2Vector(pred_cls[0]), Blob2Vector(pred_cls[1]), Blob2Vector(pred_cls[2]));
            auto reg = Concat3(Blob2Vector(pred_reg[0]), Blob2Vector(pred_reg[1]), Blob2Vector(pred_reg[2]));
            auto kps = Concat3(Blob2Vector(pred_kps[0]), Blob2Vector(pred_kps[1]), Blob2Vector(pred_kps[2]));
            auto obj = Concat3(Blob2Vector(pred_obj[0]), Blob2Vector(pred_obj[1]), Blob2Vector(pred_obj[2]));

            Sigmoid(cls);
            Sigmoid(obj);
            TIME_END("decode")

            TIME_START;
            std::vector<FaceRect> facesInfo = DetectionOutput(cls, reg, kps, obj, 0.45f, 0.2f, 1000, 512);
            TIME_END("detection output")
            return facesInfo;
        }
        catch (std::exception& e) {
            this->_logger.LogFatal("ANSCNNFD::ObjectDetectCNN", e.what(), __FILE__, __LINE__);
            return std::vector<FaceRect>();
        }
    }

    // Builds the network input blob from an interleaved 3-channel 8-bit image.
    //
    // Output size is half the image size after rounding each dimension up to a
    // multiple of `padDivisor`; it has 32 channels. For output cell (r, c) the
    // 3x3 neighbourhood centred on source pixel (2r, 2c) is unrolled into the
    // first 27 channels (9 taps x 3 colour values); taps that fall outside the
    // image are skipped and left at whatever value the blob was created with.
    //
    // Terminates the process (exit(1)) when imgChannels != 3 or padDivisor != 32.
    CDataBlob<float> ANSCNNFD::SetDataFrom3x3S2P1To1x1S1P0FromImage(const unsigned char* inputData, int imgWidth, int imgHeight, int imgChannels, int imgWidthStep, int padDivisor)
    {
        if (imgChannels != 3) {
            this->_logger.LogError("ANSCNNFD::SetDataFrom3x3S2P1To1x1S1P0FromImage", "The input image must be a 3-channel RGB image", __FILE__, __LINE__);
            exit(1);
        }
        if (padDivisor != 32) {
            this->_logger.LogError("ANSCNNFD::SetDataFrom3x3S2P1To1x1S1P0FromImage", "This version need pad of 32", __FILE__, __LINE__);
            exit(1);
        }
        int rows = ((imgHeight - 1) / padDivisor + 1) * padDivisor / 2;
        int cols = ((imgWidth - 1) / padDivisor + 1) * padDivisor / 2;
        int channels = 32;
        CDataBlob<float> outBlob(rows, cols, channels);

#if defined(_OPENMP)
#pragma omp parallel for
#endif
        for (int r = 0; r < rows; r++) {
            for (int c = 0; c < cols; c++) {
                float* pData = outBlob.ptr(r, c);
                for (int fy = -1; fy <= 1; fy++) {
                    int srcy = r * 2 + fy;
                    if (srcy < 0 || srcy >= imgHeight) //out of the range of the image
                        continue;
                    for (int fx = -1; fx <= 1; fx++) {
                        int srcx = c * 2 + fx;
                        if (srcx < 0 || srcx >= imgWidth) //out of the range of the image
                            continue;
                        const unsigned char* pImgData = inputData + size_t(imgWidthStep) * srcy + imgChannels * srcx;
                        int output_channel_offset = ((fy + 1) * 3 + fx + 1); //3x3 filters, 3-channel image
                        pData[output_channel_offset * imgChannels] = pImgData[0];
                        pData[output_channel_offset * imgChannels + 1] = pImgData[1];
                        pData[output_channel_offset * imgChannels + 2] = pImgData[2];
                    }
                }
            }
        }
        return outBlob;
    }

    // Dot product of two float arrays of length `num`.
    // The SIMD variants use aligned loads and step by the vector width
    // (16 / 8 / 4 floats), so they process whole vectors only.
    //p1 and p2 must be 512-bit aligned (16 float numbers)
    inline float dotProduct(const float* p1, const float* p2, int num)
    {
        float sum = 0.f;
#if defined(_ENABLE_AVX512)
        __m512 a_float_x16, b_float_x16;
        __m512 sum_float_x16 = _mm512_setzero_ps();
        for (int i = 0; i < num; i += 16) {
            a_float_x16 = _mm512_load_ps(p1 + i);
            b_float_x16 = _mm512_load_ps(p2 + i);
            sum_float_x16 = _mm512_add_ps(sum_float_x16, _mm512_mul_ps(a_float_x16, b_float_x16));
        }
        sum = _mm512_reduce_add_ps(sum_float_x16);
#elif defined(_ENABLE_AVX2)
        __m256 a_float_x8, b_float_x8;
        __m256 sum_float_x8 = _mm256_setzero_ps();
        for (int i = 0; i < num; i += 8) {
            a_float_x8 = _mm256_load_ps(p1 + i);
            b_float_x8 = _mm256_load_ps(p2 + i);
            sum_float_x8 = _mm256_add_ps(sum_float_x8, _mm256_mul_ps(a_float_x8, b_float_x8));
        }
        // two horizontal adds leave one partial sum per 128-bit half (elements 0 and 4)
        sum_float_x8 = _mm256_hadd_ps(sum_float_x8, sum_float_x8);
        sum_float_x8 = _mm256_hadd_ps(sum_float_x8, sum_float_x8);
        sum = ((float*)&sum_float_x8)[0] + ((float*)&sum_float_x8)[4];
#elif defined(_ENABLE_NEON)
        float32x4_t a_float_x4, b_float_x4;
        float32x4_t sum_float_x4;
        sum_float_x4 = vdupq_n_f32(0);
        for (int i = 0; i < num; i += 4) {
            a_float_x4 = vld1q_f32(p1 + i);
            b_float_x4 = vld1q_f32(p2 + i);
            sum_float_x4 = vaddq_f32(sum_float_x4, vmulq_f32(a_float_x4, b_float_x4));
        }
        sum += vgetq_lane_f32(sum_float_x4, 0);
        sum += vgetq_lane_f32(sum_float_x4, 1);
        sum += vgetq_lane_f32(sum_float_x4, 2);
        sum += vgetq_lane_f32(sum_float_x4, 3);
#else
        for (int i = 0; i < num; i++) {
            sum += (p1[i] * p2[i]);
        }
#endif
        return sum;
    }

    // Element-wise multiply-accumulate: p3[i] += p1[i] * p2[i] for i in [0, num).
    // Always returns true.
    inline bool vecMulAdd(const float* p1, const float* p2, float* p3, int num)
    {
#if defined(_ENABLE_AVX512)
        __m512 a_float_x16, b_float_x16, c_float_x16;
        for (int i = 0; i < num; i += 16) {
            a_float_x16 = _mm512_load_ps(p1 + i);
            b_float_x16 = _mm512_load_ps(p2 + i);
            c_float_x16 = _mm512_load_ps(p3 + i);
            c_float_x16 = _mm512_add_ps(c_float_x16, _mm512_mul_ps(a_float_x16, b_float_x16));
            _mm512_store_ps(p3 + i, c_float_x16);
        }
#elif defined(_ENABLE_AVX2)
        __m256 a_float_x8, b_float_x8, c_float_x8;
        for (int i = 0; i < num; i += 8) {
            a_float_x8 = _mm256_load_ps(p1 + i);
            b_float_x8 = _mm256_load_ps(p2 + i);
            c_float_x8 = _mm256_load_ps(p3 + i);
            c_float_x8 = _mm256_add_ps(c_float_x8, _mm256_mul_ps(a_float_x8, b_float_x8));
            _mm256_store_ps(p3 + i, c_float_x8);
        }
#elif defined(_ENABLE_NEON)
        float32x4_t a_float_x4, b_float_x4, c_float_x4;
        for (int i = 0; i < num; i += 4) {
            a_float_x4 = vld1q_f32(p1 + i);
            b_float_x4 = vld1q_f32(p2 + i);
            c_float_x4 = vld1q_f32(p3 + i);
            c_float_x4 = vaddq_f32(c_float_x4, vmulq_f32(a_float_x4, b_float_x4));
            vst1q_f32(p3 + i, c_float_x4);
        }
#else
        for (int i = 0; i < num; i++)
            p3[i] += (p1[i] * p2[i]);
#endif
        return true;
    }

    // In-place element-wise add: p2[i] += p1[i] for i in [0, num).
    // Always returns true.
    inline bool vecAdd(const float* p1, float* p2, int num)
    {
#if defined(_ENABLE_AVX512)
        __m512 a_float_x16, b_float_x16;
        for (int i = 0; i < num; i += 16) {
            a_float_x16 = _mm512_load_ps(p1 + i);
            b_float_x16 = _mm512_load_ps(p2 + i);
            b_float_x16 = _mm512_add_ps(a_float_x16, b_float_x16);
            _mm512_store_ps(p2 + i, b_float_x16);
        }
#elif defined(_ENABLE_AVX2)
        __m256 a_float_x8, b_float_x8;
        for (int i = 0; i < num; i += 8) {
            a_float_x8 = _mm256_load_ps(p1 + i);
            b_float_x8 = _mm256_load_ps(p2 + i);
            b_float_x8 = _mm256_add_ps(a_float_x8, b_float_x8);
            _mm256_store_ps(p2 + i, b_float_x8);
        }
#elif defined(_ENABLE_NEON)
        float32x4_t a_float_x4, b_float_x4, c_float_x4;
        for (int i = 0; i < num; i += 4) {
            a_float_x4 = vld1q_f32(p1 + i);
            b_float_x4 = vld1q_f32(p2 + i);
            c_float_x4 = vaddq_f32(a_float_x4, b_float_x4);
            vst1q_f32(p2 + i, c_float_x4);
        }
#else
        for (int i = 0; i < num; i++) {
            p2[i] += p1[i];
        }
#endif
        return true;
    }

    // Element-wise add into a separate destination: p3[i] = p1[i] + p2[i]
    // for i in [0, num). Always returns true.
    inline bool vecAdd(const float* p1, const float* p2, float* p3, int num)
    {
#if defined(_ENABLE_AVX512)
        __m512 a_float_x16, b_float_x16;
        for (int i = 0; i < num; i += 16) {
            a_float_x16 = _mm512_load_ps(p1 + i);
            b_float_x16 = _mm512_load_ps(p2 + i);
            b_float_x16 = _mm512_add_ps(a_float_x16, b_float_x16);
            _mm512_store_ps(p3 + i, b_float_x16);
        }
#elif defined(_ENABLE_AVX2)
        __m256 a_float_x8, b_float_x8;
        for (int i = 0; i < num; i += 8) {
            a_float_x8 = _mm256_load_ps(p1 + i);
            b_float_x8 = _mm256_load_ps(p2 + i);
            b_float_x8 = _mm256_add_ps(a_float_x8, b_float_x8);
            _mm256_store_ps(p3 + i, b_float_x8);
        }
#elif defined(_ENABLE_NEON)
        float32x4_t a_float_x4, b_float_x4, c_float_x4;
        for (int i = 0; i < num; i += 4) {
            a_float_x4 = vld1q_f32(p1 + i);
            b_float_x4 = vld1q_f32(p2 + i);
            c_float_x4 = vaddq_f32(a_float_x4, b_float_x4);
            vst1q_f32(p3 + i, c_float_x4);
        }
#else
        for (int i = 0; i < num; i++) {
            p3[i] = p1[i] + p2[i];
        }
#endif
        return true;
    }

    // 1x1 (point-wise) convolution: every output channel at (row, col) is the dot
    // product of the input vector at (row, col) with that channel's weights,
    // plus the channel bias. `outputData` must already have its final shape.
    // Always returns true.
    bool ANSCNNFD::Convolution1x1PointWise(const CDataBlob<float>& inputData, const Filters<float>& filters, CDataBlob<float>& outputData)
    {
#if defined(_OPENMP)
#pragma omp parallel for
#endif
        for (int row = 0; row < outputData.rows; row++) {
            for (int col = 0; col < outputData.cols; col++) {
                float* pOut = outputData.ptr(row, col);
                const float* pIn = inputData.ptr(row, col);
                for (int ch = 0; ch < outputData.channels; ch++) {
                    const float* pF = filters.weights.ptr(0, ch);
                    pOut[ch] = dotProduct(pIn, pF, inputData.channels);
                    pOut[ch] += filters.biases.data[ch];
                }
            }
        }
        return true;
    }

    // 3x3 depth-wise convolution (stride 1, implicit zero padding of 1): each
    // channel is filtered independently; neighbours outside the input are skipped.
    // The per-channel bias is added afterwards. `outputData` must already have
    // its final shape. Always returns true.
    bool ANSCNNFD::Convolution3x3DepthWise(const CDataBlob<float>& inputData, const Filters<float>& filters, CDataBlob<float>& outputData)
    {
        //set all elements in outputData to zeros
        outputData.setZero();
#if defined(_OPENMP)
#pragma omp parallel for
#endif
        for (int row = 0; row < outputData.rows; row++) {
            int srcy_start = row - 1;
            int srcy_end = srcy_start + 3;
            srcy_start = MAX(0, srcy_start);
            srcy_end = MIN(srcy_end, inputData.rows);
            for (int col = 0; col < outputData.cols; col++) {
                float* pOut = outputData.ptr(row, col);
                int srcx_start = col - 1;
                int srcx_end = srcx_start + 3;
                srcx_start = MAX(0, srcx_start);
                srcx_end = MIN(srcx_end, inputData.cols);
                for (int r = srcy_start; r < srcy_end; r++)
                    for (int c = srcx_start; c < srcx_end; c++) {
                        // position of (r, c) inside the 3x3 kernel
                        int filter_r = r - row + 1;
                        int filter_c = c - col + 1;
                        int filter_idx = filter_r * 3 + filter_c;
                        vecMulAdd(inputData.ptr(r, c), filters.weights.ptr(0, filter_idx), pOut, filters.num_filters);
                    }
                vecAdd(filters.biases.ptr(0, 0), pOut, filters.num_filters);
            }
        }
        return true;
    }

    // In-place ReLU over the whole blob storage (padding included).
    // Returns false, after logging, when the blob is empty.
    bool ANSCNNFD::Relu(CDataBlob<float>& inputoutputData)
    {
        if (inputoutputData.isEmpty()) {
            this->_logger.LogError("ANSCNNFD::Relu", "The input data is empty", __FILE__, __LINE__);
            return false;
        }
        // channelStep is in bytes, so this is the number of floats in the buffer
        int len = inputoutputData.cols * inputoutputData.rows * inputoutputData.channelStep / sizeof(float);
#if defined(_ENABLE_AVX512)
        __m512 a, bzeros;
        bzeros = _mm512_setzero_ps(); //zeros
        for (int i = 0; i < len; i += 16) {
            a = _mm512_load_ps(inputoutputData.data + i);
            a = _mm512_max_ps(a, bzeros);
            _mm512_store_ps(inputoutputData.data + i, a);
        }
#elif defined(_ENABLE_AVX2)
        __m256 a, bzeros;
        bzeros = _mm256_setzero_ps(); //zeros
        for (int i = 0; i < len; i += 8) {
            a = _mm256_load_ps(inputoutputData.data + i);
            a = _mm256_max_ps(a, bzeros);
            _mm256_store_ps(inputoutputData.data + i, a);
        }
#else
        for (int i = 0; i < len; i++)
            inputoutputData.data[i] *= (inputoutputData.data[i] > 0);
#endif
        return true;
    }

    // Writes the intersection of the two boxes into `intersect_bbox`,
    // or [0, 0, 0, 0] when they do not overlap.
    void ANSCNNFD::IntersectBBox(const NormalizedBBox& bbox1, const NormalizedBBox& bbox2, NormalizedBBox* intersect_bbox)
    {
        if (bbox2.xmin > bbox1.xmax || bbox2.xmax < bbox1.xmin || bbox2.ymin > bbox1.ymax || bbox2.ymax < bbox1.ymin) {
            // Return [0, 0, 0, 0] if there is no intersection.
            intersect_bbox->xmin = 0;
            intersect_bbox->ymin = 0;
            intersect_bbox->xmax = 0;
            intersect_bbox->ymax = 0;
        }
        else {
            intersect_bbox->xmin = (std::max(bbox1.xmin, bbox2.xmin));
            intersect_bbox->ymin = (std::max(bbox1.ymin, bbox2.ymin));
            intersect_bbox->xmax = (std::min(bbox1.xmax, bbox2.xmax));
            intersect_bbox->ymax = (std::min(bbox1.ymax, bbox2.ymax));
        }
    }

    // Intersection-over-union of two boxes; 0 when the intersection has no area.
    float ANSCNNFD::JaccardOverlap(const NormalizedBBox& bbox1, const NormalizedBBox& bbox2)
    {
        NormalizedBBox intersect_bbox;
        IntersectBBox(bbox1, bbox2, &intersect_bbox);
        float intersect_width, intersect_height;
        intersect_width = intersect_bbox.xmax - intersect_bbox.xmin;
        intersect_height = intersect_bbox.ymax - intersect_bbox.ymin;
        if (intersect_width > 0 && intersect_height > 0) {
            float intersect_size = intersect_width * intersect_height;
            float bsize1 = (bbox1.xmax - bbox1.xmin) * (bbox1.ymax - bbox1.ymin);
            float bsize2 = (bbox2.xmax - bbox2.xmin) * (bbox2.ymax - bbox2.ymin);
            return intersect_size / (bsize1 + bsize2 - intersect_size);
        }
        else {
            return 0.f;
        }
    }

    // Nearest-neighbour 2x upsampling: every input cell is replicated into a
    // 2x2 block of the output. Terminates the process (exit(1)) on empty input.
    CDataBlob<float> ANSCNNFD::UpsampleX2(const CDataBlob<float>& inputData)
    {
        if (inputData.isEmpty()) {
            this->_logger.LogError("ANSCNNFD::UpsampleX2", "The input data is empty", __FILE__, __LINE__);
            exit(1);
        }
        CDataBlob<float> outData(inputData.rows * 2, inputData.cols * 2, inputData.channels);
        for (int r = 0; r < inputData.rows; r++) {
            for (int c = 0; c < inputData.cols; c++) {
                const float* pIn = inputData.ptr(r, c);
                int outr = r * 2;
                int outc = c * 2;
                for (int ch = 0; ch < inputData.channels; ++ch) {
                    outData.ptr(outr, outc)[ch] = pIn[ch];
                    outData.ptr(outr, outc + 1)[ch] = pIn[ch];
                    outData.ptr(outr + 1, outc)[ch] = pIn[ch];
                    outData.ptr(outr + 1, outc + 1)[ch] = pIn[ch];
                }
            }
        }
        return outData;
    }

    // Element-wise sum of two blobs of identical shape.
    // Terminates the process (exit(1)) when the shapes differ.
    CDataBlob<float> ANSCNNFD::ElementAdd(const CDataBlob<float>& inputData1, const CDataBlob<float>& inputData2)
    {
        if (inputData1.rows != inputData2.rows || inputData1.cols != inputData2.cols || inputData1.channels != inputData2.channels) {
            this->_logger.LogError("ANSCNNFD::ElementAdd", "The two input datas must be in the same shape.", __FILE__, __LINE__);
            exit(1);
        }
        CDataBlob<float> outData(inputData1.rows, inputData1.cols, inputData1.channels);
        for (int r = 0; r < inputData1.rows; r++) {
            for (int c = 0; c < inputData1.cols; c++) {
                const float* pIn1 = inputData1.ptr(r, c);
                const float* pIn2 = inputData2.ptr(r, c);
                float* pOut = outData.ptr(r, c);
                vecAdd(pIn1, pIn2, pOut, inputData1.channels);
            }
        }
        return outData;
    }

    // Applies one convolution layer, dispatching on the filter flags to the
    // point-wise (1x1) or depth-wise (3x3) implementation, optionally followed
    // by ReLU. The output has the input's rows/cols and filters.num_filters channels.
    // Terminates the process (exit(1)) on empty data, a channel-count mismatch
    // or an unsupported filter type.
    CDataBlob<float> ANSCNNFD::Convolution(const CDataBlob<float>& inputData, const Filters<float>& filters, bool do_relu)
    {
        if (inputData.isEmpty() || filters.weights.isEmpty() || filters.biases.isEmpty()) {
            this->_logger.LogError("ANSCNNFD::Convolution", "The input data or filter data is empty.", __FILE__, __LINE__);
            exit(1);
        }
        if (inputData.channels != filters.channels) {
            this->_logger.LogError("ANSCNNFD::Convolution", "The input data dimension cannot meet filters.", __FILE__, __LINE__);
            exit(1);
        }
        CDataBlob<float> outputData(inputData.rows, inputData.cols, filters.num_filters);
        if (filters.is_pointwise && !filters.is_depthwise)
            Convolution1x1PointWise(inputData, filters, outputData);
        else if (!filters.is_pointwise && filters.is_depthwise)
            Convolution3x3DepthWise(inputData, filters, outputData);
        else {
            this->_logger.LogError("ANSCNNFD::Convolution", "Unsupported filter type.", __FILE__, __LINE__);
            exit(1);
        }
        if (do_relu)
            Relu(outputData);
        return outputData;
    }

    // Two chained convolutions: `filtersP` without ReLU, then `filtersD` with
    // ReLU applied only when `do_relu` is set.
    CDataBlob<float> ANSCNNFD::ConvolutionDP(const CDataBlob<float>& inputData, const Filters<float>& filtersP, const Filters<float>& filtersD, bool do_relu)
    {
        CDataBlob<float> tmp = Convolution(inputData, filtersP, false);
        CDataBlob<float> out = Convolution(tmp, filtersD, do_relu);
        return out;
    }

    // Two chained ConvolutionDP units: the first always ends with ReLU, the
    // second only when `do_relu` is set.
    CDataBlob<float> ANSCNNFD::Convolution4LayerUnit(const CDataBlob<float>& inputData, const Filters<float>& filtersP1, const Filters<float>& filtersD1, const Filters<float>& filtersP2, const Filters<float>& filtersD2, bool do_relu)
    {
        CDataBlob<float> tmp = ConvolutionDP(inputData, filtersP1, filtersD1, true);
        CDataBlob<float> out = ConvolutionDP(tmp, filtersP2, filtersD2, do_relu);
        return out;
    }

    // 2x2 max pooling with stride 2. Windows at the bottom/right edge are
    // truncated to the cells that exist in the input.
    // Terminates the process (exit(1)) on empty input or a non-positive output size.
    //only 2X2 S2 is supported
    CDataBlob<float> ANSCNNFD::MaxPooling2x2S2(const CDataBlob<float>& inputData)
    {
        if (inputData.isEmpty()) {
            this->_logger.LogError("ANSCNNFD::MaxPooling2x2S2", "The input data is empty.", __FILE__, __LINE__);
            exit(1);
        }
        int outputR = static_cast<int>(ceil((inputData.rows - 3.0f) / 2)) + 1;
        int outputC = static_cast<int>(ceil((inputData.cols - 3.0f) / 2)) + 1;
        int outputCH = inputData.channels;
        if (outputR < 1 || outputC < 1) {
            this->_logger.LogError("ANSCNNFD::MaxPooling2x2S2", "The size of the output is not correct.", __FILE__, __LINE__);
            exit(1);
        }
        CDataBlob<float> outputData(outputR, outputC, outputCH);
        outputData.setZero();

        for (int row = 0; row < outputData.rows; row++) {
            for (int col = 0; col < outputData.cols; col++) {
                // float offsets (from inputData.data) of the up-to-4 cells in this window
                size_t inputMatOffsetsInElement[4];
                int elementCount = 0;
                int rstart = row * 2;
                int cstart = col * 2;
                int rend = MIN(rstart + 2, inputData.rows);
                int cend = MIN(cstart + 2, inputData.cols);
                for (int fr = rstart; fr < rend; fr++) {
                    for (int fc = cstart; fc < cend; fc++) {
                        inputMatOffsetsInElement[elementCount++] = (size_t(fr) * inputData.cols + fc) * inputData.channelStep / sizeof(float);
                    }
                }
                float* pOut = outputData.ptr(row, col);
                float* pIn = inputData.data;
#if defined(_ENABLE_NEON)
                for (int ch = 0; ch < outputData.channels; ch += 4) {
                    float32x4_t tmp;
                    float32x4_t maxVal = vld1q_f32(pIn + ch + inputMatOffsetsInElement[0]);
                    for (int ec = 1; ec < elementCount; ec++) {
                        tmp = vld1q_f32(pIn + ch + inputMatOffsetsInElement[ec]);
                        maxVal = vmaxq_f32(maxVal, tmp);
                    }
                    vst1q_f32(pOut + ch, maxVal);
                }
#elif defined(_ENABLE_AVX512)
                for (int ch = 0; ch < outputData.channels; ch += 16) {
                    __m512 tmp;
                    __m512 maxVal = _mm512_load_ps((__m512 const*)(pIn + ch + inputMatOffsetsInElement[0]));
                    for (int ec = 1; ec < elementCount; ec++) {
                        tmp = _mm512_load_ps((__m512 const*)(pIn + ch + inputMatOffsetsInElement[ec]));
                        maxVal = _mm512_max_ps(maxVal, tmp);
                    }
                    _mm512_store_ps((__m512*)(pOut + ch), maxVal);
                }
#elif defined(_ENABLE_AVX2)
                for (int ch = 0; ch < outputData.channels; ch += 8) {
                    __m256 tmp;
                    __m256 maxVal = _mm256_load_ps((float const*)(pIn + ch + inputMatOffsetsInElement[0]));
                    for (int ec = 1; ec < elementCount; ec++) {
                        tmp = _mm256_load_ps((float const*)(pIn + ch + inputMatOffsetsInElement[ec]));
                        maxVal = _mm256_max_ps(maxVal, tmp);
                    }
                    _mm256_store_ps(pOut + ch, maxVal);
                }
#else
                for (int ch = 0; ch < outputData.channels; ch++) {
                    float maxVal = pIn[ch + inputMatOffsetsInElement[0]];
                    for (int ec = 1; ec < elementCount; ec++) {
                        maxVal = MAX(maxVal, pIn[ch + inputMatOffsetsInElement[ec]]);
                    }
                    pOut[ch] = maxVal;
                }
#endif
            }
        }
        return outputData;
    }

    // Builds a feature_height x feature_width x 2 grid whose cell (r, c) holds
    // (c * stride + offset, r * stride + offset).
    CDataBlob<float> ANSCNNFD::MeshGrid(int feature_width, int feature_height, int stride, float offset)
    {
        CDataBlob<float> out(feature_height, feature_width, 2);
        for (int r = 0; r < feature_height; ++r) {
            float rx = (float)(r * stride) + offset;
            for (int c = 0; c < feature_width; ++c) {
                float* p = out.ptr(r, c);
                p[0] = (float)(c * stride) + offset;
                p[1] = rx;
            }
        }
        return out;
    }

    // Decodes box regressions in place. Each cell goes from
    // (dx, dy, log-w, log-h) to corner form (xmin, ymin, xmax, ymax):
    // the centre is prior + d * stride, the size is exp(log) * stride.
    //
    // Terminates the process (exit(1)) when the prediction and prior grids
    // differ in size or the prediction does not have exactly 4 channels.
    // Previously these conditions were only logged and decoding continued,
    // reading and writing outside the blobs; the handling now matches KPSDecode.
    void ANSCNNFD::BboxDecode(CDataBlob<float>& bbox_pred, const CDataBlob<float>& priors, int stride)
    {
        if (bbox_pred.cols != priors.cols || bbox_pred.rows != priors.rows) {
            this->_logger.LogError("ANSCNNFD::BboxDecode", "Mismatch between feature map and anchor size.", __FILE__, __LINE__);
            exit(1);
        }
        if (bbox_pred.channels != 4) {
            this->_logger.LogError("ANSCNNFD::BboxDecode", "The bbox dim must be 4.", __FILE__, __LINE__);
            exit(1);
        }
        float fstride = (float)stride;
        for (int r = 0; r < bbox_pred.rows; ++r) {
            for (int c = 0; c < bbox_pred.cols; ++c) {
                float* pb = bbox_pred.ptr(r, c);
                const float* pp = priors.ptr(r, c);
                float cx = pb[0] * fstride + pp[0];
                float cy = pb[1] * fstride + pp[1];
                float w = std::exp(pb[2]) * fstride;
                float h = std::exp(pb[3]) * fstride;
                pb[0] = cx - w / 2.f;
                pb[1] = cy - h / 2.f;
                pb[2] = cx + w / 2.f;
                pb[3] = cy + h / 2.f;
            }
        }
    }

    // Decodes key-point regressions in place: every (x, y) pair becomes
    // prior + value * stride.
    // Terminates the process (exit(1)) when the grids differ in size or the
    // channel count is odd.
    void ANSCNNFD::KPSDecode(CDataBlob<float>& kps_pred, const CDataBlob<float>& priors, int stride)
    {
        if (kps_pred.cols != priors.cols || kps_pred.rows != priors.rows) {
            this->_logger.LogError("ANSCNNFD::KPSDecode", "Mismatch between feature map and anchor size.", __FILE__, __LINE__);
            exit(1);
        }
        if (kps_pred.channels & 1) {
            this->_logger.LogError("ANSCNNFD::KPSDecode", "The kps dim must be even.", __FILE__, __LINE__);
            exit(1);
        }
        float fstride = (float)stride;
        int num_points = kps_pred.channels >> 1;
        for (int r = 0; r < kps_pred.rows; ++r) {
            for (int c = 0; c < kps_pred.cols; ++c) {
                float* pb = kps_pred.ptr(r, c);
                const float* pp = priors.ptr(r, c);
                for (int n = 0; n < num_points; ++n) {
                    pb[2 * n] = pb[2 * n] * fstride + pp[0];
                    pb[2 * n + 1] = pb[2 * n + 1] * fstride + pp[1];
                }
            }
        }
    }

    // Concatenates three blobs of equal rows/cols along the channel axis,
    // in argument order.
    // Terminates the process (exit(1)) on empty input, differing rows/cols or
    // a non-positive output size.
    template<typename T>
    CDataBlob<T> ANSCNNFD::Concat3(const CDataBlob<T>& inputData1, const CDataBlob<T>& inputData2, const CDataBlob<T>& inputData3)
    {
        if ((inputData1.isEmpty()) || (inputData2.isEmpty()) || (inputData3.isEmpty())) {
            this->_logger.LogError("ANSCNNFD::Concat3", "The input data is empty.", __FILE__, __LINE__);
            exit(1);
        }
        if ((inputData1.cols != inputData2.cols) || (inputData1.rows != inputData2.rows) || (inputData1.cols != inputData3.cols) || (inputData1.rows != inputData3.rows)) {
            this->_logger.LogError("ANSCNNFD::Concat3", "The three inputs must have the same size.", __FILE__, __LINE__);
            exit(1);
        }
        int outputR = inputData1.rows;
        int outputC = inputData1.cols;
        int outputCH = inputData1.channels + inputData2.channels + inputData3.channels;
        if (outputR < 1 || outputC < 1 || outputCH < 1) {
            this->_logger.LogError("ANSCNNFD::Concat3", "The size of the output is not correct.", __FILE__, __LINE__);
            exit(1);
        }
        CDataBlob<T> outputData(outputR, outputC, outputCH);
        for (int row = 0; row < outputData.rows; row++) {
            for (int col = 0; col < outputData.cols; col++) {
                T* pOut = outputData.ptr(row, col);
                const T* pIn1 = inputData1.ptr(row, col);
                const T* pIn2 = inputData2.ptr(row, col);
                const T* pIn3 = inputData3.ptr(row, col);
                memcpy(pOut, pIn1, sizeof(T) * inputData1.channels);
                memcpy(pOut + inputData1.channels, pIn2, sizeof(T) * inputData2.channels);
                memcpy(pOut + inputData1.channels + inputData2.channels, pIn3, sizeof(T) * inputData3.channels);
            }
        }
        return outputData;
    }
    template CDataBlob<float> ANSCNNFD::Concat3(const CDataBlob<float>& inputData1, const CDataBlob<float>& inputData2, const CDataBlob<float>& inputData3);

    // Flattens a rows x cols x channels blob into a 1 x 1 x (rows*cols*channels)
    // blob, visiting cells row by row and copying each cell's channels in order.
    // Terminates the process (exit(1)) on empty input.
    template<typename T>
    CDataBlob<T> ANSCNNFD::Blob2Vector(const CDataBlob<T>& inputData)
    {
        if (inputData.isEmpty()) {
            this->_logger.LogError("ANSCNNFD::Blob2Vector", "The input data is empty.", __FILE__, __LINE__);
            exit(1);
        }
        CDataBlob<T> outputData(1, 1, inputData.cols * inputData.rows * inputData.channels);
        int bytesOfAChannel = inputData.channels * sizeof(T);
        T* pOut = outputData.ptr(0, 0);
        for (int row = 0; row < inputData.rows; row++) {
            for (int col = 0; col < inputData.cols; col++) {
                const T* pIn = inputData.ptr(row, col);
                memcpy(pOut, pIn, bytesOfAChannel);
                pOut += inputData.channels;
            }
        }
        return outputData;
    }
    template CDataBlob<float> ANSCNNFD::Blob2Vector(const CDataBlob<float>& inputData);

    // In-place logistic sigmoid over every channel of every cell.
    // The argument is clamped to +/-88.3762626647949 before exp() is applied.
    void ANSCNNFD::Sigmoid(CDataBlob<float>& inputData)
    {
        for (int r = 0; r < inputData.rows; ++r) {
            for (int c = 0; c < inputData.cols; ++c) {
                float* pIn = inputData.ptr(r, c);
                for (int ch = 0; ch < inputData.channels; ++ch) {
                    float v = pIn[ch];
                    v = std::min(v, 88.3762626647949f);
                    v = std::max(v, -88.3762626647949f);
                    pIn[ch] = static_cast<float>(1.f / (1.f + exp(-v)));
                }
            }
        }
    }

    // Turns the flattened network outputs into the final face list.
    //
    // Steps: score every candidate as sqrt(cls * obj); keep those with score >=
    // `confidence_threshold`; sort by descending score (stable); keep the best
    // `top_k` (if top_k > -1); run greedy non-maximum suppression, dropping a
    // candidate unless its IoU with every already-kept box is <=
    // `overlap_threshold`; finally keep the first `keep_top_k` survivors
    // (if keep_top_k > -1). Coordinates and landmarks are truncated to int.
    //
    // All four inputs must be non-empty 1x1 vectors and kps must carry 10 values
    // per candidate; otherwise the process is terminated (exit(1)).
    std::vector<FaceRect> ANSCNNFD::DetectionOutput(const CDataBlob<float>& cls, const CDataBlob<float>& reg, const CDataBlob<float>& kps, const CDataBlob<float>& obj, float overlap_threshold, float confidence_threshold, int top_k, int keep_top_k)
    {
        if (reg.isEmpty() || cls.isEmpty() || kps.isEmpty() || obj.isEmpty())//|| iou.isEmpty())
        {
            this->_logger.LogError("ANSCNNFD::DetectionOutput", "The input data is null.", __FILE__, __LINE__);
            exit(1);
        }
        if (reg.cols != 1 || reg.rows != 1 || cls.cols != 1 || cls.rows != 1 || kps.cols != 1 || kps.rows != 1 || obj.cols != 1 || obj.rows != 1) {
            this->_logger.LogError("ANSCNNFD::DetectionOutput", "Only support vector format.", __FILE__, __LINE__);
            exit(1);
        }
        if ((int)(kps.channels / obj.channels) != 10) {
            this->_logger.LogError("ANSCNNFD::DetectionOutput", "Only support 5 keypoints.", __FILE__, __LINE__);
            exit(1);
        }
        const float* pCls = cls.ptr(0, 0);
        const float* pReg = reg.ptr(0, 0);
        const float* pObj = obj.ptr(0, 0);
        const float* pKps = kps.ptr(0, 0);

        std::vector<std::pair<float, NormalizedBBox> > score_bbox_vec;
        std::vector<std::pair<float, NormalizedBBox> > final_score_bbox_vec;

        //get the candidates those are > confidence_threshold
        for (int i = 0; i < cls.channels; ++i) {
            float conf = std::sqrt(pCls[i] * pObj[i]);
            // float conf = pCls[i] * pObj[i];
            if (conf >= confidence_threshold) {
                NormalizedBBox bb;
                bb.xmin = pReg[4 * i];
                bb.ymin = pReg[4 * i + 1];
                bb.xmax = pReg[4 * i + 2];
                bb.ymax = pReg[4 * i + 3];
                //store the five landmarks
                memcpy(bb.lm, pKps + 10 * i, 10 * sizeof(float));
                score_bbox_vec.push_back(std::make_pair(conf, bb));
            }
        }

        //Sort the score pair according to the scores in descending order
        std::stable_sort(score_bbox_vec.begin(), score_bbox_vec.end(), SortScoreBBoxPairDescend);

        // Keep top_k scores if needed.
        if (top_k > -1 && size_t(top_k) < score_bbox_vec.size()) {
            score_bbox_vec.resize(top_k);
        }

        //Do NMS
        // Candidates are visited in score order. Walking the vector replaces the
        // former "erase the front element each round" loop, which was quadratic
        // in the number of candidates; the selected boxes are the same.
        for (const auto& candidate : score_bbox_vec) {
            const NormalizedBBox& bb1 = candidate.second;
            bool keep = true;
            for (const auto& kept : final_score_bbox_vec) {
                const float overlap = JaccardOverlap(bb1, kept.second);
                // written as !(a <= b) so a NaN overlap still rejects, as before
                if (!(overlap <= overlap_threshold)) {
                    keep = false;
                    break;
                }
            }
            if (keep) {
                final_score_bbox_vec.push_back(candidate);
            }
        }

        if (keep_top_k > -1 && size_t(keep_top_k) < final_score_bbox_vec.size()) {
            final_score_bbox_vec.resize(keep_top_k);
        }

        //copy the results to the output blob
        int num_faces = (int)final_score_bbox_vec.size();
        std::vector<FaceRect> facesInfo;
        facesInfo.reserve(final_score_bbox_vec.size());
        for (int fi = 0; fi < num_faces; fi++) {
            const std::pair<float, NormalizedBBox>& pp = final_score_bbox_vec[fi];
            FaceRect r;
            r.score = pp.first;
            r.x = int(pp.second.xmin);
            r.y = int(pp.second.ymin);
            r.w = int(pp.second.xmax - pp.second.xmin);
            r.h = int(pp.second.ymax - pp.second.ymin);
            //copy landmark data
            for (int i = 0; i < 10; ++i) {
                r.lm[i] = int(pp.second.lm[i]);
            }
            facesInfo.emplace_back(r);
        }
        return facesInfo;
    }
}