1142 lines
43 KiB
C++
1142 lines
43 KiB
C++
|
|
#include "CNNFaceDetector.h"
#include "Utility.h"

#include <stdexcept>
|
||
|
|
void* AllocMemory(size_t size)
|
||
|
|
{
|
||
|
|
char* ptr, * ptr0;
|
||
|
|
ptr0 = (char*)malloc(
|
||
|
|
(size_t)(size + _MALLOC_ALIGN * ((size >= 4096) + 1L) + sizeof(char*)));
|
||
|
|
|
||
|
|
if (!ptr0)
|
||
|
|
return 0;
|
||
|
|
|
||
|
|
// align the pointer
|
||
|
|
ptr = (char*)(((size_t)(ptr0 + sizeof(char*) + 1) + _MALLOC_ALIGN - 1) & ~(size_t)(_MALLOC_ALIGN - 1));
|
||
|
|
*(char**)(ptr - sizeof(char*)) = ptr0;
|
||
|
|
|
||
|
|
return ptr;
|
||
|
|
}
|
||
|
|
void FreeMemory_(void* ptr)
|
||
|
|
{
|
||
|
|
try {
|
||
|
|
if (ptr)
|
||
|
|
{
|
||
|
|
if (((size_t)ptr & (_MALLOC_ALIGN - 1)) != 0)
|
||
|
|
return;
|
||
|
|
free(*((char**)ptr - 1));
|
||
|
|
}
|
||
|
|
}
|
||
|
|
catch (std::exception& e) {
|
||
|
|
|
||
|
|
std::cout << "ANSCENTER::FreeMemory:" << e.what();
|
||
|
|
}
|
||
|
|
|
||
|
|
}
|
||
|
|
namespace ANSCENTER {
|
||
|
|
bool SortScoreBBoxPairDescend(const std::pair<float, NormalizedBBox>& pair1, const std::pair<float, NormalizedBBox>& pair2)
|
||
|
|
{
|
||
|
|
return pair1.first > pair2.first;
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Placeholder implementation: neither argument is read and success is always
/// reported.
///
/// @param fp16                 Ignored.
/// @param optimizedModelFolder Ignored; left unmodified.
/// @return Always true.
bool ANSCNNFD::OptimizeModel(bool fp16, std::string& optimizedModelFolder) {
    static_cast<void>(fp16);
    static_cast<void>(optimizedModelFolder);
    return true;
}
|
||
|
|
bool ANSCNNFD::LoadModel(const std::string& modelZipFilePath, const std::string& modelZipPassword) {
|
||
|
|
try {
|
||
|
|
return true;
|
||
|
|
}
|
||
|
|
catch (std::exception& e) {
|
||
|
|
this->_logger.LogFatal("ANSCNNFD::LoadModel", e.what(), __FILE__, __LINE__);
|
||
|
|
return false;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
bool ANSCNNFD::Initialize(std::string licenseKey, ModelConfig modelConfig, const std::string& modelZipFilePath, const std::string& modelZipPassword, std::string& labelMap) {
|
||
|
|
bool result = true;
|
||
|
|
_licenseValid = true;
|
||
|
|
if (!result) return false;
|
||
|
|
try {
|
||
|
|
_modelConfig = modelConfig;
|
||
|
|
_modelConfig.modelType = ModelType::FACEDETECT;
|
||
|
|
_modelConfig.detectionType = DetectionType::FACEDETECTOR;
|
||
|
|
InitParameters();
|
||
|
|
labelMap = "Face";
|
||
|
|
_isInitialized = true;
|
||
|
|
return true;
|
||
|
|
|
||
|
|
}
|
||
|
|
catch (std::exception& e) {
|
||
|
|
this->_logger.LogFatal("ANSCNNFD::Initialize", e.what(), __FILE__, __LINE__);
|
||
|
|
return false;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
/// Detects faces in `input` and returns one Object per detection whose
/// confidence reaches `_modelConfig.detectionScoreThreshold`.
///
/// Images with either dimension <= 300 px are treated as already-cropped
/// faces: they are padded by 200 px of replicated border on every side before
/// detection, and the reported box origin is shifted back by that padding
/// (after clamping it to the padded area's inner edge). Each Object's `mask`
/// is a copy of the detected region taken from the (possibly padded) frame,
/// clipped to the frame bounds.
///
/// @param input 8-bit, 3-channel image (the network reads 3 bytes per pixel).
/// @return Detections; empty when the licence/model is invalid, the input is
///         unusable, or an exception was caught (and logged).
std::vector<Object> ANSCNNFD::RunInference(const cv::Mat& input) {
    std::vector<Object> output;
    if (!_licenseValid) {
        this->_logger.LogError("ANSCNNFD::RunInference", "Invalid license", __FILE__, __LINE__);
        return output;
    }
    if (!_isInitialized) {
        this->_logger.LogError("ANSCNNFD::RunInference", "Invalid model", __FILE__, __LINE__);
        return output;
    }
    try {
        if (input.empty()) {
            this->_logger.LogError("ANSCNNFD::RunInference", "Input image is empty", __FILE__, __LINE__);
            return output;
        }
        // The network indexes the pixel buffer as 3 bytes per pixel; any other
        // layout would make it read outside the image rows.
        if (input.type() != CV_8UC3) {
            this->_logger.LogError("ANSCNNFD::RunInference", "Input image must be an 8-bit 3-channel image", __FILE__, __LINE__);
            return output;
        }

        constexpr int kCroppedMaxSide = 300; // at or below this, assume a cropped face image
        constexpr int kCropPadding = 200;    // replicated border added around cropped faces

        const bool croppedFace = (input.size[0] <= kCroppedMaxSide) || (input.size[1] <= kCroppedMaxSide);

        cv::Mat frame;
        if (croppedFace)
            cv::copyMakeBorder(input, frame, kCropPadding, kCropPadding, kCropPadding, kCropPadding, cv::BORDER_REPLICATE);
        else
            frame = input.clone();

        // Owned by the vector so it is released on every exit path, including exceptions.
        std::vector<unsigned char> resultBuffer(DETECT_BUFFER_SIZE);

        int* pResults = FaceDetectCNN(resultBuffer.data(), static_cast<unsigned char*>(frame.ptr(0)), frame.cols, frame.rows, static_cast<int>(frame.step));
        const int faceCount = pResults ? *pResults : 0;
        const cv::Rect frameBounds(0, 0, frame.cols, frame.rows);

        for (int i = 0; i < faceCount; i++)
        {
            // Each record is 16 shorts: score*100, x, y, w, h, then landmark values.
            const short* p = reinterpret_cast<const short*>(pResults + 1) + 16 * i;
            const float confidence = static_cast<float>(p[0]) / 100;
            if (!(confidence >= _modelConfig.detectionScoreThreshold))
                continue;

            int x = p[1];
            int y = p[2];
            const int w = p[3];
            const int h = p[4];

            Object result;
            result.classId = 0;
            result.className = "Face";
            result.confidence = confidence;

            result.box.x = x;
            result.box.y = y;
            if (croppedFace) {
                // Keep the origin inside the un-padded image, then undo the padding offset.
                if (x <= kCropPadding) x = kCropPadding;
                if (y <= kCropPadding) y = kCropPadding;
                result.box.x = x - kCropPadding;
                result.box.y = y - kCropPadding;
            }
            result.box.width = w;
            result.box.height = h;

            // Clip the crop to the frame so a box that crosses the border cannot
            // make the ROI extraction throw and abort the remaining detections.
            const cv::Rect facePos = cv::Rect(cv::Point(x, y), cv::Point(x + w, y + h)) & frameBounds;
            result.mask = frame(facePos).clone();
            output.push_back(result);
        }
        return output;
    }
    catch (const std::exception& e) {
        this->_logger.LogFatal("ANSCNNFD::RunInference", e.what(), __FILE__, __LINE__);
        return output;
    }
}
|
||
|
|
|
||
|
|
/// Logs that the detector is being released. Any std::exception raised by the
/// logger is reported on stdout instead of escaping the destructor.
ANSCNNFD::~ANSCNNFD() {
    const char* const where = "ANSCNNFD::~ANSCNNFD()";
    try {
        this->_logger.LogInfo(where, "Release ANSCNNFD ", __FILE__, __LINE__);
    }
    catch (const std::exception& e) {
        std::cout << where << e.what() << std::endl;
    }
}
|
||
|
|
bool ANSCNNFD::Destroy() {
|
||
|
|
try {
|
||
|
|
this->_logger.LogInfo("ANSCNNFD::Destroy()", "Release ANSCNNFD ", __FILE__, __LINE__);
|
||
|
|
return true;
|
||
|
|
}
|
||
|
|
catch (std::exception& e) {
|
||
|
|
std::cout << "ANSCNNFD::Destroy()" << e.what() << std::endl;
|
||
|
|
return false;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
// Private
|
||
|
|
int* ANSCNNFD::FaceDetectCNN(unsigned char* result_buffer, unsigned char* rgb_image_data, int width, int height, int step) //input image, it must be BGR (three-channel) image!
|
||
|
|
{
|
||
|
|
try {
|
||
|
|
if (!result_buffer)
|
||
|
|
{
|
||
|
|
this->_logger.LogError("ANSCNNFD::FaceDetectCNN", "Null buffer memory", __FILE__, __LINE__);
|
||
|
|
return nullptr;
|
||
|
|
}
|
||
|
|
//clear memory
|
||
|
|
result_buffer[0] = 0;
|
||
|
|
result_buffer[1] = 0;
|
||
|
|
result_buffer[2] = 0;
|
||
|
|
result_buffer[3] = 0;
|
||
|
|
|
||
|
|
std::vector<FaceRect> faces = ObjectDetectCNN(rgb_image_data, width, height, step);
|
||
|
|
|
||
|
|
int num_faces = static_cast<int>(faces.size());
|
||
|
|
num_faces = MIN(num_faces, 1024); //1024 = 0x9000 / (16 * 2 + 4)
|
||
|
|
|
||
|
|
int* pCount = reinterpret_cast<int*>(result_buffer);
|
||
|
|
pCount[0] = num_faces;
|
||
|
|
|
||
|
|
for (int i = 0; i < num_faces; i++)
|
||
|
|
{
|
||
|
|
//copy data
|
||
|
|
short* p = reinterpret_cast<short*>(result_buffer + 4) + 16 * size_t(i);
|
||
|
|
p[0] = static_cast<short>(faces[i].score * 100);
|
||
|
|
p[1] = static_cast<short>(faces[i].x);
|
||
|
|
p[2] = static_cast<short>(faces[i].y);
|
||
|
|
p[3] = static_cast<short>(faces[i].w);
|
||
|
|
p[4] = static_cast<short>(faces[i].h);
|
||
|
|
//copy landmarks
|
||
|
|
for (int lmidx = 0; lmidx < 10; lmidx++)
|
||
|
|
{
|
||
|
|
p[5 + lmidx] = static_cast<short>(faces[i].lm[lmidx]);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
return pCount;
|
||
|
|
}
|
||
|
|
catch (std::exception& e) {
|
||
|
|
this->_logger.LogFatal("ANSCNNFD::FaceDetectCNN", e.what(), __FILE__, __LINE__);
|
||
|
|
return nullptr;
|
||
|
|
}
|
||
|
|
|
||
|
|
}
|
||
|
|
void ANSCNNFD::InitParameters() {
|
||
|
|
for (int i = 0; i < NUM_CONV_LAYER; i++)
|
||
|
|
g_pFilters[i] = param_pConvInfo[i];
|
||
|
|
_paramInitialized = true;
|
||
|
|
}
|
||
|
|
std::vector<FaceRect> ANSCNNFD::ObjectDetectCNN(const unsigned char* rgbImageData, int width, int height, int step) {
|
||
|
|
try {
|
||
|
|
TIME_START;
|
||
|
|
if (!_paramInitialized)
|
||
|
|
{
|
||
|
|
InitParameters();
|
||
|
|
}
|
||
|
|
TIME_END("init");
|
||
|
|
|
||
|
|
|
||
|
|
TIME_START;
|
||
|
|
auto fx = SetDataFrom3x3S2P1To1x1S1P0FromImage(rgbImageData, width, height, 3, step);
|
||
|
|
TIME_END("convert data");
|
||
|
|
|
||
|
|
/***************CONV0*********************/
|
||
|
|
TIME_START;
|
||
|
|
fx = Convolution(fx, g_pFilters[0]);
|
||
|
|
TIME_END("conv_head");
|
||
|
|
|
||
|
|
TIME_START;
|
||
|
|
fx = ConvolutionDP(fx, g_pFilters[1], g_pFilters[2]);
|
||
|
|
TIME_END("conv0");
|
||
|
|
|
||
|
|
TIME_START;
|
||
|
|
fx = MaxPooling2x2S2(fx);
|
||
|
|
TIME_END("pool0");
|
||
|
|
|
||
|
|
/***************CONV1*********************/
|
||
|
|
TIME_START;
|
||
|
|
fx = Convolution4LayerUnit(fx, g_pFilters[3], g_pFilters[4], g_pFilters[5], g_pFilters[6]);
|
||
|
|
TIME_END("conv1");
|
||
|
|
|
||
|
|
/***************CONV2*********************/
|
||
|
|
TIME_START;
|
||
|
|
fx = Convolution4LayerUnit(fx, g_pFilters[7], g_pFilters[8], g_pFilters[9], g_pFilters[10]);
|
||
|
|
TIME_END("conv2");
|
||
|
|
|
||
|
|
/***************CONV3*********************/
|
||
|
|
TIME_START;
|
||
|
|
fx = MaxPooling2x2S2(fx);
|
||
|
|
TIME_END("pool3");
|
||
|
|
|
||
|
|
TIME_START;
|
||
|
|
auto fb1 = Convolution4LayerUnit(fx, g_pFilters[11], g_pFilters[12], g_pFilters[13], g_pFilters[14]);
|
||
|
|
TIME_END("conv3");
|
||
|
|
|
||
|
|
/***************CONV4*********************/
|
||
|
|
TIME_START;
|
||
|
|
fx = MaxPooling2x2S2(fb1);
|
||
|
|
TIME_END("pool4");
|
||
|
|
|
||
|
|
TIME_START;
|
||
|
|
auto fb2 = Convolution4LayerUnit(fx, g_pFilters[15], g_pFilters[16], g_pFilters[17], g_pFilters[18]);
|
||
|
|
TIME_END("conv4");
|
||
|
|
|
||
|
|
/***************CONV5*********************/
|
||
|
|
TIME_START;
|
||
|
|
fx = MaxPooling2x2S2(fb2);
|
||
|
|
TIME_END("pool5");
|
||
|
|
|
||
|
|
TIME_START;
|
||
|
|
auto fb3 = Convolution4LayerUnit(fx, g_pFilters[19], g_pFilters[20], g_pFilters[21], g_pFilters[22]);
|
||
|
|
TIME_END("conv5");
|
||
|
|
|
||
|
|
CDataBlob<float> pred_reg[3], pred_cls[3], pred_kps[3], pred_obj[3];
|
||
|
|
/***************branch5*********************/
|
||
|
|
TIME_START;
|
||
|
|
fb3 = ConvolutionDP(fb3, g_pFilters[27], g_pFilters[28]);
|
||
|
|
pred_cls[2] = ConvolutionDP(fb3, g_pFilters[33], g_pFilters[34], false);
|
||
|
|
pred_reg[2] = ConvolutionDP(fb3, g_pFilters[39], g_pFilters[40], false);
|
||
|
|
pred_kps[2] = ConvolutionDP(fb3, g_pFilters[51], g_pFilters[52], false);
|
||
|
|
pred_obj[2] = ConvolutionDP(fb3, g_pFilters[45], g_pFilters[46], false);
|
||
|
|
TIME_END("branch5");
|
||
|
|
|
||
|
|
/*****************add5*********************/
|
||
|
|
TIME_START;
|
||
|
|
fb2 = ElementAdd(UpsampleX2(fb3), fb2);
|
||
|
|
TIME_END("add5");
|
||
|
|
|
||
|
|
/*****************add6*********************/
|
||
|
|
TIME_START;
|
||
|
|
fb2 = ConvolutionDP(fb2, g_pFilters[25], g_pFilters[26]);
|
||
|
|
pred_cls[1] = ConvolutionDP(fb2, g_pFilters[31], g_pFilters[32], false);
|
||
|
|
pred_reg[1] = ConvolutionDP(fb2, g_pFilters[37], g_pFilters[38], false);
|
||
|
|
pred_kps[1] = ConvolutionDP(fb2, g_pFilters[49], g_pFilters[50], false);
|
||
|
|
pred_obj[1] = ConvolutionDP(fb2, g_pFilters[43], g_pFilters[44], false);
|
||
|
|
TIME_END("branch4");
|
||
|
|
|
||
|
|
/*****************add4*********************/
|
||
|
|
TIME_START;
|
||
|
|
fb1 = ElementAdd(UpsampleX2(fb2), fb1);
|
||
|
|
TIME_END("add4");
|
||
|
|
|
||
|
|
/***************branch3*********************/
|
||
|
|
TIME_START;
|
||
|
|
fb1 = ConvolutionDP(fb1, g_pFilters[23], g_pFilters[24]);
|
||
|
|
pred_cls[0] = ConvolutionDP(fb1, g_pFilters[29], g_pFilters[30], false);
|
||
|
|
pred_reg[0] = ConvolutionDP(fb1, g_pFilters[35], g_pFilters[36], false);
|
||
|
|
pred_kps[0] = ConvolutionDP(fb1, g_pFilters[47], g_pFilters[48], false);
|
||
|
|
pred_obj[0] = ConvolutionDP(fb1, g_pFilters[41], g_pFilters[42], false);
|
||
|
|
TIME_END("branch3");
|
||
|
|
|
||
|
|
/***************PRIORBOX*********************/
|
||
|
|
TIME_START;
|
||
|
|
auto prior3 = MeshGrid(fb1.cols, fb1.rows, 8);
|
||
|
|
auto prior4 = MeshGrid(fb2.cols, fb2.rows, 16);
|
||
|
|
auto prior5 = MeshGrid(fb3.cols, fb3.rows, 32);
|
||
|
|
TIME_END("prior");
|
||
|
|
/***************PRIORBOX*********************/
|
||
|
|
|
||
|
|
TIME_START;
|
||
|
|
BboxDecode(pred_reg[0], prior3, 8);
|
||
|
|
BboxDecode(pred_reg[1], prior4, 16);
|
||
|
|
BboxDecode(pred_reg[2], prior5, 32);
|
||
|
|
|
||
|
|
KPSDecode(pred_kps[0], prior3, 8);
|
||
|
|
KPSDecode(pred_kps[1], prior4, 16);
|
||
|
|
KPSDecode(pred_kps[2], prior5, 32);
|
||
|
|
|
||
|
|
auto cls = Concat3(Blob2Vector(pred_cls[0]), Blob2Vector(pred_cls[1]), Blob2Vector(pred_cls[2]));
|
||
|
|
auto reg = Concat3(Blob2Vector(pred_reg[0]), Blob2Vector(pred_reg[1]), Blob2Vector(pred_reg[2]));
|
||
|
|
auto kps = Concat3(Blob2Vector(pred_kps[0]), Blob2Vector(pred_kps[1]), Blob2Vector(pred_kps[2]));
|
||
|
|
auto obj = Concat3(Blob2Vector(pred_obj[0]), Blob2Vector(pred_obj[1]), Blob2Vector(pred_obj[2]));
|
||
|
|
|
||
|
|
Sigmoid(cls);
|
||
|
|
Sigmoid(obj);
|
||
|
|
TIME_END("decode")
|
||
|
|
|
||
|
|
TIME_START;
|
||
|
|
std::vector<FaceRect> facesInfo = DetectionOutput(cls, reg, kps, obj, 0.45f, 0.2f, 1000, 512);
|
||
|
|
TIME_END("detection output")
|
||
|
|
return facesInfo;
|
||
|
|
}
|
||
|
|
catch (std::exception& e) {
|
||
|
|
std::vector<FaceRect> facesInfo;
|
||
|
|
facesInfo.clear();
|
||
|
|
this->_logger.LogFatal("ANSCNNFD::ObjectDetectCNN", e.what(), __FILE__, __LINE__);
|
||
|
|
return facesInfo;
|
||
|
|
}
|
||
|
|
|
||
|
|
}
|
||
|
|
/// Repacks an 8-bit 3-channel image so that a 3x3 / stride-2 / pad-1
/// convolution can be evaluated as a 1x1 convolution.
///
/// The output has half the (padDivisor-rounded) image size and 32 channels.
/// For every output pixel (r, c) the 3x3 neighbourhood centred on source pixel
/// (2r, 2c) is written to channels 0..26 as 9 taps x 3 bytes. Taps that fall
/// outside the image are skipped, so those channels (and channels 27..31) keep
/// whatever value the CDataBlob constructor left there
/// (presumably zero - TODO confirm against CDataBlob).
///
/// @param inputData     First byte of the image.
/// @param imgWidth      Image width in pixels.
/// @param imgHeight     Image height in pixels.
/// @param imgChannels   Must be 3.
/// @param imgWidthStep  Bytes per image row.
/// @param padDivisor    Must be 32.
/// @return The repacked blob.
/// @throws std::runtime_error if `imgChannels` != 3 or `padDivisor` != 32
///         (the error is also logged). Earlier revisions called exit(1).
CDataBlob<float> ANSCNNFD::SetDataFrom3x3S2P1To1x1S1P0FromImage(const unsigned char* inputData, int imgWidth, int imgHeight, int imgChannels, int imgWidthStep, int padDivisor) {
    if (imgChannels != 3) {
        this->_logger.LogError("ANSCNNFD::SetDataFrom3x3S2P1To1x1S1P0FromImage", "The input image must be a 3-channel RGB image", __FILE__, __LINE__);
        throw std::runtime_error("ANSCNNFD::SetDataFrom3x3S2P1To1x1S1P0FromImage: The input image must be a 3-channel RGB image");
    }
    if (padDivisor != 32) {
        this->_logger.LogError("ANSCNNFD::SetDataFrom3x3S2P1To1x1S1P0FromImage", "This version need pad of 32", __FILE__, __LINE__);
        throw std::runtime_error("ANSCNNFD::SetDataFrom3x3S2P1To1x1S1P0FromImage: This version need pad of 32");
    }

    // Round each dimension up to a multiple of padDivisor, then halve (stride 2).
    const int rows = ((imgHeight - 1) / padDivisor + 1) * padDivisor / 2;
    const int cols = ((imgWidth - 1) / padDivisor + 1) * padDivisor / 2;
    const int channels = 32;
    CDataBlob<float> outBlob(rows, cols, channels);

#if defined(_OPENMP)
#pragma omp parallel for
#endif
    for (int r = 0; r < rows; r++) {
        for (int c = 0; c < cols; c++) {
            float* pData = outBlob.ptr(r, c);
            for (int fy = -1; fy <= 1; fy++) {
                const int srcy = r * 2 + fy;
                if (srcy < 0 || srcy >= imgHeight) // tap row lies outside the image
                    continue;

                const unsigned char* pRow = inputData + size_t(imgWidthStep) * srcy;
                for (int fx = -1; fx <= 1; fx++) {
                    const int srcx = c * 2 + fx;
                    if (srcx < 0 || srcx >= imgWidth) // tap column lies outside the image
                        continue;

                    const unsigned char* pImgData = pRow + imgChannels * srcx;

                    // Tap index 0..8 in row-major order; each tap owns 3 consecutive channels.
                    const int tap = (fy + 1) * 3 + fx + 1;
                    float* pDst = pData + tap * imgChannels;
                    pDst[0] = pImgData[0];
                    pDst[1] = pImgData[1];
                    pDst[2] = pImgData[2];
                }
            }
        }
    }
    return outBlob;
}
|
||
|
|
|
||
|
|
/// Returns the sum of p1[i] * p2[i] for i in [0, num).
///
/// The SIMD variants use aligned loads and consume 16 (AVX-512), 8 (AVX2) or
/// 4 (NEON) floats per iteration, so in those builds the buffers must be
/// readable up to the next multiple of the lane width; the AVX variants also
/// require suitably aligned pointers (512-bit for AVX-512).
inline float dotProduct(const float* p1, const float* p2, int num)
{
    float total = 0.f;

#if defined(_ENABLE_AVX512)
    __m512 acc16 = _mm512_setzero_ps();
    for (int i = 0; i < num; i += 16)
    {
        const __m512 lhs = _mm512_load_ps(p1 + i);
        const __m512 rhs = _mm512_load_ps(p2 + i);
        acc16 = _mm512_add_ps(acc16, _mm512_mul_ps(lhs, rhs));
    }
    total = _mm512_reduce_add_ps(acc16);
#elif defined(_ENABLE_AVX2)
    __m256 acc8 = _mm256_setzero_ps();
    for (int i = 0; i < num; i += 8)
    {
        const __m256 lhs = _mm256_load_ps(p1 + i);
        const __m256 rhs = _mm256_load_ps(p2 + i);
        acc8 = _mm256_add_ps(acc8, _mm256_mul_ps(lhs, rhs));
    }
    // Two horizontal adds leave each 128-bit half's sum in its first lane.
    acc8 = _mm256_hadd_ps(acc8, acc8);
    acc8 = _mm256_hadd_ps(acc8, acc8);
    total = ((float*)&acc8)[0] + ((float*)&acc8)[4];
#elif defined(_ENABLE_NEON)
    float32x4_t acc4 = vdupq_n_f32(0);
    for (int i = 0; i < num; i += 4)
    {
        const float32x4_t lhs = vld1q_f32(p1 + i);
        const float32x4_t rhs = vld1q_f32(p2 + i);
        acc4 = vaddq_f32(acc4, vmulq_f32(lhs, rhs));
    }
    total += vgetq_lane_f32(acc4, 0);
    total += vgetq_lane_f32(acc4, 1);
    total += vgetq_lane_f32(acc4, 2);
    total += vgetq_lane_f32(acc4, 3);
#else
    int idx = 0;
    while (idx < num)
    {
        total += (p1[idx] * p2[idx]);
        ++idx;
    }
#endif

    return total;
}
|
||
|
|
/// Multiply-accumulate: p3[i] += p1[i] * p2[i] for i in [0, num).
///
/// The SIMD variants process 16 (AVX-512), 8 (AVX2) or 4 (NEON) floats per
/// iteration; the AVX variants use aligned loads and stores.
///
/// @return Always true.
inline bool vecMulAdd(const float* p1, const float* p2, float* p3, int num)
{
#if defined(_ENABLE_AVX512)
    for (int i = 0; i < num; i += 16)
    {
        const __m512 lhs = _mm512_load_ps(p1 + i);
        const __m512 rhs = _mm512_load_ps(p2 + i);
        __m512 acc = _mm512_load_ps(p3 + i);
        acc = _mm512_add_ps(acc, _mm512_mul_ps(lhs, rhs));
        _mm512_store_ps(p3 + i, acc);
    }
#elif defined(_ENABLE_AVX2)
    for (int i = 0; i < num; i += 8)
    {
        const __m256 lhs = _mm256_load_ps(p1 + i);
        const __m256 rhs = _mm256_load_ps(p2 + i);
        __m256 acc = _mm256_load_ps(p3 + i);
        acc = _mm256_add_ps(acc, _mm256_mul_ps(lhs, rhs));
        _mm256_store_ps(p3 + i, acc);
    }
#elif defined(_ENABLE_NEON)
    for (int i = 0; i < num; i += 4)
    {
        const float32x4_t lhs = vld1q_f32(p1 + i);
        const float32x4_t rhs = vld1q_f32(p2 + i);
        float32x4_t acc = vld1q_f32(p3 + i);
        acc = vaddq_f32(acc, vmulq_f32(lhs, rhs));
        vst1q_f32(p3 + i, acc);
    }
#else
    int idx = 0;
    while (idx < num)
    {
        p3[idx] += (p1[idx] * p2[idx]);
        ++idx;
    }
#endif

    return true;
}
|
||
|
|
/// In-place add: p2[i] += p1[i] for i in [0, num).
///
/// The SIMD variants process 16 (AVX-512), 8 (AVX2) or 4 (NEON) floats per
/// iteration; the AVX variants use aligned loads and stores.
///
/// @return Always true.
inline bool vecAdd(const float* p1, float* p2, int num)
{
#if defined(_ENABLE_AVX512)
    for (int i = 0; i < num; i += 16)
    {
        const __m512 src = _mm512_load_ps(p1 + i);
        __m512 dst = _mm512_load_ps(p2 + i);
        dst = _mm512_add_ps(src, dst);
        _mm512_store_ps(p2 + i, dst);
    }
#elif defined(_ENABLE_AVX2)
    for (int i = 0; i < num; i += 8)
    {
        const __m256 src = _mm256_load_ps(p1 + i);
        __m256 dst = _mm256_load_ps(p2 + i);
        dst = _mm256_add_ps(src, dst);
        _mm256_store_ps(p2 + i, dst);
    }
#elif defined(_ENABLE_NEON)
    for (int i = 0; i < num; i += 4)
    {
        const float32x4_t src = vld1q_f32(p1 + i);
        const float32x4_t dst = vld1q_f32(p2 + i);
        const float32x4_t sum = vaddq_f32(src, dst);
        vst1q_f32(p2 + i, sum);
    }
#else
    int idx = 0;
    while (idx < num)
    {
        p2[idx] += p1[idx];
        ++idx;
    }
#endif
    return true;
}
|
||
|
|
/// Element-wise add into a separate destination: p3[i] = p1[i] + p2[i] for
/// i in [0, num).
///
/// The SIMD variants process 16 (AVX-512), 8 (AVX2) or 4 (NEON) floats per
/// iteration; the AVX variants use aligned loads and stores.
///
/// @return Always true.
inline bool vecAdd(const float* p1, const float* p2, float* p3, int num)
{
#if defined(_ENABLE_AVX512)
    for (int i = 0; i < num; i += 16)
    {
        const __m512 lhs = _mm512_load_ps(p1 + i);
        const __m512 rhs = _mm512_load_ps(p2 + i);
        _mm512_store_ps(p3 + i, _mm512_add_ps(lhs, rhs));
    }
#elif defined(_ENABLE_AVX2)
    for (int i = 0; i < num; i += 8)
    {
        const __m256 lhs = _mm256_load_ps(p1 + i);
        const __m256 rhs = _mm256_load_ps(p2 + i);
        _mm256_store_ps(p3 + i, _mm256_add_ps(lhs, rhs));
    }
#elif defined(_ENABLE_NEON)
    for (int i = 0; i < num; i += 4)
    {
        const float32x4_t lhs = vld1q_f32(p1 + i);
        const float32x4_t rhs = vld1q_f32(p2 + i);
        vst1q_f32(p3 + i, vaddq_f32(lhs, rhs));
    }
#else
    int idx = 0;
    while (idx < num)
    {
        p3[idx] = p1[idx] + p2[idx];
        ++idx;
    }
#endif
    return true;
}
|
||
|
|
bool ANSCNNFD::Convolution1x1PointWise(const CDataBlob<float>& inputData, const Filters<float>& filters, CDataBlob<float>& outputData)
|
||
|
|
{
|
||
|
|
#if defined(_OPENMP)
|
||
|
|
#pragma omp parallel for
|
||
|
|
#endif
|
||
|
|
for (int row = 0; row < outputData.rows; row++)
|
||
|
|
{
|
||
|
|
for (int col = 0; col < outputData.cols; col++)
|
||
|
|
{
|
||
|
|
float* pOut = outputData.ptr(row, col);
|
||
|
|
const float* pIn = inputData.ptr(row, col);
|
||
|
|
for (int ch = 0; ch < outputData.channels; ch++)
|
||
|
|
{
|
||
|
|
const float* pF = filters.weights.ptr(0, ch);
|
||
|
|
pOut[ch] = dotProduct(pIn, pF, inputData.channels);
|
||
|
|
pOut[ch] += filters.biases.data[ch];
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
return true;
|
||
|
|
}
|
||
|
|
bool ANSCNNFD::Convolution3x3DepthWise(const CDataBlob<float>& inputData, const Filters<float>& filters, CDataBlob<float>& outputData)
|
||
|
|
{
|
||
|
|
//set all elements in outputData to zeros
|
||
|
|
outputData.setZero();
|
||
|
|
#if defined(_OPENMP)
|
||
|
|
#pragma omp parallel for
|
||
|
|
#endif
|
||
|
|
for (int row = 0; row < outputData.rows; row++)
|
||
|
|
{
|
||
|
|
int srcy_start = row - 1;
|
||
|
|
int srcy_end = srcy_start + 3;
|
||
|
|
srcy_start = MAX(0, srcy_start);
|
||
|
|
srcy_end = MIN(srcy_end, inputData.rows);
|
||
|
|
|
||
|
|
for (int col = 0; col < outputData.cols; col++)
|
||
|
|
{
|
||
|
|
float* pOut = outputData.ptr(row, col);
|
||
|
|
int srcx_start = col - 1;
|
||
|
|
int srcx_end = srcx_start + 3;
|
||
|
|
srcx_start = MAX(0, srcx_start);
|
||
|
|
srcx_end = MIN(srcx_end, inputData.cols);
|
||
|
|
|
||
|
|
|
||
|
|
for (int r = srcy_start; r < srcy_end; r++)
|
||
|
|
for (int c = srcx_start; c < srcx_end; c++)
|
||
|
|
{
|
||
|
|
int filter_r = r - row + 1;
|
||
|
|
int filter_c = c - col + 1;
|
||
|
|
int filter_idx = filter_r * 3 + filter_c;
|
||
|
|
vecMulAdd(inputData.ptr(r, c), filters.weights.ptr(0, filter_idx), pOut, filters.num_filters);
|
||
|
|
}
|
||
|
|
vecAdd(filters.biases.ptr(0, 0), pOut, filters.num_filters);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
return true;
|
||
|
|
}
|
||
|
|
/// Applies ReLU in place: every element that is not greater than zero becomes
/// zero. The whole storage (rows * cols * channelStep bytes) is processed.
///
/// The scalar fallback clamps with a comparison so that it produces the same
/// values as the SIMD max-with-zero paths; the previous `x *= (x > 0)` form
/// turned -inf into NaN, propagated NaN and produced -0 for negative inputs.
///
/// @param inputoutputData Blob to rectify in place.
/// @return false (after logging) when the blob is empty, true otherwise.
bool ANSCNNFD::Relu(CDataBlob<float>& inputoutputData)
{
    if (inputoutputData.isEmpty())
    {
        this->_logger.LogError("ANSCNNFD::Relu", "The input data is empty", __FILE__, __LINE__);
        return false;
    }

    // Number of floats in the blob's storage.
    int len = inputoutputData.cols * inputoutputData.rows * inputoutputData.channelStep / sizeof(float);

#if defined(_ENABLE_AVX512)
    __m512 a, bzeros;
    bzeros = _mm512_setzero_ps(); //zeros
    for (int i = 0; i < len; i += 16)
    {
        a = _mm512_load_ps(inputoutputData.data + i);
        a = _mm512_max_ps(a, bzeros);
        _mm512_store_ps(inputoutputData.data + i, a);
    }
#elif defined(_ENABLE_AVX2)
    __m256 a, bzeros;
    bzeros = _mm256_setzero_ps(); //zeros
    for (int i = 0; i < len; i += 8)
    {
        a = _mm256_load_ps(inputoutputData.data + i);
        a = _mm256_max_ps(a, bzeros);
        _mm256_store_ps(inputoutputData.data + i, a);
    }
#else
    for (int i = 0; i < len; i++)
    {
        float& value = inputoutputData.data[i];
        if (!(value > 0))
            value = 0.f;
    }
#endif

    return true;
}
|
||
|
|
void ANSCNNFD::IntersectBBox(const NormalizedBBox& bbox1, const NormalizedBBox& bbox2,
|
||
|
|
NormalizedBBox* intersect_bbox)
|
||
|
|
{
|
||
|
|
if (bbox2.xmin > bbox1.xmax || bbox2.xmax < bbox1.xmin ||
|
||
|
|
bbox2.ymin > bbox1.ymax || bbox2.ymax < bbox1.ymin)
|
||
|
|
{
|
||
|
|
// Return [0, 0, 0, 0] if there is no intersection.
|
||
|
|
intersect_bbox->xmin = 0;
|
||
|
|
intersect_bbox->ymin = 0;
|
||
|
|
intersect_bbox->xmax = 0;
|
||
|
|
intersect_bbox->ymax = 0;
|
||
|
|
}
|
||
|
|
else
|
||
|
|
{
|
||
|
|
intersect_bbox->xmin = (std::max(bbox1.xmin, bbox2.xmin));
|
||
|
|
intersect_bbox->ymin = (std::max(bbox1.ymin, bbox2.ymin));
|
||
|
|
intersect_bbox->xmax = (std::min(bbox1.xmax, bbox2.xmax));
|
||
|
|
intersect_bbox->ymax = (std::min(bbox1.ymax, bbox2.ymax));
|
||
|
|
}
|
||
|
|
}
|
||
|
|
float ANSCNNFD::JaccardOverlap(const NormalizedBBox& bbox1, const NormalizedBBox& bbox2)
|
||
|
|
{
|
||
|
|
NormalizedBBox intersect_bbox;
|
||
|
|
IntersectBBox(bbox1, bbox2, &intersect_bbox);
|
||
|
|
float intersect_width, intersect_height;
|
||
|
|
intersect_width = intersect_bbox.xmax - intersect_bbox.xmin;
|
||
|
|
intersect_height = intersect_bbox.ymax - intersect_bbox.ymin;
|
||
|
|
|
||
|
|
if (intersect_width > 0 && intersect_height > 0)
|
||
|
|
{
|
||
|
|
float intersect_size = intersect_width * intersect_height;
|
||
|
|
float bsize1 = (bbox1.xmax - bbox1.xmin) * (bbox1.ymax - bbox1.ymin);
|
||
|
|
float bsize2 = (bbox2.xmax - bbox2.xmin) * (bbox2.ymax - bbox2.ymin);
|
||
|
|
return intersect_size / (bsize1 + bsize2 - intersect_size);
|
||
|
|
}
|
||
|
|
else
|
||
|
|
{
|
||
|
|
return 0.f;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
/// Nearest-neighbour 2x upsampling: every input pixel is copied to the 2x2
/// block of output pixels starting at (2r, 2c).
///
/// @param inputData Source blob; must not be empty.
/// @return A blob with twice the rows and columns and the same channel count.
/// @throws std::runtime_error if `inputData` is empty (the error is also
///         logged). Earlier revisions called exit(1).
CDataBlob<float> ANSCNNFD::UpsampleX2(const CDataBlob<float>& inputData) {
    if (inputData.isEmpty()) {
        this->_logger.LogError("ANSCNNFD::UpsampleX2", "The input data is empty", __FILE__, __LINE__);
        throw std::runtime_error("ANSCNNFD::UpsampleX2: The input data is empty");
    }

    CDataBlob<float> outData(inputData.rows * 2, inputData.cols * 2, inputData.channels);

    for (int r = 0; r < inputData.rows; r++) {
        const int outr = r * 2;
        for (int c = 0; c < inputData.cols; c++) {
            const int outc = c * 2;
            const float* pIn = inputData.ptr(r, c);

            // Resolve the four destination pixels once per source pixel.
            float* pTopLeft = outData.ptr(outr, outc);
            float* pTopRight = outData.ptr(outr, outc + 1);
            float* pBottomLeft = outData.ptr(outr + 1, outc);
            float* pBottomRight = outData.ptr(outr + 1, outc + 1);

            for (int ch = 0; ch < inputData.channels; ++ch) {
                const float value = pIn[ch];
                pTopLeft[ch] = value;
                pTopRight[ch] = value;
                pBottomLeft[ch] = value;
                pBottomRight[ch] = value;
            }
        }
    }
    return outData;
}
|
||
|
|
|
||
|
|
/// Element-wise sum of two blobs of identical shape.
///
/// @param inputData1,inputData2 Blobs with equal rows, cols and channels.
/// @return A new blob holding inputData1 + inputData2.
/// @throws std::runtime_error if the shapes differ (the error is also logged).
///         Earlier revisions called exit(1).
CDataBlob<float> ANSCNNFD::ElementAdd(const CDataBlob<float>& inputData1, const CDataBlob<float>& inputData2) {
    const bool sameShape =
        inputData1.rows == inputData2.rows &&
        inputData1.cols == inputData2.cols &&
        inputData1.channels == inputData2.channels;
    if (!sameShape) {
        this->_logger.LogError("ANSCNNFD::ElementAdd", "The two input datas must be in the same shape.", __FILE__, __LINE__);
        throw std::runtime_error("ANSCNNFD::ElementAdd: The two input datas must be in the same shape.");
    }

    CDataBlob<float> outData(inputData1.rows, inputData1.cols, inputData1.channels);
    for (int r = 0; r < inputData1.rows; r++) {
        for (int c = 0; c < inputData1.cols; c++) {
            // Add the two pixels' channel vectors into the output pixel.
            vecAdd(inputData1.ptr(r, c), inputData2.ptr(r, c), outData.ptr(r, c), inputData1.channels);
        }
    }
    return outData;
}
|
||
|
|
/// Applies one convolution layer, dispatching on the filter kind:
/// point-wise filters go to Convolution1x1PointWise(), depth-wise filters to
/// Convolution3x3DepthWise(). Optionally applies ReLU to the result.
///
/// @param inputData Source blob.
/// @param filters   Layer parameters; `filters.channels` must equal
///                  `inputData.channels`, and exactly one of `is_pointwise` /
///                  `is_depthwise` must be set.
/// @param do_relu   When true, Relu() is applied to the output.
/// @return A blob with the input's rows/cols and `filters.num_filters` channels.
/// @throws std::runtime_error if an input is empty, the channel counts do not
///         match, or the filter kind is unsupported (each case is also
///         logged). Earlier revisions called exit(1).
CDataBlob<float> ANSCNNFD::Convolution(const CDataBlob<float>& inputData, const Filters<float>& filters, bool do_relu)
{
    if (inputData.isEmpty() || filters.weights.isEmpty() || filters.biases.isEmpty())
    {
        this->_logger.LogError("ANSCNNFD::Convolution", "The input data or filter data is empty.", __FILE__, __LINE__);
        throw std::runtime_error("ANSCNNFD::Convolution: The input data or filter data is empty.");
    }
    if (inputData.channels != filters.channels)
    {
        this->_logger.LogError("ANSCNNFD::Convolution", "The input data dimension cannot meet filters.", __FILE__, __LINE__);
        throw std::runtime_error("ANSCNNFD::Convolution: The input data dimension cannot meet filters.");
    }

    CDataBlob<float> outputData(inputData.rows, inputData.cols, filters.num_filters);

    if (filters.is_pointwise && !filters.is_depthwise)
    {
        Convolution1x1PointWise(inputData, filters, outputData);
    }
    else if (!filters.is_pointwise && filters.is_depthwise)
    {
        Convolution3x3DepthWise(inputData, filters, outputData);
    }
    else
    {
        this->_logger.LogError("ANSCNNFD::Convolution", "Unsupported filter type.", __FILE__, __LINE__);
        throw std::runtime_error("ANSCNNFD::Convolution: Unsupported filter type.");
    }

    if (do_relu)
        Relu(outputData);

    return outputData;
}
|
||
|
|
/// Two chained Convolution() calls: `filtersP` first (never followed by ReLU),
/// then `filtersD`, with ReLU on the second result only when `do_relu` is true.
///
/// @param inputData Source blob.
/// @param filtersP  Filters for the first convolution.
/// @param filtersD  Filters for the second convolution.
/// @param do_relu   Whether to rectify the final output.
/// @return Output of the second convolution.
CDataBlob<float> ANSCNNFD::ConvolutionDP(const CDataBlob<float>& inputData,
    const Filters<float>& filtersP, const Filters<float>& filtersD, bool do_relu)
{
    const CDataBlob<float> firstStage = Convolution(inputData, filtersP, false);
    return Convolution(firstStage, filtersD, do_relu);
}
|
||
|
|
|
||
|
|
/// Two chained ConvolutionDP() units (four convolutions in total). The first
/// unit always ends with ReLU; the second ends with ReLU only when `do_relu`
/// is true.
///
/// @param inputData            Source blob.
/// @param filtersP1,filtersD1  Filters for the first unit.
/// @param filtersP2,filtersD2  Filters for the second unit.
/// @param do_relu              Whether to rectify the final output.
/// @return Output of the second unit.
CDataBlob<float> ANSCNNFD::Convolution4LayerUnit(const CDataBlob<float>& inputData,
    const Filters<float>& filtersP1, const Filters<float>& filtersD1,
    const Filters<float>& filtersP2, const Filters<float>& filtersD2, bool do_relu)
{
    const CDataBlob<float> firstUnit = ConvolutionDP(inputData, filtersP1, filtersD1, true);
    return ConvolutionDP(firstUnit, filtersP2, filtersD2, do_relu);
}
|
||
|
|
|
||
|
|
|
||
|
|
//only 2X2 S2 is supported
|
||
|
|
// Max-pools `inputData` over 2x2 windows taken at stride 2, independently for
// every channel, and returns the pooled blob.
//
// The output has ceil((rows - 3) / 2) + 1 rows and ceil((cols - 3) / 2) + 1
// columns and the same channel count as the input. An empty input, or an
// input too small to yield at least one output row and column, is logged and
// terminates the process via exit(1).
//
// NOTE(review): the NEON / AVX512 / AVX2 paths advance 4 / 16 / 8 floats at a
// time and use aligned loads and stores, so they presumably rely on
// CDataBlob padding and aligning each pixel's channel storage -- confirm
// against the CDataBlob definition.
CDataBlob<float> ANSCNNFD::MaxPooling2x2S2(const CDataBlob<float>& inputData)
{
    if (inputData.isEmpty())
    {
        this->_logger.LogError("ANSCNNFD::MaxPooling2x2S2", "The input data is empty.", __FILE__, __LINE__);
        exit(1);
    }

    const int pooledRows = static_cast<int>(ceil((inputData.rows - 3.0f) / 2)) + 1;
    const int pooledCols = static_cast<int>(ceil((inputData.cols - 3.0f) / 2)) + 1;
    const int pooledChannels = inputData.channels;

    if (!(pooledRows >= 1 && pooledCols >= 1))
    {
        this->_logger.LogError("ANSCNNFD::MaxPooling2x2S2", "The size of the output is not correct.", __FILE__, __LINE__);
        exit(1);
    }

    CDataBlob<float> outputData(pooledRows, pooledCols, pooledChannels);
    outputData.setZero();

    // All window reads are expressed as element offsets from the start of the input buffer.
    float* srcBase = inputData.data;

    for (int row = 0; row < outputData.rows; row++)
    {
        for (int col = 0; col < outputData.cols; col++)
        {
            // Element offsets of the (at most four) input pixels covered by this window.
            size_t windowOffsets[4];
            int windowSize = 0;

            const int rowBegin = row * 2;
            const int colBegin = col * 2;
            const int rowEnd = MIN(rowBegin + 2, inputData.rows);
            const int colEnd = MIN(colBegin + 2, inputData.cols);

            for (int srcRow = rowBegin; srcRow < rowEnd; srcRow++)
            {
                for (int srcCol = colBegin; srcCol < colEnd; srcCol++)
                {
                    windowOffsets[windowSize++] = (size_t(srcRow) * inputData.cols + srcCol) * inputData.channelStep / sizeof(float);
                }
            }

            float* dst = outputData.ptr(row, col);

#if defined(_ENABLE_NEON)
            for (int ch = 0; ch < outputData.channels; ch += 4)
            {
                float32x4_t best = vld1q_f32(srcBase + ch + windowOffsets[0]);
                for (int k = 1; k < windowSize; k++)
                {
                    const float32x4_t candidate = vld1q_f32(srcBase + ch + windowOffsets[k]);
                    best = vmaxq_f32(best, candidate);
                }
                vst1q_f32(dst + ch, best);
            }
#elif defined(_ENABLE_AVX512)
            for (int ch = 0; ch < outputData.channels; ch += 16)
            {
                __m512 best = _mm512_load_ps((__m512 const*)(srcBase + ch + windowOffsets[0]));
                for (int k = 1; k < windowSize; k++)
                {
                    const __m512 candidate = _mm512_load_ps((__m512 const*)(srcBase + ch + windowOffsets[k]));
                    best = _mm512_max_ps(best, candidate);
                }
                _mm512_store_ps((__m512*)(dst + ch), best);
            }
#elif defined(_ENABLE_AVX2)
            for (int ch = 0; ch < outputData.channels; ch += 8)
            {
                __m256 best = _mm256_load_ps((float const*)(srcBase + ch + windowOffsets[0]));
                for (int k = 1; k < windowSize; k++)
                {
                    const __m256 candidate = _mm256_load_ps((float const*)(srcBase + ch + windowOffsets[k]));
                    best = _mm256_max_ps(best, candidate);
                }
                _mm256_store_ps(dst + ch, best);
            }
#else
            for (int ch = 0; ch < outputData.channels; ch++)
            {
                float best = srcBase[ch + windowOffsets[0]];
                for (int k = 1; k < windowSize; k++)
                {
                    best = MAX(best, srcBase[ch + windowOffsets[k]]);
                }
                dst[ch] = best;
            }
#endif
        }
    }
    return outputData;
}
|
||
|
|
|
||
|
|
// Builds a feature_height x feature_width grid with two channels per cell:
// channel 0 holds `col * stride + offset` and channel 1 holds
// `row * stride + offset`.
CDataBlob<float> ANSCNNFD::MeshGrid(int feature_width, int feature_height, int stride, float offset) {
    CDataBlob<float> grid(feature_height, feature_width, 2);
    for (int row = 0; row < feature_height; ++row) {
        // The second channel is constant along a row, so compute it once.
        const float rowCoord = static_cast<float>(row * stride) + offset;
        for (int col = 0; col < feature_width; ++col) {
            float* cell = grid.ptr(row, col);
            cell[0] = static_cast<float>(col * stride) + offset;
            cell[1] = rowCoord;
        }
    }
    return grid;
}
|
||
|
|
|
||
|
|
// Decodes box predictions in place against the per-cell priors.
//
// On entry each cell of `bbox_pred` holds four values (dx, dy, log-w, log-h).
// The centre is `d * stride + prior`, the size is `exp(log) * stride`, and the
// cell is overwritten with the corner form (xmin, ymin, xmax, ymax).
//
// @param bbox_pred  predictions, modified in place; must have 4 channels
// @param priors     grid of the same rows/cols; channels 0 and 1 are read as
//                   the x and y terms added to the centre
// @param stride     scale applied to the offsets and sizes
//
// If the grids differ in size or `bbox_pred` does not have 4 channels, the
// problem is logged and `bbox_pred` is left untouched. Previously the loop
// still ran after logging, which read past the smaller blob or reinterpreted
// elements that are not box fields.
void ANSCNNFD::BboxDecode(CDataBlob<float>& bbox_pred, const CDataBlob<float>& priors, int stride) {
    if (bbox_pred.cols != priors.cols || bbox_pred.rows != priors.rows) {
        this->_logger.LogError("ANSCNNFD::BboxDecode", "Mismatch between feature map and anchor size.", __FILE__, __LINE__);
        return;
    }
    if (bbox_pred.channels != 4) {
        this->_logger.LogError("ANSCNNFD::BboxDecode", "The bbox dim must be 4.", __FILE__, __LINE__);
        return;
    }

    const float fstride = static_cast<float>(stride);
    for (int r = 0; r < bbox_pred.rows; ++r) {
        for (int c = 0; c < bbox_pred.cols; ++c) {
            float* pb = bbox_pred.ptr(r, c);
            const float* pp = priors.ptr(r, c);

            // Centre and size must be computed before any element is overwritten.
            const float cx = pb[0] * fstride + pp[0];
            const float cy = pb[1] * fstride + pp[1];
            const float w = std::exp(pb[2]) * fstride;
            const float h = std::exp(pb[3]) * fstride;

            pb[0] = cx - w / 2.f;
            pb[1] = cy - h / 2.f;
            pb[2] = cx + w / 2.f;
            pb[3] = cy + h / 2.f;
        }
    }
}
|
||
|
|
|
||
|
|
// Decodes keypoint predictions in place: every (x, y) pair in a cell becomes
// `value * stride + prior`, using channel 0 of the matching prior cell for x
// and channel 1 for y.
//
// A grid-size mismatch between `kps_pred` and `priors`, or an odd channel
// count in `kps_pred`, is logged and terminates the process via exit(1).
void ANSCNNFD::KPSDecode(CDataBlob<float>& kps_pred, const CDataBlob<float>& priors, int stride) {
    const bool sameGrid = (kps_pred.cols == priors.cols) && (kps_pred.rows == priors.rows);
    if (!sameGrid) {
        this->_logger.LogError("ANSCNNFD::KPSDecode", "Mismatch between feature map and anchor size.", __FILE__, __LINE__);
        exit(1);
    }
    if ((kps_pred.channels & 1) != 0) {
        this->_logger.LogError("ANSCNNFD::KPSDecode", "The kps dim must be even.", __FILE__, __LINE__);
        exit(1);
    }

    const float scale = static_cast<float>(stride);
    const int pointCount = kps_pred.channels >> 1;

    for (int row = 0; row < kps_pred.rows; ++row) {
        for (int col = 0; col < kps_pred.cols; ++col) {
            float* points = kps_pred.ptr(row, col);
            const float* anchor = priors.ptr(row, col);
            for (int idx = 0; idx < pointCount; ++idx) {
                float* xy = points + 2 * idx;
                xy[0] = xy[0] * scale + anchor[0];
                xy[1] = xy[1] * scale + anchor[1];
            }
        }
    }
}
|
||
|
|
|
||
|
|
template<typename T>
|
||
|
|
CDataBlob<T> ANSCNNFD::Concat3(const CDataBlob<T>& inputData1, const CDataBlob<T>& inputData2, const CDataBlob<T>& inputData3)
|
||
|
|
{
|
||
|
|
if ((inputData1.isEmpty()) || (inputData2.isEmpty()) || (inputData3.isEmpty()))
|
||
|
|
{
|
||
|
|
this->_logger.LogError("ANSCNNFD::Concat3", "The input data is empty.", __FILE__, __LINE__);
|
||
|
|
exit(1);
|
||
|
|
}
|
||
|
|
|
||
|
|
if ((inputData1.cols != inputData2.cols) ||
|
||
|
|
(inputData1.rows != inputData2.rows) ||
|
||
|
|
(inputData1.cols != inputData3.cols) ||
|
||
|
|
(inputData1.rows != inputData3.rows))
|
||
|
|
{
|
||
|
|
this->_logger.LogError("ANSCNNFD::Concat3", "The three inputs must have the same size.", __FILE__, __LINE__);
|
||
|
|
exit(1);
|
||
|
|
}
|
||
|
|
int outputR = inputData1.rows;
|
||
|
|
int outputC = inputData1.cols;
|
||
|
|
int outputCH = inputData1.channels + inputData2.channels + inputData3.channels;
|
||
|
|
|
||
|
|
if (outputR < 1 || outputC < 1 || outputCH < 1)
|
||
|
|
{
|
||
|
|
this->_logger.LogError("ANSCNNFD::Concat3", "The size of the output is not correct.", __FILE__, __LINE__);
|
||
|
|
exit(1);
|
||
|
|
}
|
||
|
|
|
||
|
|
CDataBlob<T> outputData(outputR, outputC, outputCH);
|
||
|
|
|
||
|
|
for (int row = 0; row < outputData.rows; row++)
|
||
|
|
{
|
||
|
|
for (int col = 0; col < outputData.cols; col++)
|
||
|
|
{
|
||
|
|
T* pOut = outputData.ptr(row, col);
|
||
|
|
const T* pIn1 = inputData1.ptr(row, col);
|
||
|
|
const T* pIn2 = inputData2.ptr(row, col);
|
||
|
|
const T* pIn3 = inputData3.ptr(row, col);
|
||
|
|
|
||
|
|
memcpy(pOut, pIn1, sizeof(T) * inputData1.channels);
|
||
|
|
memcpy(pOut + inputData1.channels, pIn2, sizeof(T) * inputData2.channels);
|
||
|
|
memcpy(pOut + inputData1.channels + inputData2.channels, pIn3, sizeof(T) * inputData3.channels);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
return outputData;
|
||
|
|
}
|
||
|
|
template CDataBlob<float> ANSCNNFD::Concat3(const CDataBlob<float>& inputData1, const CDataBlob<float>& inputData2, const CDataBlob<float>& inputData3);
|
||
|
|
|
||
|
|
template<typename T>
|
||
|
|
CDataBlob<T> ANSCNNFD::Blob2Vector(const CDataBlob<T>& inputData)
|
||
|
|
{
|
||
|
|
if (inputData.isEmpty())
|
||
|
|
{
|
||
|
|
this->_logger.LogError("ANSCNNFD::Blob2Vector", "The input data is empty.", __FILE__, __LINE__);
|
||
|
|
exit(1);
|
||
|
|
}
|
||
|
|
|
||
|
|
CDataBlob<T> outputData(1, 1, inputData.cols * inputData.rows * inputData.channels);
|
||
|
|
|
||
|
|
int bytesOfAChannel = inputData.channels * sizeof(T);
|
||
|
|
T* pOut = outputData.ptr(0, 0);
|
||
|
|
for (int row = 0; row < inputData.rows; row++)
|
||
|
|
{
|
||
|
|
for (int col = 0; col < inputData.cols; col++)
|
||
|
|
{
|
||
|
|
const T* pIn = inputData.ptr(row, col);
|
||
|
|
memcpy(pOut, pIn, bytesOfAChannel);
|
||
|
|
pOut += inputData.channels;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
return outputData;
|
||
|
|
}
|
||
|
|
template CDataBlob<float> ANSCNNFD::Blob2Vector(const CDataBlob<float>& inputData);
|
||
|
|
|
||
|
|
// Applies the logistic function 1 / (1 + exp(-v)) in place to every channel
// value of every cell. Each value is first clamped to
// [-88.3762626647949, 88.3762626647949].
// NOTE(review): the clamp bound is presumably chosen so exp() stays finite in
// single precision -- confirm.
void ANSCNNFD::Sigmoid(CDataBlob<float>& inputData) {
    const float upperBound = 88.3762626647949f;
    const float lowerBound = -88.3762626647949f;

    for (int row = 0; row < inputData.rows; ++row) {
        for (int col = 0; col < inputData.cols; ++col) {
            float* cell = inputData.ptr(row, col);
            for (int ch = 0; ch < inputData.channels; ++ch) {
                const float clamped = std::max(std::min(cell[ch], upperBound), lowerBound);
                cell[ch] = static_cast<float>(1.f / (1.f + exp(-clamped)));
            }
        }
    }
}
|
||
|
|
|
||
|
|
std::vector<FaceRect> ANSCNNFD::DetectionOutput(const CDataBlob<float>& cls,
|
||
|
|
const CDataBlob<float>& reg,
|
||
|
|
const CDataBlob<float>& kps,
|
||
|
|
const CDataBlob<float>& obj,
|
||
|
|
float overlap_threshold,
|
||
|
|
float confidence_threshold,
|
||
|
|
int top_k,
|
||
|
|
int keep_top_k)
|
||
|
|
{
|
||
|
|
if (reg.isEmpty() || cls.isEmpty() || kps.isEmpty() || obj.isEmpty())//|| iou.isEmpty())
|
||
|
|
{
|
||
|
|
this->_logger.LogError("ANSCNNFD::DetectionOutput", "The input data is null.", __FILE__, __LINE__);
|
||
|
|
exit(1);
|
||
|
|
}
|
||
|
|
if (reg.cols != 1 || reg.rows != 1 || cls.cols != 1 || cls.rows != 1 || kps.cols != 1 || kps.rows != 1 || obj.cols != 1 || obj.rows != 1) {
|
||
|
|
this->_logger.LogError("ANSCNNFD::DetectionOutput", "Only support vector format.", __FILE__, __LINE__);
|
||
|
|
|
||
|
|
exit(1);
|
||
|
|
}
|
||
|
|
|
||
|
|
if ((int)(kps.channels / obj.channels) != 10) {
|
||
|
|
this->_logger.LogError("ANSCNNFD::DetectionOutput", "Only support 5 keypoints.", __FILE__, __LINE__);
|
||
|
|
exit(1);
|
||
|
|
}
|
||
|
|
|
||
|
|
const float* pCls = cls.ptr(0, 0);
|
||
|
|
const float* pReg = reg.ptr(0, 0);
|
||
|
|
const float* pObj = obj.ptr(0, 0);
|
||
|
|
const float* pKps = kps.ptr(0, 0);
|
||
|
|
|
||
|
|
std::vector<std::pair<float, NormalizedBBox> > score_bbox_vec;
|
||
|
|
std::vector<std::pair<float, NormalizedBBox> > final_score_bbox_vec;
|
||
|
|
|
||
|
|
//get the candidates those are > confidence_threshold
|
||
|
|
for (int i = 0; i < cls.channels; ++i)
|
||
|
|
{
|
||
|
|
float conf = std::sqrt(pCls[i] * pObj[i]);
|
||
|
|
// float conf = pCls[i] * pObj[i];
|
||
|
|
|
||
|
|
if (conf >= confidence_threshold)
|
||
|
|
{
|
||
|
|
NormalizedBBox bb;
|
||
|
|
bb.xmin = pReg[4 * i];
|
||
|
|
bb.ymin = pReg[4 * i + 1];
|
||
|
|
bb.xmax = pReg[4 * i + 2];
|
||
|
|
bb.ymax = pReg[4 * i + 3];
|
||
|
|
|
||
|
|
//store the five landmarks
|
||
|
|
memcpy(bb.lm, pKps + 10 * i, 10 * sizeof(float));
|
||
|
|
score_bbox_vec.push_back(std::make_pair(conf, bb));
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
//Sort the score pair according to the scores in descending order
|
||
|
|
std::stable_sort(score_bbox_vec.begin(), score_bbox_vec.end(), SortScoreBBoxPairDescend);
|
||
|
|
|
||
|
|
// Keep top_k scores if needed.
|
||
|
|
if (top_k > -1 && size_t(top_k) < score_bbox_vec.size()) {
|
||
|
|
score_bbox_vec.resize(top_k);
|
||
|
|
}
|
||
|
|
|
||
|
|
//Do NMS
|
||
|
|
final_score_bbox_vec.clear();
|
||
|
|
while (score_bbox_vec.size() != 0) {
|
||
|
|
const NormalizedBBox bb1 = score_bbox_vec.front().second;
|
||
|
|
bool keep = true;
|
||
|
|
for (size_t k = 0; k < final_score_bbox_vec.size(); k++)
|
||
|
|
{
|
||
|
|
if (keep)
|
||
|
|
{
|
||
|
|
const NormalizedBBox bb2 = final_score_bbox_vec[k].second;
|
||
|
|
float overlap = JaccardOverlap(bb1, bb2);
|
||
|
|
keep = (overlap <= overlap_threshold);
|
||
|
|
}
|
||
|
|
else
|
||
|
|
{
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
if (keep) {
|
||
|
|
final_score_bbox_vec.push_back(score_bbox_vec.front());
|
||
|
|
}
|
||
|
|
score_bbox_vec.erase(score_bbox_vec.begin());
|
||
|
|
}
|
||
|
|
if (keep_top_k > -1 && size_t(keep_top_k) < final_score_bbox_vec.size()) {
|
||
|
|
final_score_bbox_vec.resize(keep_top_k);
|
||
|
|
}
|
||
|
|
|
||
|
|
//copy the results to the output blob
|
||
|
|
int num_faces = (int)final_score_bbox_vec.size();
|
||
|
|
|
||
|
|
std::vector<FaceRect> facesInfo;
|
||
|
|
for (int fi = 0; fi < num_faces; fi++)
|
||
|
|
{
|
||
|
|
std::pair<float, NormalizedBBox> pp = final_score_bbox_vec[fi];
|
||
|
|
|
||
|
|
FaceRect r;
|
||
|
|
r.score = pp.first;
|
||
|
|
r.x = int(pp.second.xmin);
|
||
|
|
r.y = int(pp.second.ymin);
|
||
|
|
r.w = int(pp.second.xmax - pp.second.xmin);
|
||
|
|
r.h = int(pp.second.ymax - pp.second.ymin);
|
||
|
|
//copy landmark data
|
||
|
|
for (int i = 0; i < 10; ++i) {
|
||
|
|
r.lm[i] = int(pp.second.lm[i]);
|
||
|
|
}
|
||
|
|
facesInfo.emplace_back(r);
|
||
|
|
}
|
||
|
|
|
||
|
|
return facesInfo;
|
||
|
|
}
|
||
|
|
|
||
|
|
|
||
|
|
}
|