Files
ANSCORE/modules/ANSODEngine/CNNFaceDetector.cpp

1143 lines
44 KiB
C++
Raw Permalink Normal View History

2026-03-28 16:54:11 +11:00
#include "CNNFaceDetector.h"
#include "Utility.h"
// Allocates `size` usable bytes whose start address is a multiple of
// _MALLOC_ALIGN.
//
// The block is over-allocated with malloc(); the pointer malloc() returned is
// stored in the sizeof(char*) bytes immediately before the aligned address so
// that FreeMemory_() can recover and release it.
//
// Returns 0 when malloc() fails.
void* AllocMemory(size_t size)
{
    // One alignment unit of slack for small requests, two for requests of
    // 4096 bytes or more, plus room for the saved malloc() pointer.
    const size_t alignmentSlack = _MALLOC_ALIGN * ((size >= 4096) + 1L);
    char* const rawBlock = (char*)malloc((size_t)(size + alignmentSlack + sizeof(char*)));
    if (rawBlock == nullptr)
        return 0;

    // Skip past the bookkeeping slot, then round up to the next aligned address.
    const size_t firstCandidate = (size_t)(rawBlock + sizeof(char*) + 1);
    const size_t alignMask = ~(size_t)(_MALLOC_ALIGN - 1);
    char* const alignedBlock = (char*)((firstCandidate + _MALLOC_ALIGN - 1) & alignMask);

    // Remember where the real allocation starts.
    *(char**)(alignedBlock - sizeof(char*)) = rawBlock;
    return alignedBlock;
}
// Releases a block obtained from AllocMemory().
//
// A null pointer is ignored. A pointer that is not aligned to _MALLOC_ALIGN is
// also ignored (nothing is freed), since the saved malloc() pointer is only
// read from the slot just before an aligned address.
void FreeMemory_(void* ptr)
{
    try {
        if (!ptr)
            return;

        const bool misaligned = ((size_t)ptr & (_MALLOC_ALIGN - 1)) != 0;
        if (misaligned)
            return;

        // The original malloc() pointer lives in the slot just before `ptr`.
        char** const savedRawSlot = (char**)ptr - 1;
        free(*savedRawSlot);
    }
    catch (std::exception& e) {
        std::cout << "ANSCENTER::FreeMemory:" << e.what();
    }
}
namespace ANSCENTER {
// Ordering predicate for (score, box) pairs: true when pair1 has the strictly
// higher score, so sorting with it puts the highest scores first.
bool SortScoreBBoxPairDescend(const std::pair<float, NormalizedBBox>& pair1, const std::pair<float, NormalizedBBox>& pair2)
{
    const float lhsScore = pair1.first;
    const float rhsScore = pair2.first;
    return rhsScore < lhsScore;
}
// Model optimisation hook. This detector performs no optimisation step: both
// arguments are ignored, `optimizedModelFolder` is left untouched, and the
// call always reports success.
bool ANSCNNFD::OptimizeModel(bool fp16, std::string& optimizedModelFolder)
{
    (void)fp16;
    (void)optimizedModelFolder;
    return true;
}
// Model loading hook. Nothing is read from `modelZipFilePath` and
// `modelZipPassword` is unused; the call reports success.
// Returns false only if a std::exception is caught, in which case it is
// logged via LogFatal.
bool ANSCNNFD::LoadModel(const std::string& modelZipFilePath, const std::string& modelZipPassword)
{
    bool loaded = false;
    try {
        loaded = true;
    }
    catch (std::exception& e) {
        this->_logger.LogFatal("ANSCNNFD::LoadModel", e.what(), __FILE__, __LINE__);
        loaded = false;
    }
    return loaded;
}
// Prepares the detector for inference.
//
// Parameters:
//   licenseKey        not inspected here; the license flag is set to valid
//                     unconditionally.
//   modelConfig       copied into _modelConfig, after which modelType and
//                     detectionType are forced to the face-detector values.
//   modelZipFilePath  unused by this function.
//   modelZipPassword  unused by this function.
//   labelMap          [out] set to "Face".
//
// Returns true on success. If a std::exception is raised while configuring,
// it is logged via LogFatal and false is returned; _isInitialized is then
// left at its previous value.
bool ANSCNNFD::Initialize(std::string licenseKey, ModelConfig modelConfig, const std::string& modelZipFilePath, const std::string& modelZipPassword, std::string& labelMap) {
    _licenseValid = true;
    try {
        _modelConfig = modelConfig;
        _modelConfig.modelType = ModelType::FACEDETECT;
        _modelConfig.detectionType = DetectionType::FACEDETECTOR;
        // Bind the convolution filter table before the first inference.
        InitParameters();
        labelMap = "Face";
        _isInitialized = true;
        return true;
    }
    catch (const std::exception& e) {
        this->_logger.LogFatal("ANSCNNFD::Initialize", e.what(), __FILE__, __LINE__);
        return false;
    }
}
// Detects faces in `input` and returns one Object per detection whose
// confidence is at least _modelConfig.detectionScoreThreshold.
//
// Input requirements: a non-empty 8-bit, 3-channel image (the network reads
// raw bytes with three interleaved channels). Anything else is rejected with
// a logged error and an empty result.
//
// Small images (either dimension <= 300 px) are treated as pre-cropped faces:
// they are padded by 200 px on every side with replicated borders before
// detection, and the reported box origin is shifted back by 200 px.
//
// Each returned Object carries classId 0, className "Face", the confidence,
// the bounding box, and `mask` holding a copy of the face pixels taken from
// the (possibly padded) frame. The crop rectangle is clamped to the frame; if
// nothing of it lies inside the frame, `mask` is left empty.
//
// Returns an empty vector when the license flag is not set, the detector is
// not initialized, or the input is rejected. If a std::exception is raised
// during detection it is logged via LogFatal and the detections collected so
// far are returned.
std::vector<Object> ANSCNNFD::RunInference(const cv::Mat& input) {
    std::vector<Object> output;
    if (!_licenseValid) {
        // While the model is still loading, return quietly without logging.
        if (_modelLoading.load()) return {};
        this->_logger.LogError("ANSCNNFD::RunInference", "Invalid license", __FILE__, __LINE__);
        return output;
    }
    if (!_isInitialized) {
        this->_logger.LogError("ANSCNNFD::RunInference", "Invalid model", __FILE__, __LINE__);
        return output;
    }
    try {
        if (input.empty() || input.type() != CV_8UC3) {
            this->_logger.LogError("ANSCNNFD::RunInference", "Input must be a non-empty 8-bit 3-channel image", __FILE__, __LINE__);
            return output;
        }

        const int kCropPadding = 200;   // border added around pre-cropped faces
        const int kCropMaxSide = 300;   // images at or below this size count as pre-cropped

        // An image this small is most likely already a cropped face.
        const bool croppedFace = (input.size[0] <= kCropMaxSide) || (input.size[1] <= kCropMaxSide);

        cv::Mat frame;
        if (croppedFace) {
            cv::copyMakeBorder(input, frame, kCropPadding, kCropPadding, kCropPadding, kCropPadding, cv::BORDER_REPLICATE);
        }
        else {
            frame = input.clone();
        }

        // The vector owns the result buffer, so it is released on every exit
        // path, including exceptions.
        std::vector<unsigned char> detectBuffer(DETECT_BUFFER_SIZE);
        int* pResults = FaceDetectCNN(detectBuffer.data(), static_cast<unsigned char*>(frame.ptr(0)), frame.cols, frame.rows, static_cast<int>(frame.step));
        const int faceCount = pResults ? *pResults : 0;
        const cv::Rect frameBounds(0, 0, frame.cols, frame.rows);

        for (int i = 0; i < faceCount; i++)
        {
            // Record layout written by FaceDetectCNN: 16 shorts per face,
            // [0] = score * 100, [1..4] = x, y, w, h.
            const short* p = reinterpret_cast<const short*>(pResults + 1) + 16 * i;
            const float confidence = static_cast<float>(p[0]) / 100;
            if (!(confidence >= _modelConfig.detectionScoreThreshold)) continue;

            int x = p[1];
            int y = p[2];
            const int w = p[3];
            const int h = p[4];

            Object result;
            result.classId = 0;
            result.className = "Face";
            result.confidence = confidence;
            result.box.x = x;
            result.box.y = y;
            if (croppedFace) {
                // Keep the origin inside the original image, then undo the padding.
                if (x <= kCropPadding) x = kCropPadding;
                if (y <= kCropPadding) y = kCropPadding;
                result.box.x = x - kCropPadding;
                result.box.y = y - kCropPadding;
            }
            result.box.width = w;
            result.box.height = h;

            // Clamp the crop to the frame so an out-of-range detection cannot
            // throw and abort the remaining detections.
            const cv::Rect facePos = cv::Rect(cv::Point(x, y), cv::Point(x + w, y + h)) & frameBounds;
            if (facePos.width > 0 && facePos.height > 0) {
                result.mask = frame(facePos).clone();
            }
            output.push_back(result);
        }
        return output;
    }
    catch (const std::exception& e) {
        this->_logger.LogFatal("ANSCNNFD::RunInference", e.what(), __FILE__, __LINE__);
        return output;
    }
}
// Destructor: only writes an informational log entry. Any std::exception
// raised by the logger is reported on stdout and swallowed so that nothing
// escapes the destructor.
ANSCNNFD::~ANSCNNFD()
{
    try
    {
        this->_logger.LogInfo("ANSCNNFD::~ANSCNNFD()", "Release ANSCNNFD ", __FILE__, __LINE__);
    }
    catch (std::exception& logFailure)
    {
        std::cout << "ANSCNNFD::~ANSCNNFD()" << logFailure.what() << std::endl;
    }
}
// Explicit teardown hook. It only writes an informational log entry.
// Returns true on success; returns false if the logger raises a
// std::exception, which is reported on stdout.
bool ANSCNNFD::Destroy()
{
    bool released = false;
    try
    {
        this->_logger.LogInfo("ANSCNNFD::Destroy()", "Release ANSCNNFD ", __FILE__, __LINE__);
        released = true;
    }
    catch (std::exception& logFailure)
    {
        std::cout << "ANSCNNFD::Destroy()" << logFailure.what() << std::endl;
        released = false;
    }
    return released;
}
// Private
int* ANSCNNFD::FaceDetectCNN(unsigned char* result_buffer, unsigned char* rgb_image_data, int width, int height, int step) //input image, it must be BGR (three-channel) image!
{
try {
if (!result_buffer)
{
this->_logger.LogError("ANSCNNFD::FaceDetectCNN", "Null buffer memory", __FILE__, __LINE__);
return nullptr;
}
//clear memory
result_buffer[0] = 0;
result_buffer[1] = 0;
result_buffer[2] = 0;
result_buffer[3] = 0;
std::vector<FaceRect> faces = ObjectDetectCNN(rgb_image_data, width, height, step);
int num_faces = static_cast<int>(faces.size());
num_faces = MIN(num_faces, 1024); //1024 = 0x9000 / (16 * 2 + 4)
int* pCount = reinterpret_cast<int*>(result_buffer);
pCount[0] = num_faces;
for (int i = 0; i < num_faces; i++)
{
//copy data
short* p = reinterpret_cast<short*>(result_buffer + 4) + 16 * size_t(i);
p[0] = static_cast<short>(faces[i].score * 100);
p[1] = static_cast<short>(faces[i].x);
p[2] = static_cast<short>(faces[i].y);
p[3] = static_cast<short>(faces[i].w);
p[4] = static_cast<short>(faces[i].h);
//copy landmarks
for (int lmidx = 0; lmidx < 10; lmidx++)
{
p[5 + lmidx] = static_cast<short>(faces[i].lm[lmidx]);
}
}
return pCount;
}
catch (std::exception& e) {
this->_logger.LogFatal("ANSCNNFD::FaceDetectCNN", e.what(), __FILE__, __LINE__);
return nullptr;
}
}
// Fills the g_pFilters table from param_pConvInfo, one entry per convolution
// layer (NUM_CONV_LAYER entries), and marks the parameters as initialised.
void ANSCNNFD::InitParameters()
{
    int layer = 0;
    while (layer < NUM_CONV_LAYER)
    {
        g_pFilters[layer] = param_pConvInfo[layer];
        ++layer;
    }
    _paramInitialized = true;
}
// Runs the complete detection network on one image and returns the faces.
//
// Parameters:
//   rgbImageData  first byte of the image; it is read as three interleaved
//                 8-bit channels (the conversion step is called with 3 channels).
//   width/height  image size in pixels.
//   step          bytes per image row.
//
// Pipeline, in execution order:
//   1. lazily bind the filter table (InitParameters) if not done yet;
//   2. convert the image into the first feature blob;
//   3. backbone: filters 0-22 with three max-pool stages, producing the
//      feature maps fb1, fb2 and fb3;
//   4. top-down merge: fb3 is upsampled and added into fb2, then fb2 is
//      upsampled and added into fb1; each level gets class, box, keypoint
//      and objectness predictions (filters 23-52);
//   5. decode boxes and keypoints against mesh-grid priors with strides
//      8 (fb1), 16 (fb2) and 32 (fb3);
//   6. flatten and concatenate the three levels, apply sigmoid to the class
//      and objectness scores, and run DetectionOutput.
//
// Returns the FaceRect list from DetectionOutput. If a std::exception is
// caught it is logged via LogFatal and an empty vector is returned.
// Note: several helpers called here terminate the process with exit(1) on
// invalid input instead of throwing, so those failures never reach the catch.
//
// TIME_START / TIME_END are macros defined outside this file — presumably
// per-stage timing instrumentation; confirm against their definition.
std::vector<FaceRect> ANSCNNFD::ObjectDetectCNN(const unsigned char* rgbImageData, int width, int height, int step) {
try {
TIME_START;
if (!_paramInitialized)
{
InitParameters();
}
TIME_END("init");
TIME_START;
// Image -> first feature blob (3 = number of image channels).
auto fx = SetDataFrom3x3S2P1To1x1S1P0FromImage(rgbImageData, width, height, 3, step);
TIME_END("convert data");
/***************CONV0*********************/
TIME_START;
fx = Convolution(fx, g_pFilters[0]);
TIME_END("conv_head");
TIME_START;
fx = ConvolutionDP(fx, g_pFilters[1], g_pFilters[2]);
TIME_END("conv0");
TIME_START;
fx = MaxPooling2x2S2(fx);
TIME_END("pool0");
/***************CONV1*********************/
TIME_START;
fx = Convolution4LayerUnit(fx, g_pFilters[3], g_pFilters[4], g_pFilters[5], g_pFilters[6]);
TIME_END("conv1");
/***************CONV2*********************/
TIME_START;
fx = Convolution4LayerUnit(fx, g_pFilters[7], g_pFilters[8], g_pFilters[9], g_pFilters[10]);
TIME_END("conv2");
/***************CONV3*********************/
TIME_START;
fx = MaxPooling2x2S2(fx);
TIME_END("pool3");
TIME_START;
// fb1: first feature level kept for the detection heads (prior stride 8).
auto fb1 = Convolution4LayerUnit(fx, g_pFilters[11], g_pFilters[12], g_pFilters[13], g_pFilters[14]);
TIME_END("conv3");
/***************CONV4*********************/
TIME_START;
fx = MaxPooling2x2S2(fb1);
TIME_END("pool4");
TIME_START;
// fb2: second feature level (prior stride 16).
auto fb2 = Convolution4LayerUnit(fx, g_pFilters[15], g_pFilters[16], g_pFilters[17], g_pFilters[18]);
TIME_END("conv4");
/***************CONV5*********************/
TIME_START;
fx = MaxPooling2x2S2(fb2);
TIME_END("pool5");
TIME_START;
// fb3: third feature level (prior stride 32).
auto fb3 = Convolution4LayerUnit(fx, g_pFilters[19], g_pFilters[20], g_pFilters[21], g_pFilters[22]);
TIME_END("conv5");
// Per-level predictions; index 0/1/2 corresponds to fb1/fb2/fb3.
CDataBlob<float> pred_reg[3], pred_cls[3], pred_kps[3], pred_obj[3];
/***************branch5*********************/
TIME_START;
fb3 = ConvolutionDP(fb3, g_pFilters[27], g_pFilters[28]);
// The prediction heads are run with do_relu = false.
pred_cls[2] = ConvolutionDP(fb3, g_pFilters[33], g_pFilters[34], false);
pred_reg[2] = ConvolutionDP(fb3, g_pFilters[39], g_pFilters[40], false);
pred_kps[2] = ConvolutionDP(fb3, g_pFilters[51], g_pFilters[52], false);
pred_obj[2] = ConvolutionDP(fb3, g_pFilters[45], g_pFilters[46], false);
TIME_END("branch5");
/*****************add5*********************/
TIME_START;
// Merge the upsampled fb3 into fb2.
fb2 = ElementAdd(UpsampleX2(fb3), fb2);
TIME_END("add5");
/***************branch4*********************/
TIME_START;
fb2 = ConvolutionDP(fb2, g_pFilters[25], g_pFilters[26]);
pred_cls[1] = ConvolutionDP(fb2, g_pFilters[31], g_pFilters[32], false);
pred_reg[1] = ConvolutionDP(fb2, g_pFilters[37], g_pFilters[38], false);
pred_kps[1] = ConvolutionDP(fb2, g_pFilters[49], g_pFilters[50], false);
pred_obj[1] = ConvolutionDP(fb2, g_pFilters[43], g_pFilters[44], false);
TIME_END("branch4");
/*****************add4*********************/
TIME_START;
// Merge the upsampled fb2 into fb1.
fb1 = ElementAdd(UpsampleX2(fb2), fb1);
TIME_END("add4");
/***************branch3*********************/
TIME_START;
fb1 = ConvolutionDP(fb1, g_pFilters[23], g_pFilters[24]);
pred_cls[0] = ConvolutionDP(fb1, g_pFilters[29], g_pFilters[30], false);
pred_reg[0] = ConvolutionDP(fb1, g_pFilters[35], g_pFilters[36], false);
pred_kps[0] = ConvolutionDP(fb1, g_pFilters[47], g_pFilters[48], false);
pred_obj[0] = ConvolutionDP(fb1, g_pFilters[41], g_pFilters[42], false);
TIME_END("branch3");
/***************PRIORBOX*********************/
TIME_START;
// One prior point per feature-map cell, spaced by the level's stride.
auto prior3 = MeshGrid(fb1.cols, fb1.rows, 8);
auto prior4 = MeshGrid(fb2.cols, fb2.rows, 16);
auto prior5 = MeshGrid(fb3.cols, fb3.rows, 32);
TIME_END("prior");
/***************DECODE*********************/
TIME_START;
// Decode in place: boxes become corner coordinates, keypoints become
// absolute positions.
BboxDecode(pred_reg[0], prior3, 8);
BboxDecode(pred_reg[1], prior4, 16);
BboxDecode(pred_reg[2], prior5, 32);
KPSDecode(pred_kps[0], prior3, 8);
KPSDecode(pred_kps[1], prior4, 16);
KPSDecode(pred_kps[2], prior5, 32);
// Flatten each level to a 1x1xN vector and join the three levels.
auto cls = Concat3(Blob2Vector(pred_cls[0]), Blob2Vector(pred_cls[1]), Blob2Vector(pred_cls[2]));
auto reg = Concat3(Blob2Vector(pred_reg[0]), Blob2Vector(pred_reg[1]), Blob2Vector(pred_reg[2]));
auto kps = Concat3(Blob2Vector(pred_kps[0]), Blob2Vector(pred_kps[1]), Blob2Vector(pred_kps[2]));
auto obj = Concat3(Blob2Vector(pred_obj[0]), Blob2Vector(pred_obj[1]), Blob2Vector(pred_obj[2]));
Sigmoid(cls);
Sigmoid(obj);
TIME_END("decode")
TIME_START;
// Arguments: overlap threshold 0.45, confidence threshold 0.2,
// top_k = 1000 candidates into NMS, keep_top_k = 512 results out.
std::vector<FaceRect> facesInfo = DetectionOutput(cls, reg, kps, obj, 0.45f, 0.2f, 1000, 512);
TIME_END("detection output")
return facesInfo;
}
catch (std::exception& e) {
std::vector<FaceRect> facesInfo;
facesInfo.clear();
this->_logger.LogFatal("ANSCNNFD::ObjectDetectCNN", e.what(), __FILE__, __LINE__);
return facesInfo;
}
}
// Converts an interleaved 3-channel 8-bit image into the network's first
// feature blob.
//
// Each output cell (r, c) gathers the 3x3 pixel neighbourhood centred on image
// pixel (2r, 2c): the nine taps are stored consecutively, three channel values
// per tap, in channels 0..26 of a 32-channel blob. Taps that fall outside the
// image are skipped and therefore keep whatever value the blob held after
// construction (NOTE(review): this relies on CDataBlob zero-initialising its
// storage — confirm; channels 27..31 are never written here either).
//
// The output has (padded height / 2) rows and (padded width / 2) columns,
// where each image dimension is first rounded up to a multiple of padDivisor.
//
// Terminates the process with exit(1) when imgChannels is not 3 or padDivisor
// is not 32.
CDataBlob<float> ANSCNNFD::SetDataFrom3x3S2P1To1x1S1P0FromImage(const unsigned char* inputData, int imgWidth, int imgHeight, int imgChannels, int imgWidthStep, int padDivisor) {
    if (imgChannels != 3) {
        this->_logger.LogError("ANSCNNFD::SetDataFrom3x3S2P1To1x1S1P0FromImage", "The input image must be a 3-channel RGB image", __FILE__, __LINE__);
        exit(1);
    }
    if (padDivisor != 32) {
        this->_logger.LogError("ANSCNNFD::SetDataFrom3x3S2P1To1x1S1P0FromImage", "This version need pad of 32", __FILE__, __LINE__);
        exit(1);
    }

    const int rows = ((imgHeight - 1) / padDivisor + 1) * padDivisor / 2;
    const int cols = ((imgWidth - 1) / padDivisor + 1) * padDivisor / 2;
    const int channels = 32;
    CDataBlob<float> outBlob(rows, cols, channels);

#if defined(_OPENMP)
#pragma omp parallel for
#endif
    for (int r = 0; r < rows; r++) {
        for (int c = 0; c < cols; c++) {
            float* cell = outBlob.ptr(r, c);
            // tap 0..8 enumerates the 3x3 window row by row.
            for (int tap = 0; tap < 9; ++tap) {
                const int fy = tap / 3 - 1;
                const int fx = tap % 3 - 1;
                const int srcy = r * 2 + fy;
                const int srcx = c * 2 + fx;
                const bool outsideImage = (srcy < 0 || srcy >= imgHeight) || (srcx < 0 || srcx >= imgWidth);
                if (outsideImage)
                    continue;

                const unsigned char* pixel = inputData + size_t(imgWidthStep) * srcy + imgChannels * srcx;
                float* dst = cell + tap * imgChannels;
                dst[0] = pixel[0];
                dst[1] = pixel[1];
                dst[2] = pixel[2];
            }
        }
    }
    return outBlob;
}
// Returns the dot product of the first `num` floats of p1 and p2.
//
// SIMD builds (_ENABLE_AVX512 / _ENABLE_AVX2 / _ENABLE_NEON) process 16, 8 or
// 4 floats per iteration and always handle whole vectors, so `num` is
// effectively rounded up to the vector width and the buffers must be readable
// that far. The AVX paths use aligned loads: p1 and p2 must be 512-bit aligned
// (16 float numbers). The portable fallback has no such requirements.
inline float dotProduct(const float* p1, const float* p2, int num)
{
    float sum = 0.f;
#if defined(_ENABLE_AVX512)
    __m512 acc = _mm512_setzero_ps();
    for (int i = 0; i < num; i += 16)
    {
        const __m512 lhs = _mm512_load_ps(p1 + i);
        const __m512 rhs = _mm512_load_ps(p2 + i);
        acc = _mm512_add_ps(acc, _mm512_mul_ps(lhs, rhs));
    }
    sum = _mm512_reduce_add_ps(acc);
#elif defined(_ENABLE_AVX2)
    __m256 acc = _mm256_setzero_ps();
    for (int i = 0; i < num; i += 8)
    {
        const __m256 lhs = _mm256_load_ps(p1 + i);
        const __m256 rhs = _mm256_load_ps(p2 + i);
        acc = _mm256_add_ps(acc, _mm256_mul_ps(lhs, rhs));
    }
    // Two horizontal adds leave each 128-bit half's total in its first lane.
    acc = _mm256_hadd_ps(acc, acc);
    acc = _mm256_hadd_ps(acc, acc);
    sum = ((float*)&acc)[0] + ((float*)&acc)[4];
#elif defined(_ENABLE_NEON)
    float32x4_t acc = vdupq_n_f32(0);
    for (int i = 0; i < num; i += 4)
    {
        const float32x4_t lhs = vld1q_f32(p1 + i);
        const float32x4_t rhs = vld1q_f32(p2 + i);
        acc = vaddq_f32(acc, vmulq_f32(lhs, rhs));
    }
    sum += vgetq_lane_f32(acc, 0);
    sum += vgetq_lane_f32(acc, 1);
    sum += vgetq_lane_f32(acc, 2);
    sum += vgetq_lane_f32(acc, 3);
#else
    const float* lhs = p1;
    const float* rhs = p2;
    for (int remaining = num; remaining > 0; --remaining, ++lhs, ++rhs)
    {
        sum += ((*lhs) * (*rhs));
    }
#endif
    return sum;
}
// Element-wise multiply-accumulate: p3[i] += p1[i] * p2[i] for i in [0, num).
// Always returns true.
//
// SIMD builds process 16 (AVX-512), 8 (AVX2) or 4 (NEON) floats per iteration
// and always handle whole vectors, so the buffers must be valid up to `num`
// rounded up to the vector width; the AVX paths use aligned loads and stores.
inline bool vecMulAdd(const float* p1, const float* p2, float* p3, int num)
{
#if defined(_ENABLE_AVX512)
    for (int i = 0; i < num; i += 16)
    {
        const __m512 lhs = _mm512_load_ps(p1 + i);
        const __m512 rhs = _mm512_load_ps(p2 + i);
        __m512 acc = _mm512_load_ps(p3 + i);
        acc = _mm512_add_ps(acc, _mm512_mul_ps(lhs, rhs));
        _mm512_store_ps(p3 + i, acc);
    }
#elif defined(_ENABLE_AVX2)
    for (int i = 0; i < num; i += 8)
    {
        const __m256 lhs = _mm256_load_ps(p1 + i);
        const __m256 rhs = _mm256_load_ps(p2 + i);
        __m256 acc = _mm256_load_ps(p3 + i);
        acc = _mm256_add_ps(acc, _mm256_mul_ps(lhs, rhs));
        _mm256_store_ps(p3 + i, acc);
    }
#elif defined(_ENABLE_NEON)
    for (int i = 0; i < num; i += 4)
    {
        const float32x4_t lhs = vld1q_f32(p1 + i);
        const float32x4_t rhs = vld1q_f32(p2 + i);
        float32x4_t acc = vld1q_f32(p3 + i);
        acc = vaddq_f32(acc, vmulq_f32(lhs, rhs));
        vst1q_f32(p3 + i, acc);
    }
#else
    int idx = 0;
    while (idx < num)
    {
        p3[idx] += (p1[idx] * p2[idx]);
        ++idx;
    }
#endif
    return true;
}
// In-place element-wise addition: p2[i] += p1[i] for i in [0, num).
// Always returns true.
//
// SIMD builds process 16 (AVX-512), 8 (AVX2) or 4 (NEON) floats per iteration
// and always handle whole vectors, so the buffers must be valid up to `num`
// rounded up to the vector width; the AVX paths use aligned loads and stores.
inline bool vecAdd(const float* p1, float* p2, int num)
{
#if defined(_ENABLE_AVX512)
    for (int i = 0; i < num; i += 16)
    {
        const __m512 addend = _mm512_load_ps(p1 + i);
        __m512 target = _mm512_load_ps(p2 + i);
        target = _mm512_add_ps(addend, target);
        _mm512_store_ps(p2 + i, target);
    }
#elif defined(_ENABLE_AVX2)
    for (int i = 0; i < num; i += 8)
    {
        const __m256 addend = _mm256_load_ps(p1 + i);
        __m256 target = _mm256_load_ps(p2 + i);
        target = _mm256_add_ps(addend, target);
        _mm256_store_ps(p2 + i, target);
    }
#elif defined(_ENABLE_NEON)
    for (int i = 0; i < num; i += 4)
    {
        const float32x4_t addend = vld1q_f32(p1 + i);
        const float32x4_t target = vld1q_f32(p2 + i);
        const float32x4_t total = vaddq_f32(addend, target);
        vst1q_f32(p2 + i, total);
    }
#else
    int idx = 0;
    while (idx < num)
    {
        p2[idx] += p1[idx];
        ++idx;
    }
#endif
    return true;
}
// Element-wise addition into a separate destination:
// p3[i] = p1[i] + p2[i] for i in [0, num). Always returns true.
//
// SIMD builds process 16 (AVX-512), 8 (AVX2) or 4 (NEON) floats per iteration
// and always handle whole vectors, so the buffers must be valid up to `num`
// rounded up to the vector width; the AVX paths use aligned loads and stores.
inline bool vecAdd(const float* p1, const float* p2, float* p3, int num)
{
#if defined(_ENABLE_AVX512)
    for (int i = 0; i < num; i += 16)
    {
        const __m512 lhs = _mm512_load_ps(p1 + i);
        const __m512 rhs = _mm512_load_ps(p2 + i);
        _mm512_store_ps(p3 + i, _mm512_add_ps(lhs, rhs));
    }
#elif defined(_ENABLE_AVX2)
    for (int i = 0; i < num; i += 8)
    {
        const __m256 lhs = _mm256_load_ps(p1 + i);
        const __m256 rhs = _mm256_load_ps(p2 + i);
        _mm256_store_ps(p3 + i, _mm256_add_ps(lhs, rhs));
    }
#elif defined(_ENABLE_NEON)
    for (int i = 0; i < num; i += 4)
    {
        const float32x4_t lhs = vld1q_f32(p1 + i);
        const float32x4_t rhs = vld1q_f32(p2 + i);
        vst1q_f32(p3 + i, vaddq_f32(lhs, rhs));
    }
#else
    int idx = 0;
    while (idx < num)
    {
        p3[idx] = p1[idx] + p2[idx];
        ++idx;
    }
#endif
    return true;
}
// 1x1 (point-wise) convolution.
//
// For every cell of `outputData` and every output channel `ch`, the result is
// the dot product of the input cell's channel vector with filter `ch`
// (filters.weights.ptr(0, ch)) plus filters.biases.data[ch].
// `outputData` must already be allocated; its rows/cols drive the loops and
// the same (row, col) is read from `inputData`. Always returns true.
bool ANSCNNFD::Convolution1x1PointWise(const CDataBlob<float>& inputData, const Filters<float>& filters, CDataBlob<float>& outputData)
{
#if defined(_OPENMP)
#pragma omp parallel for
#endif
    for (int row = 0; row < outputData.rows; row++)
    {
        for (int col = 0; col < outputData.cols; col++)
        {
            float* dstCell = outputData.ptr(row, col);
            const float* srcCell = inputData.ptr(row, col);
            for (int ch = 0; ch < outputData.channels; ch++)
            {
                const float* kernel = filters.weights.ptr(0, ch);
                const float response = dotProduct(srcCell, kernel, inputData.channels);
                dstCell[ch] = response + filters.biases.data[ch];
            }
        }
    }
    return true;
}
// 3x3 depth-wise convolution with implicit zero padding of one cell.
//
// `outputData` is cleared first. For every output cell, each in-range
// neighbour within the 3x3 window contributes input * weight per channel
// (weights for window position k are at filters.weights.ptr(0, k), with k
// counted row by row from the top-left), and the bias vector is added last.
// Neighbours outside the input are simply skipped. Always returns true.
bool ANSCNNFD::Convolution3x3DepthWise(const CDataBlob<float>& inputData, const Filters<float>& filters, CDataBlob<float>& outputData)
{
    // Accumulation below relies on a zeroed destination.
    outputData.setZero();
#if defined(_OPENMP)
#pragma omp parallel for
#endif
    for (int row = 0; row < outputData.rows; row++)
    {
        // Vertical extent of the window, clipped to the input.
        const int rowFirst = MAX(0, row - 1);
        const int rowLast = MIN(row + 2, inputData.rows);
        for (int col = 0; col < outputData.cols; col++)
        {
            float* accum = outputData.ptr(row, col);
            // Horizontal extent of the window, clipped to the input.
            const int colFirst = MAX(0, col - 1);
            const int colLast = MIN(col + 2, inputData.cols);
            for (int r = rowFirst; r < rowLast; r++)
            {
                for (int c = colFirst; c < colLast; c++)
                {
                    // Position of (r, c) inside the 3x3 window, 0..8.
                    const int tap = (r - row + 1) * 3 + (c - col + 1);
                    vecMulAdd(inputData.ptr(r, c), filters.weights.ptr(0, tap), accum, filters.num_filters);
                }
            }
            vecAdd(filters.biases.ptr(0, 0), accum, filters.num_filters);
        }
    }
    return true;
}
// In-place ReLU over the whole blob storage, padding included.
//
// Returns false (after logging) when the blob is empty, true otherwise.
// The AVX paths take max(value, 0) on whole vectors with aligned access; the
// portable path multiplies each value by (value > 0).
bool ANSCNNFD::Relu(CDataBlob<float>& inputoutputData)
{
    if (inputoutputData.isEmpty())
    {
        this->_logger.LogError("ANSCNNFD::Relu", "The input data is empty", __FILE__, __LINE__);
        return false;
    }

    // Number of floats in the buffer: channelStep is in bytes.
    int len = inputoutputData.cols * inputoutputData.rows * inputoutputData.channelStep / sizeof(float);
    float* const values = inputoutputData.data;
#if defined(_ENABLE_AVX512)
    const __m512 zeros = _mm512_setzero_ps();
    for (int i = 0; i < len; i += 16)
    {
        __m512 lane = _mm512_load_ps(values + i);
        lane = _mm512_max_ps(lane, zeros);
        _mm512_store_ps(values + i, lane);
    }
#elif defined(_ENABLE_AVX2)
    const __m256 zeros = _mm256_setzero_ps();
    for (int i = 0; i < len; i += 8)
    {
        __m256 lane = _mm256_load_ps(values + i);
        lane = _mm256_max_ps(lane, zeros);
        _mm256_store_ps(values + i, lane);
    }
#else
    for (int i = 0; i < len; i++)
    {
        values[i] *= (values[i] > 0);
    }
#endif
    return true;
}
// Writes the intersection of bbox1 and bbox2 into *intersect_bbox.
// When the boxes do not overlap the result is [0, 0, 0, 0].
// Only xmin/ymin/xmax/ymax of the output are written.
void ANSCNNFD::IntersectBBox(const NormalizedBBox& bbox1, const NormalizedBBox& bbox2,
    NormalizedBBox* intersect_bbox)
{
    const bool separatedInX = bbox2.xmin > bbox1.xmax || bbox2.xmax < bbox1.xmin;
    const bool separatedInY = bbox2.ymin > bbox1.ymax || bbox2.ymax < bbox1.ymin;
    if (separatedInX || separatedInY)
    {
        // No intersection.
        intersect_bbox->xmin = 0;
        intersect_bbox->ymin = 0;
        intersect_bbox->xmax = 0;
        intersect_bbox->ymax = 0;
        return;
    }

    // Overlap region: the larger of the minima, the smaller of the maxima.
    intersect_bbox->xmin = (std::max(bbox1.xmin, bbox2.xmin));
    intersect_bbox->ymin = (std::max(bbox1.ymin, bbox2.ymin));
    intersect_bbox->xmax = (std::min(bbox1.xmax, bbox2.xmax));
    intersect_bbox->ymax = (std::min(bbox1.ymax, bbox2.ymax));
}
float ANSCNNFD::JaccardOverlap(const NormalizedBBox& bbox1, const NormalizedBBox& bbox2)
{
NormalizedBBox intersect_bbox;
IntersectBBox(bbox1, bbox2, &intersect_bbox);
float intersect_width, intersect_height;
intersect_width = intersect_bbox.xmax - intersect_bbox.xmin;
intersect_height = intersect_bbox.ymax - intersect_bbox.ymin;
if (intersect_width > 0 && intersect_height > 0)
{
float intersect_size = intersect_width * intersect_height;
float bsize1 = (bbox1.xmax - bbox1.xmin) * (bbox1.ymax - bbox1.ymin);
float bsize2 = (bbox2.xmax - bbox2.xmin) * (bbox2.ymax - bbox2.ymin);
return intersect_size / (bsize1 + bsize2 - intersect_size);
}
else
{
return 0.f;
}
}
// Returns a blob with twice the rows and columns of `inputData`, in which
// every input cell is replicated into a 2x2 block (nearest-neighbour
// upsampling). The channel count is unchanged.
// Terminates the process with exit(1) when the input is empty.
CDataBlob<float> ANSCNNFD::UpsampleX2(const CDataBlob<float>& inputData) {
    if (inputData.isEmpty()) {
        this->_logger.LogError("ANSCNNFD::UpsampleX2", "The input data is empty", __FILE__, __LINE__);
        exit(1);
    }

    CDataBlob<float> outData(inputData.rows * 2, inputData.cols * 2, inputData.channels);
    for (int r = 0; r < inputData.rows; r++) {
        const int top = r * 2;
        for (int c = 0; c < inputData.cols; c++) {
            const int left = c * 2;
            const float* src = inputData.ptr(r, c);
            // The four destination cells that receive this source cell.
            float* topLeft = outData.ptr(top, left);
            float* topRight = outData.ptr(top, left + 1);
            float* bottomLeft = outData.ptr(top + 1, left);
            float* bottomRight = outData.ptr(top + 1, left + 1);
            for (int ch = 0; ch < inputData.channels; ++ch) {
                const float value = src[ch];
                topLeft[ch] = value;
                topRight[ch] = value;
                bottomLeft[ch] = value;
                bottomRight[ch] = value;
            }
        }
    }
    return outData;
}
// Returns the element-wise sum of two blobs of identical shape.
// Terminates the process with exit(1) when rows, cols or channels differ.
CDataBlob<float> ANSCNNFD::ElementAdd(const CDataBlob<float>& inputData1, const CDataBlob<float>& inputData2) {
    const bool sameShape = inputData1.rows == inputData2.rows
        && inputData1.cols == inputData2.cols
        && inputData1.channels == inputData2.channels;
    if (!sameShape) {
        this->_logger.LogError("ANSCNNFD::ElementAdd", "The two input datas must be in the same shape.", __FILE__, __LINE__);
        exit(1);
    }

    CDataBlob<float> outData(inputData1.rows, inputData1.cols, inputData1.channels);
    for (int r = 0; r < inputData1.rows; r++) {
        for (int c = 0; c < inputData1.cols; c++) {
            // Add the two channel vectors of this cell into the output cell.
            vecAdd(inputData1.ptr(r, c), inputData2.ptr(r, c), outData.ptr(r, c), inputData1.channels);
        }
    }
    return outData;
}
// Applies one convolution layer and, optionally, ReLU.
//
// The filter must be exactly one of point-wise (1x1) or depth-wise (3x3);
// the output has the input's rows/cols and filters.num_filters channels.
//
// Terminates the process with exit(1) when the input, weights or biases are
// empty, when the input channel count differs from filters.channels, or when
// the filter is neither/both point-wise and depth-wise.
CDataBlob<float> ANSCNNFD::Convolution(const CDataBlob<float>& inputData, const Filters<float>& filters, bool do_relu)
{
    const bool somethingEmpty = inputData.isEmpty() || filters.weights.isEmpty() || filters.biases.isEmpty();
    if (somethingEmpty)
    {
        this->_logger.LogError("ANSCNNFD::Convolution", "The input data or filter data is empty.", __FILE__, __LINE__);
        exit(1);
    }
    if (inputData.channels != filters.channels)
    {
        this->_logger.LogError("ANSCNNFD::Convolution", "The input data dimension cannot meet filters.", __FILE__, __LINE__);
        exit(1);
    }

    CDataBlob<float> outputData(inputData.rows, inputData.cols, filters.num_filters);

    const bool pointwise = filters.is_pointwise;
    const bool depthwise = filters.is_depthwise;
    if (pointwise == depthwise)
    {
        // Both flags set, or neither: no implementation for that.
        this->_logger.LogError("ANSCNNFD::Convolution", "Unsupported filter type.", __FILE__, __LINE__);
        exit(1);
    }
    if (pointwise)
    {
        Convolution1x1PointWise(inputData, filters, outputData);
    }
    else
    {
        Convolution3x3DepthWise(inputData, filters, outputData);
    }

    if (do_relu)
    {
        Relu(outputData);
    }
    return outputData;
}
// Two chained convolutions: `filtersP` is applied first without ReLU, then
// `filtersD` is applied to that result with ReLU controlled by `do_relu`.
CDataBlob<float> ANSCNNFD::ConvolutionDP(const CDataBlob<float>& inputData,
    const Filters<float>& filtersP, const Filters<float>& filtersD, bool do_relu)
{
    const CDataBlob<float> firstStage = Convolution(inputData, filtersP, false);
    return Convolution(firstStage, filtersD, do_relu);
}
// Two chained ConvolutionDP stages (four convolution layers in total).
// The first stage always ends with ReLU; the second stage's ReLU is
// controlled by `do_relu`.
CDataBlob<float> ANSCNNFD::Convolution4LayerUnit(const CDataBlob<float>& inputData,
    const Filters<float>& filtersP1, const Filters<float>& filtersD1,
    const Filters<float>& filtersP2, const Filters<float>& filtersD2, bool do_relu)
{
    const CDataBlob<float> firstStage = ConvolutionDP(inputData, filtersP1, filtersD1, true);
    return ConvolutionDP(firstStage, filtersP2, filtersD2, do_relu);
}
// Max pooling with a 2x2 window and stride 2 (only 2X2 S2 is supported).
//
// Output size per dimension is ceil((n - 3) / 2) + 1, i.e. the input size
// halved and rounded up for n >= 2; the channel count is unchanged. Windows
// that reach past the last row/column are clipped, so they cover fewer than
// four cells.
//
// Terminates the process with exit(1) when the input is empty or when the
// computed output size is below 1 in either dimension (input of size 1).
//
// The SIMD paths process 4 (NEON), 16 (AVX-512) or 8 (AVX2) channels per
// iteration on whole vectors; the AVX paths use aligned loads/stores.
// NOTE(review): this presumably relies on CDataBlob padding each cell's
// channel storage to the vector width — confirm against CDataBlob.
CDataBlob<float> ANSCNNFD::MaxPooling2x2S2(const CDataBlob<float>& inputData)
{
if (inputData.isEmpty())
{
this->_logger.LogError("ANSCNNFD::MaxPooling2x2S2", "The input data is empty.", __FILE__, __LINE__);
exit(1);
}
int outputR = static_cast<int>(ceil((inputData.rows - 3.0f) / 2)) + 1;
int outputC = static_cast<int>(ceil((inputData.cols - 3.0f) / 2)) + 1;
int outputCH = inputData.channels;
if (outputR < 1 || outputC < 1)
{
this->_logger.LogError("ANSCNNFD::MaxPooling2x2S2", "The size of the output is not correct.", __FILE__, __LINE__);
exit(1);
}
CDataBlob<float> outputData(outputR, outputC, outputCH);
outputData.setZero();
for (int row = 0; row < outputData.rows; row++)
{
for (int col = 0; col < outputData.cols; col++)
{
// Offsets (in floats, from inputData.data) of the up-to-four input cells
// covered by this output cell's window.
size_t inputMatOffsetsInElement[4];
int elementCount = 0;
int rstart = row * 2;
int cstart = col * 2;
// Clip the window to the input bounds.
int rend = MIN(rstart + 2, inputData.rows);
int cend = MIN(cstart + 2, inputData.cols);
for (int fr = rstart; fr < rend; fr++)
{
for (int fc = cstart; fc < cend; fc++)
{
// channelStep is in bytes, hence the division by sizeof(float).
inputMatOffsetsInElement[elementCount++] = (size_t(fr) * inputData.cols + fc) * inputData.channelStep / sizeof(float);
}
}
float* pOut = outputData.ptr(row, col);
float* pIn = inputData.data;
// Per channel: start from the first covered cell, then take the maximum
// with each remaining covered cell.
#if defined(_ENABLE_NEON)
for (int ch = 0; ch < outputData.channels; ch += 4)
{
float32x4_t tmp;
float32x4_t maxVal = vld1q_f32(pIn + ch + inputMatOffsetsInElement[0]);
for (int ec = 1; ec < elementCount; ec++)
{
tmp = vld1q_f32(pIn + ch + inputMatOffsetsInElement[ec]);
maxVal = vmaxq_f32(maxVal, tmp);
}
vst1q_f32(pOut + ch, maxVal);
}
#elif defined(_ENABLE_AVX512)
for (int ch = 0; ch < outputData.channels; ch += 16)
{
__m512 tmp;
__m512 maxVal = _mm512_load_ps((__m512 const*)(pIn + ch + inputMatOffsetsInElement[0]));
for (int ec = 1; ec < elementCount; ec++)
{
tmp = _mm512_load_ps((__m512 const*)(pIn + ch + inputMatOffsetsInElement[ec]));
maxVal = _mm512_max_ps(maxVal, tmp);
}
_mm512_store_ps((__m512*)(pOut + ch), maxVal);
}
#elif defined(_ENABLE_AVX2)
for (int ch = 0; ch < outputData.channels; ch += 8)
{
__m256 tmp;
__m256 maxVal = _mm256_load_ps((float const*)(pIn + ch + inputMatOffsetsInElement[0]));
for (int ec = 1; ec < elementCount; ec++)
{
tmp = _mm256_load_ps((float const*)(pIn + ch + inputMatOffsetsInElement[ec]));
maxVal = _mm256_max_ps(maxVal, tmp);
}
_mm256_store_ps(pOut + ch, maxVal);
}
#else
// Portable scalar fallback.
for (int ch = 0; ch < outputData.channels; ch++)
{
float maxVal = pIn[ch + inputMatOffsetsInElement[0]];
for (int ec = 1; ec < elementCount; ec++)
{
maxVal = MAX(maxVal, pIn[ch + inputMatOffsetsInElement[ec]]);
}
pOut[ch] = maxVal;
}
#endif
}
}
return outputData;
}
// Builds a feature_height x feature_width blob with two channels per cell:
// channel 0 = column * stride + offset, channel 1 = row * stride + offset.
CDataBlob<float> ANSCNNFD::MeshGrid(int feature_width, int feature_height, int stride, float offset) {
    CDataBlob<float> grid(feature_height, feature_width, 2);
    for (int row = 0; row < feature_height; ++row) {
        // The second coordinate is shared by every cell in this row.
        const float rowCoord = (float)(row * stride) + offset;
        for (int col = 0; col < feature_width; ++col) {
            float* cell = grid.ptr(row, col);
            cell[0] = (float)(col * stride) + offset;
            cell[1] = rowCoord;
        }
    }
    return grid;
}
// Decodes raw box regressions in place into corner coordinates.
//
// Per cell, the four input channels are (dx, dy, log-w, log-h) relative to the
// prior point of that cell:
//   centre = (dx, dy) * stride + prior
//   size   = exp(log-w, log-h) * stride
// and the cell is overwritten with (xmin, ymin, xmax, ymax).
//
// Parameters:
//   bbox_pred  [in/out] blob with exactly 4 channels.
//   priors     blob with the same rows/cols as bbox_pred; channels 0 and 1 of
//              each cell are read as the prior's two coordinates.
//   stride     scale factor applied to offsets and sizes.
//
// Terminates the process with exit(1) when the shapes do not match or the
// channel count is not 4, matching KPSDecode. Previously these cases were only
// logged and decoding went on to read and write outside the valid data.
void ANSCNNFD::BboxDecode(CDataBlob<float>& bbox_pred, const CDataBlob<float>& priors, int stride) {
    if (bbox_pred.cols != priors.cols || bbox_pred.rows != priors.rows) {
        this->_logger.LogError("ANSCNNFD::BboxDecode", "Mismatch between feature map and anchor size.", __FILE__, __LINE__);
        exit(1);
    }
    if (bbox_pred.channels != 4) {
        this->_logger.LogError("ANSCNNFD::BboxDecode", "The bbox dim must be 4.", __FILE__, __LINE__);
        exit(1);
    }
    const float fstride = (float)stride;
    for (int r = 0; r < bbox_pred.rows; ++r) {
        for (int c = 0; c < bbox_pred.cols; ++c) {
            float* pb = bbox_pred.ptr(r, c);
            const float* pp = priors.ptr(r, c);
            const float cx = pb[0] * fstride + pp[0];
            const float cy = pb[1] * fstride + pp[1];
            const float w = std::exp(pb[2]) * fstride;
            const float h = std::exp(pb[3]) * fstride;
            // Centre/size -> corner representation.
            pb[0] = cx - w / 2.f;
            pb[1] = cy - h / 2.f;
            pb[2] = cx + w / 2.f;
            pb[3] = cy + h / 2.f;
        }
    }
}
// Decodes raw keypoint offsets in place into absolute coordinates.
//
// Channels are read as consecutive coordinate pairs; each pair becomes
// offset * stride + prior, using channels 0 and 1 of the matching prior cell.
//
// Terminates the process with exit(1) when `kps_pred` and `priors` differ in
// rows/cols or when the channel count is odd.
void ANSCNNFD::KPSDecode(CDataBlob<float>& kps_pred, const CDataBlob<float>& priors, int stride) {
    const bool shapeMismatch = kps_pred.cols != priors.cols || kps_pred.rows != priors.rows;
    if (shapeMismatch) {
        this->_logger.LogError("ANSCNNFD::KPSDecode", "Mismatch between feature map and anchor size.", __FILE__, __LINE__);
        exit(1);
    }
    if (kps_pred.channels & 1) {
        this->_logger.LogError("ANSCNNFD::KPSDecode", "The kps dim must be even.", __FILE__, __LINE__);
        exit(1);
    }

    const float scale = (float)stride;
    const int pointCount = kps_pred.channels >> 1;
    for (int r = 0; r < kps_pred.rows; ++r) {
        for (int c = 0; c < kps_pred.cols; ++c) {
            float* point = kps_pred.ptr(r, c);
            const float* prior = priors.ptr(r, c);
            // Walk the cell two floats (one point) at a time.
            for (int n = 0; n < pointCount; ++n, point += 2) {
                point[0] = point[0] * scale + prior[0];
                point[1] = point[1] * scale + prior[1];
            }
        }
    }
}
// Concatenates three blobs along the channel axis.
//
// All inputs must be non-empty and share the same rows/cols; each output cell
// holds the channels of input 1, then input 2, then input 3.
// Terminates the process with exit(1) when an input is empty, the sizes
// differ, or the resulting shape has a dimension below 1.
template<typename T>
CDataBlob<T> ANSCNNFD::Concat3(const CDataBlob<T>& inputData1, const CDataBlob<T>& inputData2, const CDataBlob<T>& inputData3)
{
    if (inputData1.isEmpty() || inputData2.isEmpty() || inputData3.isEmpty())
    {
        this->_logger.LogError("ANSCNNFD::Concat3", "The input data is empty.", __FILE__, __LINE__);
        exit(1);
    }

    const bool sameSizeAs2 = (inputData1.cols == inputData2.cols) && (inputData1.rows == inputData2.rows);
    const bool sameSizeAs3 = (inputData1.cols == inputData3.cols) && (inputData1.rows == inputData3.rows);
    if (!(sameSizeAs2 && sameSizeAs3))
    {
        this->_logger.LogError("ANSCNNFD::Concat3", "The three inputs must have the same size.", __FILE__, __LINE__);
        exit(1);
    }

    const int outputR = inputData1.rows;
    const int outputC = inputData1.cols;
    const int outputCH = inputData1.channels + inputData2.channels + inputData3.channels;
    if (outputR < 1 || outputC < 1 || outputCH < 1)
    {
        this->_logger.LogError("ANSCNNFD::Concat3", "The size of the output is not correct.", __FILE__, __LINE__);
        exit(1);
    }

    CDataBlob<T> outputData(outputR, outputC, outputCH);
    for (int row = 0; row < outputData.rows; row++)
    {
        for (int col = 0; col < outputData.cols; col++)
        {
            // Copy the three channel vectors back to back into this cell.
            T* dst = outputData.ptr(row, col);
            memcpy(dst, inputData1.ptr(row, col), sizeof(T) * inputData1.channels);
            dst += inputData1.channels;
            memcpy(dst, inputData2.ptr(row, col), sizeof(T) * inputData2.channels);
            dst += inputData2.channels;
            memcpy(dst, inputData3.ptr(row, col), sizeof(T) * inputData3.channels);
        }
    }
    return outputData;
}
// Explicit instantiation for the only element type used by the detector.
template CDataBlob<float> ANSCNNFD::Concat3(const CDataBlob<float>& inputData1, const CDataBlob<float>& inputData2, const CDataBlob<float>& inputData3);
// Flattens a blob into a 1 x 1 x (rows * cols * channels) vector.
//
// Cells are visited row by row, and each cell's `channels` values are copied
// contiguously, so any per-cell padding of the source is dropped.
// Terminates the process with exit(1) when the input is empty.
template<typename T>
CDataBlob<T> ANSCNNFD::Blob2Vector(const CDataBlob<T>& inputData)
{
    if (inputData.isEmpty())
    {
        this->_logger.LogError("ANSCNNFD::Blob2Vector", "The input data is empty.", __FILE__, __LINE__);
        exit(1);
    }

    CDataBlob<T> outputData(1, 1, inputData.cols * inputData.rows * inputData.channels);
    // Payload bytes of one cell (all of its channels).
    int cellBytes = inputData.channels * sizeof(T);
    T* writeCursor = outputData.ptr(0, 0);
    for (int row = 0; row < inputData.rows; row++)
    {
        for (int col = 0; col < inputData.cols; col++)
        {
            memcpy(writeCursor, inputData.ptr(row, col), cellBytes);
            writeCursor += inputData.channels;
        }
    }
    return outputData;
}
// Explicit instantiation for the only element type used by the detector.
template CDataBlob<float> ANSCNNFD::Blob2Vector(const CDataBlob<float>& inputData);
// Applies the logistic sigmoid 1 / (1 + e^-x) to every channel value, in
// place. Inputs are first clamped to [-88.3762626647949, 88.3762626647949]
// so that the exponential stays within float range.
void ANSCNNFD::Sigmoid(CDataBlob<float>& inputData) {
    for (int r = 0; r < inputData.rows; ++r) {
        for (int c = 0; c < inputData.cols; ++c) {
            float* cell = inputData.ptr(r, c);
            for (int ch = 0; ch < inputData.channels; ++ch) {
                const float clamped = std::max(std::min(cell[ch], 88.3762626647949f), -88.3762626647949f);
                cell[ch] = static_cast<float>(1.f / (1.f + exp(-clamped)));
            }
        }
    }
}
std::vector<FaceRect> ANSCNNFD::DetectionOutput(const CDataBlob<float>& cls,
const CDataBlob<float>& reg,
const CDataBlob<float>& kps,
const CDataBlob<float>& obj,
float overlap_threshold,
float confidence_threshold,
int top_k,
int keep_top_k)
{
if (reg.isEmpty() || cls.isEmpty() || kps.isEmpty() || obj.isEmpty())//|| iou.isEmpty())
{
this->_logger.LogError("ANSCNNFD::DetectionOutput", "The input data is null.", __FILE__, __LINE__);
exit(1);
}
if (reg.cols != 1 || reg.rows != 1 || cls.cols != 1 || cls.rows != 1 || kps.cols != 1 || kps.rows != 1 || obj.cols != 1 || obj.rows != 1) {
this->_logger.LogError("ANSCNNFD::DetectionOutput", "Only support vector format.", __FILE__, __LINE__);
exit(1);
}
if ((int)(kps.channels / obj.channels) != 10) {
this->_logger.LogError("ANSCNNFD::DetectionOutput", "Only support 5 keypoints.", __FILE__, __LINE__);
exit(1);
}
const float* pCls = cls.ptr(0, 0);
const float* pReg = reg.ptr(0, 0);
const float* pObj = obj.ptr(0, 0);
const float* pKps = kps.ptr(0, 0);
std::vector<std::pair<float, NormalizedBBox> > score_bbox_vec;
std::vector<std::pair<float, NormalizedBBox> > final_score_bbox_vec;
//get the candidates those are > confidence_threshold
for (int i = 0; i < cls.channels; ++i)
{
float conf = std::sqrt(pCls[i] * pObj[i]);
// float conf = pCls[i] * pObj[i];
if (conf >= confidence_threshold)
{
NormalizedBBox bb;
bb.xmin = pReg[4 * i];
bb.ymin = pReg[4 * i + 1];
bb.xmax = pReg[4 * i + 2];
bb.ymax = pReg[4 * i + 3];
//store the five landmarks
memcpy(bb.lm, pKps + 10 * i, 10 * sizeof(float));
score_bbox_vec.push_back(std::make_pair(conf, bb));
}
}
//Sort the score pair according to the scores in descending order
std::stable_sort(score_bbox_vec.begin(), score_bbox_vec.end(), SortScoreBBoxPairDescend);
// Keep top_k scores if needed.
if (top_k > -1 && size_t(top_k) < score_bbox_vec.size()) {
score_bbox_vec.resize(top_k);
}
//Do NMS
final_score_bbox_vec.clear();
while (score_bbox_vec.size() != 0) {
const NormalizedBBox bb1 = score_bbox_vec.front().second;
bool keep = true;
for (size_t k = 0; k < final_score_bbox_vec.size(); k++)
{
if (keep)
{
const NormalizedBBox bb2 = final_score_bbox_vec[k].second;
float overlap = JaccardOverlap(bb1, bb2);
keep = (overlap <= overlap_threshold);
}
else
{
break;
}
}
if (keep) {
final_score_bbox_vec.push_back(score_bbox_vec.front());
}
score_bbox_vec.erase(score_bbox_vec.begin());
}
if (keep_top_k > -1 && size_t(keep_top_k) < final_score_bbox_vec.size()) {
final_score_bbox_vec.resize(keep_top_k);
}
//copy the results to the output blob
int num_faces = (int)final_score_bbox_vec.size();
std::vector<FaceRect> facesInfo;
for (int fi = 0; fi < num_faces; fi++)
{
std::pair<float, NormalizedBBox> pp = final_score_bbox_vec[fi];
FaceRect r;
r.score = pp.first;
r.x = int(pp.second.xmin);
r.y = int(pp.second.ymin);
r.w = int(pp.second.xmax - pp.second.xmin);
r.h = int(pp.second.ymax - pp.second.ymin);
//copy landmark data
for (int i = 0; i < 10; ++i) {
r.lm[i] = int(pp.second.lm[i]);
}
facesInfo.emplace_back(r);
}
return facesInfo;
}
}