2026-03-28 16:54:11 +11:00
# include "CNNFaceDetector.h"
# include <stdexcept>
# include "Utility.h"
// Allocates at least `size` usable bytes and returns a pointer whose address
// is a multiple of _MALLOC_ALIGN. The address returned by malloc() is stored
// in the sizeof(char*) bytes directly in front of the returned pointer so
// that FreeMemory_() can recover and release it.
// Returns 0 when malloc() fails.
void* AllocMemory(size_t size)
{
    // Head-room: alignment slack (doubled for requests of 4096 bytes or more)
    // plus one pointer-sized slot for the bookkeeping entry.
    const size_t slack = _MALLOC_ALIGN * ((size >= 4096) + 1L);
    const size_t total = static_cast<size_t>(size + slack + sizeof(char*));

    char* const raw = static_cast<char*>(malloc(total));
    if (raw == nullptr)
        return 0;

    // Step past the bookkeeping slot, then round up to the alignment boundary.
    const size_t bumped = reinterpret_cast<size_t>(raw + sizeof(char*) + 1) + _MALLOC_ALIGN - 1;
    const size_t alignMask = ~static_cast<size_t>(_MALLOC_ALIGN - 1);
    char* const aligned = reinterpret_cast<char*>(bumped & alignMask);

    // Remember the real allocation start just below the aligned address.
    *reinterpret_cast<char**>(aligned - sizeof(char*)) = raw;
    return aligned;
}
// Releases a block obtained from AllocMemory().
// A null pointer, or a pointer whose address is not a multiple of
// _MALLOC_ALIGN, is ignored. Otherwise the pointer stored in the slot directly
// in front of `ptr` is handed to free().
void FreeMemory_(void* ptr)
{
    try {
        if (ptr == nullptr)
            return;

        const bool misaligned = (reinterpret_cast<size_t>(ptr) & (_MALLOC_ALIGN - 1)) != 0;
        if (misaligned)
            return;

        // The slot just below `ptr` holds the address to release.
        char** const slot = static_cast<char**>(ptr) - 1;
        free(*slot);
    }
    catch (const std::exception& ex) {
        std::cout << " ANSCENTER::FreeMemory: " << ex.what();
    }
}
namespace ANSCENTER {
bool SortScoreBBoxPairDescend ( const std : : pair < float , NormalizedBBox > & pair1 , const std : : pair < float , NormalizedBBox > & pair2 )
{
return pair1 . first > pair2 . first ;
}
bool ANSCNNFD : : OptimizeModel ( bool fp16 , std : : string & optimizedModelFolder ) {
return true ;
}
bool ANSCNNFD : : LoadModel ( const std : : string & modelZipFilePath , const std : : string & modelZipPassword ) {
try {
return true ;
}
catch ( std : : exception & e ) {
this - > _logger . LogFatal ( " ANSCNNFD::LoadModel " , e . what ( ) , __FILE__ , __LINE__ ) ;
return false ;
}
}
bool ANSCNNFD : : Initialize ( std : : string licenseKey , ModelConfig modelConfig , const std : : string & modelZipFilePath , const std : : string & modelZipPassword , std : : string & labelMap ) {
bool result = true ;
_licenseValid = true ;
if ( ! result ) return false ;
try {
_modelConfig = modelConfig ;
_modelConfig . modelType = ModelType : : FACEDETECT ;
_modelConfig . detectionType = DetectionType : : FACEDETECTOR ;
InitParameters ( ) ;
labelMap = " Face " ;
_isInitialized = true ;
return true ;
}
catch ( std : : exception & e ) {
this - > _logger . LogFatal ( " ANSCNNFD::Initialize " , e . what ( ) , __FILE__ , __LINE__ ) ;
return false ;
}
}
std : : vector < Object > ANSCNNFD : : RunInference ( const cv : : Mat & input ) {
std : : vector < Object > output ;
output . clear ( ) ;
if ( ! _licenseValid ) {
2026-04-13 20:38:40 +10:00
if ( _modelLoading . load ( ) ) return { } ;
2026-03-28 16:54:11 +11:00
this - > _logger . LogError ( " ANSCNNFD::RunInference " , " Invalid license " , __FILE__ , __LINE__ ) ;
return output ;
}
if ( ! _isInitialized ) {
this - > _logger . LogError ( " ANSCNNFD::RunInference " , " Invalid model " , __FILE__ , __LINE__ ) ;
return output ;
}
try {
bool croppedFace = false ; // Check if the image is cropped face image
int * pResults = nullptr ;
unsigned char * pBuffer = static_cast < unsigned char * > ( malloc ( DETECT_BUFFER_SIZE ) ) ;
cv : : Mat frame = input . clone ( ) ;
// We know that the image sizes <=300 px, it is likely that image is cropped for face only
if ( ( input . size [ 0 ] < = 300 ) | | ( input . size [ 1 ] < = 300 ) ) croppedFace = true ;
if ( croppedFace ) cv : : copyMakeBorder ( input , frame , 200 , 200 , 200 , 200 , cv : : BORDER_REPLICATE ) ;
pResults = FaceDetectCNN ( pBuffer , static_cast < unsigned char * > ( frame . ptr ( 0 ) ) , frame . cols , frame . rows , static_cast < int > ( frame . step ) ) ;
for ( int i = 0 ; i < ( pResults ? * pResults : 0 ) ; i + + )
{
Object result ;
short * p = reinterpret_cast < short * > ( pResults + 1 ) + 16 * i ;
float confidence = static_cast < float > ( p [ 0 ] ) / 100 ;
if ( confidence > = _modelConfig . detectionScoreThreshold ) {
int x = p [ 1 ] ;
int y = p [ 2 ] ;
int w = p [ 3 ] ;
int h = p [ 4 ] ;
int x1 , y1 , x2 , y2 ;
result . classId = 0 ;
result . className = " Face " ;
result . confidence = confidence ;
result . box . x = x ;
result . box . y = y ;
if ( croppedFace ) {
if ( x < = 200 ) x = 200 ;
if ( y < = 200 ) y = 200 ;
result . box . x = x - 200 ;
result . box . y = y - 200 ;
}
result . box . width = w ;
result . box . height = h ;
x1 = x ;
y1 = y ;
x2 = x + w ;
y2 = y + h ;
cv : : Rect facePos ( cv : : Point ( x1 , y1 ) , cv : : Point ( x2 , y2 ) ) ;
cv : : Mat currFace = frame ( facePos ) ;
result . mask = currFace . clone ( ) ;
output . push_back ( result ) ;
}
}
free ( pBuffer ) ;
frame . release ( ) ;
return output ;
}
catch ( std : : exception & e ) {
this - > _logger . LogFatal ( " ANSCNNFD::RunInference " , e . what ( ) , __FILE__ , __LINE__ ) ;
return output ;
}
}
// Destructor: only writes an informational log entry. A std::exception raised
// while logging is reported on std::cout rather than escaping the destructor.
ANSCNNFD::~ANSCNNFD() {
    try {
        this->_logger.LogInfo(" ANSCNNFD::~ANSCNNFD() ", " Release ANSCNNFD ", __FILE__, __LINE__);
    }
    catch (const std::exception& ex) {
        std::cout << " ANSCNNFD::~ANSCNNFD() " << ex.what() << std::endl;
    }
}
bool ANSCNNFD : : Destroy ( ) {
try {
this - > _logger . LogInfo ( " ANSCNNFD::Destroy() " , " Release ANSCNNFD " , __FILE__ , __LINE__ ) ;
return true ;
}
catch ( std : : exception & e ) {
std : : cout < < " ANSCNNFD::Destroy() " < < e . what ( ) < < std : : endl ;
return false ;
}
}
// Private
int * ANSCNNFD : : FaceDetectCNN ( unsigned char * result_buffer , unsigned char * rgb_image_data , int width , int height , int step ) //input image, it must be BGR (three-channel) image!
{
try {
if ( ! result_buffer )
{
this - > _logger . LogError ( " ANSCNNFD::FaceDetectCNN " , " Null buffer memory " , __FILE__ , __LINE__ ) ;
return nullptr ;
}
//clear memory
result_buffer [ 0 ] = 0 ;
result_buffer [ 1 ] = 0 ;
result_buffer [ 2 ] = 0 ;
result_buffer [ 3 ] = 0 ;
std : : vector < FaceRect > faces = ObjectDetectCNN ( rgb_image_data , width , height , step ) ;
int num_faces = static_cast < int > ( faces . size ( ) ) ;
num_faces = MIN ( num_faces , 1024 ) ; //1024 = 0x9000 / (16 * 2 + 4)
int * pCount = reinterpret_cast < int * > ( result_buffer ) ;
pCount [ 0 ] = num_faces ;
for ( int i = 0 ; i < num_faces ; i + + )
{
//copy data
short * p = reinterpret_cast < short * > ( result_buffer + 4 ) + 16 * size_t ( i ) ;
p [ 0 ] = static_cast < short > ( faces [ i ] . score * 100 ) ;
p [ 1 ] = static_cast < short > ( faces [ i ] . x ) ;
p [ 2 ] = static_cast < short > ( faces [ i ] . y ) ;
p [ 3 ] = static_cast < short > ( faces [ i ] . w ) ;
p [ 4 ] = static_cast < short > ( faces [ i ] . h ) ;
//copy landmarks
for ( int lmidx = 0 ; lmidx < 10 ; lmidx + + )
{
p [ 5 + lmidx ] = static_cast < short > ( faces [ i ] . lm [ lmidx ] ) ;
}
}
return pCount ;
}
catch ( std : : exception & e ) {
this - > _logger . LogFatal ( " ANSCNNFD::FaceDetectCNN " , e . what ( ) , __FILE__ , __LINE__ ) ;
return nullptr ;
}
}
void ANSCNNFD : : InitParameters ( ) {
for ( int i = 0 ; i < NUM_CONV_LAYER ; i + + )
g_pFilters [ i ] = param_pConvInfo [ i ] ;
_paramInitialized = true ;
}
std : : vector < FaceRect > ANSCNNFD : : ObjectDetectCNN ( const unsigned char * rgbImageData , int width , int height , int step ) {
try {
TIME_START ;
if ( ! _paramInitialized )
{
InitParameters ( ) ;
}
TIME_END ( " init " ) ;
TIME_START ;
auto fx = SetDataFrom3x3S2P1To1x1S1P0FromImage ( rgbImageData , width , height , 3 , step ) ;
TIME_END ( " convert data " ) ;
/***************CONV0*********************/
TIME_START ;
fx = Convolution ( fx , g_pFilters [ 0 ] ) ;
TIME_END ( " conv_head " ) ;
TIME_START ;
fx = ConvolutionDP ( fx , g_pFilters [ 1 ] , g_pFilters [ 2 ] ) ;
TIME_END ( " conv0 " ) ;
TIME_START ;
fx = MaxPooling2x2S2 ( fx ) ;
TIME_END ( " pool0 " ) ;
/***************CONV1*********************/
TIME_START ;
fx = Convolution4LayerUnit ( fx , g_pFilters [ 3 ] , g_pFilters [ 4 ] , g_pFilters [ 5 ] , g_pFilters [ 6 ] ) ;
TIME_END ( " conv1 " ) ;
/***************CONV2*********************/
TIME_START ;
fx = Convolution4LayerUnit ( fx , g_pFilters [ 7 ] , g_pFilters [ 8 ] , g_pFilters [ 9 ] , g_pFilters [ 10 ] ) ;
TIME_END ( " conv2 " ) ;
/***************CONV3*********************/
TIME_START ;
fx = MaxPooling2x2S2 ( fx ) ;
TIME_END ( " pool3 " ) ;
TIME_START ;
auto fb1 = Convolution4LayerUnit ( fx , g_pFilters [ 11 ] , g_pFilters [ 12 ] , g_pFilters [ 13 ] , g_pFilters [ 14 ] ) ;
TIME_END ( " conv3 " ) ;
/***************CONV4*********************/
TIME_START ;
fx = MaxPooling2x2S2 ( fb1 ) ;
TIME_END ( " pool4 " ) ;
TIME_START ;
auto fb2 = Convolution4LayerUnit ( fx , g_pFilters [ 15 ] , g_pFilters [ 16 ] , g_pFilters [ 17 ] , g_pFilters [ 18 ] ) ;
TIME_END ( " conv4 " ) ;
/***************CONV5*********************/
TIME_START ;
fx = MaxPooling2x2S2 ( fb2 ) ;
TIME_END ( " pool5 " ) ;
TIME_START ;
auto fb3 = Convolution4LayerUnit ( fx , g_pFilters [ 19 ] , g_pFilters [ 20 ] , g_pFilters [ 21 ] , g_pFilters [ 22 ] ) ;
TIME_END ( " conv5 " ) ;
CDataBlob < float > pred_reg [ 3 ] , pred_cls [ 3 ] , pred_kps [ 3 ] , pred_obj [ 3 ] ;
/***************branch5*********************/
TIME_START ;
fb3 = ConvolutionDP ( fb3 , g_pFilters [ 27 ] , g_pFilters [ 28 ] ) ;
pred_cls [ 2 ] = ConvolutionDP ( fb3 , g_pFilters [ 33 ] , g_pFilters [ 34 ] , false ) ;
pred_reg [ 2 ] = ConvolutionDP ( fb3 , g_pFilters [ 39 ] , g_pFilters [ 40 ] , false ) ;
pred_kps [ 2 ] = ConvolutionDP ( fb3 , g_pFilters [ 51 ] , g_pFilters [ 52 ] , false ) ;
pred_obj [ 2 ] = ConvolutionDP ( fb3 , g_pFilters [ 45 ] , g_pFilters [ 46 ] , false ) ;
TIME_END ( " branch5 " ) ;
/*****************add5*********************/
TIME_START ;
fb2 = ElementAdd ( UpsampleX2 ( fb3 ) , fb2 ) ;
TIME_END ( " add5 " ) ;
/*****************add6*********************/
TIME_START ;
fb2 = ConvolutionDP ( fb2 , g_pFilters [ 25 ] , g_pFilters [ 26 ] ) ;
pred_cls [ 1 ] = ConvolutionDP ( fb2 , g_pFilters [ 31 ] , g_pFilters [ 32 ] , false ) ;
pred_reg [ 1 ] = ConvolutionDP ( fb2 , g_pFilters [ 37 ] , g_pFilters [ 38 ] , false ) ;
pred_kps [ 1 ] = ConvolutionDP ( fb2 , g_pFilters [ 49 ] , g_pFilters [ 50 ] , false ) ;
pred_obj [ 1 ] = ConvolutionDP ( fb2 , g_pFilters [ 43 ] , g_pFilters [ 44 ] , false ) ;
TIME_END ( " branch4 " ) ;
/*****************add4*********************/
TIME_START ;
fb1 = ElementAdd ( UpsampleX2 ( fb2 ) , fb1 ) ;
TIME_END ( " add4 " ) ;
/***************branch3*********************/
TIME_START ;
fb1 = ConvolutionDP ( fb1 , g_pFilters [ 23 ] , g_pFilters [ 24 ] ) ;
pred_cls [ 0 ] = ConvolutionDP ( fb1 , g_pFilters [ 29 ] , g_pFilters [ 30 ] , false ) ;
pred_reg [ 0 ] = ConvolutionDP ( fb1 , g_pFilters [ 35 ] , g_pFilters [ 36 ] , false ) ;
pred_kps [ 0 ] = ConvolutionDP ( fb1 , g_pFilters [ 47 ] , g_pFilters [ 48 ] , false ) ;
pred_obj [ 0 ] = ConvolutionDP ( fb1 , g_pFilters [ 41 ] , g_pFilters [ 42 ] , false ) ;
TIME_END ( " branch3 " ) ;
/***************PRIORBOX*********************/
TIME_START ;
auto prior3 = MeshGrid ( fb1 . cols , fb1 . rows , 8 ) ;
auto prior4 = MeshGrid ( fb2 . cols , fb2 . rows , 16 ) ;
auto prior5 = MeshGrid ( fb3 . cols , fb3 . rows , 32 ) ;
TIME_END ( " prior " ) ;
/***************PRIORBOX*********************/
TIME_START ;
BboxDecode ( pred_reg [ 0 ] , prior3 , 8 ) ;
BboxDecode ( pred_reg [ 1 ] , prior4 , 16 ) ;
BboxDecode ( pred_reg [ 2 ] , prior5 , 32 ) ;
KPSDecode ( pred_kps [ 0 ] , prior3 , 8 ) ;
KPSDecode ( pred_kps [ 1 ] , prior4 , 16 ) ;
KPSDecode ( pred_kps [ 2 ] , prior5 , 32 ) ;
auto cls = Concat3 ( Blob2Vector ( pred_cls [ 0 ] ) , Blob2Vector ( pred_cls [ 1 ] ) , Blob2Vector ( pred_cls [ 2 ] ) ) ;
auto reg = Concat3 ( Blob2Vector ( pred_reg [ 0 ] ) , Blob2Vector ( pred_reg [ 1 ] ) , Blob2Vector ( pred_reg [ 2 ] ) ) ;
auto kps = Concat3 ( Blob2Vector ( pred_kps [ 0 ] ) , Blob2Vector ( pred_kps [ 1 ] ) , Blob2Vector ( pred_kps [ 2 ] ) ) ;
auto obj = Concat3 ( Blob2Vector ( pred_obj [ 0 ] ) , Blob2Vector ( pred_obj [ 1 ] ) , Blob2Vector ( pred_obj [ 2 ] ) ) ;
Sigmoid ( cls ) ;
Sigmoid ( obj ) ;
TIME_END ( " decode " )
TIME_START ;
std : : vector < FaceRect > facesInfo = DetectionOutput ( cls , reg , kps , obj , 0.45f , 0.2f , 1000 , 512 ) ;
TIME_END ( " detection output " )
return facesInfo ;
}
catch ( std : : exception & e ) {
std : : vector < FaceRect > facesInfo ;
facesInfo . clear ( ) ;
this - > _logger . LogFatal ( " ANSCNNFD::ObjectDetectCNN " , e . what ( ) , __FILE__ , __LINE__ ) ;
return facesInfo ;
}
}
// Converts an interleaved 3-channel 8-bit image into a 32-channel float blob
// in which every output pixel holds the unrolled 3x3 neighbourhood
// (stride 2, padding 1) of the source image, so the first 3x3/stride-2
// convolution can be evaluated as a 1x1 convolution.
//
//   inputData     first byte of the image (must not be null)
//   imgWidth/Height  image size in pixels
//   imgChannels   must be 3
//   imgWidthStep  bytes between the starts of consecutive rows
//   padDivisor    must be 32; the image size is rounded up to a multiple of
//                 it before halving to obtain the blob size
//
// For output pixel (r, c) and tap (fy, fx) in [-1, 1] the three source bytes
// at (2r+fy, 2c+fx) are stored at channel ((fy+1)*3 + (fx+1)) * 3 + {0,1,2}.
// Taps that fall outside the image are skipped.
// NOTE(review): skipped taps and channels 27..31 are never written here;
// presumably CDataBlob's constructor leaves them zeroed - confirm.
//
// Throws std::invalid_argument (after logging) on invalid arguments instead
// of terminating the whole process; ObjectDetectCNN() catches and logs it.
CDataBlob<float> ANSCNNFD::SetDataFrom3x3S2P1To1x1S1P0FromImage(const unsigned char* inputData, int imgWidth, int imgHeight, int imgChannels, int imgWidthStep, int padDivisor) {
    if (inputData == nullptr) {
        this->_logger.LogError(" ANSCNNFD::SetDataFrom3x3S2P1To1x1S1P0FromImage ", " The input image data is null ", __FILE__, __LINE__);
        throw std::invalid_argument("SetDataFrom3x3S2P1To1x1S1P0FromImage: input image data is null");
    }
    if (imgChannels != 3) {
        this->_logger.LogError(" ANSCNNFD::SetDataFrom3x3S2P1To1x1S1P0FromImage ", " The input image must be a 3-channel RGB image ", __FILE__, __LINE__);
        throw std::invalid_argument("SetDataFrom3x3S2P1To1x1S1P0FromImage: input image must have 3 channels");
    }
    if (padDivisor != 32) {
        this->_logger.LogError(" ANSCNNFD::SetDataFrom3x3S2P1To1x1S1P0FromImage ", " This version need pad of 32 ", __FILE__, __LINE__);
        throw std::invalid_argument("SetDataFrom3x3S2P1To1x1S1P0FromImage: padDivisor must be 32");
    }

    // Round each dimension up to a multiple of padDivisor, then halve (stride 2).
    const int rows = ((imgHeight - 1) / padDivisor + 1) * padDivisor / 2;
    const int cols = ((imgWidth - 1) / padDivisor + 1) * padDivisor / 2;
    const int channels = 32;
    CDataBlob<float> outBlob(rows, cols, channels);

#if defined(_OPENMP)
#pragma omp parallel for
#endif
    for (int r = 0; r < rows; r++) {
        for (int c = 0; c < cols; c++) {
            float* pData = outBlob.ptr(r, c);
            for (int fy = -1; fy <= 1; fy++) {
                const int srcy = r * 2 + fy;
                if (srcy < 0 || srcy >= imgHeight) //out of the range of the image
                    continue;
                for (int fx = -1; fx <= 1; fx++) {
                    const int srcx = c * 2 + fx;
                    if (srcx < 0 || srcx >= imgWidth) //out of the range of the image
                        continue;
                    const unsigned char* pImgData = inputData + size_t(imgWidthStep) * srcy + imgChannels * srcx;
                    const int tapBase = ((fy + 1) * 3 + fx + 1) * imgChannels; //3x3 filters, 3-channel image
                    pData[tapBase] = pImgData[0];
                    pData[tapBase + 1] = pImgData[1];
                    pData[tapBase + 2] = pImgData[2];
                }
            }
        }
    }
    return outBlob;
}
// Returns the dot product of the first `num` floats of p1 and p2.
// The SIMD variants use aligned loads (AVX-512 / AVX2) and step through the
// data 16, 8 or 4 floats at a time, so for those builds p1 and p2 must be
// suitably aligned (512-bit for AVX-512) and `num` a multiple of the vector
// width. The plain fallback has no such requirement.
inline float dotProduct(const float* p1, const float* p2, int num)
{
    float sum = 0.f;
#if defined(_ENABLE_AVX512)
    __m512 acc16 = _mm512_setzero_ps();
    for (int k = 0; k < num; k += 16)
    {
        const __m512 lhs = _mm512_load_ps(p1 + k);
        const __m512 rhs = _mm512_load_ps(p2 + k);
        acc16 = _mm512_add_ps(acc16, _mm512_mul_ps(lhs, rhs));
    }
    sum = _mm512_reduce_add_ps(acc16);
#elif defined(_ENABLE_AVX2)
    __m256 acc8 = _mm256_setzero_ps();
    for (int k = 0; k < num; k += 8)
    {
        const __m256 lhs = _mm256_load_ps(p1 + k);
        const __m256 rhs = _mm256_load_ps(p2 + k);
        acc8 = _mm256_add_ps(acc8, _mm256_mul_ps(lhs, rhs));
    }
    // Two horizontal adds leave each 128-bit half's total in its first lane.
    acc8 = _mm256_hadd_ps(acc8, acc8);
    acc8 = _mm256_hadd_ps(acc8, acc8);
    const float* lanes = (const float*)&acc8;
    sum = lanes[0] + lanes[4];
#elif defined(_ENABLE_NEON)
    float32x4_t acc4 = vdupq_n_f32(0);
    for (int k = 0; k < num; k += 4)
    {
        const float32x4_t lhs = vld1q_f32(p1 + k);
        const float32x4_t rhs = vld1q_f32(p2 + k);
        acc4 = vaddq_f32(acc4, vmulq_f32(lhs, rhs));
    }
    sum += vgetq_lane_f32(acc4, 0);
    sum += vgetq_lane_f32(acc4, 1);
    sum += vgetq_lane_f32(acc4, 2);
    sum += vgetq_lane_f32(acc4, 3);
#else
    for (int k = 0; k < num; ++k)
    {
        const float product = p1[k] * p2[k];
        sum += product;
    }
#endif
    return sum;
}
// Element-wise multiply-accumulate: p3[i] += p1[i] * p2[i] for i in [0, num).
// The SIMD variants process 16, 8 or 4 floats per step (aligned loads/stores
// for AVX-512 and AVX2). Always returns true.
inline bool vecMulAdd(const float* p1, const float* p2, float* p3, int num)
{
#if defined(_ENABLE_AVX512)
    for (int k = 0; k < num; k += 16)
    {
        const __m512 lhs = _mm512_load_ps(p1 + k);
        const __m512 rhs = _mm512_load_ps(p2 + k);
        __m512 acc = _mm512_load_ps(p3 + k);
        acc = _mm512_add_ps(acc, _mm512_mul_ps(lhs, rhs));
        _mm512_store_ps(p3 + k, acc);
    }
#elif defined(_ENABLE_AVX2)
    for (int k = 0; k < num; k += 8)
    {
        const __m256 lhs = _mm256_load_ps(p1 + k);
        const __m256 rhs = _mm256_load_ps(p2 + k);
        __m256 acc = _mm256_load_ps(p3 + k);
        acc = _mm256_add_ps(acc, _mm256_mul_ps(lhs, rhs));
        _mm256_store_ps(p3 + k, acc);
    }
#elif defined(_ENABLE_NEON)
    for (int k = 0; k < num; k += 4)
    {
        const float32x4_t lhs = vld1q_f32(p1 + k);
        const float32x4_t rhs = vld1q_f32(p2 + k);
        float32x4_t acc = vld1q_f32(p3 + k);
        acc = vaddq_f32(acc, vmulq_f32(lhs, rhs));
        vst1q_f32(p3 + k, acc);
    }
#else
    for (int k = 0; k < num; ++k)
    {
        p3[k] += (p1[k] * p2[k]);
    }
#endif
    return true;
}
// In-place element-wise addition: p2[i] += p1[i] for i in [0, num).
// The SIMD variants process 16, 8 or 4 floats per step (aligned loads/stores
// for AVX-512 and AVX2). Always returns true.
inline bool vecAdd(const float* p1, float* p2, int num)
{
#if defined(_ENABLE_AVX512)
    for (int k = 0; k < num; k += 16)
    {
        const __m512 addend = _mm512_load_ps(p1 + k);
        const __m512 current = _mm512_load_ps(p2 + k);
        _mm512_store_ps(p2 + k, _mm512_add_ps(addend, current));
    }
#elif defined(_ENABLE_AVX2)
    for (int k = 0; k < num; k += 8)
    {
        const __m256 addend = _mm256_load_ps(p1 + k);
        const __m256 current = _mm256_load_ps(p2 + k);
        _mm256_store_ps(p2 + k, _mm256_add_ps(addend, current));
    }
#elif defined(_ENABLE_NEON)
    for (int k = 0; k < num; k += 4)
    {
        const float32x4_t addend = vld1q_f32(p1 + k);
        const float32x4_t current = vld1q_f32(p2 + k);
        vst1q_f32(p2 + k, vaddq_f32(addend, current));
    }
#else
    for (int k = 0; k < num; ++k)
    {
        p2[k] += p1[k];
    }
#endif
    return true;
}
// Element-wise addition into a separate destination:
// p3[i] = p1[i] + p2[i] for i in [0, num).
// The SIMD variants process 16, 8 or 4 floats per step (aligned loads/stores
// for AVX-512 and AVX2). Always returns true.
inline bool vecAdd(const float* p1, const float* p2, float* p3, int num)
{
#if defined(_ENABLE_AVX512)
    for (int k = 0; k < num; k += 16)
    {
        const __m512 lhs = _mm512_load_ps(p1 + k);
        const __m512 rhs = _mm512_load_ps(p2 + k);
        _mm512_store_ps(p3 + k, _mm512_add_ps(lhs, rhs));
    }
#elif defined(_ENABLE_AVX2)
    for (int k = 0; k < num; k += 8)
    {
        const __m256 lhs = _mm256_load_ps(p1 + k);
        const __m256 rhs = _mm256_load_ps(p2 + k);
        _mm256_store_ps(p3 + k, _mm256_add_ps(lhs, rhs));
    }
#elif defined(_ENABLE_NEON)
    for (int k = 0; k < num; k += 4)
    {
        const float32x4_t lhs = vld1q_f32(p1 + k);
        const float32x4_t rhs = vld1q_f32(p2 + k);
        vst1q_f32(p3 + k, vaddq_f32(lhs, rhs));
    }
#else
    for (int k = 0; k < num; ++k)
    {
        p3[k] = p1[k] + p2[k];
    }
#endif
    return true;
}
bool ANSCNNFD : : Convolution1x1PointWise ( const CDataBlob < float > & inputData , const Filters < float > & filters , CDataBlob < float > & outputData )
{
# if defined(_OPENMP)
# pragma omp parallel for
# endif
for ( int row = 0 ; row < outputData . rows ; row + + )
{
for ( int col = 0 ; col < outputData . cols ; col + + )
{
float * pOut = outputData . ptr ( row , col ) ;
const float * pIn = inputData . ptr ( row , col ) ;
for ( int ch = 0 ; ch < outputData . channels ; ch + + )
{
const float * pF = filters . weights . ptr ( 0 , ch ) ;
pOut [ ch ] = dotProduct ( pIn , pF , inputData . channels ) ;
pOut [ ch ] + = filters . biases . data [ ch ] ;
}
}
}
return true ;
}
bool ANSCNNFD : : Convolution3x3DepthWise ( const CDataBlob < float > & inputData , const Filters < float > & filters , CDataBlob < float > & outputData )
{
//set all elements in outputData to zeros
outputData . setZero ( ) ;
# if defined(_OPENMP)
# pragma omp parallel for
# endif
for ( int row = 0 ; row < outputData . rows ; row + + )
{
int srcy_start = row - 1 ;
int srcy_end = srcy_start + 3 ;
srcy_start = MAX ( 0 , srcy_start ) ;
srcy_end = MIN ( srcy_end , inputData . rows ) ;
for ( int col = 0 ; col < outputData . cols ; col + + )
{
float * pOut = outputData . ptr ( row , col ) ;
int srcx_start = col - 1 ;
int srcx_end = srcx_start + 3 ;
srcx_start = MAX ( 0 , srcx_start ) ;
srcx_end = MIN ( srcx_end , inputData . cols ) ;
for ( int r = srcy_start ; r < srcy_end ; r + + )
for ( int c = srcx_start ; c < srcx_end ; c + + )
{
int filter_r = r - row + 1 ;
int filter_c = c - col + 1 ;
int filter_idx = filter_r * 3 + filter_c ;
vecMulAdd ( inputData . ptr ( r , c ) , filters . weights . ptr ( 0 , filter_idx ) , pOut , filters . num_filters ) ;
}
vecAdd ( filters . biases . ptr ( 0 , 0 ) , pOut , filters . num_filters ) ;
}
}
return true ;
}
bool ANSCNNFD : : Relu ( CDataBlob < float > & inputoutputData )
{
if ( inputoutputData . isEmpty ( ) )
{
this - > _logger . LogError ( " ANSCNNFD::Relu " , " The input data is empty " , __FILE__ , __LINE__ ) ;
return false ;
}
int len = inputoutputData . cols * inputoutputData . rows * inputoutputData . channelStep / sizeof ( float ) ;
# if defined(_ENABLE_AVX512)
__m512 a , bzeros ;
bzeros = _mm512_setzero_ps ( ) ; //zeros
for ( int i = 0 ; i < len ; i + = 16 )
{
a = _mm512_load_ps ( inputoutputData . data + i ) ;
a = _mm512_max_ps ( a , bzeros ) ;
_mm512_store_ps ( inputoutputData . data + i , a ) ;
}
# elif defined(_ENABLE_AVX2)
__m256 a , bzeros ;
bzeros = _mm256_setzero_ps ( ) ; //zeros
for ( int i = 0 ; i < len ; i + = 8 )
{
a = _mm256_load_ps ( inputoutputData . data + i ) ;
a = _mm256_max_ps ( a , bzeros ) ;
_mm256_store_ps ( inputoutputData . data + i , a ) ;
}
# else
for ( int i = 0 ; i < len ; i + + )
inputoutputData . data [ i ] * = ( inputoutputData . data [ i ] > 0 ) ;
# endif
return true ;
}
void ANSCNNFD : : IntersectBBox ( const NormalizedBBox & bbox1 , const NormalizedBBox & bbox2 ,
NormalizedBBox * intersect_bbox )
{
if ( bbox2 . xmin > bbox1 . xmax | | bbox2 . xmax < bbox1 . xmin | |
bbox2 . ymin > bbox1 . ymax | | bbox2 . ymax < bbox1 . ymin )
{
// Return [0, 0, 0, 0] if there is no intersection.
intersect_bbox - > xmin = 0 ;
intersect_bbox - > ymin = 0 ;
intersect_bbox - > xmax = 0 ;
intersect_bbox - > ymax = 0 ;
}
else
{
intersect_bbox - > xmin = ( std : : max ( bbox1 . xmin , bbox2 . xmin ) ) ;
intersect_bbox - > ymin = ( std : : max ( bbox1 . ymin , bbox2 . ymin ) ) ;
intersect_bbox - > xmax = ( std : : min ( bbox1 . xmax , bbox2 . xmax ) ) ;
intersect_bbox - > ymax = ( std : : min ( bbox1 . ymax , bbox2 . ymax ) ) ;
}
}
float ANSCNNFD : : JaccardOverlap ( const NormalizedBBox & bbox1 , const NormalizedBBox & bbox2 )
{
NormalizedBBox intersect_bbox ;
IntersectBBox ( bbox1 , bbox2 , & intersect_bbox ) ;
float intersect_width , intersect_height ;
intersect_width = intersect_bbox . xmax - intersect_bbox . xmin ;
intersect_height = intersect_bbox . ymax - intersect_bbox . ymin ;
if ( intersect_width > 0 & & intersect_height > 0 )
{
float intersect_size = intersect_width * intersect_height ;
float bsize1 = ( bbox1 . xmax - bbox1 . xmin ) * ( bbox1 . ymax - bbox1 . ymin ) ;
float bsize2 = ( bbox2 . xmax - bbox2 . xmin ) * ( bbox2 . ymax - bbox2 . ymin ) ;
return intersect_size / ( bsize1 + bsize2 - intersect_size ) ;
}
else
{
return 0.f ;
}
}
// Nearest-neighbour x2 upsampling: every input pixel is replicated into the
// corresponding 2x2 block of the output, channel by channel.
// Returns a blob of size (rows*2, cols*2, channels).
// Throws std::invalid_argument (after logging) when the input is empty,
// instead of terminating the whole process.
CDataBlob<float> ANSCNNFD::UpsampleX2(const CDataBlob<float>& inputData) {
    if (inputData.isEmpty()) {
        this->_logger.LogError(" ANSCNNFD::UpsampleX2 ", " The input data is empty ", __FILE__, __LINE__);
        throw std::invalid_argument("UpsampleX2: the input data is empty");
    }
    CDataBlob<float> outData(inputData.rows * 2, inputData.cols * 2, inputData.channels);
    for (int r = 0; r < inputData.rows; r++) {
        const int outr = r * 2;
        for (int c = 0; c < inputData.cols; c++) {
            const int outc = c * 2;
            const float* pIn = inputData.ptr(r, c);
            // Resolve the four destination pixels once per input pixel
            // rather than once per channel.
            float* pTopLeft = outData.ptr(outr, outc);
            float* pTopRight = outData.ptr(outr, outc + 1);
            float* pBottomLeft = outData.ptr(outr + 1, outc);
            float* pBottomRight = outData.ptr(outr + 1, outc + 1);
            for (int ch = 0; ch < inputData.channels; ++ch) {
                const float value = pIn[ch];
                pTopLeft[ch] = value;
                pTopRight[ch] = value;
                pBottomLeft[ch] = value;
                pBottomRight[ch] = value;
            }
        }
    }
    return outData;
}
// Element-wise sum of two blobs of identical shape (rows, cols, channels).
// Returns a new blob holding inputData1 + inputData2.
// Throws std::invalid_argument (after logging) when the shapes differ,
// instead of terminating the whole process.
CDataBlob<float> ANSCNNFD::ElementAdd(const CDataBlob<float>& inputData1, const CDataBlob<float>& inputData2) {
    const bool sameShape =
        inputData1.rows == inputData2.rows &&
        inputData1.cols == inputData2.cols &&
        inputData1.channels == inputData2.channels;
    if (!sameShape) {
        this->_logger.LogError(" ANSCNNFD::ElementAdd ", " The two input datas must be in the same shape. ", __FILE__, __LINE__);
        throw std::invalid_argument("ElementAdd: the two inputs must have the same shape");
    }
    CDataBlob<float> outData(inputData1.rows, inputData1.cols, inputData1.channels);
    for (int r = 0; r < inputData1.rows; r++) {
        for (int c = 0; c < inputData1.cols; c++) {
            const float* pIn1 = inputData1.ptr(r, c);
            const float* pIn2 = inputData2.ptr(r, c);
            float* pOut = outData.ptr(r, c);
            vecAdd(pIn1, pIn2, pOut, inputData1.channels);
        }
    }
    return outData;
}
// Applies one convolution layer and, when do_relu is set, a ReLU.
// Dispatches on the filter flags:
//   point-wise only -> Convolution1x1PointWise
//   depth-wise only -> Convolution3x3DepthWise
// The output has the input's rows/cols and filters.num_filters channels.
// Throws std::invalid_argument (after logging) when the input or filter data
// is empty, the channel counts do not match, or the filter type is
// unsupported, instead of terminating the whole process.
CDataBlob<float> ANSCNNFD::Convolution(const CDataBlob<float>& inputData, const Filters<float>& filters, bool do_relu)
{
    if (inputData.isEmpty() || filters.weights.isEmpty() || filters.biases.isEmpty())
    {
        this->_logger.LogError(" ANSCNNFD::Convolution ", " The input data or filter data is empty. ", __FILE__, __LINE__);
        throw std::invalid_argument("Convolution: the input data or filter data is empty");
    }
    if (inputData.channels != filters.channels)
    {
        this->_logger.LogError(" ANSCNNFD::Convolution ", " The input data dimension cannot meet filters. ", __FILE__, __LINE__);
        throw std::invalid_argument("Convolution: input channels do not match the filters");
    }

    CDataBlob<float> outputData(inputData.rows, inputData.cols, filters.num_filters);
    if (filters.is_pointwise && !filters.is_depthwise)
    {
        Convolution1x1PointWise(inputData, filters, outputData);
    }
    else if (!filters.is_pointwise && filters.is_depthwise)
    {
        Convolution3x3DepthWise(inputData, filters, outputData);
    }
    else
    {
        this->_logger.LogError(" ANSCNNFD::Convolution ", " Unsupported filter type. ", __FILE__, __LINE__);
        throw std::invalid_argument("Convolution: unsupported filter type");
    }

    if (do_relu)
        Relu(outputData);
    return outputData;
}
// Two chained convolutions: filtersP is applied first without ReLU, then
// filtersD is applied to that result with ReLU controlled by do_relu.
CDataBlob<float> ANSCNNFD::ConvolutionDP(const CDataBlob<float>& inputData,
    const Filters<float>& filtersP, const Filters<float>& filtersD, bool do_relu)
{
    const CDataBlob<float> firstStage = Convolution(inputData, filtersP, false);
    return Convolution(firstStage, filtersD, do_relu);
}
// Two chained ConvolutionDP units: the first (filtersP1/filtersD1) always
// ends with ReLU, the second (filtersP2/filtersD2) applies ReLU only when
// do_relu is set.
CDataBlob<float> ANSCNNFD::Convolution4LayerUnit(const CDataBlob<float>& inputData,
    const Filters<float>& filtersP1, const Filters<float>& filtersD1,
    const Filters<float>& filtersP2, const Filters<float>& filtersD2, bool do_relu)
{
    const CDataBlob<float> firstUnit = ConvolutionDP(inputData, filtersP1, filtersD1, true);
    return ConvolutionDP(firstUnit, filtersP2, filtersD2, do_relu);
}
//only 2X2 S2 is supported
// Max pooling with a 2x2 window and stride 2.
// Output size is ceil((n - 3) / 2) + 1 per dimension, channels unchanged.
// Windows that reach past the last row/column are clipped to the input, so
// they pool over fewer than four pixels.
// Throws (after logging) std::invalid_argument when the input is empty and
// std::runtime_error when the computed output size is below 1x1, instead of
// terminating the whole process.
CDataBlob<float> ANSCNNFD::MaxPooling2x2S2(const CDataBlob<float>& inputData)
{
    if (inputData.isEmpty())
    {
        this->_logger.LogError(" ANSCNNFD::MaxPooling2x2S2 ", " The input data is empty. ", __FILE__, __LINE__);
        throw std::invalid_argument("MaxPooling2x2S2: the input data is empty");
    }
    const int outputR = static_cast<int>(ceil((inputData.rows - 3.0f) / 2)) + 1;
    const int outputC = static_cast<int>(ceil((inputData.cols - 3.0f) / 2)) + 1;
    const int outputCH = inputData.channels;
    if (outputR < 1 || outputC < 1)
    {
        this->_logger.LogError(" ANSCNNFD::MaxPooling2x2S2 ", " The size of the output is not correct. ", __FILE__, __LINE__);
        throw std::runtime_error("MaxPooling2x2S2: the size of the output is not correct");
    }
    CDataBlob<float> outputData(outputR, outputC, outputCH);
    outputData.setZero();
    for (int row = 0; row < outputData.rows; row++)
    {
        for (int col = 0; col < outputData.cols; col++)
        {
            // Float offsets (from inputData.data) of the input pixels covered
            // by this window; at most four.
            size_t inputMatOffsetsInElement[4];
            int elementCount = 0;
            const int rstart = row * 2;
            const int cstart = col * 2;
            const int rend = MIN(rstart + 2, inputData.rows);
            const int cend = MIN(cstart + 2, inputData.cols);
            for (int fr = rstart; fr < rend; fr++)
            {
                for (int fc = cstart; fc < cend; fc++)
                {
                    inputMatOffsetsInElement[elementCount++] = (size_t(fr) * inputData.cols + fc) * inputData.channelStep / sizeof(float);
                }
            }
            float* pOut = outputData.ptr(row, col);
            float* pIn = inputData.data;
#if defined(_ENABLE_NEON)
            for (int ch = 0; ch < outputData.channels; ch += 4)
            {
                float32x4_t tmp;
                float32x4_t maxVal = vld1q_f32(pIn + ch + inputMatOffsetsInElement[0]);
                for (int ec = 1; ec < elementCount; ec++)
                {
                    tmp = vld1q_f32(pIn + ch + inputMatOffsetsInElement[ec]);
                    maxVal = vmaxq_f32(maxVal, tmp);
                }
                vst1q_f32(pOut + ch, maxVal);
            }
#elif defined(_ENABLE_AVX512)
            for (int ch = 0; ch < outputData.channels; ch += 16)
            {
                __m512 tmp;
                __m512 maxVal = _mm512_load_ps((__m512 const*)(pIn + ch + inputMatOffsetsInElement[0]));
                for (int ec = 1; ec < elementCount; ec++)
                {
                    tmp = _mm512_load_ps((__m512 const*)(pIn + ch + inputMatOffsetsInElement[ec]));
                    maxVal = _mm512_max_ps(maxVal, tmp);
                }
                _mm512_store_ps((__m512*)(pOut + ch), maxVal);
            }
#elif defined(_ENABLE_AVX2)
            for (int ch = 0; ch < outputData.channels; ch += 8)
            {
                __m256 tmp;
                __m256 maxVal = _mm256_load_ps((float const*)(pIn + ch + inputMatOffsetsInElement[0]));
                for (int ec = 1; ec < elementCount; ec++)
                {
                    tmp = _mm256_load_ps((float const*)(pIn + ch + inputMatOffsetsInElement[ec]));
                    maxVal = _mm256_max_ps(maxVal, tmp);
                }
                _mm256_store_ps(pOut + ch, maxVal);
            }
#else
            for (int ch = 0; ch < outputData.channels; ch++)
            {
                float maxVal = pIn[ch + inputMatOffsetsInElement[0]];
                for (int ec = 1; ec < elementCount; ec++)
                {
                    maxVal = MAX(maxVal, pIn[ch + inputMatOffsetsInElement[ec]]);
                }
                pOut[ch] = maxVal;
            }
#endif
        }
    }
    return outputData;
}
// Builds a (feature_height x feature_width x 2) grid of coordinates:
// channel 0 = column * stride + offset, channel 1 = row * stride + offset.
CDataBlob<float> ANSCNNFD::MeshGrid(int feature_width, int feature_height, int stride, float offset) {
    CDataBlob<float> grid(feature_height, feature_width, 2);
    for (int row = 0; row < feature_height; ++row) {
        const float yCoord = static_cast<float>(row * stride) + offset;
        for (int col = 0; col < feature_width; ++col) {
            float* cell = grid.ptr(row, col);
            cell[0] = static_cast<float>(col * stride) + offset;
            cell[1] = yCoord;
        }
    }
    return grid;
}
void ANSCNNFD : : BboxDecode ( CDataBlob < float > & bbox_pred , const CDataBlob < float > & priors , int stride ) {
if ( bbox_pred . cols ! = priors . cols | | bbox_pred . rows ! = priors . rows ) {
this - > _logger . LogError ( " ANSCNNFD::BboxDecode " , " Mismatch between feature map and anchor size. " , __FILE__ , __LINE__ ) ;
}
if ( bbox_pred . channels ! = 4 ) {
this - > _logger . LogError ( " ANSCNNFD::BboxDecode " , " The bbox dim must be 4. " , __FILE__ , __LINE__ ) ;
}
float fstride = ( float ) stride ;
for ( int r = 0 ; r < bbox_pred . rows ; + + r ) {
for ( int c = 0 ; c < bbox_pred . cols ; + + c ) {
float * pb = bbox_pred . ptr ( r , c ) ;
const float * pp = priors . ptr ( r , c ) ;
float cx = pb [ 0 ] * fstride + pp [ 0 ] ;
float cy = pb [ 1 ] * fstride + pp [ 1 ] ;
float w = std : : exp ( pb [ 2 ] ) * fstride ;
float h = std : : exp ( pb [ 3 ] ) * fstride ;
pb [ 0 ] = cx - w / 2.f ;
pb [ 1 ] = cy - h / 2.f ;
pb [ 2 ] = cx + w / 2.f ;
pb [ 3 ] = cy + h / 2.f ;
}
}
}
void ANSCNNFD : : KPSDecode ( CDataBlob < float > & kps_pred , const CDataBlob < float > & priors , int stride ) {
if ( kps_pred . cols ! = priors . cols | | kps_pred . rows ! = priors . rows ) {
this - > _logger . LogError ( " ANSCNNFD::KPSDecode " , " Mismatch between feature map and anchor size. " , __FILE__ , __LINE__ ) ;
exit ( 1 ) ;
}
if ( kps_pred . channels & 1 ) {
this - > _logger . LogError ( " ANSCNNFD::KPSDecode " , " The kps dim must be even. " , __FILE__ , __LINE__ ) ;
exit ( 1 ) ;
}
float fstride = ( float ) stride ;
int num_points = kps_pred . channels > > 1 ;
for ( int r = 0 ; r < kps_pred . rows ; + + r ) {
for ( int c = 0 ; c < kps_pred . cols ; + + c ) {
float * pb = kps_pred . ptr ( r , c ) ;
const float * pp = priors . ptr ( r , c ) ;
for ( int n = 0 ; n < num_points ; + + n ) {
pb [ 2 * n ] = pb [ 2 * n ] * fstride + pp [ 0 ] ;
pb [ 2 * n + 1 ] = pb [ 2 * n + 1 ] * fstride + pp [ 1 ] ;
}
}
}
}
// Concatenates three blobs along the channel axis.
//
// All three inputs must be non-empty and share the same rows/cols. The result
// has those rows/cols and `channels1 + channels2 + channels3` channels; in every
// cell the channels of input 1 come first, then input 2, then input 3.
//
// Any validation failure is logged and terminates the process via exit(1).
template <typename T>
CDataBlob<T> ANSCNNFD::Concat3(const CDataBlob<T>& inputData1, const CDataBlob<T>& inputData2, const CDataBlob<T>& inputData3)
{
    if (inputData1.isEmpty() || inputData2.isEmpty() || inputData3.isEmpty())
    {
        this->_logger.LogError(" ANSCNNFD::Concat3 ", " The input data is empty. ", __FILE__, __LINE__);
        exit(1);
    }

    const bool secondMatches = (inputData1.cols == inputData2.cols) && (inputData1.rows == inputData2.rows);
    const bool thirdMatches = (inputData1.cols == inputData3.cols) && (inputData1.rows == inputData3.rows);
    if (!(secondMatches && thirdMatches))
    {
        this->_logger.LogError(" ANSCNNFD::Concat3 ", " The three inputs must have the same size. ", __FILE__, __LINE__);
        exit(1);
    }

    const int outRows = inputData1.rows;
    const int outCols = inputData1.cols;
    const int outChannels = inputData1.channels + inputData2.channels + inputData3.channels;
    if (outRows < 1 || outCols < 1 || outChannels < 1)
    {
        this->_logger.LogError(" ANSCNNFD::Concat3 ", " The size of the output is not correct. ", __FILE__, __LINE__);
        exit(1);
    }

    CDataBlob<T> outputData(outRows, outCols, outChannels);

    // Channel offsets of the second and third segments inside an output cell.
    const int secondOffset = inputData1.channels;
    const int thirdOffset = inputData1.channels + inputData2.channels;

    for (int r = 0; r < outputData.rows; ++r)
    {
        for (int c = 0; c < outputData.cols; ++c)
        {
            T* dst = outputData.ptr(r, c);
            memcpy(dst, inputData1.ptr(r, c), sizeof(T) * inputData1.channels);
            memcpy(dst + secondOffset, inputData2.ptr(r, c), sizeof(T) * inputData2.channels);
            memcpy(dst + thirdOffset, inputData3.ptr(r, c), sizeof(T) * inputData3.channels);
        }
    }
    return outputData;
}
template CDataBlob<float> ANSCNNFD::Concat3(const CDataBlob<float>& inputData1, const CDataBlob<float>& inputData2, const CDataBlob<float>& inputData3);
// Flattens a blob into a 1 x 1 x (rows * cols * channels) blob.
//
// Cells are visited row by row, column by column, and the channels of each
// cell are copied contiguously into the single output cell in that order.
//
// An empty input is logged and terminates the process via exit(1).
template <typename T>
CDataBlob<T> ANSCNNFD::Blob2Vector(const CDataBlob<T>& inputData)
{
    if (inputData.isEmpty())
    {
        this->_logger.LogError(" ANSCNNFD::Blob2Vector ", " The input data is empty. ", __FILE__, __LINE__);
        exit(1);
    }

    CDataBlob<T> outputData(1, 1, inputData.cols * inputData.rows * inputData.channels);

    // Number of bytes occupied by the channels of one (row, col) cell.
    int cellBytes = inputData.channels * sizeof(T);

    T* writeCursor = outputData.ptr(0, 0);
    for (int r = 0; r < inputData.rows; ++r)
    {
        for (int c = 0; c < inputData.cols; ++c)
        {
            memcpy(writeCursor, inputData.ptr(r, c), cellBytes);
            writeCursor += inputData.channels;
        }
    }
    return outputData;
}
template CDataBlob<float> ANSCNNFD::Blob2Vector(const CDataBlob<float>& inputData);
void ANSCNNFD : : Sigmoid ( CDataBlob < float > & inputData ) {
for ( int r = 0 ; r < inputData . rows ; + + r ) {
for ( int c = 0 ; c < inputData . cols ; + + c ) {
float * pIn = inputData . ptr ( r , c ) ;
for ( int ch = 0 ; ch < inputData . channels ; + + ch ) {
float v = pIn [ ch ] ;
v = std : : min ( v , 88.3762626647949f ) ;
v = std : : max ( v , - 88.3762626647949f ) ;
pIn [ ch ] = static_cast < float > ( 1.f / ( 1.f + exp ( - v ) ) ) ;
}
}
}
}
std : : vector < FaceRect > ANSCNNFD : : DetectionOutput ( const CDataBlob < float > & cls ,
const CDataBlob < float > & reg ,
const CDataBlob < float > & kps ,
const CDataBlob < float > & obj ,
float overlap_threshold ,
float confidence_threshold ,
int top_k ,
int keep_top_k )
{
if ( reg . isEmpty ( ) | | cls . isEmpty ( ) | | kps . isEmpty ( ) | | obj . isEmpty ( ) ) //|| iou.isEmpty())
{
this - > _logger . LogError ( " ANSCNNFD::DetectionOutput " , " The input data is null. " , __FILE__ , __LINE__ ) ;
exit ( 1 ) ;
}
if ( reg . cols ! = 1 | | reg . rows ! = 1 | | cls . cols ! = 1 | | cls . rows ! = 1 | | kps . cols ! = 1 | | kps . rows ! = 1 | | obj . cols ! = 1 | | obj . rows ! = 1 ) {
this - > _logger . LogError ( " ANSCNNFD::DetectionOutput " , " Only support vector format. " , __FILE__ , __LINE__ ) ;
exit ( 1 ) ;
}
if ( ( int ) ( kps . channels / obj . channels ) ! = 10 ) {
this - > _logger . LogError ( " ANSCNNFD::DetectionOutput " , " Only support 5 keypoints. " , __FILE__ , __LINE__ ) ;
exit ( 1 ) ;
}
const float * pCls = cls . ptr ( 0 , 0 ) ;
const float * pReg = reg . ptr ( 0 , 0 ) ;
const float * pObj = obj . ptr ( 0 , 0 ) ;
const float * pKps = kps . ptr ( 0 , 0 ) ;
std : : vector < std : : pair < float , NormalizedBBox > > score_bbox_vec ;
std : : vector < std : : pair < float , NormalizedBBox > > final_score_bbox_vec ;
//get the candidates those are > confidence_threshold
for ( int i = 0 ; i < cls . channels ; + + i )
{
float conf = std : : sqrt ( pCls [ i ] * pObj [ i ] ) ;
// float conf = pCls[i] * pObj[i];
if ( conf > = confidence_threshold )
{
NormalizedBBox bb ;
bb . xmin = pReg [ 4 * i ] ;
bb . ymin = pReg [ 4 * i + 1 ] ;
bb . xmax = pReg [ 4 * i + 2 ] ;
bb . ymax = pReg [ 4 * i + 3 ] ;
//store the five landmarks
memcpy ( bb . lm , pKps + 10 * i , 10 * sizeof ( float ) ) ;
score_bbox_vec . push_back ( std : : make_pair ( conf , bb ) ) ;
}
}
//Sort the score pair according to the scores in descending order
std : : stable_sort ( score_bbox_vec . begin ( ) , score_bbox_vec . end ( ) , SortScoreBBoxPairDescend ) ;
// Keep top_k scores if needed.
if ( top_k > - 1 & & size_t ( top_k ) < score_bbox_vec . size ( ) ) {
score_bbox_vec . resize ( top_k ) ;
}
//Do NMS
final_score_bbox_vec . clear ( ) ;
while ( score_bbox_vec . size ( ) ! = 0 ) {
const NormalizedBBox bb1 = score_bbox_vec . front ( ) . second ;
bool keep = true ;
for ( size_t k = 0 ; k < final_score_bbox_vec . size ( ) ; k + + )
{
if ( keep )
{
const NormalizedBBox bb2 = final_score_bbox_vec [ k ] . second ;
float overlap = JaccardOverlap ( bb1 , bb2 ) ;
keep = ( overlap < = overlap_threshold ) ;
}
else
{
break ;
}
}
if ( keep ) {
final_score_bbox_vec . push_back ( score_bbox_vec . front ( ) ) ;
}
score_bbox_vec . erase ( score_bbox_vec . begin ( ) ) ;
}
if ( keep_top_k > - 1 & & size_t ( keep_top_k ) < final_score_bbox_vec . size ( ) ) {
final_score_bbox_vec . resize ( keep_top_k ) ;
}
//copy the results to the output blob
int num_faces = ( int ) final_score_bbox_vec . size ( ) ;
std : : vector < FaceRect > facesInfo ;
for ( int fi = 0 ; fi < num_faces ; fi + + )
{
std : : pair < float , NormalizedBBox > pp = final_score_bbox_vec [ fi ] ;
FaceRect r ;
r . score = pp . first ;
r . x = int ( pp . second . xmin ) ;
r . y = int ( pp . second . ymin ) ;
r . w = int ( pp . second . xmax - pp . second . xmin ) ;
r . h = int ( pp . second . ymax - pp . second . ymin ) ;
//copy landmark data
for ( int i = 0 ; i < 10 ; + + i ) {
r . lm [ i ] = int ( pp . second . lm [ i ] ) ;
}
facesInfo . emplace_back ( r ) ;
}
return facesInfo ;
}
}