Initial setup for CLion
This commit is contained in:
38
engines/TensorRTEngine/face_recognizer_postprocess.cu
Normal file
38
engines/TensorRTEngine/face_recognizer_postprocess.cu
Normal file
@@ -0,0 +1,38 @@
|
||||
#include "face_recognizer_postprocess.cuh"
|
||||
|
||||
|
||||
/**
 * Computes the L2 (Euclidean) norm of input[0 .. size) and stores it in *norm.
 *
 * Launch layout: 1D thread blocks (only threadIdx.x / blockDim.x are used).
 * The whole reduction is performed by block 0; any additional blocks in the
 * grid exit immediately, so the result is correct for every grid size and no
 * two blocks ever race on *norm. Any blockDim.x is accepted: at most 256
 * threads of the block accumulate, the rest only take part in the barriers.
 *
 * Shared memory: 256 floats, statically allocated.
 *
 * @param input  Pointer to `size` floats readable from device code.
 * @param size   Number of elements in `input`; values <= 0 give a norm of 0.
 * @param norm   Pointer to one float writable from device code; receives
 *               sqrt(sum(input[i]^2)).
 */
__global__ void computeNormKernel(
    float* input,
    int size,
    float* norm
){
    constexpr unsigned int kMaxLanes = 256;
    __shared__ float sharedMem[kMaxLanes];

    // Block-uniform exit: every thread of a non-zero block leaves here, so the
    // barriers below are never reached by only part of a block.
    if (blockIdx.x != 0) {
        return;
    }

    const unsigned int tid = threadIdx.x;
    // Never index past the static shared array, whatever the block size is.
    const unsigned int lanes = blockDim.x < kMaxLanes ? blockDim.x : kMaxLanes;
    const unsigned int count = size > 0 ? static_cast<unsigned int>(size) : 0u;

    if (tid < lanes) {
        // Strided accumulation so that vectors longer than the block are
        // fully covered; consecutive threads read consecutive elements.
        float localSum = 0.0f;
        for (unsigned int i = tid; i < count; i += lanes) {
            const float v = input[i];
            localSum += v * v;
        }
        sharedMem[tid] = localSum;
    }
    __syncthreads();

    // Tree reduction that is valid for any lane count, not only powers of two:
    // each round folds the upper part [half, active) onto the lower part.
    // Writers use indices < half and read indices >= half, so a single barrier
    // per round is sufficient.
    for (unsigned int active = lanes; active > 1; ) {
        const unsigned int half = (active + 1) / 2;
        if (tid + half < active) {
            sharedMem[tid] += sharedMem[tid + half];
        }
        __syncthreads();
        active = half;
    }

    if (tid == 0) {
        *norm = sqrtf(sharedMem[0]);
    }
}
|
||||
|
||||
/**
 * Divides every element of input[0 .. size) by `norm`, in place.
 *
 * Launch layout: 1D grid of 1D blocks. A grid-stride loop is used, so any
 * grid/block size covers all `size` elements (a grid with fewer threads than
 * elements simply makes each thread handle several elements).
 *
 * A zero `norm` leaves the vector untouched instead of filling it with
 * NaN/Inf values.
 *
 * @param input  Pointer to `size` floats readable and writable from device code.
 * @param size   Number of elements in `input`; values <= 0 make this a no-op.
 * @param norm   Divisor applied to each element.
 */
__global__ void normalizeVectorKernel(float* input, int size, float norm) {
    // `norm` and `size` are kernel arguments, so this exit is uniform across
    // the whole grid.
    if (norm == 0.0f || size <= 0) {
        return;
    }

    // 64-bit index arithmetic avoids overflow of blockIdx.x * blockDim.x.
    const size_t count = static_cast<size_t>(size);
    const size_t step = static_cast<size_t>(gridDim.x) * blockDim.x;
    for (size_t idx = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
         idx < count;
         idx += step) {
        input[idx] /= norm;
    }
}
|
||||
10
engines/TensorRTEngine/face_recognizer_postprocess.cuh
Normal file
10
engines/TensorRTEngine/face_recognizer_postprocess.cuh
Normal file
@@ -0,0 +1,10 @@
|
||||
#include "cuda_runtime.h"
|
||||
#include "stdio.h"
|
||||
// Kernel declaration (C linkage): writes to *norm the square root of a sum of
// squared elements of `input` (an L2 norm). `size` is the element count used
// for bounds checking. The definition and its launch requirements are in
// face_recognizer_postprocess.cu.
extern "C" __global__ void computeNormKernel(
|
||||
float* input,
|
||||
int size,
|
||||
float* norm
|
||||
);
|
||||
|
||||
// Kernel declaration (C linkage): in-place division of the elements of
// `input` at indices below `size` by `norm`. The definition is in
// face_recognizer_postprocess.cu.
extern "C" __global__ void normalizeVectorKernel(float* input,
|
||||
int size, float norm);
|
||||
Reference in New Issue
Block a user