Files
ANSCORE/engines/TensorRTEngine/face_recognizer_postprocess.cu

39 lines
1.1 KiB
Plaintext
Raw Normal View History

2026-03-28 16:54:11 +11:00
#include "face_recognizer_postprocess.cuh"
/**
 * Computes the L2 (Euclidean) norm of input[0..size) and stores it in *norm.
 *
 * Launch layout: 1-D thread blocks are expected. Only block 0 does the work;
 * it walks the whole vector with a block-sized stride, so the result is the
 * norm of the complete vector for any grid size and any blockDim.x (including
 * non-powers of two and values above 256). Any additional blocks return
 * immediately and never touch *norm, so there is no cross-block write race.
 *
 * Shared memory: 256 floats, statically allocated.
 *
 * @param input  Device-accessible pointer to at least `size` floats; only read.
 * @param size   Number of elements to include. size <= 0 yields a norm of 0.
 * @param norm   Device-accessible pointer to one float that receives the result.
 */
__global__ void computeNormKernel(
    float* input,
    int size,
    float* norm
) {
    const unsigned int kSlots = 256;
    __shared__ float sharedMem[kSlots];

    // The branch is uniform per block, so the barriers below stay safe.
    if (blockIdx.x != 0) {
        return;
    }

    const unsigned int tid = threadIdx.x;
    // Threads beyond the scratch buffer contribute nothing and never index it.
    const unsigned int lanes = blockDim.x < kSlots ? blockDim.x : kSlots;

    // Each participating thread accumulates a strided slice of the vector.
    float localSum = 0.0f;
    if (tid < lanes) {
        for (long long i = tid; i < size; i += lanes) {
            const float v = input[i];
            localSum += v * v;
        }
    }

    // Fill all 256 slots: slot `tid` gets this thread's partial sum, every
    // slot with no owning thread is zeroed so the fixed-width tree reduction
    // below is valid for any block size.
    for (unsigned int slot = tid; slot < kSlots; slot += blockDim.x) {
        sharedMem[slot] = (slot == tid) ? localSum : 0.0f;
    }

    // Tree reduction over the fixed 256 slots. The barrier sits at the top of
    // the loop so that it also orders the initial fill before the first read.
    for (unsigned int stride = kSlots / 2; stride > 0; stride >>= 1) {
        __syncthreads();
        if (tid < stride) {
            sharedMem[tid] += sharedMem[tid + stride];
        }
    }

    // Thread 0 performed the final addition itself, so no extra barrier is needed.
    if (tid == 0) {
        *norm = sqrtf(sharedMem[0]);
    }
}
/**
 * Divides every element of input[0..size) by `norm`, in place.
 *
 * Launch layout: 1-D grid of 1-D blocks. The kernel uses a grid-stride loop,
 * so all `size` elements are processed for any grid/block configuration.
 *
 * @param input  Device-accessible pointer to at least `size` floats; modified in place.
 * @param size   Number of elements to scale. size <= 0 is a no-op.
 * @param norm   Divisor applied to each element. When it is exactly 0 the
 *               vector is left unchanged instead of being filled with
 *               NaN/Inf (0/0 for an all-zero input). A NaN `norm` still
 *               propagates into the output.
 */
__global__ void normalizeVectorKernel(float* input, int size, float norm) {
    // The branch is uniform across the whole grid because `norm` is a kernel argument.
    if (norm == 0.0f) {
        return;
    }

    // 64-bit indices keep blockIdx.x * blockDim.x from overflowing on large grids.
    const long long step = static_cast<long long>(gridDim.x) * blockDim.x;
    const long long first = static_cast<long long>(blockIdx.x) * blockDim.x + threadIdx.x;

    for (long long i = first; i < size; i += step) {
        input[i] /= norm;
    }
}