#include "face_recognizer_postprocess.cuh"

/**
 * Computes the L2 norm (square root of the sum of squares) of `input` and
 * stores it in `*norm`.
 *
 * Launch layout: 1D. Only block 0 does any work; it walks the whole input
 * with a block-stride loop, so the result is correct for any grid size and
 * any `size`. Extra blocks exit immediately instead of overwriting `*norm`
 * with a partial result.
 *
 * Shared memory: 256 floats, statically allocated. If the block has more
 * than 256 threads, or a thread count that is not a power of two, only the
 * largest power-of-two number of threads <= min(blockDim.x, 256) take part
 * in the reduction; the remaining threads only participate in barriers.
 *
 * @param input Pointer to `size` floats readable by the kernel.
 * @param size  Number of elements in `input`. For size <= 0, `*norm` is 0.
 * @param norm  Pointer to a single float that receives the L2 norm.
 */
__global__ void computeNormKernel(
    float* input,
    int size,
    float* norm
){
    constexpr int kMaxReduceThreads = 256;
    __shared__ float sharedMem[kMaxReduceThreads];

    // The condition is uniform across a block, so whole blocks leave
    // together and no thread is left waiting at a barrier below.
    if (blockIdx.x != 0) {
        return;
    }

    // Largest power of two that fits both the block and the shared buffer;
    // the halving reduction below is only exact for power-of-two widths.
    const int limit = blockDim.x < static_cast<unsigned int>(kMaxReduceThreads)
                          ? static_cast<int>(blockDim.x)
                          : kMaxReduceThreads;
    int active = 1;
    while ((active << 1) <= limit) {
        active <<= 1;
    }

    const int tid = static_cast<int>(threadIdx.x);

    if (tid < active) {
        // Block-stride accumulation: thread t sums elements t, t+active, ...
        float localSum = 0.0f;
        for (int i = tid; i < size; i += active) {
            const float v = input[i];
            localSum += v * v;
        }
        sharedMem[tid] = localSum;
    }

    // Tree reduction in shared memory. The barrier sits at the top of each
    // step so the writes from the previous step (or from the accumulation
    // above) are visible before they are read.
    for (int stride = active / 2; stride > 0; stride >>= 1) {
        __syncthreads();
        if (tid < stride) {
            sharedMem[tid] += sharedMem[tid + stride];
        }
    }

    // Thread 0 performed the final addition itself, so it can read
    // sharedMem[0] without another barrier.
    if (tid == 0) {
        *norm = sqrtf(sharedMem[0]);
    }
}

/**
 * Divides every element of `input` by `norm`, in place.
 *
 * Launch layout: 1D, any grid/block size; a grid-stride loop covers all
 * `size` elements even if the grid is smaller than the input.
 *
 * @param input Pointer to `size` floats, modified in place.
 * @param size  Number of elements in `input`.
 * @param norm  Divisor. If it is exactly 0 the vector is left untouched
 *              rather than being filled with NaN/Inf.
 */
__global__ void normalizeVectorKernel(float* input, int size, float norm) {
    // Dividing by zero would turn the vector into NaN/Inf; leave it as is.
    if (norm == 0.0f) {
        return;
    }

    const size_t count = size > 0 ? static_cast<size_t>(size) : 0;
    const size_t step = static_cast<size_t>(gridDim.x) * blockDim.x;

    for (size_t idx = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
         idx < count;
         idx += step) {
        input[idx] /= norm;
    }
}