Initial setup for CLion

2026-03-28 16:54:11 +11:00
parent 239cc02591
commit 7b4134133c
1136 changed files with 811916 additions and 0 deletions
--- a/engines/TensorRTEngine/face_recognizer_postprocess.cu
+++ b/engines/TensorRTEngine/face_recognizer_postprocess.cu
@@ -0,0 +1,38 @@
+#include "face_recognizer_postprocess.cuh"
+
+
+/**
+ * Computes the Euclidean (L2) norm of `input[0..size)` and stores it in `*norm`.
+ *
+ * Launch layout: 1-D grid of 1-D blocks. The result is correct for ANY grid
+ * and block size: only block 0 performs the reduction, every other block
+ * exits immediately. This avoids the race where several blocks each
+ * overwrite `*norm` with their own partial result.
+ *
+ * Shared memory: 256 floats (static). The 256 slots act as "virtual lanes";
+ * the threads of block 0 stride over them, so the block size may be smaller
+ * or larger than 256 and does not have to be a power of two.
+ *
+ * @param input  device pointer to at least `size` floats (read only)
+ * @param size   number of elements to reduce; `size <= 0` yields a norm of 0
+ * @param norm   device pointer to a single float that receives the result
+ */
+__global__ void computeNormKernel(
+        float* input,
+        int size,
+        float* norm
+){
+    const int kSlots = 256;  // must stay a power of two for the halving loop
+    __shared__ float partial[kSlots];
+
+    // A single block can reduce the whole vector; letting other blocks write
+    // *norm would race and leave only one block's partial sum behind.
+    if (blockIdx.x != 0) {
+        return;
+    }
+
+    // Each slot accumulates the squares of elements slot, slot + 256, ...
+    // Adjacent threads touch adjacent elements, so loads stay coalesced.
+    for (int slot = threadIdx.x; slot < kSlots; slot += blockDim.x) {
+        float acc = 0.0f;
+        for (int i = slot; i < size; i += kSlots) {
+            acc += input[i] * input[i];
+        }
+        partial[slot] = acc;
+    }
+
+    // Tree reduction over the 256 slots. The barrier is reached by every
+    // thread of block 0 on every iteration (no divergent __syncthreads).
+    for (int stride = kSlots / 2; stride > 0; stride >>= 1) {
+        __syncthreads();
+        for (int slot = threadIdx.x; slot < stride; slot += blockDim.x) {
+            partial[slot] += partial[slot + stride];
+        }
+    }
+
+    // Thread 0 performed the final addition into partial[0] itself, so no
+    // further barrier is needed before it reads the total.
+    if (threadIdx.x == 0) {
+        *norm = sqrtf(partial[0]);
+    }
+}
+
+/**
+ * Divides the first `size` elements of `input` by `norm`, in place.
+ *
+ * Launch layout: 1-D grid of 1-D blocks, one thread per element; threads
+ * whose global index is `size` or beyond do nothing. No shared memory is used.
+ *
+ * @param input  device pointer to at least `size` floats, modified in place
+ * @param size   number of elements to scale
+ * @param norm   divisor, passed by value; a zero `norm` is not guarded
+ *               against and produces non-finite results
+ */
+__global__ void normalizeVectorKernel(float* input, int size, float norm) {
+    const int element = threadIdx.x + blockDim.x * blockIdx.x;
+
+    // Tail threads of the last block have no element to process.
+    if (element >= size) {
+        return;
+    }
+
+    input[element] = input[element] / norm;
+}
--- a/engines/TensorRTEngine/face_recognizer_postprocess.cuh
+++ b/engines/TensorRTEngine/face_recognizer_postprocess.cuh
@@ -0,0 +1,10 @@
+#include "cuda_runtime.h"
+#include "stdio.h"
+// Kernel declaration (C linkage, so the symbol name is not mangled).
+// Writes to *norm the square root of a sum of squared `input` elements,
+// reduced through shared memory; see the definition in the .cu file for the
+// launch-configuration constraints.
+//   input : device pointer to the vector elements
+//   size  : number of elements in `input` to consider
+//   norm  : device pointer to one float that receives the result
+extern "C" __global__ void computeNormKernel(
+        float* input,
+        int size,
+        float* norm
+);
+
+// Kernel declaration (C linkage, so the symbol name is not mangled).
+// Divides input[i] by `norm` in place for every global thread index i < size.
+//   input : device pointer to the vector, modified in place
+//   size  : number of elements to scale
+//   norm  : divisor, passed by value from the host
+extern "C" __global__ void normalizeVectorKernel(float* input,
+                                                 int size, float norm);