Initial setup for CLion
This commit is contained in:
250
engines/TensorRTAPI/include/engine/EngineUtilities.inl
Normal file
250
engines/TensorRTAPI/include/engine/EngineUtilities.inl
Normal file
@@ -0,0 +1,250 @@
|
||||
#pragma once
|
||||
#include <filesystem>
|
||||
#include <NvInfer.h> // NV_TENSORRT_MAJOR/MINOR/PATCH
|
||||
#include <NvInferVersion.h> // also defines TRT version macros
|
||||
#include <cudnn_version.h> // CUDNN_MAJOR/MINOR/PATCHLEVEL
|
||||
#include <cuda_runtime.h> // cudaRuntimeGetVersion
|
||||
|
||||
template <typename T>
|
||||
void Engine<T>::transformOutput(std::vector<std::vector<std::vector<T>>> &input, std::vector<std::vector<T>> &output) {
|
||||
if (input.size() == 1) {
|
||||
output = std::move(input[0]);
|
||||
}
|
||||
else {
|
||||
auto msg = "The feature vector has incorrect dimensions!";
|
||||
std::cout<<msg;
|
||||
}
|
||||
}
|
||||
template <typename T>
|
||||
void Engine<T>::transformOutput(std::vector<std::vector<std::vector<T>>> &input, std::vector<T> &output) {
|
||||
if (input.size() != 1 || input[0].size() != 1) {
|
||||
auto msg = "The feature vector has incorrect dimensions!";
|
||||
std::cout<<msg;
|
||||
}
|
||||
output = std::move(input[0][0]);
|
||||
}
|
||||
template <typename T>
|
||||
cv::cuda::GpuMat Engine<T>::resizeKeepAspectRatioPadRightBottom(const cv::cuda::GpuMat& input,
|
||||
size_t height, size_t width,
|
||||
const cv::Scalar& bgcolor) {
|
||||
// Ensure input is valid
|
||||
if (input.empty()) {
|
||||
return cv::cuda::GpuMat();
|
||||
}
|
||||
// Create a CUDA stream
|
||||
cv::cuda::Stream stream;
|
||||
// Calculate aspect ratio and unpadded dimensions
|
||||
float r = std::min(static_cast<float>(width) / input.cols, static_cast<float>(height) / input.rows);
|
||||
size_t unpad_w = static_cast<size_t>(r * input.cols);
|
||||
size_t unpad_h = static_cast<size_t>(r * input.rows);
|
||||
// Resize the input image
|
||||
cv::cuda::GpuMat re;
|
||||
re.create(unpad_h, unpad_w, input.type());
|
||||
cv::cuda::resize(input, re, re.size(), 0, 0, cv::INTER_LINEAR, stream);
|
||||
// Create the output image and fill with the background color
|
||||
cv::cuda::GpuMat out;
|
||||
out.create(height, width, input.type());
|
||||
out.setTo(bgcolor, stream);
|
||||
// Copy the resized content into the top-left corner of the output image
|
||||
re.copyTo(out(cv::Rect(0, 0, re.cols, re.rows)), stream);
|
||||
stream.waitForCompletion();
|
||||
return out;
|
||||
}
|
||||
|
||||
template <typename T> void Engine<T>::getDeviceNames(std::vector<std::string> &deviceNames) {
    // Appends the human-readable name of every visible CUDA device to
    // `deviceNames` (existing entries are preserved).
    //
    // Bug fix: `numGPUs` was previously uninitialized, so a failing
    // cudaGetDeviceCount left it as garbage and the loop iterated over an
    // arbitrary count. Initialize it and bail out on error instead.
    int numGPUs = 0;
    if (cudaGetDeviceCount(&numGPUs) != cudaSuccess) {
        return;
    }
    deviceNames.reserve(deviceNames.size() + static_cast<size_t>(numGPUs));
    for (int device = 0; device < numGPUs; device++) {
        cudaDeviceProp prop{};
        // Skip devices whose properties cannot be queried rather than
        // pushing an uninitialized name.
        if (cudaGetDeviceProperties(&prop, device) != cudaSuccess) {
            continue;
        }
        deviceNames.emplace_back(prop.name);
    }
}
|
||||
template <typename T> int Engine<T>::getBindingIndexByName(const std::string& name) {
    // Linear scan over the engine's I/O tensors. Returns the index of the
    // tensor whose name equals `name`, or -1 when no such tensor exists.
    const int tensorCount = m_engine->getNbIOTensors();
    for (int idx = 0; idx < tensorCount; ++idx) {
        if (name == m_engine->getIOTensorName(idx)) {
            return idx;
        }
    }
    return -1;
}
|
||||
|
||||
|
||||
//template <typename T> std::string Engine<T>::serializeEngineOptions(const ANSCENTER::Options &options, const std::string &onnxModelPath) {
|
||||
// const auto filenamePos = onnxModelPath.find_last_of('/') + 1;
|
||||
// std::string engineName = onnxModelPath.substr(filenamePos, onnxModelPath.find_last_of('.') - filenamePos) + ".engine";
|
||||
//
|
||||
// // Add the GPU device name to the file to ensure that the model is only used
|
||||
// // on devices with the exact same GPU
|
||||
// std::vector<std::string> deviceNames;
|
||||
// getDeviceNames(deviceNames);
|
||||
//
|
||||
// if (static_cast<size_t>(options.deviceIndex) >= deviceNames.size()) {
|
||||
// auto msg = "Error, provided device index is out of range!";
|
||||
// std::cout<<msg;
|
||||
// return "";
|
||||
// }
|
||||
//
|
||||
// auto deviceName = deviceNames[options.deviceIndex];
|
||||
// // Remove spaces from the device name
|
||||
// deviceName.erase(std::remove_if(deviceName.begin(), deviceName.end(), ::isspace), deviceName.end());
|
||||
// engineName += "." + deviceName;
|
||||
// // Serialize the specified options into the filename
|
||||
// if (options.precision == ANSCENTER::Precision::FP16) {
|
||||
// engineName += ".fp16";
|
||||
// } else if (options.precision == ANSCENTER::Precision::FP32) {
|
||||
// engineName += ".fp32";
|
||||
// } else {
|
||||
// engineName += ".int8";
|
||||
// }
|
||||
// if (options.maxBatchSize > 1) {
|
||||
// engineName += "." + std::to_string(options.maxBatchSize);
|
||||
// }
|
||||
// return engineName;
|
||||
//}
|
||||
|
||||
template <typename T>
|
||||
std::string Engine<T>::serializeEngineOptions(const ANSCENTER::Options& options,
|
||||
const std::string& onnxModelPath)
|
||||
{
|
||||
// -- Base name from ONNX file ---------------------------------------------
|
||||
const auto filenamePos = onnxModelPath.find_last_of('/') + 1;
|
||||
std::string engineName = onnxModelPath.substr(
|
||||
filenamePos,
|
||||
onnxModelPath.find_last_of('.') - filenamePos) + ".engine";
|
||||
|
||||
// -- GPU device name ------------------------------------------------------
|
||||
// Ensures the engine is only loaded on the exact GPU it was built for.
|
||||
std::vector<std::string> deviceNames;
|
||||
getDeviceNames(deviceNames);
|
||||
if (static_cast<size_t>(options.deviceIndex) >= deviceNames.size()) {
|
||||
std::cout << "Error, provided device index is out of range!";
|
||||
return "";
|
||||
}
|
||||
auto deviceName = deviceNames[options.deviceIndex];
|
||||
deviceName.erase(
|
||||
std::remove_if(deviceName.begin(), deviceName.end(), ::isspace),
|
||||
deviceName.end());
|
||||
engineName += "." + deviceName;
|
||||
|
||||
// -- Precision ------------------------------------------------------------
|
||||
if (options.precision == ANSCENTER::Precision::FP16) {
|
||||
engineName += ".fp16";
|
||||
}
|
||||
else if (options.precision == ANSCENTER::Precision::FP32) {
|
||||
engineName += ".fp32";
|
||||
}
|
||||
else {
|
||||
engineName += ".int8";
|
||||
}
|
||||
|
||||
// -- Batch size -----------------------------------------------------------
|
||||
if (options.maxBatchSize > 1) {
|
||||
engineName += ".b" + std::to_string(options.maxBatchSize);
|
||||
}
|
||||
|
||||
// -- Max spatial dims: intentionally NOT included in the filename ----------
|
||||
// buildWithRetry() may reduce max dims (e.g. 2560→1920) when GPU memory
|
||||
// is insufficient. If the filename included .s{H}x{W}, the next launch
|
||||
// would look for .s2560x2560, miss the cached .s1920x1920, and waste
|
||||
// minutes re-attempting the doomed 2560 build before falling back.
|
||||
// Without the suffix, the cache is found immediately on the next launch.
|
||||
// The actual profile max is queried at runtime via getProfileMaxHeight/Width.
|
||||
|
||||
// -- TensorRT version -----------------------------------------------------
|
||||
// Engine format changes between TensorRT minor versions -- must rebuild.
|
||||
// NV_TENSORRT_MAJOR, NV_TENSORRT_MINOR, NV_TENSORRT_PATCH are defined in
|
||||
// <NvInferVersion.h> which is included via NvInfer.h.
|
||||
engineName += ".trt"
|
||||
+ std::to_string(NV_TENSORRT_MAJOR) + "."
|
||||
+ std::to_string(NV_TENSORRT_MINOR) + "."
|
||||
+ std::to_string(NV_TENSORRT_PATCH);
|
||||
|
||||
// -- CUDA runtime version -------------------------------------------------
|
||||
// Engines built with different CUDA versions may use different PTX/cubin
|
||||
// formats and must be rebuilt.
|
||||
int cudaVersion = 0;
|
||||
cudaRuntimeGetVersion(&cudaVersion);
|
||||
const int cudaMajor = cudaVersion / 1000;
|
||||
const int cudaMinor = (cudaVersion % 1000) / 10;
|
||||
engineName += ".cuda"
|
||||
+ std::to_string(cudaMajor) + "."
|
||||
+ std::to_string(cudaMinor);
|
||||
|
||||
// -- cuDNN version --------------------------------------------------------
|
||||
// cuDNN version affects layer implementations inside the engine.
|
||||
// CUDNN_MAJOR, CUDNN_MINOR, CUDNN_PATCHLEVEL are defined in <cudnn_version.h>.
|
||||
engineName += ".cudnn"
|
||||
+ std::to_string(CUDNN_MAJOR) + "."
|
||||
+ std::to_string(CUDNN_MINOR);
|
||||
|
||||
return engineName;
|
||||
}
|
||||
|
||||
template <typename T>
cv::cuda::GpuMat Engine<T>::blobFromGpuMats(const std::vector<cv::cuda::GpuMat> &batchInput, const std::array<float, 3> &subVals,
                                            const std::array<float, 3> &divVals, bool normalize, bool swapRB,
                                            cv::cuda::Stream &stream) {
    // Converts a batch of 3-channel GPU images into a single planar NCHW
    // float blob: optionally scales to [0,1], then per-channel subtracts
    // subVals and divides by divVals, and finally de-interleaves HWC -> CHW.
    // Returns an empty GpuMat when the batch is empty or not 3-channel.
    // NOTE(review): all images are assumed to share the dimensions of
    // batchInput[0] — confirm callers guarantee this.
    cv::cuda::GpuMat result;
    if (batchInput.empty()) return result;
    if (batchInput[0].channels() != 3) return result;

    const int H = batchInput[0].rows;
    const int W = batchInput[0].cols;
    const int batch = static_cast<int>(batchInput.size());
    const size_t planeSize = static_cast<size_t>(H) * W; // pixels per channel

    // Output blob: planar NCHW layout stored as a single-channel GpuMat.
    // Total elements = batch * 3 * H * W.
    // A 1-row GpuMat is guaranteed contiguous, which the raw-pointer plane
    // views below rely on.
    cv::cuda::GpuMat blob(1, batch * 3 * static_cast<int>(planeSize), CV_32FC1);

    for (int img = 0; img < batch; ++img) {
        // 1. Convert to float and normalise while still in HWC (interleaved) format.
        //    Channel-wise subtract / divide operate correctly on interleaved data.
        cv::cuda::GpuMat floatImg;
        if (normalize) {
            batchInput[img].convertTo(floatImg, CV_32FC3, 1.f / 255.f, stream);
        } else {
            batchInput[img].convertTo(floatImg, CV_32FC3, 1.0, stream);
        }

        // subVals is applied before divVals (mean-then-scale ordering).
        cv::cuda::subtract(floatImg, cv::Scalar(subVals[0], subVals[1], subVals[2]), floatImg, cv::noArray(), -1, stream);
        cv::cuda::divide(floatImg, cv::Scalar(divVals[0], divVals[1], divVals[2]), floatImg, 1, -1, stream);

        // 2. Split normalised HWC image into CHW planes directly into the blob.
        //    `offset` is the element index of this image's first plane.
        size_t offset = static_cast<size_t>(img) * 3 * planeSize;

        if (swapRB) {
            // BGR input -> RGB planes: B goes to plane 2, G to plane 1, R to plane 0.
            // Each GpuMat below is a non-owning view into blob's storage.
            std::vector<cv::cuda::GpuMat> channels{
                cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset + 2 * planeSize), // B -> plane 2
                cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset + planeSize),     // G -> plane 1
                cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset)};                // R -> plane 0
            cv::cuda::split(floatImg, channels, stream);
        } else {
            // BGR input -> BGR planes: keep channel order
            std::vector<cv::cuda::GpuMat> channels{
                cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset),
                cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset + planeSize),
                cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset + 2 * planeSize)};
            cv::cuda::split(floatImg, channels, stream);
        }
    }

    // NOTE(review): no stream synchronization here — the caller is expected
    // to sync `stream` before reading the blob.
    return blob;
}
|
||||
|
||||
template <typename T> void Engine<T>::clearGpuBuffers() {
    // Releases every device allocation tracked in m_buffers — inputs and
    // outputs alike — then empties the bookkeeping vector. Null entries are
    // skipped so a partially-initialized buffer list is handled safely.
    if (m_buffers.empty()) {
        return;
    }
    for (void* devicePtr : m_buffers) {
        if (devicePtr != nullptr) {
            Util::checkCudaErrorCode(cudaFree(devicePtr));
        }
    }
    m_buffers.clear();
}
|
||||
Reference in New Issue
Block a user