#pragma once #include #include // NV_TENSORRT_MAJOR/MINOR/PATCH #include // also defines TRT version macros #include // CUDNN_MAJOR/MINOR/PATCHLEVEL #include // cudaRuntimeGetVersion template void Engine::transformOutput(std::vector>> &input, std::vector> &output) { if (input.size() == 1) { output = std::move(input[0]); } else { auto msg = "The feature vector has incorrect dimensions!"; std::cout< void Engine::transformOutput(std::vector>> &input, std::vector &output) { if (input.size() != 1 || input[0].size() != 1) { auto msg = "The feature vector has incorrect dimensions!"; std::cout< cv::cuda::GpuMat Engine::resizeKeepAspectRatioPadRightBottom(const cv::cuda::GpuMat& input, size_t height, size_t width, const cv::Scalar& bgcolor) { // Ensure input is valid if (input.empty()) { return cv::cuda::GpuMat(); } // Create a CUDA stream cv::cuda::Stream stream; // Calculate aspect ratio and unpadded dimensions float r = std::min(static_cast(width) / input.cols, static_cast(height) / input.rows); size_t unpad_w = static_cast(r * input.cols); size_t unpad_h = static_cast(r * input.rows); // Resize the input image cv::cuda::GpuMat re; re.create(unpad_h, unpad_w, input.type()); cv::cuda::resize(input, re, re.size(), 0, 0, cv::INTER_LINEAR, stream); // Create the output image and fill with the background color cv::cuda::GpuMat out; out.create(height, width, input.type()); out.setTo(bgcolor, stream); // Copy the resized content into the top-left corner of the output image re.copyTo(out(cv::Rect(0, 0, re.cols, re.rows)), stream); stream.waitForCompletion(); return out; } template void Engine::getDeviceNames(std::vector &deviceNames) { int numGPUs; cudaGetDeviceCount(&numGPUs); for (int device = 0; device < numGPUs; device++) { cudaDeviceProp prop; cudaGetDeviceProperties(&prop, device); deviceNames.push_back(std::string(prop.name)); } } template int Engine::getBindingIndexByName(const std::string& name) { for (int i = 0, e = m_engine->getNbIOTensors(); i < e; i++) { if (name == m_engine->getIOTensorName(i)) { return i; } } return -1; } //template std::string Engine::serializeEngineOptions(const ANSCENTER::Options &options, const std::string &onnxModelPath) { // const auto filenamePos = onnxModelPath.find_last_of('/') + 1; // std::string engineName = onnxModelPath.substr(filenamePos, onnxModelPath.find_last_of('.') - filenamePos) + ".engine"; // // // Add the GPU device name to the file to ensure that the model is only used // // on devices with the exact same GPU // std::vector deviceNames; // getDeviceNames(deviceNames); // // if (static_cast(options.deviceIndex) >= deviceNames.size()) { // auto msg = "Error, provided device index is out of range!"; // std::cout< 1) { // engineName += "." + std::to_string(options.maxBatchSize); // } // return engineName; //} template std::string Engine::serializeEngineOptions(const ANSCENTER::Options& options, const std::string& onnxModelPath) { // -- Base name from ONNX file --------------------------------------------- const auto filenamePos = onnxModelPath.find_last_of('/') + 1; std::string engineName = onnxModelPath.substr( filenamePos, onnxModelPath.find_last_of('.') - filenamePos) + ".engine"; // -- GPU device name ------------------------------------------------------ // Ensures the engine is only loaded on the exact GPU it was built for. std::vector deviceNames; getDeviceNames(deviceNames); if (static_cast(options.deviceIndex) >= deviceNames.size()) { std::cout << "Error, provided device index is out of range!"; return ""; } auto deviceName = deviceNames[options.deviceIndex]; deviceName.erase( std::remove_if(deviceName.begin(), deviceName.end(), ::isspace), deviceName.end()); engineName += "." + deviceName; // -- Precision ------------------------------------------------------------ if (options.precision == ANSCENTER::Precision::FP16) { engineName += ".fp16"; } else if (options.precision == ANSCENTER::Precision::FP32) { engineName += ".fp32"; } else { engineName += ".int8"; } // -- Batch size ----------------------------------------------------------- if (options.maxBatchSize > 1) { engineName += ".b" + std::to_string(options.maxBatchSize); } // -- Max spatial dims: intentionally NOT included in the filename ---------- // buildWithRetry() may reduce max dims (e.g. 2560→1920) when GPU memory // is insufficient. If the filename included .s{H}x{W}, the next launch // would look for .s2560x2560, miss the cached .s1920x1920, and waste // minutes re-attempting the doomed 2560 build before falling back. // Without the suffix, the cache is found immediately on the next launch. // The actual profile max is queried at runtime via getProfileMaxHeight/Width. // -- TensorRT version ----------------------------------------------------- // Engine format changes between TensorRT minor versions -- must rebuild. // NV_TENSORRT_MAJOR, NV_TENSORRT_MINOR, NV_TENSORRT_PATCH are defined in // which is included via NvInfer.h. engineName += ".trt" + std::to_string(NV_TENSORRT_MAJOR) + "." + std::to_string(NV_TENSORRT_MINOR) + "." + std::to_string(NV_TENSORRT_PATCH); // -- CUDA runtime version ------------------------------------------------- // Engines built with different CUDA versions may use different PTX/cubin // formats and must be rebuilt. int cudaVersion = 0; cudaRuntimeGetVersion(&cudaVersion); const int cudaMajor = cudaVersion / 1000; const int cudaMinor = (cudaVersion % 1000) / 10; engineName += ".cuda" + std::to_string(cudaMajor) + "." + std::to_string(cudaMinor); // -- cuDNN version -------------------------------------------------------- // cuDNN version affects layer implementations inside the engine. // CUDNN_MAJOR, CUDNN_MINOR, CUDNN_PATCHLEVEL are defined in . engineName += ".cudnn" + std::to_string(CUDNN_MAJOR) + "." + std::to_string(CUDNN_MINOR); return engineName; } template cv::cuda::GpuMat Engine::blobFromGpuMats(const std::vector &batchInput, const std::array &subVals, const std::array &divVals, bool normalize, bool swapRB, cv::cuda::Stream &stream) { cv::cuda::GpuMat result; if (batchInput.empty()) return result; if (batchInput[0].channels() != 3) return result; const int H = batchInput[0].rows; const int W = batchInput[0].cols; const int batch = static_cast(batchInput.size()); const size_t planeSize = static_cast(H) * W; // pixels per channel // Output blob: planar NCHW layout stored as a single-channel GpuMat. // Total elements = batch * 3 * H * W. cv::cuda::GpuMat blob(1, batch * 3 * static_cast(planeSize), CV_32FC1); for (int img = 0; img < batch; ++img) { // 1. Convert to float and normalise while still in HWC (interleaved) format. // Channel-wise subtract / divide operate correctly on interleaved data. cv::cuda::GpuMat floatImg; if (normalize) { batchInput[img].convertTo(floatImg, CV_32FC3, 1.f / 255.f, stream); } else { batchInput[img].convertTo(floatImg, CV_32FC3, 1.0, stream); } cv::cuda::subtract(floatImg, cv::Scalar(subVals[0], subVals[1], subVals[2]), floatImg, cv::noArray(), -1, stream); cv::cuda::divide(floatImg, cv::Scalar(divVals[0], divVals[1], divVals[2]), floatImg, 1, -1, stream); // 2. Split normalised HWC image into CHW planes directly into the blob. size_t offset = static_cast(img) * 3 * planeSize; if (swapRB) { // BGR input -> RGB planes: B goes to plane 2, G to plane 1, R to plane 0 std::vector channels{ cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr() + offset + 2 * planeSize), // B -> plane 2 cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr() + offset + planeSize), // G -> plane 1 cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr() + offset)}; // R -> plane 0 cv::cuda::split(floatImg, channels, stream); } else { // BGR input -> BGR planes: keep channel order std::vector channels{ cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr() + offset), cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr() + offset + planeSize), cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr() + offset + 2 * planeSize)}; cv::cuda::split(floatImg, channels, stream); } } return blob; } template void Engine::clearGpuBuffers() { if (!m_buffers.empty()) { // Free ALL I/O GPU buffers (both inputs and outputs). // Previously only outputs were freed, leaking input allocations from loadNetwork(). for (void* ptr : m_buffers) { if (ptr) { Util::checkCudaErrorCode(cudaFree(ptr)); } } m_buffers.clear(); } }