// 2026-03-28 16:54:11 +11:00  (stray VCS timestamp artifact — commented out; it broke compilation)
|
|
|
#pragma once

// C++ standard library
#include <algorithm>   // std::min, std::max, std::remove_if
#include <array>       // std::array (blobFromGpuMats signature)
#include <cctype>      // std::isspace
#include <cstddef>     // size_t
#include <filesystem>  // std::filesystem::path
#include <iostream>    // std::cerr
#include <string>
#include <vector>

// CUDA / TensorRT / cuDNN
#include <NvInfer.h>         // NV_TENSORRT_MAJOR/MINOR/PATCH
#include <NvInferVersion.h>  // also defines TRT version macros
#include <cudnn_version.h>   // CUDNN_MAJOR/MINOR/PATCHLEVEL
#include <cuda_runtime.h>    // cudaRuntimeGetVersion
|
|
|
|
|
|
|
|
|
|
template <typename T>
|
|
|
|
|
void Engine<T>::transformOutput(std::vector<std::vector<std::vector<T>>> &input, std::vector<std::vector<T>> &output) {
|
|
|
|
|
if (input.size() == 1) {
|
|
|
|
|
output = std::move(input[0]);
|
|
|
|
|
}
|
|
|
|
|
else {
|
|
|
|
|
auto msg = "The feature vector has incorrect dimensions!";
|
|
|
|
|
std::cout<<msg;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
template <typename T>
|
|
|
|
|
void Engine<T>::transformOutput(std::vector<std::vector<std::vector<T>>> &input, std::vector<T> &output) {
|
|
|
|
|
if (input.size() != 1 || input[0].size() != 1) {
|
|
|
|
|
auto msg = "The feature vector has incorrect dimensions!";
|
|
|
|
|
std::cout<<msg;
|
|
|
|
|
}
|
|
|
|
|
output = std::move(input[0][0]);
|
|
|
|
|
}
|
|
|
|
|
template <typename T>
|
2026-04-04 20:19:54 +11:00
|
|
|
cv::cuda::GpuMat Engine<T>::resizeKeepAspectRatioPadRightBottom(const cv::cuda::GpuMat& input,
|
2026-03-28 16:54:11 +11:00
|
|
|
size_t height, size_t width,
|
|
|
|
|
const cv::Scalar& bgcolor) {
|
|
|
|
|
if (input.empty()) {
|
2026-04-04 20:19:54 +11:00
|
|
|
return cv::cuda::GpuMat();
|
2026-03-28 16:54:11 +11:00
|
|
|
}
|
2026-04-04 20:19:54 +11:00
|
|
|
|
|
|
|
|
// Use a thread_local stream to avoid creating a new CUDA stream per call.
|
|
|
|
|
// Creating cv::cuda::Stream() each call leaks stream handles under WDDM.
|
|
|
|
|
thread_local cv::cuda::Stream stream;
|
|
|
|
|
|
2026-03-28 16:54:11 +11:00
|
|
|
float r = std::min(static_cast<float>(width) / input.cols, static_cast<float>(height) / input.rows);
|
|
|
|
|
size_t unpad_w = static_cast<size_t>(r * input.cols);
|
|
|
|
|
size_t unpad_h = static_cast<size_t>(r * input.rows);
|
2026-04-04 20:19:54 +11:00
|
|
|
|
2026-03-28 16:54:11 +11:00
|
|
|
// Resize the input image
|
|
|
|
|
cv::cuda::GpuMat re;
|
2026-04-04 20:19:54 +11:00
|
|
|
re.create(static_cast<int>(unpad_h), static_cast<int>(unpad_w), input.type());
|
2026-03-28 16:54:11 +11:00
|
|
|
cv::cuda::resize(input, re, re.size(), 0, 0, cv::INTER_LINEAR, stream);
|
2026-04-04 20:19:54 +11:00
|
|
|
|
2026-03-28 16:54:11 +11:00
|
|
|
// Create the output image and fill with the background color
|
|
|
|
|
cv::cuda::GpuMat out;
|
2026-04-04 20:19:54 +11:00
|
|
|
out.create(static_cast<int>(height), static_cast<int>(width), input.type());
|
2026-03-28 16:54:11 +11:00
|
|
|
out.setTo(bgcolor, stream);
|
2026-04-04 20:19:54 +11:00
|
|
|
|
|
|
|
|
// Copy the resized content into the top-left corner
|
2026-03-28 16:54:11 +11:00
|
|
|
re.copyTo(out(cv::Rect(0, 0, re.cols, re.rows)), stream);
|
|
|
|
|
stream.waitForCompletion();
|
|
|
|
|
return out;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
template <typename T> void Engine<T>::getDeviceNames(std::vector<std::string> &deviceNames) {
    // Appends the human-readable name of every visible CUDA device.
    // BUG FIX: `numGPUs` was uninitialized; if cudaGetDeviceCount failed, the
    // loop bound was garbage. Initialize to 0 and check the CUDA return codes.
    int numGPUs = 0;
    if (cudaGetDeviceCount(&numGPUs) != cudaSuccess) {
        return; // no devices enumerable — leave the list untouched
    }
    for (int device = 0; device < numGPUs; device++) {
        cudaDeviceProp prop{};
        if (cudaGetDeviceProperties(&prop, device) == cudaSuccess) {
            deviceNames.emplace_back(prop.name);
        }
    }
}
|
|
|
|
|
template <typename T> int Engine<T>::getBindingIndexByName(const std::string& name) {
    // Linear scan over the engine's I/O tensors; returns the index of the
    // tensor whose name matches `name`, or -1 when no such tensor exists.
    const int tensorCount = m_engine->getNbIOTensors();
    for (int idx = 0; idx < tensorCount; ++idx) {
        if (name == m_engine->getIOTensorName(idx)) {
            return idx;
        }
    }
    return -1;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
//template <typename T> std::string Engine<T>::serializeEngineOptions(const ANSCENTER::Options &options, const std::string &onnxModelPath) {
|
|
|
|
|
// const auto filenamePos = onnxModelPath.find_last_of('/') + 1;
|
|
|
|
|
// std::string engineName = onnxModelPath.substr(filenamePos, onnxModelPath.find_last_of('.') - filenamePos) + ".engine";
|
|
|
|
|
//
|
|
|
|
|
// // Add the GPU device name to the file to ensure that the model is only used
|
|
|
|
|
// // on devices with the exact same GPU
|
|
|
|
|
// std::vector<std::string> deviceNames;
|
|
|
|
|
// getDeviceNames(deviceNames);
|
|
|
|
|
//
|
|
|
|
|
// if (static_cast<size_t>(options.deviceIndex) >= deviceNames.size()) {
|
|
|
|
|
// auto msg = "Error, provided device index is out of range!";
|
|
|
|
|
// std::cout<<msg;
|
|
|
|
|
// return "";
|
|
|
|
|
// }
|
|
|
|
|
//
|
|
|
|
|
// auto deviceName = deviceNames[options.deviceIndex];
|
|
|
|
|
// // Remove spaces from the device name
|
|
|
|
|
// deviceName.erase(std::remove_if(deviceName.begin(), deviceName.end(), ::isspace), deviceName.end());
|
|
|
|
|
// engineName += "." + deviceName;
|
|
|
|
|
// // Serialize the specified options into the filename
|
|
|
|
|
// if (options.precision == ANSCENTER::Precision::FP16) {
|
|
|
|
|
// engineName += ".fp16";
|
|
|
|
|
// } else if (options.precision == ANSCENTER::Precision::FP32) {
|
|
|
|
|
// engineName += ".fp32";
|
|
|
|
|
// } else {
|
|
|
|
|
// engineName += ".int8";
|
|
|
|
|
// }
|
|
|
|
|
// if (options.maxBatchSize > 1) {
|
|
|
|
|
// engineName += "." + std::to_string(options.maxBatchSize);
|
|
|
|
|
// }
|
|
|
|
|
// return engineName;
|
|
|
|
|
//}
|
|
|
|
|
|
|
|
|
|
template <typename T>
|
|
|
|
|
std::string Engine<T>::serializeEngineOptions(const ANSCENTER::Options& options,
|
|
|
|
|
const std::string& onnxModelPath)
|
|
|
|
|
{
|
|
|
|
|
// -- Base name from ONNX file ---------------------------------------------
|
|
|
|
|
const auto filenamePos = onnxModelPath.find_last_of('/') + 1;
|
|
|
|
|
std::string engineName = onnxModelPath.substr(
|
|
|
|
|
filenamePos,
|
|
|
|
|
onnxModelPath.find_last_of('.') - filenamePos) + ".engine";
|
|
|
|
|
|
|
|
|
|
// -- GPU device name ------------------------------------------------------
|
|
|
|
|
// Ensures the engine is only loaded on the exact GPU it was built for.
|
|
|
|
|
std::vector<std::string> deviceNames;
|
|
|
|
|
getDeviceNames(deviceNames);
|
|
|
|
|
if (static_cast<size_t>(options.deviceIndex) >= deviceNames.size()) {
|
|
|
|
|
std::cout << "Error, provided device index is out of range!";
|
|
|
|
|
return "";
|
|
|
|
|
}
|
|
|
|
|
auto deviceName = deviceNames[options.deviceIndex];
|
|
|
|
|
deviceName.erase(
|
|
|
|
|
std::remove_if(deviceName.begin(), deviceName.end(), ::isspace),
|
|
|
|
|
deviceName.end());
|
|
|
|
|
engineName += "." + deviceName;
|
|
|
|
|
|
|
|
|
|
// -- Precision ------------------------------------------------------------
|
|
|
|
|
if (options.precision == ANSCENTER::Precision::FP16) {
|
|
|
|
|
engineName += ".fp16";
|
|
|
|
|
}
|
|
|
|
|
else if (options.precision == ANSCENTER::Precision::FP32) {
|
|
|
|
|
engineName += ".fp32";
|
|
|
|
|
}
|
|
|
|
|
else {
|
|
|
|
|
engineName += ".int8";
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// -- Batch size -----------------------------------------------------------
|
|
|
|
|
if (options.maxBatchSize > 1) {
|
|
|
|
|
engineName += ".b" + std::to_string(options.maxBatchSize);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// -- Max spatial dims: intentionally NOT included in the filename ----------
|
|
|
|
|
// buildWithRetry() may reduce max dims (e.g. 2560→1920) when GPU memory
|
|
|
|
|
// is insufficient. If the filename included .s{H}x{W}, the next launch
|
|
|
|
|
// would look for .s2560x2560, miss the cached .s1920x1920, and waste
|
|
|
|
|
// minutes re-attempting the doomed 2560 build before falling back.
|
|
|
|
|
// Without the suffix, the cache is found immediately on the next launch.
|
|
|
|
|
// The actual profile max is queried at runtime via getProfileMaxHeight/Width.
|
|
|
|
|
|
|
|
|
|
// -- TensorRT version -----------------------------------------------------
|
|
|
|
|
// Engine format changes between TensorRT minor versions -- must rebuild.
|
|
|
|
|
// NV_TENSORRT_MAJOR, NV_TENSORRT_MINOR, NV_TENSORRT_PATCH are defined in
|
|
|
|
|
// <NvInferVersion.h> which is included via NvInfer.h.
|
|
|
|
|
engineName += ".trt"
|
|
|
|
|
+ std::to_string(NV_TENSORRT_MAJOR) + "."
|
|
|
|
|
+ std::to_string(NV_TENSORRT_MINOR) + "."
|
|
|
|
|
+ std::to_string(NV_TENSORRT_PATCH);
|
|
|
|
|
|
|
|
|
|
// -- CUDA runtime version -------------------------------------------------
|
|
|
|
|
// Engines built with different CUDA versions may use different PTX/cubin
|
|
|
|
|
// formats and must be rebuilt.
|
|
|
|
|
int cudaVersion = 0;
|
|
|
|
|
cudaRuntimeGetVersion(&cudaVersion);
|
|
|
|
|
const int cudaMajor = cudaVersion / 1000;
|
|
|
|
|
const int cudaMinor = (cudaVersion % 1000) / 10;
|
|
|
|
|
engineName += ".cuda"
|
|
|
|
|
+ std::to_string(cudaMajor) + "."
|
|
|
|
|
+ std::to_string(cudaMinor);
|
|
|
|
|
|
|
|
|
|
// -- cuDNN version --------------------------------------------------------
|
|
|
|
|
// cuDNN version affects layer implementations inside the engine.
|
|
|
|
|
// CUDNN_MAJOR, CUDNN_MINOR, CUDNN_PATCHLEVEL are defined in <cudnn_version.h>.
|
|
|
|
|
engineName += ".cudnn"
|
|
|
|
|
+ std::to_string(CUDNN_MAJOR) + "."
|
|
|
|
|
+ std::to_string(CUDNN_MINOR);
|
|
|
|
|
|
|
|
|
|
return engineName;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
template <typename T>
|
|
|
|
|
cv::cuda::GpuMat Engine<T>::blobFromGpuMats(const std::vector<cv::cuda::GpuMat> &batchInput, const std::array<float, 3> &subVals,
|
|
|
|
|
const std::array<float, 3> &divVals, bool normalize, bool swapRB,
|
|
|
|
|
cv::cuda::Stream &stream) {
|
|
|
|
|
cv::cuda::GpuMat result;
|
|
|
|
|
if (batchInput.empty()) return result;
|
|
|
|
|
if (batchInput[0].channels() != 3) return result;
|
|
|
|
|
|
|
|
|
|
const int H = batchInput[0].rows;
|
|
|
|
|
const int W = batchInput[0].cols;
|
|
|
|
|
const int batch = static_cast<int>(batchInput.size());
|
|
|
|
|
const size_t planeSize = static_cast<size_t>(H) * W; // pixels per channel
|
2026-04-04 20:19:54 +11:00
|
|
|
const int totalElems = batch * 3 * static_cast<int>(planeSize);
|
2026-03-28 16:54:11 +11:00
|
|
|
|
2026-04-04 20:19:54 +11:00
|
|
|
// thread_local cached buffers — reused across calls on the same thread.
|
|
|
|
|
// KEY: allocate for MAX seen size, never shrink. This prevents the VRAM leak
|
|
|
|
|
// caused by OpenCV's GpuMat pool growing unbounded when batch sizes alternate
|
|
|
|
|
// (e.g., batch=1,6,1,6 → each size triggers new alloc, old goes to pool, never freed).
|
|
|
|
|
thread_local cv::cuda::GpuMat tl_blob;
|
|
|
|
|
thread_local cv::cuda::GpuMat tl_floatImg;
|
|
|
|
|
thread_local int tl_blobMaxElems = 0;
|
|
|
|
|
|
|
|
|
|
if (totalElems > tl_blobMaxElems) {
|
|
|
|
|
tl_blob = cv::cuda::GpuMat(1, totalElems, CV_32FC1);
|
|
|
|
|
tl_blobMaxElems = totalElems;
|
|
|
|
|
size_t blobBytes = static_cast<size_t>(totalElems) * sizeof(float);
|
|
|
|
|
ANS_DBG("TRT_Preproc", "blobFromGpuMats: ALLOC blob batch=%d %dx%d %.1fMB (new max)",
|
|
|
|
|
batch, W, H, blobBytes / (1024.0 * 1024.0));
|
|
|
|
|
}
|
|
|
|
|
// Use a sub-region of the cached blob for the current batch
|
|
|
|
|
cv::cuda::GpuMat blob = tl_blob.colRange(0, totalElems);
|
2026-03-28 16:54:11 +11:00
|
|
|
|
|
|
|
|
for (int img = 0; img < batch; ++img) {
|
|
|
|
|
if (normalize) {
|
2026-04-04 20:19:54 +11:00
|
|
|
batchInput[img].convertTo(tl_floatImg, CV_32FC3, 1.f / 255.f, stream);
|
2026-03-28 16:54:11 +11:00
|
|
|
} else {
|
2026-04-04 20:19:54 +11:00
|
|
|
batchInput[img].convertTo(tl_floatImg, CV_32FC3, 1.0, stream);
|
2026-03-28 16:54:11 +11:00
|
|
|
}
|
|
|
|
|
|
2026-04-04 20:19:54 +11:00
|
|
|
cv::cuda::subtract(tl_floatImg, cv::Scalar(subVals[0], subVals[1], subVals[2]), tl_floatImg, cv::noArray(), -1, stream);
|
|
|
|
|
cv::cuda::divide(tl_floatImg, cv::Scalar(divVals[0], divVals[1], divVals[2]), tl_floatImg, 1, -1, stream);
|
2026-03-28 16:54:11 +11:00
|
|
|
|
|
|
|
|
// 2. Split normalised HWC image into CHW planes directly into the blob.
|
|
|
|
|
size_t offset = static_cast<size_t>(img) * 3 * planeSize;
|
|
|
|
|
|
|
|
|
|
if (swapRB) {
|
|
|
|
|
std::vector<cv::cuda::GpuMat> channels{
|
2026-04-04 20:19:54 +11:00
|
|
|
cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset + 2 * planeSize),
|
|
|
|
|
cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset + planeSize),
|
|
|
|
|
cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset)};
|
|
|
|
|
cv::cuda::split(tl_floatImg, channels, stream);
|
2026-03-28 16:54:11 +11:00
|
|
|
} else {
|
|
|
|
|
std::vector<cv::cuda::GpuMat> channels{
|
|
|
|
|
cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset),
|
|
|
|
|
cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset + planeSize),
|
|
|
|
|
cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset + 2 * planeSize)};
|
2026-04-04 20:19:54 +11:00
|
|
|
cv::cuda::split(tl_floatImg, channels, stream);
|
2026-03-28 16:54:11 +11:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return blob;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
template <typename T> void Engine<T>::clearGpuBuffers() {
    // Releases every device-side I/O buffer (inputs and outputs alike).
    if (!m_buffers.empty()) {
        for (auto *devPtr : m_buffers) {
            if (devPtr != nullptr) {
                Util::checkCudaErrorCode(cudaFree(devPtr));
            }
        }
        m_buffers.clear();
    }
    // Note: blob/floatImg caches are thread_local inside blobFromGpuMats (static method).
    // They are cleaned up automatically when threads exit.
    ANS_DBG("TRT_Engine", "clearGpuBuffers: I/O buffers released");
}
|