Files
ANSCORE/engines/TensorRTAPI/include/engine/EngineUtilities.inl

291 lines
12 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#pragma once
#include <filesystem>
#include <NvInfer.h> // NV_TENSORRT_MAJOR/MINOR/PATCH
#include <NvInferVersion.h> // also defines TRT version macros
#include <cudnn_version.h> // CUDNN_MAJOR/MINOR/PATCHLEVEL
#include <cuda_runtime.h> // cudaRuntimeGetVersion
template <typename T>
void Engine<T>::transformOutput(std::vector<std::vector<std::vector<T>>> &input, std::vector<std::vector<T>> &output) {
    // Collapses a single-batch result [1][numOutputs][numElems] into
    // [numOutputs][numElems] by moving the inner vectors (input is consumed).
    // On a dimension mismatch, `output` is left untouched and an error is logged.
    if (input.size() != 1) {
        // Fix: the original wrote the message with no newline and no flush,
        // so it could be lost or interleaved with other output.
        std::cout << "The feature vector has incorrect dimensions!" << std::endl;
        return;
    }
    output = std::move(input[0]);
}
template <typename T>
void Engine<T>::transformOutput(std::vector<std::vector<std::vector<T>>> &input, std::vector<T> &output) {
    // Collapses a single-batch, single-output result [1][1][numElems] into
    // [numElems] by moving the innermost vector (input is consumed).
    //
    // Bug fix: the original printed the error but then unconditionally
    // executed `output = std::move(input[0][0])`, which is out-of-bounds
    // undefined behavior whenever input (or input[0]) is empty. Return early
    // instead, leaving `output` untouched on a dimension mismatch.
    if (input.size() != 1 || input[0].size() != 1) {
        std::cout << "The feature vector has incorrect dimensions!" << std::endl;
        return;
    }
    output = std::move(input[0][0]);
}
// CPU letterbox resize — same logic as the GPU version but runs on CPU.
// Used in Preprocess to resize BEFORE GPU upload, reducing PCIe transfer
// from 25 MB (4K) to 1.2 MB (640×640) — 20x less bandwidth.
template <typename T>
cv::Mat Engine<T>::cpuResizeKeepAspectRatioPadRightBottom(const cv::Mat& input,
                                                          size_t height, size_t width,
                                                          const cv::Scalar& bgcolor) {
    // Empty input -> empty output, mirroring the GPU variant.
    if (input.empty()) {
        return cv::Mat();
    }
    // Largest scale that fits the image inside (width x height) without distortion.
    const float scale = std::min(static_cast<float>(width) / input.cols,
                                 static_cast<float>(height) / input.rows);
    const int scaledW = static_cast<int>(scale * input.cols);
    const int scaledH = static_cast<int>(scale * input.rows);
    // Aspect-preserving resize, then paste into a bgcolor-filled canvas so
    // all padding ends up on the right/bottom edges.
    cv::Mat resized;
    cv::resize(input, resized, cv::Size(scaledW, scaledH), 0, 0, cv::INTER_LINEAR);
    cv::Mat canvas(static_cast<int>(height), static_cast<int>(width), input.type(), bgcolor);
    resized.copyTo(canvas(cv::Rect(0, 0, resized.cols, resized.rows)));
    return canvas;
}
template <typename T>
cv::cuda::GpuMat Engine<T>::resizeKeepAspectRatioPadRightBottom(const cv::cuda::GpuMat& input,
                                                                size_t height, size_t width,
                                                                const cv::Scalar& bgcolor) {
    // GPU letterbox: scale to fit inside (width x height), pad right/bottom
    // with bgcolor. Synchronous from the caller's point of view — the stream
    // is drained before returning.
    if (input.empty()) {
        return cv::cuda::GpuMat();
    }
    // One stream per thread, reused across calls: constructing a fresh
    // cv::cuda::Stream on every invocation leaks stream handles under WDDM.
    thread_local cv::cuda::Stream stream;
    const float scale = std::min(static_cast<float>(width) / input.cols,
                                 static_cast<float>(height) / input.rows);
    const size_t scaledW = static_cast<size_t>(scale * input.cols);
    const size_t scaledH = static_cast<size_t>(scale * input.rows);
    // Aspect-preserving resize into a pre-created buffer.
    cv::cuda::GpuMat resized;
    resized.create(static_cast<int>(scaledH), static_cast<int>(scaledW), input.type());
    cv::cuda::resize(input, resized, resized.size(), 0, 0, cv::INTER_LINEAR, stream);
    // Background-filled canvas; resized content lands in the top-left corner.
    cv::cuda::GpuMat canvas;
    canvas.create(static_cast<int>(height), static_cast<int>(width), input.type());
    canvas.setTo(bgcolor, stream);
    resized.copyTo(canvas(cv::Rect(0, 0, resized.cols, resized.rows)), stream);
    stream.waitForCompletion();
    return canvas;
}
template <typename T> void Engine<T>::getDeviceNames(std::vector<std::string> &deviceNames) {
    // Appends the name of every visible CUDA device to deviceNames, in device
    // index order (element i corresponds to CUDA device i).
    //
    // Bug fix: numGPUs was left uninitialized and the cudaGetDeviceCount
    // return code was ignored — on failure the loop bound was garbage.
    // Likewise, an unchecked cudaGetDeviceProperties pushed an uninitialized
    // prop.name. Initialize to 0, check both calls, and keep index alignment
    // with a placeholder when a query fails (deviceIndex indexes this list).
    int numGPUs = 0;
    if (cudaGetDeviceCount(&numGPUs) != cudaSuccess) {
        return; // no devices discoverable — leave the list unchanged
    }
    for (int device = 0; device < numGPUs; device++) {
        cudaDeviceProp prop{};
        if (cudaGetDeviceProperties(&prop, device) == cudaSuccess) {
            deviceNames.push_back(std::string(prop.name));
        } else {
            deviceNames.push_back("UnknownDevice"); // preserve index alignment
        }
    }
}
template <typename T> int Engine<T>::getBindingIndexByName(const std::string& name) {
    // Linear scan over the engine's I/O tensors; returns the index of the
    // tensor whose name matches exactly, or -1 when no such tensor exists.
    const int tensorCount = m_engine->getNbIOTensors();
    for (int idx = 0; idx < tensorCount; ++idx) {
        if (name == m_engine->getIOTensorName(idx)) {
            return idx;
        }
    }
    return -1;
}
//template <typename T> std::string Engine<T>::serializeEngineOptions(const ANSCENTER::Options &options, const std::string &onnxModelPath) {
// const auto filenamePos = onnxModelPath.find_last_of('/') + 1;
// std::string engineName = onnxModelPath.substr(filenamePos, onnxModelPath.find_last_of('.') - filenamePos) + ".engine";
//
// // Add the GPU device name to the file to ensure that the model is only used
// // on devices with the exact same GPU
// std::vector<std::string> deviceNames;
// getDeviceNames(deviceNames);
//
// if (static_cast<size_t>(options.deviceIndex) >= deviceNames.size()) {
// auto msg = "Error, provided device index is out of range!";
// std::cout<<msg;
// return "";
// }
//
// auto deviceName = deviceNames[options.deviceIndex];
// // Remove spaces from the device name
// deviceName.erase(std::remove_if(deviceName.begin(), deviceName.end(), ::isspace), deviceName.end());
// engineName += "." + deviceName;
// // Serialize the specified options into the filename
// if (options.precision == ANSCENTER::Precision::FP16) {
// engineName += ".fp16";
// } else if (options.precision == ANSCENTER::Precision::FP32) {
// engineName += ".fp32";
// } else {
// engineName += ".int8";
// }
// if (options.maxBatchSize > 1) {
// engineName += "." + std::to_string(options.maxBatchSize);
// }
// return engineName;
//}
template <typename T>
std::string Engine<T>::serializeEngineOptions(const ANSCENTER::Options& options,
                                              const std::string& onnxModelPath)
{
    // Builds the cache filename for a serialized engine. Everything the build
    // depends on is encoded in the name so a stale cache is never loaded:
    // model stem, GPU name, precision, batch size, TRT/CUDA/cuDNN versions.
    // Returns "" when options.deviceIndex is out of range.
    //
    // -- Base name from ONNX file ---------------------------------------------
    // Bug fix: the original hand-parsed with find_last_of('/') / '.', which
    // (a) kept the directory prefix on Windows '\\'-separated paths and
    // (b) could truncate at a '.' inside a directory name when the file had
    // no extension. std::filesystem::path::stem (header already included)
    // handles both separator styles and extension stripping correctly.
    std::string engineName = std::filesystem::path(onnxModelPath).stem().string() + ".engine";
    // -- GPU device name ------------------------------------------------------
    // Ensures the engine is only loaded on the exact GPU it was built for.
    std::vector<std::string> deviceNames;
    getDeviceNames(deviceNames);
    if (static_cast<size_t>(options.deviceIndex) >= deviceNames.size()) {
        std::cout << "Error, provided device index is out of range!";
        return "";
    }
    auto deviceName = deviceNames[options.deviceIndex];
    // Strip whitespace so the device name is filename-safe. Cast through
    // unsigned char: ::isspace on a negative plain char is UB.
    deviceName.erase(
        std::remove_if(deviceName.begin(), deviceName.end(),
                       [](unsigned char c) { return std::isspace(c) != 0; }),
        deviceName.end());
    engineName += "." + deviceName;
    // -- Precision ------------------------------------------------------------
    if (options.precision == ANSCENTER::Precision::FP16) {
        engineName += ".fp16";
    }
    else if (options.precision == ANSCENTER::Precision::FP32) {
        engineName += ".fp32";
    }
    else {
        engineName += ".int8";
    }
    // -- Batch size -----------------------------------------------------------
    if (options.maxBatchSize > 1) {
        engineName += ".b" + std::to_string(options.maxBatchSize);
    }
    // -- Max spatial dims: intentionally NOT included in the filename ----------
    // buildWithRetry() may reduce max dims (e.g. 2560→1920) when GPU memory
    // is insufficient. If the filename included .s{H}x{W}, the next launch
    // would look for .s2560x2560, miss the cached .s1920x1920, and waste
    // minutes re-attempting the doomed 2560 build before falling back.
    // Without the suffix, the cache is found immediately on the next launch.
    // The actual profile max is queried at runtime via getProfileMaxHeight/Width.
    // -- TensorRT version -----------------------------------------------------
    // Engine format changes between TensorRT minor versions -- must rebuild.
    // NV_TENSORRT_MAJOR/MINOR/PATCH come from <NvInferVersion.h> via NvInfer.h.
    engineName += ".trt"
        + std::to_string(NV_TENSORRT_MAJOR) + "."
        + std::to_string(NV_TENSORRT_MINOR) + "."
        + std::to_string(NV_TENSORRT_PATCH);
    // -- CUDA runtime version -------------------------------------------------
    // Engines built with different CUDA versions may use different PTX/cubin
    // formats and must be rebuilt. cudaVersion stays 0 if the query fails,
    // yielding a stable ".cuda0.0" suffix rather than garbage.
    int cudaVersion = 0;
    cudaRuntimeGetVersion(&cudaVersion);
    const int cudaMajor = cudaVersion / 1000;
    const int cudaMinor = (cudaVersion % 1000) / 10;
    engineName += ".cuda"
        + std::to_string(cudaMajor) + "."
        + std::to_string(cudaMinor);
    // -- cuDNN version --------------------------------------------------------
    // cuDNN version affects layer implementations inside the engine.
    // CUDNN_MAJOR/MINOR are defined in <cudnn_version.h>.
    engineName += ".cudnn"
        + std::to_string(CUDNN_MAJOR) + "."
        + std::to_string(CUDNN_MINOR);
    return engineName;
}
// Packs a batch of equally-sized 3-channel GPU images into one contiguous
// NCHW float blob (a 1 x N*3*H*W CV_32FC1 GpuMat): optional 1/255 scaling
// (normalize), per-channel mean subtraction (subVals), per-channel division
// (divVals), and optional R<->B plane swap (swapRB). All work is queued on
// `stream` asynchronously — the caller must synchronise before reading.
//
// IMPORTANT: the returned GpuMat is a VIEW into a thread_local cache. It is
// valid only until the next call from the same thread and must not be freed
// by the caller. Returns an empty GpuMat for an empty batch or when the
// first image is not 3-channel.
//
// NOTE(review): assumes every image in batchInput matches batchInput[0] in
// size and type — only element 0 is checked here; confirm at the call site.
template <typename T>
cv::cuda::GpuMat Engine<T>::blobFromGpuMats(const std::vector<cv::cuda::GpuMat> &batchInput, const std::array<float, 3> &subVals,
const std::array<float, 3> &divVals, bool normalize, bool swapRB,
cv::cuda::Stream &stream) {
cv::cuda::GpuMat result;
if (batchInput.empty()) return result;
if (batchInput[0].channels() != 3) return result;
const int H = batchInput[0].rows;
const int W = batchInput[0].cols;
const int batch = static_cast<int>(batchInput.size());
const size_t planeSize = static_cast<size_t>(H) * W; // pixels per channel
const int totalElems = batch * 3 * static_cast<int>(planeSize);
// thread_local cached buffers — reused across calls on the same thread.
// KEY: allocate for MAX seen size, never shrink. This prevents the VRAM leak
// caused by OpenCV's GpuMat pool growing unbounded when batch sizes alternate
// (e.g., batch=1,6,1,6 → each size triggers new alloc, old goes to pool, never freed).
thread_local cv::cuda::GpuMat tl_blob;
thread_local cv::cuda::GpuMat tl_floatImg;
thread_local int tl_blobMaxElems = 0;
if (totalElems > tl_blobMaxElems) {
// Grow-only reallocation: only hit when a bigger batch/resolution appears.
tl_blob = cv::cuda::GpuMat(1, totalElems, CV_32FC1);
tl_blobMaxElems = totalElems;
size_t blobBytes = static_cast<size_t>(totalElems) * sizeof(float);
ANS_DBG("TRT_Preproc", "blobFromGpuMats: ALLOC blob batch=%d %dx%d %.1fMB (new max)",
batch, W, H, blobBytes / (1024.0 * 1024.0));
}
// Use a sub-region of the cached blob for the current batch
cv::cuda::GpuMat blob = tl_blob.colRange(0, totalElems);
for (int img = 0; img < batch; ++img) {
// 1. Convert to float, optionally scaling by 1/255, then normalise.
// tl_floatImg is reused in place across iterations and calls.
if (normalize) {
batchInput[img].convertTo(tl_floatImg, CV_32FC3, 1.f / 255.f, stream);
} else {
batchInput[img].convertTo(tl_floatImg, CV_32FC3, 1.0, stream);
}
cv::cuda::subtract(tl_floatImg, cv::Scalar(subVals[0], subVals[1], subVals[2]), tl_floatImg, cv::noArray(), -1, stream);
cv::cuda::divide(tl_floatImg, cv::Scalar(divVals[0], divVals[1], divVals[2]), tl_floatImg, 1, -1, stream);
// 2. Split normalised HWC image into CHW planes directly into the blob.
// The channel GpuMats below are zero-copy headers aliasing this image's
// 3*planeSize slot inside the blob, so cv::cuda::split writes each plane
// straight into its final NCHW position — no intermediate copy.
size_t offset = static_cast<size_t>(img) * 3 * planeSize;
if (swapRB) {
// Reversed plane order: source channel 0 lands in blob plane 2 and
// vice versa (e.g. BGR source -> RGB blob).
std::vector<cv::cuda::GpuMat> channels{
cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset + 2 * planeSize),
cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset + planeSize),
cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset)};
cv::cuda::split(tl_floatImg, channels, stream);
} else {
// Identity plane order: channel i -> blob plane i.
std::vector<cv::cuda::GpuMat> channels{
cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset),
cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset + planeSize),
cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset + 2 * planeSize)};
cv::cuda::split(tl_floatImg, channels, stream);
}
}
return blob;
}
template <typename T> void Engine<T>::clearGpuBuffers() {
    // Releases every device-side I/O buffer (inputs and outputs alike) and
    // empties the buffer list. Safe to call when the list is already empty.
    for (void* devPtr : m_buffers) {
        if (devPtr != nullptr) {
            Util::checkCudaErrorCode(cudaFree(devPtr));
        }
    }
    m_buffers.clear();
    // The blob/floatImg caches are thread_locals inside blobFromGpuMats
    // (a static method) and are torn down automatically on thread exit.
    ANS_DBG("TRT_Engine", "clearGpuBuffers: I/O buffers released");
}