Use software decoder by default
This commit is contained in:
@@ -24,28 +24,32 @@ void Engine<T>::transformOutput(std::vector<std::vector<std::vector<T>>> &input,
|
||||
output = std::move(input[0][0]);
|
||||
}
|
||||
template <typename T>
|
||||
cv::cuda::GpuMat Engine<T>::resizeKeepAspectRatioPadRightBottom(const cv::cuda::GpuMat& input,
|
||||
cv::cuda::GpuMat Engine<T>::resizeKeepAspectRatioPadRightBottom(const cv::cuda::GpuMat& input,
|
||||
size_t height, size_t width,
|
||||
const cv::Scalar& bgcolor) {
|
||||
// Ensure input is valid
|
||||
if (input.empty()) {
|
||||
return cv::cuda::GpuMat();
|
||||
return cv::cuda::GpuMat();
|
||||
}
|
||||
// Create a CUDA stream
|
||||
cv::cuda::Stream stream;
|
||||
// Calculate aspect ratio and unpadded dimensions
|
||||
|
||||
// Use a thread_local stream to avoid creating a new CUDA stream per call.
|
||||
// Creating cv::cuda::Stream() on each call leaks stream handles under WDDM.
|
||||
thread_local cv::cuda::Stream stream;
|
||||
|
||||
float r = std::min(static_cast<float>(width) / input.cols, static_cast<float>(height) / input.rows);
|
||||
size_t unpad_w = static_cast<size_t>(r * input.cols);
|
||||
size_t unpad_h = static_cast<size_t>(r * input.rows);
|
||||
|
||||
// Resize the input image
|
||||
cv::cuda::GpuMat re;
|
||||
re.create(unpad_h, unpad_w, input.type());
|
||||
re.create(static_cast<int>(unpad_h), static_cast<int>(unpad_w), input.type());
|
||||
cv::cuda::resize(input, re, re.size(), 0, 0, cv::INTER_LINEAR, stream);
|
||||
|
||||
// Create the output image and fill with the background color
|
||||
cv::cuda::GpuMat out;
|
||||
out.create(height, width, input.type());
|
||||
out.create(static_cast<int>(height), static_cast<int>(width), input.type());
|
||||
out.setTo(bgcolor, stream);
|
||||
// Copy the resized content into the top-left corner of the output image
|
||||
|
||||
// Copy the resized content into the top-left corner
|
||||
re.copyTo(out(cv::Rect(0, 0, re.cols, re.rows)), stream);
|
||||
stream.waitForCompletion();
|
||||
return out;
|
||||
@@ -195,41 +199,51 @@ cv::cuda::GpuMat Engine<T>::blobFromGpuMats(const std::vector<cv::cuda::GpuMat>
|
||||
const int W = batchInput[0].cols;
|
||||
const int batch = static_cast<int>(batchInput.size());
|
||||
const size_t planeSize = static_cast<size_t>(H) * W; // pixels per channel
|
||||
const int totalElems = batch * 3 * static_cast<int>(planeSize);
|
||||
|
||||
// Output blob: planar NCHW layout stored as a single-channel GpuMat.
|
||||
// Total elements = batch * 3 * H * W.
|
||||
cv::cuda::GpuMat blob(1, batch * 3 * static_cast<int>(planeSize), CV_32FC1);
|
||||
// thread_local cached buffers — reused across calls on the same thread.
|
||||
// KEY: allocate for MAX seen size, never shrink. This prevents the VRAM leak
|
||||
// caused by OpenCV's GpuMat pool growing unbounded when batch sizes alternate
|
||||
// (e.g., batch=1,6,1,6 → each size triggers new alloc, old goes to pool, never freed).
|
||||
thread_local cv::cuda::GpuMat tl_blob;
|
||||
thread_local cv::cuda::GpuMat tl_floatImg;
|
||||
thread_local int tl_blobMaxElems = 0;
|
||||
|
||||
if (totalElems > tl_blobMaxElems) {
|
||||
tl_blob = cv::cuda::GpuMat(1, totalElems, CV_32FC1);
|
||||
tl_blobMaxElems = totalElems;
|
||||
size_t blobBytes = static_cast<size_t>(totalElems) * sizeof(float);
|
||||
ANS_DBG("TRT_Preproc", "blobFromGpuMats: ALLOC blob batch=%d %dx%d %.1fMB (new max)",
|
||||
batch, W, H, blobBytes / (1024.0 * 1024.0));
|
||||
}
|
||||
// Use a sub-region of the cached blob for the current batch
|
||||
cv::cuda::GpuMat blob = tl_blob.colRange(0, totalElems);
|
||||
|
||||
for (int img = 0; img < batch; ++img) {
|
||||
// 1. Convert to float and normalise while still in HWC (interleaved) format.
|
||||
// Channel-wise subtract / divide operate correctly on interleaved data.
|
||||
cv::cuda::GpuMat floatImg;
|
||||
if (normalize) {
|
||||
batchInput[img].convertTo(floatImg, CV_32FC3, 1.f / 255.f, stream);
|
||||
batchInput[img].convertTo(tl_floatImg, CV_32FC3, 1.f / 255.f, stream);
|
||||
} else {
|
||||
batchInput[img].convertTo(floatImg, CV_32FC3, 1.0, stream);
|
||||
batchInput[img].convertTo(tl_floatImg, CV_32FC3, 1.0, stream);
|
||||
}
|
||||
|
||||
cv::cuda::subtract(floatImg, cv::Scalar(subVals[0], subVals[1], subVals[2]), floatImg, cv::noArray(), -1, stream);
|
||||
cv::cuda::divide(floatImg, cv::Scalar(divVals[0], divVals[1], divVals[2]), floatImg, 1, -1, stream);
|
||||
cv::cuda::subtract(tl_floatImg, cv::Scalar(subVals[0], subVals[1], subVals[2]), tl_floatImg, cv::noArray(), -1, stream);
|
||||
cv::cuda::divide(tl_floatImg, cv::Scalar(divVals[0], divVals[1], divVals[2]), tl_floatImg, 1, -1, stream);
|
||||
|
||||
// 2. Split normalised HWC image into CHW planes directly into the blob.
|
||||
size_t offset = static_cast<size_t>(img) * 3 * planeSize;
|
||||
|
||||
if (swapRB) {
|
||||
// BGR input -> RGB planes: B goes to plane 2, G to plane 1, R to plane 0
|
||||
std::vector<cv::cuda::GpuMat> channels{
|
||||
cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset + 2 * planeSize), // B -> plane 2
|
||||
cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset + planeSize), // G -> plane 1
|
||||
cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset)}; // R -> plane 0
|
||||
cv::cuda::split(floatImg, channels, stream);
|
||||
cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset + 2 * planeSize),
|
||||
cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset + planeSize),
|
||||
cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset)};
|
||||
cv::cuda::split(tl_floatImg, channels, stream);
|
||||
} else {
|
||||
// BGR input -> BGR planes: keep channel order
|
||||
std::vector<cv::cuda::GpuMat> channels{
|
||||
cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset),
|
||||
cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset + planeSize),
|
||||
cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset + 2 * planeSize)};
|
||||
cv::cuda::split(floatImg, channels, stream);
|
||||
cv::cuda::split(tl_floatImg, channels, stream);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -239,7 +253,6 @@ cv::cuda::GpuMat Engine<T>::blobFromGpuMats(const std::vector<cv::cuda::GpuMat>
|
||||
template <typename T> void Engine<T>::clearGpuBuffers() {
|
||||
if (!m_buffers.empty()) {
|
||||
// Free ALL I/O GPU buffers (both inputs and outputs).
|
||||
// Previously only outputs were freed, leaking input allocations from loadNetwork().
|
||||
for (void* ptr : m_buffers) {
|
||||
if (ptr) {
|
||||
Util::checkCudaErrorCode(cudaFree(ptr));
|
||||
@@ -247,4 +260,8 @@ template <typename T> void Engine<T>::clearGpuBuffers() {
|
||||
}
|
||||
m_buffers.clear();
|
||||
}
|
||||
|
||||
// Note: the blob/floatImg caches are thread_local inside blobFromGpuMats (static method),
|
||||
// so they are not released here; they are cleaned up automatically when their owning threads exit.
|
||||
ANS_DBG("TRT_Engine", "clearGpuBuffers: I/O buffers released");
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user