Use software decoder by default

This commit is contained in:
2026-04-04 20:19:54 +11:00
parent 3a21026790
commit e134ebdf15
24 changed files with 693 additions and 215 deletions

View File

@@ -24,28 +24,32 @@ void Engine<T>::transformOutput(std::vector<std::vector<std::vector<T>>> &input,
output = std::move(input[0][0]);
}
template <typename T>
cv::cuda::GpuMat Engine<T>::resizeKeepAspectRatioPadRightBottom(const cv::cuda::GpuMat& input,
cv::cuda::GpuMat Engine<T>::resizeKeepAspectRatioPadRightBottom(const cv::cuda::GpuMat& input,
size_t height, size_t width,
const cv::Scalar& bgcolor) {
// Ensure input is valid
if (input.empty()) {
return cv::cuda::GpuMat();
return cv::cuda::GpuMat();
}
// Create a CUDA stream
cv::cuda::Stream stream;
// Calculate aspect ratio and unpadded dimensions
// Use a thread_local stream to avoid creating a new CUDA stream per call.
// Creating cv::cuda::Stream() each call leaks stream handles under WDDM.
thread_local cv::cuda::Stream stream;
float r = std::min(static_cast<float>(width) / input.cols, static_cast<float>(height) / input.rows);
size_t unpad_w = static_cast<size_t>(r * input.cols);
size_t unpad_h = static_cast<size_t>(r * input.rows);
// Resize the input image
cv::cuda::GpuMat re;
re.create(unpad_h, unpad_w, input.type());
re.create(static_cast<int>(unpad_h), static_cast<int>(unpad_w), input.type());
cv::cuda::resize(input, re, re.size(), 0, 0, cv::INTER_LINEAR, stream);
// Create the output image and fill with the background color
cv::cuda::GpuMat out;
out.create(height, width, input.type());
out.create(static_cast<int>(height), static_cast<int>(width), input.type());
out.setTo(bgcolor, stream);
// Copy the resized content into the top-left corner of the output image
// Copy the resized content into the top-left corner
re.copyTo(out(cv::Rect(0, 0, re.cols, re.rows)), stream);
stream.waitForCompletion();
return out;
@@ -195,41 +199,51 @@ cv::cuda::GpuMat Engine<T>::blobFromGpuMats(const std::vector<cv::cuda::GpuMat>
const int W = batchInput[0].cols;
const int batch = static_cast<int>(batchInput.size());
const size_t planeSize = static_cast<size_t>(H) * W; // pixels per channel
const int totalElems = batch * 3 * static_cast<int>(planeSize);
// Output blob: planar NCHW layout stored as a single-channel GpuMat.
// Total elements = batch * 3 * H * W.
cv::cuda::GpuMat blob(1, batch * 3 * static_cast<int>(planeSize), CV_32FC1);
// thread_local cached buffers — reused across calls on the same thread.
// KEY: the blob is sized for the largest request seen so far and never shrinks. This prevents the VRAM leak
// caused by OpenCV's GpuMat pool growing unbounded when batch sizes alternate
// (e.g., batch=1,6,1,6 → each size triggers a new allocation; the old one goes to the pool and is never freed).
thread_local cv::cuda::GpuMat tl_blob;
thread_local cv::cuda::GpuMat tl_floatImg;
thread_local int tl_blobMaxElems = 0;
if (totalElems > tl_blobMaxElems) {
tl_blob = cv::cuda::GpuMat(1, totalElems, CV_32FC1);
tl_blobMaxElems = totalElems;
size_t blobBytes = static_cast<size_t>(totalElems) * sizeof(float);
ANS_DBG("TRT_Preproc", "blobFromGpuMats: ALLOC blob batch=%d %dx%d %.1fMB (new max)",
batch, W, H, blobBytes / (1024.0 * 1024.0));
}
// Use a sub-region of the cached blob for the current batch
cv::cuda::GpuMat blob = tl_blob.colRange(0, totalElems);
for (int img = 0; img < batch; ++img) {
// 1. Convert to float and normalise while still in HWC (interleaved) format.
// Channel-wise subtract / divide operate correctly on interleaved data.
cv::cuda::GpuMat floatImg;
if (normalize) {
batchInput[img].convertTo(floatImg, CV_32FC3, 1.f / 255.f, stream);
batchInput[img].convertTo(tl_floatImg, CV_32FC3, 1.f / 255.f, stream);
} else {
batchInput[img].convertTo(floatImg, CV_32FC3, 1.0, stream);
batchInput[img].convertTo(tl_floatImg, CV_32FC3, 1.0, stream);
}
cv::cuda::subtract(floatImg, cv::Scalar(subVals[0], subVals[1], subVals[2]), floatImg, cv::noArray(), -1, stream);
cv::cuda::divide(floatImg, cv::Scalar(divVals[0], divVals[1], divVals[2]), floatImg, 1, -1, stream);
cv::cuda::subtract(tl_floatImg, cv::Scalar(subVals[0], subVals[1], subVals[2]), tl_floatImg, cv::noArray(), -1, stream);
cv::cuda::divide(tl_floatImg, cv::Scalar(divVals[0], divVals[1], divVals[2]), tl_floatImg, 1, -1, stream);
// 2. Split normalised HWC image into CHW planes directly into the blob.
size_t offset = static_cast<size_t>(img) * 3 * planeSize;
if (swapRB) {
// BGR input -> RGB planes: B goes to plane 2, G to plane 1, R to plane 0
std::vector<cv::cuda::GpuMat> channels{
cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset + 2 * planeSize), // B -> plane 2
cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset + planeSize), // G -> plane 1
cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset)}; // R -> plane 0
cv::cuda::split(floatImg, channels, stream);
cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset + 2 * planeSize),
cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset + planeSize),
cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset)};
cv::cuda::split(tl_floatImg, channels, stream);
} else {
// BGR input -> BGR planes: keep channel order
std::vector<cv::cuda::GpuMat> channels{
cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset),
cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset + planeSize),
cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset + 2 * planeSize)};
cv::cuda::split(floatImg, channels, stream);
cv::cuda::split(tl_floatImg, channels, stream);
}
}
@@ -239,7 +253,6 @@ cv::cuda::GpuMat Engine<T>::blobFromGpuMats(const std::vector<cv::cuda::GpuMat>
template <typename T> void Engine<T>::clearGpuBuffers() {
if (!m_buffers.empty()) {
// Free ALL I/O GPU buffers (both inputs and outputs).
// Previously only outputs were freed, leaking input allocations from loadNetwork().
for (void* ptr : m_buffers) {
if (ptr) {
Util::checkCudaErrorCode(cudaFree(ptr));
@@ -247,4 +260,8 @@ template <typename T> void Engine<T>::clearGpuBuffers() {
}
m_buffers.clear();
}
// Note: the blob/floatImg caches are thread_local variables inside blobFromGpuMats (static method),
// so they are not released here; they are destroyed automatically when the owning thread exits.
ANS_DBG("TRT_Engine", "clearGpuBuffers: I/O buffers released");
}