Use software decoder by default

2026-04-04 20:19:54 +11:00
parent 3a21026790
commit e134ebdf15
24 changed files with 693 additions and 215 deletions
--- a/engines/TensorRTAPI/include/engine/EngineUtilities.inl
+++ b/engines/TensorRTAPI/include/engine/EngineUtilities.inl
@@ -24,28 +24,32 @@ void Engine<T>::transformOutput(std::vector<std::vector<std::vector<T>>> &input,
    output = std::move(input[0][0]);
 }
 template <typename T>
-cv::cuda::GpuMat Engine<T>::resizeKeepAspectRatioPadRightBottom(const cv::cuda::GpuMat& input, 
+cv::cuda::GpuMat Engine<T>::resizeKeepAspectRatioPadRightBottom(const cv::cuda::GpuMat& input,
                                                                size_t height, size_t width,
                                                                const cv::Scalar& bgcolor) {
-    // Ensure input is valid
    if (input.empty()) {
-		return cv::cuda::GpuMat();
+        return cv::cuda::GpuMat();
    }
-    // Create a CUDA stream
-    cv::cuda::Stream stream;
-    // Calculate aspect ratio and unpadded dimensions
+
+    // Use a thread_local stream to avoid creating a new CUDA stream on every call.
+    // Creating a cv::cuda::Stream per call leaks stream handles under WDDM.
+    thread_local cv::cuda::Stream stream;
+
    float r = std::min(static_cast<float>(width) / input.cols, static_cast<float>(height) / input.rows);
    size_t unpad_w = static_cast<size_t>(r * input.cols);
    size_t unpad_h = static_cast<size_t>(r * input.rows);
+
    // Resize the input image
    cv::cuda::GpuMat re;
-    re.create(unpad_h, unpad_w, input.type());
+    re.create(static_cast<int>(unpad_h), static_cast<int>(unpad_w), input.type());
    cv::cuda::resize(input, re, re.size(), 0, 0, cv::INTER_LINEAR, stream);
+
    // Create the output image and fill with the background color
    cv::cuda::GpuMat out;
-    out.create(height, width, input.type());
+    out.create(static_cast<int>(height), static_cast<int>(width), input.type());
    out.setTo(bgcolor, stream);
-    // Copy the resized content into the top-left corner of the output image
+
+    // Copy the resized content into the top-left corner
    re.copyTo(out(cv::Rect(0, 0, re.cols, re.rows)), stream);
    stream.waitForCompletion();
    return out;
@@ -195,41 +199,51 @@ cv::cuda::GpuMat Engine<T>::blobFromGpuMats(const std::vector<cv::cuda::GpuMat>
    const int W     = batchInput[0].cols;
    const int batch = static_cast<int>(batchInput.size());
    const size_t planeSize = static_cast<size_t>(H) * W;   // pixels per channel
+    const int totalElems = batch * 3 * static_cast<int>(planeSize);

-    // Output blob: planar NCHW layout stored as a single-channel GpuMat.
-    // Total elements = batch * 3 * H * W.
-    cv::cuda::GpuMat blob(1, batch * 3 * static_cast<int>(planeSize), CV_32FC1);
+    // thread_local cached buffers — reused across calls on the same thread.
+    // KEY: tl_blob is allocated for the MAX size seen and never shrinks. This prevents
+    // the VRAM leak caused by OpenCV's GpuMat pool growing unbounded when batch sizes
+    // alternate (e.g., batch=1,6,1,6 → each size triggers a new alloc; the old one goes to the pool and is never freed).
+    thread_local cv::cuda::GpuMat tl_blob;
+    thread_local cv::cuda::GpuMat tl_floatImg;
+    thread_local int tl_blobMaxElems = 0;
+
+    if (totalElems > tl_blobMaxElems) {
+        tl_blob = cv::cuda::GpuMat(1, totalElems, CV_32FC1);
+        tl_blobMaxElems = totalElems;
+        size_t blobBytes = static_cast<size_t>(totalElems) * sizeof(float);
+        ANS_DBG("TRT_Preproc", "blobFromGpuMats: ALLOC blob batch=%d %dx%d %.1fMB (new max)",
+                batch, W, H, blobBytes / (1024.0 * 1024.0));
+    }
+    // Use a sub-region of the cached blob for the current batch
+    cv::cuda::GpuMat blob = tl_blob.colRange(0, totalElems);

    for (int img = 0; img < batch; ++img) {
-        // 1. Convert to float and normalise while still in HWC (interleaved) format.
-        //    Channel-wise subtract / divide operate correctly on interleaved data.
-        cv::cuda::GpuMat floatImg;
        if (normalize) {
-            batchInput[img].convertTo(floatImg, CV_32FC3, 1.f / 255.f, stream);
+            batchInput[img].convertTo(tl_floatImg, CV_32FC3, 1.f / 255.f, stream);
        } else {
-            batchInput[img].convertTo(floatImg, CV_32FC3, 1.0, stream);
+            batchInput[img].convertTo(tl_floatImg, CV_32FC3, 1.0, stream);
        }

-        cv::cuda::subtract(floatImg, cv::Scalar(subVals[0], subVals[1], subVals[2]), floatImg, cv::noArray(), -1, stream);
-        cv::cuda::divide(floatImg, cv::Scalar(divVals[0], divVals[1], divVals[2]), floatImg, 1, -1, stream);
+        cv::cuda::subtract(tl_floatImg, cv::Scalar(subVals[0], subVals[1], subVals[2]), tl_floatImg, cv::noArray(), -1, stream);
+        cv::cuda::divide(tl_floatImg, cv::Scalar(divVals[0], divVals[1], divVals[2]), tl_floatImg, 1, -1, stream);

        // 2. Split normalised HWC image into CHW planes directly into the blob.
        size_t offset = static_cast<size_t>(img) * 3 * planeSize;

        if (swapRB) {
-            // BGR input -> RGB planes: B goes to plane 2, G to plane 1, R to plane 0
            std::vector<cv::cuda::GpuMat> channels{
-                cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset + 2 * planeSize),  // B -> plane 2
-                cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset + planeSize),      // G -> plane 1
-                cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset)};                  // R -> plane 0
-            cv::cuda::split(floatImg, channels, stream);
+                cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset + 2 * planeSize),
+                cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset + planeSize),
+                cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset)};
+            cv::cuda::split(tl_floatImg, channels, stream);
        } else {
-            // BGR input -> BGR planes: keep channel order
            std::vector<cv::cuda::GpuMat> channels{
                cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset),
                cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset + planeSize),
                cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset + 2 * planeSize)};
-            cv::cuda::split(floatImg, channels, stream);
+            cv::cuda::split(tl_floatImg, channels, stream);
        }
    }

@@ -239,7 +253,6 @@ cv::cuda::GpuMat Engine<T>::blobFromGpuMats(const std::vector<cv::cuda::GpuMat>
 template <typename T> void Engine<T>::clearGpuBuffers() {
    if (!m_buffers.empty()) {
        // Free ALL I/O GPU buffers (both inputs and outputs).
-        // Previously only outputs were freed, leaking input allocations from loadNetwork().
        for (void* ptr : m_buffers) {
            if (ptr) {
                Util::checkCudaErrorCode(cudaFree(ptr));
@@ -247,4 +260,8 @@ template <typename T> void Engine<T>::clearGpuBuffers() {
        }
        m_buffers.clear();
    }
+
+    // Note: the blob/floatImg caches are thread_local inside blobFromGpuMats (static method),
+    // so they are not released here; they are cleaned up automatically when their threads exit.
+    ANS_DBG("TRT_Engine", "clearGpuBuffers: I/O buffers released");
 }