Use software decoder by default

2026-04-04 20:19:54 +11:00
parent 3a21026790
commit e134ebdf15
24 changed files with 693 additions and 215 deletions
--- a/engines/TensorRTAPI/include/engine/EngineBuildLoadNetwork.inl
+++ b/engines/TensorRTAPI/include/engine/EngineBuildLoadNetwork.inl
@@ -623,6 +623,65 @@ bool Engine<T>::buildLoadNetwork(std::string onnxModelPath, const std::array<flo
 template <typename T>
 bool Engine<T>::loadNetwork(std::string trtModelPath, const std::array<float, 3>& subVals, const std::array<float, 3>& divVals, bool normalize)
 {
+    // Install a custom OpenCV CUDA allocator that uses cudaMallocAsync/cudaFreeAsync
+    // instead of the default cudaMalloc/cudaFree.  The stream-ordered allocator
+    // respects the cudaMemPool release threshold (set to 0), so freed memory is
+    // returned to the GPU immediately instead of being cached forever.
+    //
+    // The default cudaMalloc/cudaFree allocator caches all freed blocks permanently
+    // (no API to force release), causing VRAM to grow monotonically when GpuMat
+    // objects of varying sizes are allocated and freed repeatedly (different batch
+    // sizes, different image resolutions across cameras).
+    {
+        static std::once_flag s_allocatorFlag;
+        std::call_once(s_allocatorFlag, []() {
+            // Set release threshold to 0 on all GPUs
+            int deviceCount = 0;
+            cudaGetDeviceCount(&deviceCount);
+            for (int d = 0; d < deviceCount; ++d) {
+                cudaMemPool_t pool = nullptr;
+                if (cudaDeviceGetDefaultMemPool(&pool, d) == cudaSuccess && pool) {
+                    uint64_t threshold = 0;
+                    cudaMemPoolSetAttribute(pool, cudaMemPoolAttrReleaseThreshold, &threshold);
+                }
+            }
+
+            // Custom allocator: uses cudaMallocAsync on stream 0 (behaves like
+            // synchronous cudaMalloc but goes through the stream-ordered pool).
+            struct AsyncAllocator : cv::cuda::GpuMat::Allocator {
+                bool allocate(cv::cuda::GpuMat* mat, int rows, int cols, size_t elemSize) override {
+                    // 1-row / 1-col mats must stay continuous (as in OpenCV's default allocator); pad only true 2-D mats.
+                    size_t step = elemSize * static_cast<size_t>(cols);
+                    if (rows > 1 && cols > 1) step = (step + 255) & ~size_t(255);
+                    const size_t bytes = step * static_cast<size_t>(rows);
+                    void* ptr = nullptr;
+                    if (cudaMallocAsync(&ptr, bytes, nullptr) != cudaSuccess || !ptr) {  // stream 0
+                        (void)cudaGetLastError();  // clear the failed async attempt before falling back
+                        if (cudaMalloc(&ptr, bytes) != cudaSuccess || !ptr) return false;
+                    }
+                    mat->data = static_cast<uchar*>(ptr);
+                    mat->step = step;
+                    mat->refcount = static_cast<int*>(cv::fastMalloc(sizeof(int)));
+                    *mat->refcount = 1;
+                    return true;
+                }
+                void free(cv::cuda::GpuMat* mat) override {
+                    // Free the allocation base: `data` is an interior pointer when the last owner is an
+                    // ROI view. If the stream-ordered free is rejected, fall back to plain cudaFree.
+                    if (cudaFreeAsync(mat->datastart, nullptr) != cudaSuccess) cudaFree(mat->datastart);
+                    cv::fastFree(mat->refcount);
+                    mat->data = nullptr;
+                    mat->datastart = nullptr;
+                    mat->dataend = nullptr;
+                    mat->refcount = nullptr;
+                }
+            };
+            static AsyncAllocator s_allocator;
+            cv::cuda::GpuMat::setDefaultAllocator(&s_allocator);
+            ANS_DBG("TRT_Load", "Custom CUDA async allocator installed — VRAM freed immediately on GpuMat release");
+        });
+    }
+
    m_lastLoadFailedVRAM = false;  // reset on each load attempt
    m_subVals = subVals;
    m_divVals = divVals;
@@ -958,11 +1017,13 @@ trt_cache_create_context:

    m_context = std::unique_ptr<nvinfer1::IExecutionContext>(m_engine->createExecutionContext());
    if (!m_context) {
+        ANS_DBG("TRT_Load", "ERROR: createExecutionContext returned null");
        logEngineEvent("[Engine] loadNetwork FAIL: createExecutionContext returned null for "
                      + trtModelPath, true);
        return false;
    }

+    ANS_DBG("TRT_Load", "Execution context created OK for %s", trtModelPath.c_str());
    if (m_verbose) std::cout << "Info: Execution context created successfully" << std::endl;

    // ============================================================================
@@ -1135,6 +1196,15 @@ trt_cache_create_context:
        }
    }

+    {
+        size_t vramFree = 0, vramTotal = 0;
+        cudaMemGetInfo(&vramFree, &vramTotal);
+        ANS_DBG("TRT_Load", "Buffers allocated: %zuMB, VRAM: %zuMB used / %zuMB free / %zuMB total",
+                totalAllocated / (1024*1024),
+                (vramTotal - vramFree) / (1024*1024),
+                vramFree / (1024*1024),
+                vramTotal / (1024*1024));
+    }
    if (m_verbose) std::cout << "\nInfo: Total GPU memory allocated: " << totalAllocated / (1024 * 1024) << " MiB" << std::endl;

    // -- Pinned output buffers (CUDA graph prerequisite) -----------------------
--- a/engines/TensorRTAPI/include/engine/EngineMultiGpu.inl
+++ b/engines/TensorRTAPI/include/engine/EngineMultiGpu.inl
@@ -607,6 +607,7 @@ bool Engine<T>::runInferenceFromPool(
    //   harmless — the second one finds a fresh slot immediately.
    InferenceSlot* slot = nullptr;
    bool kickedGrowth = false;
+    auto _poolAcquireStart = std::chrono::steady_clock::now();

    {
        std::unique_lock<std::mutex> lock(m_slotMutex);
@@ -630,6 +631,8 @@ bool Engine<T>::runInferenceFromPool(
            }

            if (!slot) {
+                ANS_DBG("TRT_Pool", "ALL SLOTS BUSY: %zu slots, active=%d — waiting for free slot",
+                        n, m_activeCount.load());
                // All slots busy.  In elastic mode, proactively grow the
                // pool in the background so the next request has a slot
                // on a different GPU.  We only kick once per wait cycle.
@@ -672,7 +675,17 @@ bool Engine<T>::runInferenceFromPool(
    }

    // -- 3. Still no slot => reject ---------------------------------------
+    {
+        double _acquireMs = std::chrono::duration<double, std::milli>(
+            std::chrono::steady_clock::now() - _poolAcquireStart).count();
+        if (_acquireMs > 100.0) {
+            ANS_DBG("TRT_Pool", "SLOW slot acquire: %.1fms slot=%p gpu=%d active=%d/%zu",
+                    _acquireMs, (void*)slot, slot ? slot->deviceIndex : -1,
+                    m_activeCount.load(), m_slots.size());
+        }
+    }
    if (!slot) {
+        ANS_DBG("TRT_Pool", "ERROR: No slot available — all %zu slots busy, timeout", m_slots.size());
        std::string errMsg = "[Engine] runInferenceFromPool FAIL: Capacity reached -- all "
            + std::to_string(m_activeCount.load()) + "/" + std::to_string(m_totalCapacity)
            + " slot(s) busy"
@@ -699,12 +712,23 @@ bool Engine<T>::runInferenceFromPool(
        if (currentDev != slot->deviceIndex) {
            cudaSetDevice(slot->deviceIndex);
        }
+        ANS_DBG("TRT_Pool", "Slot dispatch: gpu=%d active=%d/%zu",
+                slot->deviceIndex, m_activeCount.load(), m_slots.size());
+        auto _slotStart = std::chrono::steady_clock::now();
        result = slot->engine->runInference(inputs, featureVectors);
+        auto _slotEnd = std::chrono::steady_clock::now();
+        double _slotMs = std::chrono::duration<double, std::milli>(_slotEnd - _slotStart).count();
+        if (_slotMs > 500.0) {
+            ANS_DBG("TRT_Pool", "SLOW slot inference: %.1fms gpu=%d active=%d/%zu",
+                    _slotMs, slot->deviceIndex, m_activeCount.load(), m_slots.size());
+        }
    }
    catch (const std::exception& ex) {
+        ANS_DBG("TRT_Pool", "ERROR: runInference threw: %s", ex.what());
        std::cout << "Error [Pool]: runInference threw: " << ex.what() << std::endl;
    }
    catch (...) {
+        ANS_DBG("TRT_Pool", "ERROR: runInference threw unknown exception");
        std::cout << "Error [Pool]: runInference threw unknown exception" << std::endl;
    }

--- a/engines/TensorRTAPI/include/engine/EngineRunInference.inl
+++ b/engines/TensorRTAPI/include/engine/EngineRunInference.inl
@@ -1,8 +1,10 @@
 #pragma once
 #include <cstring>
+#include <chrono>
 #include <filesystem>
 #include <semaphore>
 #include "TRTCompat.h"
+#include "ANSLicense.h"   // ANS_DBG macro for DebugView logging

 // Per-device mutex for CUDA graph capture.
 // TRT's enqueueV3 uses shared internal resources (workspace, memory pools)
@@ -86,11 +88,9 @@ static inline cudaError_t cudaStreamSynchronize_Safe(cudaStream_t stream) {
    cudaError_t err = cudaStreamQuery(stream);
    if (err != cudaErrorNotReady) return err;

+    auto syncStart = std::chrono::steady_clock::now();
+
    // Short Sleep(0) fast path (~10 iterations) catches sub-ms kernel completions.
-    // Then switch to Sleep(1) to give cleanup operations (cuArrayDestroy, cuMemFree)
-    // a window to acquire the exclusive nvcuda64 SRW lock.
-    // Previously used 1000 Sleep(0) iterations which hogged the SRW lock and
-    // caused ~20-second stalls when concurrent cleanup needed exclusive access.
    for (int i = 0; i < 10; ++i) {
        Sleep(0);
        err = cudaStreamQuery(stream);
@@ -98,10 +98,21 @@ static inline cudaError_t cudaStreamSynchronize_Safe(cudaStream_t stream) {
    }

    // 1ms sleeps — adds negligible latency at 30 FPS but prevents SRW lock starvation.
+    int sleepCount = 0;
    while (true) {
        Sleep(1);
+        sleepCount++;
        err = cudaStreamQuery(stream);
-        if (err != cudaErrorNotReady) return err;
+        if (err != cudaErrorNotReady) {
+            // Log if sync took too long (>500ms indicates GPU stall)
+            auto elapsed = std::chrono::duration<double, std::milli>(
+                std::chrono::steady_clock::now() - syncStart).count();
+            if (elapsed > 500.0) {
+                ANS_DBG("TRT_Engine", "SLOW SYNC: %.1fms (%d sleeps) stream=%p err=%d",
+                        elapsed, sleepCount, (void*)stream, (int)err);
+            }
+            return err;
+        }
    }
 }

@@ -368,6 +379,71 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
        return false;
    }

+    // ============================================================================
+    // SM=100% DETECTOR — meant to catch the moment the GPU saturates. Logs every
+    // 50 inferences; the degradation check runs only at those log points.
+    // NOTE(review): s_rollingAvgMs/s_maxMs are never updated, so baseline stays 0.
+    // ============================================================================
+    // Global (process-wide) counters shared across all engine instances/threads
+    static std::atomic<int64_t> s_globalInfCount{0};
+    static std::atomic<int>     s_globalActiveInf{0};   // currently in-flight inferences
+    static std::atomic<double>  s_globalLastAvgMs{0.0};  // last known avg inference time
+
+    const int64_t myInfNum = s_globalInfCount.fetch_add(1) + 1;
+    s_globalActiveInf.fetch_add(1);
+
+    // Per-thread tracking
+    {
+        static thread_local int64_t s_infCount = 0;
+        static thread_local std::chrono::steady_clock::time_point s_lastLog;
+        static thread_local double  s_rollingAvgMs = 0.0;
+        static thread_local double  s_baselineMs = 0.0;   // avg during first 100 inferences
+        static thread_local double  s_maxMs = 0.0;
+        static thread_local bool    s_degradationLogged = false;
+        s_infCount++;
+
+        if (s_infCount == 1) {
+            s_lastLog = std::chrono::steady_clock::now();
+            ANS_DBG("TRT_SM100", "FIRST inference — engine alive, globalInf=%lld", myInfNum);
+        }
+
+        // Log every 50 inferences (more frequent than 500 to catch transitions)
+        if (s_infCount % 50 == 0) {
+            auto now = std::chrono::steady_clock::now();
+            double elapsed = std::chrono::duration<double>(now - s_lastLog).count();
+            double fps = (elapsed > 0) ? (50.0 / elapsed) : 0;
+            s_lastLog = now;
+
+            size_t vramFree = 0, vramTotal = 0;
+            cudaMemGetInfo(&vramFree, &vramTotal);
+            size_t vramUsedMB = (vramTotal - vramFree) / (1024 * 1024);
+            size_t vramFreeMB = vramFree / (1024 * 1024);
+
+            ANS_DBG("TRT_SM100", "#%lld [global=%lld active=%d] %.1f inf/sec avgMs=%.1f maxMs=%.1f batch=%d graphs=%zu VRAM=%zuMB/%zuMB",
+                    s_infCount, myInfNum, s_globalActiveInf.load(),
+                    fps, s_rollingAvgMs, s_maxMs,
+                    (int)inputs[0].size(), m_graphExecs.size(),
+                    vramUsedMB, vramFreeMB);
+
+            // Capture baseline from first 100 inferences
+            if (s_infCount == 100) {
+                s_baselineMs = s_rollingAvgMs;
+                ANS_DBG("TRT_SM100", "BASELINE established: %.1fms/inference", s_baselineMs);
+            }
+
+            // Detect degradation: avg >3x baseline AND baseline is set
+            if (s_baselineMs > 0 && s_rollingAvgMs > s_baselineMs * 3.0 && !s_degradationLogged) {
+                s_degradationLogged = true;
+                ANS_DBG("TRT_SM100", "*** DEGRADATION DETECTED *** avg=%.1fms baseline=%.1fms (%.1fx) VRAM=%zuMB/%zuMB active=%d",
+                        s_rollingAvgMs, s_baselineMs, s_rollingAvgMs / s_baselineMs,
+                        vramUsedMB, vramFreeMB, s_globalActiveInf.load());
+            }
+
+            // Reset max for next window
+            s_maxMs = 0.0;
+        }
+    }
+
    const auto numInputs = m_inputDims.size();
    if (inputs.size() != numInputs) {
        std::cout << "Error: Wrong number of inputs. Expected: " << numInputs
@@ -457,6 +533,9 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
        }

        if (anyDimChanged) {
+            ANS_DBG("TRT_Engine", "Shape change detected: batch %d -> %d (graphsCached=%zu)",
+                    m_lastBatchSize, batchSize, m_graphExecs.size());
+
            // === First-time diagnostics (verbose, once) ===
            const bool firstTime = !m_batchShapeChangeLogged;

@@ -536,7 +615,10 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
                        << newDims.d[3] << "]" << std::endl;
                }

+                ANS_DBG("TRT_Engine", "setInputShape('%s') [%d,%d,%d,%d]",
+                        tensorName, newDims.d[0], newDims.d[1], newDims.d[2], newDims.d[3]);
                if (!m_context->setInputShape(tensorName, newDims)) {
+                    ANS_DBG("TRT_Engine", "ERROR: setInputShape FAILED for '%s'", tensorName);
                    std::cout << "Error: Failed to set input shape for '" << tensorName << "'" << std::endl;
                    return false;
                }
@@ -576,6 +658,25 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i

            m_lastBatchSize = batchSize;
            m_batchShapeChangeLogged = true;
+
+            // CRITICAL: Invalidate all cached CUDA graphs after shape change.
+            // Graphs were captured with the OLD context state (old tensor shapes).
+            // Launching them after setInputShape() produces undefined GPU behavior
+            // (invalid kernel sequences, SM lockup at 100%, inference hang).
+            if (!m_graphExecs.empty()) {
+                size_t destroyed = m_graphExecs.size();
+                for (auto& [bs, ge] : m_graphExecs) {
+                    if (ge) cudaGraphExecDestroy(ge);
+                }
+                m_graphExecs.clear();
+                ANS_DBG("TRT_Engine", "INVALIDATED %zu cached CUDA graphs after shape change (batch=%d)",
+                        destroyed, batchSize);
+                if (m_verbose || firstTime) {
+                    std::cout << "Info: Invalidated " << destroyed
+                        << " cached CUDA graphs after shape change" << std::endl;
+                }
+            }
+
            if (m_verbose && firstTime) {
                std::cout << "\nInfo: Input shapes updated successfully for batch size "
                    << batchSize << " ✓\n" << std::endl;
@@ -619,6 +720,7 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
    //
    // GpuMat-lifetime: preprocessedBuffers keeps GpuMats alive past the final
    // cudaStreamSynchronize, so cudaFree() doesn't stall the pipeline.
+    auto _prepStart = std::chrono::steady_clock::now();
    cv::cuda::Stream cvInferStream = cv::cuda::StreamAccessor::wrapStream(m_inferenceStream);
    std::vector<cv::cuda::GpuMat> preprocessedBuffers;
    preprocessedBuffers.reserve(numInputs);
@@ -647,6 +749,14 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
        }
    }

+    {
+        double _prepMs = std::chrono::duration<double, std::milli>(
+            std::chrono::steady_clock::now() - _prepStart).count();
+        if (_prepMs > 100.0) {
+            ANS_DBG("TRT_SM100", "SLOW PREPROCESS: %.1fms batch=%d (blobFromGpuMats+D2D)", _prepMs, batchSize);
+        }
+    }
+
    // ============================================================================
    // PRE-ALLOCATE OUTPUT STRUCTURE
    // ============================================================================
@@ -690,6 +800,8 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
    if (canGraph) {
        auto& graphExec = m_graphExecs[batchSize];   // inserts nullptr on first access
        if (!graphExec) {
+            ANS_DBG("TRT_Engine", "CUDA graph CAPTURE starting for batch=%d (cached=%zu)",
+                    batchSize, m_graphExecs.size());
            // First call for this batchSize -- capture a new graph.
            // Serialise captures across all Engine instances on this device to
            // prevent TRT's shared workspace from creating cross-stream
@@ -727,9 +839,13 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
                if (cudaGraphInstantiate(&exec, graph, nullptr, nullptr, 0) == cudaSuccess)
                    graphExec = exec;
                cudaGraphDestroy(graph);
+                ANS_DBG("TRT_Engine", "CUDA graph CAPTURED OK for batch=%d exec=%p",
+                        batchSize, (void*)graphExec);
            }

            if (!graphExec) {
+                ANS_DBG("TRT_Engine", "CUDA graph capture FAILED for batch=%d — falling back to direct path",
+                        batchSize);
                std::cout << "Warning: CUDA graph capture failed for batchSize="
                    << batchSize << " -- falling back to direct inference path." << std::endl;
                // Disable graph acceleration for this Engine instance.
@@ -740,9 +856,17 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
        }

        if (graphExec) {
+            ANS_DBG("TRT_Engine", "CUDA graph LAUNCH batch=%d exec=%p", batchSize, (void*)graphExec);
            // Launch the pre-captured graph (single API call replaces many).
+            auto _graphStart = std::chrono::steady_clock::now();
            cudaGraphLaunch(graphExec, m_inferenceStream);
            cudaStreamSynchronize_Safe(m_inferenceStream);
+            auto _graphEnd = std::chrono::steady_clock::now();
+            double _graphMs = std::chrono::duration<double, std::milli>(_graphEnd - _graphStart).count();
+            if (_graphMs > 500.0) {
+                ANS_DBG("TRT_SM100", "SLOW GRAPH: %.1fms batch=%d active=%d",
+                        _graphMs, batchSize, s_globalActiveInf.load());
+            }

            // CPU memcpy: pinned buffers -> featureVectors (interleaved by batch).
            for (int batch = 0; batch < batchSize; ++batch) {
@@ -762,8 +886,16 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
    // ----------------------
    // Used when pinned buffers are unavailable or graph capture failed.
    if (!graphUsed) {
+        ANS_DBG("TRT_Engine", "Direct path (no graph) batch=%d", batchSize);
+        auto enqueueStart = std::chrono::steady_clock::now();
        bool success = TRT_ENQUEUE(m_context.get(), m_inferenceStream, m_buffers);
+        auto enqueueEnd = std::chrono::steady_clock::now();
+        double enqueueMs = std::chrono::duration<double, std::milli>(enqueueEnd - enqueueStart).count();
+        if (enqueueMs > 500.0) {
+            ANS_DBG("TRT_Engine", "SLOW ENQUEUE: %.1fms batch=%d (enqueueV3 blocked!)", enqueueMs, batchSize);
+        }
        if (!success) {
+            ANS_DBG("TRT_Engine", "ERROR: enqueueV3 FAILED batch=%d", batchSize);
            std::string debugInfo = "[Engine] runInference FAIL: enqueue returned false, batch="
                + std::to_string(batchSize)
                + ", dimsSpecified=" + (m_context->allInputDimensionsSpecified() ? "YES" : "NO");
@@ -805,8 +937,16 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
            }
        }

+        auto syncStart = std::chrono::steady_clock::now();
        cudaError_t syncErr = cudaStreamSynchronize_Safe(m_inferenceStream);
+        auto syncEnd = std::chrono::steady_clock::now();
+        double syncMs = std::chrono::duration<double, std::milli>(syncEnd - syncStart).count();
+        if (syncMs > 500.0) {
+            ANS_DBG("TRT_Engine", "SLOW INFERENCE SYNC: %.1fms batch=%d (direct path)", syncMs, batchSize);
+        }
        if (syncErr != cudaSuccess) {
+            ANS_DBG("TRT_Engine", "ERROR: cudaStreamSync FAILED err=%d (%s)",
+                    (int)syncErr, cudaGetErrorString(syncErr));
            std::string errMsg = "[Engine] runInference FAIL: cudaStreamSynchronize: "
                + std::string(cudaGetErrorString(syncErr));
            std::cout << errMsg << std::endl;
@@ -815,5 +955,33 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
        }
    }

+    // ============================================================================
+    // SM=100% DETECTOR — end-of-inference timing (NOTE(review): early `return false` paths skip the s_globalActiveInf decrement below)
+    // ============================================================================
+    {
+        static thread_local double s_ema = 0;
+        static thread_local std::chrono::steady_clock::time_point s_prevEnd;
+        static thread_local bool s_firstDone = false;
+
+        auto _now = std::chrono::steady_clock::now();
+        if (s_firstDone) {
+            double sinceLastMs = std::chrono::duration<double, std::milli>(_now - s_prevEnd).count();
+            // If time between consecutive inferences jumps dramatically,
+            // something blocked the thread (SM=100% or mutex contention)
+            if (s_ema > 0 && sinceLastMs > s_ema * 3.0 && sinceLastMs > 500.0) {
+                size_t vf = 0, vt = 0;
+                cudaMemGetInfo(&vf, &vt);
+                ANS_DBG("TRT_SM100", "GAP DETECTED: %.1fms between inferences (avg=%.1fms, %.1fx) active=%d VRAM=%zuMB free",
+                        sinceLastMs, s_ema, sinceLastMs / s_ema,
+                        s_globalActiveInf.load(), vf / (1024*1024));
+            }
+            s_ema = (s_ema == 0) ? sinceLastMs : (0.9 * s_ema + 0.1 * sinceLastMs);
+        }
+        s_prevEnd = _now;
+        s_firstDone = true;
+
+        s_globalActiveInf.fetch_sub(1);
+    }
+
    return true;
 }
--- a/engines/TensorRTAPI/include/engine/EngineUtilities.inl
+++ b/engines/TensorRTAPI/include/engine/EngineUtilities.inl
@@ -24,28 +24,32 @@ void Engine<T>::transformOutput(std::vector<std::vector<std::vector<T>>> &input,
    output = std::move(input[0][0]);
 }
 template <typename T>
-cv::cuda::GpuMat Engine<T>::resizeKeepAspectRatioPadRightBottom(const cv::cuda::GpuMat& input, 
+cv::cuda::GpuMat Engine<T>::resizeKeepAspectRatioPadRightBottom(const cv::cuda::GpuMat& input,
                                                                size_t height, size_t width,
                                                                const cv::Scalar& bgcolor) {
-    // Ensure input is valid
    if (input.empty()) {
-		return cv::cuda::GpuMat();
+        return cv::cuda::GpuMat();
    }
-    // Create a CUDA stream
-    cv::cuda::Stream stream;
-    // Calculate aspect ratio and unpadded dimensions
+
+    // Use a thread_local stream to avoid creating a new CUDA stream per call.
+    // Creating cv::cuda::Stream() each call leaks stream handles under WDDM.
+    thread_local cv::cuda::Stream stream;
+
    float r = std::min(static_cast<float>(width) / input.cols, static_cast<float>(height) / input.rows);
    size_t unpad_w = static_cast<size_t>(r * input.cols);
    size_t unpad_h = static_cast<size_t>(r * input.rows);
+
    // Resize the input image
    cv::cuda::GpuMat re;
-    re.create(unpad_h, unpad_w, input.type());
+    re.create(static_cast<int>(unpad_h), static_cast<int>(unpad_w), input.type());
    cv::cuda::resize(input, re, re.size(), 0, 0, cv::INTER_LINEAR, stream);
+
    // Create the output image and fill with the background color
    cv::cuda::GpuMat out;
-    out.create(height, width, input.type());
+    out.create(static_cast<int>(height), static_cast<int>(width), input.type());
    out.setTo(bgcolor, stream);
-    // Copy the resized content into the top-left corner of the output image
+
+    // Copy the resized content into the top-left corner
    re.copyTo(out(cv::Rect(0, 0, re.cols, re.rows)), stream);
    stream.waitForCompletion();
    return out;
@@ -195,41 +199,51 @@ cv::cuda::GpuMat Engine<T>::blobFromGpuMats(const std::vector<cv::cuda::GpuMat>
    const int W     = batchInput[0].cols;
    const int batch = static_cast<int>(batchInput.size());
    const size_t planeSize = static_cast<size_t>(H) * W;   // pixels per channel
+    const int totalElems = batch * 3 * static_cast<int>(planeSize);

-    // Output blob: planar NCHW layout stored as a single-channel GpuMat.
-    // Total elements = batch * 3 * H * W.
-    cv::cuda::GpuMat blob(1, batch * 3 * static_cast<int>(planeSize), CV_32FC1);
+    // thread_local cached buffers — reused across calls on the same thread.
+    // KEY: tl_blob is sized for the MAX element count seen and never shrinks. This
+    // avoids the VRAM growth seen when batch sizes alternate (e.g., batch=1,6,1,6 →
+    // a new allocation per size change). `blob` below is only a view into tl_blob.
+    thread_local cv::cuda::GpuMat tl_blob;
+    thread_local cv::cuda::GpuMat tl_floatImg;
+    thread_local int tl_blobMaxElems = 0;
+
+    if (totalElems > tl_blobMaxElems) {
+        tl_blob = cv::cuda::GpuMat(1, totalElems, CV_32FC1);
+        tl_blobMaxElems = totalElems;
+        size_t blobBytes = static_cast<size_t>(totalElems) * sizeof(float);
+        ANS_DBG("TRT_Preproc", "blobFromGpuMats: ALLOC blob batch=%d %dx%d %.1fMB (new max)",
+                batch, W, H, blobBytes / (1024.0 * 1024.0));
+    }
+    // Use a sub-region of the cached blob for the current batch
+    cv::cuda::GpuMat blob = tl_blob.colRange(0, totalElems);

    for (int img = 0; img < batch; ++img) {
-        // 1. Convert to float and normalise while still in HWC (interleaved) format.
-        //    Channel-wise subtract / divide operate correctly on interleaved data.
-        cv::cuda::GpuMat floatImg;
        if (normalize) {
-            batchInput[img].convertTo(floatImg, CV_32FC3, 1.f / 255.f, stream);
+            batchInput[img].convertTo(tl_floatImg, CV_32FC3, 1.f / 255.f, stream);
        } else {
-            batchInput[img].convertTo(floatImg, CV_32FC3, 1.0, stream);
+            batchInput[img].convertTo(tl_floatImg, CV_32FC3, 1.0, stream);
        }

-        cv::cuda::subtract(floatImg, cv::Scalar(subVals[0], subVals[1], subVals[2]), floatImg, cv::noArray(), -1, stream);
-        cv::cuda::divide(floatImg, cv::Scalar(divVals[0], divVals[1], divVals[2]), floatImg, 1, -1, stream);
+        cv::cuda::subtract(tl_floatImg, cv::Scalar(subVals[0], subVals[1], subVals[2]), tl_floatImg, cv::noArray(), -1, stream);
+        cv::cuda::divide(tl_floatImg, cv::Scalar(divVals[0], divVals[1], divVals[2]), tl_floatImg, 1, -1, stream);

        // 2. Split normalised HWC image into CHW planes directly into the blob.
        size_t offset = static_cast<size_t>(img) * 3 * planeSize;

        if (swapRB) {
-            // BGR input -> RGB planes: B goes to plane 2, G to plane 1, R to plane 0
            std::vector<cv::cuda::GpuMat> channels{
-                cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset + 2 * planeSize),  // B -> plane 2
-                cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset + planeSize),      // G -> plane 1
-                cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset)};                  // R -> plane 0
-            cv::cuda::split(floatImg, channels, stream);
+                cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset + 2 * planeSize),
+                cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset + planeSize),
+                cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset)};
+            cv::cuda::split(tl_floatImg, channels, stream);
        } else {
-            // BGR input -> BGR planes: keep channel order
            std::vector<cv::cuda::GpuMat> channels{
                cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset),
                cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset + planeSize),
                cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset + 2 * planeSize)};
-            cv::cuda::split(floatImg, channels, stream);
+            cv::cuda::split(tl_floatImg, channels, stream);
        }
    }

@@ -239,7 +253,6 @@ cv::cuda::GpuMat Engine<T>::blobFromGpuMats(const std::vector<cv::cuda::GpuMat>
 template <typename T> void Engine<T>::clearGpuBuffers() {
    if (!m_buffers.empty()) {
        // Free ALL I/O GPU buffers (both inputs and outputs).
-        // Previously only outputs were freed, leaking input allocations from loadNetwork().
        for (void* ptr : m_buffers) {
            if (ptr) {
                Util::checkCudaErrorCode(cudaFree(ptr));
@@ -247,4 +260,8 @@ template <typename T> void Engine<T>::clearGpuBuffers() {
        }
        m_buffers.clear();
    }
+
+    // Note: blob/floatImg caches are thread_local inside blobFromGpuMats (static method).
+    // They are cleaned up automatically when threads exit.
+    ANS_DBG("TRT_Engine", "clearGpuBuffers: I/O buffers released");
 }