Use software decoder by default
This commit is contained in:
@@ -623,6 +623,65 @@ bool Engine<T>::buildLoadNetwork(std::string onnxModelPath, const std::array<flo
|
||||
template <typename T>
|
||||
bool Engine<T>::loadNetwork(std::string trtModelPath, const std::array<float, 3>& subVals, const std::array<float, 3>& divVals, bool normalize)
|
||||
{
|
||||
// Install a custom OpenCV CUDA allocator that uses cudaMallocAsync/cudaFreeAsync
|
||||
// instead of the default cudaMalloc/cudaFree. The stream-ordered allocator
|
||||
// respects the cudaMemPool release threshold (set to 0), so freed memory is
|
||||
// returned to the GPU immediately instead of being cached forever.
|
||||
//
|
||||
// The default cudaMalloc/cudaFree allocator caches all freed blocks permanently
|
||||
// (no API to force release), causing VRAM to grow monotonically when GpuMat
|
||||
// objects of varying sizes are allocated and freed repeatedly (different batch
|
||||
// sizes, different image resolutions across cameras).
|
||||
{
    // One-time installation of a stream-ordered CUDA allocator for OpenCV
    // GpuMat. Combined with a zero release threshold on each device's default
    // memory pool, freed GpuMat memory is returned to the GPU immediately
    // instead of being cached forever by the default cudaMalloc/cudaFree path.
    static std::once_flag s_allocatorFlag;
    std::call_once(s_allocatorFlag, []() {
        // Set the release threshold to 0 on every visible GPU so the driver
        // hands freed blocks back to the device right away.
        int deviceCount = 0;
        cudaGetDeviceCount(&deviceCount);
        for (int d = 0; d < deviceCount; ++d) {
            cudaMemPool_t pool = nullptr;
            if (cudaDeviceGetDefaultMemPool(&pool, d) == cudaSuccess && pool) {
                uint64_t threshold = 0;
                cudaMemPoolSetAttribute(pool, cudaMemPoolAttrReleaseThreshold, &threshold);
            }
        }

        // Custom allocator: uses cudaMallocAsync on stream 0 (behaves like
        // synchronous cudaMalloc but goes through the stream-ordered pool
        // configured above).
        struct AsyncAllocator : cv::cuda::GpuMat::Allocator {
            bool allocate(cv::cuda::GpuMat* mat, int rows, int cols, size_t elemSize) override {
                // Same row pitch policy as OpenCV's default allocator:
                // align each row (step) up to 256 bytes.
                size_t step = elemSize * static_cast<size_t>(cols);
                step = (step + 255) & ~size_t(255);

                // Reject requests whose total byte count overflows size_t;
                // otherwise we would silently allocate a too-small buffer.
                const size_t total = step * static_cast<size_t>(rows);
                if (rows != 0 && step != 0 && total / static_cast<size_t>(rows) != step)
                    return false;

                void* ptr = nullptr;
                cudaError_t err = cudaMallocAsync(&ptr, total, nullptr); // stream 0
                if (err != cudaSuccess || !ptr) {
                    // Clear the sticky error left by the failed cudaMallocAsync
                    // so it does not poison later, unrelated CUDA calls, then
                    // fall back to plain cudaMalloc (async allocation may be
                    // unsupported on older drivers).
                    (void)cudaGetLastError();
                    ptr = nullptr;
                    err = cudaMalloc(&ptr, total);
                    if (err != cudaSuccess) {
                        (void)cudaGetLastError();
                        return false;
                    }
                }
                mat->data = static_cast<uchar*>(ptr);
                mat->step = step;
                // GpuMat expects the allocator to supply the refcount cell.
                mat->refcount = static_cast<int*>(cv::fastMalloc(sizeof(int)));
                *mat->refcount = 1;
                return true;
            }

            void free(cv::cuda::GpuMat* mat) override {
                // NOTE(review): cudaFreeAsync also accepts pointers obtained
                // from plain cudaMalloc (CUDA >= 11.2), so the fallback path
                // above is released correctly here as well — confirm the
                // minimum supported CUDA version of this project.
                cudaFreeAsync(mat->data, nullptr); // stream 0 — goes through pool with threshold=0
                cv::fastFree(mat->refcount);
                mat->data = nullptr;
                mat->datastart = nullptr;
                mat->dataend = nullptr;
                mat->refcount = nullptr;
            }
        };
        static AsyncAllocator s_allocator;
        cv::cuda::GpuMat::setDefaultAllocator(&s_allocator);
        ANS_DBG("TRT_Load", "Custom CUDA async allocator installed — VRAM freed immediately on GpuMat release");
    });
}
|
||||
|
||||
m_lastLoadFailedVRAM = false; // reset on each load attempt
|
||||
m_subVals = subVals;
|
||||
m_divVals = divVals;
|
||||
@@ -958,11 +1017,13 @@ trt_cache_create_context:
|
||||
|
||||
m_context = std::unique_ptr<nvinfer1::IExecutionContext>(m_engine->createExecutionContext());
|
||||
if (!m_context) {
|
||||
ANS_DBG("TRT_Load", "ERROR: createExecutionContext returned null");
|
||||
logEngineEvent("[Engine] loadNetwork FAIL: createExecutionContext returned null for "
|
||||
+ trtModelPath, true);
|
||||
return false;
|
||||
}
|
||||
|
||||
ANS_DBG("TRT_Load", "Execution context created OK for %s", trtModelPath.c_str());
|
||||
if (m_verbose) std::cout << "Info: Execution context created successfully" << std::endl;
|
||||
|
||||
// ============================================================================
|
||||
@@ -1135,6 +1196,15 @@ trt_cache_create_context:
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
size_t vramFree = 0, vramTotal = 0;
|
||||
cudaMemGetInfo(&vramFree, &vramTotal);
|
||||
ANS_DBG("TRT_Load", "Buffers allocated: %zuMB, VRAM: %zuMB used / %zuMB free / %zuMB total",
|
||||
totalAllocated / (1024*1024),
|
||||
(vramTotal - vramFree) / (1024*1024),
|
||||
vramFree / (1024*1024),
|
||||
vramTotal / (1024*1024));
|
||||
}
|
||||
if (m_verbose) std::cout << "\nInfo: Total GPU memory allocated: " << totalAllocated / (1024 * 1024) << " MiB" << std::endl;
|
||||
|
||||
// -- Pinned output buffers (CUDA graph prerequisite) -----------------------
|
||||
|
||||
Reference in New Issue
Block a user