Use software decoder by default

This commit is contained in:
2026-04-04 20:19:54 +11:00
parent 3a21026790
commit e134ebdf15
24 changed files with 693 additions and 215 deletions

View File

@@ -623,6 +623,65 @@ bool Engine<T>::buildLoadNetwork(std::string onnxModelPath, const std::array<flo
template <typename T>
bool Engine<T>::loadNetwork(std::string trtModelPath, const std::array<float, 3>& subVals, const std::array<float, 3>& divVals, bool normalize)
{
// Install a custom OpenCV CUDA allocator that uses cudaMallocAsync/cudaFreeAsync
// instead of the default cudaMalloc/cudaFree. The stream-ordered allocator
// honors the cudaMemPool release threshold (set to 0 below), so memory freed
// back to the pool is released to the driver at the next stream/event/context
// synchronization instead of being retained by the pool.
//
// The default cudaMalloc/cudaFree allocator caches all freed blocks permanently
// (no API to force release), causing VRAM to grow monotonically when GpuMat
// objects of varying sizes are allocated and freed repeatedly (different batch
// sizes, different image resolutions across cameras).
{
static std::once_flag s_allocatorFlag;
std::call_once(s_allocatorFlag, []() {
// Set release threshold to 0 on all GPUs
int deviceCount = 0;
cudaGetDeviceCount(&deviceCount);
for (int d = 0; d < deviceCount; ++d) {
cudaMemPool_t pool = nullptr;
if (cudaDeviceGetDefaultMemPool(&pool, d) == cudaSuccess && pool) {
uint64_t threshold = 0;
cudaMemPoolSetAttribute(pool, cudaMemPoolAttrReleaseThreshold, &threshold);
}
}
// Custom allocator: uses cudaMallocAsync on stream 0 (behaves like
// synchronous cudaMalloc but goes through the stream-ordered pool).
struct AsyncAllocator : cv::cuda::GpuMat::Allocator {
bool allocate(cv::cuda::GpuMat* mat, int rows, int cols, size_t elemSize) override {
// Same logic as OpenCV's default allocator, but using cudaMallocAsync
size_t step = elemSize * cols;
// Align step to 256 bytes (same as default allocator)
step = (step + 255) & ~size_t(255);
void* ptr = nullptr;
cudaError_t err = cudaMallocAsync(&ptr, step * rows, nullptr); // stream 0
if (err != cudaSuccess || !ptr) {
// Fallback to regular cudaMalloc if async not supported
err = cudaMalloc(&ptr, step * rows);
if (err != cudaSuccess) return false;
}
mat->data = static_cast<uchar*>(ptr);
mat->step = step;
mat->refcount = static_cast<int*>(cv::fastMalloc(sizeof(int)));
*mat->refcount = 1;
return true;
}
void free(cv::cuda::GpuMat* mat) override {
cudaFreeAsync(mat->data, nullptr); // stream 0 — goes through pool with threshold=0
cv::fastFree(mat->refcount);
mat->data = nullptr;
mat->datastart = nullptr;
mat->dataend = nullptr;
mat->refcount = nullptr;
}
};
static AsyncAllocator s_allocator;
cv::cuda::GpuMat::setDefaultAllocator(&s_allocator);
ANS_DBG("TRT_Load", "Custom CUDA async allocator installed — VRAM freed immediately on GpuMat release");
});
}
m_lastLoadFailedVRAM = false; // reset on each load attempt
m_subVals = subVals;
m_divVals = divVals;
@@ -958,11 +1017,13 @@ trt_cache_create_context:
m_context = std::unique_ptr<nvinfer1::IExecutionContext>(m_engine->createExecutionContext());
if (!m_context) {
ANS_DBG("TRT_Load", "ERROR: createExecutionContext returned null");
logEngineEvent("[Engine] loadNetwork FAIL: createExecutionContext returned null for "
+ trtModelPath, true);
return false;
}
ANS_DBG("TRT_Load", "Execution context created OK for %s", trtModelPath.c_str());
if (m_verbose) std::cout << "Info: Execution context created successfully" << std::endl;
// ============================================================================
@@ -1135,6 +1196,15 @@ trt_cache_create_context:
}
}
{
size_t vramFree = 0, vramTotal = 0;
cudaMemGetInfo(&vramFree, &vramTotal);
ANS_DBG("TRT_Load", "Buffers allocated: %zuMB, VRAM: %zuMB used / %zuMB free / %zuMB total",
totalAllocated / (1024*1024),
(vramTotal - vramFree) / (1024*1024),
vramFree / (1024*1024),
vramTotal / (1024*1024));
}
if (m_verbose) std::cout << "\nInfo: Total GPU memory allocated: " << totalAllocated / (1024 * 1024) << " MiB" << std::endl;
// -- Pinned output buffers (CUDA graph prerequisite) -----------------------