Use software decoder by default
This commit is contained in:
@@ -623,6 +623,65 @@ bool Engine<T>::buildLoadNetwork(std::string onnxModelPath, const std::array<flo
|
||||
template <typename T>
|
||||
bool Engine<T>::loadNetwork(std::string trtModelPath, const std::array<float, 3>& subVals, const std::array<float, 3>& divVals, bool normalize)
|
||||
{
|
||||
// Install a custom OpenCV CUDA allocator that uses cudaMallocAsync/cudaFreeAsync
|
||||
// instead of the default cudaMalloc/cudaFree. The stream-ordered allocator
|
||||
// respects the cudaMemPool release threshold (set to 0), so freed memory is
|
||||
// returned to the GPU immediately instead of being cached forever.
|
||||
//
|
||||
// The default cudaMalloc/cudaFree allocator caches all freed blocks permanently
|
||||
// (no API to force release), causing VRAM to grow monotonically when GpuMat
|
||||
// objects of varying sizes are allocated and freed repeatedly (different batch
|
||||
// sizes, different image resolutions across cameras).
|
||||
{
    // One-time installation of a stream-ordered CUDA allocator for OpenCV
    // GpuMat. Combined with a zero release threshold on each device's default
    // memory pool, freed GpuMat memory is returned to the GPU immediately
    // instead of being cached forever by the default cudaMalloc/cudaFree path.
    static std::once_flag s_allocatorFlag;
    std::call_once(s_allocatorFlag, []() {
        // Set the release threshold to 0 on every visible GPU so the driver
        // hands freed blocks back to the device right away.
        int deviceCount = 0;
        cudaGetDeviceCount(&deviceCount);
        for (int d = 0; d < deviceCount; ++d) {
            cudaMemPool_t pool = nullptr;
            if (cudaDeviceGetDefaultMemPool(&pool, d) == cudaSuccess && pool) {
                uint64_t threshold = 0;
                cudaMemPoolSetAttribute(pool, cudaMemPoolAttrReleaseThreshold, &threshold);
            }
        }

        // Custom allocator: uses cudaMallocAsync on stream 0 (behaves like
        // synchronous cudaMalloc but goes through the stream-ordered pool
        // configured above).
        struct AsyncAllocator : cv::cuda::GpuMat::Allocator {
            bool allocate(cv::cuda::GpuMat* mat, int rows, int cols, size_t elemSize) override {
                // Same row pitch policy as OpenCV's default allocator:
                // align each row (step) up to 256 bytes.
                size_t step = elemSize * static_cast<size_t>(cols);
                step = (step + 255) & ~size_t(255);

                // Reject requests whose total byte count overflows size_t;
                // otherwise we would silently allocate a too-small buffer.
                const size_t total = step * static_cast<size_t>(rows);
                if (rows != 0 && step != 0 && total / static_cast<size_t>(rows) != step)
                    return false;

                void* ptr = nullptr;
                cudaError_t err = cudaMallocAsync(&ptr, total, nullptr); // stream 0
                if (err != cudaSuccess || !ptr) {
                    // Clear the sticky error left by the failed cudaMallocAsync
                    // so it does not poison later, unrelated CUDA calls, then
                    // fall back to plain cudaMalloc (async allocation may be
                    // unsupported on older drivers).
                    (void)cudaGetLastError();
                    ptr = nullptr;
                    err = cudaMalloc(&ptr, total);
                    if (err != cudaSuccess) {
                        (void)cudaGetLastError();
                        return false;
                    }
                }
                mat->data = static_cast<uchar*>(ptr);
                mat->step = step;
                // GpuMat expects the allocator to supply the refcount cell.
                mat->refcount = static_cast<int*>(cv::fastMalloc(sizeof(int)));
                *mat->refcount = 1;
                return true;
            }

            void free(cv::cuda::GpuMat* mat) override {
                // NOTE(review): cudaFreeAsync also accepts pointers obtained
                // from plain cudaMalloc (CUDA >= 11.2), so the fallback path
                // above is released correctly here as well — confirm the
                // minimum supported CUDA version of this project.
                cudaFreeAsync(mat->data, nullptr); // stream 0 — goes through pool with threshold=0
                cv::fastFree(mat->refcount);
                mat->data = nullptr;
                mat->datastart = nullptr;
                mat->dataend = nullptr;
                mat->refcount = nullptr;
            }
        };
        static AsyncAllocator s_allocator;
        cv::cuda::GpuMat::setDefaultAllocator(&s_allocator);
        ANS_DBG("TRT_Load", "Custom CUDA async allocator installed — VRAM freed immediately on GpuMat release");
    });
}
|
||||
|
||||
m_lastLoadFailedVRAM = false; // reset on each load attempt
|
||||
m_subVals = subVals;
|
||||
m_divVals = divVals;
|
||||
@@ -958,11 +1017,13 @@ trt_cache_create_context:
|
||||
|
||||
m_context = std::unique_ptr<nvinfer1::IExecutionContext>(m_engine->createExecutionContext());
|
||||
if (!m_context) {
|
||||
ANS_DBG("TRT_Load", "ERROR: createExecutionContext returned null");
|
||||
logEngineEvent("[Engine] loadNetwork FAIL: createExecutionContext returned null for "
|
||||
+ trtModelPath, true);
|
||||
return false;
|
||||
}
|
||||
|
||||
ANS_DBG("TRT_Load", "Execution context created OK for %s", trtModelPath.c_str());
|
||||
if (m_verbose) std::cout << "Info: Execution context created successfully" << std::endl;
|
||||
|
||||
// ============================================================================
|
||||
@@ -1135,6 +1196,15 @@ trt_cache_create_context:
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
size_t vramFree = 0, vramTotal = 0;
|
||||
cudaMemGetInfo(&vramFree, &vramTotal);
|
||||
ANS_DBG("TRT_Load", "Buffers allocated: %zuMB, VRAM: %zuMB used / %zuMB free / %zuMB total",
|
||||
totalAllocated / (1024*1024),
|
||||
(vramTotal - vramFree) / (1024*1024),
|
||||
vramFree / (1024*1024),
|
||||
vramTotal / (1024*1024));
|
||||
}
|
||||
if (m_verbose) std::cout << "\nInfo: Total GPU memory allocated: " << totalAllocated / (1024 * 1024) << " MiB" << std::endl;
|
||||
|
||||
// -- Pinned output buffers (CUDA graph prerequisite) -----------------------
|
||||
|
||||
Reference in New Issue
Block a user