Use software decoder by default

This commit is contained in:
2026-04-04 20:19:54 +11:00
parent 3a21026790
commit e134ebdf15
24 changed files with 693 additions and 215 deletions

View File

@@ -623,6 +623,65 @@ bool Engine<T>::buildLoadNetwork(std::string onnxModelPath, const std::array<flo
template <typename T>
bool Engine<T>::loadNetwork(std::string trtModelPath, const std::array<float, 3>& subVals, const std::array<float, 3>& divVals, bool normalize)
{
// Install a custom OpenCV CUDA allocator that uses cudaMallocAsync/cudaFreeAsync
// instead of the default cudaMalloc/cudaFree. The stream-ordered allocator
// respects the cudaMemPool release threshold (set to 0), so freed memory is
// returned to the GPU immediately instead of being cached forever.
//
// The default cudaMalloc/cudaFree allocator caches all freed blocks permanently
// (no API to force release), causing VRAM to grow monotonically when GpuMat
// objects of varying sizes are allocated and freed repeatedly (different batch
// sizes, different image resolutions across cameras).
{
static std::once_flag s_allocatorFlag;
std::call_once(s_allocatorFlag, []() {
// Set release threshold to 0 on all GPUs
int deviceCount = 0;
cudaGetDeviceCount(&deviceCount);
for (int d = 0; d < deviceCount; ++d) {
cudaMemPool_t pool = nullptr;
if (cudaDeviceGetDefaultMemPool(&pool, d) == cudaSuccess && pool) {
uint64_t threshold = 0;
cudaMemPoolSetAttribute(pool, cudaMemPoolAttrReleaseThreshold, &threshold);
}
}
// Custom allocator: uses cudaMallocAsync on stream 0 (behaves like
// synchronous cudaMalloc but goes through the stream-ordered pool).
// GpuMat allocator backed by the CUDA stream-ordered memory pool
// (cudaMallocAsync / cudaFreeAsync on the default stream), with a
// cudaMalloc / cudaFree fallback when the async API reports an error.
struct AsyncAllocator : cv::cuda::GpuMat::Allocator {
    // Allocates device storage for a rows x cols matrix of elemSize-byte elements.
    // On success fills mat->data, mat->step and mat->refcount and returns true.
    // Returns false, leaving *mat untouched, when the arguments are unusable or
    // neither allocation path could provide memory.
    bool allocate(cv::cuda::GpuMat* mat, int rows, int cols, size_t elemSize) override {
        if (!mat || rows <= 0 || cols <= 0 || elemSize == 0) {
            return false;
        }

        const size_t rowBytes = elemSize * static_cast<size_t>(cols);

        // Only genuine 2-D matrices get a padded (256-byte aligned) row pitch.
        // OpenCV's stock allocator keeps single-row and single-column matrices
        // continuous (step == elemSize * cols); padding them would make
        // GpuMat::isContinuous() false for data callers treat as one flat buffer.
        size_t step = rowBytes;
        if (rows > 1 && cols > 1) {
            step = (rowBytes + 255) & ~size_t(255);
        }
        const size_t totalBytes = step * static_cast<size_t>(rows);

        void* ptr = nullptr;
        cudaError_t err = cudaMallocAsync(&ptr, totalBytes, nullptr); // stream 0
        if (err != cudaSuccess || !ptr) {
            // Drop the error recorded by the failed async attempt so that a later
            // cudaGetLastError() check does not report it against unrelated work.
            (void)cudaGetLastError();
            ptr = nullptr;
            // Fallback to regular cudaMalloc if async is not supported or failed.
            err = cudaMalloc(&ptr, totalBytes);
            if (err != cudaSuccess || !ptr) {
                (void)cudaGetLastError();
                return false;
            }
        }

        mat->data = static_cast<uchar*>(ptr);
        mat->step = step;
        mat->refcount = static_cast<int*>(cv::fastMalloc(sizeof(int)));
        *mat->refcount = 1;
        return true;
    }

    // Releases the storage owned by *mat and its reference counter, then clears
    // the data pointers.
    // NOTE(review): the free is ordered on the default stream only; work still
    // queued on a non-blocking stream that touches this buffer is not waited
    // for — confirm callers synchronize before the last reference is dropped.
    void free(cv::cuda::GpuMat* mat) override {
        if (!mat) {
            return;
        }

        // datastart is the base address handed out by allocate(); data may point
        // inside the allocation when the last owner is an ROI / sub-matrix view.
        void* base = mat->datastart ? static_cast<void*>(mat->datastart)
                                    : static_cast<void*>(mat->data);
        if (base) {
            // Goes through the pool with threshold=0; fall back to cudaFree when
            // the async free is rejected (e.g. async allocation unsupported).
            if (cudaFreeAsync(base, nullptr) != cudaSuccess) {
                (void)cudaGetLastError();
                cudaFree(base);
            }
        }

        cv::fastFree(mat->refcount);
        mat->data = nullptr;
        mat->datastart = nullptr;
        mat->dataend = nullptr;
        mat->refcount = nullptr;
    }
};
static AsyncAllocator s_allocator;
cv::cuda::GpuMat::setDefaultAllocator(&s_allocator);
ANS_DBG("TRT_Load", "Custom CUDA async allocator installed — VRAM freed immediately on GpuMat release");
});
}
m_lastLoadFailedVRAM = false; // reset on each load attempt
m_subVals = subVals;
m_divVals = divVals;
@@ -958,11 +1017,13 @@ trt_cache_create_context:
m_context = std::unique_ptr<nvinfer1::IExecutionContext>(m_engine->createExecutionContext());
if (!m_context) {
ANS_DBG("TRT_Load", "ERROR: createExecutionContext returned null");
logEngineEvent("[Engine] loadNetwork FAIL: createExecutionContext returned null for "
+ trtModelPath, true);
return false;
}
ANS_DBG("TRT_Load", "Execution context created OK for %s", trtModelPath.c_str());
if (m_verbose) std::cout << "Info: Execution context created successfully" << std::endl;
// ============================================================================
@@ -1135,6 +1196,15 @@ trt_cache_create_context:
}
}
{
    // Query current device memory and log it next to the running buffer total.
    size_t freeBytes = 0;
    size_t totalBytes = 0;
    cudaMemGetInfo(&freeBytes, &totalBytes);
    ANS_DBG("TRT_Load", "Buffers allocated: %zuMB, VRAM: %zuMB used / %zuMB free / %zuMB total",
            totalAllocated / (1024*1024),
            (totalBytes - freeBytes) / (1024*1024),
            freeBytes / (1024*1024),
            totalBytes / (1024*1024));
}
if (m_verbose) std::cout << "\nInfo: Total GPU memory allocated: " << totalAllocated / (1024 * 1024) << " MiB" << std::endl;
// -- Pinned output buffers (CUDA graph prerequisite) -----------------------

View File

@@ -607,6 +607,7 @@ bool Engine<T>::runInferenceFromPool(
// harmless — the second one finds a fresh slot immediately.
InferenceSlot* slot = nullptr;
bool kickedGrowth = false;
auto _poolAcquireStart = std::chrono::steady_clock::now();
{
std::unique_lock<std::mutex> lock(m_slotMutex);
@@ -630,6 +631,8 @@ bool Engine<T>::runInferenceFromPool(
}
if (!slot) {
ANS_DBG("TRT_Pool", "ALL SLOTS BUSY: %zu slots, active=%d — waiting for free slot",
n, m_activeCount.load());
// All slots busy. In elastic mode, proactively grow the
// pool in the background so the next request has a slot
// on a different GPU. We only kick once per wait cycle.
@@ -672,7 +675,17 @@ bool Engine<T>::runInferenceFromPool(
}
// -- 3. Still no slot => reject ---------------------------------------
{
    // Time spent since _poolAcquireStart; only waits longer than 100 ms are logged.
    const auto acquireElapsed = std::chrono::steady_clock::now() - _poolAcquireStart;
    const double acquireMs = std::chrono::duration<double, std::milli>(acquireElapsed).count();
    if (acquireMs > 100.0) {
        // slot may still be null here, hence the -1 placeholder for the GPU index.
        ANS_DBG("TRT_Pool", "SLOW slot acquire: %.1fms slot=%p gpu=%d active=%d/%zu",
                acquireMs, (void*)slot, slot ? slot->deviceIndex : -1,
                m_activeCount.load(), m_slots.size());
    }
}
if (!slot) {
ANS_DBG("TRT_Pool", "ERROR: No slot available — all %zu slots busy, timeout", m_slots.size());
std::string errMsg = "[Engine] runInferenceFromPool FAIL: Capacity reached -- all "
+ std::to_string(m_activeCount.load()) + "/" + std::to_string(m_totalCapacity)
+ " slot(s) busy"
@@ -699,12 +712,23 @@ bool Engine<T>::runInferenceFromPool(
if (currentDev != slot->deviceIndex) {
cudaSetDevice(slot->deviceIndex);
}
ANS_DBG("TRT_Pool", "Slot dispatch: gpu=%d active=%d/%zu",
slot->deviceIndex, m_activeCount.load(), m_slots.size());
auto _slotStart = std::chrono::steady_clock::now();
result = slot->engine->runInference(inputs, featureVectors);
auto _slotEnd = std::chrono::steady_clock::now();
double _slotMs = std::chrono::duration<double, std::milli>(_slotEnd - _slotStart).count();
if (_slotMs > 500.0) {
ANS_DBG("TRT_Pool", "SLOW slot inference: %.1fms gpu=%d active=%d/%zu",
_slotMs, slot->deviceIndex, m_activeCount.load(), m_slots.size());
}
}
catch (const std::exception& ex) {
ANS_DBG("TRT_Pool", "ERROR: runInference threw: %s", ex.what());
std::cout << "Error [Pool]: runInference threw: " << ex.what() << std::endl;
}
catch (...) {
ANS_DBG("TRT_Pool", "ERROR: runInference threw unknown exception");
std::cout << "Error [Pool]: runInference threw unknown exception" << std::endl;
}

View File

@@ -1,8 +1,10 @@
#pragma once
#include <cstring>
#include <chrono>
#include <filesystem>
#include <semaphore>
#include "TRTCompat.h"
#include "ANSLicense.h" // ANS_DBG macro for DebugView logging
// Per-device mutex for CUDA graph capture.
// TRT's enqueueV3 uses shared internal resources (workspace, memory pools)
@@ -86,11 +88,9 @@ static inline cudaError_t cudaStreamSynchronize_Safe(cudaStream_t stream) {
cudaError_t err = cudaStreamQuery(stream);
if (err != cudaErrorNotReady) return err;
auto syncStart = std::chrono::steady_clock::now();
// Short Sleep(0) fast path (~10 iterations) catches sub-ms kernel completions.
// Then switch to Sleep(1) to give cleanup operations (cuArrayDestroy, cuMemFree)
// a window to acquire the exclusive nvcuda64 SRW lock.
// Previously used 1000 Sleep(0) iterations which hogged the SRW lock and
// caused ~20-second stalls when concurrent cleanup needed exclusive access.
for (int i = 0; i < 10; ++i) {
Sleep(0);
err = cudaStreamQuery(stream);
@@ -98,10 +98,21 @@ static inline cudaError_t cudaStreamSynchronize_Safe(cudaStream_t stream) {
}
// 1ms sleeps — adds negligible latency at 30 FPS but prevents SRW lock starvation.
int sleepCount = 0;
while (true) {
Sleep(1);
sleepCount++;
err = cudaStreamQuery(stream);
if (err != cudaErrorNotReady) return err;
if (err != cudaErrorNotReady) {
// Log if sync took too long (>500ms indicates GPU stall)
auto elapsed = std::chrono::duration<double, std::milli>(
std::chrono::steady_clock::now() - syncStart).count();
if (elapsed > 500.0) {
ANS_DBG("TRT_Engine", "SLOW SYNC: %.1fms (%d sleeps) stream=%p err=%d",
elapsed, sleepCount, (void*)stream, (int)err);
}
return err;
}
}
}
@@ -368,6 +379,71 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
return false;
}
// ============================================================================
// SM=100% DETECTOR — tracks inference timing trends to catch the exact
// moment GPU becomes saturated. Logs every 50 inferences per thread; the
// degradation check runs inside that same every-50 branch, not per inference.
// ============================================================================
// Counters shared across engine instances/threads.
// NOTE(review): if these statics are local to the Engine<T>::runInference
// template, each T instantiation gets its own set — confirm "process-wide".
static std::atomic<int64_t> s_globalInfCount{0};
static std::atomic<int> s_globalActiveInf{0}; // currently in-flight inferences
static std::atomic<double> s_globalLastAvgMs{0.0}; // last known avg inference time (not read or written in this block)
const int64_t myInfNum = s_globalInfCount.fetch_add(1) + 1;
// NOTE(review): the matching fetch_sub(1) sits in the end-of-inference block
// just before the final `return true`. Early `return false` exits (e.g. a failed
// setInputShape) appear to skip it, which would leave this count inflated — confirm.
s_globalActiveInf.fetch_add(1);
// Per-thread tracking. Every variable below is scoped to this block, so only
// the statements inside these braces can read or write them.
{
static thread_local int64_t s_infCount = 0; // inferences started on this thread
static thread_local std::chrono::steady_clock::time_point s_lastLog; // time of the previous periodic log
// NOTE(review): s_rollingAvgMs is never assigned anywhere in this block, so it
// stays 0.0: avgMs logs as 0.0, the baseline is captured as 0.0, and the
// degradation branch (which requires s_baselineMs > 0) cannot fire.
static thread_local double s_rollingAvgMs = 0.0;
static thread_local double s_baselineMs = 0.0; // copy of s_rollingAvgMs taken at inference #100
// NOTE(review): s_maxMs is only ever reset to 0.0 below, never raised.
static thread_local double s_maxMs = 0.0;
static thread_local bool s_degradationLogged = false; // latch: degradation is reported at most once per thread
s_infCount++;
if (s_infCount == 1) {
s_lastLog = std::chrono::steady_clock::now();
ANS_DBG("TRT_SM100", "FIRST inference — engine alive, globalInf=%lld", myInfNum);
}
// Log every 50 inferences (more frequent than 500 to catch transitions)
if (s_infCount % 50 == 0) {
auto now = std::chrono::steady_clock::now();
// Seconds since the previous periodic log; 50 inferences over that span gives the rate.
double elapsed = std::chrono::duration<double>(now - s_lastLog).count();
double fps = (elapsed > 0) ? (50.0 / elapsed) : 0;
s_lastLog = now;
size_t vramFree = 0, vramTotal = 0;
cudaMemGetInfo(&vramFree, &vramTotal);
size_t vramUsedMB = (vramTotal - vramFree) / (1024 * 1024);
size_t vramFreeMB = vramFree / (1024 * 1024);
// The "VRAM=%zuMB/%zuMB" pair is used/free (not used/total).
// NOTE(review): inputs[0] is read here before the input-count validation that
// follows this block; assumes inputs is non-empty — TODO confirm an earlier check.
ANS_DBG("TRT_SM100", "#%lld [global=%lld active=%d] %.1f inf/sec avgMs=%.1f maxMs=%.1f batch=%d graphs=%zu VRAM=%zuMB/%zuMB",
s_infCount, myInfNum, s_globalActiveInf.load(),
fps, s_rollingAvgMs, s_maxMs,
(int)inputs[0].size(), m_graphExecs.size(),
vramUsedMB, vramFreeMB);
// Capture baseline at the 100th inference (reachable here because 100 % 50 == 0)
if (s_infCount == 100) {
s_baselineMs = s_rollingAvgMs;
ANS_DBG("TRT_SM100", "BASELINE established: %.1fms/inference", s_baselineMs);
}
// Detect degradation: avg >3x baseline AND baseline is set
if (s_baselineMs > 0 && s_rollingAvgMs > s_baselineMs * 3.0 && !s_degradationLogged) {
s_degradationLogged = true;
ANS_DBG("TRT_SM100", "*** DEGRADATION DETECTED *** avg=%.1fms baseline=%.1fms (%.1fx) VRAM=%zuMB/%zuMB active=%d",
s_rollingAvgMs, s_baselineMs, s_rollingAvgMs / s_baselineMs,
vramUsedMB, vramFreeMB, s_globalActiveInf.load());
}
// Reset max for next window
s_maxMs = 0.0;
}
}
const auto numInputs = m_inputDims.size();
if (inputs.size() != numInputs) {
std::cout << "Error: Wrong number of inputs. Expected: " << numInputs
@@ -457,6 +533,9 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
}
if (anyDimChanged) {
ANS_DBG("TRT_Engine", "Shape change detected: batch %d -> %d (graphsCached=%zu)",
m_lastBatchSize, batchSize, m_graphExecs.size());
// === First-time diagnostics (verbose, once) ===
const bool firstTime = !m_batchShapeChangeLogged;
@@ -536,7 +615,10 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
<< newDims.d[3] << "]" << std::endl;
}
ANS_DBG("TRT_Engine", "setInputShape('%s') [%d,%d,%d,%d]",
tensorName, newDims.d[0], newDims.d[1], newDims.d[2], newDims.d[3]);
if (!m_context->setInputShape(tensorName, newDims)) {
ANS_DBG("TRT_Engine", "ERROR: setInputShape FAILED for '%s'", tensorName);
std::cout << "Error: Failed to set input shape for '" << tensorName << "'" << std::endl;
return false;
}
@@ -576,6 +658,25 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
m_lastBatchSize = batchSize;
m_batchShapeChangeLogged = true;
// CRITICAL: every cached CUDA graph is dropped once the input shape changes.
// The graphs were captured against the OLD context state (old tensor shapes);
// launching them after setInputShape() produces undefined GPU behavior
// (invalid kernel sequences, SM lockup at 100%, inference hang).
if (!m_graphExecs.empty()) {
    const size_t staleCount = m_graphExecs.size();
    // Entries can hold a null handle, so destroy only the non-null ones.
    for (auto& cached : m_graphExecs) {
        if (cached.second) {
            cudaGraphExecDestroy(cached.second);
        }
    }
    m_graphExecs.clear();

    ANS_DBG("TRT_Engine", "INVALIDATED %zu cached CUDA graphs after shape change (batch=%d)",
            staleCount, batchSize);
    if (m_verbose || firstTime) {
        std::cout << "Info: Invalidated " << staleCount
                  << " cached CUDA graphs after shape change" << std::endl;
    }
}
if (m_verbose && firstTime) {
std::cout << "\nInfo: Input shapes updated successfully for batch size "
<< batchSize << " ✓\n" << std::endl;
@@ -619,6 +720,7 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
//
// GpuMat-lifetime: preprocessedBuffers keeps GpuMats alive past the final
// cudaStreamSynchronize, so cudaFree() doesn't stall the pipeline.
auto _prepStart = std::chrono::steady_clock::now();
cv::cuda::Stream cvInferStream = cv::cuda::StreamAccessor::wrapStream(m_inferenceStream);
std::vector<cv::cuda::GpuMat> preprocessedBuffers;
preprocessedBuffers.reserve(numInputs);
@@ -647,6 +749,14 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
}
}
{
    // Wall-clock time since _prepStart; only runs longer than 100 ms are logged.
    const auto prepElapsed = std::chrono::steady_clock::now() - _prepStart;
    const double prepMs = std::chrono::duration<double, std::milli>(prepElapsed).count();
    if (prepMs > 100.0) {
        ANS_DBG("TRT_SM100", "SLOW PREPROCESS: %.1fms batch=%d (blobFromGpuMats+D2D)", prepMs, batchSize);
    }
}
// ============================================================================
// PRE-ALLOCATE OUTPUT STRUCTURE
// ============================================================================
@@ -690,6 +800,8 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
if (canGraph) {
auto& graphExec = m_graphExecs[batchSize]; // inserts nullptr on first access
if (!graphExec) {
ANS_DBG("TRT_Engine", "CUDA graph CAPTURE starting for batch=%d (cached=%zu)",
batchSize, m_graphExecs.size());
// First call for this batchSize -- capture a new graph.
// Serialise captures across all Engine instances on this device to
// prevent TRT's shared workspace from creating cross-stream
@@ -727,9 +839,13 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
if (cudaGraphInstantiate(&exec, graph, nullptr, nullptr, 0) == cudaSuccess)
graphExec = exec;
cudaGraphDestroy(graph);
ANS_DBG("TRT_Engine", "CUDA graph CAPTURED OK for batch=%d exec=%p",
batchSize, (void*)graphExec);
}
if (!graphExec) {
ANS_DBG("TRT_Engine", "CUDA graph capture FAILED for batch=%d — falling back to direct path",
batchSize);
std::cout << "Warning: CUDA graph capture failed for batchSize="
<< batchSize << " -- falling back to direct inference path." << std::endl;
// Disable graph acceleration for this Engine instance.
@@ -740,9 +856,17 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
}
if (graphExec) {
ANS_DBG("TRT_Engine", "CUDA graph LAUNCH batch=%d exec=%p", batchSize, (void*)graphExec);
// Launch the pre-captured graph (single API call replaces many).
auto _graphStart = std::chrono::steady_clock::now();
cudaGraphLaunch(graphExec, m_inferenceStream);
cudaStreamSynchronize_Safe(m_inferenceStream);
auto _graphEnd = std::chrono::steady_clock::now();
double _graphMs = std::chrono::duration<double, std::milli>(_graphEnd - _graphStart).count();
if (_graphMs > 500.0) {
ANS_DBG("TRT_SM100", "SLOW GRAPH: %.1fms batch=%d active=%d",
_graphMs, batchSize, s_globalActiveInf.load());
}
// CPU memcpy: pinned buffers -> featureVectors (interleaved by batch).
for (int batch = 0; batch < batchSize; ++batch) {
@@ -762,8 +886,16 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
// ----------------------
// Used when pinned buffers are unavailable or graph capture failed.
if (!graphUsed) {
ANS_DBG("TRT_Engine", "Direct path (no graph) batch=%d", batchSize);
auto enqueueStart = std::chrono::steady_clock::now();
bool success = TRT_ENQUEUE(m_context.get(), m_inferenceStream, m_buffers);
auto enqueueEnd = std::chrono::steady_clock::now();
double enqueueMs = std::chrono::duration<double, std::milli>(enqueueEnd - enqueueStart).count();
if (enqueueMs > 500.0) {
ANS_DBG("TRT_Engine", "SLOW ENQUEUE: %.1fms batch=%d (enqueueV3 blocked!)", enqueueMs, batchSize);
}
if (!success) {
ANS_DBG("TRT_Engine", "ERROR: enqueueV3 FAILED batch=%d", batchSize);
std::string debugInfo = "[Engine] runInference FAIL: enqueue returned false, batch="
+ std::to_string(batchSize)
+ ", dimsSpecified=" + (m_context->allInputDimensionsSpecified() ? "YES" : "NO");
@@ -805,8 +937,16 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
}
}
auto syncStart = std::chrono::steady_clock::now();
cudaError_t syncErr = cudaStreamSynchronize_Safe(m_inferenceStream);
auto syncEnd = std::chrono::steady_clock::now();
double syncMs = std::chrono::duration<double, std::milli>(syncEnd - syncStart).count();
if (syncMs > 500.0) {
ANS_DBG("TRT_Engine", "SLOW INFERENCE SYNC: %.1fms batch=%d (direct path)", syncMs, batchSize);
}
if (syncErr != cudaSuccess) {
ANS_DBG("TRT_Engine", "ERROR: cudaStreamSync FAILED err=%d (%s)",
(int)syncErr, cudaGetErrorString(syncErr));
std::string errMsg = "[Engine] runInference FAIL: cudaStreamSynchronize: "
+ std::string(cudaGetErrorString(syncErr));
std::cout << errMsg << std::endl;
@@ -815,5 +955,33 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
}
}
// ============================================================================
// SM=100% DETECTOR — end-of-inference timing
// Tracks, per thread, the interval between consecutive passes through this
// point and logs when that interval spikes.
// ============================================================================
{
static thread_local double s_ema = 0; // exponential moving average of the interval in ms (0 = no sample yet)
static thread_local std::chrono::steady_clock::time_point s_prevEnd; // when this thread last reached this point
static thread_local bool s_firstDone = false; // false until this thread has passed here once
auto _now = std::chrono::steady_clock::now();
if (s_firstDone) {
// Interval since the previous pass. It spans everything the thread did in
// between (including time outside this function), not just GPU work.
double sinceLastMs = std::chrono::duration<double, std::milli>(_now - s_prevEnd).count();
// If time between consecutive inferences jumps dramatically,
// something blocked the thread (SM=100% or mutex contention)
// A gap is reported only when it is both >3x the running average and >500 ms.
if (s_ema > 0 && sinceLastMs > s_ema * 3.0 && sinceLastMs > 500.0) {
size_t vf = 0, vt = 0;
cudaMemGetInfo(&vf, &vt);
ANS_DBG("TRT_SM100", "GAP DETECTED: %.1fms between inferences (avg=%.1fms, %.1fx) active=%d VRAM=%zuMB free",
sinceLastMs, s_ema, sinceLastMs / s_ema,
s_globalActiveInf.load(), vf / (1024*1024));
}
// First sample seeds the average; afterwards 90% old value + 10% new sample.
// The gap sample itself is folded in too, so one spike raises the threshold.
s_ema = (s_ema == 0) ? sinceLastMs : (0.9 * s_ema + 0.1 * sinceLastMs);
}
s_prevEnd = _now;
s_firstDone = true;
// NOTE(review): this is the only visible decrement of the in-flight counter
// incremented at the top of the function. Paths that return before reaching
// this block would not decrement it — confirm, or move to a scope guard.
s_globalActiveInf.fetch_sub(1);
}
return true;
}

View File

@@ -24,28 +24,32 @@ void Engine<T>::transformOutput(std::vector<std::vector<std::vector<T>>> &input,
output = std::move(input[0][0]);
}
template <typename T>
cv::cuda::GpuMat Engine<T>::resizeKeepAspectRatioPadRightBottom(const cv::cuda::GpuMat& input,
cv::cuda::GpuMat Engine<T>::resizeKeepAspectRatioPadRightBottom(const cv::cuda::GpuMat& input,
size_t height, size_t width,
const cv::Scalar& bgcolor) {
// Ensure input is valid
if (input.empty()) {
return cv::cuda::GpuMat();
return cv::cuda::GpuMat();
}
// Create a CUDA stream
cv::cuda::Stream stream;
// Calculate aspect ratio and unpadded dimensions
// Use a thread_local stream to avoid creating a new CUDA stream per call.
// Creating cv::cuda::Stream() each call leaks stream handles under WDDM.
thread_local cv::cuda::Stream stream;
float r = std::min(static_cast<float>(width) / input.cols, static_cast<float>(height) / input.rows);
size_t unpad_w = static_cast<size_t>(r * input.cols);
size_t unpad_h = static_cast<size_t>(r * input.rows);
// Resize the input image
cv::cuda::GpuMat re;
re.create(unpad_h, unpad_w, input.type());
re.create(static_cast<int>(unpad_h), static_cast<int>(unpad_w), input.type());
cv::cuda::resize(input, re, re.size(), 0, 0, cv::INTER_LINEAR, stream);
// Create the output image and fill with the background color
cv::cuda::GpuMat out;
out.create(height, width, input.type());
out.create(static_cast<int>(height), static_cast<int>(width), input.type());
out.setTo(bgcolor, stream);
// Copy the resized content into the top-left corner of the output image
// Copy the resized content into the top-left corner
re.copyTo(out(cv::Rect(0, 0, re.cols, re.rows)), stream);
stream.waitForCompletion();
return out;
@@ -195,41 +199,51 @@ cv::cuda::GpuMat Engine<T>::blobFromGpuMats(const std::vector<cv::cuda::GpuMat>
const int W = batchInput[0].cols;
const int batch = static_cast<int>(batchInput.size());
const size_t planeSize = static_cast<size_t>(H) * W; // pixels per channel
const int totalElems = batch * 3 * static_cast<int>(planeSize);
// Output blob: planar NCHW layout stored as a single-channel GpuMat.
// Total elements = batch * 3 * H * W.
cv::cuda::GpuMat blob(1, batch * 3 * static_cast<int>(planeSize), CV_32FC1);
// thread_local cached buffers — reused across calls on the same thread.
// KEY: allocate for MAX seen size, never shrink. This prevents the VRAM leak
// caused by OpenCV's GpuMat pool growing unbounded when batch sizes alternate
// (e.g., batch=1,6,1,6 → each size triggers new alloc, old goes to pool, never freed).
thread_local cv::cuda::GpuMat tl_blob;
thread_local cv::cuda::GpuMat tl_floatImg;
thread_local int tl_blobMaxElems = 0;
if (totalElems > tl_blobMaxElems) {
tl_blob = cv::cuda::GpuMat(1, totalElems, CV_32FC1);
tl_blobMaxElems = totalElems;
size_t blobBytes = static_cast<size_t>(totalElems) * sizeof(float);
ANS_DBG("TRT_Preproc", "blobFromGpuMats: ALLOC blob batch=%d %dx%d %.1fMB (new max)",
batch, W, H, blobBytes / (1024.0 * 1024.0));
}
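// Use a sub-region of the cached blob for the current batch. colRange() returns a
// view that shares storage with tl_blob, so this blob's contents are overwritten
// by the next call on the same thread.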
cv::cuda::GpuMat blob = tl_blob.colRange(0, totalElems);
for (int img = 0; img < batch; ++img) {
// 1. Convert to float and normalise while still in HWC (interleaved) format.
// Channel-wise subtract / divide operate correctly on interleaved data.
cv::cuda::GpuMat floatImg;
if (normalize) {
batchInput[img].convertTo(floatImg, CV_32FC3, 1.f / 255.f, stream);
batchInput[img].convertTo(tl_floatImg, CV_32FC3, 1.f / 255.f, stream);
} else {
batchInput[img].convertTo(floatImg, CV_32FC3, 1.0, stream);
batchInput[img].convertTo(tl_floatImg, CV_32FC3, 1.0, stream);
}
cv::cuda::subtract(floatImg, cv::Scalar(subVals[0], subVals[1], subVals[2]), floatImg, cv::noArray(), -1, stream);
cv::cuda::divide(floatImg, cv::Scalar(divVals[0], divVals[1], divVals[2]), floatImg, 1, -1, stream);
cv::cuda::subtract(tl_floatImg, cv::Scalar(subVals[0], subVals[1], subVals[2]), tl_floatImg, cv::noArray(), -1, stream);
cv::cuda::divide(tl_floatImg, cv::Scalar(divVals[0], divVals[1], divVals[2]), tl_floatImg, 1, -1, stream);
// 2. Split normalised HWC image into CHW planes directly into the blob.
size_t offset = static_cast<size_t>(img) * 3 * planeSize;
if (swapRB) {
// BGR input -> RGB planes: B goes to plane 2, G to plane 1, R to plane 0
std::vector<cv::cuda::GpuMat> channels{
cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset + 2 * planeSize), // B -> plane 2
cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset + planeSize), // G -> plane 1
cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset)}; // R -> plane 0
cv::cuda::split(floatImg, channels, stream);
cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset + 2 * planeSize),
cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset + planeSize),
cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset)};
cv::cuda::split(tl_floatImg, channels, stream);
} else {
// BGR input -> BGR planes: keep channel order
std::vector<cv::cuda::GpuMat> channels{
cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset),
cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset + planeSize),
cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset + 2 * planeSize)};
cv::cuda::split(floatImg, channels, stream);
cv::cuda::split(tl_floatImg, channels, stream);
}
}
@@ -239,7 +253,6 @@ cv::cuda::GpuMat Engine<T>::blobFromGpuMats(const std::vector<cv::cuda::GpuMat>
template <typename T> void Engine<T>::clearGpuBuffers() {
if (!m_buffers.empty()) {
// Free ALL I/O GPU buffers (both inputs and outputs).
// Previously only outputs were freed, leaking input allocations from loadNetwork().
for (void* ptr : m_buffers) {
if (ptr) {
Util::checkCudaErrorCode(cudaFree(ptr));
@@ -247,4 +260,8 @@ template <typename T> void Engine<T>::clearGpuBuffers() {
}
m_buffers.clear();
}
// Note: blob/floatImg caches are thread_local inside blobFromGpuMats (static method).
// They are cleaned up automatically when threads exit.
ANS_DBG("TRT_Engine", "clearGpuBuffers: I/O buffers released");
}