Use software decoder by default
This commit is contained in:
@@ -623,6 +623,65 @@ bool Engine<T>::buildLoadNetwork(std::string onnxModelPath, const std::array<flo
|
||||
template <typename T>
|
||||
bool Engine<T>::loadNetwork(std::string trtModelPath, const std::array<float, 3>& subVals, const std::array<float, 3>& divVals, bool normalize)
|
||||
{
|
||||
// Install a custom OpenCV CUDA allocator that uses cudaMallocAsync/cudaFreeAsync
|
||||
// instead of the default cudaMalloc/cudaFree. The stream-ordered allocator
|
||||
// respects the cudaMemPool release threshold (set to 0), so freed memory is
|
||||
// returned to the GPU immediately instead of being cached forever.
|
||||
//
|
||||
// The default cudaMalloc/cudaFree allocator caches all freed blocks permanently
|
||||
// (no API to force release), causing VRAM to grow monotonically when GpuMat
|
||||
// objects of varying sizes are allocated and freed repeatedly (different batch
|
||||
// sizes, different image resolutions across cameras).
|
||||
{
|
||||
static std::once_flag s_allocatorFlag;
|
||||
std::call_once(s_allocatorFlag, []() {
|
||||
// Set release threshold to 0 on all GPUs
|
||||
int deviceCount = 0;
|
||||
cudaGetDeviceCount(&deviceCount);
|
||||
for (int d = 0; d < deviceCount; ++d) {
|
||||
cudaMemPool_t pool = nullptr;
|
||||
if (cudaDeviceGetDefaultMemPool(&pool, d) == cudaSuccess && pool) {
|
||||
uint64_t threshold = 0;
|
||||
cudaMemPoolSetAttribute(pool, cudaMemPoolAttrReleaseThreshold, &threshold);
|
||||
}
|
||||
}
|
||||
|
||||
// Custom allocator: uses cudaMallocAsync on stream 0 (behaves like
|
||||
// synchronous cudaMalloc but goes through the stream-ordered pool).
|
||||
struct AsyncAllocator : cv::cuda::GpuMat::Allocator {
|
||||
bool allocate(cv::cuda::GpuMat* mat, int rows, int cols, size_t elemSize) override {
|
||||
// Same logic as OpenCV's default allocator, but using cudaMallocAsync
|
||||
size_t step = elemSize * cols;
|
||||
// Align step to 256 bytes (same as default allocator)
|
||||
step = (step + 255) & ~size_t(255);
|
||||
void* ptr = nullptr;
|
||||
cudaError_t err = cudaMallocAsync(&ptr, step * rows, nullptr); // stream 0
|
||||
if (err != cudaSuccess || !ptr) {
|
||||
// Fallback to regular cudaMalloc if async not supported
|
||||
err = cudaMalloc(&ptr, step * rows);
|
||||
if (err != cudaSuccess) return false;
|
||||
}
|
||||
mat->data = static_cast<uchar*>(ptr);
|
||||
mat->step = step;
|
||||
mat->refcount = static_cast<int*>(cv::fastMalloc(sizeof(int)));
|
||||
*mat->refcount = 1;
|
||||
return true;
|
||||
}
|
||||
void free(cv::cuda::GpuMat* mat) override {
|
||||
cudaFreeAsync(mat->data, nullptr); // stream 0 — goes through pool with threshold=0
|
||||
cv::fastFree(mat->refcount);
|
||||
mat->data = nullptr;
|
||||
mat->datastart = nullptr;
|
||||
mat->dataend = nullptr;
|
||||
mat->refcount = nullptr;
|
||||
}
|
||||
};
|
||||
static AsyncAllocator s_allocator;
|
||||
cv::cuda::GpuMat::setDefaultAllocator(&s_allocator);
|
||||
ANS_DBG("TRT_Load", "Custom CUDA async allocator installed — VRAM freed immediately on GpuMat release");
|
||||
});
|
||||
}
|
||||
|
||||
m_lastLoadFailedVRAM = false; // reset on each load attempt
|
||||
m_subVals = subVals;
|
||||
m_divVals = divVals;
|
||||
@@ -958,11 +1017,13 @@ trt_cache_create_context:
|
||||
|
||||
m_context = std::unique_ptr<nvinfer1::IExecutionContext>(m_engine->createExecutionContext());
|
||||
if (!m_context) {
|
||||
ANS_DBG("TRT_Load", "ERROR: createExecutionContext returned null");
|
||||
logEngineEvent("[Engine] loadNetwork FAIL: createExecutionContext returned null for "
|
||||
+ trtModelPath, true);
|
||||
return false;
|
||||
}
|
||||
|
||||
ANS_DBG("TRT_Load", "Execution context created OK for %s", trtModelPath.c_str());
|
||||
if (m_verbose) std::cout << "Info: Execution context created successfully" << std::endl;
|
||||
|
||||
// ============================================================================
|
||||
@@ -1135,6 +1196,15 @@ trt_cache_create_context:
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
size_t vramFree = 0, vramTotal = 0;
|
||||
cudaMemGetInfo(&vramFree, &vramTotal);
|
||||
ANS_DBG("TRT_Load", "Buffers allocated: %zuMB, VRAM: %zuMB used / %zuMB free / %zuMB total",
|
||||
totalAllocated / (1024*1024),
|
||||
(vramTotal - vramFree) / (1024*1024),
|
||||
vramFree / (1024*1024),
|
||||
vramTotal / (1024*1024));
|
||||
}
|
||||
if (m_verbose) std::cout << "\nInfo: Total GPU memory allocated: " << totalAllocated / (1024 * 1024) << " MiB" << std::endl;
|
||||
|
||||
// -- Pinned output buffers (CUDA graph prerequisite) -----------------------
|
||||
|
||||
@@ -607,6 +607,7 @@ bool Engine<T>::runInferenceFromPool(
|
||||
// harmless — the second one finds a fresh slot immediately.
|
||||
InferenceSlot* slot = nullptr;
|
||||
bool kickedGrowth = false;
|
||||
auto _poolAcquireStart = std::chrono::steady_clock::now();
|
||||
|
||||
{
|
||||
std::unique_lock<std::mutex> lock(m_slotMutex);
|
||||
@@ -630,6 +631,8 @@ bool Engine<T>::runInferenceFromPool(
|
||||
}
|
||||
|
||||
if (!slot) {
|
||||
ANS_DBG("TRT_Pool", "ALL SLOTS BUSY: %zu slots, active=%d — waiting for free slot",
|
||||
n, m_activeCount.load());
|
||||
// All slots busy. In elastic mode, proactively grow the
|
||||
// pool in the background so the next request has a slot
|
||||
// on a different GPU. We only kick once per wait cycle.
|
||||
@@ -672,7 +675,17 @@ bool Engine<T>::runInferenceFromPool(
|
||||
}
|
||||
|
||||
// -- 3. Still no slot => reject ---------------------------------------
|
||||
{
|
||||
double _acquireMs = std::chrono::duration<double, std::milli>(
|
||||
std::chrono::steady_clock::now() - _poolAcquireStart).count();
|
||||
if (_acquireMs > 100.0) {
|
||||
ANS_DBG("TRT_Pool", "SLOW slot acquire: %.1fms slot=%p gpu=%d active=%d/%zu",
|
||||
_acquireMs, (void*)slot, slot ? slot->deviceIndex : -1,
|
||||
m_activeCount.load(), m_slots.size());
|
||||
}
|
||||
}
|
||||
if (!slot) {
|
||||
ANS_DBG("TRT_Pool", "ERROR: No slot available — all %zu slots busy, timeout", m_slots.size());
|
||||
std::string errMsg = "[Engine] runInferenceFromPool FAIL: Capacity reached -- all "
|
||||
+ std::to_string(m_activeCount.load()) + "/" + std::to_string(m_totalCapacity)
|
||||
+ " slot(s) busy"
|
||||
@@ -699,12 +712,23 @@ bool Engine<T>::runInferenceFromPool(
|
||||
if (currentDev != slot->deviceIndex) {
|
||||
cudaSetDevice(slot->deviceIndex);
|
||||
}
|
||||
ANS_DBG("TRT_Pool", "Slot dispatch: gpu=%d active=%d/%zu",
|
||||
slot->deviceIndex, m_activeCount.load(), m_slots.size());
|
||||
auto _slotStart = std::chrono::steady_clock::now();
|
||||
result = slot->engine->runInference(inputs, featureVectors);
|
||||
auto _slotEnd = std::chrono::steady_clock::now();
|
||||
double _slotMs = std::chrono::duration<double, std::milli>(_slotEnd - _slotStart).count();
|
||||
if (_slotMs > 500.0) {
|
||||
ANS_DBG("TRT_Pool", "SLOW slot inference: %.1fms gpu=%d active=%d/%zu",
|
||||
_slotMs, slot->deviceIndex, m_activeCount.load(), m_slots.size());
|
||||
}
|
||||
}
|
||||
catch (const std::exception& ex) {
|
||||
ANS_DBG("TRT_Pool", "ERROR: runInference threw: %s", ex.what());
|
||||
std::cout << "Error [Pool]: runInference threw: " << ex.what() << std::endl;
|
||||
}
|
||||
catch (...) {
|
||||
ANS_DBG("TRT_Pool", "ERROR: runInference threw unknown exception");
|
||||
std::cout << "Error [Pool]: runInference threw unknown exception" << std::endl;
|
||||
}
|
||||
|
||||
|
||||
@@ -1,8 +1,10 @@
|
||||
#pragma once
|
||||
#include <cstring>
|
||||
#include <chrono>
|
||||
#include <filesystem>
|
||||
#include <semaphore>
|
||||
#include "TRTCompat.h"
|
||||
#include "ANSLicense.h" // ANS_DBG macro for DebugView logging
|
||||
|
||||
// Per-device mutex for CUDA graph capture.
|
||||
// TRT's enqueueV3 uses shared internal resources (workspace, memory pools)
|
||||
@@ -86,11 +88,9 @@ static inline cudaError_t cudaStreamSynchronize_Safe(cudaStream_t stream) {
|
||||
cudaError_t err = cudaStreamQuery(stream);
|
||||
if (err != cudaErrorNotReady) return err;
|
||||
|
||||
auto syncStart = std::chrono::steady_clock::now();
|
||||
|
||||
// Short Sleep(0) fast path (~10 iterations) catches sub-ms kernel completions.
|
||||
// Then switch to Sleep(1) to give cleanup operations (cuArrayDestroy, cuMemFree)
|
||||
// a window to acquire the exclusive nvcuda64 SRW lock.
|
||||
// Previously used 1000 Sleep(0) iterations which hogged the SRW lock and
|
||||
// caused ~20-second stalls when concurrent cleanup needed exclusive access.
|
||||
for (int i = 0; i < 10; ++i) {
|
||||
Sleep(0);
|
||||
err = cudaStreamQuery(stream);
|
||||
@@ -98,10 +98,21 @@ static inline cudaError_t cudaStreamSynchronize_Safe(cudaStream_t stream) {
|
||||
}
|
||||
|
||||
// 1ms sleeps — adds negligible latency at 30 FPS but prevents SRW lock starvation.
|
||||
int sleepCount = 0;
|
||||
while (true) {
|
||||
Sleep(1);
|
||||
sleepCount++;
|
||||
err = cudaStreamQuery(stream);
|
||||
if (err != cudaErrorNotReady) return err;
|
||||
if (err != cudaErrorNotReady) {
|
||||
// Log if sync took too long (>500ms indicates GPU stall)
|
||||
auto elapsed = std::chrono::duration<double, std::milli>(
|
||||
std::chrono::steady_clock::now() - syncStart).count();
|
||||
if (elapsed > 500.0) {
|
||||
ANS_DBG("TRT_Engine", "SLOW SYNC: %.1fms (%d sleeps) stream=%p err=%d",
|
||||
elapsed, sleepCount, (void*)stream, (int)err);
|
||||
}
|
||||
return err;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -368,6 +379,71 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
|
||||
return false;
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// SM=100% DETECTOR — tracks inference timing trends to catch the exact
|
||||
// moment GPU becomes saturated. Logs every 50 inferences with rolling
|
||||
// average, and immediately when degradation is detected.
|
||||
// ============================================================================
|
||||
// Global (process-wide) counters shared across all engine instances/threads
|
||||
static std::atomic<int64_t> s_globalInfCount{0};
|
||||
static std::atomic<int> s_globalActiveInf{0}; // currently in-flight inferences
|
||||
static std::atomic<double> s_globalLastAvgMs{0.0}; // last known avg inference time
|
||||
|
||||
const int64_t myInfNum = s_globalInfCount.fetch_add(1) + 1;
|
||||
s_globalActiveInf.fetch_add(1);
|
||||
|
||||
// Per-thread tracking
|
||||
{
|
||||
static thread_local int64_t s_infCount = 0;
|
||||
static thread_local std::chrono::steady_clock::time_point s_lastLog;
|
||||
static thread_local double s_rollingAvgMs = 0.0;
|
||||
static thread_local double s_baselineMs = 0.0; // avg during first 100 inferences
|
||||
static thread_local double s_maxMs = 0.0;
|
||||
static thread_local bool s_degradationLogged = false;
|
||||
s_infCount++;
|
||||
|
||||
if (s_infCount == 1) {
|
||||
s_lastLog = std::chrono::steady_clock::now();
|
||||
ANS_DBG("TRT_SM100", "FIRST inference — engine alive, globalInf=%lld", myInfNum);
|
||||
}
|
||||
|
||||
// Log every 50 inferences (more frequent than 500 to catch transitions)
|
||||
if (s_infCount % 50 == 0) {
|
||||
auto now = std::chrono::steady_clock::now();
|
||||
double elapsed = std::chrono::duration<double>(now - s_lastLog).count();
|
||||
double fps = (elapsed > 0) ? (50.0 / elapsed) : 0;
|
||||
s_lastLog = now;
|
||||
|
||||
size_t vramFree = 0, vramTotal = 0;
|
||||
cudaMemGetInfo(&vramFree, &vramTotal);
|
||||
size_t vramUsedMB = (vramTotal - vramFree) / (1024 * 1024);
|
||||
size_t vramFreeMB = vramFree / (1024 * 1024);
|
||||
|
||||
ANS_DBG("TRT_SM100", "#%lld [global=%lld active=%d] %.1f inf/sec avgMs=%.1f maxMs=%.1f batch=%d graphs=%zu VRAM=%zuMB/%zuMB",
|
||||
s_infCount, myInfNum, s_globalActiveInf.load(),
|
||||
fps, s_rollingAvgMs, s_maxMs,
|
||||
(int)inputs[0].size(), m_graphExecs.size(),
|
||||
vramUsedMB, vramFreeMB);
|
||||
|
||||
// Capture baseline from first 100 inferences
|
||||
if (s_infCount == 100) {
|
||||
s_baselineMs = s_rollingAvgMs;
|
||||
ANS_DBG("TRT_SM100", "BASELINE established: %.1fms/inference", s_baselineMs);
|
||||
}
|
||||
|
||||
// Detect degradation: avg >3x baseline AND baseline is set
|
||||
if (s_baselineMs > 0 && s_rollingAvgMs > s_baselineMs * 3.0 && !s_degradationLogged) {
|
||||
s_degradationLogged = true;
|
||||
ANS_DBG("TRT_SM100", "*** DEGRADATION DETECTED *** avg=%.1fms baseline=%.1fms (%.1fx) VRAM=%zuMB/%zuMB active=%d",
|
||||
s_rollingAvgMs, s_baselineMs, s_rollingAvgMs / s_baselineMs,
|
||||
vramUsedMB, vramFreeMB, s_globalActiveInf.load());
|
||||
}
|
||||
|
||||
// Reset max for next window
|
||||
s_maxMs = 0.0;
|
||||
}
|
||||
}
|
||||
|
||||
const auto numInputs = m_inputDims.size();
|
||||
if (inputs.size() != numInputs) {
|
||||
std::cout << "Error: Wrong number of inputs. Expected: " << numInputs
|
||||
@@ -457,6 +533,9 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
|
||||
}
|
||||
|
||||
if (anyDimChanged) {
|
||||
ANS_DBG("TRT_Engine", "Shape change detected: batch %d -> %d (graphsCached=%zu)",
|
||||
m_lastBatchSize, batchSize, m_graphExecs.size());
|
||||
|
||||
// === First-time diagnostics (verbose, once) ===
|
||||
const bool firstTime = !m_batchShapeChangeLogged;
|
||||
|
||||
@@ -536,7 +615,10 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
|
||||
<< newDims.d[3] << "]" << std::endl;
|
||||
}
|
||||
|
||||
ANS_DBG("TRT_Engine", "setInputShape('%s') [%d,%d,%d,%d]",
|
||||
tensorName, newDims.d[0], newDims.d[1], newDims.d[2], newDims.d[3]);
|
||||
if (!m_context->setInputShape(tensorName, newDims)) {
|
||||
ANS_DBG("TRT_Engine", "ERROR: setInputShape FAILED for '%s'", tensorName);
|
||||
std::cout << "Error: Failed to set input shape for '" << tensorName << "'" << std::endl;
|
||||
return false;
|
||||
}
|
||||
@@ -576,6 +658,25 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
|
||||
|
||||
m_lastBatchSize = batchSize;
|
||||
m_batchShapeChangeLogged = true;
|
||||
|
||||
// CRITICAL: Invalidate all cached CUDA graphs after shape change.
|
||||
// Graphs were captured with the OLD context state (old tensor shapes).
|
||||
// Launching them after setInputShape() produces undefined GPU behavior
|
||||
// (invalid kernel sequences, SM lockup at 100%, inference hang).
|
||||
if (!m_graphExecs.empty()) {
|
||||
size_t destroyed = m_graphExecs.size();
|
||||
for (auto& [bs, ge] : m_graphExecs) {
|
||||
if (ge) cudaGraphExecDestroy(ge);
|
||||
}
|
||||
m_graphExecs.clear();
|
||||
ANS_DBG("TRT_Engine", "INVALIDATED %zu cached CUDA graphs after shape change (batch=%d)",
|
||||
destroyed, batchSize);
|
||||
if (m_verbose || firstTime) {
|
||||
std::cout << "Info: Invalidated " << destroyed
|
||||
<< " cached CUDA graphs after shape change" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
if (m_verbose && firstTime) {
|
||||
std::cout << "\nInfo: Input shapes updated successfully for batch size "
|
||||
<< batchSize << " ✓\n" << std::endl;
|
||||
@@ -619,6 +720,7 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
|
||||
//
|
||||
// GpuMat-lifetime: preprocessedBuffers keeps GpuMats alive past the final
|
||||
// cudaStreamSynchronize, so cudaFree() doesn't stall the pipeline.
|
||||
auto _prepStart = std::chrono::steady_clock::now();
|
||||
cv::cuda::Stream cvInferStream = cv::cuda::StreamAccessor::wrapStream(m_inferenceStream);
|
||||
std::vector<cv::cuda::GpuMat> preprocessedBuffers;
|
||||
preprocessedBuffers.reserve(numInputs);
|
||||
@@ -647,6 +749,14 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
double _prepMs = std::chrono::duration<double, std::milli>(
|
||||
std::chrono::steady_clock::now() - _prepStart).count();
|
||||
if (_prepMs > 100.0) {
|
||||
ANS_DBG("TRT_SM100", "SLOW PREPROCESS: %.1fms batch=%d (blobFromGpuMats+D2D)", _prepMs, batchSize);
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// PRE-ALLOCATE OUTPUT STRUCTURE
|
||||
// ============================================================================
|
||||
@@ -690,6 +800,8 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
|
||||
if (canGraph) {
|
||||
auto& graphExec = m_graphExecs[batchSize]; // inserts nullptr on first access
|
||||
if (!graphExec) {
|
||||
ANS_DBG("TRT_Engine", "CUDA graph CAPTURE starting for batch=%d (cached=%zu)",
|
||||
batchSize, m_graphExecs.size());
|
||||
// First call for this batchSize -- capture a new graph.
|
||||
// Serialise captures across all Engine instances on this device to
|
||||
// prevent TRT's shared workspace from creating cross-stream
|
||||
@@ -727,9 +839,13 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
|
||||
if (cudaGraphInstantiate(&exec, graph, nullptr, nullptr, 0) == cudaSuccess)
|
||||
graphExec = exec;
|
||||
cudaGraphDestroy(graph);
|
||||
ANS_DBG("TRT_Engine", "CUDA graph CAPTURED OK for batch=%d exec=%p",
|
||||
batchSize, (void*)graphExec);
|
||||
}
|
||||
|
||||
if (!graphExec) {
|
||||
ANS_DBG("TRT_Engine", "CUDA graph capture FAILED for batch=%d — falling back to direct path",
|
||||
batchSize);
|
||||
std::cout << "Warning: CUDA graph capture failed for batchSize="
|
||||
<< batchSize << " -- falling back to direct inference path." << std::endl;
|
||||
// Disable graph acceleration for this Engine instance.
|
||||
@@ -740,9 +856,17 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
|
||||
}
|
||||
|
||||
if (graphExec) {
|
||||
ANS_DBG("TRT_Engine", "CUDA graph LAUNCH batch=%d exec=%p", batchSize, (void*)graphExec);
|
||||
// Launch the pre-captured graph (single API call replaces many).
|
||||
auto _graphStart = std::chrono::steady_clock::now();
|
||||
cudaGraphLaunch(graphExec, m_inferenceStream);
|
||||
cudaStreamSynchronize_Safe(m_inferenceStream);
|
||||
auto _graphEnd = std::chrono::steady_clock::now();
|
||||
double _graphMs = std::chrono::duration<double, std::milli>(_graphEnd - _graphStart).count();
|
||||
if (_graphMs > 500.0) {
|
||||
ANS_DBG("TRT_SM100", "SLOW GRAPH: %.1fms batch=%d active=%d",
|
||||
_graphMs, batchSize, s_globalActiveInf.load());
|
||||
}
|
||||
|
||||
// CPU memcpy: pinned buffers -> featureVectors (interleaved by batch).
|
||||
for (int batch = 0; batch < batchSize; ++batch) {
|
||||
@@ -762,8 +886,16 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
|
||||
// ----------------------
|
||||
// Used when pinned buffers are unavailable or graph capture failed.
|
||||
if (!graphUsed) {
|
||||
ANS_DBG("TRT_Engine", "Direct path (no graph) batch=%d", batchSize);
|
||||
auto enqueueStart = std::chrono::steady_clock::now();
|
||||
bool success = TRT_ENQUEUE(m_context.get(), m_inferenceStream, m_buffers);
|
||||
auto enqueueEnd = std::chrono::steady_clock::now();
|
||||
double enqueueMs = std::chrono::duration<double, std::milli>(enqueueEnd - enqueueStart).count();
|
||||
if (enqueueMs > 500.0) {
|
||||
ANS_DBG("TRT_Engine", "SLOW ENQUEUE: %.1fms batch=%d (enqueueV3 blocked!)", enqueueMs, batchSize);
|
||||
}
|
||||
if (!success) {
|
||||
ANS_DBG("TRT_Engine", "ERROR: enqueueV3 FAILED batch=%d", batchSize);
|
||||
std::string debugInfo = "[Engine] runInference FAIL: enqueue returned false, batch="
|
||||
+ std::to_string(batchSize)
|
||||
+ ", dimsSpecified=" + (m_context->allInputDimensionsSpecified() ? "YES" : "NO");
|
||||
@@ -805,8 +937,16 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
|
||||
}
|
||||
}
|
||||
|
||||
auto syncStart = std::chrono::steady_clock::now();
|
||||
cudaError_t syncErr = cudaStreamSynchronize_Safe(m_inferenceStream);
|
||||
auto syncEnd = std::chrono::steady_clock::now();
|
||||
double syncMs = std::chrono::duration<double, std::milli>(syncEnd - syncStart).count();
|
||||
if (syncMs > 500.0) {
|
||||
ANS_DBG("TRT_Engine", "SLOW INFERENCE SYNC: %.1fms batch=%d (direct path)", syncMs, batchSize);
|
||||
}
|
||||
if (syncErr != cudaSuccess) {
|
||||
ANS_DBG("TRT_Engine", "ERROR: cudaStreamSync FAILED err=%d (%s)",
|
||||
(int)syncErr, cudaGetErrorString(syncErr));
|
||||
std::string errMsg = "[Engine] runInference FAIL: cudaStreamSynchronize: "
|
||||
+ std::string(cudaGetErrorString(syncErr));
|
||||
std::cout << errMsg << std::endl;
|
||||
@@ -815,5 +955,33 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// SM=100% DETECTOR — end-of-inference timing
|
||||
// ============================================================================
|
||||
{
|
||||
static thread_local double s_ema = 0;
|
||||
static thread_local std::chrono::steady_clock::time_point s_prevEnd;
|
||||
static thread_local bool s_firstDone = false;
|
||||
|
||||
auto _now = std::chrono::steady_clock::now();
|
||||
if (s_firstDone) {
|
||||
double sinceLastMs = std::chrono::duration<double, std::milli>(_now - s_prevEnd).count();
|
||||
// If time between consecutive inferences jumps dramatically,
|
||||
// something blocked the thread (SM=100% or mutex contention)
|
||||
if (s_ema > 0 && sinceLastMs > s_ema * 3.0 && sinceLastMs > 500.0) {
|
||||
size_t vf = 0, vt = 0;
|
||||
cudaMemGetInfo(&vf, &vt);
|
||||
ANS_DBG("TRT_SM100", "GAP DETECTED: %.1fms between inferences (avg=%.1fms, %.1fx) active=%d VRAM=%zuMB free",
|
||||
sinceLastMs, s_ema, sinceLastMs / s_ema,
|
||||
s_globalActiveInf.load(), vf / (1024*1024));
|
||||
}
|
||||
s_ema = (s_ema == 0) ? sinceLastMs : (0.9 * s_ema + 0.1 * sinceLastMs);
|
||||
}
|
||||
s_prevEnd = _now;
|
||||
s_firstDone = true;
|
||||
|
||||
s_globalActiveInf.fetch_sub(1);
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -24,28 +24,32 @@ void Engine<T>::transformOutput(std::vector<std::vector<std::vector<T>>> &input,
|
||||
output = std::move(input[0][0]);
|
||||
}
|
||||
template <typename T>
|
||||
cv::cuda::GpuMat Engine<T>::resizeKeepAspectRatioPadRightBottom(const cv::cuda::GpuMat& input,
|
||||
cv::cuda::GpuMat Engine<T>::resizeKeepAspectRatioPadRightBottom(const cv::cuda::GpuMat& input,
|
||||
size_t height, size_t width,
|
||||
const cv::Scalar& bgcolor) {
|
||||
// Ensure input is valid
|
||||
if (input.empty()) {
|
||||
return cv::cuda::GpuMat();
|
||||
return cv::cuda::GpuMat();
|
||||
}
|
||||
// Create a CUDA stream
|
||||
cv::cuda::Stream stream;
|
||||
// Calculate aspect ratio and unpadded dimensions
|
||||
|
||||
// Use a thread_local stream to avoid creating a new CUDA stream per call.
|
||||
// Creating cv::cuda::Stream() each call leaks stream handles under WDDM.
|
||||
thread_local cv::cuda::Stream stream;
|
||||
|
||||
float r = std::min(static_cast<float>(width) / input.cols, static_cast<float>(height) / input.rows);
|
||||
size_t unpad_w = static_cast<size_t>(r * input.cols);
|
||||
size_t unpad_h = static_cast<size_t>(r * input.rows);
|
||||
|
||||
// Resize the input image
|
||||
cv::cuda::GpuMat re;
|
||||
re.create(unpad_h, unpad_w, input.type());
|
||||
re.create(static_cast<int>(unpad_h), static_cast<int>(unpad_w), input.type());
|
||||
cv::cuda::resize(input, re, re.size(), 0, 0, cv::INTER_LINEAR, stream);
|
||||
|
||||
// Create the output image and fill with the background color
|
||||
cv::cuda::GpuMat out;
|
||||
out.create(height, width, input.type());
|
||||
out.create(static_cast<int>(height), static_cast<int>(width), input.type());
|
||||
out.setTo(bgcolor, stream);
|
||||
// Copy the resized content into the top-left corner of the output image
|
||||
|
||||
// Copy the resized content into the top-left corner
|
||||
re.copyTo(out(cv::Rect(0, 0, re.cols, re.rows)), stream);
|
||||
stream.waitForCompletion();
|
||||
return out;
|
||||
@@ -195,41 +199,51 @@ cv::cuda::GpuMat Engine<T>::blobFromGpuMats(const std::vector<cv::cuda::GpuMat>
|
||||
const int W = batchInput[0].cols;
|
||||
const int batch = static_cast<int>(batchInput.size());
|
||||
const size_t planeSize = static_cast<size_t>(H) * W; // pixels per channel
|
||||
const int totalElems = batch * 3 * static_cast<int>(planeSize);
|
||||
|
||||
// Output blob: planar NCHW layout stored as a single-channel GpuMat.
|
||||
// Total elements = batch * 3 * H * W.
|
||||
cv::cuda::GpuMat blob(1, batch * 3 * static_cast<int>(planeSize), CV_32FC1);
|
||||
// thread_local cached buffers — reused across calls on the same thread.
|
||||
// KEY: allocate for MAX seen size, never shrink. This prevents the VRAM leak
|
||||
// caused by OpenCV's GpuMat pool growing unbounded when batch sizes alternate
|
||||
// (e.g., batch=1,6,1,6 → each size triggers new alloc, old goes to pool, never freed).
|
||||
thread_local cv::cuda::GpuMat tl_blob;
|
||||
thread_local cv::cuda::GpuMat tl_floatImg;
|
||||
thread_local int tl_blobMaxElems = 0;
|
||||
|
||||
if (totalElems > tl_blobMaxElems) {
|
||||
tl_blob = cv::cuda::GpuMat(1, totalElems, CV_32FC1);
|
||||
tl_blobMaxElems = totalElems;
|
||||
size_t blobBytes = static_cast<size_t>(totalElems) * sizeof(float);
|
||||
ANS_DBG("TRT_Preproc", "blobFromGpuMats: ALLOC blob batch=%d %dx%d %.1fMB (new max)",
|
||||
batch, W, H, blobBytes / (1024.0 * 1024.0));
|
||||
}
|
||||
// Use a sub-region of the cached blob for the current batch
|
||||
cv::cuda::GpuMat blob = tl_blob.colRange(0, totalElems);
|
||||
|
||||
for (int img = 0; img < batch; ++img) {
|
||||
// 1. Convert to float and normalise while still in HWC (interleaved) format.
|
||||
// Channel-wise subtract / divide operate correctly on interleaved data.
|
||||
cv::cuda::GpuMat floatImg;
|
||||
if (normalize) {
|
||||
batchInput[img].convertTo(floatImg, CV_32FC3, 1.f / 255.f, stream);
|
||||
batchInput[img].convertTo(tl_floatImg, CV_32FC3, 1.f / 255.f, stream);
|
||||
} else {
|
||||
batchInput[img].convertTo(floatImg, CV_32FC3, 1.0, stream);
|
||||
batchInput[img].convertTo(tl_floatImg, CV_32FC3, 1.0, stream);
|
||||
}
|
||||
|
||||
cv::cuda::subtract(floatImg, cv::Scalar(subVals[0], subVals[1], subVals[2]), floatImg, cv::noArray(), -1, stream);
|
||||
cv::cuda::divide(floatImg, cv::Scalar(divVals[0], divVals[1], divVals[2]), floatImg, 1, -1, stream);
|
||||
cv::cuda::subtract(tl_floatImg, cv::Scalar(subVals[0], subVals[1], subVals[2]), tl_floatImg, cv::noArray(), -1, stream);
|
||||
cv::cuda::divide(tl_floatImg, cv::Scalar(divVals[0], divVals[1], divVals[2]), tl_floatImg, 1, -1, stream);
|
||||
|
||||
// 2. Split normalised HWC image into CHW planes directly into the blob.
|
||||
size_t offset = static_cast<size_t>(img) * 3 * planeSize;
|
||||
|
||||
if (swapRB) {
|
||||
// BGR input -> RGB planes: B goes to plane 2, G to plane 1, R to plane 0
|
||||
std::vector<cv::cuda::GpuMat> channels{
|
||||
cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset + 2 * planeSize), // B -> plane 2
|
||||
cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset + planeSize), // G -> plane 1
|
||||
cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset)}; // R -> plane 0
|
||||
cv::cuda::split(floatImg, channels, stream);
|
||||
cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset + 2 * planeSize),
|
||||
cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset + planeSize),
|
||||
cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset)};
|
||||
cv::cuda::split(tl_floatImg, channels, stream);
|
||||
} else {
|
||||
// BGR input -> BGR planes: keep channel order
|
||||
std::vector<cv::cuda::GpuMat> channels{
|
||||
cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset),
|
||||
cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset + planeSize),
|
||||
cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset + 2 * planeSize)};
|
||||
cv::cuda::split(floatImg, channels, stream);
|
||||
cv::cuda::split(tl_floatImg, channels, stream);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -239,7 +253,6 @@ cv::cuda::GpuMat Engine<T>::blobFromGpuMats(const std::vector<cv::cuda::GpuMat>
|
||||
template <typename T> void Engine<T>::clearGpuBuffers() {
|
||||
if (!m_buffers.empty()) {
|
||||
// Free ALL I/O GPU buffers (both inputs and outputs).
|
||||
// Previously only outputs were freed, leaking input allocations from loadNetwork().
|
||||
for (void* ptr : m_buffers) {
|
||||
if (ptr) {
|
||||
Util::checkCudaErrorCode(cudaFree(ptr));
|
||||
@@ -247,4 +260,8 @@ template <typename T> void Engine<T>::clearGpuBuffers() {
|
||||
}
|
||||
m_buffers.clear();
|
||||
}
|
||||
|
||||
// Note: blob/floatImg caches are thread_local inside blobFromGpuMats (static method).
|
||||
// They are cleaned up automatically when threads exit.
|
||||
ANS_DBG("TRT_Engine", "clearGpuBuffers: I/O buffers released");
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user