// ---------------------------------------------------------------------------
// File:     ANSCORE/engines/TensorRTAPI/include/engine/EngineRunInference.inl
// Modified: 2026-04-21 09:26:02 +10:00
// NOTE: repository web-viewer chrome ("Raw Blame History", line/size counts,
// ambiguous-Unicode warning) removed — it is not valid C++ and broke the build.
// ---------------------------------------------------------------------------
#pragma once
#include <algorithm>
#include <atomic>
#include <chrono>
#include <cstring>
#include <filesystem>
#include <iomanip>
#include <iostream>
#include <mutex>
#include <semaphore>
#include <thread>
#include "TRTCompat.h"
#include "ANSLicense.h" // ANS_DBG macro for DebugView logging
#ifdef _WIN32
# ifndef WIN32_LEAN_AND_MEAN
# define WIN32_LEAN_AND_MEAN
# endif
# ifndef NOMINMAX
# define NOMINMAX
# endif
# include <windows.h>
# include <psapi.h>
# include <tlhelp32.h>
# pragma comment(lib, "psapi.lib")
#endif
// Process-wide mutex serialising CUDA graph capture.
// TRT's enqueueV3 uses shared internal resources (workspace, memory pools)
// at the CUDA context level. When two Engine instances on the same GPU
// capture graphs concurrently, these cross-stream dependencies violate
// graph capture rules ("operation not permitted when stream is capturing").
// NOTE(review): although captures only need serialising per device, this is
// a single global mutex shared by ALL devices — stricter than necessary but
// still correct. Subsequent cudaGraphLaunch calls are not serialised and
// remain fully concurrent.
// Meyers singleton: initialisation is thread-safe (C++11 magic statics).
static std::mutex& graphCaptureMutex() {
static std::mutex m;
return m;
}
// ============================================================================
// GPU INFERENCE THROTTLE
// ============================================================================
// Global counting semaphore that limits how many Engine instances can execute
// CUDA inference simultaneously. Without this, N separate Engine instances
// (one per camera) all submit GPU work at once, causing:
//   1. SM 100% saturation -> each inference takes 5-10x longer
//   2. GPU thermal throttling at 85C -> further slowdown
//   3. cudaStreamSynchronize blocking indefinitely -> system freeze
//
// Slot budget is derived from each GPU's total VRAM (~1 slot per 2 GB,
// clamped to [2, 6] per GPU), summed across all GPUs, then clamped to the
// semaphore's compile-time ceiling of 64:
//   <= 4 GB -> 2 concurrent      8 GB  -> 4 concurrent
//      6 GB -> 3 concurrent     12+ GB -> 6 concurrent
//
// Excess threads wait on the CPU (nearly zero cost) while the bounded set
// runs efficiently on the GPU without thermal throttling.
//
// Returns: process-wide semaphore (Meyers singleton, thread-safe init).
static std::counting_semaphore<64>& gpuInferenceSemaphore() {
    static int maxConcurrent = []() {
        int gpuCount = 0;
        if (cudaGetDeviceCount(&gpuCount) != cudaSuccess || gpuCount <= 0) {
            return 4; // no visible GPU (or query failed): conservative fallback
        }
        // FIX: the probe loop below calls cudaSetDevice(i) per GPU. The
        // original left the LAST GPU selected as a side effect of first use,
        // silently redirecting the caller's subsequent CUDA work. Remember
        // and restore the caller's current device.
        int callerDevice = -1;
        cudaGetDevice(&callerDevice);
        int totalSlots = 0;
        for (int i = 0; i < gpuCount; ++i) {
            size_t freeMem = 0, totalMem = 0;
            if (cudaSetDevice(i) != cudaSuccess ||
                cudaMemGetInfo(&freeMem, &totalMem) != cudaSuccess) {
                totalSlots += 2; // unprobeable GPU: assume the minimum budget
                continue;
            }
            int gbTotal = static_cast<int>(totalMem / (1024ULL * 1024ULL * 1024ULL));
            // Scale concurrency with VRAM: ~1 slot per 2 GB, min 2, max 6 per GPU
            totalSlots += std::clamp(gbTotal / 2, 2, 6);
        }
        if (callerDevice >= 0) {
            cudaSetDevice(callerDevice); // restore the caller's device selection
        }
        totalSlots = std::clamp(totalSlots, 2, 64);
        std::cout << "Info [GPU Throttle]: max concurrent inferences = "
                  << totalSlots << " (across " << gpuCount << " GPU(s))" << std::endl;
        return totalSlots;
    }();
    static std::counting_semaphore<64> sem(maxConcurrent);
    return sem;
}
// RAII guard for the GPU inference throttle.
// Construction blocks until a semaphore slot is free; destruction returns
// the slot. Copying is forbidden so a slot can never be released twice
// (declaring the deleted copy operations also suppresses implicit moves).
struct GpuInferenceGuard {
    GpuInferenceGuard(const GpuInferenceGuard&) = delete;
    GpuInferenceGuard& operator=(const GpuInferenceGuard&) = delete;
    GpuInferenceGuard() { gpuInferenceSemaphore().acquire(); }
    ~GpuInferenceGuard() { gpuInferenceSemaphore().release(); }
};
// ============================================================================
// WDDM-SAFE STREAM SYNCHRONIZATION
// ============================================================================
// Under Windows WDDM, cudaStreamSynchronize calls cuStreamQuery in a tight
// loop with SwitchToThread, holding nvcuda64's internal SRW lock the entire
// time. When the GPU is busy with inference, this spin blocks ALL other CUDA
// operations — including HW video decode (nvcuvid), cuMemAlloc, cuArrayDestroy.
// If a camera Reconnect or decode buffer allocation needs an exclusive SRW lock
// while inference is spinning, the entire system deadlocks.
//
// This function replaces cudaStreamSynchronize with a polling loop that
// explicitly releases the SRW lock between queries by sleeping briefly,
// allowing other CUDA operations to interleave with the sync wait.
//
// FIX: the original used raw Win32 Sleep(0)/Sleep(1) unguarded, while
// <windows.h> is only included under _WIN32 — breaking any non-Windows
// build of this header. std::this_thread::yield() / sleep_for(1ms) are the
// portable equivalents (yield maps to SwitchToThread on Windows).
//
// Returns: cudaSuccess once the stream drains, or the stream's error code.
static inline cudaError_t cudaStreamSynchronize_Safe(cudaStream_t stream) {
    // Fast path: already done — no sleep overhead for quick kernels.
    cudaError_t err = cudaStreamQuery(stream);
    if (err != cudaErrorNotReady) return err;
    const auto syncStart = std::chrono::steady_clock::now();
    // Short yield fast path (~10 iterations) catches sub-ms kernel completions.
    for (int i = 0; i < 10; ++i) {
        std::this_thread::yield();
        err = cudaStreamQuery(stream);
        if (err != cudaErrorNotReady) return err;
    }
    // 1 ms sleeps — negligible latency at 30 FPS, but releases the driver's
    // SRW lock between queries and so prevents lock starvation.
    int sleepCount = 0;
    for (;;) {
        std::this_thread::sleep_for(std::chrono::milliseconds(1));
        ++sleepCount;
        err = cudaStreamQuery(stream);
        if (err != cudaErrorNotReady) {
            // Log if the sync took too long (>500 ms indicates a GPU stall).
            const double elapsed = std::chrono::duration<double, std::milli>(
                std::chrono::steady_clock::now() - syncStart).count();
            if (elapsed > 500.0) {
                ANS_DBG("TRT_Engine", "SLOW SYNC: %.1fms (%d sleeps) stream=%p err=%d",
                        elapsed, sleepCount, (void*)stream, (int)err);
            }
            return err;
        }
    }
}
// Warm up the engine so inference-time latency is predictable.
//
// For a dynamic-batch engine, every batch size 1..maxBatchSize is exercised:
// the first call per size triggers kernel compilation (100-300 ms penalty)
// and the second captures the CUDA graph. Fixed-batch engines warm only
// their single batch size. The optBatchSize gets the full `iterations`
// count so a steady-state average can be printed as a diagnostic.
//
// @param iterations  Number of inference iterations for the optBatchSize
//                    (other sizes run 2: compile + graph capture).
template <typename T>
void Engine<T>::warmUp(int iterations) {
    if (m_verbose) {
        std::cout << "\n========================================" << std::endl;
        std::cout << "Engine Warmup" << std::endl;
        std::cout << "========================================" << std::endl;
    }
    // Determine batch sizes to warm up.
    std::vector<int> batchSizes;
    if (m_options.maxBatchSize > 1) {
        if (m_verbose) {
            std::cout << "Dynamic batch engine detected (max batch: " << m_options.maxBatchSize << ")" << std::endl;
            std::cout << "Warming up common batch sizes to pre-compile kernels..." << std::endl;
        }
        // Warm ALL sizes 1..maxBatchSize so no first-use compilation penalty
        // remains at inference time and every CUDA graph is pre-captured.
        for (int batch = 1; batch <= m_options.maxBatchSize; ++batch) {
            batchSizes.push_back(batch);
        }
    }
    else {
        if (m_verbose) std::cout << "Fixed batch engine detected (batch size: " << m_options.maxBatchSize << ")" << std::endl;
        batchSizes.push_back(m_options.maxBatchSize);
    }
    if (m_verbose) {
        std::cout << "Batch sizes to warm up: ";
        for (size_t i = 0; i < batchSizes.size(); ++i) {
            std::cout << batchSizes[i];
            if (i < batchSizes.size() - 1) std::cout << ", ";
        }
        std::cout << std::endl;
    }
    for (int batchSize : batchSizes) {
        const int iters = (batchSize == m_options.optBatchSize) ? iterations : 2;
        if (m_verbose) std::cout << "\nWarming up batch=" << batchSize << " (x" << iters << " iterations)..." << std::endl;
        // Create dummy inputs for this batch size.
        std::vector<std::vector<cv::cuda::GpuMat>> dummyInputs;
        for (size_t i = 0; i < m_inputDims.size(); ++i) {
            const auto& dims = m_inputDims[i];
            std::vector<cv::cuda::GpuMat> batch;
            // For dynamic spatial dims (-1), warm with the opt dimensions.
            int warmH = (dims.d[1] > 0) ? dims.d[1] : m_options.optInputHeight;
            int warmW = (dims.d[2] > 0) ? dims.d[2] : m_options.optInputWidth;
            for (int b = 0; b < batchSize; ++b) {
                // Build a mid-grey image on CPU, then upload to GPU.
                cv::Mat cpuImg(warmH, warmW, CV_32FC(dims.d[0]), cv::Scalar(0.5f, 0.5f, 0.5f));
                cv::cuda::GpuMat gpuImg;
                gpuImg.upload(cpuImg);
                batch.push_back(gpuImg);
            }
            dummyInputs.push_back(batch);
        }
        std::vector<std::vector<std::vector<T>>> dummyOutputs;
        // Time the first iteration — kernel compilation happens here.
        auto start = std::chrono::high_resolution_clock::now();
        bool firstSuccess = runInference(dummyInputs, dummyOutputs);
        auto end = std::chrono::high_resolution_clock::now();
        auto firstTime = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
        if (!firstSuccess) {
            if (m_verbose) std::cout << " ✗ ERROR: First iteration failed for batch=" << batchSize << std::endl;
            continue;
        }
        if (m_verbose) {
            std::cout << " First iteration: " << firstTime << " ms";
            if (firstTime > 100) {
                std::cout << " (kernel compilation detected)";
            }
            std::cout << std::endl;
        }
        // Run remaining iterations to measure stable performance.
        if (iters > 1) {
            // FIX: the original divided total time by (iters - 1) even when
            // the loop broke early on a failed inference, under-reporting
            // the average. Count the iterations that actually completed.
            int completed = 0;
            auto iterStart = std::chrono::high_resolution_clock::now();
            for (int i = 1; i < iters; ++i) {
                bool success = runInference(dummyInputs, dummyOutputs);
                if (!success) {
                    if (m_verbose) std::cout << " ✗ ERROR: Iteration " << i << " failed" << std::endl;
                    break;
                }
                ++completed;
            }
            auto iterEnd = std::chrono::high_resolution_clock::now();
            auto totalTime = std::chrono::duration_cast<std::chrono::milliseconds>(iterEnd - iterStart).count();
            if (completed > 0) {
                float avgTime = totalTime / static_cast<float>(completed);
                if (m_verbose) {
                    std::cout << " Subsequent iterations (avg): " << std::fixed << std::setprecision(1)
                              << avgTime << " ms" << std::endl;
                    if (firstTime > 100 && avgTime < firstTime * 0.5f) {
                        float speedup = firstTime / avgTime;
                        std::cout << " ✓ Speedup after warmup: " << std::fixed << std::setprecision(1)
                                  << speedup << "x faster" << std::endl;
                    }
                }
            }
        }
        if (m_verbose) std::cout << " ✓ Batch=" << batchSize << " warmed up successfully" << std::endl;
    }
    if (m_verbose) {
        std::cout << "\n========================================" << std::endl;
        std::cout << "Warmup Complete!" << std::endl;
        std::cout << "========================================" << std::endl;
        std::cout << "Kernels pre-compiled for all batch sizes." << std::endl;
        std::cout << "========================================\n" << std::endl;
    }
}
template <typename T>
bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& inputs,std::vector<std::vector<std::vector<T>>>& featureVectors) {
// ============================================================================
// MULTI-GPU POOL DISPATCH
// ============================================================================
// If this Engine was initialised with initializePool() / initializePoolFromEngine()
// the m_slots vector is non-empty. In that case, delegate to the pool
// dispatcher which acquires the first idle slot and runs inference there.
// This branch is NEVER taken for single-GPU use (buildLoadNetwork / loadNetwork).
if (!m_slots.empty()) {
return runInferenceFromPool(inputs, featureVectors);
}
// ============================================================================
// GPU INFERENCE THROTTLE
// ============================================================================
// Limit how many Engine instances can run CUDA inference simultaneously.
// Without this, 12 cameras each with their own Engine all submit GPU work
// at once → SM 100% → thermal throttle → cudaStreamSynchronize hangs.
// The semaphore lets excess threads wait on CPU (nearly zero cost) while
// a bounded number use the GPU efficiently.
GpuInferenceGuard gpuThrottle;
// ============================================================================
// SINGLE-ENGINE SERIALISATION
// ============================================================================
// The single Engine instance has shared mutable state (m_buffers, m_lastBatchSize,
// m_inferenceStream, TRT execution context). If two LabVIEW threads call
// runInference concurrently with different batch sizes, one will overwrite
// the input shapes and buffers while the other is mid-inference, causing a
// fatal "illegal memory access" that permanently corrupts the CUDA context.
//
// Pool-mode slots have their own busy-flag dispatch so they do NOT need this.
auto _mutexWaitStart = std::chrono::steady_clock::now();
std::lock_guard<std::mutex> inferenceLock(m_inferenceMutex);
auto _mutexAcquired = std::chrono::steady_clock::now();
double _mutexWaitMs = std::chrono::duration<double, std::milli>(_mutexAcquired - _mutexWaitStart).count();
if (_mutexWaitMs > 50.0) {
ANS_DBG("TRT_Engine", "MUTEX WAIT: %.1fms (queued behind another inference)", _mutexWaitMs);
}
// ============================================================================
// THREAD-SAFE GPU CONTEXT
// ============================================================================
// Ensure the calling thread's CUDA device matches this engine's GPU.
// This is essential for multi-GPU round-robin: LabVIEW reuses threads
// across tasks, so a thread that last ran inference on GPU 1 might now
// be running a task on GPU 0. Without this, cv::cuda::GpuMat allocations
// and kernel launches would target the wrong GPU, causing result corruption.
// Skip cudaSetDevice if already on the correct device — under WDDM
// with multiple GPUs each call costs 1-5ms of scheduler overhead.
{
int currentDev = -1;
cudaGetDevice(&currentDev);
if (currentDev != m_options.deviceIndex) {
cudaSetDevice(m_options.deviceIndex);
}
}
// ============================================================================
// DEBUG: First call diagnostics (per-instance, not process-wide)
// ============================================================================
if (m_verbose && m_firstInferenceCall) {
std::cout << "\n=== First runInference Call ===" << std::endl;
std::cout << "Number of input tensors: " << inputs.size() << std::endl;
for (size_t i = 0; i < inputs.size(); ++i) {
std::cout << "Input " << i << " batch size: " << inputs[i].size() << std::endl;
if (!inputs[i].empty()) {
const auto& img = inputs[i][0];
std::cout << " Image shape: " << img.cols << "x" << img.rows
<< "x" << img.channels() << " (type: " << img.type() << ")" << std::endl;
}
}
// Print optimization profile information
std::cout << "\n=== Engine Profile Information ===" << std::endl;
std::cout << "Number of optimization profiles: "
<< m_engine->getNbOptimizationProfiles() << std::endl;
if (m_engine->getNbOptimizationProfiles() > 0) {
for (int profile = 0; profile < m_engine->getNbOptimizationProfiles(); ++profile) {
std::cout << "\n--- Profile " << profile << " ---" << std::endl;
for (size_t i = 0; i < m_IOTensorNames.size(); ++i) {
const char* tensorName = m_IOTensorNames[i].c_str();
// Check if this is an input tensor
auto ioMode = m_engine->getTensorIOMode(tensorName);
if (ioMode != nvinfer1::TensorIOMode::kINPUT) {
continue;
}
auto minDims = m_engine->getProfileShape(tensorName, profile,
nvinfer1::OptProfileSelector::kMIN);
auto optDims = m_engine->getProfileShape(tensorName, profile,
nvinfer1::OptProfileSelector::kOPT);
auto maxDims = m_engine->getProfileShape(tensorName, profile,
nvinfer1::OptProfileSelector::kMAX);
std::cout << "Tensor '" << tensorName << "' (INPUT):" << std::endl;
std::cout << " Min: [" << minDims.d[0];
for (int j = 1; j < minDims.nbDims; ++j) std::cout << "," << minDims.d[j];
std::cout << "]" << std::endl;
std::cout << " Opt: [" << optDims.d[0];
for (int j = 1; j < optDims.nbDims; ++j) std::cout << "," << optDims.d[j];
std::cout << "]" << std::endl;
std::cout << " Max: [" << maxDims.d[0];
for (int j = 1; j < maxDims.nbDims; ++j) std::cout << "," << maxDims.d[j];
std::cout << "]" << std::endl;
}
}
}
if (!m_context->allInputDimensionsSpecified()) {
std::cout << "ERROR: Input dimensions not specified in context!" << std::endl;
return false;
}
std::cout << "\nContext state: All dimensions specified ✓" << std::endl;
m_firstInferenceCall = false;
}
// ============================================================================
// INPUT VALIDATION
// ============================================================================
if (inputs.empty() || inputs[0].empty()) {
std::cout << "Error: Empty input" << std::endl;
return false;
}
// ============================================================================
// SM=100% DETECTOR — tracks inference timing trends to catch the exact
// moment GPU becomes saturated. Logs every 50 inferences with rolling
// average, and immediately when degradation is detected.
// ============================================================================
// Global (process-wide) counters shared across all engine instances/threads
static std::atomic<int64_t> s_globalInfCount{0};
static std::atomic<int> s_globalActiveInf{0}; // currently in-flight inferences
static std::atomic<double> s_globalLastAvgMs{0.0}; // last known avg inference time
const int64_t myInfNum = s_globalInfCount.fetch_add(1) + 1;
s_globalActiveInf.fetch_add(1);
// ── Process-wide host-RAM heartbeat (once per ~60s) ──────────────────────
// Diagnostic for long-run leak hunts: if [PROC_MEM] privateMB climbs while
// [TRT_SM100] VRAM stays flat, the leak is on the host side (FFmpeg
// contexts, RTSP threads, GDI objects). Cheap when not firing — single
// atomic load + one compare in the hot path.
#ifdef _WIN32
{
using clk = std::chrono::steady_clock;
static std::atomic<int64_t> s_hbLastNs{0};
const int64_t nowNs = clk::now().time_since_epoch().count();
int64_t prev = s_hbLastNs.load(std::memory_order_relaxed);
constexpr int64_t kIntervalNs = 60LL * 1'000'000'000LL;
if (nowNs - prev >= kIntervalNs &&
s_hbLastNs.compare_exchange_strong(prev, nowNs,
std::memory_order_relaxed)) {
PROCESS_MEMORY_COUNTERS_EX pmc{};
pmc.cb = sizeof(pmc);
GetProcessMemoryInfo(GetCurrentProcess(),
reinterpret_cast<PROCESS_MEMORY_COUNTERS*>(&pmc),
sizeof(pmc));
DWORD gdi = GetGuiResources(GetCurrentProcess(), GR_GDIOBJECTS);
DWORD usr = GetGuiResources(GetCurrentProcess(), GR_USEROBJECTS);
// Thread count via Toolhelp snapshot (filter to current PID).
DWORD threads = 0;
HANDLE snap = CreateToolhelp32Snapshot(TH32CS_SNAPTHREAD, 0);
if (snap != INVALID_HANDLE_VALUE) {
THREADENTRY32 te{ sizeof(te) };
const DWORD pid = GetCurrentProcessId();
if (Thread32First(snap, &te)) {
do {
if (te.th32OwnerProcessID == pid) ++threads;
} while (Thread32Next(snap, &te));
}
CloseHandle(snap);
}
ANS_DBG("PROC_MEM",
"privateMB=%llu workingMB=%llu peakWorkingMB=%llu "
"pagefileMB=%llu gdi=%lu user=%lu threads=%lu",
(unsigned long long)(pmc.PrivateUsage >> 20),
(unsigned long long)(pmc.WorkingSetSize >> 20),
(unsigned long long)(pmc.PeakWorkingSetSize >> 20),
(unsigned long long)(pmc.PagefileUsage >> 20),
(unsigned long)gdi, (unsigned long)usr,
(unsigned long)threads);
}
}
#endif
// Per-thread tracking
{
static thread_local int64_t s_infCount = 0;
static thread_local std::chrono::steady_clock::time_point s_lastLog;
static thread_local double s_rollingAvgMs = 0.0;
static thread_local double s_baselineMs = 0.0; // avg during first 100 inferences
static thread_local double s_maxMs = 0.0;
static thread_local bool s_degradationLogged = false;
s_infCount++;
if (s_infCount == 1) {
s_lastLog = std::chrono::steady_clock::now();
ANS_DBG("TRT_SM100", "FIRST inference — engine alive, globalInf=%lld", myInfNum);
}
// Log every 50 inferences (more frequent than 500 to catch transitions)
if (s_infCount % 50 == 0) {
auto now = std::chrono::steady_clock::now();
double elapsed = std::chrono::duration<double>(now - s_lastLog).count();
double fps = (elapsed > 0) ? (50.0 / elapsed) : 0;
s_lastLog = now;
size_t vramFree = 0, vramTotal = 0;
cudaMemGetInfo(&vramFree, &vramTotal);
size_t vramUsedMB = (vramTotal - vramFree) / (1024 * 1024);
size_t vramFreeMB = vramFree / (1024 * 1024);
ANS_DBG("TRT_SM100", "#%lld [global=%lld active=%d] %.1f inf/sec avgMs=%.1f maxMs=%.1f batch=%d graphs=%zu VRAM=%zuMB/%zuMB",
s_infCount, myInfNum, s_globalActiveInf.load(),
fps, s_rollingAvgMs, s_maxMs,
(int)inputs[0].size(), m_graphExecs.size(),
vramUsedMB, vramFreeMB);
// Capture baseline from first 100 inferences
if (s_infCount == 100) {
s_baselineMs = s_rollingAvgMs;
ANS_DBG("TRT_SM100", "BASELINE established: %.1fms/inference", s_baselineMs);
}
// Detect degradation: avg >3x baseline AND baseline is set
if (s_baselineMs > 0 && s_rollingAvgMs > s_baselineMs * 3.0 && !s_degradationLogged) {
s_degradationLogged = true;
ANS_DBG("TRT_SM100", "*** DEGRADATION DETECTED *** avg=%.1fms baseline=%.1fms (%.1fx) VRAM=%zuMB/%zuMB active=%d",
s_rollingAvgMs, s_baselineMs, s_rollingAvgMs / s_baselineMs,
vramUsedMB, vramFreeMB, s_globalActiveInf.load());
}
// Reset max for next window
s_maxMs = 0.0;
}
}
const auto numInputs = m_inputDims.size();
if (inputs.size() != numInputs) {
std::cout << "Error: Wrong number of inputs. Expected: " << numInputs
<< ", Got: " << inputs.size() << std::endl;
return false;
}
const auto batchSize = static_cast<int32_t>(inputs[0].size());
if (batchSize > m_options.maxBatchSize) {
std::cout << "Error: Batch size " << batchSize << " exceeds maximum "
<< m_options.maxBatchSize << std::endl;
return false;
}
if (batchSize < 1) {
std::cout << "Error: Batch size must be at least 1" << std::endl;
return false;
}
// Validate batch size consistency across all inputs
for (size_t i = 1; i < inputs.size(); ++i) {
if (inputs[i].size() != static_cast<size_t>(batchSize)) {
std::cout << "Error: Inconsistent batch sizes across inputs. Input 0: "
<< batchSize << ", Input " << i << ": " << inputs[i].size() << std::endl;
return false;
}
}
// ============================================================================
// STREAM GUARD
// ============================================================================
// m_inferenceStream is now created eagerly in loadNetwork() so it is always
// valid here. Guard against the (unlikely) edge case where runInference is
// called before loadNetwork succeeds.
if (!m_streamInitialized || !m_inferenceStream) {
return false;
}
// ============================================================================
// SET INPUT SHAPES (batch size changed OR dynamic spatial dims need updating)
// ============================================================================
// Fast path: compute desired dims first, then compare against cached dims.
// This avoids all TRT API calls when the shape hasn't actually changed —
// critical for the recognizer which is called ~50-100x per image with
// dynamic width but often the same or similar widths.
// ============================================================================
{
// Lazily initialise the dims cache on first call
if (m_lastSetInputDims.empty()) {
m_lastSetInputDims.resize(numInputs);
for (size_t i = 0; i < numInputs; ++i) {
m_lastSetInputDims[i].nbDims = 0; // force mismatch on first call
}
}
// Build desired dims for every input tensor (cheap — no TRT API calls)
bool anyDimChanged = (m_lastBatchSize != batchSize);
std::vector<nvinfer1::Dims> desiredDims(numInputs);
for (size_t i = 0; i < numInputs; ++i) {
nvinfer1::Dims& nd = desiredDims[i];
nd.nbDims = 4;
nd.d[0] = batchSize;
nd.d[1] = m_inputDims[i].d[0]; // channels
if (m_hasDynamicSpatialDims && !inputs[i].empty()) {
const auto& firstImg = inputs[i][0];
nd.d[2] = (m_inputDims[i].d[1] == -1) ? firstImg.rows : m_inputDims[i].d[1];
nd.d[3] = (m_inputDims[i].d[2] == -1) ? firstImg.cols : m_inputDims[i].d[2];
} else {
nd.d[2] = m_inputDims[i].d[1];
nd.d[3] = m_inputDims[i].d[2];
}
// Compare with cached
if (!anyDimChanged) {
const auto& cached = m_lastSetInputDims[i];
if (cached.nbDims != nd.nbDims ||
cached.d[0] != nd.d[0] || cached.d[1] != nd.d[1] ||
cached.d[2] != nd.d[2] || cached.d[3] != nd.d[3]) {
anyDimChanged = true;
}
}
}
if (anyDimChanged) {
ANS_DBG("TRT_Engine", "Shape change detected: batch %d -> %d (graphsCached=%zu)",
m_lastBatchSize, batchSize, m_graphExecs.size());
// === First-time diagnostics (verbose, once) ===
const bool firstTime = !m_batchShapeChangeLogged;
if (m_verbose && firstTime) {
std::cout << "\nInfo: Batch size change: " << m_lastBatchSize
<< " -> " << batchSize << std::endl;
}
// Set optimization profile (only when truly needed)
if (m_engine->getNbOptimizationProfiles() > 0) {
int currentProfile = m_context->getOptimizationProfile();
if (currentProfile != 0 || m_lastBatchSize < 0) {
if (m_verbose && firstTime) {
std::cout << " Setting optimization profile to 0..." << std::endl;
}
if (!m_context->setOptimizationProfileAsync(0, m_inferenceStream)) {
std::cout << "Error: Failed to set optimization profile 0" << std::endl;
return false;
}
cudaError_t syncErr = cudaStreamSynchronize_Safe(m_inferenceStream);
if (syncErr != cudaSuccess) {
std::cout << "Error: Failed to sync after profile change: "
<< cudaGetErrorString(syncErr) << std::endl;
return false;
}
if (m_verbose && firstTime) {
std::cout << " Optimization profile set successfully" << std::endl;
}
}
}
// Update shapes for input tensors that actually changed
for (size_t i = 0; i < numInputs; ++i) {
const char* tensorName = m_IOTensorNames[i].c_str();
// Skip non-input tensors
auto ioMode = m_engine->getTensorIOMode(tensorName);
if (ioMode != nvinfer1::TensorIOMode::kINPUT) continue;
const nvinfer1::Dims& newDims = desiredDims[i];
const nvinfer1::Dims& cached = m_lastSetInputDims[i];
// Skip this tensor if its dims haven't changed
if (cached.nbDims == newDims.nbDims &&
cached.d[0] == newDims.d[0] && cached.d[1] == newDims.d[1] &&
cached.d[2] == newDims.d[2] && cached.d[3] == newDims.d[3]) {
continue;
}
// First-time verbose diagnostics
if (m_verbose && firstTime) {
std::cout << "\n Processing tensor " << i << ": '" << tensorName << "'" << std::endl;
// Validate batch size range (first time only)
if (m_engine->getNbOptimizationProfiles() > 0) {
int profileIndex = m_context->getOptimizationProfile();
nvinfer1::Dims minDims = m_engine->getProfileShape(
tensorName, profileIndex, nvinfer1::OptProfileSelector::kMIN);
nvinfer1::Dims maxDims = m_engine->getProfileShape(
tensorName, profileIndex, nvinfer1::OptProfileSelector::kMAX);
std::cout << " Profile batch range: [" << minDims.d[0]
<< " to " << maxDims.d[0] << "]" << std::endl;
if (batchSize < minDims.d[0] || batchSize > maxDims.d[0]) {
std::cout << "Error: Batch size " << batchSize
<< " outside profile range" << std::endl;
return false;
}
}
auto currentShape = m_context->getTensorShape(tensorName);
std::cout << " Current context shape: [";
for (int j = 0; j < currentShape.nbDims; ++j) {
if (j > 0) std::cout << ", ";
std::cout << currentShape.d[j];
}
std::cout << "]" << std::endl;
std::cout << " Setting new shape: [" << newDims.d[0] << ", "
<< newDims.d[1] << ", " << newDims.d[2] << ", "
<< newDims.d[3] << "]" << std::endl;
}
ANS_DBG("TRT_Engine", "setInputShape('%s') [%d,%d,%d,%d]",
tensorName, newDims.d[0], newDims.d[1], newDims.d[2], newDims.d[3]);
if (!m_context->setInputShape(tensorName, newDims)) {
ANS_DBG("TRT_Engine", "ERROR: setInputShape FAILED for '%s'", tensorName);
std::cout << "Error: Failed to set input shape for '" << tensorName << "'" << std::endl;
return false;
}
// Verify shape (first time only — trust the API on hot path)
if (firstTime) {
auto verifyShape = m_context->getTensorShape(tensorName);
if (verifyShape.d[0] != batchSize) {
std::cout << "Error: Shape change didn't take effect. Expected batch "
<< batchSize << ", got " << verifyShape.d[0] << std::endl;
return false;
}
if (m_verbose) {
std::cout << " Shape updated successfully" << std::endl;
}
}
m_lastSetInputDims[i] = newDims;
}
// Verify all input dimensions specified (first time only)
if (firstTime) {
if (!m_context->allInputDimensionsSpecified()) {
std::cout << "Error: Not all input dimensions specified after shape change" << std::endl;
for (size_t i = 0; i < m_IOTensorNames.size(); ++i) {
auto shape = m_context->getTensorShape(m_IOTensorNames[i].c_str());
std::cout << " " << m_IOTensorNames[i] << ": [";
for (int j = 0; j < shape.nbDims; ++j) {
if (j > 0) std::cout << ", ";
std::cout << shape.d[j];
}
std::cout << "]" << std::endl;
}
return false;
}
}
m_lastBatchSize = batchSize;
m_batchShapeChangeLogged = true;
// CRITICAL: Invalidate all cached CUDA graphs after shape change.
// Graphs were captured with the OLD context state (old tensor shapes).
// Launching them after setInputShape() produces undefined GPU behavior
// (invalid kernel sequences, SM lockup at 100%, inference hang).
if (!m_graphExecs.empty()) {
size_t destroyed = m_graphExecs.size();
for (auto& [bs, ge] : m_graphExecs) {
if (ge) cudaGraphExecDestroy(ge);
}
m_graphExecs.clear();
ANS_DBG("TRT_Engine", "INVALIDATED %zu cached CUDA graphs after shape change (batch=%d)",
destroyed, batchSize);
if (m_verbose || firstTime) {
std::cout << "Info: Invalidated " << destroyed
<< " cached CUDA graphs after shape change" << std::endl;
}
}
if (m_verbose && firstTime) {
std::cout << "\nInfo: Input shapes updated successfully for batch size "
<< batchSize << "\n" << std::endl;
}
}
}
// ============================================================================
// PREPROCESS AND COPY INPUTS TO GPU BUFFERS
// ============================================================================
// Pass 1: Validate all input dimensions before any GPU work.
// Dynamic dims (-1) are skipped in validation (they accept any size).
for (size_t i = 0; i < numInputs; ++i) {
const auto& batchInput = inputs[i];
const auto& dims = m_inputDims[i];
if (!batchInput.empty()) {
const auto& firstImg = batchInput[0];
bool mismatch = false;
if (dims.d[0] > 0 && firstImg.channels() != dims.d[0]) mismatch = true;
if (dims.d[1] > 0 && firstImg.rows != dims.d[1]) mismatch = true;
if (dims.d[2] > 0 && firstImg.cols != dims.d[2]) mismatch = true;
if (mismatch) {
std::cout << "Error: Input " << i << " dimension mismatch!" << std::endl;
std::cout << " Expected: " << dims.d[2] << "x" << dims.d[1]
<< "x" << dims.d[0] << " (WxHxC, -1=dynamic)" << std::endl;
std::cout << " Got: " << firstImg.cols << "x" << firstImg.rows
<< "x" << firstImg.channels() << " (WxHxC)" << std::endl;
return false;
}
}
}
// Pass 2: Preprocess + D2D copies — all on m_inferenceStream (no null stream).
//
// All OpenCV CUDA ops (convertTo, subtract, divide, split) in blobFromGpuMats
// now run on m_inferenceStream via the cv::cuda::Stream wrapper. This means:
// • No null-stream interaction — eliminates global sync barriers on WDDM
// • No event bridge needed — same-stream ordering guarantees correctness
// • CUDA graphs are safe — cv::cuda::split runs BEFORE graph capture
//
// GpuMat-lifetime: preprocessedBuffers keeps GpuMats alive past the final
// cudaStreamSynchronize, so cudaFree() doesn't stall the pipeline.
auto _prepStart = std::chrono::steady_clock::now();
cv::cuda::Stream cvInferStream = cv::cuda::StreamAccessor::wrapStream(m_inferenceStream);
std::vector<cv::cuda::GpuMat> preprocessedBuffers;
preprocessedBuffers.reserve(numInputs);
for (size_t i = 0; i < numInputs; ++i) {
const auto& batchInput = inputs[i];
// Preprocess on m_inferenceStream (not the null stream).
preprocessedBuffers.push_back(
blobFromGpuMats(batchInput, m_subVals, m_divVals, m_normalize, false, cvInferStream));
// D2D copy: same stream as preprocessing, so ordering is guaranteed.
const auto& blobMat = preprocessedBuffers.back();
const size_t copySize = static_cast<size_t>(blobMat.rows) * static_cast<size_t>(blobMat.cols) * blobMat.elemSize();
cudaError_t copyErr = cudaMemcpyAsync(
m_buffers[i],
preprocessedBuffers.back().ptr<void>(),
copySize,
cudaMemcpyDeviceToDevice,
m_inferenceStream);
if (copyErr != cudaSuccess) {
std::cout << "Error: Failed to copy input " << i
<< " to inference buffer: " << cudaGetErrorString(copyErr) << std::endl;
return false;
}
}
{
double _prepMs = std::chrono::duration<double, std::milli>(
std::chrono::steady_clock::now() - _prepStart).count();
if (_prepMs > 100.0) {
ANS_DBG("TRT_SM100", "SLOW PREPROCESS: %.1fms batch=%d (blobFromGpuMats+D2D)", _prepMs, batchSize);
}
}
// ============================================================================
// PRE-ALLOCATE OUTPUT STRUCTURE
// ============================================================================
const size_t numOutputs = m_outputLengths.size();
featureVectors.clear();
featureVectors.resize(batchSize);
for (int batch = 0; batch < batchSize; ++batch) {
featureVectors[batch].resize(numOutputs);
for (size_t outputIdx = 0; outputIdx < numOutputs; ++outputIdx)
featureVectors[batch][outputIdx].resize(m_outputLengths[outputIdx]);
}
// ============================================================================
// RUN INFERENCE + COPY OUTPUTS (CUDA Graph path or direct path)
// ============================================================================
// CUDA Graph path
// ---------------
// On the first call for a given batchSize we capture enqueueV3 + D2H copies
// into a reusable graph. Subsequent calls use cudaGraphLaunch, replacing
// many individual kernel-submission API calls with a single launch.
//
// Prerequisites satisfied here:
// • Preprocessing + D2D copies are queued on m_inferenceStream (same-stream
// ordering guarantees they complete before captured kernels execute)
// • m_pinnedOutputBuffers has stable addresses (allocated in loadNetwork)
// • m_buffers (GPU outputs) have stable addresses (allocated in loadNetwork)
//
// Falls back to the direct path if pinned buffers are unavailable or if
// graph capture/instantiation fails for any reason.
// CUDA graphs capture fixed kernel sequences; incompatible with dynamic spatial dims
// (input H/W change per inference call → different TRT kernel plans each time).
// Disabled for pool slots — concurrent graph captures on the same GPU corrupt the
// CUDA context ("operation not permitted when stream is capturing").
const bool canGraph = !m_disableGraphs && !m_pinnedOutputBuffers.empty() && !m_hasDynamicSpatialDims;
bool graphUsed = false;
if (canGraph) {
auto& graphExec = m_graphExecs[batchSize]; // inserts nullptr on first access
if (!graphExec) {
ANS_DBG("TRT_Engine", "CUDA graph CAPTURE starting for batch=%d (cached=%zu)",
batchSize, m_graphExecs.size());
// First call for this batchSize -- capture a new graph.
// Serialise captures across all Engine instances on this device to
// prevent TRT's shared workspace from creating cross-stream
// dependencies that violate CUDA graph capture rules.
std::lock_guard<std::mutex> captureLock(graphCaptureMutex());
// Clear any sticky CUDA error from a prior failed capture so that
// this attempt starts clean.
cudaGetLastError();
cudaGraph_t graph = nullptr;
bool captureOk = false;
if (cudaStreamBeginCapture(m_inferenceStream,
cudaStreamCaptureModeRelaxed) == cudaSuccess) {
// Record TRT kernels into the graph (not executed yet).
TRT_ENQUEUE(m_context.get(), m_inferenceStream, m_buffers);
// Record D2H copies to stable pinned addresses.
for (size_t outputIdx = 0; outputIdx < numOutputs; ++outputIdx) {
cudaMemcpyAsync(
m_pinnedOutputBuffers[outputIdx],
static_cast<char*>(m_buffers[numInputs + outputIdx]),
static_cast<size_t>(batchSize) * m_outputLengths[outputIdx] * sizeof(T),
cudaMemcpyDeviceToHost,
m_inferenceStream);
}
captureOk = (cudaStreamEndCapture(m_inferenceStream, &graph) == cudaSuccess
&& graph != nullptr);
}
if (captureOk) {
cudaGraphExec_t exec = nullptr;
if (cudaGraphInstantiate(&exec, graph, nullptr, nullptr, 0) == cudaSuccess)
graphExec = exec;
cudaGraphDestroy(graph);
ANS_DBG("TRT_Engine", "CUDA graph CAPTURED OK for batch=%d exec=%p",
batchSize, (void*)graphExec);
}
if (!graphExec) {
ANS_DBG("TRT_Engine", "CUDA graph capture FAILED for batch=%d — falling back to direct path",
batchSize);
std::cout << "Warning: CUDA graph capture failed for batchSize="
<< batchSize << " -- falling back to direct inference path." << std::endl;
// Disable graph acceleration for this Engine instance.
for (T* p : m_pinnedOutputBuffers) { if (p) cudaFreeHost(p); }
m_pinnedOutputBuffers.clear();
m_graphExecs.erase(batchSize);
}
}
if (graphExec) {
ANS_DBG("TRT_Engine", "CUDA graph LAUNCH batch=%d exec=%p", batchSize, (void*)graphExec);
// Launch the pre-captured graph (single API call replaces many).
auto _graphStart = std::chrono::steady_clock::now();
cudaGraphLaunch(graphExec, m_inferenceStream);
cudaStreamSynchronize_Safe(m_inferenceStream);
auto _graphEnd = std::chrono::steady_clock::now();
double _graphMs = std::chrono::duration<double, std::milli>(_graphEnd - _graphStart).count();
if (_graphMs > 500.0) {
ANS_DBG("TRT_SM100", "SLOW GRAPH: %.1fms batch=%d active=%d",
_graphMs, batchSize, s_globalActiveInf.load());
}
// CPU memcpy: pinned buffers -> featureVectors (interleaved by batch).
for (int batch = 0; batch < batchSize; ++batch) {
for (size_t outputIdx = 0; outputIdx < numOutputs; ++outputIdx) {
std::memcpy(
featureVectors[batch][outputIdx].data(),
m_pinnedOutputBuffers[outputIdx]
+ static_cast<size_t>(batch) * m_outputLengths[outputIdx],
m_outputLengths[outputIdx] * sizeof(T));
}
}
graphUsed = true;
}
}
// Direct path (no graph)
// ----------------------
// Used when pinned buffers are unavailable or graph capture failed.
if (!graphUsed) {
ANS_DBG("TRT_Engine", "Direct path (no graph) batch=%d", batchSize);
auto enqueueStart = std::chrono::steady_clock::now();
bool success = TRT_ENQUEUE(m_context.get(), m_inferenceStream, m_buffers);
auto enqueueEnd = std::chrono::steady_clock::now();
double enqueueMs = std::chrono::duration<double, std::milli>(enqueueEnd - enqueueStart).count();
if (enqueueMs > 500.0) {
ANS_DBG("TRT_Engine", "SLOW ENQUEUE: %.1fms batch=%d (enqueueV3 blocked!)", enqueueMs, batchSize);
}
if (!success) {
ANS_DBG("TRT_Engine", "ERROR: enqueueV3 FAILED batch=%d", batchSize);
return false;
}
for (int batch = 0; batch < batchSize; ++batch) {
for (size_t outputIdx = 0; outputIdx < numOutputs; ++outputIdx) {
const size_t outputBinding = numInputs + outputIdx;
const size_t offset =
static_cast<size_t>(batch) * m_outputLengths[outputIdx] * sizeof(T);
cudaError_t copyErr = cudaMemcpyAsync(
featureVectors[batch][outputIdx].data(),
static_cast<char*>(m_buffers[outputBinding]) + offset,
m_outputLengths[outputIdx] * sizeof(T),
cudaMemcpyDeviceToHost,
m_inferenceStream);
if (copyErr != cudaSuccess) {
return false;
}
}
}
auto syncStart = std::chrono::steady_clock::now();
cudaError_t syncErr = cudaStreamSynchronize_Safe(m_inferenceStream);
auto syncEnd = std::chrono::steady_clock::now();
double syncMs = std::chrono::duration<double, std::milli>(syncEnd - syncStart).count();
if (syncMs > 500.0) {
ANS_DBG("TRT_Engine", "SLOW INFERENCE SYNC: %.1fms batch=%d (direct path)", syncMs, batchSize);
}
if (syncErr != cudaSuccess) {
ANS_DBG("TRT_Engine", "ERROR: cudaStreamSync FAILED err=%d (%s)",
(int)syncErr, cudaGetErrorString(syncErr));
return false;
}
}
// ============================================================================
// Slow-inference alarm — ONE-SIDED FILTER, NOT A DISTRIBUTION
// ============================================================================
// This emits a DebugView line ONLY when a single inference's total wall
// time (mutex-wait + GPU execution) exceeds 100 ms. Fast calls are silent.
//
// Consequence: if you aggregate `[TRT_Slow]` lines and compute an average,
// you get the mean of the slow *tail*, NOT the real average inference
// time. Expect this avg to look dramatic (~200–400 ms) because by design
// every sample here is already slow. A typical inference on a healthy
// system fires this line for ~1–3% of calls; >10% indicates a problem.
//
// For the true per-inference distribution, look at `[TRT_SM100] #N ...
// avgMs=... maxMs=...` (running-average, emitted every 50 inferences).
// The tag was previously `[TRT_Timing]` which misled readers into
// interpreting the avg as overall pipeline latency.
{
double totalMs = std::chrono::duration<double, std::milli>(
std::chrono::steady_clock::now() - _mutexWaitStart).count();
double gpuMs = totalMs - _mutexWaitMs; // Everything after mutex acquired
if (totalMs > 100.0) {
ANS_DBG("TRT_Slow",
"SLOW inference total=%.1fms (mutex=%.1fms gpu=%.1fms) batch=%d active=%d "
"(this filter only fires for calls >100ms)",
totalMs, _mutexWaitMs, gpuMs, batchSize, s_globalActiveInf.load());
}
}
// ============================================================================
// SM=100% DETECTOR — end-of-inference timing
// ============================================================================
{
static thread_local double s_ema = 0;
static thread_local std::chrono::steady_clock::time_point s_prevEnd;
static thread_local bool s_firstDone = false;
auto _now = std::chrono::steady_clock::now();
if (s_firstDone) {
double sinceLastMs = std::chrono::duration<double, std::milli>(_now - s_prevEnd).count();
// If time between consecutive inferences jumps dramatically,
// something blocked the thread (SM=100% or mutex contention)
if (s_ema > 0 && sinceLastMs > s_ema * 3.0 && sinceLastMs > 500.0) {
size_t vf = 0, vt = 0;
cudaMemGetInfo(&vf, &vt);
ANS_DBG("TRT_SM100", "GAP DETECTED: %.1fms between inferences (avg=%.1fms, %.1fx) active=%d VRAM=%zuMB free",
sinceLastMs, s_ema, sinceLastMs / s_ema,
s_globalActiveInf.load(), vf / (1024*1024));
}
s_ema = (s_ema == 0) ? sinceLastMs : (0.9 * s_ema + 0.1 * sinceLastMs);
}
s_prevEnd = _now;
s_firstDone = true;
s_globalActiveInf.fetch_sub(1);
}
return true;
}