Cleaned up verbose engine telemetry emitted to stdout/stderr and the Windows Event Viewer. Removes logEngineEvent/logEvent calls (and their diagnostic-only locals) across the TensorRT engine load, build, run, multi-GPU, and pool-manager paths, plus the now-unused logEvent helper in EnginePoolManager. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
981 lines
46 KiB
C++
981 lines
46 KiB
C++
#pragma once
|
|
#include <algorithm>
#include <atomic>
#include <chrono>
#include <cstring>
#include <filesystem>
#include <iomanip>
#include <iostream>
#include <map>
#include <mutex>
#include <semaphore>
#include <vector>

#include "TRTCompat.h"

#include "ANSLicense.h" // ANS_DBG macro for DebugView logging
|
|
|
|
// Per-device mutex for CUDA graph capture.
|
|
// TRT's enqueueV3 uses shared internal resources (workspace, memory pools)
|
|
// at the CUDA context level. When two Engine instances on the same GPU
|
|
// capture graphs concurrently, these cross-stream dependencies violate
|
|
// graph capture rules ("operation not permitted when stream is capturing").
|
|
// This mutex serialises graph captures across all Engine<T> instances on
|
|
// the same device — subsequent cudaGraphLaunch calls are still concurrent.
|
|
// Returns the graph-capture serialisation mutex for a CUDA device index.
//
// TRT's enqueueV3 shares internal resources (workspace, memory pools) at the
// CUDA-context level, so two Engine instances capturing CUDA graphs
// concurrently on the same device violate stream-capture rules
// ("operation not permitted when stream is capturing"). Holding this mutex
// for the duration of a capture serialises captures; subsequent
// cudaGraphLaunch calls remain fully concurrent.
//
// deviceIndex: CUDA device whose capture mutex is wanted. The default (-1)
// selects a shared bucket, preserving the historical behaviour where all
// no-argument callers serialise against one process-wide mutex. Callers on
// different GPUs may pass their device index to capture independently, as
// the header comment above always intended.
static std::mutex& graphCaptureMutex(int deviceIndex = -1) {
    // std::map is node-based: element addresses are stable across inserts,
    // which lets the non-movable std::mutex live in the container and lets
    // us hand out long-lived references safely.
    static std::mutex registryLock;
    static std::map<int, std::mutex> registry;
    std::lock_guard<std::mutex> guard(registryLock);
    return registry[deviceIndex]; // default-constructs on first request
}
|
|
|
|
// ============================================================================
|
|
// GPU INFERENCE THROTTLE
|
|
// ============================================================================
|
|
// Global counting semaphore that limits how many Engine instances can execute
|
|
// CUDA inference simultaneously. Without this, N separate Engine instances
|
|
// (one per camera) all submit GPU work at once, causing:
|
|
// 1. SM 100% saturation → each inference takes 5-10x longer
|
|
// 2. GPU thermal throttling at 85°C → further slowdown
|
|
// 3. cudaStreamSynchronize blocking indefinitely → system freeze
|
|
//
|
|
// Auto-computed from GPU VRAM:
|
|
// ≤ 4 GB → 2 concurrent 8 GB → 4 concurrent
|
|
// 6 GB → 3 concurrent 12+ GB → 6 concurrent
|
|
// Multi-GPU: sum across all GPUs
|
|
//
|
|
// Excess threads wait on CPU (nearly zero cost) while the bounded set
|
|
// runs efficiently on the GPU without thermal throttling.
|
|
// Returns the process-wide inference throttle (see header comment above).
// The slot count is computed once, on first use, from the VRAM of every
// visible GPU: ~1 slot per 2 GB, clamped to [2, 6] per GPU and [2, 64] total.
static std::counting_semaphore<64>& gpuInferenceSemaphore() {
    static int maxConcurrent = []() {
        int gpuCount = 0;
        cudaGetDeviceCount(&gpuCount);
        if (gpuCount <= 0) return 4; // fallback: no CUDA device visible

        // FIX: remember which device the calling thread is bound to. The
        // probe loop below calls cudaSetDevice(i) for every GPU and the
        // original code never restored it, silently leaving the first
        // thread to touch the throttle bound to the LAST GPU enumerated —
        // retargeting all of its subsequent CUDA work on multi-GPU systems.
        int callerDevice = -1;
        cudaGetDevice(&callerDevice);

        int totalSlots = 0;
        for (int i = 0; i < gpuCount; ++i) {
            size_t freeMem = 0, totalMem = 0;
            cudaSetDevice(i);
            cudaMemGetInfo(&freeMem, &totalMem);
            int gbTotal = static_cast<int>(totalMem / (1024ULL * 1024ULL * 1024ULL));

            // Scale concurrency with VRAM: ~1 slot per 2 GB, min 2, max 6 per GPU
            totalSlots += std::clamp(gbTotal / 2, 2, 6);
        }

        // Restore the caller's device binding after probing.
        if (callerDevice >= 0) {
            cudaSetDevice(callerDevice);
        }

        totalSlots = std::clamp(totalSlots, 2, 64);
        std::cout << "Info [GPU Throttle]: max concurrent inferences = "
                  << totalSlots << " (across " << gpuCount << " GPU(s))" << std::endl;
        return totalSlots;
    }();
    // Constructed once with the computed budget; callers block in acquire()
    // when all slots are taken (see GpuInferenceGuard below).
    static std::counting_semaphore<64> sem(maxConcurrent);
    return sem;
}
|
|
|
|
// RAII guard for the GPU inference semaphore
|
|
struct GpuInferenceGuard {
|
|
GpuInferenceGuard() { gpuInferenceSemaphore().acquire(); }
|
|
~GpuInferenceGuard() { gpuInferenceSemaphore().release(); }
|
|
GpuInferenceGuard(const GpuInferenceGuard&) = delete;
|
|
GpuInferenceGuard& operator=(const GpuInferenceGuard&) = delete;
|
|
};
|
|
|
|
// ============================================================================
|
|
// WDDM-SAFE STREAM SYNCHRONIZATION
|
|
// ============================================================================
|
|
// Under Windows WDDM, cudaStreamSynchronize calls cuStreamQuery in a tight
|
|
// loop with SwitchToThread, holding nvcuda64's internal SRW lock the entire
|
|
// time. When the GPU is busy with inference, this spin blocks ALL other CUDA
|
|
// operations — including HW video decode (nvcuvid), cuMemAlloc, cuArrayDestroy.
|
|
// If a camera Reconnect or decode buffer allocation needs an exclusive SRW lock
|
|
// while inference is spinning, the entire system deadlocks.
|
|
//
|
|
// This function replaces cudaStreamSynchronize with a polling loop that
|
|
// explicitly releases the SRW lock between queries by sleeping briefly.
|
|
// This allows other CUDA operations to interleave with the sync wait.
|
|
// Drop-in replacement for cudaStreamSynchronize that polls with real sleeps
// instead of spinning (see header comment above for the WDDM SRW-lock
// rationale). Returns whatever cudaStreamQuery reports once the stream is
// no longer busy — cudaSuccess on completion, or the stream's error code.
static inline cudaError_t cudaStreamSynchronize_Safe(cudaStream_t stream) {
    // Immediate probe: quick kernels complete with zero sleep overhead.
    cudaError_t status = cudaStreamQuery(stream);
    if (status != cudaErrorNotReady) {
        return status;
    }

    const auto waitBegin = std::chrono::steady_clock::now();

    // Phase 1 — a handful of Sleep(0) yields catches sub-millisecond
    // completions without surrendering a full scheduler quantum.
    for (int attempt = 0; attempt < 10; ++attempt) {
        Sleep(0);
        status = cudaStreamQuery(stream);
        if (status != cudaErrorNotReady) {
            return status;
        }
    }

    // Phase 2 — 1 ms sleeps. Adds negligible latency at 30 FPS, but fully
    // releases the driver's SRW lock between probes so decode/alloc work
    // on other threads can make progress.
    int napCount = 0;
    for (;;) {
        Sleep(1);
        ++napCount;
        status = cudaStreamQuery(stream);
        if (status == cudaErrorNotReady) {
            continue;
        }

        // Stream finished (or errored). Flag abnormally long waits —
        // anything past 500 ms suggests a GPU stall worth investigating.
        const double waitedMs = std::chrono::duration<double, std::milli>(
            std::chrono::steady_clock::now() - waitBegin).count();
        if (waitedMs > 500.0) {
            ANS_DBG("TRT_Engine", "SLOW SYNC: %.1fms (%d sleeps) stream=%p err=%d",
                    waitedMs, napCount, (void*)stream, (int)status);
        }
        return status;
    }
}
|
|
|
|
// Pre-compiles TensorRT kernels (and, per the comments in this file,
// pre-captures CUDA graphs) by running dummy inferences before real traffic
// arrives, so the first production frame does not pay the 100-300 ms
// first-use kernel-compilation penalty.
//
// iterations: number of warmup runs used for the optBatchSize; every other
// batch size gets exactly 2 runs (first compiles kernels, second is the one
// this code's comments say captures the CUDA graph). All console output is
// gated on m_verbose.
template <typename T>
void Engine<T>::warmUp(int iterations) {
    if (m_verbose) {
        std::cout << "\n========================================" << std::endl;
        std::cout << "Engine Warmup" << std::endl;
        std::cout << "========================================" << std::endl;
    }

    // Determine batch sizes to warm up
    std::vector<int> batchSizes;

    if (m_options.maxBatchSize > 1) {
        if (m_verbose) {
            std::cout << "Dynamic batch engine detected (max batch: " << m_options.maxBatchSize << ")" << std::endl;
            std::cout << "Warming up common batch sizes to pre-compile kernels..." << std::endl;
        }

        // Warm up ALL batch sizes from 1 to maxBatchSize.
        // Each unseen batch size incurs a 100-300ms kernel compilation penalty
        // on first use. Warming all sizes eliminates that latency at inference
        // time and ensures every CUDA graph is pre-captured.
        for (int batch = 1; batch <= m_options.maxBatchSize; ++batch) {
            batchSizes.push_back(batch);
        }
    }
    else {
        // Fixed-batch engine: only one shape exists, so only one warmup size.
        if (m_verbose) std::cout << "Fixed batch engine detected (batch size: " << m_options.maxBatchSize << ")" << std::endl;
        batchSizes.push_back(m_options.maxBatchSize);
    }

    if (m_verbose) {
        std::cout << "Batch sizes to warm up: ";
        for (size_t i = 0; i < batchSizes.size(); ++i) {
            std::cout << batchSizes[i];
            if (i < batchSizes.size() - 1) std::cout << ", ";
        }
        std::cout << std::endl;
    }

    // Warm up each batch size.
    // The first call triggers kernel compilation; the second captures the CUDA
    // graph. Additional iterations only measure steady-state latency for the
    // optBatchSize (printed as a diagnostic).
    for (int batchSize : batchSizes) {
        const int iters = (batchSize == m_options.optBatchSize) ? iterations : 2;
        if (m_verbose) std::cout << "\nWarming up batch=" << batchSize << " (x" << iters << " iterations)..." << std::endl;

        // Create dummy inputs for this batch size.
        // Layout mirrors runInference's contract: one vector of GpuMats per
        // input tensor, batchSize images per vector.
        std::vector<std::vector<cv::cuda::GpuMat>> dummyInputs;

        for (size_t i = 0; i < m_inputDims.size(); ++i) {
            const auto& dims = m_inputDims[i];
            std::vector<cv::cuda::GpuMat> batch;

            // FIXED: Create proper dummy images on GPU
            // For dynamic spatial dims, use opt dimensions for warmup
            // (dims.d[1]/d[2] hold -1 or 0 when the engine's H/W are dynamic;
            // the opt* values come from m_options).
            int warmH = (dims.d[1] > 0) ? dims.d[1] : m_options.optInputHeight;
            int warmW = (dims.d[2] > 0) ? dims.d[2] : m_options.optInputWidth;
            for (int b = 0; b < batchSize; ++b) {
                // Create on CPU first: mid-grey float image, channel count
                // taken from the tensor's first dim (dims.d[0]).
                cv::Mat cpuImg(warmH, warmW, CV_32FC(dims.d[0]), cv::Scalar(0.5f, 0.5f, 0.5f));

                // Upload to GPU
                cv::cuda::GpuMat gpuImg;
                gpuImg.upload(cpuImg);

                batch.push_back(gpuImg);
            }

            dummyInputs.push_back(batch);
        }

        // Outputs are discarded — warmup only cares about side effects
        // (kernel compilation, graph capture).
        std::vector<std::vector<std::vector<T>>> dummyOutputs;

        // Time the first iteration (kernel compilation happens here).
        // NOTE(review): wall-clock via high_resolution_clock; steady_clock
        // would be the conventional choice for interval measurement.
        auto start = std::chrono::high_resolution_clock::now();

        bool firstSuccess = runInference(dummyInputs, dummyOutputs);

        auto end = std::chrono::high_resolution_clock::now();
        auto firstTime = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();

        if (!firstSuccess) {
            // A failed first run skips this batch size entirely and moves on.
            if (m_verbose) std::cout << " ✗ ERROR: First iteration failed for batch=" << batchSize << std::endl;
            continue;
        }

        if (m_verbose) {
            std::cout << " First iteration: " << firstTime << " ms";
            if (firstTime > 100) {
                // >100 ms on a warm GPU almost certainly means kernels were
                // being compiled during this call.
                std::cout << " (kernel compilation detected)";
            }
            std::cout << std::endl;
        }

        // Run remaining iterations to measure stable performance
        if (iters > 1) {
            auto iterStart = std::chrono::high_resolution_clock::now();

            for (int i = 1; i < iters; ++i) {
                bool success = runInference(dummyInputs, dummyOutputs);
                if (!success) {
                    if (m_verbose) std::cout << " ✗ ERROR: Iteration " << i << " failed" << std::endl;
                    break;
                }
            }

            auto iterEnd = std::chrono::high_resolution_clock::now();
            auto totalTime = std::chrono::duration_cast<std::chrono::milliseconds>(iterEnd - iterStart).count();
            // NOTE(review): if the loop above broke early on a failed
            // iteration, this still divides by (iters - 1), under-reporting
            // the true per-iteration average — diagnostic-only, so benign,
            // but worth confirming that is intended.
            float avgTime = totalTime / static_cast<float>(iters - 1);

            if (m_verbose) {
                std::cout << " Subsequent iterations (avg): " << std::fixed << std::setprecision(1)
                    << avgTime << " ms" << std::endl;

                // Report the compile-vs-steady-state speedup when the first
                // run was slow (>100 ms) and later runs at least halved it.
                if (firstTime > 100 && avgTime < firstTime * 0.5f) {
                    float speedup = firstTime / avgTime;
                    std::cout << " ✓ Speedup after warmup: " << std::fixed << std::setprecision(1)
                        << speedup << "x faster" << std::endl;
                }
            }
        }

        if (m_verbose) std::cout << " ✓ Batch=" << batchSize << " warmed up successfully" << std::endl;
    }

    if (m_verbose) {
        std::cout << "\n========================================" << std::endl;
        std::cout << "Warmup Complete!" << std::endl;
        std::cout << "========================================" << std::endl;
        std::cout << "Kernels pre-compiled for all batch sizes." << std::endl;
        std::cout << "========================================\n" << std::endl;
    }
}
|
|
template <typename T>
|
|
bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& inputs,std::vector<std::vector<std::vector<T>>>& featureVectors) {
|
|
|
|
// ============================================================================
|
|
// MULTI-GPU POOL DISPATCH
|
|
// ============================================================================
|
|
// If this Engine was initialised with initializePool() / initializePoolFromEngine()
|
|
// the m_slots vector is non-empty. In that case, delegate to the pool
|
|
// dispatcher which acquires the first idle slot and runs inference there.
|
|
// This branch is NEVER taken for single-GPU use (buildLoadNetwork / loadNetwork).
|
|
if (!m_slots.empty()) {
|
|
return runInferenceFromPool(inputs, featureVectors);
|
|
}
|
|
|
|
// ============================================================================
|
|
// GPU INFERENCE THROTTLE
|
|
// ============================================================================
|
|
// Limit how many Engine instances can run CUDA inference simultaneously.
|
|
// Without this, 12 cameras each with their own Engine all submit GPU work
|
|
// at once → SM 100% → thermal throttle → cudaStreamSynchronize hangs.
|
|
// The semaphore lets excess threads wait on CPU (nearly zero cost) while
|
|
// a bounded number use the GPU efficiently.
|
|
GpuInferenceGuard gpuThrottle;
|
|
|
|
// ============================================================================
|
|
// SINGLE-ENGINE SERIALISATION
|
|
// ============================================================================
|
|
// The single Engine instance has shared mutable state (m_buffers, m_lastBatchSize,
|
|
// m_inferenceStream, TRT execution context). If two LabVIEW threads call
|
|
// runInference concurrently with different batch sizes, one will overwrite
|
|
// the input shapes and buffers while the other is mid-inference, causing a
|
|
// fatal "illegal memory access" that permanently corrupts the CUDA context.
|
|
//
|
|
// Pool-mode slots have their own busy-flag dispatch so they do NOT need this.
|
|
auto _mutexWaitStart = std::chrono::steady_clock::now();
|
|
std::lock_guard<std::mutex> inferenceLock(m_inferenceMutex);
|
|
auto _mutexAcquired = std::chrono::steady_clock::now();
|
|
double _mutexWaitMs = std::chrono::duration<double, std::milli>(_mutexAcquired - _mutexWaitStart).count();
|
|
if (_mutexWaitMs > 50.0) {
|
|
ANS_DBG("TRT_Engine", "MUTEX WAIT: %.1fms (queued behind another inference)", _mutexWaitMs);
|
|
}
|
|
|
|
// ============================================================================
|
|
// THREAD-SAFE GPU CONTEXT
|
|
// ============================================================================
|
|
// Ensure the calling thread's CUDA device matches this engine's GPU.
|
|
// This is essential for multi-GPU round-robin: LabVIEW reuses threads
|
|
// across tasks, so a thread that last ran inference on GPU 1 might now
|
|
// be running a task on GPU 0. Without this, cv::cuda::GpuMat allocations
|
|
// and kernel launches would target the wrong GPU, causing result corruption.
|
|
// Skip cudaSetDevice if already on the correct device — under WDDM
|
|
// with multiple GPUs each call costs 1-5ms of scheduler overhead.
|
|
{
|
|
int currentDev = -1;
|
|
cudaGetDevice(¤tDev);
|
|
if (currentDev != m_options.deviceIndex) {
|
|
cudaSetDevice(m_options.deviceIndex);
|
|
}
|
|
}
|
|
|
|
// ============================================================================
|
|
// DEBUG: First call diagnostics (per-instance, not process-wide)
|
|
// ============================================================================
|
|
|
|
if (m_verbose && m_firstInferenceCall) {
|
|
std::cout << "\n=== First runInference Call ===" << std::endl;
|
|
std::cout << "Number of input tensors: " << inputs.size() << std::endl;
|
|
for (size_t i = 0; i < inputs.size(); ++i) {
|
|
std::cout << "Input " << i << " batch size: " << inputs[i].size() << std::endl;
|
|
if (!inputs[i].empty()) {
|
|
const auto& img = inputs[i][0];
|
|
std::cout << " Image shape: " << img.cols << "x" << img.rows
|
|
<< "x" << img.channels() << " (type: " << img.type() << ")" << std::endl;
|
|
}
|
|
}
|
|
|
|
// Print optimization profile information
|
|
std::cout << "\n=== Engine Profile Information ===" << std::endl;
|
|
std::cout << "Number of optimization profiles: "
|
|
<< m_engine->getNbOptimizationProfiles() << std::endl;
|
|
|
|
if (m_engine->getNbOptimizationProfiles() > 0) {
|
|
for (int profile = 0; profile < m_engine->getNbOptimizationProfiles(); ++profile) {
|
|
std::cout << "\n--- Profile " << profile << " ---" << std::endl;
|
|
|
|
for (size_t i = 0; i < m_IOTensorNames.size(); ++i) {
|
|
const char* tensorName = m_IOTensorNames[i].c_str();
|
|
|
|
// Check if this is an input tensor
|
|
auto ioMode = m_engine->getTensorIOMode(tensorName);
|
|
if (ioMode != nvinfer1::TensorIOMode::kINPUT) {
|
|
continue;
|
|
}
|
|
|
|
auto minDims = m_engine->getProfileShape(tensorName, profile,
|
|
nvinfer1::OptProfileSelector::kMIN);
|
|
auto optDims = m_engine->getProfileShape(tensorName, profile,
|
|
nvinfer1::OptProfileSelector::kOPT);
|
|
auto maxDims = m_engine->getProfileShape(tensorName, profile,
|
|
nvinfer1::OptProfileSelector::kMAX);
|
|
|
|
std::cout << "Tensor '" << tensorName << "' (INPUT):" << std::endl;
|
|
std::cout << " Min: [" << minDims.d[0];
|
|
for (int j = 1; j < minDims.nbDims; ++j) std::cout << "," << minDims.d[j];
|
|
std::cout << "]" << std::endl;
|
|
|
|
std::cout << " Opt: [" << optDims.d[0];
|
|
for (int j = 1; j < optDims.nbDims; ++j) std::cout << "," << optDims.d[j];
|
|
std::cout << "]" << std::endl;
|
|
|
|
std::cout << " Max: [" << maxDims.d[0];
|
|
for (int j = 1; j < maxDims.nbDims; ++j) std::cout << "," << maxDims.d[j];
|
|
std::cout << "]" << std::endl;
|
|
}
|
|
}
|
|
}
|
|
|
|
if (!m_context->allInputDimensionsSpecified()) {
|
|
std::cout << "ERROR: Input dimensions not specified in context!" << std::endl;
|
|
return false;
|
|
}
|
|
|
|
std::cout << "\nContext state: All dimensions specified ✓" << std::endl;
|
|
m_firstInferenceCall = false;
|
|
}
|
|
|
|
// ============================================================================
|
|
// INPUT VALIDATION
|
|
// ============================================================================
|
|
|
|
if (inputs.empty() || inputs[0].empty()) {
|
|
std::cout << "Error: Empty input" << std::endl;
|
|
return false;
|
|
}
|
|
|
|
// ============================================================================
|
|
// SM=100% DETECTOR — tracks inference timing trends to catch the exact
|
|
// moment GPU becomes saturated. Logs every 50 inferences with rolling
|
|
// average, and immediately when degradation is detected.
|
|
// ============================================================================
|
|
// Global (process-wide) counters shared across all engine instances/threads
|
|
static std::atomic<int64_t> s_globalInfCount{0};
|
|
static std::atomic<int> s_globalActiveInf{0}; // currently in-flight inferences
|
|
static std::atomic<double> s_globalLastAvgMs{0.0}; // last known avg inference time
|
|
|
|
const int64_t myInfNum = s_globalInfCount.fetch_add(1) + 1;
|
|
s_globalActiveInf.fetch_add(1);
|
|
|
|
// Per-thread tracking
|
|
{
|
|
static thread_local int64_t s_infCount = 0;
|
|
static thread_local std::chrono::steady_clock::time_point s_lastLog;
|
|
static thread_local double s_rollingAvgMs = 0.0;
|
|
static thread_local double s_baselineMs = 0.0; // avg during first 100 inferences
|
|
static thread_local double s_maxMs = 0.0;
|
|
static thread_local bool s_degradationLogged = false;
|
|
s_infCount++;
|
|
|
|
if (s_infCount == 1) {
|
|
s_lastLog = std::chrono::steady_clock::now();
|
|
ANS_DBG("TRT_SM100", "FIRST inference — engine alive, globalInf=%lld", myInfNum);
|
|
}
|
|
|
|
// Log every 50 inferences (more frequent than 500 to catch transitions)
|
|
if (s_infCount % 50 == 0) {
|
|
auto now = std::chrono::steady_clock::now();
|
|
double elapsed = std::chrono::duration<double>(now - s_lastLog).count();
|
|
double fps = (elapsed > 0) ? (50.0 / elapsed) : 0;
|
|
s_lastLog = now;
|
|
|
|
size_t vramFree = 0, vramTotal = 0;
|
|
cudaMemGetInfo(&vramFree, &vramTotal);
|
|
size_t vramUsedMB = (vramTotal - vramFree) / (1024 * 1024);
|
|
size_t vramFreeMB = vramFree / (1024 * 1024);
|
|
|
|
ANS_DBG("TRT_SM100", "#%lld [global=%lld active=%d] %.1f inf/sec avgMs=%.1f maxMs=%.1f batch=%d graphs=%zu VRAM=%zuMB/%zuMB",
|
|
s_infCount, myInfNum, s_globalActiveInf.load(),
|
|
fps, s_rollingAvgMs, s_maxMs,
|
|
(int)inputs[0].size(), m_graphExecs.size(),
|
|
vramUsedMB, vramFreeMB);
|
|
|
|
// Capture baseline from first 100 inferences
|
|
if (s_infCount == 100) {
|
|
s_baselineMs = s_rollingAvgMs;
|
|
ANS_DBG("TRT_SM100", "BASELINE established: %.1fms/inference", s_baselineMs);
|
|
}
|
|
|
|
// Detect degradation: avg >3x baseline AND baseline is set
|
|
if (s_baselineMs > 0 && s_rollingAvgMs > s_baselineMs * 3.0 && !s_degradationLogged) {
|
|
s_degradationLogged = true;
|
|
ANS_DBG("TRT_SM100", "*** DEGRADATION DETECTED *** avg=%.1fms baseline=%.1fms (%.1fx) VRAM=%zuMB/%zuMB active=%d",
|
|
s_rollingAvgMs, s_baselineMs, s_rollingAvgMs / s_baselineMs,
|
|
vramUsedMB, vramFreeMB, s_globalActiveInf.load());
|
|
}
|
|
|
|
// Reset max for next window
|
|
s_maxMs = 0.0;
|
|
}
|
|
}
|
|
|
|
const auto numInputs = m_inputDims.size();
|
|
if (inputs.size() != numInputs) {
|
|
std::cout << "Error: Wrong number of inputs. Expected: " << numInputs
|
|
<< ", Got: " << inputs.size() << std::endl;
|
|
return false;
|
|
}
|
|
|
|
const auto batchSize = static_cast<int32_t>(inputs[0].size());
|
|
|
|
if (batchSize > m_options.maxBatchSize) {
|
|
std::cout << "Error: Batch size " << batchSize << " exceeds maximum "
|
|
<< m_options.maxBatchSize << std::endl;
|
|
return false;
|
|
}
|
|
|
|
if (batchSize < 1) {
|
|
std::cout << "Error: Batch size must be at least 1" << std::endl;
|
|
return false;
|
|
}
|
|
|
|
// Validate batch size consistency across all inputs
|
|
for (size_t i = 1; i < inputs.size(); ++i) {
|
|
if (inputs[i].size() != static_cast<size_t>(batchSize)) {
|
|
std::cout << "Error: Inconsistent batch sizes across inputs. Input 0: "
|
|
<< batchSize << ", Input " << i << ": " << inputs[i].size() << std::endl;
|
|
return false;
|
|
}
|
|
}
|
|
|
|
// ============================================================================
|
|
// STREAM GUARD
|
|
// ============================================================================
|
|
// m_inferenceStream is now created eagerly in loadNetwork() so it is always
|
|
// valid here. Guard against the (unlikely) edge case where runInference is
|
|
// called before loadNetwork succeeds.
|
|
if (!m_streamInitialized || !m_inferenceStream) {
|
|
return false;
|
|
}
|
|
|
|
// ============================================================================
|
|
// SET INPUT SHAPES (batch size changed OR dynamic spatial dims need updating)
|
|
// ============================================================================
|
|
// Fast path: compute desired dims first, then compare against cached dims.
|
|
// This avoids all TRT API calls when the shape hasn't actually changed —
|
|
// critical for the recognizer which is called ~50-100x per image with
|
|
// dynamic width but often the same or similar widths.
|
|
// ============================================================================
|
|
|
|
{
|
|
// Lazily initialise the dims cache on first call
|
|
if (m_lastSetInputDims.empty()) {
|
|
m_lastSetInputDims.resize(numInputs);
|
|
for (size_t i = 0; i < numInputs; ++i) {
|
|
m_lastSetInputDims[i].nbDims = 0; // force mismatch on first call
|
|
}
|
|
}
|
|
|
|
// Build desired dims for every input tensor (cheap — no TRT API calls)
|
|
bool anyDimChanged = (m_lastBatchSize != batchSize);
|
|
std::vector<nvinfer1::Dims> desiredDims(numInputs);
|
|
for (size_t i = 0; i < numInputs; ++i) {
|
|
nvinfer1::Dims& nd = desiredDims[i];
|
|
nd.nbDims = 4;
|
|
nd.d[0] = batchSize;
|
|
nd.d[1] = m_inputDims[i].d[0]; // channels
|
|
if (m_hasDynamicSpatialDims && !inputs[i].empty()) {
|
|
const auto& firstImg = inputs[i][0];
|
|
nd.d[2] = (m_inputDims[i].d[1] == -1) ? firstImg.rows : m_inputDims[i].d[1];
|
|
nd.d[3] = (m_inputDims[i].d[2] == -1) ? firstImg.cols : m_inputDims[i].d[2];
|
|
} else {
|
|
nd.d[2] = m_inputDims[i].d[1];
|
|
nd.d[3] = m_inputDims[i].d[2];
|
|
}
|
|
// Compare with cached
|
|
if (!anyDimChanged) {
|
|
const auto& cached = m_lastSetInputDims[i];
|
|
if (cached.nbDims != nd.nbDims ||
|
|
cached.d[0] != nd.d[0] || cached.d[1] != nd.d[1] ||
|
|
cached.d[2] != nd.d[2] || cached.d[3] != nd.d[3]) {
|
|
anyDimChanged = true;
|
|
}
|
|
}
|
|
}
|
|
|
|
if (anyDimChanged) {
|
|
ANS_DBG("TRT_Engine", "Shape change detected: batch %d -> %d (graphsCached=%zu)",
|
|
m_lastBatchSize, batchSize, m_graphExecs.size());
|
|
|
|
// === First-time diagnostics (verbose, once) ===
|
|
const bool firstTime = !m_batchShapeChangeLogged;
|
|
|
|
if (m_verbose && firstTime) {
|
|
std::cout << "\nInfo: Batch size change: " << m_lastBatchSize
|
|
<< " -> " << batchSize << std::endl;
|
|
}
|
|
|
|
// Set optimization profile (only when truly needed)
|
|
if (m_engine->getNbOptimizationProfiles() > 0) {
|
|
int currentProfile = m_context->getOptimizationProfile();
|
|
if (currentProfile != 0 || m_lastBatchSize < 0) {
|
|
if (m_verbose && firstTime) {
|
|
std::cout << " Setting optimization profile to 0..." << std::endl;
|
|
}
|
|
if (!m_context->setOptimizationProfileAsync(0, m_inferenceStream)) {
|
|
std::cout << "Error: Failed to set optimization profile 0" << std::endl;
|
|
return false;
|
|
}
|
|
cudaError_t syncErr = cudaStreamSynchronize_Safe(m_inferenceStream);
|
|
if (syncErr != cudaSuccess) {
|
|
std::cout << "Error: Failed to sync after profile change: "
|
|
<< cudaGetErrorString(syncErr) << std::endl;
|
|
return false;
|
|
}
|
|
if (m_verbose && firstTime) {
|
|
std::cout << " Optimization profile set successfully" << std::endl;
|
|
}
|
|
}
|
|
}
|
|
|
|
// Update shapes for input tensors that actually changed
|
|
for (size_t i = 0; i < numInputs; ++i) {
|
|
const char* tensorName = m_IOTensorNames[i].c_str();
|
|
|
|
// Skip non-input tensors
|
|
auto ioMode = m_engine->getTensorIOMode(tensorName);
|
|
if (ioMode != nvinfer1::TensorIOMode::kINPUT) continue;
|
|
|
|
const nvinfer1::Dims& newDims = desiredDims[i];
|
|
const nvinfer1::Dims& cached = m_lastSetInputDims[i];
|
|
|
|
// Skip this tensor if its dims haven't changed
|
|
if (cached.nbDims == newDims.nbDims &&
|
|
cached.d[0] == newDims.d[0] && cached.d[1] == newDims.d[1] &&
|
|
cached.d[2] == newDims.d[2] && cached.d[3] == newDims.d[3]) {
|
|
continue;
|
|
}
|
|
|
|
// First-time verbose diagnostics
|
|
if (m_verbose && firstTime) {
|
|
std::cout << "\n Processing tensor " << i << ": '" << tensorName << "'" << std::endl;
|
|
// Validate batch size range (first time only)
|
|
if (m_engine->getNbOptimizationProfiles() > 0) {
|
|
int profileIndex = m_context->getOptimizationProfile();
|
|
nvinfer1::Dims minDims = m_engine->getProfileShape(
|
|
tensorName, profileIndex, nvinfer1::OptProfileSelector::kMIN);
|
|
nvinfer1::Dims maxDims = m_engine->getProfileShape(
|
|
tensorName, profileIndex, nvinfer1::OptProfileSelector::kMAX);
|
|
std::cout << " Profile batch range: [" << minDims.d[0]
|
|
<< " to " << maxDims.d[0] << "]" << std::endl;
|
|
if (batchSize < minDims.d[0] || batchSize > maxDims.d[0]) {
|
|
std::cout << "Error: Batch size " << batchSize
|
|
<< " outside profile range" << std::endl;
|
|
return false;
|
|
}
|
|
}
|
|
auto currentShape = m_context->getTensorShape(tensorName);
|
|
std::cout << " Current context shape: [";
|
|
for (int j = 0; j < currentShape.nbDims; ++j) {
|
|
if (j > 0) std::cout << ", ";
|
|
std::cout << currentShape.d[j];
|
|
}
|
|
std::cout << "]" << std::endl;
|
|
std::cout << " Setting new shape: [" << newDims.d[0] << ", "
|
|
<< newDims.d[1] << ", " << newDims.d[2] << ", "
|
|
<< newDims.d[3] << "]" << std::endl;
|
|
}
|
|
|
|
ANS_DBG("TRT_Engine", "setInputShape('%s') [%d,%d,%d,%d]",
|
|
tensorName, newDims.d[0], newDims.d[1], newDims.d[2], newDims.d[3]);
|
|
if (!m_context->setInputShape(tensorName, newDims)) {
|
|
ANS_DBG("TRT_Engine", "ERROR: setInputShape FAILED for '%s'", tensorName);
|
|
std::cout << "Error: Failed to set input shape for '" << tensorName << "'" << std::endl;
|
|
return false;
|
|
}
|
|
|
|
// Verify shape (first time only — trust the API on hot path)
|
|
if (firstTime) {
|
|
auto verifyShape = m_context->getTensorShape(tensorName);
|
|
if (verifyShape.d[0] != batchSize) {
|
|
std::cout << "Error: Shape change didn't take effect. Expected batch "
|
|
<< batchSize << ", got " << verifyShape.d[0] << std::endl;
|
|
return false;
|
|
}
|
|
if (m_verbose) {
|
|
std::cout << " Shape updated successfully" << std::endl;
|
|
}
|
|
}
|
|
|
|
m_lastSetInputDims[i] = newDims;
|
|
}
|
|
|
|
// Verify all input dimensions specified (first time only)
|
|
if (firstTime) {
|
|
if (!m_context->allInputDimensionsSpecified()) {
|
|
std::cout << "Error: Not all input dimensions specified after shape change" << std::endl;
|
|
for (size_t i = 0; i < m_IOTensorNames.size(); ++i) {
|
|
auto shape = m_context->getTensorShape(m_IOTensorNames[i].c_str());
|
|
std::cout << " " << m_IOTensorNames[i] << ": [";
|
|
for (int j = 0; j < shape.nbDims; ++j) {
|
|
if (j > 0) std::cout << ", ";
|
|
std::cout << shape.d[j];
|
|
}
|
|
std::cout << "]" << std::endl;
|
|
}
|
|
return false;
|
|
}
|
|
}
|
|
|
|
m_lastBatchSize = batchSize;
|
|
m_batchShapeChangeLogged = true;
|
|
|
|
// CRITICAL: Invalidate all cached CUDA graphs after shape change.
|
|
// Graphs were captured with the OLD context state (old tensor shapes).
|
|
// Launching them after setInputShape() produces undefined GPU behavior
|
|
// (invalid kernel sequences, SM lockup at 100%, inference hang).
|
|
if (!m_graphExecs.empty()) {
|
|
size_t destroyed = m_graphExecs.size();
|
|
for (auto& [bs, ge] : m_graphExecs) {
|
|
if (ge) cudaGraphExecDestroy(ge);
|
|
}
|
|
m_graphExecs.clear();
|
|
ANS_DBG("TRT_Engine", "INVALIDATED %zu cached CUDA graphs after shape change (batch=%d)",
|
|
destroyed, batchSize);
|
|
if (m_verbose || firstTime) {
|
|
std::cout << "Info: Invalidated " << destroyed
|
|
<< " cached CUDA graphs after shape change" << std::endl;
|
|
}
|
|
}
|
|
|
|
if (m_verbose && firstTime) {
|
|
std::cout << "\nInfo: Input shapes updated successfully for batch size "
|
|
<< batchSize << " ✓\n" << std::endl;
|
|
}
|
|
}
|
|
}
|
|
|
|
// ============================================================================
|
|
// PREPROCESS AND COPY INPUTS TO GPU BUFFERS
|
|
// ============================================================================
|
|
|
|
// Pass 1: Validate all input dimensions before any GPU work.
|
|
// Dynamic dims (-1) are skipped in validation (they accept any size).
|
|
for (size_t i = 0; i < numInputs; ++i) {
|
|
const auto& batchInput = inputs[i];
|
|
const auto& dims = m_inputDims[i];
|
|
if (!batchInput.empty()) {
|
|
const auto& firstImg = batchInput[0];
|
|
bool mismatch = false;
|
|
if (dims.d[0] > 0 && firstImg.channels() != dims.d[0]) mismatch = true;
|
|
if (dims.d[1] > 0 && firstImg.rows != dims.d[1]) mismatch = true;
|
|
if (dims.d[2] > 0 && firstImg.cols != dims.d[2]) mismatch = true;
|
|
if (mismatch) {
|
|
std::cout << "Error: Input " << i << " dimension mismatch!" << std::endl;
|
|
std::cout << " Expected: " << dims.d[2] << "x" << dims.d[1]
|
|
<< "x" << dims.d[0] << " (WxHxC, -1=dynamic)" << std::endl;
|
|
std::cout << " Got: " << firstImg.cols << "x" << firstImg.rows
|
|
<< "x" << firstImg.channels() << " (WxHxC)" << std::endl;
|
|
return false;
|
|
}
|
|
}
|
|
}
|
|
|
|
// Pass 2: Preprocess + D2D copies — all on m_inferenceStream (no null stream).
|
|
//
|
|
// All OpenCV CUDA ops (convertTo, subtract, divide, split) in blobFromGpuMats
|
|
// now run on m_inferenceStream via the cv::cuda::Stream wrapper. This means:
|
|
// • No null-stream interaction — eliminates global sync barriers on WDDM
|
|
// • No event bridge needed — same-stream ordering guarantees correctness
|
|
// • CUDA graphs are safe — cv::cuda::split runs BEFORE graph capture
|
|
//
|
|
// GpuMat-lifetime: preprocessedBuffers keeps GpuMats alive past the final
|
|
// cudaStreamSynchronize, so cudaFree() doesn't stall the pipeline.
|
|
auto _prepStart = std::chrono::steady_clock::now();
|
|
cv::cuda::Stream cvInferStream = cv::cuda::StreamAccessor::wrapStream(m_inferenceStream);
|
|
std::vector<cv::cuda::GpuMat> preprocessedBuffers;
|
|
preprocessedBuffers.reserve(numInputs);
|
|
|
|
for (size_t i = 0; i < numInputs; ++i) {
|
|
const auto& batchInput = inputs[i];
|
|
|
|
// Preprocess on m_inferenceStream (not the null stream).
|
|
preprocessedBuffers.push_back(
|
|
blobFromGpuMats(batchInput, m_subVals, m_divVals, m_normalize, false, cvInferStream));
|
|
|
|
// D2D copy: same stream as preprocessing, so ordering is guaranteed.
|
|
const auto& blobMat = preprocessedBuffers.back();
|
|
const size_t copySize = static_cast<size_t>(blobMat.rows) * static_cast<size_t>(blobMat.cols) * blobMat.elemSize();
|
|
cudaError_t copyErr = cudaMemcpyAsync(
|
|
m_buffers[i],
|
|
preprocessedBuffers.back().ptr<void>(),
|
|
copySize,
|
|
cudaMemcpyDeviceToDevice,
|
|
m_inferenceStream);
|
|
|
|
if (copyErr != cudaSuccess) {
|
|
std::cout << "Error: Failed to copy input " << i
|
|
<< " to inference buffer: " << cudaGetErrorString(copyErr) << std::endl;
|
|
return false;
|
|
}
|
|
}
|
|
|
|
{
|
|
double _prepMs = std::chrono::duration<double, std::milli>(
|
|
std::chrono::steady_clock::now() - _prepStart).count();
|
|
if (_prepMs > 100.0) {
|
|
ANS_DBG("TRT_SM100", "SLOW PREPROCESS: %.1fms batch=%d (blobFromGpuMats+D2D)", _prepMs, batchSize);
|
|
}
|
|
}
|
|
|
|
// ============================================================================
|
|
// PRE-ALLOCATE OUTPUT STRUCTURE
|
|
// ============================================================================
|
|
|
|
const size_t numOutputs = m_outputLengths.size();
|
|
|
|
featureVectors.clear();
|
|
featureVectors.resize(batchSize);
|
|
for (int batch = 0; batch < batchSize; ++batch) {
|
|
featureVectors[batch].resize(numOutputs);
|
|
for (size_t outputIdx = 0; outputIdx < numOutputs; ++outputIdx)
|
|
featureVectors[batch][outputIdx].resize(m_outputLengths[outputIdx]);
|
|
}
|
|
|
|
// ============================================================================
|
|
// RUN INFERENCE + COPY OUTPUTS (CUDA Graph path or direct path)
|
|
// ============================================================================
|
|
|
|
// CUDA Graph path
|
|
// ---------------
|
|
// On the first call for a given batchSize we capture enqueueV3 + D2H copies
|
|
// into a reusable graph. Subsequent calls use cudaGraphLaunch, replacing
|
|
// many individual kernel-submission API calls with a single launch.
|
|
//
|
|
// Prerequisites satisfied here:
|
|
// • Preprocessing + D2D copies are queued on m_inferenceStream (same-stream
|
|
// ordering guarantees they complete before captured kernels execute)
|
|
// • m_pinnedOutputBuffers has stable addresses (allocated in loadNetwork)
|
|
// • m_buffers (GPU outputs) have stable addresses (allocated in loadNetwork)
|
|
//
|
|
// Falls back to the direct path if pinned buffers are unavailable or if
|
|
// graph capture/instantiation fails for any reason.
|
|
|
|
// CUDA graphs capture fixed kernel sequences; incompatible with dynamic spatial dims
|
|
// (input H/W change per inference call → different TRT kernel plans each time).
|
|
// Disabled for pool slots — concurrent graph captures on the same GPU corrupt the
|
|
// CUDA context ("operation not permitted when stream is capturing").
|
|
const bool canGraph = !m_disableGraphs && !m_pinnedOutputBuffers.empty() && !m_hasDynamicSpatialDims;
|
|
bool graphUsed = false;
|
|
|
|
if (canGraph) {
|
|
auto& graphExec = m_graphExecs[batchSize]; // inserts nullptr on first access
|
|
if (!graphExec) {
|
|
ANS_DBG("TRT_Engine", "CUDA graph CAPTURE starting for batch=%d (cached=%zu)",
|
|
batchSize, m_graphExecs.size());
|
|
// First call for this batchSize -- capture a new graph.
|
|
// Serialise captures across all Engine instances on this device to
|
|
// prevent TRT's shared workspace from creating cross-stream
|
|
// dependencies that violate CUDA graph capture rules.
|
|
std::lock_guard<std::mutex> captureLock(graphCaptureMutex());
|
|
|
|
// Clear any sticky CUDA error from a prior failed capture so that
|
|
// this attempt starts clean.
|
|
cudaGetLastError();
|
|
|
|
cudaGraph_t graph = nullptr;
|
|
bool captureOk = false;
|
|
|
|
if (cudaStreamBeginCapture(m_inferenceStream,
|
|
cudaStreamCaptureModeRelaxed) == cudaSuccess) {
|
|
// Record TRT kernels into the graph (not executed yet).
|
|
TRT_ENQUEUE(m_context.get(), m_inferenceStream, m_buffers);
|
|
|
|
// Record D2H copies to stable pinned addresses.
|
|
for (size_t outputIdx = 0; outputIdx < numOutputs; ++outputIdx) {
|
|
cudaMemcpyAsync(
|
|
m_pinnedOutputBuffers[outputIdx],
|
|
static_cast<char*>(m_buffers[numInputs + outputIdx]),
|
|
static_cast<size_t>(batchSize) * m_outputLengths[outputIdx] * sizeof(T),
|
|
cudaMemcpyDeviceToHost,
|
|
m_inferenceStream);
|
|
}
|
|
|
|
captureOk = (cudaStreamEndCapture(m_inferenceStream, &graph) == cudaSuccess
|
|
&& graph != nullptr);
|
|
}
|
|
|
|
if (captureOk) {
|
|
cudaGraphExec_t exec = nullptr;
|
|
if (cudaGraphInstantiate(&exec, graph, nullptr, nullptr, 0) == cudaSuccess)
|
|
graphExec = exec;
|
|
cudaGraphDestroy(graph);
|
|
ANS_DBG("TRT_Engine", "CUDA graph CAPTURED OK for batch=%d exec=%p",
|
|
batchSize, (void*)graphExec);
|
|
}
|
|
|
|
if (!graphExec) {
|
|
ANS_DBG("TRT_Engine", "CUDA graph capture FAILED for batch=%d — falling back to direct path",
|
|
batchSize);
|
|
std::cout << "Warning: CUDA graph capture failed for batchSize="
|
|
<< batchSize << " -- falling back to direct inference path." << std::endl;
|
|
// Disable graph acceleration for this Engine instance.
|
|
for (T* p : m_pinnedOutputBuffers) { if (p) cudaFreeHost(p); }
|
|
m_pinnedOutputBuffers.clear();
|
|
m_graphExecs.erase(batchSize);
|
|
}
|
|
}
|
|
|
|
if (graphExec) {
|
|
ANS_DBG("TRT_Engine", "CUDA graph LAUNCH batch=%d exec=%p", batchSize, (void*)graphExec);
|
|
// Launch the pre-captured graph (single API call replaces many).
|
|
auto _graphStart = std::chrono::steady_clock::now();
|
|
cudaGraphLaunch(graphExec, m_inferenceStream);
|
|
cudaStreamSynchronize_Safe(m_inferenceStream);
|
|
auto _graphEnd = std::chrono::steady_clock::now();
|
|
double _graphMs = std::chrono::duration<double, std::milli>(_graphEnd - _graphStart).count();
|
|
if (_graphMs > 500.0) {
|
|
ANS_DBG("TRT_SM100", "SLOW GRAPH: %.1fms batch=%d active=%d",
|
|
_graphMs, batchSize, s_globalActiveInf.load());
|
|
}
|
|
|
|
// CPU memcpy: pinned buffers -> featureVectors (interleaved by batch).
|
|
for (int batch = 0; batch < batchSize; ++batch) {
|
|
for (size_t outputIdx = 0; outputIdx < numOutputs; ++outputIdx) {
|
|
std::memcpy(
|
|
featureVectors[batch][outputIdx].data(),
|
|
m_pinnedOutputBuffers[outputIdx]
|
|
+ static_cast<size_t>(batch) * m_outputLengths[outputIdx],
|
|
m_outputLengths[outputIdx] * sizeof(T));
|
|
}
|
|
}
|
|
graphUsed = true;
|
|
}
|
|
}
|
|
|
|
// Direct path (no graph)
|
|
// ----------------------
|
|
// Used when pinned buffers are unavailable or graph capture failed.
|
|
if (!graphUsed) {
|
|
ANS_DBG("TRT_Engine", "Direct path (no graph) batch=%d", batchSize);
|
|
auto enqueueStart = std::chrono::steady_clock::now();
|
|
bool success = TRT_ENQUEUE(m_context.get(), m_inferenceStream, m_buffers);
|
|
auto enqueueEnd = std::chrono::steady_clock::now();
|
|
double enqueueMs = std::chrono::duration<double, std::milli>(enqueueEnd - enqueueStart).count();
|
|
if (enqueueMs > 500.0) {
|
|
ANS_DBG("TRT_Engine", "SLOW ENQUEUE: %.1fms batch=%d (enqueueV3 blocked!)", enqueueMs, batchSize);
|
|
}
|
|
if (!success) {
|
|
ANS_DBG("TRT_Engine", "ERROR: enqueueV3 FAILED batch=%d", batchSize);
|
|
return false;
|
|
}
|
|
|
|
for (int batch = 0; batch < batchSize; ++batch) {
|
|
for (size_t outputIdx = 0; outputIdx < numOutputs; ++outputIdx) {
|
|
const size_t outputBinding = numInputs + outputIdx;
|
|
const size_t offset =
|
|
static_cast<size_t>(batch) * m_outputLengths[outputIdx] * sizeof(T);
|
|
|
|
cudaError_t copyErr = cudaMemcpyAsync(
|
|
featureVectors[batch][outputIdx].data(),
|
|
static_cast<char*>(m_buffers[outputBinding]) + offset,
|
|
m_outputLengths[outputIdx] * sizeof(T),
|
|
cudaMemcpyDeviceToHost,
|
|
m_inferenceStream);
|
|
|
|
if (copyErr != cudaSuccess) {
|
|
return false;
|
|
}
|
|
}
|
|
}
|
|
|
|
auto syncStart = std::chrono::steady_clock::now();
|
|
cudaError_t syncErr = cudaStreamSynchronize_Safe(m_inferenceStream);
|
|
auto syncEnd = std::chrono::steady_clock::now();
|
|
double syncMs = std::chrono::duration<double, std::milli>(syncEnd - syncStart).count();
|
|
if (syncMs > 500.0) {
|
|
ANS_DBG("TRT_Engine", "SLOW INFERENCE SYNC: %.1fms batch=%d (direct path)", syncMs, batchSize);
|
|
}
|
|
if (syncErr != cudaSuccess) {
|
|
ANS_DBG("TRT_Engine", "ERROR: cudaStreamSync FAILED err=%d (%s)",
|
|
(int)syncErr, cudaGetErrorString(syncErr));
|
|
return false;
|
|
}
|
|
}
|
|
|
|
// ============================================================================
|
|
// Per-inference total timing breakdown (mutex wait + preprocess + GPU)
|
|
// ============================================================================
|
|
{
|
|
double totalMs = std::chrono::duration<double, std::milli>(
|
|
std::chrono::steady_clock::now() - _mutexWaitStart).count();
|
|
double gpuMs = totalMs - _mutexWaitMs; // Everything after mutex acquired
|
|
// Log every inference that takes >100ms total (including mutex wait)
|
|
if (totalMs > 100.0) {
|
|
ANS_DBG("TRT_Timing", "total=%.1fms (mutex=%.1fms gpu=%.1fms) batch=%d active=%d",
|
|
totalMs, _mutexWaitMs, gpuMs, batchSize, s_globalActiveInf.load());
|
|
}
|
|
}
|
|
|
|
// ============================================================================
|
|
// SM=100% DETECTOR — end-of-inference timing
|
|
// ============================================================================
|
|
{
|
|
static thread_local double s_ema = 0;
|
|
static thread_local std::chrono::steady_clock::time_point s_prevEnd;
|
|
static thread_local bool s_firstDone = false;
|
|
|
|
auto _now = std::chrono::steady_clock::now();
|
|
if (s_firstDone) {
|
|
double sinceLastMs = std::chrono::duration<double, std::milli>(_now - s_prevEnd).count();
|
|
// If time between consecutive inferences jumps dramatically,
|
|
// something blocked the thread (SM=100% or mutex contention)
|
|
if (s_ema > 0 && sinceLastMs > s_ema * 3.0 && sinceLastMs > 500.0) {
|
|
size_t vf = 0, vt = 0;
|
|
cudaMemGetInfo(&vf, &vt);
|
|
ANS_DBG("TRT_SM100", "GAP DETECTED: %.1fms between inferences (avg=%.1fms, %.1fx) active=%d VRAM=%zuMB free",
|
|
sinceLastMs, s_ema, sinceLastMs / s_ema,
|
|
s_globalActiveInf.load(), vf / (1024*1024));
|
|
}
|
|
s_ema = (s_ema == 0) ? sinceLastMs : (0.9 * s_ema + 0.1 * sinceLastMs);
|
|
}
|
|
s_prevEnd = _now;
|
|
s_firstDone = true;
|
|
|
|
s_globalActiveInf.fetch_sub(1);
|
|
}
|
|
|
|
return true;
|
|
}
|