Use software decoder by default

This commit is contained in:
2026-04-04 20:19:54 +11:00
parent 3a21026790
commit e134ebdf15
24 changed files with 693 additions and 215 deletions

View File

@@ -1,8 +1,10 @@
#pragma once
#include <cstring>
#include <chrono>
#include <filesystem>
#include <semaphore>
#include "TRTCompat.h"
#include "ANSLicense.h" // ANS_DBG macro for DebugView logging
// Per-device mutex for CUDA graph capture.
// TRT's enqueueV3 uses shared internal resources (workspace, memory pools)
@@ -86,11 +88,9 @@ static inline cudaError_t cudaStreamSynchronize_Safe(cudaStream_t stream) {
cudaError_t err = cudaStreamQuery(stream);
if (err != cudaErrorNotReady) return err;
auto syncStart = std::chrono::steady_clock::now();
// Short Sleep(0) fast path (~10 iterations) catches sub-ms kernel completions.
// Then switch to Sleep(1) to give cleanup operations (cuArrayDestroy, cuMemFree)
// a window to acquire the exclusive nvcuda64 SRW lock.
// Previously used 1000 Sleep(0) iterations which hogged the SRW lock and
// caused ~20-second stalls when concurrent cleanup needed exclusive access.
for (int i = 0; i < 10; ++i) {
Sleep(0);
err = cudaStreamQuery(stream);
@@ -98,10 +98,21 @@ static inline cudaError_t cudaStreamSynchronize_Safe(cudaStream_t stream) {
}
// Slow path: poll the stream once per 1 ms sleep until it stops reporting
// cudaErrorNotReady. The 1 ms sleeps add negligible latency at 30 FPS but
// prevent SRW lock starvation.
// The loop has a single exit so the slow-sync diagnostic below is always
// evaluated before returning (an unconditional early return ahead of it
// would make the logging unreachable).
int sleepCount = 0;
while (true) {
    Sleep(1);
    sleepCount++;
    err = cudaStreamQuery(stream);
    if (err == cudaErrorNotReady) {
        continue; // work still pending on the stream, keep polling
    }
    // Log if sync took too long (>500ms indicates GPU stall).
    // elapsed is measured from syncStart, i.e. it includes the Sleep(0) fast path.
    const double elapsed = std::chrono::duration<double, std::milli>(
        std::chrono::steady_clock::now() - syncStart).count();
    if (elapsed > 500.0) {
        ANS_DBG("TRT_Engine", "SLOW SYNC: %.1fms (%d sleeps) stream=%p err=%d",
                elapsed, sleepCount, (void*)stream, (int)err);
    }
    // Hand back whatever cudaStreamQuery reported (success or an error code).
    return err;
}
}
@@ -368,6 +379,71 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
return false;
}
// ============================================================================
// SM=100% DETECTOR — tracks inference timing trends to catch the exact
// moment GPU becomes saturated. Logs every 50 inferences with rolling
// average, and immediately when degradation is detected.
// ============================================================================
// Global (process-wide) counters shared across all engine instances/threads
// NOTE(review): these are function-local statics. The enclosing function appears
// to be a member of the Engine<T> class template, in which case each distinct T
// gets its own copy of these counters — confirm whether that is intended.
static std::atomic<int64_t> s_globalInfCount{0};
static std::atomic<int> s_globalActiveInf{0}; // currently in-flight inferences
// NOTE(review): s_globalLastAvgMs is not read or written anywhere in the visible
// part of this function — verify it is used elsewhere or remove it.
static std::atomic<double> s_globalLastAvgMs{0.0}; // last known avg inference time
// 1-based sequence number of this call across all threads sharing the counter.
const int64_t myInfNum = s_globalInfCount.fetch_add(1) + 1;
// Mark this inference as in flight.
// NOTE(review): the only matching fetch_sub visible is on the success path at the
// end of the function; the early `return false` paths do not appear to decrement,
// which would let the "active" count drift upward after failures — confirm.
s_globalActiveInf.fetch_add(1);
// Per-thread tracking.
// All state below is thread_local and scoped to this block: it is shared by
// every call made on the same thread and is not visible to the rest of the
// function. Because of that, the timing statistics are computed right here
// from the interval between consecutive entries into this block on the
// calling thread (wall-clock call cadence, which includes any time the caller
// spends between inferences).
{
    using SteadyClock = std::chrono::steady_clock;

    static thread_local int64_t s_infCount = 0;
    static thread_local SteadyClock::time_point s_lastLog;   // start of the current 50-inference window
    static thread_local SteadyClock::time_point s_prevCall;  // entry time of the previous call on this thread
    static thread_local double s_rollingAvgMs = 0.0;         // EMA of the call-to-call interval
    static thread_local double s_baselineMs = 0.0;           // s_rollingAvgMs sampled at the 100th inference
    static thread_local double s_maxMs = 0.0;                // largest interval in the current window
    static thread_local bool s_degradationLogged = false;    // degradation is reported once per thread

    const SteadyClock::time_point callTime = SteadyClock::now();
    s_infCount++;

    if (s_infCount == 1) {
        s_lastLog = callTime;
        // Cast: int64_t is not guaranteed to be long long on every platform.
        ANS_DBG("TRT_SM100", "FIRST inference — engine alive, globalInf=%lld", (long long)myInfNum);
    } else {
        // Feed the statistics; without this the average/max would stay at 0,
        // the baseline could never be established and degradation detection
        // could never trigger.
        const double intervalMs =
            std::chrono::duration<double, std::milli>(callTime - s_prevCall).count();
        s_rollingAvgMs = (s_rollingAvgMs == 0.0)
            ? intervalMs
            : (0.9 * s_rollingAvgMs + 0.1 * intervalMs);
        if (intervalMs > s_maxMs) {
            s_maxMs = intervalMs;
        }
    }
    s_prevCall = callTime;

    // Log every 50 inferences (more frequent than 500 to catch transitions)
    if (s_infCount % 50 == 0) {
        const double elapsed = std::chrono::duration<double>(callTime - s_lastLog).count();
        const double fps = (elapsed > 0) ? (50.0 / elapsed) : 0;
        s_lastLog = callTime;

        // On failure the outputs stay 0 and the log simply shows 0MB/0MB.
        size_t vramFree = 0, vramTotal = 0;
        cudaMemGetInfo(&vramFree, &vramTotal);
        const size_t vramUsedMB = (vramTotal - vramFree) / (1024 * 1024);
        const size_t vramTotalMB = vramTotal / (1024 * 1024);

        // The input count is validated only after this block, so do not
        // index inputs[0] unless it exists.
        const int batchForLog = inputs.empty() ? 0 : (int)inputs[0].size();

        // VRAM is reported as used/total.
        ANS_DBG("TRT_SM100", "#%lld [global=%lld active=%d] %.1f inf/sec avgMs=%.1f maxMs=%.1f batch=%d graphs=%zu VRAM=%zuMB/%zuMB",
                (long long)s_infCount, (long long)myInfNum, s_globalActiveInf.load(),
                fps, s_rollingAvgMs, s_maxMs,
                batchForLog, m_graphExecs.size(),
                vramUsedMB, vramTotalMB);

        // Capture baseline from first 100 inferences
        if (s_infCount == 100) {
            s_baselineMs = s_rollingAvgMs;
            ANS_DBG("TRT_SM100", "BASELINE established: %.1fms/inference", s_baselineMs);
        }

        // Detect degradation: avg >3x baseline AND baseline is set
        if (s_baselineMs > 0 && s_rollingAvgMs > s_baselineMs * 3.0 && !s_degradationLogged) {
            s_degradationLogged = true;
            ANS_DBG("TRT_SM100", "*** DEGRADATION DETECTED *** avg=%.1fms baseline=%.1fms (%.1fx) VRAM=%zuMB/%zuMB active=%d",
                    s_rollingAvgMs, s_baselineMs, s_rollingAvgMs / s_baselineMs,
                    vramUsedMB, vramTotalMB, s_globalActiveInf.load());
        }

        // Reset max for next window
        s_maxMs = 0.0;
    }
}
const auto numInputs = m_inputDims.size();
if (inputs.size() != numInputs) {
std::cout << "Error: Wrong number of inputs. Expected: " << numInputs
@@ -457,6 +533,9 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
}
if (anyDimChanged) {
ANS_DBG("TRT_Engine", "Shape change detected: batch %d -> %d (graphsCached=%zu)",
m_lastBatchSize, batchSize, m_graphExecs.size());
// === First-time diagnostics (verbose, once) ===
const bool firstTime = !m_batchShapeChangeLogged;
@@ -536,7 +615,10 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
<< newDims.d[3] << "]" << std::endl;
}
// Trace the shape being applied. The dimension values are cast to long long
// and printed with %lld: the element type of Dims::d differs between TensorRT
// versions (32-bit vs 64-bit), and passing a 64-bit value to a %d conversion
// in a printf-style varargs call is undefined behaviour.
ANS_DBG("TRT_Engine", "setInputShape('%s') [%lld,%lld,%lld,%lld]",
        tensorName,
        (long long)newDims.d[0], (long long)newDims.d[1],
        (long long)newDims.d[2], (long long)newDims.d[3]);
// Apply the new shape to the execution context; abort the inference if the
// context rejects it.
if (!m_context->setInputShape(tensorName, newDims)) {
    ANS_DBG("TRT_Engine", "ERROR: setInputShape FAILED for '%s'", tensorName);
    std::cout << "Error: Failed to set input shape for '" << tensorName << "'" << std::endl;
    return false;
}
@@ -576,6 +658,25 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
m_lastBatchSize = batchSize;
m_batchShapeChangeLogged = true;
// CRITICAL: Invalidate all cached CUDA graphs after shape change.
// Graphs were captured with the OLD context state (old tensor shapes).
// Launching them after setInputShape() produces undefined GPU behavior
// (invalid kernel sequences, SM lockup at 100%, inference hang).
if (!m_graphExecs.empty()) {
size_t destroyed = m_graphExecs.size();
for (auto& [bs, ge] : m_graphExecs) {
if (ge) cudaGraphExecDestroy(ge);
}
m_graphExecs.clear();
ANS_DBG("TRT_Engine", "INVALIDATED %zu cached CUDA graphs after shape change (batch=%d)",
destroyed, batchSize);
if (m_verbose || firstTime) {
std::cout << "Info: Invalidated " << destroyed
<< " cached CUDA graphs after shape change" << std::endl;
}
}
if (m_verbose && firstTime) {
std::cout << "\nInfo: Input shapes updated successfully for batch size "
<< batchSize << " ✓\n" << std::endl;
@@ -619,6 +720,7 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
//
// GpuMat-lifetime: preprocessedBuffers keeps GpuMats alive past the final
// cudaStreamSynchronize, so cudaFree() doesn't stall the pipeline.
auto _prepStart = std::chrono::steady_clock::now();
cv::cuda::Stream cvInferStream = cv::cuda::StreamAccessor::wrapStream(m_inferenceStream);
std::vector<cv::cuda::GpuMat> preprocessedBuffers;
preprocessedBuffers.reserve(numInputs);
@@ -647,6 +749,14 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
}
}
{
    // Host-side time spent since _prepStart was taken; report it only when it
    // exceeds 100 ms.
    const auto prepElapsed = std::chrono::steady_clock::now() - _prepStart;
    const double prepMillis =
        std::chrono::duration<double, std::milli>(prepElapsed).count();
    const bool prepWasSlow = prepMillis > 100.0;
    if (prepWasSlow) {
        ANS_DBG("TRT_SM100", "SLOW PREPROCESS: %.1fms batch=%d (blobFromGpuMats+D2D)", prepMillis, batchSize);
    }
}
// ============================================================================
// PRE-ALLOCATE OUTPUT STRUCTURE
// ============================================================================
@@ -690,6 +800,8 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
if (canGraph) {
auto& graphExec = m_graphExecs[batchSize]; // inserts nullptr on first access
if (!graphExec) {
ANS_DBG("TRT_Engine", "CUDA graph CAPTURE starting for batch=%d (cached=%zu)",
batchSize, m_graphExecs.size());
// First call for this batchSize -- capture a new graph.
// Serialise captures across all Engine instances on this device to
// prevent TRT's shared workspace from creating cross-stream
@@ -727,9 +839,13 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
if (cudaGraphInstantiate(&exec, graph, nullptr, nullptr, 0) == cudaSuccess)
graphExec = exec;
cudaGraphDestroy(graph);
ANS_DBG("TRT_Engine", "CUDA graph CAPTURED OK for batch=%d exec=%p",
batchSize, (void*)graphExec);
}
if (!graphExec) {
ANS_DBG("TRT_Engine", "CUDA graph capture FAILED for batch=%d — falling back to direct path",
batchSize);
std::cout << "Warning: CUDA graph capture failed for batchSize="
<< batchSize << " -- falling back to direct inference path." << std::endl;
// Disable graph acceleration for this Engine instance.
@@ -740,9 +856,17 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
}
if (graphExec) {
ANS_DBG("TRT_Engine", "CUDA graph LAUNCH batch=%d exec=%p", batchSize, (void*)graphExec);
// Launch the pre-captured graph (single API call replaces many).
auto _graphStart = std::chrono::steady_clock::now();
cudaGraphLaunch(graphExec, m_inferenceStream);
cudaStreamSynchronize_Safe(m_inferenceStream);
auto _graphEnd = std::chrono::steady_clock::now();
double _graphMs = std::chrono::duration<double, std::milli>(_graphEnd - _graphStart).count();
if (_graphMs > 500.0) {
ANS_DBG("TRT_SM100", "SLOW GRAPH: %.1fms batch=%d active=%d",
_graphMs, batchSize, s_globalActiveInf.load());
}
// CPU memcpy: pinned buffers -> featureVectors (interleaved by batch).
for (int batch = 0; batch < batchSize; ++batch) {
@@ -762,8 +886,16 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
// ----------------------
// Used when pinned buffers are unavailable or graph capture failed.
if (!graphUsed) {
ANS_DBG("TRT_Engine", "Direct path (no graph) batch=%d", batchSize);
auto enqueueStart = std::chrono::steady_clock::now();
bool success = TRT_ENQUEUE(m_context.get(), m_inferenceStream, m_buffers);
auto enqueueEnd = std::chrono::steady_clock::now();
double enqueueMs = std::chrono::duration<double, std::milli>(enqueueEnd - enqueueStart).count();
if (enqueueMs > 500.0) {
ANS_DBG("TRT_Engine", "SLOW ENQUEUE: %.1fms batch=%d (enqueueV3 blocked!)", enqueueMs, batchSize);
}
if (!success) {
ANS_DBG("TRT_Engine", "ERROR: enqueueV3 FAILED batch=%d", batchSize);
std::string debugInfo = "[Engine] runInference FAIL: enqueue returned false, batch="
+ std::to_string(batchSize)
+ ", dimsSpecified=" + (m_context->allInputDimensionsSpecified() ? "YES" : "NO");
@@ -805,8 +937,16 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
}
}
auto syncStart = std::chrono::steady_clock::now();
cudaError_t syncErr = cudaStreamSynchronize_Safe(m_inferenceStream);
auto syncEnd = std::chrono::steady_clock::now();
double syncMs = std::chrono::duration<double, std::milli>(syncEnd - syncStart).count();
if (syncMs > 500.0) {
ANS_DBG("TRT_Engine", "SLOW INFERENCE SYNC: %.1fms batch=%d (direct path)", syncMs, batchSize);
}
if (syncErr != cudaSuccess) {
ANS_DBG("TRT_Engine", "ERROR: cudaStreamSync FAILED err=%d (%s)",
(int)syncErr, cudaGetErrorString(syncErr));
std::string errMsg = "[Engine] runInference FAIL: cudaStreamSynchronize: "
+ std::string(cudaGetErrorString(syncErr));
std::cout << errMsg << std::endl;
@@ -815,5 +955,33 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
}
}
// ============================================================================
// SM=100% DETECTOR — end-of-inference timing
// ============================================================================
// Tracks, per thread, the time between the completion of consecutive
// inferences and reports a gap when it is both above 500 ms and more than
// three times the running average. Also releases the in-flight counter.
{
    using FinishClock = std::chrono::steady_clock;

    static thread_local double s_gapEmaMs = 0;                   // exponential moving average of the gap
    static thread_local FinishClock::time_point s_lastFinish;    // completion time of the previous inference
    static thread_local bool s_haveLastFinish = false;           // false until one inference has completed

    const FinishClock::time_point finishedAt = FinishClock::now();

    if (s_haveLastFinish) {
        const double gapMs =
            std::chrono::duration<double, std::milli>(finishedAt - s_lastFinish).count();

        // A sudden jump in the gap between consecutive inferences means
        // something blocked the thread (SM=100% or mutex contention).
        const bool emaSeeded = s_gapEmaMs > 0;
        if (emaSeeded && gapMs > 500.0 && gapMs > s_gapEmaMs * 3.0) {
            size_t freeBytes = 0, totalBytes = 0;
            cudaMemGetInfo(&freeBytes, &totalBytes);
            ANS_DBG("TRT_SM100", "GAP DETECTED: %.1fms between inferences (avg=%.1fms, %.1fx) active=%d VRAM=%zuMB free",
                    gapMs, s_gapEmaMs, gapMs / s_gapEmaMs,
                    s_globalActiveInf.load(), freeBytes / (1024*1024));
        }

        // Seed the average with the first sample, then blend 90/10.
        if (s_gapEmaMs == 0) {
            s_gapEmaMs = gapMs;
        } else {
            s_gapEmaMs = 0.9 * s_gapEmaMs + 0.1 * gapMs;
        }
    }

    s_lastFinish = finishedAt;
    s_haveLastFinish = true;

    // This inference is no longer in flight.
    s_globalActiveInf.fetch_sub(1);
}
return true;
}