Use software decoder by default

2026-04-04 20:19:54 +11:00
parent 3a21026790
commit e134ebdf15
24 changed files with 693 additions and 215 deletions
--- a/engines/TensorRTAPI/include/engine/EngineRunInference.inl
+++ b/engines/TensorRTAPI/include/engine/EngineRunInference.inl
@@ -1,8 +1,10 @@
 #pragma once
 #include <cstring>
+#include <chrono>
 #include <filesystem>
 #include <semaphore>
 #include "TRTCompat.h"
+#include "ANSLicense.h"   // ANS_DBG macro for DebugView logging

 // Per-device mutex for CUDA graph capture.
 // TRT's enqueueV3 uses shared internal resources (workspace, memory pools)
@@ -86,11 +88,9 @@ static inline cudaError_t cudaStreamSynchronize_Safe(cudaStream_t stream) {
    cudaError_t err = cudaStreamQuery(stream);
    if (err != cudaErrorNotReady) return err;

+    auto syncStart = std::chrono::steady_clock::now();
+
    // Short Sleep(0) fast path (~10 iterations) catches sub-ms kernel completions.
-    // Then switch to Sleep(1) to give cleanup operations (cuArrayDestroy, cuMemFree)
-    // a window to acquire the exclusive nvcuda64 SRW lock.
-    // Previously used 1000 Sleep(0) iterations which hogged the SRW lock and
-    // caused ~20-second stalls when concurrent cleanup needed exclusive access.
    for (int i = 0; i < 10; ++i) {
        Sleep(0);
        err = cudaStreamQuery(stream);
@@ -98,10 +98,21 @@ static inline cudaError_t cudaStreamSynchronize_Safe(cudaStream_t stream) {
    }

    // 1ms sleeps — adds negligible latency at 30 FPS but prevents SRW lock starvation.
+    int sleepCount = 0;
    while (true) {
        Sleep(1);
+        sleepCount++;
        err = cudaStreamQuery(stream);
-        if (err != cudaErrorNotReady) return err;
+        if (err != cudaErrorNotReady) {
+            // Log if sync took too long (>500ms indicates GPU stall)
+            auto elapsed = std::chrono::duration<double, std::milli>(
+                std::chrono::steady_clock::now() - syncStart).count();
+            if (elapsed > 500.0) {
+                ANS_DBG("TRT_Engine", "SLOW SYNC: %.1fms (%d sleeps) stream=%p err=%d",
+                        elapsed, sleepCount, (void*)stream, (int)err);
+            }
+            return err;
+        }
    }
 }

@@ -368,6 +379,71 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
        return false;
    }

+    // ============================================================================
+    // SM=100% DETECTOR — tracks inference timing trends to catch the moment
+    // the GPU becomes saturated. Logs every 50 inferences with the rolling
+    // average; degradation is checked at those same 50-inference checkpoints.
+    // ============================================================================
+    // Counters shared by all instances/threads of this Engine<T> instantiation (function-local statics, so one set per T)
+    static std::atomic<int64_t> s_globalInfCount{0};
+    static std::atomic<int>     s_globalActiveInf{0};   // currently in-flight inferences
+    static std::atomic<double>  s_globalLastAvgMs{0.0};  // last known avg inference time
+
+    const int64_t myInfNum = s_globalInfCount.fetch_add(1) + 1;
+    s_globalActiveInf.fetch_add(1);
+
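+    // Per-thread tracking. NOTE(review): s_rollingAvgMs and s_maxMs are never assigned a measured value in this block, so avgMs/maxMs log as 0.0 and the baseline/degradation checks cannot fire — confirm.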
+    {
+        static thread_local int64_t s_infCount = 0;
+        static thread_local std::chrono::steady_clock::time_point s_lastLog;
+        static thread_local double  s_rollingAvgMs = 0.0;
+        static thread_local double  s_baselineMs = 0.0;   // avg during first 100 inferences
+        static thread_local double  s_maxMs = 0.0;
+        static thread_local bool    s_degradationLogged = false;
+        s_infCount++;
+
+        if (s_infCount == 1) {
+            s_lastLog = std::chrono::steady_clock::now();
+            ANS_DBG("TRT_SM100", "FIRST inference — engine alive, globalInf=%lld", myInfNum);
+        }
+
+        // Log every 50 inferences (more frequent than 500 to catch transitions)
+        if (s_infCount % 50 == 0) {
+            auto now = std::chrono::steady_clock::now();
+            double elapsed = std::chrono::duration<double>(now - s_lastLog).count();
+            double fps = (elapsed > 0) ? (50.0 / elapsed) : 0;
+            s_lastLog = now;
+
+            size_t vramFree = 0, vramTotal = 0;
+            cudaMemGetInfo(&vramFree, &vramTotal);
+            size_t vramUsedMB = (vramTotal - vramFree) / (1024 * 1024);
+            size_t vramFreeMB = vramFree / (1024 * 1024);
+
+            ANS_DBG("TRT_SM100", "#%lld [global=%lld active=%d] %.1f inf/sec avgMs=%.1f maxMs=%.1f batch=%d graphs=%zu VRAM=%zuMB/%zuMB",
+                    s_infCount, myInfNum, s_globalActiveInf.load(),
+                    fps, s_rollingAvgMs, s_maxMs,
+                    (int)inputs[0].size(), m_graphExecs.size(),
+                    vramUsedMB, vramFreeMB);
+
+            // Capture baseline: snapshot of the rolling average at inference #100
+            if (s_infCount == 100) {
+                s_baselineMs = s_rollingAvgMs;
+                ANS_DBG("TRT_SM100", "BASELINE established: %.1fms/inference", s_baselineMs);
+            }
+
+            // Detect degradation: avg > 3x baseline AND baseline is set (logged once per thread)
+            if (s_baselineMs > 0 && s_rollingAvgMs > s_baselineMs * 3.0 && !s_degradationLogged) {
+                s_degradationLogged = true;
+                ANS_DBG("TRT_SM100", "*** DEGRADATION DETECTED *** avg=%.1fms baseline=%.1fms (%.1fx) VRAM=%zuMB/%zuMB active=%d",
+                        s_rollingAvgMs, s_baselineMs, s_rollingAvgMs / s_baselineMs,
+                        vramUsedMB, vramFreeMB, s_globalActiveInf.load());
+            }
+
+            // Reset max for next window
+            s_maxMs = 0.0;
+        }
+    }
+
    const auto numInputs = m_inputDims.size();
    if (inputs.size() != numInputs) {
        std::cout << "Error: Wrong number of inputs. Expected: " << numInputs
@@ -457,6 +533,9 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
        }

        if (anyDimChanged) {
+            ANS_DBG("TRT_Engine", "Shape change detected: batch %d -> %d (graphsCached=%zu)",
+                    m_lastBatchSize, batchSize, m_graphExecs.size());
+
            // === First-time diagnostics (verbose, once) ===
            const bool firstTime = !m_batchShapeChangeLogged;

@@ -536,7 +615,10 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
                        << newDims.d[3] << "]" << std::endl;
                }

+                ANS_DBG("TRT_Engine", "setInputShape('%s') [%d,%d,%d,%d]",
+                        tensorName, newDims.d[0], newDims.d[1], newDims.d[2], newDims.d[3]);
                if (!m_context->setInputShape(tensorName, newDims)) {
+                    ANS_DBG("TRT_Engine", "ERROR: setInputShape FAILED for '%s'", tensorName);
                    std::cout << "Error: Failed to set input shape for '" << tensorName << "'" << std::endl;
                    return false;
                }
@@ -576,6 +658,25 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i

            m_lastBatchSize = batchSize;
            m_batchShapeChangeLogged = true;
+
+            // CRITICAL: Invalidate all cached CUDA graphs after shape change.
+            // Graphs were captured with the OLD context state (old tensor shapes).
+            // Launching them after setInputShape() produces undefined GPU behavior
+            // (invalid kernel sequences, SM lockup at 100%, inference hang).
+            if (!m_graphExecs.empty()) {
+                size_t destroyed = m_graphExecs.size();
+                for (auto& [bs, ge] : m_graphExecs) {
+                    if (ge) cudaGraphExecDestroy(ge);
+                }
+                m_graphExecs.clear();
+                ANS_DBG("TRT_Engine", "INVALIDATED %zu cached CUDA graphs after shape change (batch=%d)",
+                        destroyed, batchSize);
+                if (m_verbose || firstTime) {
+                    std::cout << "Info: Invalidated " << destroyed
+                        << " cached CUDA graphs after shape change" << std::endl;
+                }
+            }
+
            if (m_verbose && firstTime) {
                std::cout << "\nInfo: Input shapes updated successfully for batch size "
                    << batchSize << " ✓\n" << std::endl;
@@ -619,6 +720,7 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
    //
    // GpuMat-lifetime: preprocessedBuffers keeps GpuMats alive past the final
    // cudaStreamSynchronize, so cudaFree() doesn't stall the pipeline.
+    auto _prepStart = std::chrono::steady_clock::now();
    cv::cuda::Stream cvInferStream = cv::cuda::StreamAccessor::wrapStream(m_inferenceStream);
    std::vector<cv::cuda::GpuMat> preprocessedBuffers;
    preprocessedBuffers.reserve(numInputs);
@@ -647,6 +749,14 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
        }
    }

+    {
+        double _prepMs = std::chrono::duration<double, std::milli>(
+            std::chrono::steady_clock::now() - _prepStart).count();
+        if (_prepMs > 100.0) {
+            ANS_DBG("TRT_SM100", "SLOW PREPROCESS: %.1fms batch=%d (blobFromGpuMats+D2D)", _prepMs, batchSize);
+        }
+    }
+
    // ============================================================================
    // PRE-ALLOCATE OUTPUT STRUCTURE
    // ============================================================================
@@ -690,6 +800,8 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
    if (canGraph) {
        auto& graphExec = m_graphExecs[batchSize];   // inserts nullptr on first access
        if (!graphExec) {
+            ANS_DBG("TRT_Engine", "CUDA graph CAPTURE starting for batch=%d (cached=%zu)",
+                    batchSize, m_graphExecs.size());
            // First call for this batchSize -- capture a new graph.
            // Serialise captures across all Engine instances on this device to
            // prevent TRT's shared workspace from creating cross-stream
@@ -727,9 +839,13 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
                if (cudaGraphInstantiate(&exec, graph, nullptr, nullptr, 0) == cudaSuccess)
                    graphExec = exec;
                cudaGraphDestroy(graph);
+                ANS_DBG("TRT_Engine", "CUDA graph CAPTURED OK for batch=%d exec=%p",
+                        batchSize, (void*)graphExec);
            }

            if (!graphExec) {
+                ANS_DBG("TRT_Engine", "CUDA graph capture FAILED for batch=%d — falling back to direct path",
+                        batchSize);
                std::cout << "Warning: CUDA graph capture failed for batchSize="
                    << batchSize << " -- falling back to direct inference path." << std::endl;
                // Disable graph acceleration for this Engine instance.
@@ -740,9 +856,17 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
        }

        if (graphExec) {
+            ANS_DBG("TRT_Engine", "CUDA graph LAUNCH batch=%d exec=%p", batchSize, (void*)graphExec);
            // Launch the pre-captured graph (single API call replaces many).
+            auto _graphStart = std::chrono::steady_clock::now();
            cudaGraphLaunch(graphExec, m_inferenceStream);
            cudaStreamSynchronize_Safe(m_inferenceStream);
+            auto _graphEnd = std::chrono::steady_clock::now();
+            double _graphMs = std::chrono::duration<double, std::milli>(_graphEnd - _graphStart).count();
+            if (_graphMs > 500.0) {
+                ANS_DBG("TRT_SM100", "SLOW GRAPH: %.1fms batch=%d active=%d",
+                        _graphMs, batchSize, s_globalActiveInf.load());
+            }

            // CPU memcpy: pinned buffers -> featureVectors (interleaved by batch).
            for (int batch = 0; batch < batchSize; ++batch) {
@@ -762,8 +886,16 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
    // ----------------------
    // Used when pinned buffers are unavailable or graph capture failed.
    if (!graphUsed) {
+        ANS_DBG("TRT_Engine", "Direct path (no graph) batch=%d", batchSize);
+        auto enqueueStart = std::chrono::steady_clock::now();
        bool success = TRT_ENQUEUE(m_context.get(), m_inferenceStream, m_buffers);
+        auto enqueueEnd = std::chrono::steady_clock::now();
+        double enqueueMs = std::chrono::duration<double, std::milli>(enqueueEnd - enqueueStart).count();
+        if (enqueueMs > 500.0) {
+            ANS_DBG("TRT_Engine", "SLOW ENQUEUE: %.1fms batch=%d (enqueueV3 blocked!)", enqueueMs, batchSize);
+        }
        if (!success) {
+            ANS_DBG("TRT_Engine", "ERROR: enqueueV3 FAILED batch=%d", batchSize);
            std::string debugInfo = "[Engine] runInference FAIL: enqueue returned false, batch="
                + std::to_string(batchSize)
                + ", dimsSpecified=" + (m_context->allInputDimensionsSpecified() ? "YES" : "NO");
@@ -805,8 +937,16 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
            }
        }

+        auto syncStart = std::chrono::steady_clock::now();
        cudaError_t syncErr = cudaStreamSynchronize_Safe(m_inferenceStream);
+        auto syncEnd = std::chrono::steady_clock::now();
+        double syncMs = std::chrono::duration<double, std::milli>(syncEnd - syncStart).count();
+        if (syncMs > 500.0) {
+            ANS_DBG("TRT_Engine", "SLOW INFERENCE SYNC: %.1fms batch=%d (direct path)", syncMs, batchSize);
+        }
        if (syncErr != cudaSuccess) {
+            ANS_DBG("TRT_Engine", "ERROR: cudaStreamSync FAILED err=%d (%s)",
+                    (int)syncErr, cudaGetErrorString(syncErr));
            std::string errMsg = "[Engine] runInference FAIL: cudaStreamSynchronize: "
                + std::string(cudaGetErrorString(syncErr));
            std::cout << errMsg << std::endl;
@@ -815,5 +955,33 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
        }
    }

+    // ============================================================================
+    // SM=100% DETECTOR — end-of-inference timing
+    // ============================================================================
+    {
+        static thread_local double s_ema = 0;
+        static thread_local std::chrono::steady_clock::time_point s_prevEnd;
+        static thread_local bool s_firstDone = false;
+
+        auto _now = std::chrono::steady_clock::now();
+        if (s_firstDone) {
+            double sinceLastMs = std::chrono::duration<double, std::milli>(_now - s_prevEnd).count();
+            // If time between consecutive inferences jumps dramatically,
+            // something blocked the thread (SM=100% or mutex contention)
+            if (s_ema > 0 && sinceLastMs > s_ema * 3.0 && sinceLastMs > 500.0) {
+                size_t vf = 0, vt = 0;
+                cudaMemGetInfo(&vf, &vt);
+                ANS_DBG("TRT_SM100", "GAP DETECTED: %.1fms between inferences (avg=%.1fms, %.1fx) active=%d VRAM=%zuMB free",
+                        sinceLastMs, s_ema, sinceLastMs / s_ema,
+                        s_globalActiveInf.load(), vf / (1024*1024));
+            }
+            s_ema = (s_ema == 0) ? sinceLastMs : (0.9 * s_ema + 0.1 * sinceLastMs);
+        }
+        s_prevEnd = _now;
+        s_firstDone = true;
+
+        s_globalActiveInf.fetch_sub(1);
+    }
+
    return true;
 }