Use CPU-side resize before uploading frames to the GPU to reduce the PCIe transfer bottleneck

This commit is contained in:
2026-04-04 22:29:08 +11:00
parent e134ebdf15
commit 98681f4da6
15 changed files with 572 additions and 493 deletions

View File

@@ -284,7 +284,13 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
// fatal "illegal memory access" that permanently corrupts the CUDA context.
//
// Pool-mode slots have their own busy-flag dispatch so they do NOT need this.
auto _mutexWaitStart = std::chrono::steady_clock::now();
std::lock_guard<std::mutex> inferenceLock(m_inferenceMutex);
auto _mutexAcquired = std::chrono::steady_clock::now();
double _mutexWaitMs = std::chrono::duration<double, std::milli>(_mutexAcquired - _mutexWaitStart).count();
if (_mutexWaitMs > 50.0) {
ANS_DBG("TRT_Engine", "MUTEX WAIT: %.1fms (queued behind another inference)", _mutexWaitMs);
}
// ============================================================================
// THREAD-SAFE GPU CONTEXT
@@ -955,6 +961,20 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
}
}
// ============================================================================
// Per-inference total timing breakdown (mutex wait + preprocess + GPU)
// ============================================================================
{
double totalMs = std::chrono::duration<double, std::milli>(
std::chrono::steady_clock::now() - _mutexWaitStart).count();
double gpuMs = totalMs - _mutexWaitMs; // Everything after mutex acquired
// Log every inference that takes >100ms total (including mutex wait)
if (totalMs > 100.0) {
ANS_DBG("TRT_Timing", "total=%.1fms (mutex=%.1fms gpu=%.1fms) batch=%d active=%d",
totalMs, _mutexWaitMs, gpuMs, batchSize, s_globalActiveInf.load());
}
}
// ============================================================================
// SM=100% DETECTOR — end-of-inference timing
// ============================================================================