Use software decoder by default

2026-04-04 20:19:54 +11:00
parent 3a21026790
commit e134ebdf15
24 changed files with 693 additions and 215 deletions
--- a/engines/TensorRTAPI/include/engine/EngineMultiGpu.inl
+++ b/engines/TensorRTAPI/include/engine/EngineMultiGpu.inl
@@ -607,6 +607,7 @@ bool Engine<T>::runInferenceFromPool(
    //   harmless — the second one finds a fresh slot immediately.
    InferenceSlot* slot = nullptr;
    bool kickedGrowth = false;
+    auto _poolAcquireStart = std::chrono::steady_clock::now();

    {
        std::unique_lock<std::mutex> lock(m_slotMutex);
@@ -630,6 +631,8 @@ bool Engine<T>::runInferenceFromPool(
            }

            if (!slot) {
+                ANS_DBG("TRT_Pool", "ALL SLOTS BUSY: %zu slots, active=%d — waiting for free slot",
+                        n, m_activeCount.load());
                // All slots busy.  In elastic mode, proactively grow the
                // pool in the background so the next request has a slot
                // on a different GPU.  We only kick once per wait cycle.
@@ -672,7 +675,17 @@ bool Engine<T>::runInferenceFromPool(
    }

    // -- 3. Still no slot => reject ---------------------------------------
+    {
+        double _acquireMs = std::chrono::duration<double, std::milli>(
+            std::chrono::steady_clock::now() - _poolAcquireStart).count();
+        if (_acquireMs > 100.0) {
+            ANS_DBG("TRT_Pool", "SLOW slot acquire: %.1fms slot=%p gpu=%d active=%d/%zu",
+                    _acquireMs, (void*)slot, slot ? slot->deviceIndex : -1,
+                    m_activeCount.load(), m_slots.size());
+        }
+    }
    if (!slot) {
+        ANS_DBG("TRT_Pool", "ERROR: No slot available — all %zu slots busy, timeout", m_slots.size());
        std::string errMsg = "[Engine] runInferenceFromPool FAIL: Capacity reached -- all "
            + std::to_string(m_activeCount.load()) + "/" + std::to_string(m_totalCapacity)
            + " slot(s) busy"
@@ -699,12 +712,23 @@ bool Engine<T>::runInferenceFromPool(
        if (currentDev != slot->deviceIndex) {
            cudaSetDevice(slot->deviceIndex);
        }
+        ANS_DBG("TRT_Pool", "Slot dispatch: gpu=%d active=%d/%zu",
+                slot->deviceIndex, m_activeCount.load(), m_slots.size());
+        auto _slotStart = std::chrono::steady_clock::now();
        result = slot->engine->runInference(inputs, featureVectors);
+        auto _slotEnd = std::chrono::steady_clock::now();
+        double _slotMs = std::chrono::duration<double, std::milli>(_slotEnd - _slotStart).count();
+        if (_slotMs > 500.0) {
+            ANS_DBG("TRT_Pool", "SLOW slot inference: %.1fms gpu=%d active=%d/%zu",
+                    _slotMs, slot->deviceIndex, m_activeCount.load(), m_slots.size());
+        }
    }
    catch (const std::exception& ex) {
+        ANS_DBG("TRT_Pool", "ERROR: runInference threw: %s", ex.what());
        std::cout << "Error [Pool]: runInference threw: " << ex.what() << std::endl;
    }
    catch (...) {
+        ANS_DBG("TRT_Pool", "ERROR: runInference threw unknown exception");
        std::cout << "Error [Pool]: runInference threw unknown exception" << std::endl;
    }