Use software decoder by default
This commit is contained in:
@@ -607,6 +607,7 @@ bool Engine<T>::runInferenceFromPool(
|
||||
// harmless — the second one finds a fresh slot immediately.
|
||||
InferenceSlot* slot = nullptr;
|
||||
bool kickedGrowth = false;
|
||||
auto _poolAcquireStart = std::chrono::steady_clock::now();
|
||||
|
||||
{
|
||||
std::unique_lock<std::mutex> lock(m_slotMutex);
|
||||
@@ -630,6 +631,8 @@ bool Engine<T>::runInferenceFromPool(
|
||||
}
|
||||
|
||||
if (!slot) {
|
||||
ANS_DBG("TRT_Pool", "ALL SLOTS BUSY: %zu slots, active=%d — waiting for free slot",
|
||||
n, m_activeCount.load());
|
||||
// All slots busy. In elastic mode, proactively grow the
|
||||
// pool in the background so the next request has a slot
|
||||
// on a different GPU. We only kick once per wait cycle.
|
||||
@@ -672,7 +675,17 @@ bool Engine<T>::runInferenceFromPool(
|
||||
}
|
||||
|
||||
// -- 3. Log slow slot acquisition (>100 ms); still no slot => reject ----
|
||||
{
|
||||
double _acquireMs = std::chrono::duration<double, std::milli>(
|
||||
std::chrono::steady_clock::now() - _poolAcquireStart).count();
|
||||
if (_acquireMs > 100.0) {
|
||||
ANS_DBG("TRT_Pool", "SLOW slot acquire: %.1fms slot=%p gpu=%d active=%d/%zu",
|
||||
_acquireMs, (void*)slot, slot ? slot->deviceIndex : -1,
|
||||
m_activeCount.load(), m_slots.size());
|
||||
}
|
||||
}
|
||||
if (!slot) {
|
||||
ANS_DBG("TRT_Pool", "ERROR: No slot available — all %zu slots busy, timeout", m_slots.size());
|
||||
std::string errMsg = "[Engine] runInferenceFromPool FAIL: Capacity reached -- all "
|
||||
+ std::to_string(m_activeCount.load()) + "/" + std::to_string(m_totalCapacity)
|
||||
+ " slot(s) busy"
|
||||
@@ -699,12 +712,23 @@ bool Engine<T>::runInferenceFromPool(
|
||||
if (currentDev != slot->deviceIndex) {
|
||||
cudaSetDevice(slot->deviceIndex);
|
||||
}
|
||||
ANS_DBG("TRT_Pool", "Slot dispatch: gpu=%d active=%d/%zu",
|
||||
slot->deviceIndex, m_activeCount.load(), m_slots.size());
|
||||
auto _slotStart = std::chrono::steady_clock::now();
|
||||
result = slot->engine->runInference(inputs, featureVectors);
|
||||
auto _slotEnd = std::chrono::steady_clock::now();
|
||||
double _slotMs = std::chrono::duration<double, std::milli>(_slotEnd - _slotStart).count();
|
||||
if (_slotMs > 500.0) {
|
||||
ANS_DBG("TRT_Pool", "SLOW slot inference: %.1fms gpu=%d active=%d/%zu",
|
||||
_slotMs, slot->deviceIndex, m_activeCount.load(), m_slots.size());
|
||||
}
|
||||
}
|
||||
catch (const std::exception& ex) {
|
||||
ANS_DBG("TRT_Pool", "ERROR: runInference threw: %s", ex.what());
|
||||
std::cout << "Error [Pool]: runInference threw: " << ex.what() << std::endl;
|
||||
}
|
||||
catch (...) {
|
||||
ANS_DBG("TRT_Pool", "ERROR: runInference threw unknown exception");
|
||||
std::cout << "Error [Pool]: runInference threw unknown exception" << std::endl;
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user