// ============================================================================
// EngineMultiGpu.inl
//
// Multi-GPU inference pool -- merged from MultiGpuEngineManager.h
//
// This file is #included at the bottom of engine.h and must not be compiled
// independently. It provides implementations for the pool-management methods
// declared inside Engine<T>:
//
//   initializePool()           -- build from ONNX, create pool
//   initializePoolFromEngine() -- load pre-built TRT engine, create pool
//   enumerateDevices()         -- static CUDA device enumeration
//   loadSlots()                -- core pool allocation logic (private)
//   tryGrowPool()              -- on-demand slot creation (private)
//   growPool()                 -- public demand-driven growth
//   runInferenceFromPool()     -- thread-safe slot dispatch (private)
//   releaseIdleSlots()         -- VRAM reclamation for elastic pools
//   startIdleTimer()           -- automatic idle-slot cleanup (with stopIdleTimer())
//   getTotalCapacity()         -- inline in engine.h
//   getActiveInferences()      -- inline in engine.h
//   getAvailableSlots()        -- inline in engine.h
//   isAtCapacity()             -- inline in engine.h
//   printCapacityReport()      -- human-readable pool status
// ============================================================================
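//
// Illustrative pool setup (a sketch only; the options values, model path, and
// memSafetyFactor below are hypothetical examples, not project defaults):
//
//   Engine<float> engine;
//   ANSCENTER::Options opts;                 // precision, batch sizes, ...
//   std::array<float, 3> sub{0.f, 0.f, 0.f};
//   std::array<float, 3> div{1.f, 1.f, 1.f};
//   if (engine.initializePool(opts, "detector.onnx", sub, div,
//                             /*normalize=*/true,
//                             /*maxSlotsPerGpu=*/1,
//                             /*memSafetyFactor=*/0.8)) {
//       engine.printCapacityReport();
//   }
//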
// -- Static member definitions for global elastic slot cap --------------------
template <typename T>
std::atomic<int> Engine<T>::s_globalElasticCount{0};
template <typename T>
std::atomic<int> Engine<T>::s_globalElasticMax{32}; // safe default, overwritten on first pool init
template <typename T>
std::once_flag Engine<T>::s_globalCapInitFlag;
template <typename T>
std::atomic<int64_t> Engine<T>::s_lastPoolCreatedMs{0};
// ----------------------------------------------------------------------------
// enumerateDevices -- static, no model required
// ----------------------------------------------------------------------------
template <typename T>
/*static*/ std::vector<GpuDeviceInfo>
Engine<T>::enumerateDevices()
{
    int count = 0;
    cudaGetDeviceCount(&count);

    std::vector<GpuDeviceInfo> devices;
    devices.reserve(static_cast<size_t>(count));

    for (int i = 0; i < count; ++i) {
        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, i);
        cudaSetDevice(i);

        size_t freeBytes = 0, totalBytes = 0;
        cudaMemGetInfo(&freeBytes, &totalBytes);

        GpuDeviceInfo info;
        info.index                 = i;
        info.name                  = prop.name;
        info.totalMemoryBytes      = prop.totalGlobalMem;
        info.freeMemoryAtInitBytes = freeBytes;
        info.computeMajor          = prop.major;
        info.computeMinor          = prop.minor;
        info.slotsAllocated        = 0;
        info.memoryPerSlotBytes    = 0;
        devices.push_back(std::move(info));
    }
    return devices;
}
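
// Illustrative use (sketch): inspect devices before choosing a pool mode.
// GpuDeviceInfo fields are those populated above.
//
//   for (const auto& d : Engine<float>::enumerateDevices()) {
//       std::cout << "GPU[" << d.index << "] " << d.name << " free "
//                 << d.freeMemoryAtInitBytes / 1048576 << " MiB\n";
//   }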
// ----------------------------------------------------------------------------
// Public pool-init wrappers
// ----------------------------------------------------------------------------
template <typename T>
bool Engine<T>::initializePool(
    const ANSCENTER::Options& baseOptions,
    const std::string& onnxModelPath,
    const std::array<float, 3>& subVals,
    const std::array<float, 3>& divVals,
    bool normalize,
    int maxSlotsPerGpu,
    double memSafetyFactor)
{
    // Apply baseOptions to *this* so that m_options is consistent whether
    // the user goes through initializePool() or the 6-param buildLoadNetwork().
    m_options = baseOptions;
    return buildLoadNetwork(onnxModelPath, subVals, divVals, normalize,
                            maxSlotsPerGpu, memSafetyFactor);
}

template <typename T>
bool Engine<T>::initializePoolFromEngine(
    const ANSCENTER::Options& baseOptions,
    const std::string& trtEnginePath,
    const std::array<float, 3>& subVals,
    const std::array<float, 3>& divVals,
    bool normalize,
    int maxSlotsPerGpu,
    double memSafetyFactor)
{
    m_options = baseOptions;
    return loadNetwork(trtEnginePath, subVals, divVals, normalize,
                       maxSlotsPerGpu, memSafetyFactor);
}
// ----------------------------------------------------------------------------
// loadSlots -- core pool allocation logic
//
// Three modes, selected by maxSlotsPerGpu:
//
//    1 => ROUND-ROBIN (default)
//         1 slot per GPU, created at init. Tasks queue when all slots are
//         busy. Best balance of VRAM usage and multi-GPU utilisation.
//         Example: 3 GPUs → 3 slots, round-robin dispatch.
//
//   -1 => ELASTIC MODE
//         Only the probe slot is pre-loaded. Additional slots are created
//         on-demand by tryGrowPool() when concurrent requests arrive, and
//         released by releaseIdleSlots() when idle. Higher throughput but
//         higher VRAM usage -- only recommended for large GPUs (≥ 8 GB).
//
//   >1 => PRE-ALLOCATED MODE (explicit cap)
//         Slots are created upfront, capped at maxSlotsPerGpu per GPU.
//         Useful when the caller knows the required concurrency level.
//
// (Illustrative calls for each mode follow this comment block.)
// ----------------------------------------------------------------------------
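// Illustrative mode selection (sketch; `opts`, `path`, `sub`, `div`, and the
// 0.8 safety factor are hypothetical caller-side values):
//
//   engine.initializePool(opts, path, sub, div, true, /*maxSlotsPerGpu=*/ 1, 0.8); // round-robin
//   engine.initializePool(opts, path, sub, div, true, /*maxSlotsPerGpu=*/-1, 0.8); // elastic
//   engine.initializePool(opts, path, sub, div, true, /*maxSlotsPerGpu=*/ 4, 0.8); // pre-allocated, <=4/GPU
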
template <typename T>
bool Engine<T>::loadSlots(
    const ANSCENTER::Options& baseOptions,
    const std::string& modelPath,
    const std::array<float, 3>& subVals,
    const std::array<float, 3>& divVals,
    bool normalize,
    bool fromOnnx,
    int maxSlotsPerGpu,
    double memSafetyFactor)
{
    // -- 1. Enumerate GPUs ---------------------------------------------------
    m_deviceInfos = enumerateDevices();
    if (m_deviceInfos.empty()) {
        std::cout << "Error [Pool]: No CUDA-capable GPUs detected" << std::endl;
        return false;
    }
    const bool elastic = (maxSlotsPerGpu <= 0);
    m_elasticMode = elastic;

    // Set the global elastic slot cap ONCE, based on total GPU VRAM.
    // Budget: ~4 slots per GiB (integer-truncated, so a nominal 4 GB card
    // counts as 3 GiB). The cap is shared across ALL pools to prevent a
    // CUDA driver SRW lock convoy (30+ threads deadlocked).
    // 4 GB → 12, 6 GB → 24, 8 GB → 32, 12 GB → 48, 24 GB → 96
    if (elastic) {
        std::call_once(s_globalCapInitFlag, [this]() {
            int totalGB = 0;
            for (const auto& dev : m_deviceInfos)
                totalGB += static_cast<int>(dev.totalMemoryBytes / (1024ULL * 1024ULL * 1024ULL));
            int cap = std::max(8, totalGB * 4); // minimum 8
            s_globalElasticMax.store(cap);
            std::cout << "Info [Pool]: Global elastic slot cap = "
                      << cap << " (total " << totalGB << " GB VRAM x4)" << std::endl;
        });
    }

    std::cout << "\n====================================================" << std::endl;
    std::cout << "Engine Pool Initialization"
              << (elastic ? " [ELASTIC]" : " [PRE-ALLOCATED]") << std::endl;
    std::cout << "====================================================" << std::endl;
    std::cout << "Found " << m_deviceInfos.size() << " GPU(s):" << std::endl;
    for (const auto& d : m_deviceInfos) {
        std::cout << " GPU[" << d.index << "] " << d.name
                  << " | SM " << d.computeMajor << "." << d.computeMinor
                  << " | Total " << d.totalMemoryBytes / 1048576 << " MiB"
                  << " | Free " << d.freeMemoryAtInitBytes / 1048576 << " MiB"
                  << std::endl;
    }

    // Warn about heterogeneous GPUs -- the TRT engine may not be compatible.
    for (size_t i = 1; i < m_deviceInfos.size(); ++i) {
        if (m_deviceInfos[i].name != m_deviceInfos[0].name) {
            std::cout << "Warning [Pool]: GPU[" << i << "] '" << m_deviceInfos[i].name
                      << "' differs from GPU[0] '" << m_deviceInfos[0].name
                      << "'. TRT engine binary may be incompatible with dissimilar GPUs."
                      << std::endl;
        }
    }
    // -- 2. Probe engine: measure per-slot VRAM footprint ----------------------
    //
    // Memory delta = freeBeforeLoad - freeAfterLoad
    // Includes: TRT runtime buffers, CUDA context overhead, I/O buffers,
    // stream memory, and workspace allocated by Engine<T>.
    //
    // MULTI-GPU BALANCING: place the probe on the GPU with the most free
    // VRAM. This naturally distributes engines across GPUs as each pool
    // init consumes VRAM from its chosen GPU, making the *other* GPU
    // the best candidate for the next pool.
    int probeGpuIdx = 0;
    {
        size_t bestFree = 0;
        for (const auto& d : m_deviceInfos) {
            cudaSetDevice(d.index);
            size_t freeNow = 0, totalNow = 0;
            cudaMemGetInfo(&freeNow, &totalNow);
            std::cout << " GPU[" << d.index << "] free VRAM: " << freeNow / 1048576 << " MiB" << std::endl;
            if (freeNow > bestFree) {
                bestFree = freeNow;
                probeGpuIdx = d.index;
            }
        }
    }
    std::cout << "\nLoading probe engine on GPU[" << probeGpuIdx
              << "] (most free VRAM) to measure per-slot memory..." << std::endl;

    cudaSetDevice(probeGpuIdx);
    size_t freeBefore = 0, tmp = 0;
    cudaMemGetInfo(&freeBefore, &tmp);

    ANSCENTER::Options opts0 = baseOptions;
    opts0.deviceIndex = probeGpuIdx;
    auto probeEngine = std::make_unique<Engine<T>>(opts0);
    const bool probeOk = fromOnnx
        ? probeEngine->buildLoadNetwork(modelPath, subVals, divVals, normalize)
        : probeEngine->loadNetwork(modelPath, subVals, divVals, normalize);
    if (!probeOk) {
        return false;
    }

    size_t freeAfter = 0;
    cudaMemGetInfo(&freeAfter, &tmp);

    // Floor the delta at 64 MiB to guard against measurement noise.
    constexpr size_t kMinSlotMemBytes = 64ULL * 1024 * 1024;
    const size_t rawDelta   = (freeBefore > freeAfter) ? (freeBefore - freeAfter) : 0ULL;
    const size_t memPerSlot = std::max(rawDelta, kMinSlotMemBytes);
    std::cout << "Info [Pool]: Memory per slot = " << memPerSlot / 1048576
              << " MiB (measured delta = " << rawDelta / 1048576 << " MiB)" << std::endl;

    // Cache input/output tensor dims on *this* Engine so getInputDims() /
    // getOutputDims() work correctly when the pool is the active code path.
    m_inputDims  = probeEngine->getInputDims();
    m_outputDims = probeEngine->getOutputDims();

    // Sync GPU-capped batch sizes from the probe engine. The build() function
    // may have reduced maxBatchSize based on GPU VRAM tier; propagate that to
    // the pool manager so callers see the actual runtime limits.
    m_options.maxBatchSize = probeEngine->getOptions().maxBatchSize;
    m_options.optBatchSize = probeEngine->getOptions().optBatchSize;

    // Store the per-slot measurement for on-demand growth.
    m_memPerSlot = memPerSlot;

    // Promote the probe engine into the first slot on the chosen GPU.
    {
        InferenceSlot s;
        s.deviceIndex = probeGpuIdx;
        s.busy        = false;
        s.memUsed     = memPerSlot;
        s.engine      = std::move(probeEngine);
        m_slots.push_back(std::move(s));
    }
    m_deviceInfos[probeGpuIdx].slotsAllocated     = 1;
    m_deviceInfos[probeGpuIdx].memoryPerSlotBytes = memPerSlot;
    // -- 3. Store config for on-demand growth (elastic mode) -------------------
    m_poolModelPath    = modelPath;
    m_poolSubVals      = subVals;
    m_poolDivVals      = divVals;
    m_poolNormalize    = normalize;
    m_poolFromOnnx     = fromOnnx;
    m_poolSafetyFactor = memSafetyFactor;

    if (elastic) {
        // -- ELASTIC: only the probe slot is pre-loaded -------------------------
        std::cout << "Info [Pool]: Elastic mode -- starting with 1 probe slot."
                  << " Additional slots will be created on-demand as concurrent"
                  << " requests arrive and released when idle." << std::endl;
        m_totalCapacity = 1;

        // Mark the creation time -- elastic growth is deferred for
        // s_elasticGraceSec to let other models create their probe engines first.
        {
            using namespace std::chrono;
            auto now = duration_cast<milliseconds>(
                steady_clock::now().time_since_epoch()).count();
            s_lastPoolCreatedMs.store(now);
        }
        printCapacityReport();
        startIdleTimer(); // auto-cleanup of idle slots, periodically
        return true;
    }
    // -- 4. PRE-ALLOCATED: compute per-GPU capacity, then interleave -----------
    //
    // Phase A: determine how many slots each GPU can hold.
    // Phase B: create slots in round-robin order across GPUs so that the
    //          linear m_nextSlotHint scan naturally distributes consecutive
    //          requests across GPUs:
    //            m_slots = [GPU0-s0, GPU1-s0, GPU2-s0, GPU0-s1, GPU1-s1, ...]
    //          This gives: Task1→GPU0, Task2→GPU1, Task3→GPU2, Task4→GPU0 ...
    const int numGpus = static_cast<int>(m_deviceInfos.size());

    // Phase A: compute slotsToAdd per GPU.
    std::vector<int> slotsPerGpu(numGpus, 0);
    int maxSlotsAny = 0;
    for (int di = 0; di < numGpus; ++di) {
        cudaSetDevice(di);
        size_t freeNow = 0, totalNow = 0;
        cudaMemGetInfo(&freeNow, &totalNow);
        const size_t usableBytes = static_cast<size_t>(
            static_cast<double>(freeNow) * memSafetyFactor);
        int slotsToAdd = (memPerSlot > 0)
            ? static_cast<int>(usableBytes / memPerSlot) : 0;

        // Apply the explicit per-GPU cap; the probe GPU already has the probe slot.
        if (maxSlotsPerGpu > 0) {
            const int budget = (di == probeGpuIdx)
                ? (maxSlotsPerGpu - 1)
                : maxSlotsPerGpu;
            slotsToAdd = std::min(slotsToAdd, budget);
        }
        slotsPerGpu[di] = slotsToAdd;
        if (slotsToAdd > maxSlotsAny) maxSlotsAny = slotsToAdd;
        m_deviceInfos[di].memoryPerSlotBytes = memPerSlot;

        std::cout << "Info [Pool]: GPU[" << di << "] " << m_deviceInfos[di].name
                  << " -- free " << freeNow / 1048576 << " MiB"
                  << ", usable " << usableBytes / 1048576 << " MiB"
                  << " => will add " << slotsToAdd << " slot(s)" << std::endl;
    }

    // Phase B: create slots interleaved across GPUs.
    //   Round 0: GPU0-slot0, GPU1-slot0, GPU2-slot0
    //   Round 1: GPU0-slot1, GPU1-slot1, GPU2-slot1
    //   ...
    std::vector<int>  slotsCreated(numGpus, 0);  // track actual successes per GPU
    std::vector<bool> gpuFailed(numGpus, false); // stop trying failed GPUs
    for (int round = 0; round < maxSlotsAny; ++round) {
        for (int di = 0; di < numGpus; ++di) {
            if (gpuFailed[di]) continue;
            if (slotsCreated[di] >= slotsPerGpu[di]) continue;

            cudaSetDevice(di);
            ANSCENTER::Options opts = baseOptions;
            opts.deviceIndex = di;
            auto eng = std::make_unique<Engine<T>>(opts);
            eng->setVerbose(false);
            eng->setDisableGraphs(true);                // concurrent graph captures corrupt the CUDA context
            eng->m_skipEngineCache = m_skipEngineCache; // propagate to pool slots
            const bool ok = fromOnnx
                ? eng->buildLoadNetwork(modelPath, subVals, divVals, normalize)
                : eng->loadNetwork(modelPath, subVals, divVals, normalize);
            if (!ok) {
                std::cout << "Warning [Pool]: GPU[" << di << "] slot "
                          << (slotsCreated[di] + 1) << "/" << slotsPerGpu[di]
                          << " failed to load; halting allocation on this device." << std::endl;
                gpuFailed[di] = true;
                continue;
            }
            InferenceSlot slot;
            slot.deviceIndex = di;
            slot.busy        = false;
            slot.memUsed     = memPerSlot;
            slot.engine      = std::move(eng);
            m_slots.push_back(std::move(slot));
            m_deviceInfos[di].slotsAllocated++;
            slotsCreated[di]++;
        }
    }

    m_totalCapacity = static_cast<int>(m_slots.size());
    printCapacityReport();
    if (m_totalCapacity == 0) {
        std::cout << "Error [Pool]: Zero inference slots allocated -- "
                     "check available GPU memory." << std::endl;
        return false;
    }
    return true;
}
// ----------------------------------------------------------------------------
// tryGrowPool -- on-demand slot creation (elastic mode)
//
// Called by runInferenceFromPool when every alive slot is busy.
// Creates ONE new engine, trying candidate GPUs in descending order of free
// VRAM, so growth naturally balances load across GPUs instead of always
// filling GPU 0 first.
//
// Returns a pointer to the new slot (already marked busy) or nullptr if
// no GPU has enough VRAM.
//
// Thread-safety: m_growMutex serialises growth so only one thread creates
// a slot at a time. m_slotMutex is acquired briefly to push the new slot
// into the deque. The calling thread waits (engine deserialisation takes
// ~0.5-3 s), but that is far better than rejecting the request entirely.
// ----------------------------------------------------------------------------
template <typename T>
typename Engine<T>::InferenceSlot*
Engine<T>::tryGrowPool(bool bypassGrace)
{
    std::lock_guard<std::mutex> growLock(m_growMutex);

    // Grace period: defer elastic growth for s_elasticGraceSec after the most
    // recent pool creation. This reserves VRAM for probe engines that haven't
    // been created yet (e.g., 10 models loading sequentially -- early pools
    // shouldn't grow elastic slots while later probes still need VRAM).
    // Bypassed for demand-driven growth (a new consumer explicitly joined the
    // pool, so we KNOW more slots are needed).
    if (!bypassGrace) {
        using namespace std::chrono;
        auto now = duration_cast<milliseconds>(
            steady_clock::now().time_since_epoch()).count();
        int64_t lastCreated = s_lastPoolCreatedMs.load();
        int64_t elapsedSec  = (now - lastCreated) / 1000;
        if (lastCreated > 0 && elapsedSec < s_elasticGraceSec) {
            // Silently skip -- don't spam logs during the grace period.
            return nullptr;
        }
    }

    // Global cap: prevent too many concurrent CUDA operations across ALL pools.
    // With shared engine pools, unlimited elastic growth causes a CUDA driver
    // SRW lock convoy (30+ threads all blocked on nvcuda64 internal locks).
    const int currentGlobal = s_globalElasticCount.load();
    const int maxGlobal     = s_globalElasticMax.load();
    if (currentGlobal >= maxGlobal) {
        std::cout << "Info [Pool]: tryGrowPool -- global cap reached ("
                  << currentGlobal << "/" << maxGlobal
                  << " total slots), not growing" << std::endl;
        return nullptr;
    }

    // Find the GPU with the most free VRAM that has enough for one more slot.
    // This naturally balances load across GPUs instead of always filling GPU 0.
    const size_t requiredBytes = (m_poolSafetyFactor > 0.0)
        ? static_cast<size_t>(static_cast<double>(m_memPerSlot) / m_poolSafetyFactor)
        : m_memPerSlot;
    std::cout << "Info [Pool]: tryGrowPool called -- need " << (requiredBytes >> 20)
              << " MiB per slot, scanning " << m_deviceInfos.size() << " GPU(s)..."
              << std::endl;

    std::vector<std::pair<size_t, int>> gpuByFreeVram; // {freeBytes, deviceIndex}
    for (const auto& dev : m_deviceInfos) {
        cudaSetDevice(dev.index);
        size_t freeNow = 0, totalNow = 0;
        cudaMemGetInfo(&freeNow, &totalNow);
        std::cout << "Info [Pool]: GPU[" << dev.index << "] free=" << (freeNow >> 20)
                  << " MiB, required=" << (requiredBytes >> 20) << " MiB"
                  << (freeNow >= requiredBytes ? " -> CANDIDATE" : " -> SKIP (not enough)")
                  << std::endl;
        if (freeNow >= requiredBytes) {
            gpuByFreeVram.push_back({freeNow, dev.index});
        }
    }
    // Sort device candidates by free VRAM descending (most free first).
    std::sort(gpuByFreeVram.begin(), gpuByFreeVram.end(),
              [](const auto& a, const auto& b) { return a.first > b.first; });
    if (gpuByFreeVram.empty()) {
        std::cout << "Warning [Pool]: tryGrowPool -- no GPU has enough free VRAM ("
                  << (requiredBytes >> 20) << " MiB), cannot grow" << std::endl;
        return nullptr;
    }
    for (const auto& [freeVram, devIdx] : gpuByFreeVram) {
        auto& dev = m_deviceInfos[devIdx];
        std::cout << "Info [Pool]: Creating on-demand slot on GPU[" << dev.index
                  << "] (free=" << (freeVram >> 20) << " MiB)..." << std::endl;

        // Create a new engine on this candidate GPU.
        cudaSetDevice(dev.index);
        ANSCENTER::Options opts = m_options;
        opts.deviceIndex = dev.index;
        auto eng = std::make_unique<Engine<T>>(opts);
        eng->setVerbose(false);
        eng->setDisableGraphs(true);                // concurrent graph captures corrupt the CUDA context
        eng->m_skipEngineCache = m_skipEngineCache; // propagate to on-demand slots
        eng->m_skipOnnxRebuild = true;              // elastic growth must NOT delete/rebuild engine files
        eng->m_skipOnnxBuild   = bypassGrace;       // demand-driven growth: skip ONNX→TRT if no cached engine
        const bool ok = m_poolFromOnnx
            ? eng->buildLoadNetwork(m_poolModelPath, m_poolSubVals,
                                    m_poolDivVals, m_poolNormalize)
            : eng->loadNetwork(m_poolModelPath, m_poolSubVals,
                               m_poolDivVals, m_poolNormalize);
        if (!ok) {
            std::cout << "Warning [Pool]: On-demand slot creation FAILED on GPU["
                      << dev.index << "]" << std::endl;
            continue; // try the next GPU
        }
        std::cout << "Info [Pool]: On-demand slot engine loaded OK on GPU["
                  << dev.index << "]" << std::endl;

        {
            std::lock_guard<std::mutex> slotLock(m_slotMutex);

            // Reuse a dead slot entry (engine == nullptr) if one exists.
            for (auto& s : m_slots) {
                if (!s.engine) { // dead entry -- recycle it
                    s.deviceIndex  = dev.index;
                    s.busy         = true;
                    s.memUsed      = m_memPerSlot;
                    s.engine       = std::move(eng);
                    s.lastUsedTime = std::chrono::steady_clock::now();
                    dev.slotsAllocated++;

                    // Recount alive slots.
                    int alive = 0;
                    for (const auto& x : m_slots) { if (x.engine) ++alive; }
                    m_totalCapacity = alive;
                    s_globalElasticCount++;

                    std::cout << "Info [Pool]: On-demand slot recycled on GPU["
                              << dev.index << "] -- pool now " << m_totalCapacity
                              << " slot(s) (global " << s_globalElasticCount.load()
                              << "/" << s_globalElasticMax.load() << ")" << std::endl;
                    return &s;
                }
            }

            // No dead entries to recycle -- push a new one.
            // std::deque::push_back does NOT invalidate references to existing
            // elements, so pointers held by other threads remain valid.
            InferenceSlot newSlot;
            newSlot.deviceIndex  = dev.index;
            newSlot.busy         = true;
            newSlot.memUsed      = m_memPerSlot;
            newSlot.engine       = std::move(eng);
            newSlot.lastUsedTime = std::chrono::steady_clock::now();
            m_slots.push_back(std::move(newSlot));
            dev.slotsAllocated++;
            m_totalCapacity = static_cast<int>(m_slots.size()); // all alive here
            s_globalElasticCount++;

            std::cout << "Info [Pool]: On-demand slot created on GPU["
                      << dev.index << "] -- pool now " << m_totalCapacity
                      << " slot(s) (global " << s_globalElasticCount.load()
                      << "/" << s_globalElasticMax.load() << ")" << std::endl;
            return &m_slots.back();
        }
    }
    return nullptr; // every GPU is full
}
// ----------------------------------------------------------------------------
// growPool -- public demand-driven growth (bypasses grace period)
// ----------------------------------------------------------------------------
template <typename T>
int Engine<T>::growPool(int count)
{
    int created = 0;
    for (int i = 0; i < count; ++i) {
        auto* slot = tryGrowPool(/*bypassGrace=*/true);
        if (!slot) break;
        // Release the slot so inference threads can claim it.
        {
            std::lock_guard<std::mutex> lk(m_slotMutex);
            slot->busy = false;
            slot->lastUsedTime = std::chrono::steady_clock::now();
        }
        m_slotFreeCv.notify_one();
        ++created;
    }
    return created;
}
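
// Illustrative use (sketch): a caller expecting a burst of concurrent
// requests can warm the pool up-front instead of paying the grow latency on
// the first requests. The count of 2 is an arbitrary example.
//
//   int added = engine->growPool(2); // returns how many slots were actually created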
// ----------------------------------------------------------------------------
// runInferenceFromPool -- thread-safe slot dispatch
// ----------------------------------------------------------------------------
template <typename T>
bool Engine<T>::runInferenceFromPool(
    const std::vector<std::vector<cv::cuda::GpuMat>>& inputs,
    std::vector<std::vector<std::vector<T>>>& featureVectors)
{
    // -- 1. Acquire an idle, alive slot (round-robin) ---------------------------
    //
    // The round-robin starting point avoids always favouring GPU 0. Each call
    // advances m_nextSlotHint so consecutive requests spread across GPUs.
    // The mutex is held only for the O(N) scan + flag flip -- NOT during GPU
    // execution -- so threads using different slots proceed in parallel.
    //
    // PROACTIVE GROWTH (elastic mode):
    // If all alive slots are busy when a request arrives, the pool is
    // undersized for the current concurrency level. We kick off pool
    // growth (tryGrowPool) in a detached background thread while we
    // wait for a current slot to free. This ensures multi-GPU
    // utilisation: the new slot lands on the GPU with the most free
    // VRAM (typically GPU[1]) and is ready for the *next* request.
    // Growth is serialised by m_growMutex, so duplicate kicks are
    // harmless -- the second one finds a fresh slot immediately.
    InferenceSlot* slot = nullptr;
    bool kickedGrowth = false;
    auto _poolAcquireStart = std::chrono::steady_clock::now();
    {
        std::unique_lock<std::mutex> lock(m_slotMutex);
        const auto deadline = std::chrono::steady_clock::now()
                            + std::chrono::milliseconds(2000);
        while (!slot) {
            const size_t n = m_slots.size();
            if (n > 0) {
                const size_t start = m_nextSlotHint.load() % n;
                for (size_t i = 0; i < n; ++i) {
                    auto& s = m_slots[(start + i) % n];
                    if (!s.busy && s.engine) { // alive and idle
                        s.busy = true;
                        slot = &s;
                        m_nextSlotHint = (start + i + 1) % n;
                        break;
                    }
                }
            }
            if (!slot) {
                ANS_DBG("TRT_Pool", "ALL SLOTS BUSY: %zu slots, active=%d -- waiting for free slot",
                        n, m_activeCount.load());
                // All slots busy. In elastic mode, proactively grow the
                // pool in the background so the next request has a slot
                // on a different GPU. We only kick once per wait cycle.
                if (m_elasticMode && !kickedGrowth
                    && s_globalElasticCount.load() < s_globalElasticMax.load()) {
                    kickedGrowth = true;
                    std::cout << "Info [Pool]: All slots busy -- kicking background growth thread"
                              << std::endl;
                    // Fire-and-forget: tryGrowPool is serialised by
                    // m_growMutex, so concurrent kicks are safe.
                    std::thread([this]() {
                        std::cout << "Info [Pool]: Background growth thread started" << std::endl;
                        auto* newSlot = this->tryGrowPool();
                        if (newSlot) {
                            // The slot was created pre-marked busy; release it
                            // so the next requester can claim it.
                            {
                                std::lock_guard<std::mutex> lk(m_slotMutex);
                                newSlot->busy = false;
                                newSlot->lastUsedTime = std::chrono::steady_clock::now();
                            }
                            m_slotFreeCv.notify_all();
                            std::cout << "Info [Pool]: Background growth SUCCEEDED -- new slot on GPU["
                                      << newSlot->deviceIndex << "], pool now "
                                      << m_totalCapacity << " slot(s)" << std::endl;
                        } else {
                            std::cout << "Warning [Pool]: Background growth FAILED -- no slot created"
                                      << std::endl;
                        }
                    }).detach();
                }
                // Wait for a running slot to finish and signal us.
                if (m_slotFreeCv.wait_until(lock, deadline)
                    == std::cv_status::timeout) {
                    break; // fall through to reject
                }
            }
        }
    }
    // -- 2. Still no slot => reject ----------------------------------------------
    {
        double _acquireMs = std::chrono::duration<double, std::milli>(
            std::chrono::steady_clock::now() - _poolAcquireStart).count();
        if (_acquireMs > 100.0) {
            ANS_DBG("TRT_Pool", "SLOW slot acquire: %.1fms slot=%p gpu=%d active=%d/%zu",
                    _acquireMs, (void*)slot, slot ? slot->deviceIndex : -1,
                    m_activeCount.load(), m_slots.size());
        }
    }
    if (!slot) {
        ANS_DBG("TRT_Pool", "ERROR: No slot available -- all %zu slots busy, timeout", m_slots.size());
        return false;
    }
    ++m_activeCount;

    // -- 3. Run inference; guarantee busy-flag and activeCount are restored ------
    // If runInference() throws (cv::Exception, std::bad_alloc, ...) the slot
    // must be released and the counter decremented -- otherwise the slot is
    // permanently lost and capacity shrinks with every exception.
    bool result = false;
    try {
        // Match the calling thread's CUDA context to the slot's device.
        // Skip the call if the thread is already on the correct device
        // (cudaSetDevice under WDDM can cost 1-5 ms per call).
        int currentDev = -1;
        cudaGetDevice(&currentDev);
        if (currentDev != slot->deviceIndex) {
            cudaSetDevice(slot->deviceIndex);
        }
        ANS_DBG("TRT_Pool", "Slot dispatch: gpu=%d active=%d/%zu",
                slot->deviceIndex, m_activeCount.load(), m_slots.size());
        auto _slotStart = std::chrono::steady_clock::now();

        result = slot->engine->runInference(inputs, featureVectors);

        auto _slotEnd = std::chrono::steady_clock::now();
        double _slotMs = std::chrono::duration<double, std::milli>(_slotEnd - _slotStart).count();
        if (_slotMs > 500.0) {
            ANS_DBG("TRT_Pool", "SLOW slot inference: %.1fms gpu=%d active=%d/%zu",
                    _slotMs, slot->deviceIndex, m_activeCount.load(), m_slots.size());
        }
    }
    catch (const std::exception& ex) {
        ANS_DBG("TRT_Pool", "ERROR: runInference threw: %s", ex.what());
        std::cout << "Error [Pool]: runInference threw: " << ex.what() << std::endl;
    }
    catch (...) {
        ANS_DBG("TRT_Pool", "ERROR: runInference threw unknown exception");
        std::cout << "Error [Pool]: runInference threw unknown exception" << std::endl;
    }

    {
        std::lock_guard<std::mutex> lock(m_slotMutex);
        slot->busy = false;
        slot->lastUsedTime = std::chrono::steady_clock::now();
    }
    --m_activeCount;
    m_slotFreeCv.notify_one(); // wake one thread waiting for a free slot
    return result;
}
// ----------------------------------------------------------------------------
// releaseIdleSlots -- VRAM reclamation for elastic pools
//
// Destroys engine instances that have been idle for at least `idleSeconds`.
// The first slot (probe, index 0) is never released so the model remains
// instantly usable without re-measurement.
//
// Dead slots are NOT erased from the deque (to avoid invalidating pointers);
// their engine is reset to nullptr and they are recycled by tryGrowPool().
//
// Call from a periodic background timer, e.g. every 10-30 seconds:
// engine->releaseIdleSlots(30.0);
// ----------------------------------------------------------------------------
template <typename T>
int Engine<T>::releaseIdleSlots(double idleSeconds)
{
    std::lock_guard<std::mutex> growLock(m_growMutex);
    std::lock_guard<std::mutex> slotLock(m_slotMutex);
    const auto now = std::chrono::steady_clock::now();
    int released = 0;

    // Skip index 0 -- that's the probe slot, always kept alive.
    for (size_t i = 1; i < m_slots.size(); ++i) {
        auto& s = m_slots[i];
        if (!s.busy && s.engine) { // alive and idle
            const double idle = std::chrono::duration<double>(
                now - s.lastUsedTime).count();
            if (idle >= idleSeconds) {
                // Update device info.
                for (auto& dev : m_deviceInfos) {
                    if (dev.index == s.deviceIndex) {
                        if (dev.slotsAllocated > 0) dev.slotsAllocated--;
                        break;
                    }
                }
                std::cout << "Info [Pool]: Releasing idle slot on GPU["
                          << s.deviceIndex << "] (idle "
                          << static_cast<int>(idle) << "s)" << std::endl;
                // Destroy the engine -- frees GPU memory. The InferenceSlot
                // entry stays in the deque (dead) for reuse.
                s.engine.reset();
                s.memUsed = 0;
                released++;
                s_globalElasticCount--;
            }
        }
    }

    // Recount alive slots.
    int alive = 0;
    for (const auto& s : m_slots) { if (s.engine) ++alive; }
    m_totalCapacity = alive;

    if (released > 0) {
        std::cout << "Info [Pool]: Released " << released << " idle slot(s)"
                  << " -- pool now " << m_totalCapacity << " alive slot(s)"
                  << std::endl;
    }
    return released;
}
// ----------------------------------------------------------------------------
// printCapacityReport
// ----------------------------------------------------------------------------
template <typename T>
void Engine<T>::printCapacityReport() const
{
    // Count alive vs dead -- the lock protects against concurrent tryGrowPool.
    std::lock_guard<std::mutex> lock(m_slotMutex);
    int alive = 0, dead = 0;
    for (const auto& s : m_slots) {
        if (s.engine) ++alive; else ++dead;
    }

    std::cout << "\n=====================================================" << std::endl;
    std::cout << " Engine Pool -- Capacity Report"
              << (m_elasticMode ? " [ELASTIC]" : " [PRE-ALLOCATED]") << std::endl;
    std::cout << "=====================================================" << std::endl;
    std::cout << " Alive inference slots : " << alive << std::endl;
    if (dead > 0)
        std::cout << " Dead (recyclable)     : " << dead << std::endl;
    std::cout << " Active inferences     : " << m_activeCount.load() << std::endl;
    std::cout << " Available slots       : "
              << (alive - m_activeCount.load())
              << (m_elasticMode ? " (+ on-demand)" : "")
              << std::endl;
    if (m_elasticMode) {
        std::cout << " Global slot usage     : "
                  << s_globalElasticCount.load() << "/" << s_globalElasticMax.load()
                  << " (across all pools)" << std::endl;
    }
    std::cout << " Memory per slot       : " << m_memPerSlot / 1048576 << " MiB" << std::endl;
    std::cout << "-----------------------------------------------------" << std::endl;
    for (const auto& d : m_deviceInfos) {
        std::cout << " GPU[" << d.index << "] " << d.name
                  << " | SM " << d.computeMajor << "." << d.computeMinor
                  << " | Total " << d.totalMemoryBytes / 1048576 << " MiB"
                  << " | Slots: " << d.slotsAllocated
                  << " | Mem/slot: " << d.memoryPerSlotBytes / 1048576 << " MiB"
                  << std::endl;
    }
    std::cout << "=====================================================" << std::endl;
}
// ----------------------------------------------------------------------------
// startIdleTimer / stopIdleTimer -- automatic idle-slot cleanup
//
// A background thread wakes every m_idleTimerIntervalSec seconds and calls
// releaseIdleSlots(m_idleTimerThresholdSec). The thread uses a
// condition_variable with a timed wait so that stopIdleTimer() can wake it
// immediately for a clean shutdown (no dangling sleeps).
//
// Only active in elastic mode -- pre-allocated pools have fixed capacity.
// ----------------------------------------------------------------------------
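// Manual lifecycle (sketch; loadSlots() already starts the timer for elastic
// pools, so explicit calls are only needed for custom setups):
//
//   engine->startIdleTimer(); // begin periodic releaseIdleSlots() sweeps
//   // ... serve inference traffic ...
//   engine->stopIdleTimer();  // wake the timer thread and join (or detach at process exit)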
template <typename T>
void Engine<T>::startIdleTimer()
{
    if (!m_elasticMode) return;               // no-op for pre-allocated pools
    if (m_idleTimerThread.joinable()) return; // already running

    m_idleTimerStop = false;
    m_idleTimerThread = std::thread([this]() {
        std::cout << "Info [Pool]: Idle-slot cleanup timer started "
                  << "(interval=" << m_idleTimerIntervalSec << "s, threshold="
                  << m_idleTimerThresholdSec << "s)" << std::endl;
        while (!m_idleTimerStop.load()) {
            // Sleep for the interval, but wake early if stop is signalled.
            {
                std::unique_lock<std::mutex> lk(m_idleTimerMutex);
                m_idleTimerCv.wait_for(lk,
                    std::chrono::duration<double>(m_idleTimerIntervalSec),
                    [this]() { return m_idleTimerStop.load(); });
            }
            if (m_idleTimerStop.load()) break;
            releaseIdleSlots(m_idleTimerThresholdSec);
        }
        std::cout << "Info [Pool]: Idle-slot cleanup timer stopped." << std::endl;
    });
}
template <typename T>
void Engine<T>::stopIdleTimer()
{
    if (!m_idleTimerThread.joinable()) return; // not running
    m_idleTimerStop = true;
    m_idleTimerCv.notify_all(); // wake the sleeping thread immediately

    // During ExitProcess, worker threads are already killed by the OS.
    // Calling join() on a dead thread deadlocks or causes std::terminate.
    // Detach instead -- the OS will reclaim everything momentarily.
    if (g_processExiting().load(std::memory_order_relaxed)) {
        m_idleTimerThread.detach();
    } else {
        m_idleTimerThread.join(); // normal path: wait for a clean exit
    }
}