engines/TensorRTAPI/include/MultiGpuEngineManager.h

#pragma once

// ============================================================================
// DEPRECATED — MultiGpuEngineManager.h
//
// The functionality of this class has been merged directly into Engine<T>.
// Replace any usage of MultiGpuEngineManager with Engine as follows:
//
//   Before:
//     MultiGpuEngineManager<float> mgr;
//     mgr.initialize(opts, "model.onnx");
//     mgr.runInference(inputs, outputs);
//
//   After:
//     Engine<float> eng;
//     eng.initializePool(opts, "model.onnx");
//     eng.runInference(inputs, outputs);         // auto-dispatches to pool
//
// Method mapping:
//   MultiGpuEngineManager::initialize()           → Engine::initializePool()
//   MultiGpuEngineManager::initializeFromEngine() → Engine::initializePoolFromEngine()
//   MultiGpuEngineManager::runInference()         → Engine::runInference()  (unchanged signature)
//   MultiGpuEngineManager::enumerateDevices()     → Engine::enumerateDevices()
//   MultiGpuEngineManager::getTotalCapacity()     → Engine::getTotalCapacity()
//   MultiGpuEngineManager::getAvailableSlots()    → Engine::getAvailableSlots()
//   MultiGpuEngineManager::getActiveInferences()  → Engine::getActiveInferences()
//   MultiGpuEngineManager::isAtCapacity()         → Engine::isAtCapacity()
//   MultiGpuEngineManager::getDeviceInfo()        → Engine::getDeviceInfo()
//   MultiGpuEngineManager::printCapacityReport()  → Engine::printCapacityReport()
//   MultiGpuEngineManager::getInputDims()         → Engine::getInputDims()
//   MultiGpuEngineManager::getOutputDims()        → Engine::getOutputDims()
//
// GpuDeviceInfo struct is now declared in engine.h and needs no separate include.
//
// This file is kept temporarily for compatibility.  It will be removed in a
// future cleanup pass.  Do not add new code here.
// ============================================================================

/*
 * MultiGpuEngineManager.h  [DEPRECATED — see engine.h]
 *
 * A pool-based multi-GPU inference manager that wraps Engine<T>.
 *
 * Responsibilities
 * ----------------
 *   1. Enumerate all CUDA-capable GPUs and measure the GPU memory consumed
 *      by a single inference context (one Engine<T> instance).
 *   2. Allocate as many Engine<T> "slots" as memory allows, filling device 0
 *      first, then device 1, etc.
 *   3. Dispatch runInference() calls to the first idle slot (device 0 wins),
 *      so load naturally concentrates on the lowest-index GPU first.
 *   4. Enforce capacity: if every slot is busy, return false immediately
 *      rather than crashing or blocking indefinitely.
 *
 * Assumptions
 * -----------
 *   - All GPUs on the host are the same model (homogeneous). TRT engine files
 *     are therefore interchangeable across devices.
 *   - Engine<T> is safe to call concurrently from different threads as long as
 *     each instance is used by at most one thread at a time — this manager
 *     guarantees that through the per-slot busy flag.
 *
 * Usage example
 * -------------
 *   ANSCENTER::Options opts;
 *   opts.precision    = ANSCENTER::Precision::FP16;
 *   opts.maxBatchSize = 1;
 *
 *   MultiGpuEngineManager<float> mgr;
 *   if (!mgr.initialize(opts, "model.onnx")) { ... error ... }
 *
 *   mgr.printCapacityReport();
 *   // mgr.getTotalCapacity() tells you how many parallel tasks are possible
 *
 *   // From any thread:
 *   std::vector<std::vector<cv::cuda::GpuMat>> inputs = ...;
 *   std::vector<std::vector<std::vector<float>>> outputs;
 *   if (!mgr.runInference(inputs, outputs)) {
 *       // All slots busy — back off and retry, or queue the request.
 *   }
 */

#include <algorithm>
#include <atomic>
#include <memory>
#include <mutex>
#include <string>
#include <vector>

#include <cuda_runtime.h>
#include <spdlog/spdlog.h>

#include "engine.h"      // Engine<T>  (also pulls NvInfer, OpenCV-CUDA, etc.)
#include "ANSLicense.h"  // ANSCENTER::Options, ANSCENTER::Precision

// ============================================================================
//  GpuDeviceInfo — snapshot of one CUDA device captured at init time
// ============================================================================
struct GpuDeviceInfo {
    int         index                 = 0;
    std::string name;
    size_t      totalMemoryBytes      = 0;  ///< Physical VRAM
    size_t      freeMemoryAtInitBytes = 0;  ///< Free VRAM when manager started
    int         computeMajor          = 0;
    int         computeMinor          = 0;
    int         slotsAllocated        = 0;  ///< Engine<T> instances on this GPU
    size_t      memoryPerSlotBytes    = 0;  ///< Bytes each slot occupies in VRAM
};

// ============================================================================
//  MultiGpuEngineManager<T>
// ============================================================================
template <typename T>
class MultiGpuEngineManager {
public:

    // ── Construction / destruction ───────────────────────────────────────────
    MultiGpuEngineManager()  = default;
    ~MultiGpuEngineManager() = default;

    // Non-copyable (owning unique_ptrs inside slots)
    MultiGpuEngineManager(const MultiGpuEngineManager&)            = delete;
    MultiGpuEngineManager& operator=(const MultiGpuEngineManager&) = delete;

    // Movable
    MultiGpuEngineManager(MultiGpuEngineManager&&)                 = default;
    MultiGpuEngineManager& operator=(MultiGpuEngineManager&&)      = default;

    // ── Initialization ───────────────────────────────────────────────────────

    /**
     * Initialize from an ONNX model file.
     * Builds the TRT engine on the first run and caches it; subsequent calls
     * load the cached .engine file directly (fast path).
     *
     * @param baseOptions      Configuration template.  deviceIndex is ignored
     *                         and overridden per-slot; all other fields apply.
     * @param onnxModelPath    Path to the .onnx model file.
     * @param subVals          Per-channel subtraction for normalisation.
     * @param divVals          Per-channel divisor for normalisation.
     * @param normalize        Normalise pixel values to [0, 1] before inference.
     * @param maxSlotsPerGpu   Cap slots per GPU (-1 = memory-limited only).
     * @param memSafetyFactor  Fraction of free VRAM to consume (default 0.80).
     * @return true on success.
     */
    bool initialize(const ANSCENTER::Options&   baseOptions,
                    const std::string&          onnxModelPath,
                    const std::array<float, 3>& subVals         = {0.f, 0.f, 0.f},
                    const std::array<float, 3>& divVals         = {1.f, 1.f, 1.f},
                    bool                        normalize       = true,
                    int                         maxSlotsPerGpu  = -1,
                    double                      memSafetyFactor = 0.80);

    /**
     * Initialize from a pre-built TRT engine file — no ONNX build step.
     */
    bool initializeFromEngine(const ANSCENTER::Options&   baseOptions,
                              const std::string&          trtEnginePath,
                              const std::array<float, 3>& subVals         = {0.f, 0.f, 0.f},
                              const std::array<float, 3>& divVals         = {1.f, 1.f, 1.f},
                              bool                        normalize       = true,
                              int                         maxSlotsPerGpu  = -1,
                              double                      memSafetyFactor = 0.80);

    // ── Inference ────────────────────────────────────────────────────────────

    /**
     * Run inference on the best available slot.
     *
     * Slot-selection order: device 0 first (slots are stored in ascending
     * device-index order, so the lowest-index idle slot always wins).
     *
     * Returns FALSE immediately if every slot is busy.
     * Does NOT block — the caller handles retry / queuing.
     *
     * Thread-safe: may be called from multiple threads simultaneously.
     *
     * @param inputs         [input_tensor][batch][GpuMat]
     * @param featureVectors [batch][output_tensor][values]  — populated on return
     * @return true if inference completed successfully.
     */
    bool runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& inputs,
                      std::vector<std::vector<std::vector<T>>>&          featureVectors);

    // ── Device / capacity queries ─────────────────────────────────────────────

    /**
     * Enumerate all CUDA-capable GPUs without loading any model.
     * Useful for pre-flight checks before calling initialize().
     */
    static std::vector<GpuDeviceInfo> enumerateDevices();

    /** Device snapshots captured at initialize() time. */
    const std::vector<GpuDeviceInfo>& getDeviceInfo()      const { return m_deviceInfos;    }

    /** Total Engine<T> slots across all GPUs. */
    int getTotalCapacity()    const { return m_totalCapacity;                       }

    /** Slots currently executing inference (approximate, lock-free read). */
    int getActiveInferences() const { return m_activeCount.load();                  }

    /** Slots not currently claimed by a running inference. */
    int getAvailableSlots()   const { return m_totalCapacity - m_activeCount.load(); }

    /** Whether the engine pool is fully saturated and new work would be rejected. */
    bool isAtCapacity()       const { return getAvailableSlots() <= 0;              }

    /** Input tensor dimensions (same for all slots, populated after init). */
    const std::vector<nvinfer1::Dims3>& getInputDims()  const { return m_inputDims;  }

    /** Output tensor dimensions (same for all slots, populated after init). */
    const std::vector<nvinfer1::Dims>&  getOutputDims() const { return m_outputDims; }

    /** Print a human-readable capacity and device report via spdlog. */
    void printCapacityReport() const;

private:

    // ── Internal slot descriptor ─────────────────────────────────────────────
    struct InferenceSlot {
        int    deviceIndex = 0;
        bool   busy        = false;
        size_t memUsed     = 0;     ///< Bytes this slot holds in VRAM
        std::unique_ptr<Engine<T>> engine;
    };

    // ── Data members ──────────────────────────────────────────────────────────
    std::vector<InferenceSlot>   m_slots;
    std::vector<GpuDeviceInfo>   m_deviceInfos;
    mutable std::mutex           m_slotMutex;
    std::atomic<int>             m_activeCount{0};
    int                          m_totalCapacity{0};

    // Tensor dims cached from the probe engine (identical for every slot)
    std::vector<nvinfer1::Dims3> m_inputDims;
    std::vector<nvinfer1::Dims>  m_outputDims;

    // ── Private helpers ───────────────────────────────────────────────────────
    bool loadSlots(const ANSCENTER::Options&   baseOptions,
                   const std::string&          modelPath,
                   const std::array<float, 3>& subVals,
                   const std::array<float, 3>& divVals,
                   bool                        normalize,
                   bool                        fromOnnx,
                   int                         maxSlotsPerGpu,
                   double                      memSafetyFactor);
};

// ============================================================================
//  Template implementation
//  (must be in the header because Engine<T> is itself a template)
// ============================================================================

// ────────────────────────────────────────────────────────────────────────────
//  enumerateDevices  — static, no model loading required
// ────────────────────────────────────────────────────────────────────────────
template <typename T>
/*static*/ std::vector<GpuDeviceInfo>
MultiGpuEngineManager<T>::enumerateDevices()
{
    int count = 0;
    cudaGetDeviceCount(&count);

    std::vector<GpuDeviceInfo> devices;
    devices.reserve(count);

    for (int i = 0; i < count; ++i) {
        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, i);

        cudaSetDevice(i);
        size_t freeBytes = 0, totalBytes = 0;
        cudaMemGetInfo(&freeBytes, &totalBytes);

        GpuDeviceInfo info;
        info.index                 = i;
        info.name                  = prop.name;
        info.totalMemoryBytes      = prop.totalGlobalMem;
        info.freeMemoryAtInitBytes = freeBytes;
        info.computeMajor          = prop.major;
        info.computeMinor          = prop.minor;
        info.slotsAllocated        = 0;
        info.memoryPerSlotBytes    = 0;
        devices.push_back(std::move(info));
    }

    return devices;
}

// ────────────────────────────────────────────────────────────────────────────
//  Public init wrappers
// ────────────────────────────────────────────────────────────────────────────
template <typename T>
bool MultiGpuEngineManager<T>::initialize(
        const ANSCENTER::Options&   baseOptions,
        const std::string&          onnxModelPath,
        const std::array<float, 3>& subVals,
        const std::array<float, 3>& divVals,
        bool                        normalize,
        int                         maxSlotsPerGpu,
        double                      memSafetyFactor)
{
    return loadSlots(baseOptions, onnxModelPath,
                     subVals, divVals, normalize,
                     /*fromOnnx=*/true,
                     maxSlotsPerGpu, memSafetyFactor);
}

template <typename T>
bool MultiGpuEngineManager<T>::initializeFromEngine(
        const ANSCENTER::Options&   baseOptions,
        const std::string&          trtEnginePath,
        const std::array<float, 3>& subVals,
        const std::array<float, 3>& divVals,
        bool                        normalize,
        int                         maxSlotsPerGpu,
        double                      memSafetyFactor)
{
    return loadSlots(baseOptions, trtEnginePath,
                     subVals, divVals, normalize,
                     /*fromOnnx=*/false,
                     maxSlotsPerGpu, memSafetyFactor);
}

// ────────────────────────────────────────────────────────────────────────────
//  loadSlots  — core initialization logic
// ────────────────────────────────────────────────────────────────────────────
template <typename T>
bool MultiGpuEngineManager<T>::loadSlots(
        const ANSCENTER::Options&   baseOptions,
        const std::string&          modelPath,
        const std::array<float, 3>& subVals,
        const std::array<float, 3>& divVals,
        bool                        normalize,
        bool                        fromOnnx,
        int                         maxSlotsPerGpu,
        double                      memSafetyFactor)
{
    // ──────────────────────────────────────────────────────────────────
    // 1.  Enumerate GPUs
    // ──────────────────────────────────────────────────────────────────
    m_deviceInfos = enumerateDevices();

    if (m_deviceInfos.empty()) {
        spdlog::error("MultiGpuEngineManager: No CUDA-capable GPUs detected");
        return false;
    }

    spdlog::info("MultiGpuEngineManager: {} GPU(s) found:", m_deviceInfos.size());
    for (const auto& d : m_deviceInfos) {
        spdlog::info("  GPU[{}] {} | SM {}.{} | Total {:.0f} MiB | Free {:.0f} MiB",
                     d.index, d.name,
                     d.computeMajor, d.computeMinor,
                     d.totalMemoryBytes      / 1048576.0,
                     d.freeMemoryAtInitBytes / 1048576.0);
    }

    // Warn if the GPUs are heterogeneous — the TRT engine may be incompatible
    for (size_t i = 1; i < m_deviceInfos.size(); ++i) {
        if (m_deviceInfos[i].name != m_deviceInfos[0].name) {
            spdlog::warn("MultiGpuEngineManager: GPU[{}] '{}' differs from GPU[0] '{}'. "
                         "TRT engine binaries may be incompatible with dissimilar GPUs.",
                         i, m_deviceInfos[i].name, m_deviceInfos[0].name);
        }
    }

    // ──────────────────────────────────────────────────────────────────
    // 2.  Load ONE probe engine on GPU 0 to measure per-slot VRAM usage.
    //
    //     Memory delta = freeBeforeLoad − freeAfterLoad
    //     This includes: TRT engine buffers, CUDA context overhead, and
    //     any stream / workspace memory Engine<T> allocates.
    // ──────────────────────────────────────────────────────────────────
    spdlog::info("MultiGpuEngineManager: Loading probe engine on GPU[0] "
                 "to measure per-slot memory footprint...");

    cudaSetDevice(0);
    size_t freeBefore = 0, tmp = 0;
    cudaMemGetInfo(&freeBefore, &tmp);

    ANSCENTER::Options opts0 = baseOptions;
    opts0.deviceIndex        = 0;

    auto probeEngine = std::make_unique<Engine<T>>(opts0);
    const bool probeOk = fromOnnx
        ? probeEngine->buildLoadNetwork(modelPath, subVals, divVals, normalize)
        : probeEngine->loadNetwork     (modelPath, subVals, divVals, normalize);

    if (!probeOk) {
        spdlog::error("MultiGpuEngineManager: Probe engine failed to load on GPU[0]");
        return false;
    }

    size_t freeAfter = 0;
    cudaMemGetInfo(&freeAfter, &tmp);

    // Guard against measurement noise: floor at 64 MiB
    constexpr size_t kMinSlotMemBytes = 64ULL * 1024 * 1024;
    const size_t rawDelta  = (freeBefore > freeAfter) ? (freeBefore - freeAfter) : 0ULL;
    const size_t memPerSlot = std::max(rawDelta, kMinSlotMemBytes);

    spdlog::info("MultiGpuEngineManager: Memory per inference slot: {:.1f} MiB  "
                 "(measured delta = {:.1f} MiB)",
                 memPerSlot  / 1048576.0,
                 rawDelta    / 1048576.0);

    // Cache tensor dims — same for every slot since they all use the same model
    m_inputDims  = probeEngine->getInputDims();
    m_outputDims = probeEngine->getOutputDims();

    // Promote the probe engine into slot 0 on device 0
    {
        InferenceSlot s;
        s.deviceIndex = 0;
        s.busy        = false;
        s.memUsed     = memPerSlot;
        s.engine      = std::move(probeEngine);
        m_slots.push_back(std::move(s));
    }
    m_deviceInfos[0].slotsAllocated     = 1;
    m_deviceInfos[0].memoryPerSlotBytes  = memPerSlot;

    // ──────────────────────────────────────────────────────────────────
    // 3.  Auto-cap: VRAM-fraction budget per model
    //
    //     When maxSlotsPerGpu is -1 (the default), each model pool
    //     auto-limits itself to kMaxVramFractionPerModel of total GPU
    //     VRAM.  This prevents the first model loaded from consuming all
    //     memory and starving subsequent models in multi-model deployments.
    //     We use *total* VRAM (not free) as the budget base so the cap is
    //     consistent regardless of load order.
    //
    //     maxSlotsPerGpu == -1  →  auto-cap from VRAM fraction (default)
    //     maxSlotsPerGpu >   0  →  explicit cap (user override, unchanged)
    // ──────────────────────────────────────────────────────────────────
    constexpr double kMaxVramFractionPerModel = 0.25;  // 25% of total VRAM

    int effectiveMaxSlotsPerGpu = maxSlotsPerGpu;

    if (maxSlotsPerGpu <= 0 && memPerSlot > 0) {
        const size_t totalVram   = m_deviceInfos[0].totalMemoryBytes;
        const size_t vramBudget  = static_cast<size_t>(
            static_cast<double>(totalVram) * kMaxVramFractionPerModel);
        const int autoCap = std::max(1, static_cast<int>(vramBudget / memPerSlot));
        effectiveMaxSlotsPerGpu = autoCap;

        spdlog::info("MultiGpuEngineManager: VRAM auto-cap = {} slot(s)/GPU "
                     "(model {} MiB/slot, budget {} MiB = {}% of {} MiB total)",
                     autoCap, memPerSlot / 1048576,
                     vramBudget / 1048576,
                     static_cast<int>(kMaxVramFractionPerModel * 100),
                     totalVram / 1048576);
    }

    // ──────────────────────────────────────────────────────────────────
    // 4.  Fill remaining capacity on every GPU.
    //
    //     For GPU 0:
    //       freeNow already reflects probe usage → slotsToAdd is the count
    //       of *additional* slots that fit, beyond the probe.
    //
    //     For GPU 1+:
    //       freeNow is the original available memory on that device, so
    //       slotsToAdd is the *total* slots for that device.
    // ──────────────────────────────────────────────────────────────────
    for (int di = 0; di < static_cast<int>(m_deviceInfos.size()); ++di) {

        cudaSetDevice(di);
        size_t freeNow = 0, totalNow = 0;
        cudaMemGetInfo(&freeNow, &totalNow);

        const size_t usableBytes = static_cast<size_t>(
                static_cast<double>(freeNow) * memSafetyFactor);

        // How many new Engine<T> instances fit in the usable memory?
        int slotsToAdd = (memPerSlot > 0)
                         ? static_cast<int>(usableBytes / memPerSlot)
                         : 0;

        // Apply VRAM-fraction auto-cap or explicit per-GPU cap.
        // GPU 0 already has the probe slot, so subtract 1 from its budget.
        if (effectiveMaxSlotsPerGpu > 0) {
            const int budget = (di == 0)
                ? (effectiveMaxSlotsPerGpu - 1)
                : effectiveMaxSlotsPerGpu;
            slotsToAdd = std::min(slotsToAdd, budget);
        }

        m_deviceInfos[di].memoryPerSlotBytes = memPerSlot;

        spdlog::info("MultiGpuEngineManager: GPU[{}] {} — "
                     "free {:.0f} MiB, usable {:.0f} MiB → adding {} slot(s)",
                     di, m_deviceInfos[di].name,
                     freeNow      / 1048576.0,
                     usableBytes  / 1048576.0,
                     slotsToAdd);

        for (int s = 0; s < slotsToAdd; ++s) {
            ANSCENTER::Options opts = baseOptions;
            opts.deviceIndex        = di;

            auto eng = std::make_unique<Engine<T>>(opts);
            const bool ok = fromOnnx
                ? eng->buildLoadNetwork(modelPath, subVals, divVals, normalize)
                : eng->loadNetwork     (modelPath, subVals, divVals, normalize);

            if (!ok) {
                spdlog::warn("MultiGpuEngineManager: GPU[{}] — slot {}/{} load failed; "
                             "halting allocation on this device.",
                             di, s + 1, slotsToAdd);
                break;
            }

            InferenceSlot slot;
            slot.deviceIndex = di;
            slot.busy        = false;
            slot.memUsed     = memPerSlot;
            slot.engine      = std::move(eng);
            m_slots.push_back(std::move(slot));
            m_deviceInfos[di].slotsAllocated++;
        }
    }

    m_totalCapacity = static_cast<int>(m_slots.size());
    printCapacityReport();

    if (m_totalCapacity == 0) {
        spdlog::error("MultiGpuEngineManager: Zero inference slots allocated — "
                      "check available GPU memory.");
        return false;
    }

    return true;
}

// ────────────────────────────────────────────────────────────────────────────
//  runInference
// ────────────────────────────────────────────────────────────────────────────
template <typename T>
bool MultiGpuEngineManager<T>::runInference(
        const std::vector<std::vector<cv::cuda::GpuMat>>& inputs,
        std::vector<std::vector<std::vector<T>>>&          featureVectors)
{
    // ── Acquire the first idle slot ───────────────────────────────────────
    //
    // Slots are stored in ascending device-index order (all device-0 slots
    // come first), so the scan naturally prefers device 0.  The mutex is
    // held only for the O(N) scan + flag flip — NOT during the GPU kernel —
    // so threads using different slots proceed in parallel.
    InferenceSlot* slot = nullptr;
    {
        std::lock_guard<std::mutex> lock(m_slotMutex);
        for (auto& s : m_slots) {
            if (!s.busy) {
                s.busy = true;
                slot   = &s;
                break;
            }
        }
    }

    if (!slot) {
        // All slots are in use.  Enforce the capacity limit by refusing the
        // request rather than crashing or waiting indefinitely.
        spdlog::warn("MultiGpuEngineManager: Capacity reached — "
                     "all {}/{} inference slot(s) busy. "
                     "Request rejected; release a running inference first.",
                     m_activeCount.load(), m_totalCapacity);
        return false;
    }

    ++m_activeCount;

    // Set the calling thread's CUDA device context to match the slot's device.
    // Engine<T>::loadNetwork() already did this internally when the engine was
    // created, and the CUDA streams inside are bound to that device; calling
    // cudaSetDevice here ensures the calling thread's context matches so that
    // stream operations and memory queries behave correctly in multi-threaded
    // scenarios where threads may have previously touched a different device.
    cudaSetDevice(slot->deviceIndex);

    const bool result = slot->engine->runInference(inputs, featureVectors);

    // ── Release the slot ──────────────────────────────────────────────────
    {
        std::lock_guard<std::mutex> lock(m_slotMutex);
        slot->busy = false;
    }
    --m_activeCount;

    return result;
}

// ────────────────────────────────────────────────────────────────────────────
//  printCapacityReport
// ────────────────────────────────────────────────────────────────────────────
template <typename T>
void MultiGpuEngineManager<T>::printCapacityReport() const
{
    spdlog::info("============================================================");
    spdlog::info("  MultiGpuEngineManager — Capacity Report");
    spdlog::info("============================================================");
    spdlog::info("  Total inference slots : {}", m_totalCapacity);
    spdlog::info("  Active inferences     : {}", m_activeCount.load());
    spdlog::info("  Available slots       : {}",
                 m_totalCapacity - m_activeCount.load());
    spdlog::info("------------------------------------------------------------");
    for (const auto& d : m_deviceInfos) {
        spdlog::info("  GPU[{:d}] {:s} | SM {:d}.{:d} | "
                     "Total {:6.0f} MiB | Slots: {:2d} | Mem/slot: {:6.1f} MiB",
                     d.index, d.name,
                     d.computeMajor, d.computeMinor,
                     d.totalMemoryBytes   / 1048576.0,
                     d.slotsAllocated,
                     d.memoryPerSlotBytes / 1048576.0);
    }
    spdlog::info("============================================================");
}