#pragma once
// ============================================================================
// DEPRECATED — MultiGpuEngineManager.h
//
// The functionality of this class has been merged directly into Engine.
// Replace any usage of MultiGpuEngineManager with Engine as follows:
//
//   Before:
//     MultiGpuEngineManager mgr;
//     mgr.initialize(opts, "model.onnx");
//     mgr.runInference(inputs, outputs);
//
//   After:
//     Engine eng;
//     eng.initializePool(opts, "model.onnx");
//     eng.runInference(inputs, outputs);   // auto-dispatches to pool
//
// Method mapping:
//   MultiGpuEngineManager::initialize()           → Engine::initializePool()
//   MultiGpuEngineManager::initializeFromEngine() → Engine::initializePoolFromEngine()
//   MultiGpuEngineManager::runInference()         → Engine::runInference() (unchanged signature)
//   MultiGpuEngineManager::enumerateDevices()     → Engine::enumerateDevices()
//   MultiGpuEngineManager::getTotalCapacity()     → Engine::getTotalCapacity()
//   MultiGpuEngineManager::getAvailableSlots()    → Engine::getAvailableSlots()
//   MultiGpuEngineManager::getActiveInferences()  → Engine::getActiveInferences()
//   MultiGpuEngineManager::isAtCapacity()         → Engine::isAtCapacity()
//   MultiGpuEngineManager::getDeviceInfo()        → Engine::getDeviceInfo()
//   MultiGpuEngineManager::printCapacityReport()  → Engine::printCapacityReport()
//   MultiGpuEngineManager::getInputDims()         → Engine::getInputDims()
//   MultiGpuEngineManager::getOutputDims()        → Engine::getOutputDims()
//
// GpuDeviceInfo struct is now declared in engine.h and needs no separate include.
//
// This file is kept temporarily for compatibility. It will be removed in a
// future cleanup pass. Do not add new code here.
// ============================================================================

/*
 * MultiGpuEngineManager.h [DEPRECATED — see engine.h]
 *
 * A pool-based multi-GPU inference manager that wraps Engine.
 *
 * Responsibilities
 * ----------------
 *  1. Enumerate all CUDA-capable GPUs and measure the GPU memory consumed
 *     by a single inference context (one Engine instance).
 *  2. Allocate as many Engine "slots" as memory allows, filling device 0
 *     first, then device 1, etc.
 *  3. Dispatch runInference() calls to the first idle slot (device 0 wins),
 *     so load naturally concentrates on the lowest-index GPU first.
 *  4. Enforce capacity: if every slot is busy, return false immediately
 *     rather than crashing or blocking indefinitely.
 *
 * Assumptions
 * -----------
 *  - All GPUs on the host are the same model (homogeneous). TRT engine files
 *    are therefore interchangeable across devices.
 *  - Engine is safe to call concurrently from different threads as long as
 *    each instance is used by at most one thread at a time — this manager
 *    guarantees that through the per-slot busy flag.
 *
 * Usage example
 * -------------
 *   ANSCENTER::Options opts;
 *   opts.precision    = ANSCENTER::Precision::FP16;
 *   opts.maxBatchSize = 1;
 *
 *   MultiGpuEngineManager<float> mgr;
 *   if (!mgr.initialize(opts, "model.onnx")) { ... error ... }
 *
 *   mgr.printCapacityReport();
 *   // mgr.getTotalCapacity() tells you how many parallel tasks are possible
 *
 *   // From any thread:
 *   std::vector<std::vector<cv::cuda::GpuMat>> inputs = ...;
 *   std::vector<std::vector<std::vector<float>>> outputs;
 *   if (!mgr.runInference(inputs, outputs)) {
 *       // All slots busy — back off and retry, or queue the request.
 *   }
 */

// NOTE(review): the original include targets were lost to angle-bracket
// stripping; this list was reconstructed from the names used below —
// confirm it matches the pre-mangling header.
#include <algorithm>
#include <array>
#include <atomic>
#include <memory>
#include <mutex>
#include <string>
#include <vector>

#include <cuda_runtime.h>
#include <spdlog/spdlog.h>

#include "engine.h"      // Engine<T> (also pulls NvInfer, OpenCV-CUDA, etc.)
#include "ANSLicense.h"  // ANSCENTER::Options, ANSCENTER::Precision

// ============================================================================
// GpuDeviceInfo — snapshot of one CUDA device captured at init time
// ============================================================================
struct GpuDeviceInfo {
    int         index = 0;
    std::string name;
    size_t      totalMemoryBytes      = 0;  ///< Physical VRAM
    size_t      freeMemoryAtInitBytes = 0;  ///< Free VRAM when manager started
    int         computeMajor          = 0;
    int         computeMinor          = 0;
    int         slotsAllocated        = 0;  ///< Engine instances on this GPU
    size_t      memoryPerSlotBytes    = 0;  ///< Bytes each slot occupies in VRAM
};

// ============================================================================
// MultiGpuEngineManager
// ============================================================================
template <typename T>
class MultiGpuEngineManager {
public:
    // ── Construction / destruction ───────────────────────────────────────────
    MultiGpuEngineManager()  = default;
    ~MultiGpuEngineManager() = default;

    // Non-copyable (owning unique_ptrs inside slots)
    MultiGpuEngineManager(const MultiGpuEngineManager&)            = delete;
    MultiGpuEngineManager& operator=(const MultiGpuEngineManager&) = delete;

    // Non-movable. The original declared these "= default" under a "Movable"
    // comment, but std::mutex and std::atomic members make the defaulted
    // moves implicitly deleted anyway — spelling out "= delete" states the
    // real contract instead of a misleading one.
    MultiGpuEngineManager(MultiGpuEngineManager&&)            = delete;
    MultiGpuEngineManager& operator=(MultiGpuEngineManager&&) = delete;

    // ── Initialization ───────────────────────────────────────────────────────

    /**
     * Initialize from an ONNX model file.
     * Builds the TRT engine on the first run and caches it; subsequent calls
     * load the cached .engine file directly (fast path).
     *
     * @param baseOptions    Configuration template. deviceIndex is ignored
     *                       and overridden per-slot; all other fields apply.
     * @param onnxModelPath  Path to the .onnx model file.
     * @param subVals        Per-channel subtraction for normalisation.
     * @param divVals        Per-channel divisor for normalisation.
     * @param normalize      Normalise pixel values to [0, 1] before inference.
     * @param maxSlotsPerGpu Cap slots per GPU (-1 = memory-limited only).
     * @param memSafetyFactor Fraction of free VRAM to consume (default 0.80).
     * @return true on success.
     */
    bool initialize(const ANSCENTER::Options& baseOptions,
                    const std::string& onnxModelPath,
                    const std::array<float, 3>& subVals = {0.f, 0.f, 0.f},
                    const std::array<float, 3>& divVals = {1.f, 1.f, 1.f},
                    bool normalize = true,
                    int maxSlotsPerGpu = -1,
                    double memSafetyFactor = 0.80);

    /**
     * Initialize from a pre-built TRT engine file — no ONNX build step.
     */
    bool initializeFromEngine(const ANSCENTER::Options& baseOptions,
                              const std::string& trtEnginePath,
                              const std::array<float, 3>& subVals = {0.f, 0.f, 0.f},
                              const std::array<float, 3>& divVals = {1.f, 1.f, 1.f},
                              bool normalize = true,
                              int maxSlotsPerGpu = -1,
                              double memSafetyFactor = 0.80);

    // ── Inference ────────────────────────────────────────────────────────────

    /**
     * Run inference on the best available slot.
     *
     * Slot-selection order: device 0 first (slots are stored in ascending
     * device-index order, so the lowest-index idle slot always wins).
     *
     * Returns FALSE immediately if every slot is busy.
     * Does NOT block — the caller handles retry / queuing.
     *
     * Thread-safe: may be called from multiple threads simultaneously.
     *
     * @param inputs          [input_tensor][batch][GpuMat]
     * @param featureVectors  [batch][output_tensor][values] — populated on return
     * @return true if inference completed successfully.
     */
    bool runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& inputs,
                      std::vector<std::vector<std::vector<T>>>& featureVectors);

    // ── Device / capacity queries ────────────────────────────────────────────

    /**
     * Enumerate all CUDA-capable GPUs without loading any model.
     * Useful for pre-flight checks before calling initialize().
     */
    static std::vector<GpuDeviceInfo> enumerateDevices();

    /** Device snapshots captured at initialize() time. */
    const std::vector<GpuDeviceInfo>& getDeviceInfo() const { return m_deviceInfos; }

    /** Total Engine slots across all GPUs. */
    int getTotalCapacity() const { return m_totalCapacity; }

    /** Slots currently executing inference (approximate, lock-free read). */
    int getActiveInferences() const { return m_activeCount.load(); }

    /** Slots not currently claimed by a running inference. */
    int getAvailableSlots() const { return m_totalCapacity - m_activeCount.load(); }

    /** Whether the engine pool is fully saturated and new work would be rejected. */
    bool isAtCapacity() const { return getAvailableSlots() <= 0; }

    // NOTE(review): the element types of the dims vectors were lost in
    // formatting; nvinfer1::Dims3 / nvinfer1::Dims match the underlying
    // Engine API — confirm against engine.h.

    /** Input tensor dimensions (same for all slots, populated after init). */
    const std::vector<nvinfer1::Dims3>& getInputDims() const { return m_inputDims; }

    /** Output tensor dimensions (same for all slots, populated after init). */
    const std::vector<nvinfer1::Dims>& getOutputDims() const { return m_outputDims; }

    /** Print a human-readable capacity and device report via spdlog. */
    void printCapacityReport() const;

private:
    // ── Internal slot descriptor ─────────────────────────────────────────────
    struct InferenceSlot {
        int    deviceIndex = 0;
        bool   busy        = false;
        size_t memUsed     = 0;     ///< Bytes this slot holds in VRAM
        std::unique_ptr<Engine<T>> engine;
    };

    // ── Data members ─────────────────────────────────────────────────────────
    std::vector<InferenceSlot> m_slots;
    std::vector<GpuDeviceInfo> m_deviceInfos;
    mutable std::mutex         m_slotMutex;
    std::atomic<int>           m_activeCount{0};
    int                        m_totalCapacity{0};

    // Tensor dims cached from the probe engine (identical for every slot)
    std::vector<nvinfer1::Dims3> m_inputDims;
    std::vector<nvinfer1::Dims>  m_outputDims;

    // ── Private helpers ──────────────────────────────────────────────────────
    bool loadSlots(const ANSCENTER::Options& baseOptions,
                   const std::string& modelPath,
                   const std::array<float, 3>& subVals,
                   const std::array<float, 3>& divVals,
                   bool normalize,
                   bool fromOnnx,
                   int maxSlotsPerGpu,
                   double memSafetyFactor);
};

// ============================================================================
// Template implementation
// (must be in the header because Engine is itself a template)
// ============================================================================
//
──────────────────────────────────────────────────────────────────────────── // enumerateDevices — static, no model loading required // ──────────────────────────────────────────────────────────────────────────── template /*static*/ std::vector MultiGpuEngineManager::enumerateDevices() { int count = 0; cudaGetDeviceCount(&count); std::vector devices; devices.reserve(count); for (int i = 0; i < count; ++i) { cudaDeviceProp prop; cudaGetDeviceProperties(&prop, i); cudaSetDevice(i); size_t freeBytes = 0, totalBytes = 0; cudaMemGetInfo(&freeBytes, &totalBytes); GpuDeviceInfo info; info.index = i; info.name = prop.name; info.totalMemoryBytes = prop.totalGlobalMem; info.freeMemoryAtInitBytes = freeBytes; info.computeMajor = prop.major; info.computeMinor = prop.minor; info.slotsAllocated = 0; info.memoryPerSlotBytes = 0; devices.push_back(std::move(info)); } return devices; } // ──────────────────────────────────────────────────────────────────────────── // Public init wrappers // ──────────────────────────────────────────────────────────────────────────── template bool MultiGpuEngineManager::initialize( const ANSCENTER::Options& baseOptions, const std::string& onnxModelPath, const std::array& subVals, const std::array& divVals, bool normalize, int maxSlotsPerGpu, double memSafetyFactor) { return loadSlots(baseOptions, onnxModelPath, subVals, divVals, normalize, /*fromOnnx=*/true, maxSlotsPerGpu, memSafetyFactor); } template bool MultiGpuEngineManager::initializeFromEngine( const ANSCENTER::Options& baseOptions, const std::string& trtEnginePath, const std::array& subVals, const std::array& divVals, bool normalize, int maxSlotsPerGpu, double memSafetyFactor) { return loadSlots(baseOptions, trtEnginePath, subVals, divVals, normalize, /*fromOnnx=*/false, maxSlotsPerGpu, memSafetyFactor); } // ──────────────────────────────────────────────────────────────────────────── // loadSlots — core initialization logic // 
──────────────────────────────────────────────────────────────────────────── template bool MultiGpuEngineManager::loadSlots( const ANSCENTER::Options& baseOptions, const std::string& modelPath, const std::array& subVals, const std::array& divVals, bool normalize, bool fromOnnx, int maxSlotsPerGpu, double memSafetyFactor) { // ────────────────────────────────────────────────────────────────── // 1. Enumerate GPUs // ────────────────────────────────────────────────────────────────── m_deviceInfos = enumerateDevices(); if (m_deviceInfos.empty()) { spdlog::error("MultiGpuEngineManager: No CUDA-capable GPUs detected"); return false; } spdlog::info("MultiGpuEngineManager: {} GPU(s) found:", m_deviceInfos.size()); for (const auto& d : m_deviceInfos) { spdlog::info(" GPU[{}] {} | SM {}.{} | Total {:.0f} MiB | Free {:.0f} MiB", d.index, d.name, d.computeMajor, d.computeMinor, d.totalMemoryBytes / 1048576.0, d.freeMemoryAtInitBytes / 1048576.0); } // Warn if the GPUs are heterogeneous — the TRT engine may be incompatible for (size_t i = 1; i < m_deviceInfos.size(); ++i) { if (m_deviceInfos[i].name != m_deviceInfos[0].name) { spdlog::warn("MultiGpuEngineManager: GPU[{}] '{}' differs from GPU[0] '{}'. " "TRT engine binaries may be incompatible with dissimilar GPUs.", i, m_deviceInfos[i].name, m_deviceInfos[0].name); } } // ────────────────────────────────────────────────────────────────── // 2. Load ONE probe engine on GPU 0 to measure per-slot VRAM usage. // // Memory delta = freeBeforeLoad − freeAfterLoad // This includes: TRT engine buffers, CUDA context overhead, and // any stream / workspace memory Engine allocates. 
// ────────────────────────────────────────────────────────────────── spdlog::info("MultiGpuEngineManager: Loading probe engine on GPU[0] " "to measure per-slot memory footprint..."); cudaSetDevice(0); size_t freeBefore = 0, tmp = 0; cudaMemGetInfo(&freeBefore, &tmp); ANSCENTER::Options opts0 = baseOptions; opts0.deviceIndex = 0; auto probeEngine = std::make_unique>(opts0); const bool probeOk = fromOnnx ? probeEngine->buildLoadNetwork(modelPath, subVals, divVals, normalize) : probeEngine->loadNetwork (modelPath, subVals, divVals, normalize); if (!probeOk) { spdlog::error("MultiGpuEngineManager: Probe engine failed to load on GPU[0]"); return false; } size_t freeAfter = 0; cudaMemGetInfo(&freeAfter, &tmp); // Guard against measurement noise: floor at 64 MiB constexpr size_t kMinSlotMemBytes = 64ULL * 1024 * 1024; const size_t rawDelta = (freeBefore > freeAfter) ? (freeBefore - freeAfter) : 0ULL; const size_t memPerSlot = std::max(rawDelta, kMinSlotMemBytes); spdlog::info("MultiGpuEngineManager: Memory per inference slot: {:.1f} MiB " "(measured delta = {:.1f} MiB)", memPerSlot / 1048576.0, rawDelta / 1048576.0); // Cache tensor dims — same for every slot since they all use the same model m_inputDims = probeEngine->getInputDims(); m_outputDims = probeEngine->getOutputDims(); // Promote the probe engine into slot 0 on device 0 { InferenceSlot s; s.deviceIndex = 0; s.busy = false; s.memUsed = memPerSlot; s.engine = std::move(probeEngine); m_slots.push_back(std::move(s)); } m_deviceInfos[0].slotsAllocated = 1; m_deviceInfos[0].memoryPerSlotBytes = memPerSlot; // ────────────────────────────────────────────────────────────────── // 3. Auto-cap: VRAM-fraction budget per model // // When maxSlotsPerGpu is -1 (the default), each model pool // auto-limits itself to kMaxVramFractionPerModel of total GPU // VRAM. This prevents the first model loaded from consuming all // memory and starving subsequent models in multi-model deployments. 
// We use *total* VRAM (not free) as the budget base so the cap is // consistent regardless of load order. // // maxSlotsPerGpu == -1 → auto-cap from VRAM fraction (default) // maxSlotsPerGpu > 0 → explicit cap (user override, unchanged) // ────────────────────────────────────────────────────────────────── constexpr double kMaxVramFractionPerModel = 0.25; // 25% of total VRAM int effectiveMaxSlotsPerGpu = maxSlotsPerGpu; if (maxSlotsPerGpu <= 0 && memPerSlot > 0) { const size_t totalVram = m_deviceInfos[0].totalMemoryBytes; const size_t vramBudget = static_cast( static_cast(totalVram) * kMaxVramFractionPerModel); const int autoCap = std::max(1, static_cast(vramBudget / memPerSlot)); effectiveMaxSlotsPerGpu = autoCap; spdlog::info("MultiGpuEngineManager: VRAM auto-cap = {} slot(s)/GPU " "(model {} MiB/slot, budget {} MiB = {}% of {} MiB total)", autoCap, memPerSlot / 1048576, vramBudget / 1048576, static_cast(kMaxVramFractionPerModel * 100), totalVram / 1048576); } // ────────────────────────────────────────────────────────────────── // 4. Fill remaining capacity on every GPU. // // For GPU 0: // freeNow already reflects probe usage → slotsToAdd is the count // of *additional* slots that fit, beyond the probe. // // For GPU 1+: // freeNow is the original available memory on that device, so // slotsToAdd is the *total* slots for that device. // ────────────────────────────────────────────────────────────────── for (int di = 0; di < static_cast(m_deviceInfos.size()); ++di) { cudaSetDevice(di); size_t freeNow = 0, totalNow = 0; cudaMemGetInfo(&freeNow, &totalNow); const size_t usableBytes = static_cast( static_cast(freeNow) * memSafetyFactor); // How many new Engine instances fit in the usable memory? int slotsToAdd = (memPerSlot > 0) ? static_cast(usableBytes / memPerSlot) : 0; // Apply VRAM-fraction auto-cap or explicit per-GPU cap. // GPU 0 already has the probe slot, so subtract 1 from its budget. if (effectiveMaxSlotsPerGpu > 0) { const int budget = (di == 0) ? 
(effectiveMaxSlotsPerGpu - 1) : effectiveMaxSlotsPerGpu; slotsToAdd = std::min(slotsToAdd, budget); } m_deviceInfos[di].memoryPerSlotBytes = memPerSlot; spdlog::info("MultiGpuEngineManager: GPU[{}] {} — " "free {:.0f} MiB, usable {:.0f} MiB → adding {} slot(s)", di, m_deviceInfos[di].name, freeNow / 1048576.0, usableBytes / 1048576.0, slotsToAdd); for (int s = 0; s < slotsToAdd; ++s) { ANSCENTER::Options opts = baseOptions; opts.deviceIndex = di; auto eng = std::make_unique>(opts); const bool ok = fromOnnx ? eng->buildLoadNetwork(modelPath, subVals, divVals, normalize) : eng->loadNetwork (modelPath, subVals, divVals, normalize); if (!ok) { spdlog::warn("MultiGpuEngineManager: GPU[{}] — slot {}/{} load failed; " "halting allocation on this device.", di, s + 1, slotsToAdd); break; } InferenceSlot slot; slot.deviceIndex = di; slot.busy = false; slot.memUsed = memPerSlot; slot.engine = std::move(eng); m_slots.push_back(std::move(slot)); m_deviceInfos[di].slotsAllocated++; } } m_totalCapacity = static_cast(m_slots.size()); printCapacityReport(); if (m_totalCapacity == 0) { spdlog::error("MultiGpuEngineManager: Zero inference slots allocated — " "check available GPU memory."); return false; } return true; } // ──────────────────────────────────────────────────────────────────────────── // runInference // ──────────────────────────────────────────────────────────────────────────── template bool MultiGpuEngineManager::runInference( const std::vector>& inputs, std::vector>>& featureVectors) { // ── Acquire the first idle slot ─────────────────────────────────────── // // Slots are stored in ascending device-index order (all device-0 slots // come first), so the scan naturally prefers device 0. The mutex is // held only for the O(N) scan + flag flip — NOT during the GPU kernel — // so threads using different slots proceed in parallel. 
InferenceSlot* slot = nullptr; { std::lock_guard lock(m_slotMutex); for (auto& s : m_slots) { if (!s.busy) { s.busy = true; slot = &s; break; } } } if (!slot) { // All slots are in use. Enforce the capacity limit by refusing the // request rather than crashing or waiting indefinitely. spdlog::warn("MultiGpuEngineManager: Capacity reached — " "all {}/{} inference slot(s) busy. " "Request rejected; release a running inference first.", m_activeCount.load(), m_totalCapacity); return false; } ++m_activeCount; // Set the calling thread's CUDA device context to match the slot's device. // Engine::loadNetwork() already did this internally when the engine was // created, and the CUDA streams inside are bound to that device; calling // cudaSetDevice here ensures the calling thread's context matches so that // stream operations and memory queries behave correctly in multi-threaded // scenarios where threads may have previously touched a different device. cudaSetDevice(slot->deviceIndex); const bool result = slot->engine->runInference(inputs, featureVectors); // ── Release the slot ────────────────────────────────────────────────── { std::lock_guard lock(m_slotMutex); slot->busy = false; } --m_activeCount; return result; } // ──────────────────────────────────────────────────────────────────────────── // printCapacityReport // ──────────────────────────────────────────────────────────────────────────── template void MultiGpuEngineManager::printCapacityReport() const { spdlog::info("============================================================"); spdlog::info(" MultiGpuEngineManager — Capacity Report"); spdlog::info("============================================================"); spdlog::info(" Total inference slots : {}", m_totalCapacity); spdlog::info(" Active inferences : {}", m_activeCount.load()); spdlog::info(" Available slots : {}", m_totalCapacity - m_activeCount.load()); spdlog::info("------------------------------------------------------------"); for (const auto& d : 
m_deviceInfos) { spdlog::info(" GPU[{:d}] {:s} | SM {:d}.{:d} | " "Total {:6.0f} MiB | Slots: {:2d} | Mem/slot: {:6.1f} MiB", d.index, d.name, d.computeMajor, d.computeMinor, d.totalMemoryBytes / 1048576.0, d.slotsAllocated, d.memoryPerSlotBytes / 1048576.0); } spdlog::info("============================================================"); }