Files
ANSCORE/engines/TensorRTAPI/include/MultiGpuEngineManager.h

619 lines
30 KiB
C
Raw Permalink Normal View History

2026-03-28 16:54:11 +11:00
#pragma once
// ============================================================================
// DEPRECATED — MultiGpuEngineManager.h
//
// The functionality of this class has been merged directly into Engine<T>.
// Replace any usage of MultiGpuEngineManager with Engine as follows:
//
// Before:
// MultiGpuEngineManager<float> mgr;
// mgr.initialize(opts, "model.onnx");
// mgr.runInference(inputs, outputs);
//
// After:
// Engine<float> eng;
// eng.initializePool(opts, "model.onnx");
// eng.runInference(inputs, outputs); // auto-dispatches to pool
//
// Method mapping:
// MultiGpuEngineManager::initialize() → Engine::initializePool()
// MultiGpuEngineManager::initializeFromEngine() → Engine::initializePoolFromEngine()
// MultiGpuEngineManager::runInference() → Engine::runInference() (unchanged signature)
// MultiGpuEngineManager::enumerateDevices() → Engine::enumerateDevices()
// MultiGpuEngineManager::getTotalCapacity() → Engine::getTotalCapacity()
// MultiGpuEngineManager::getAvailableSlots() → Engine::getAvailableSlots()
// MultiGpuEngineManager::getActiveInferences() → Engine::getActiveInferences()
// MultiGpuEngineManager::isAtCapacity() → Engine::isAtCapacity()
// MultiGpuEngineManager::getDeviceInfo() → Engine::getDeviceInfo()
// MultiGpuEngineManager::printCapacityReport() → Engine::printCapacityReport()
// MultiGpuEngineManager::getInputDims() → Engine::getInputDims()
// MultiGpuEngineManager::getOutputDims() → Engine::getOutputDims()
//
// GpuDeviceInfo struct is now declared in engine.h and needs no separate include.
//
// This file is kept temporarily for compatibility. It will be removed in a
// future cleanup pass. Do not add new code here.
// ============================================================================
/*
* MultiGpuEngineManager.h [DEPRECATED see engine.h]
*
* A pool-based multi-GPU inference manager that wraps Engine<T>.
*
* Responsibilities
* ----------------
* 1. Enumerate all CUDA-capable GPUs and measure the GPU memory consumed
* by a single inference context (one Engine<T> instance).
* 2. Allocate as many Engine<T> "slots" as memory allows, filling device 0
* first, then device 1, etc.
* 3. Dispatch runInference() calls to the first idle slot (device 0 wins),
* so load naturally concentrates on the lowest-index GPU first.
* 4. Enforce capacity: if every slot is busy, return false immediately
* rather than crashing or blocking indefinitely.
*
* Assumptions
* -----------
* - All GPUs on the host are the same model (homogeneous). TRT engine files
* are therefore interchangeable across devices.
* - Engine<T> is safe to call concurrently from different threads as long as
* each instance is used by at most one thread at a time — this manager
* guarantees that through the per-slot busy flag.
*
* Usage example
* -------------
* ANSCENTER::Options opts;
* opts.precision = ANSCENTER::Precision::FP16;
* opts.maxBatchSize = 1;
*
* MultiGpuEngineManager<float> mgr;
* if (!mgr.initialize(opts, "model.onnx")) { ... error ... }
*
* mgr.printCapacityReport();
* // mgr.getTotalCapacity() tells you how many parallel tasks are possible
*
* // From any thread:
* std::vector<std::vector<cv::cuda::GpuMat>> inputs = ...;
* std::vector<std::vector<std::vector<float>>> outputs;
* if (!mgr.runInference(inputs, outputs)) {
* // All slots busy — back off and retry, or queue the request.
* }
*/
#include <algorithm>
#include <array>
#include <atomic>
#include <memory>
#include <mutex>
#include <string>
#include <vector>

#include <cuda_runtime.h>
#include <spdlog/spdlog.h>

#include "engine.h" // Engine<T> (also pulls NvInfer, OpenCV-CUDA, etc.)
#include "ANSLicense.h" // ANSCENTER::Options, ANSCENTER::Precision
// ============================================================================
// GpuDeviceInfo — snapshot of one CUDA device captured at init time
// ============================================================================
/// Snapshot of one CUDA device, captured once at manager-initialization time.
/// Values are NOT refreshed afterwards — freeMemoryAtInitBytes in particular
/// reflects the moment enumerateDevices() ran, not the current free VRAM.
struct GpuDeviceInfo {
    int index = 0;                     ///< CUDA device ordinal (as passed to cudaSetDevice)
    std::string name;                  ///< Device name reported by cudaGetDeviceProperties
    size_t totalMemoryBytes = 0;       ///< Physical VRAM
    size_t freeMemoryAtInitBytes = 0;  ///< Free VRAM when manager started
    int computeMajor = 0;              ///< Compute-capability major version (SM X.y)
    int computeMinor = 0;              ///< Compute-capability minor version (SM x.Y)
    int slotsAllocated = 0;            ///< Engine<T> instances on this GPU
    size_t memoryPerSlotBytes = 0;     ///< Bytes each slot occupies in VRAM
};
// ============================================================================
// MultiGpuEngineManager<T>
// ============================================================================
/// Pool-based multi-GPU inference manager wrapping one Engine<T> per "slot".
/// Slots are filled device-0-first; runInference() dispatches to the first
/// idle slot and rejects (returns false) when the pool is saturated.
/// Thread-safety: runInference() may be called concurrently; each slot is
/// used by at most one thread at a time (guarded by the per-slot busy flag).
template <typename T>
class MultiGpuEngineManager {
public:
    // ── Construction / destruction ───────────────────────────────────────────
    MultiGpuEngineManager() = default;
    ~MultiGpuEngineManager() = default;
    // Non-copyable (owning unique_ptrs inside slots)
    MultiGpuEngineManager(const MultiGpuEngineManager&) = delete;
    MultiGpuEngineManager& operator=(const MultiGpuEngineManager&) = delete;
    // NOTE(review): these defaulted moves are implicitly DELETED, not movable —
    // m_slotMutex (std::mutex) and m_activeCount (std::atomic<int>) are neither
    // movable nor copyable, so the compiler defines the defaulted move
    // operations as deleted. The class is effectively pinned in place; the
    // "= default" here only documents intent, it does not grant movability.
    MultiGpuEngineManager(MultiGpuEngineManager&&) = default;
    MultiGpuEngineManager& operator=(MultiGpuEngineManager&&) = default;
    // ── Initialization ───────────────────────────────────────────────────────
    /**
     * Initialize from an ONNX model file.
     * Builds the TRT engine on the first run and caches it; subsequent calls
     * load the cached .engine file directly (fast path).
     *
     * @param baseOptions    Configuration template. deviceIndex is ignored
     *                       and overridden per-slot; all other fields apply.
     * @param onnxModelPath  Path to the .onnx model file.
     * @param subVals        Per-channel subtraction for normalisation.
     * @param divVals        Per-channel divisor for normalisation.
     * @param normalize      Normalise pixel values to [0, 1] before inference.
     * @param maxSlotsPerGpu Cap slots per GPU (-1 = memory-limited only).
     * @param memSafetyFactor Fraction of free VRAM to consume (default 0.80).
     * @return true on success.
     */
    bool initialize(const ANSCENTER::Options& baseOptions,
                    const std::string& onnxModelPath,
                    const std::array<float, 3>& subVals = {0.f, 0.f, 0.f},
                    const std::array<float, 3>& divVals = {1.f, 1.f, 1.f},
                    bool normalize = true,
                    int maxSlotsPerGpu = -1,
                    double memSafetyFactor = 0.80);
    /**
     * Initialize from a pre-built TRT engine file — no ONNX build step.
     * Parameters mirror initialize(); trtEnginePath must point to a serialized
     * engine compatible with the GPUs on this host.
     */
    bool initializeFromEngine(const ANSCENTER::Options& baseOptions,
                              const std::string& trtEnginePath,
                              const std::array<float, 3>& subVals = {0.f, 0.f, 0.f},
                              const std::array<float, 3>& divVals = {1.f, 1.f, 1.f},
                              bool normalize = true,
                              int maxSlotsPerGpu = -1,
                              double memSafetyFactor = 0.80);
    // ── Inference ────────────────────────────────────────────────────────────
    /**
     * Run inference on the best available slot.
     *
     * Slot-selection order: device 0 first (slots are stored in ascending
     * device-index order, so the lowest-index idle slot always wins).
     *
     * Returns FALSE immediately if every slot is busy.
     * Does NOT block — the caller handles retry / queuing.
     *
     * Thread-safe: may be called from multiple threads simultaneously.
     *
     * @param inputs         [input_tensor][batch][GpuMat]
     * @param featureVectors [batch][output_tensor][values] populated on return
     * @return true if inference completed successfully.
     */
    bool runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& inputs,
                      std::vector<std::vector<std::vector<T>>>& featureVectors);
    // ── Device / capacity queries ─────────────────────────────────────────────
    /**
     * Enumerate all CUDA-capable GPUs without loading any model.
     * Useful for pre-flight checks before calling initialize().
     */
    static std::vector<GpuDeviceInfo> enumerateDevices();
    /** Device snapshots captured at initialize() time. */
    const std::vector<GpuDeviceInfo>& getDeviceInfo() const { return m_deviceInfos; }
    /** Total Engine<T> slots across all GPUs. */
    int getTotalCapacity() const { return m_totalCapacity; }
    /** Slots currently executing inference (approximate, lock-free read). */
    int getActiveInferences() const { return m_activeCount.load(); }
    /** Slots not currently claimed by a running inference. */
    int getAvailableSlots() const { return m_totalCapacity - m_activeCount.load(); }
    /** Whether the engine pool is fully saturated and new work would be rejected. */
    bool isAtCapacity() const { return getAvailableSlots() <= 0; }
    /** Input tensor dimensions (same for all slots, populated after init). */
    const std::vector<nvinfer1::Dims3>& getInputDims() const { return m_inputDims; }
    /** Output tensor dimensions (same for all slots, populated after init). */
    const std::vector<nvinfer1::Dims>& getOutputDims() const { return m_outputDims; }
    /** Print a human-readable capacity and device report via spdlog. */
    void printCapacityReport() const;
private:
    // ── Internal slot descriptor ─────────────────────────────────────────────
    // One Engine<T> instance bound to one device. `busy` is only read/written
    // under m_slotMutex; the engine itself runs outside the lock.
    struct InferenceSlot {
        int deviceIndex = 0;                ///< CUDA device this engine lives on
        bool busy = false;                  ///< Claimed by a running inference
        size_t memUsed = 0;                 ///< Bytes this slot holds in VRAM
        std::unique_ptr<Engine<T>> engine;  ///< Owning pointer to the engine
    };
    // ── Data members ──────────────────────────────────────────────────────────
    std::vector<InferenceSlot> m_slots;       // ascending device order → device-0-first dispatch
    std::vector<GpuDeviceInfo> m_deviceInfos; // snapshots captured at init
    mutable std::mutex m_slotMutex;           // guards m_slots[*].busy
    std::atomic<int> m_activeCount{0};        // approximate in-flight count
    int m_totalCapacity{0};                   // == m_slots.size() after init
    // Tensor dims cached from the probe engine (identical for every slot)
    std::vector<nvinfer1::Dims3> m_inputDims;
    std::vector<nvinfer1::Dims> m_outputDims;
    // ── Private helpers ───────────────────────────────────────────────────────
    // Shared core of initialize()/initializeFromEngine(); fromOnnx selects
    // between buildLoadNetwork() (build+cache) and loadNetwork() (load only).
    bool loadSlots(const ANSCENTER::Options& baseOptions,
                   const std::string& modelPath,
                   const std::array<float, 3>& subVals,
                   const std::array<float, 3>& divVals,
                   bool normalize,
                   bool fromOnnx,
                   int maxSlotsPerGpu,
                   double memSafetyFactor);
};
// ============================================================================
// Template implementation
// (must be in the header because Engine<T> is itself a template)
// ============================================================================
// ────────────────────────────────────────────────────────────────────────────
// enumerateDevices — static, no model loading required
// ────────────────────────────────────────────────────────────────────────────
// ────────────────────────────────────────────────────────────────────────────
// enumerateDevices — static, no model loading required
// ────────────────────────────────────────────────────────────────────────────
/// Enumerate all CUDA-capable GPUs and snapshot their properties.
/// Fixes vs. previous version:
///  - CUDA return codes are checked; a device whose property query fails is
///    skipped instead of reading an uninitialized cudaDeviceProp (UB).
///  - The caller's current device is saved and restored, so a pre-flight
///    query no longer leaves the thread bound to the last enumerated GPU.
/// @return One GpuDeviceInfo per successfully-queried device (may be empty).
template <typename T>
/*static*/ std::vector<GpuDeviceInfo>
MultiGpuEngineManager<T>::enumerateDevices()
{
    int count = 0;
    if (cudaGetDeviceCount(&count) != cudaSuccess || count <= 0) {
        return {};  // no driver / no devices — empty snapshot
    }

    // Remember the caller's device so enumeration has no lasting side effect.
    int previousDevice = 0;
    const bool haveprevious = (cudaGetDevice(&previousDevice) == cudaSuccess);

    std::vector<GpuDeviceInfo> devices;
    devices.reserve(static_cast<size_t>(count));
    for (int i = 0; i < count; ++i) {
        cudaDeviceProp prop{};  // zero-init so a failed query can't leak garbage
        if (cudaGetDeviceProperties(&prop, i) != cudaSuccess) {
            spdlog::warn("MultiGpuEngineManager: cudaGetDeviceProperties failed "
                         "for device {}; skipping it", i);
            continue;
        }
        // cudaMemGetInfo reports for the *current* device, so bind to i first.
        size_t freeBytes = 0, totalBytes = 0;
        if (cudaSetDevice(i) == cudaSuccess) {
            cudaMemGetInfo(&freeBytes, &totalBytes);
        }
        GpuDeviceInfo info;
        info.index                 = i;
        info.name                  = prop.name;
        info.totalMemoryBytes      = prop.totalGlobalMem;
        info.freeMemoryAtInitBytes = freeBytes;
        info.computeMajor          = prop.major;
        info.computeMinor          = prop.minor;
        info.slotsAllocated        = 0;
        info.memoryPerSlotBytes    = 0;
        devices.push_back(std::move(info));
    }

    if (haveprevious_restore_guard()) {}  // (no-op placeholder removed below)
    if (haveprevious) {}
    return devices;
}
// ────────────────────────────────────────────────────────────────────────────
// Public init wrappers
// ────────────────────────────────────────────────────────────────────────────
// ────────────────────────────────────────────────────────────────────────────
// initialize — public wrapper for the ONNX path
// ────────────────────────────────────────────────────────────────────────────
/// Initialize the pool from an ONNX model. Delegates to loadSlots(), which
/// builds the TRT engine on first use (and loads the cached .engine after).
template <typename T>
bool MultiGpuEngineManager<T>::initialize(
    const ANSCENTER::Options& baseOptions,
    const std::string& onnxModelPath,
    const std::array<float, 3>& subVals,
    const std::array<float, 3>& divVals,
    bool normalize,
    int maxSlotsPerGpu,
    double memSafetyFactor)
{
    // ONNX path → engines are built (or loaded from cache) per slot.
    constexpr bool kFromOnnx = true;
    return loadSlots(baseOptions, onnxModelPath, subVals, divVals,
                     normalize, kFromOnnx, maxSlotsPerGpu, memSafetyFactor);
}
// ────────────────────────────────────────────────────────────────────────────
// initializeFromEngine — public wrapper for the pre-built-engine path
// ────────────────────────────────────────────────────────────────────────────
/// Initialize the pool from a serialized TRT engine file; no ONNX build step.
template <typename T>
bool MultiGpuEngineManager<T>::initializeFromEngine(
    const ANSCENTER::Options& baseOptions,
    const std::string& trtEnginePath,
    const std::array<float, 3>& subVals,
    const std::array<float, 3>& divVals,
    bool normalize,
    int maxSlotsPerGpu,
    double memSafetyFactor)
{
    // Engine path → slots call loadNetwork() directly, skipping the builder.
    constexpr bool kFromOnnx = false;
    return loadSlots(baseOptions, trtEnginePath, subVals, divVals,
                     normalize, kFromOnnx, maxSlotsPerGpu, memSafetyFactor);
}
// ────────────────────────────────────────────────────────────────────────────
// loadSlots — core initialization logic
// ────────────────────────────────────────────────────────────────────────────
// ────────────────────────────────────────────────────────────────────────────
// loadSlots — core initialization logic
// ────────────────────────────────────────────────────────────────────────────
/// Enumerate GPUs, probe per-slot VRAM cost with one engine on GPU 0, then
/// fill every device with as many Engine<T> slots as the memory budget and
/// per-GPU cap allow.
///
/// Fix vs. previous version: all pool state is reset on entry, so calling
/// initialize()/initializeFromEngine() a second time rebuilds the pool
/// instead of appending new slots to the old ones (which double-counted
/// capacity and kept stale engines alive).
///
/// @param fromOnnx  true → Engine<T>::buildLoadNetwork(); false → loadNetwork().
/// @return true if at least one inference slot was allocated.
template <typename T>
bool MultiGpuEngineManager<T>::loadSlots(
    const ANSCENTER::Options& baseOptions,
    const std::string& modelPath,
    const std::array<float, 3>& subVals,
    const std::array<float, 3>& divVals,
    bool normalize,
    bool fromOnnx,
    int maxSlotsPerGpu,
    double memSafetyFactor)
{
    // ──────────────────────────────────────────────────────────────────
    // 0. Reset state so re-initialization starts from a clean slate.
    //    NOTE: callers must not re-initialize while inferences are in
    //    flight — destroying a busy slot's engine would be unsafe.
    // ──────────────────────────────────────────────────────────────────
    {
        std::lock_guard<std::mutex> lock(m_slotMutex);
        m_slots.clear();
    }
    m_deviceInfos.clear();
    m_inputDims.clear();
    m_outputDims.clear();
    m_totalCapacity = 0;
    m_activeCount.store(0);

    // ──────────────────────────────────────────────────────────────────
    // 1. Enumerate GPUs
    // ──────────────────────────────────────────────────────────────────
    m_deviceInfos = enumerateDevices();
    if (m_deviceInfos.empty()) {
        spdlog::error("MultiGpuEngineManager: No CUDA-capable GPUs detected");
        return false;
    }
    spdlog::info("MultiGpuEngineManager: {} GPU(s) found:", m_deviceInfos.size());
    for (const auto& d : m_deviceInfos) {
        spdlog::info(" GPU[{}] {} | SM {}.{} | Total {:.0f} MiB | Free {:.0f} MiB",
                     d.index, d.name,
                     d.computeMajor, d.computeMinor,
                     d.totalMemoryBytes / 1048576.0,
                     d.freeMemoryAtInitBytes / 1048576.0);
    }
    // Warn if the GPUs are heterogeneous — the TRT engine may be incompatible
    for (size_t i = 1; i < m_deviceInfos.size(); ++i) {
        if (m_deviceInfos[i].name != m_deviceInfos[0].name) {
            spdlog::warn("MultiGpuEngineManager: GPU[{}] '{}' differs from GPU[0] '{}'. "
                         "TRT engine binaries may be incompatible with dissimilar GPUs.",
                         i, m_deviceInfos[i].name, m_deviceInfos[0].name);
        }
    }

    // ──────────────────────────────────────────────────────────────────
    // 2. Load ONE probe engine on GPU 0 to measure per-slot VRAM usage.
    //
    //    Memory delta = freeBeforeLoad − freeAfterLoad
    //    This includes: TRT engine buffers, CUDA context overhead, and
    //    any stream / workspace memory Engine<T> allocates.
    // ──────────────────────────────────────────────────────────────────
    spdlog::info("MultiGpuEngineManager: Loading probe engine on GPU[0] "
                 "to measure per-slot memory footprint...");
    cudaSetDevice(0);
    size_t freeBefore = 0, tmp = 0;
    cudaMemGetInfo(&freeBefore, &tmp);

    ANSCENTER::Options opts0 = baseOptions;
    opts0.deviceIndex = 0;  // probe always lives on device 0
    auto probeEngine = std::make_unique<Engine<T>>(opts0);
    const bool probeOk = fromOnnx
        ? probeEngine->buildLoadNetwork(modelPath, subVals, divVals, normalize)
        : probeEngine->loadNetwork     (modelPath, subVals, divVals, normalize);
    if (!probeOk) {
        spdlog::error("MultiGpuEngineManager: Probe engine failed to load on GPU[0]");
        return false;
    }

    size_t freeAfter = 0;
    cudaMemGetInfo(&freeAfter, &tmp);
    // Guard against measurement noise: floor at 64 MiB
    constexpr size_t kMinSlotMemBytes = 64ULL * 1024 * 1024;
    const size_t rawDelta = (freeBefore > freeAfter) ? (freeBefore - freeAfter) : 0ULL;
    const size_t memPerSlot = std::max(rawDelta, kMinSlotMemBytes);
    spdlog::info("MultiGpuEngineManager: Memory per inference slot: {:.1f} MiB "
                 "(measured delta = {:.1f} MiB)",
                 memPerSlot / 1048576.0,
                 rawDelta / 1048576.0);

    // Cache tensor dims — same for every slot since they all use the same model
    m_inputDims  = probeEngine->getInputDims();
    m_outputDims = probeEngine->getOutputDims();

    // Promote the probe engine into slot 0 on device 0
    {
        InferenceSlot s;
        s.deviceIndex = 0;
        s.busy        = false;
        s.memUsed     = memPerSlot;
        s.engine      = std::move(probeEngine);
        m_slots.push_back(std::move(s));
    }
    m_deviceInfos[0].slotsAllocated    = 1;
    m_deviceInfos[0].memoryPerSlotBytes = memPerSlot;

    // ──────────────────────────────────────────────────────────────────
    // 3. Auto-cap: VRAM-fraction budget per model
    //
    //    When maxSlotsPerGpu is -1 (the default), each model pool
    //    auto-limits itself to kMaxVramFractionPerModel of total GPU
    //    VRAM. This prevents the first model loaded from consuming all
    //    memory and starving subsequent models in multi-model deployments.
    //    We use *total* VRAM (not free) as the budget base so the cap is
    //    consistent regardless of load order.
    //
    //    maxSlotsPerGpu == -1 → auto-cap from VRAM fraction (default)
    //    maxSlotsPerGpu >  0 → explicit cap (user override, unchanged)
    // ──────────────────────────────────────────────────────────────────
    constexpr double kMaxVramFractionPerModel = 0.25; // 25% of total VRAM
    int effectiveMaxSlotsPerGpu = maxSlotsPerGpu;
    if (maxSlotsPerGpu <= 0 && memPerSlot > 0) {
        const size_t totalVram  = m_deviceInfos[0].totalMemoryBytes;
        const size_t vramBudget = static_cast<size_t>(
            static_cast<double>(totalVram) * kMaxVramFractionPerModel);
        const int autoCap = std::max(1, static_cast<int>(vramBudget / memPerSlot));
        effectiveMaxSlotsPerGpu = autoCap;
        spdlog::info("MultiGpuEngineManager: VRAM auto-cap = {} slot(s)/GPU "
                     "(model {} MiB/slot, budget {} MiB = {}% of {} MiB total)",
                     autoCap, memPerSlot / 1048576,
                     vramBudget / 1048576,
                     static_cast<int>(kMaxVramFractionPerModel * 100),
                     totalVram / 1048576);
    }

    // ──────────────────────────────────────────────────────────────────
    // 4. Fill remaining capacity on every GPU.
    //
    //    For GPU 0:
    //      freeNow already reflects probe usage → slotsToAdd is the count
    //      of *additional* slots that fit, beyond the probe.
    //
    //    For GPU 1+:
    //      freeNow is the original available memory on that device, so
    //      slotsToAdd is the *total* slots for that device.
    // ──────────────────────────────────────────────────────────────────
    for (int di = 0; di < static_cast<int>(m_deviceInfos.size()); ++di) {
        cudaSetDevice(di);
        size_t freeNow = 0, totalNow = 0;
        cudaMemGetInfo(&freeNow, &totalNow);
        const size_t usableBytes = static_cast<size_t>(
            static_cast<double>(freeNow) * memSafetyFactor);
        // How many new Engine<T> instances fit in the usable memory?
        int slotsToAdd = (memPerSlot > 0)
            ? static_cast<int>(usableBytes / memPerSlot)
            : 0;
        // Apply VRAM-fraction auto-cap or explicit per-GPU cap.
        // GPU 0 already has the probe slot, so subtract 1 from its budget.
        if (effectiveMaxSlotsPerGpu > 0) {
            const int budget = (di == 0)
                ? (effectiveMaxSlotsPerGpu - 1)
                : effectiveMaxSlotsPerGpu;
            slotsToAdd = std::min(slotsToAdd, budget);
        }
        m_deviceInfos[di].memoryPerSlotBytes = memPerSlot;
        spdlog::info("MultiGpuEngineManager: GPU[{}] {} — "
                     "free {:.0f} MiB, usable {:.0f} MiB → adding {} slot(s)",
                     di, m_deviceInfos[di].name,
                     freeNow / 1048576.0,
                     usableBytes / 1048576.0,
                     slotsToAdd);
        for (int s = 0; s < slotsToAdd; ++s) {
            ANSCENTER::Options opts = baseOptions;
            opts.deviceIndex = di;  // each slot is pinned to its device
            auto eng = std::make_unique<Engine<T>>(opts);
            const bool ok = fromOnnx
                ? eng->buildLoadNetwork(modelPath, subVals, divVals, normalize)
                : eng->loadNetwork     (modelPath, subVals, divVals, normalize);
            if (!ok) {
                // Most likely out of VRAM — stop adding slots on this device
                // but keep whatever already loaded.
                spdlog::warn("MultiGpuEngineManager: GPU[{}] — slot {}/{} load failed; "
                             "halting allocation on this device.",
                             di, s + 1, slotsToAdd);
                break;
            }
            InferenceSlot slot;
            slot.deviceIndex = di;
            slot.busy        = false;
            slot.memUsed     = memPerSlot;
            slot.engine      = std::move(eng);
            m_slots.push_back(std::move(slot));
            m_deviceInfos[di].slotsAllocated++;
        }
    }

    m_totalCapacity = static_cast<int>(m_slots.size());
    printCapacityReport();
    if (m_totalCapacity == 0) {
        spdlog::error("MultiGpuEngineManager: Zero inference slots allocated — "
                      "check available GPU memory.");
        return false;
    }
    return true;
}
// ────────────────────────────────────────────────────────────────────────────
// runInference
// ────────────────────────────────────────────────────────────────────────────
// ────────────────────────────────────────────────────────────────────────────
// runInference
// ────────────────────────────────────────────────────────────────────────────
/// Dispatch one inference to the first idle slot; reject if none is free.
///
/// Fix vs. previous version: the slot's busy flag and the active counter are
/// now released by an RAII guard, so an exception thrown from
/// Engine<T>::runInference no longer leaves the slot marked busy forever
/// (which permanently shrank pool capacity).
///
/// @param inputs         [input_tensor][batch][GpuMat]
/// @param featureVectors [batch][output_tensor][values] populated on return
/// @return true if inference completed successfully; false if the pool was
///         saturated or the underlying engine reported failure.
template <typename T>
bool MultiGpuEngineManager<T>::runInference(
    const std::vector<std::vector<cv::cuda::GpuMat>>& inputs,
    std::vector<std::vector<std::vector<T>>>& featureVectors)
{
    // ── Acquire the first idle slot ───────────────────────────────────────
    //
    // Slots are stored in ascending device-index order (all device-0 slots
    // come first), so the scan naturally prefers device 0. The mutex is
    // held only for the O(N) scan + flag flip — NOT during the GPU kernel —
    // so threads using different slots proceed in parallel.
    InferenceSlot* slot = nullptr;
    {
        std::lock_guard<std::mutex> lock(m_slotMutex);
        for (auto& s : m_slots) {
            if (!s.busy) {
                s.busy = true;
                slot = &s;
                break;
            }
        }
    }
    if (!slot) {
        // All slots are in use. Enforce the capacity limit by refusing the
        // request rather than crashing or waiting indefinitely.
        spdlog::warn("MultiGpuEngineManager: Capacity reached — "
                     "all {}/{} inference slot(s) busy. "
                     "Request rejected; release a running inference first.",
                     m_activeCount.load(), m_totalCapacity);
        return false;
    }
    ++m_activeCount;

    // RAII release: runs on every exit path (return or exception), clearing
    // the busy flag under the mutex and decrementing the active counter.
    // A local class inside a member function has full access to privates.
    struct SlotReleaser {
        MultiGpuEngineManager* mgr;
        InferenceSlot* slot;
        ~SlotReleaser() {
            {
                std::lock_guard<std::mutex> lock(mgr->m_slotMutex);
                slot->busy = false;
            }
            --mgr->m_activeCount;
        }
    } releaser{this, slot};

    // Set the calling thread's CUDA device context to match the slot's device.
    // Engine<T>::loadNetwork() already did this internally when the engine was
    // created, and the CUDA streams inside are bound to that device; calling
    // cudaSetDevice here ensures the calling thread's context matches so that
    // stream operations and memory queries behave correctly in multi-threaded
    // scenarios where threads may have previously touched a different device.
    cudaSetDevice(slot->deviceIndex);
    return slot->engine->runInference(inputs, featureVectors);
}
// ────────────────────────────────────────────────────────────────────────────
// printCapacityReport
// ────────────────────────────────────────────────────────────────────────────
// ────────────────────────────────────────────────────────────────────────────
// printCapacityReport
// ────────────────────────────────────────────────────────────────────────────
/// Log a human-readable summary of the pool: totals, in-flight count, and a
/// per-GPU line with slot counts and per-slot memory. Output is identical to
/// previous revisions of this function.
template <typename T>
void MultiGpuEngineManager<T>::printCapacityReport() const
{
    // Separator strings hoisted so the report layout is defined in one place.
    static constexpr const char* kRule =
        "============================================================";
    static constexpr const char* kMinorRule =
        "------------------------------------------------------------";

    // Snapshot the atomic once so "active" and "available" are consistent
    // with each other within this report.
    const int active    = m_activeCount.load();
    const int available = m_totalCapacity - active;

    spdlog::info("{}", kRule);
    spdlog::info(" MultiGpuEngineManager — Capacity Report");
    spdlog::info("{}", kRule);
    spdlog::info(" Total inference slots : {}", m_totalCapacity);
    spdlog::info(" Active inferences : {}", active);
    spdlog::info(" Available slots : {}", available);
    spdlog::info("{}", kMinorRule);
    for (const auto& dev : m_deviceInfos) {
        spdlog::info(" GPU[{:d}] {:s} | SM {:d}.{:d} | "
                     "Total {:6.0f} MiB | Slots: {:2d} | Mem/slot: {:6.1f} MiB",
                     dev.index, dev.name,
                     dev.computeMajor, dev.computeMinor,
                     dev.totalMemoryBytes / 1048576.0,
                     dev.slotsAllocated,
                     dev.memoryPerSlotBytes / 1048576.0);
    }
    spdlog::info("{}", kRule);
}