Files
ANSCORE/engines/TensorRTAPI/include/MultiGpuEngineManager.h

619 lines
30 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#pragma once
// ============================================================================
// DEPRECATED — MultiGpuEngineManager.h
//
// The functionality of this class has been merged directly into Engine<T>.
// Replace any usage of MultiGpuEngineManager with Engine as follows:
//
// Before:
// MultiGpuEngineManager<float> mgr;
// mgr.initialize(opts, "model.onnx");
// mgr.runInference(inputs, outputs);
//
// After:
// Engine<float> eng;
// eng.initializePool(opts, "model.onnx");
// eng.runInference(inputs, outputs); // auto-dispatches to pool
//
// Method mapping:
// MultiGpuEngineManager::initialize() → Engine::initializePool()
// MultiGpuEngineManager::initializeFromEngine() → Engine::initializePoolFromEngine()
// MultiGpuEngineManager::runInference() → Engine::runInference() (unchanged signature)
// MultiGpuEngineManager::enumerateDevices() → Engine::enumerateDevices()
// MultiGpuEngineManager::getTotalCapacity() → Engine::getTotalCapacity()
// MultiGpuEngineManager::getAvailableSlots() → Engine::getAvailableSlots()
// MultiGpuEngineManager::getActiveInferences() → Engine::getActiveInferences()
// MultiGpuEngineManager::isAtCapacity() → Engine::isAtCapacity()
// MultiGpuEngineManager::getDeviceInfo() → Engine::getDeviceInfo()
// MultiGpuEngineManager::printCapacityReport() → Engine::printCapacityReport()
// MultiGpuEngineManager::getInputDims() → Engine::getInputDims()
// MultiGpuEngineManager::getOutputDims() → Engine::getOutputDims()
//
// GpuDeviceInfo struct is now declared in engine.h and needs no separate include.
//
// This file is kept temporarily for compatibility. It will be removed in a
// future cleanup pass. Do not add new code here.
// ============================================================================
/*
* MultiGpuEngineManager.h [DEPRECATED — see engine.h]
*
* A pool-based multi-GPU inference manager that wraps Engine<T>.
*
* Responsibilities
* ----------------
* 1. Enumerate all CUDA-capable GPUs and measure the GPU memory consumed
* by a single inference context (one Engine<T> instance).
* 2. Allocate as many Engine<T> "slots" as memory allows, filling device 0
* first, then device 1, etc.
* 3. Dispatch runInference() calls to the first idle slot (device 0 wins),
* so load naturally concentrates on the lowest-index GPU first.
* 4. Enforce capacity: if every slot is busy, return false immediately
* rather than crashing or blocking indefinitely.
*
* Assumptions
* -----------
* - All GPUs on the host are the same model (homogeneous). TRT engine files
* are therefore interchangeable across devices.
* - Engine<T> is safe to call concurrently from different threads as long as
* each instance is used by at most one thread at a time — this manager
* guarantees that through the per-slot busy flag.
*
* Usage example
* -------------
* ANSCENTER::Options opts;
* opts.precision = ANSCENTER::Precision::FP16;
* opts.maxBatchSize = 1;
*
* MultiGpuEngineManager<float> mgr;
* if (!mgr.initialize(opts, "model.onnx")) { ... error ... }
*
* mgr.printCapacityReport();
* // mgr.getTotalCapacity() tells you how many parallel tasks are possible
*
* // From any thread:
* std::vector<std::vector<cv::cuda::GpuMat>> inputs = ...;
* std::vector<std::vector<std::vector<float>>> outputs;
* if (!mgr.runInference(inputs, outputs)) {
* // All slots busy — back off and retry, or queue the request.
* }
*/
#include <algorithm>
#include <array>
#include <atomic>
#include <memory>
#include <mutex>
#include <string>
#include <vector>

#include <cuda_runtime.h>
#include <spdlog/spdlog.h>

#include "engine.h" // Engine<T> (also pulls NvInfer, OpenCV-CUDA, etc.)
#include "ANSLicense.h" // ANSCENTER::Options, ANSCENTER::Precision
// ============================================================================
// GpuDeviceInfo — snapshot of one CUDA device captured at init time
// ============================================================================
/// Snapshot of one CUDA device. Filled by enumerateDevices(); the slot
/// fields (slotsAllocated / memoryPerSlotBytes) start at zero and are
/// populated later by loadSlots() as Engine<T> instances are created.
struct GpuDeviceInfo {
int index = 0; ///< CUDA device ordinal (as passed to cudaSetDevice)
std::string name; ///< Device name from cudaGetDeviceProperties
size_t totalMemoryBytes = 0; ///< Physical VRAM
size_t freeMemoryAtInitBytes = 0; ///< Free VRAM when manager started
int computeMajor = 0; ///< Compute capability, major version
int computeMinor = 0; ///< Compute capability, minor version
int slotsAllocated = 0; ///< Engine<T> instances on this GPU
size_t memoryPerSlotBytes = 0; ///< Bytes each slot occupies in VRAM
};
// ============================================================================
// MultiGpuEngineManager<T>
// ============================================================================
/**
 * Pool-based multi-GPU inference manager wrapping Engine<T>.
 *
 * Allocates as many Engine<T> "slots" as VRAM allows across all detected
 * GPUs (device 0 filled first) and dispatches runInference() calls to the
 * first idle slot. Rejects (returns false) instead of blocking when every
 * slot is busy. See the file header for the full usage example.
 */
template <typename T>
class MultiGpuEngineManager {
public:
    // ── Construction / destruction ───────────────────────────────────────────
    MultiGpuEngineManager() = default;
    ~MultiGpuEngineManager() = default;

    // Non-copyable: slots hold owning unique_ptr<Engine<T>> instances.
    MultiGpuEngineManager(const MultiGpuEngineManager&) = delete;
    MultiGpuEngineManager& operator=(const MultiGpuEngineManager&) = delete;

    // Non-movable. The previous "= default" move operations were implicitly
    // defined as *deleted* anyway, because std::mutex and std::atomic<int>
    // members are not movable — so the old "Movable" comment advertised a
    // capability that never compiled. Deleting them explicitly documents
    // reality. (Moving would also invalidate the InferenceSlot* pointers
    // held by in-flight runInference() calls.)
    MultiGpuEngineManager(MultiGpuEngineManager&&) = delete;
    MultiGpuEngineManager& operator=(MultiGpuEngineManager&&) = delete;

    // ── Initialization ───────────────────────────────────────────────────────
    /**
     * Initialize from an ONNX model file.
     * Builds the TRT engine on the first run and caches it; subsequent calls
     * load the cached .engine file directly (fast path).
     *
     * @param baseOptions Configuration template. deviceIndex is ignored
     *                    and overridden per-slot; all other fields apply.
     * @param onnxModelPath Path to the .onnx model file.
     * @param subVals Per-channel subtraction for normalisation.
     * @param divVals Per-channel divisor for normalisation.
     * @param normalize Normalise pixel values to [0, 1] before inference.
     * @param maxSlotsPerGpu Cap slots per GPU (-1 = memory-limited only).
     * @param memSafetyFactor Fraction of free VRAM to consume (default 0.80).
     * @return true on success.
     */
    bool initialize(const ANSCENTER::Options& baseOptions,
                    const std::string& onnxModelPath,
                    const std::array<float, 3>& subVals = {0.f, 0.f, 0.f},
                    const std::array<float, 3>& divVals = {1.f, 1.f, 1.f},
                    bool normalize = true,
                    int maxSlotsPerGpu = -1,
                    double memSafetyFactor = 0.80);

    /**
     * Initialize from a pre-built TRT engine file — no ONNX build step.
     * Parameters are identical to initialize() except the model path points
     * at a serialized .engine file.
     */
    bool initializeFromEngine(const ANSCENTER::Options& baseOptions,
                              const std::string& trtEnginePath,
                              const std::array<float, 3>& subVals = {0.f, 0.f, 0.f},
                              const std::array<float, 3>& divVals = {1.f, 1.f, 1.f},
                              bool normalize = true,
                              int maxSlotsPerGpu = -1,
                              double memSafetyFactor = 0.80);

    // ── Inference ────────────────────────────────────────────────────────────
    /**
     * Run inference on the best available slot.
     *
     * Slot-selection order: device 0 first (slots are stored in ascending
     * device-index order, so the lowest-index idle slot always wins).
     *
     * Returns FALSE immediately if every slot is busy.
     * Does NOT block — the caller handles retry / queuing.
     *
     * Thread-safe: may be called from multiple threads simultaneously.
     *
     * @param inputs [input_tensor][batch][GpuMat]
     * @param featureVectors [batch][output_tensor][values] — populated on return
     * @return true if inference completed successfully.
     */
    bool runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& inputs,
                      std::vector<std::vector<std::vector<T>>>& featureVectors);

    // ── Device / capacity queries ─────────────────────────────────────────────
    /**
     * Enumerate all CUDA-capable GPUs without loading any model.
     * Useful for pre-flight checks before calling initialize().
     */
    static std::vector<GpuDeviceInfo> enumerateDevices();

    /** Device snapshots captured at initialize() time. */
    const std::vector<GpuDeviceInfo>& getDeviceInfo() const { return m_deviceInfos; }

    /** Total Engine<T> slots across all GPUs. */
    int getTotalCapacity() const { return m_totalCapacity; }

    /** Slots currently executing inference (approximate, lock-free read). */
    int getActiveInferences() const { return m_activeCount.load(); }

    /** Slots not currently claimed by a running inference. */
    int getAvailableSlots() const { return m_totalCapacity - m_activeCount.load(); }

    /** Whether the engine pool is fully saturated and new work would be rejected. */
    bool isAtCapacity() const { return getAvailableSlots() <= 0; }

    /** Input tensor dimensions (same for all slots, populated after init). */
    const std::vector<nvinfer1::Dims3>& getInputDims() const { return m_inputDims; }

    /** Output tensor dimensions (same for all slots, populated after init). */
    const std::vector<nvinfer1::Dims>& getOutputDims() const { return m_outputDims; }

    /** Print a human-readable capacity and device report via spdlog. */
    void printCapacityReport() const;

private:
    // ── Internal slot descriptor ─────────────────────────────────────────────
    struct InferenceSlot {
        int deviceIndex = 0;                ///< GPU this slot's engine lives on
        bool busy = false;                  ///< Claimed by a running inference (guarded by m_slotMutex)
        size_t memUsed = 0;                 ///< Bytes this slot holds in VRAM
        std::unique_ptr<Engine<T>> engine;  ///< Owning pointer to the per-slot engine
    };

    // ── Data members ──────────────────────────────────────────────────────────
    std::vector<InferenceSlot> m_slots;       ///< All slots, ascending device index
    std::vector<GpuDeviceInfo> m_deviceInfos; ///< Per-device snapshot from init
    mutable std::mutex m_slotMutex;           ///< Guards the busy flags in m_slots
    std::atomic<int> m_activeCount{0};        ///< Inferences currently running
    int m_totalCapacity{0};                   ///< == m_slots.size() after init

    // Tensor dims cached from the probe engine (identical for every slot)
    std::vector<nvinfer1::Dims3> m_inputDims;
    std::vector<nvinfer1::Dims> m_outputDims;

    // ── Private helpers ───────────────────────────────────────────────────────
    /// Core init shared by initialize()/initializeFromEngine();
    /// fromOnnx selects buildLoadNetwork() vs loadNetwork().
    bool loadSlots(const ANSCENTER::Options& baseOptions,
                   const std::string& modelPath,
                   const std::array<float, 3>& subVals,
                   const std::array<float, 3>& divVals,
                   bool normalize,
                   bool fromOnnx,
                   int maxSlotsPerGpu,
                   double memSafetyFactor);
};
// ============================================================================
// Template implementation
// (must be in the header because Engine<T> is itself a template)
// ============================================================================
// ────────────────────────────────────────────────────────────────────────────
// enumerateDevices — static, no model loading required
// ────────────────────────────────────────────────────────────────────────────
// ────────────────────────────────────────────────────────────────────────────
// enumerateDevices — static, no model loading required
//
// Fix: the CUDA runtime return codes were previously ignored. A failing
// cudaGetDeviceProperties() left `prop` uninitialized, so garbage bytes were
// copied into info.name; a failing cudaGetDeviceCount() was silently treated
// as "zero devices" with no diagnostic. All calls are now checked and failed
// devices are skipped with a warning.
//
// NOTE: this intentionally keeps the original side effect of leaving the
// calling thread's current CUDA device set to the last enumerated GPU.
// ────────────────────────────────────────────────────────────────────────────
template <typename T>
/*static*/ std::vector<GpuDeviceInfo>
MultiGpuEngineManager<T>::enumerateDevices()
{
    int count = 0;
    if (cudaGetDeviceCount(&count) != cudaSuccess) {
        spdlog::error("MultiGpuEngineManager: cudaGetDeviceCount failed — "
                      "is a CUDA driver installed?");
        return {};
    }
    std::vector<GpuDeviceInfo> devices;
    devices.reserve(static_cast<size_t>(count));
    for (int i = 0; i < count; ++i) {
        cudaDeviceProp prop{};  // zero-init so a partial failure never leaks garbage
        if (cudaGetDeviceProperties(&prop, i) != cudaSuccess) {
            spdlog::warn("MultiGpuEngineManager: cudaGetDeviceProperties failed "
                         "for device {} — skipping it", i);
            continue;
        }
        size_t freeBytes = 0, totalBytes = 0;
        if (cudaSetDevice(i) != cudaSuccess ||
            cudaMemGetInfo(&freeBytes, &totalBytes) != cudaSuccess) {
            spdlog::warn("MultiGpuEngineManager: could not query memory on "
                         "device {} — reporting 0 free bytes", i);
            freeBytes = 0;  // device is still listed; capacity math treats it as full
        }
        GpuDeviceInfo info;
        info.index = i;
        info.name = prop.name;
        info.totalMemoryBytes = prop.totalGlobalMem;
        info.freeMemoryAtInitBytes = freeBytes;
        info.computeMajor = prop.major;
        info.computeMinor = prop.minor;
        info.slotsAllocated = 0;
        info.memoryPerSlotBytes = 0;
        devices.push_back(std::move(info));
    }
    return devices;
}
// ────────────────────────────────────────────────────────────────────────────
// Public init wrappers
// ────────────────────────────────────────────────────────────────────────────
// ────────────────────────────────────────────────────────────────────────────
// initialize — public wrapper: build (or load cached) TRT engine from ONNX
// ────────────────────────────────────────────────────────────────────────────
template <typename T>
bool MultiGpuEngineManager<T>::initialize(
    const ANSCENTER::Options& baseOptions,
    const std::string& onnxModelPath,
    const std::array<float, 3>& subVals,
    const std::array<float, 3>& divVals,
    bool normalize,
    int maxSlotsPerGpu,
    double memSafetyFactor)
{
    // Thin forwarder: all real work happens in loadSlots(), invoked here in
    // ONNX-build mode (Engine<T>::buildLoadNetwork is used per slot).
    constexpr bool kFromOnnx = true;
    return loadSlots(baseOptions, onnxModelPath, subVals, divVals,
                     normalize, kFromOnnx, maxSlotsPerGpu, memSafetyFactor);
}
// ────────────────────────────────────────────────────────────────────────────
// initializeFromEngine — public wrapper: load a pre-built .engine file
// ────────────────────────────────────────────────────────────────────────────
template <typename T>
bool MultiGpuEngineManager<T>::initializeFromEngine(
    const ANSCENTER::Options& baseOptions,
    const std::string& trtEnginePath,
    const std::array<float, 3>& subVals,
    const std::array<float, 3>& divVals,
    bool normalize,
    int maxSlotsPerGpu,
    double memSafetyFactor)
{
    // Thin forwarder: identical to initialize() except loadSlots() runs in
    // direct-load mode (Engine<T>::loadNetwork — no ONNX build step).
    constexpr bool kFromOnnx = false;
    return loadSlots(baseOptions, trtEnginePath, subVals, divVals,
                     normalize, kFromOnnx, maxSlotsPerGpu, memSafetyFactor);
}
// ────────────────────────────────────────────────────────────────────────────
// loadSlots — core initialization logic
// ────────────────────────────────────────────────────────────────────────────
/**
 * Core initialization: enumerate GPUs, measure per-slot VRAM cost with a
 * probe engine on GPU 0, then fill every device with as many slots as the
 * memory budget allows.
 *
 * @param fromOnnx true → Engine<T>::buildLoadNetwork (ONNX build/cache);
 *                 false → Engine<T>::loadNetwork (pre-built .engine file).
 * @return true if at least one slot was allocated.
 *
 * NOTE(review): not thread-safe — assumed to be called once, before any
 * runInference() calls; confirm with callers.
 */
template <typename T>
bool MultiGpuEngineManager<T>::loadSlots(
const ANSCENTER::Options& baseOptions,
const std::string& modelPath,
const std::array<float, 3>& subVals,
const std::array<float, 3>& divVals,
bool normalize,
bool fromOnnx,
int maxSlotsPerGpu,
double memSafetyFactor)
{
// ──────────────────────────────────────────────────────────────────
// 1. Enumerate GPUs
// ──────────────────────────────────────────────────────────────────
m_deviceInfos = enumerateDevices();
if (m_deviceInfos.empty()) {
spdlog::error("MultiGpuEngineManager: No CUDA-capable GPUs detected");
return false;
}
spdlog::info("MultiGpuEngineManager: {} GPU(s) found:", m_deviceInfos.size());
for (const auto& d : m_deviceInfos) {
spdlog::info(" GPU[{}] {} | SM {}.{} | Total {:.0f} MiB | Free {:.0f} MiB",
d.index, d.name,
d.computeMajor, d.computeMinor,
d.totalMemoryBytes / 1048576.0,
d.freeMemoryAtInitBytes / 1048576.0);
}
// Warn if the GPUs are heterogeneous — the TRT engine may be incompatible.
// (Name comparison against GPU[0] only; we warn rather than fail because the
// per-slot load below will surface any real incompatibility per device.)
for (size_t i = 1; i < m_deviceInfos.size(); ++i) {
if (m_deviceInfos[i].name != m_deviceInfos[0].name) {
spdlog::warn("MultiGpuEngineManager: GPU[{}] '{}' differs from GPU[0] '{}'. "
"TRT engine binaries may be incompatible with dissimilar GPUs.",
i, m_deviceInfos[i].name, m_deviceInfos[0].name);
}
}
// ──────────────────────────────────────────────────────────────────
// 2. Load ONE probe engine on GPU 0 to measure per-slot VRAM usage.
//
// Memory delta = freeBeforeLoad - freeAfterLoad
// This includes: TRT engine buffers, CUDA context overhead, and
// any stream / workspace memory Engine<T> allocates.
// ──────────────────────────────────────────────────────────────────
spdlog::info("MultiGpuEngineManager: Loading probe engine on GPU[0] "
"to measure per-slot memory footprint...");
cudaSetDevice(0);
size_t freeBefore = 0, tmp = 0;
cudaMemGetInfo(&freeBefore, &tmp);
// deviceIndex is the only per-slot field we override; every other option
// comes from the caller's template unchanged.
ANSCENTER::Options opts0 = baseOptions;
opts0.deviceIndex = 0;
auto probeEngine = std::make_unique<Engine<T>>(opts0);
const bool probeOk = fromOnnx
? probeEngine->buildLoadNetwork(modelPath, subVals, divVals, normalize)
: probeEngine->loadNetwork (modelPath, subVals, divVals, normalize);
if (!probeOk) {
spdlog::error("MultiGpuEngineManager: Probe engine failed to load on GPU[0]");
return false;
}
size_t freeAfter = 0;
cudaMemGetInfo(&freeAfter, &tmp);
// Guard against measurement noise: floor at 64 MiB.
// (The free-memory delta can under-report — or even go negative — if other
// processes release VRAM between the two cudaMemGetInfo calls.)
constexpr size_t kMinSlotMemBytes = 64ULL * 1024 * 1024;
const size_t rawDelta = (freeBefore > freeAfter) ? (freeBefore - freeAfter) : 0ULL;
const size_t memPerSlot = std::max(rawDelta, kMinSlotMemBytes);
spdlog::info("MultiGpuEngineManager: Memory per inference slot: {:.1f} MiB "
"(measured delta = {:.1f} MiB)",
memPerSlot / 1048576.0,
rawDelta / 1048576.0);
// Cache tensor dims — same for every slot since they all use the same model
m_inputDims = probeEngine->getInputDims();
m_outputDims = probeEngine->getOutputDims();
// Promote the probe engine into slot 0 on device 0 (it is a fully usable
// engine; discarding it would waste the load we just paid for).
{
InferenceSlot s;
s.deviceIndex = 0;
s.busy = false;
s.memUsed = memPerSlot;
s.engine = std::move(probeEngine);
m_slots.push_back(std::move(s));
}
m_deviceInfos[0].slotsAllocated = 1;
m_deviceInfos[0].memoryPerSlotBytes = memPerSlot;
// ──────────────────────────────────────────────────────────────────
// 3. Auto-cap: VRAM-fraction budget per model
//
// When maxSlotsPerGpu is -1 (the default), each model pool
// auto-limits itself to kMaxVramFractionPerModel of total GPU
// VRAM. This prevents the first model loaded from consuming all
// memory and starving subsequent models in multi-model deployments.
// We use *total* VRAM (not free) as the budget base so the cap is
// consistent regardless of load order.
//
// maxSlotsPerGpu == -1 → auto-cap from VRAM fraction (default)
// maxSlotsPerGpu > 0 → explicit cap (user override, unchanged)
// ──────────────────────────────────────────────────────────────────
constexpr double kMaxVramFractionPerModel = 0.25; // 25% of total VRAM
int effectiveMaxSlotsPerGpu = maxSlotsPerGpu;
// memPerSlot > 0 always holds (floored at 64 MiB above); kept as a belt-and-
// braces guard against division by zero below.
if (maxSlotsPerGpu <= 0 && memPerSlot > 0) {
// Budget is computed from GPU 0's total VRAM; assumes homogeneous GPUs
// (see the heterogeneity warning in step 1).
const size_t totalVram = m_deviceInfos[0].totalMemoryBytes;
const size_t vramBudget = static_cast<size_t>(
static_cast<double>(totalVram) * kMaxVramFractionPerModel);
const int autoCap = std::max(1, static_cast<int>(vramBudget / memPerSlot));
effectiveMaxSlotsPerGpu = autoCap;
spdlog::info("MultiGpuEngineManager: VRAM auto-cap = {} slot(s)/GPU "
"(model {} MiB/slot, budget {} MiB = {}% of {} MiB total)",
autoCap, memPerSlot / 1048576,
vramBudget / 1048576,
static_cast<int>(kMaxVramFractionPerModel * 100),
totalVram / 1048576);
}
// ──────────────────────────────────────────────────────────────────
// 4. Fill remaining capacity on every GPU.
//
// For GPU 0:
// freeNow already reflects probe usage → slotsToAdd is the count
// of *additional* slots that fit, beyond the probe.
//
// For GPU 1+:
// freeNow is the original available memory on that device, so
// slotsToAdd is the *total* slots for that device.
// ──────────────────────────────────────────────────────────────────
for (int di = 0; di < static_cast<int>(m_deviceInfos.size()); ++di) {
cudaSetDevice(di);
size_t freeNow = 0, totalNow = 0;
cudaMemGetInfo(&freeNow, &totalNow);
const size_t usableBytes = static_cast<size_t>(
static_cast<double>(freeNow) * memSafetyFactor);
// How many new Engine<T> instances fit in the usable memory?
int slotsToAdd = (memPerSlot > 0)
? static_cast<int>(usableBytes / memPerSlot)
: 0;
// Apply VRAM-fraction auto-cap or explicit per-GPU cap.
// GPU 0 already has the probe slot, so subtract 1 from its budget.
if (effectiveMaxSlotsPerGpu > 0) {
const int budget = (di == 0)
? (effectiveMaxSlotsPerGpu - 1)
: effectiveMaxSlotsPerGpu;
slotsToAdd = std::min(slotsToAdd, budget);
}
m_deviceInfos[di].memoryPerSlotBytes = memPerSlot;
spdlog::info("MultiGpuEngineManager: GPU[{}] {} — "
"free {:.0f} MiB, usable {:.0f} MiB → adding {} slot(s)",
di, m_deviceInfos[di].name,
freeNow / 1048576.0,
usableBytes / 1048576.0,
slotsToAdd);
for (int s = 0; s < slotsToAdd; ++s) {
ANSCENTER::Options opts = baseOptions;
opts.deviceIndex = di;
auto eng = std::make_unique<Engine<T>>(opts);
const bool ok = fromOnnx
? eng->buildLoadNetwork(modelPath, subVals, divVals, normalize)
: eng->loadNetwork (modelPath, subVals, divVals, normalize);
// A load failure usually means this device is out of memory; stop
// adding slots here but keep whatever was already allocated.
if (!ok) {
spdlog::warn("MultiGpuEngineManager: GPU[{}] — slot {}/{} load failed; "
"halting allocation on this device.",
di, s + 1, slotsToAdd);
break;
}
InferenceSlot slot;
slot.deviceIndex = di;
slot.busy = false;
slot.memUsed = memPerSlot;
slot.engine = std::move(eng);
m_slots.push_back(std::move(slot));
m_deviceInfos[di].slotsAllocated++;
}
}
m_totalCapacity = static_cast<int>(m_slots.size());
printCapacityReport();
if (m_totalCapacity == 0) {
spdlog::error("MultiGpuEngineManager: Zero inference slots allocated — "
"check available GPU memory.");
return false;
}
return true;
}
// ────────────────────────────────────────────────────────────────────────────
// runInference
// ────────────────────────────────────────────────────────────────────────────
// ────────────────────────────────────────────────────────────────────────────
// runInference
//
// Fix: slot release is now exception-safe. Previously, if the underlying
// Engine<T>::runInference() threw, the slot's busy flag was never cleared and
// m_activeCount was never decremented — permanently leaking one unit of pool
// capacity per escaped exception until the pool reported "at capacity" and
// rejected all work.
// ────────────────────────────────────────────────────────────────────────────
template <typename T>
bool MultiGpuEngineManager<T>::runInference(
    const std::vector<std::vector<cv::cuda::GpuMat>>& inputs,
    std::vector<std::vector<std::vector<T>>>& featureVectors)
{
    // ── Acquire the first idle slot ───────────────────────────────────────
    //
    // Slots are stored in ascending device-index order (all device-0 slots
    // come first), so the scan naturally prefers device 0. The mutex is
    // held only for the O(N) scan + flag flip — NOT during the GPU kernel —
    // so threads using different slots proceed in parallel.
    InferenceSlot* slot = nullptr;
    {
        std::lock_guard<std::mutex> lock(m_slotMutex);
        for (auto& s : m_slots) {
            if (!s.busy) {
                s.busy = true;
                slot = &s;
                break;
            }
        }
    }
    if (!slot) {
        // All slots are in use. Enforce the capacity limit by refusing the
        // request rather than crashing or waiting indefinitely.
        spdlog::warn("MultiGpuEngineManager: Capacity reached — "
                     "all {}/{} inference slot(s) busy. "
                     "Request rejected; release a running inference first.",
                     m_activeCount.load(), m_totalCapacity);
        return false;
    }
    ++m_activeCount;

    // Release helper shared by the success and exception paths, so the slot
    // and the active counter can never leak.
    auto releaseSlot = [this, slot]() {
        {
            std::lock_guard<std::mutex> lock(m_slotMutex);
            slot->busy = false;
        }
        --m_activeCount;
    };

    bool result = false;
    try {
        // Set the calling thread's CUDA device context to match the slot's
        // device. Engine<T>::loadNetwork() already did this internally when
        // the engine was created, and the CUDA streams inside are bound to
        // that device; calling cudaSetDevice here ensures the calling
        // thread's context matches so that stream operations and memory
        // queries behave correctly in multi-threaded scenarios where threads
        // may have previously touched a different device.
        cudaSetDevice(slot->deviceIndex);
        result = slot->engine->runInference(inputs, featureVectors);
    } catch (...) {
        releaseSlot();  // exception path: return the slot before propagating
        throw;
    }

    // ── Release the slot (success path) ───────────────────────────────────
    releaseSlot();
    return result;
}
// ────────────────────────────────────────────────────────────────────────────
// printCapacityReport
// ────────────────────────────────────────────────────────────────────────────
// ────────────────────────────────────────────────────────────────────────────
// printCapacityReport — dump pool totals plus one line per GPU via spdlog
// ────────────────────────────────────────────────────────────────────────────
template <typename T>
void MultiGpuEngineManager<T>::printCapacityReport() const
{
    constexpr double kBytesPerMiB = 1048576.0;  // MiB divisor for readability

    spdlog::info("============================================================");
    spdlog::info(" MultiGpuEngineManager — Capacity Report");
    spdlog::info("============================================================");
    spdlog::info(" Total inference slots : {}", m_totalCapacity);
    spdlog::info(" Active inferences : {}", m_activeCount.load());
    spdlog::info(" Available slots : {}",
                 m_totalCapacity - m_activeCount.load());
    spdlog::info("------------------------------------------------------------");

    // One line per device: identity, compute capability, VRAM, slot usage.
    for (const auto& dev : m_deviceInfos) {
        spdlog::info(" GPU[{:d}] {:s} | SM {:d}.{:d} | "
                     "Total {:6.0f} MiB | Slots: {:2d} | Mem/slot: {:6.1f} MiB",
                     dev.index, dev.name,
                     dev.computeMajor, dev.computeMinor,
                     dev.totalMemoryBytes / kBytesPerMiB,
                     dev.slotsAllocated,
                     dev.memoryPerSlotBytes / kBytesPerMiB);
    }
    spdlog::info("============================================================");
}