#pragma once
// ============================================================================
// DEPRECATED — MultiGpuEngineManager.h
//
// The functionality of this class has been merged directly into Engine<T>.
// Replace any usage of MultiGpuEngineManager with Engine as follows:
//
//   Before:
//     MultiGpuEngineManager<float> mgr;
//     mgr.initialize(opts, "model.onnx");
//     mgr.runInference(inputs, outputs);
//
//   After:
//     Engine<float> eng;
//     eng.initializePool(opts, "model.onnx");
//     eng.runInference(inputs, outputs);   // auto-dispatches to pool
//
// Method mapping:
//   MultiGpuEngineManager::initialize()           → Engine::initializePool()
//   MultiGpuEngineManager::initializeFromEngine() → Engine::initializePoolFromEngine()
//   MultiGpuEngineManager::runInference()         → Engine::runInference() (unchanged signature)
//   MultiGpuEngineManager::enumerateDevices()     → Engine::enumerateDevices()
//   MultiGpuEngineManager::getTotalCapacity()     → Engine::getTotalCapacity()
//   MultiGpuEngineManager::getAvailableSlots()    → Engine::getAvailableSlots()
//   MultiGpuEngineManager::getActiveInferences()  → Engine::getActiveInferences()
//   MultiGpuEngineManager::isAtCapacity()         → Engine::isAtCapacity()
//   MultiGpuEngineManager::getDeviceInfo()        → Engine::getDeviceInfo()
//   MultiGpuEngineManager::printCapacityReport()  → Engine::printCapacityReport()
//   MultiGpuEngineManager::getInputDims()         → Engine::getInputDims()
//   MultiGpuEngineManager::getOutputDims()        → Engine::getOutputDims()
//
// GpuDeviceInfo struct is now declared in engine.h and needs no separate include.
//
// This file is kept temporarily for compatibility. It will be removed in a
// future cleanup pass. Do not add new code here.
// ============================================================================
/*
 * MultiGpuEngineManager.h [DEPRECATED — see engine.h]
 *
 * A pool-based multi-GPU inference manager that wraps Engine<T>.
 *
 * Responsibilities
 * ----------------
 *  1. Enumerate all CUDA-capable GPUs and measure the GPU memory consumed
 *     by a single inference context (one Engine<T> instance).
 *  2. Allocate as many Engine<T> "slots" as memory allows, filling device 0
 *     first, then device 1, etc.
 *  3. Dispatch runInference() calls to the first idle slot (device 0 wins),
 *     so load naturally concentrates on the lowest-index GPU first.
 *  4. Enforce capacity: if every slot is busy, return false immediately
 *     rather than crashing or blocking indefinitely.
 *
 * Assumptions
 * -----------
 *  - All GPUs on the host are the same model (homogeneous). TRT engine files
 *    are therefore interchangeable across devices.
 *  - Engine<T> is safe to call concurrently from different threads as long as
 *    each instance is used by at most one thread at a time — this manager
 *    guarantees that through the per-slot busy flag.
 *
 * Usage example
 * -------------
 *   ANSCENTER::Options opts;
 *   opts.precision    = ANSCENTER::Precision::FP16;
 *   opts.maxBatchSize = 1;
 *
 *   MultiGpuEngineManager<float> mgr;
 *   if (!mgr.initialize(opts, "model.onnx")) { ... error ... }
 *
 *   mgr.printCapacityReport();
 *   // mgr.getTotalCapacity() tells you how many parallel tasks are possible
 *
 *   // From any thread:
 *   std::vector<std::vector<cv::cuda::GpuMat>> inputs = ...;
 *   std::vector<std::vector<std::vector<float>>> outputs;
 *   if (!mgr.runInference(inputs, outputs)) {
 *       // All slots busy — back off and retry, or queue the request.
 *   }
 */

#include <algorithm>
#include <array>      // std::array — used by initialize() default arguments
#include <atomic>
#include <memory>
#include <mutex>
#include <string>
#include <vector>

#include <cuda_runtime.h>
#include <spdlog/spdlog.h>

#include "engine.h"      // Engine<T> (also pulls NvInfer, OpenCV-CUDA, etc.)
#include "ANSLicense.h"  // ANSCENTER::Options, ANSCENTER::Precision

// ============================================================================
// GpuDeviceInfo — snapshot of one CUDA device captured at init time
//
// Plain aggregate; values come from cudaGetDeviceProperties / cudaMemGetInfo
// in enumerateDevices(), then slotsAllocated / memoryPerSlotBytes are filled
// in by loadSlots() once the per-slot footprint has been measured.
// ============================================================================
struct GpuDeviceInfo {
    int index = 0;                       ///< CUDA device ordinal (as passed to cudaSetDevice)
    std::string name;                    ///< Device name from cudaGetDeviceProperties
    size_t totalMemoryBytes = 0;         ///< Physical VRAM
    size_t freeMemoryAtInitBytes = 0;    ///< Free VRAM when manager started
    int computeMajor = 0;                ///< Compute capability, major (SM X.y)
    int computeMinor = 0;                ///< Compute capability, minor (SM x.Y)
    int slotsAllocated = 0;              ///< Engine<T> instances on this GPU
    size_t memoryPerSlotBytes = 0;       ///< Bytes each slot occupies in VRAM
};
|
||
|
||
// ============================================================================
|
||
// MultiGpuEngineManager<T>
|
||
// ============================================================================
|
||
template <typename T>
|
||
class MultiGpuEngineManager {
|
||
public:
|
||
|
||
// ── Construction / destruction ───────────────────────────────────────────
|
||
MultiGpuEngineManager() = default;
|
||
~MultiGpuEngineManager() = default;
|
||
|
||
// Non-copyable (owning unique_ptrs inside slots)
|
||
MultiGpuEngineManager(const MultiGpuEngineManager&) = delete;
|
||
MultiGpuEngineManager& operator=(const MultiGpuEngineManager&) = delete;
|
||
|
||
// Movable
|
||
MultiGpuEngineManager(MultiGpuEngineManager&&) = default;
|
||
MultiGpuEngineManager& operator=(MultiGpuEngineManager&&) = default;
|
||
|
||
// ── Initialization ───────────────────────────────────────────────────────
|
||
|
||
/**
|
||
* Initialize from an ONNX model file.
|
||
* Builds the TRT engine on the first run and caches it; subsequent calls
|
||
* load the cached .engine file directly (fast path).
|
||
*
|
||
* @param baseOptions Configuration template. deviceIndex is ignored
|
||
* and overridden per-slot; all other fields apply.
|
||
* @param onnxModelPath Path to the .onnx model file.
|
||
* @param subVals Per-channel subtraction for normalisation.
|
||
* @param divVals Per-channel divisor for normalisation.
|
||
* @param normalize Normalise pixel values to [0, 1] before inference.
|
||
* @param maxSlotsPerGpu Cap slots per GPU (-1 = memory-limited only).
|
||
* @param memSafetyFactor Fraction of free VRAM to consume (default 0.80).
|
||
* @return true on success.
|
||
*/
|
||
bool initialize(const ANSCENTER::Options& baseOptions,
|
||
const std::string& onnxModelPath,
|
||
const std::array<float, 3>& subVals = {0.f, 0.f, 0.f},
|
||
const std::array<float, 3>& divVals = {1.f, 1.f, 1.f},
|
||
bool normalize = true,
|
||
int maxSlotsPerGpu = -1,
|
||
double memSafetyFactor = 0.80);
|
||
|
||
/**
|
||
* Initialize from a pre-built TRT engine file — no ONNX build step.
|
||
*/
|
||
bool initializeFromEngine(const ANSCENTER::Options& baseOptions,
|
||
const std::string& trtEnginePath,
|
||
const std::array<float, 3>& subVals = {0.f, 0.f, 0.f},
|
||
const std::array<float, 3>& divVals = {1.f, 1.f, 1.f},
|
||
bool normalize = true,
|
||
int maxSlotsPerGpu = -1,
|
||
double memSafetyFactor = 0.80);
|
||
|
||
// ── Inference ────────────────────────────────────────────────────────────
|
||
|
||
/**
|
||
* Run inference on the best available slot.
|
||
*
|
||
* Slot-selection order: device 0 first (slots are stored in ascending
|
||
* device-index order, so the lowest-index idle slot always wins).
|
||
*
|
||
* Returns FALSE immediately if every slot is busy.
|
||
* Does NOT block — the caller handles retry / queuing.
|
||
*
|
||
* Thread-safe: may be called from multiple threads simultaneously.
|
||
*
|
||
* @param inputs [input_tensor][batch][GpuMat]
|
||
* @param featureVectors [batch][output_tensor][values] — populated on return
|
||
* @return true if inference completed successfully.
|
||
*/
|
||
bool runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& inputs,
|
||
std::vector<std::vector<std::vector<T>>>& featureVectors);
|
||
|
||
// ── Device / capacity queries ─────────────────────────────────────────────
|
||
|
||
/**
|
||
* Enumerate all CUDA-capable GPUs without loading any model.
|
||
* Useful for pre-flight checks before calling initialize().
|
||
*/
|
||
static std::vector<GpuDeviceInfo> enumerateDevices();
|
||
|
||
/** Device snapshots captured at initialize() time. */
|
||
const std::vector<GpuDeviceInfo>& getDeviceInfo() const { return m_deviceInfos; }
|
||
|
||
/** Total Engine<T> slots across all GPUs. */
|
||
int getTotalCapacity() const { return m_totalCapacity; }
|
||
|
||
/** Slots currently executing inference (approximate, lock-free read). */
|
||
int getActiveInferences() const { return m_activeCount.load(); }
|
||
|
||
/** Slots not currently claimed by a running inference. */
|
||
int getAvailableSlots() const { return m_totalCapacity - m_activeCount.load(); }
|
||
|
||
/** Whether the engine pool is fully saturated and new work would be rejected. */
|
||
bool isAtCapacity() const { return getAvailableSlots() <= 0; }
|
||
|
||
/** Input tensor dimensions (same for all slots, populated after init). */
|
||
const std::vector<nvinfer1::Dims3>& getInputDims() const { return m_inputDims; }
|
||
|
||
/** Output tensor dimensions (same for all slots, populated after init). */
|
||
const std::vector<nvinfer1::Dims>& getOutputDims() const { return m_outputDims; }
|
||
|
||
/** Print a human-readable capacity and device report via spdlog. */
|
||
void printCapacityReport() const;
|
||
|
||
private:
|
||
|
||
// ── Internal slot descriptor ─────────────────────────────────────────────
|
||
struct InferenceSlot {
|
||
int deviceIndex = 0;
|
||
bool busy = false;
|
||
size_t memUsed = 0; ///< Bytes this slot holds in VRAM
|
||
std::unique_ptr<Engine<T>> engine;
|
||
};
|
||
|
||
// ── Data members ──────────────────────────────────────────────────────────
|
||
std::vector<InferenceSlot> m_slots;
|
||
std::vector<GpuDeviceInfo> m_deviceInfos;
|
||
mutable std::mutex m_slotMutex;
|
||
std::atomic<int> m_activeCount{0};
|
||
int m_totalCapacity{0};
|
||
|
||
// Tensor dims cached from the probe engine (identical for every slot)
|
||
std::vector<nvinfer1::Dims3> m_inputDims;
|
||
std::vector<nvinfer1::Dims> m_outputDims;
|
||
|
||
// ── Private helpers ───────────────────────────────────────────────────────
|
||
bool loadSlots(const ANSCENTER::Options& baseOptions,
|
||
const std::string& modelPath,
|
||
const std::array<float, 3>& subVals,
|
||
const std::array<float, 3>& divVals,
|
||
bool normalize,
|
||
bool fromOnnx,
|
||
int maxSlotsPerGpu,
|
||
double memSafetyFactor);
|
||
};
|
||
|
||
// ============================================================================
|
||
// Template implementation
|
||
// (must be in the header because Engine<T> is itself a template)
|
||
// ============================================================================
|
||
|
||
// ────────────────────────────────────────────────────────────────────────────
|
||
// enumerateDevices — static, no model loading required
|
||
// ────────────────────────────────────────────────────────────────────────────
|
||
template <typename T>
|
||
/*static*/ std::vector<GpuDeviceInfo>
|
||
MultiGpuEngineManager<T>::enumerateDevices()
|
||
{
|
||
int count = 0;
|
||
cudaGetDeviceCount(&count);
|
||
|
||
std::vector<GpuDeviceInfo> devices;
|
||
devices.reserve(count);
|
||
|
||
for (int i = 0; i < count; ++i) {
|
||
cudaDeviceProp prop;
|
||
cudaGetDeviceProperties(&prop, i);
|
||
|
||
cudaSetDevice(i);
|
||
size_t freeBytes = 0, totalBytes = 0;
|
||
cudaMemGetInfo(&freeBytes, &totalBytes);
|
||
|
||
GpuDeviceInfo info;
|
||
info.index = i;
|
||
info.name = prop.name;
|
||
info.totalMemoryBytes = prop.totalGlobalMem;
|
||
info.freeMemoryAtInitBytes = freeBytes;
|
||
info.computeMajor = prop.major;
|
||
info.computeMinor = prop.minor;
|
||
info.slotsAllocated = 0;
|
||
info.memoryPerSlotBytes = 0;
|
||
devices.push_back(std::move(info));
|
||
}
|
||
|
||
return devices;
|
||
}
|
||
|
||
// ────────────────────────────────────────────────────────────────────────────
|
||
// Public init wrappers
|
||
// ────────────────────────────────────────────────────────────────────────────
|
||
template <typename T>
|
||
bool MultiGpuEngineManager<T>::initialize(
|
||
const ANSCENTER::Options& baseOptions,
|
||
const std::string& onnxModelPath,
|
||
const std::array<float, 3>& subVals,
|
||
const std::array<float, 3>& divVals,
|
||
bool normalize,
|
||
int maxSlotsPerGpu,
|
||
double memSafetyFactor)
|
||
{
|
||
return loadSlots(baseOptions, onnxModelPath,
|
||
subVals, divVals, normalize,
|
||
/*fromOnnx=*/true,
|
||
maxSlotsPerGpu, memSafetyFactor);
|
||
}
|
||
|
||
template <typename T>
|
||
bool MultiGpuEngineManager<T>::initializeFromEngine(
|
||
const ANSCENTER::Options& baseOptions,
|
||
const std::string& trtEnginePath,
|
||
const std::array<float, 3>& subVals,
|
||
const std::array<float, 3>& divVals,
|
||
bool normalize,
|
||
int maxSlotsPerGpu,
|
||
double memSafetyFactor)
|
||
{
|
||
return loadSlots(baseOptions, trtEnginePath,
|
||
subVals, divVals, normalize,
|
||
/*fromOnnx=*/false,
|
||
maxSlotsPerGpu, memSafetyFactor);
|
||
}
|
||
|
||
// ────────────────────────────────────────────────────────────────────────────
// loadSlots — core initialization logic
//
// Shared by initialize() (fromOnnx=true) and initializeFromEngine()
// (fromOnnx=false). Enumerates GPUs, measures the per-slot VRAM footprint
// with a probe engine on GPU 0, then fills every device with as many
// Engine<T> slots as the memory budget allows. Returns false when no GPU is
// found, the probe fails to load, or zero slots end up allocated.
// ────────────────────────────────────────────────────────────────────────────
template <typename T>
bool MultiGpuEngineManager<T>::loadSlots(
    const ANSCENTER::Options& baseOptions,
    const std::string& modelPath,
    const std::array<float, 3>& subVals,
    const std::array<float, 3>& divVals,
    bool normalize,
    bool fromOnnx,
    int maxSlotsPerGpu,
    double memSafetyFactor)
{
    // ──────────────────────────────────────────────────────────────────
    // 1. Enumerate GPUs
    // ──────────────────────────────────────────────────────────────────
    m_deviceInfos = enumerateDevices();

    if (m_deviceInfos.empty()) {
        spdlog::error("MultiGpuEngineManager: No CUDA-capable GPUs detected");
        return false;
    }

    spdlog::info("MultiGpuEngineManager: {} GPU(s) found:", m_deviceInfos.size());
    for (const auto& d : m_deviceInfos) {
        spdlog::info("  GPU[{}] {} | SM {}.{} | Total {:.0f} MiB | Free {:.0f} MiB",
                     d.index, d.name,
                     d.computeMajor, d.computeMinor,
                     d.totalMemoryBytes / 1048576.0,
                     d.freeMemoryAtInitBytes / 1048576.0);
    }

    // Warn if the GPUs are heterogeneous — the TRT engine may be incompatible
    // (engine binaries are device-architecture specific; see file header
    // "Assumptions").
    for (size_t i = 1; i < m_deviceInfos.size(); ++i) {
        if (m_deviceInfos[i].name != m_deviceInfos[0].name) {
            spdlog::warn("MultiGpuEngineManager: GPU[{}] '{}' differs from GPU[0] '{}'. "
                         "TRT engine binaries may be incompatible with dissimilar GPUs.",
                         i, m_deviceInfos[i].name, m_deviceInfos[0].name);
        }
    }

    // ──────────────────────────────────────────────────────────────────
    // 2. Load ONE probe engine on GPU 0 to measure per-slot VRAM usage.
    //
    //    Memory delta = freeBeforeLoad − freeAfterLoad
    //    This includes: TRT engine buffers, CUDA context overhead, and
    //    any stream / workspace memory Engine<T> allocates.
    // ──────────────────────────────────────────────────────────────────
    spdlog::info("MultiGpuEngineManager: Loading probe engine on GPU[0] "
                 "to measure per-slot memory footprint...");

    cudaSetDevice(0);
    size_t freeBefore = 0, tmp = 0;
    cudaMemGetInfo(&freeBefore, &tmp);

    // Probe always runs on device 0; the caller's deviceIndex is overridden.
    ANSCENTER::Options opts0 = baseOptions;
    opts0.deviceIndex = 0;

    auto probeEngine = std::make_unique<Engine<T>>(opts0);
    const bool probeOk = fromOnnx
        ? probeEngine->buildLoadNetwork(modelPath, subVals, divVals, normalize)
        : probeEngine->loadNetwork     (modelPath, subVals, divVals, normalize);

    if (!probeOk) {
        spdlog::error("MultiGpuEngineManager: Probe engine failed to load on GPU[0]");
        return false;
    }

    size_t freeAfter = 0;
    cudaMemGetInfo(&freeAfter, &tmp);

    // Guard against measurement noise: floor at 64 MiB. (The delta can even
    // come out "negative" if other processes freed memory mid-measurement,
    // hence the ternary clamp to 0 before applying the floor.)
    constexpr size_t kMinSlotMemBytes = 64ULL * 1024 * 1024;
    const size_t rawDelta = (freeBefore > freeAfter) ? (freeBefore - freeAfter) : 0ULL;
    const size_t memPerSlot = std::max(rawDelta, kMinSlotMemBytes);

    spdlog::info("MultiGpuEngineManager: Memory per inference slot: {:.1f} MiB "
                 "(measured delta = {:.1f} MiB)",
                 memPerSlot / 1048576.0,
                 rawDelta / 1048576.0);

    // Cache tensor dims — same for every slot since they all use the same model
    m_inputDims = probeEngine->getInputDims();
    m_outputDims = probeEngine->getOutputDims();

    // Promote the probe engine into slot 0 on device 0 (no need to load twice)
    {
        InferenceSlot s;
        s.deviceIndex = 0;
        s.busy = false;
        s.memUsed = memPerSlot;
        s.engine = std::move(probeEngine);
        m_slots.push_back(std::move(s));
    }
    m_deviceInfos[0].slotsAllocated = 1;
    m_deviceInfos[0].memoryPerSlotBytes = memPerSlot;

    // ──────────────────────────────────────────────────────────────────
    // 3. Auto-cap: VRAM-fraction budget per model
    //
    //    When maxSlotsPerGpu is -1 (the default), each model pool
    //    auto-limits itself to kMaxVramFractionPerModel of total GPU
    //    VRAM. This prevents the first model loaded from consuming all
    //    memory and starving subsequent models in multi-model deployments.
    //    We use *total* VRAM (not free) as the budget base so the cap is
    //    consistent regardless of load order.
    //
    //    maxSlotsPerGpu == -1 → auto-cap from VRAM fraction (default)
    //    maxSlotsPerGpu >  0  → explicit cap (user override, unchanged)
    //
    //    NOTE: the budget is computed from GPU 0's total VRAM and applied
    //    to every device — consistent with the homogeneous-GPU assumption.
    // ──────────────────────────────────────────────────────────────────
    constexpr double kMaxVramFractionPerModel = 0.25;  // 25% of total VRAM

    int effectiveMaxSlotsPerGpu = maxSlotsPerGpu;

    if (maxSlotsPerGpu <= 0 && memPerSlot > 0) {
        const size_t totalVram = m_deviceInfos[0].totalMemoryBytes;
        const size_t vramBudget = static_cast<size_t>(
            static_cast<double>(totalVram) * kMaxVramFractionPerModel);
        // At least one slot even when a single slot exceeds the budget.
        const int autoCap = std::max(1, static_cast<int>(vramBudget / memPerSlot));
        effectiveMaxSlotsPerGpu = autoCap;

        spdlog::info("MultiGpuEngineManager: VRAM auto-cap = {} slot(s)/GPU "
                     "(model {} MiB/slot, budget {} MiB = {}% of {} MiB total)",
                     autoCap, memPerSlot / 1048576,
                     vramBudget / 1048576,
                     static_cast<int>(kMaxVramFractionPerModel * 100),
                     totalVram / 1048576);
    }

    // ──────────────────────────────────────────────────────────────────
    // 4. Fill remaining capacity on every GPU.
    //
    //    For GPU 0:
    //       freeNow already reflects probe usage → slotsToAdd is the count
    //       of *additional* slots that fit, beyond the probe.
    //
    //    For GPU 1+:
    //       freeNow is the original available memory on that device, so
    //       slotsToAdd is the *total* slots for that device.
    //
    //    `di` doubles as the CUDA device index: enumerateDevices() emits
    //    one entry per device ordinal, in order.
    // ──────────────────────────────────────────────────────────────────
    for (int di = 0; di < static_cast<int>(m_deviceInfos.size()); ++di) {

        cudaSetDevice(di);
        size_t freeNow = 0, totalNow = 0;
        cudaMemGetInfo(&freeNow, &totalNow);

        // Leave (1 − memSafetyFactor) of free VRAM as headroom.
        const size_t usableBytes = static_cast<size_t>(
            static_cast<double>(freeNow) * memSafetyFactor);

        // How many new Engine<T> instances fit in the usable memory?
        int slotsToAdd = (memPerSlot > 0)
            ? static_cast<int>(usableBytes / memPerSlot)
            : 0;

        // Apply VRAM-fraction auto-cap or explicit per-GPU cap.
        // GPU 0 already has the probe slot, so subtract 1 from its budget.
        if (effectiveMaxSlotsPerGpu > 0) {
            const int budget = (di == 0)
                ? (effectiveMaxSlotsPerGpu - 1)
                : effectiveMaxSlotsPerGpu;
            slotsToAdd = std::min(slotsToAdd, budget);
        }

        m_deviceInfos[di].memoryPerSlotBytes = memPerSlot;

        spdlog::info("MultiGpuEngineManager: GPU[{}] {} — "
                     "free {:.0f} MiB, usable {:.0f} MiB → adding {} slot(s)",
                     di, m_deviceInfos[di].name,
                     freeNow / 1048576.0,
                     usableBytes / 1048576.0,
                     slotsToAdd);

        for (int s = 0; s < slotsToAdd; ++s) {
            ANSCENTER::Options opts = baseOptions;
            opts.deviceIndex = di;

            auto eng = std::make_unique<Engine<T>>(opts);
            const bool ok = fromOnnx
                ? eng->buildLoadNetwork(modelPath, subVals, divVals, normalize)
                : eng->loadNetwork     (modelPath, subVals, divVals, normalize);

            if (!ok) {
                // A load failure usually means the device just ran out of
                // memory — stop adding slots here but keep what succeeded.
                spdlog::warn("MultiGpuEngineManager: GPU[{}] — slot {}/{} load failed; "
                             "halting allocation on this device.",
                             di, s + 1, slotsToAdd);
                break;
            }

            InferenceSlot slot;
            slot.deviceIndex = di;
            slot.busy = false;
            slot.memUsed = memPerSlot;
            slot.engine = std::move(eng);
            m_slots.push_back(std::move(slot));
            m_deviceInfos[di].slotsAllocated++;
        }
    }

    m_totalCapacity = static_cast<int>(m_slots.size());
    printCapacityReport();

    if (m_totalCapacity == 0) {
        spdlog::error("MultiGpuEngineManager: Zero inference slots allocated — "
                      "check available GPU memory.");
        return false;
    }

    return true;
}
|
||
|
||
// ────────────────────────────────────────────────────────────────────────────
|
||
// runInference
|
||
// ────────────────────────────────────────────────────────────────────────────
|
||
template <typename T>
|
||
bool MultiGpuEngineManager<T>::runInference(
|
||
const std::vector<std::vector<cv::cuda::GpuMat>>& inputs,
|
||
std::vector<std::vector<std::vector<T>>>& featureVectors)
|
||
{
|
||
// ── Acquire the first idle slot ───────────────────────────────────────
|
||
//
|
||
// Slots are stored in ascending device-index order (all device-0 slots
|
||
// come first), so the scan naturally prefers device 0. The mutex is
|
||
// held only for the O(N) scan + flag flip — NOT during the GPU kernel —
|
||
// so threads using different slots proceed in parallel.
|
||
InferenceSlot* slot = nullptr;
|
||
{
|
||
std::lock_guard<std::mutex> lock(m_slotMutex);
|
||
for (auto& s : m_slots) {
|
||
if (!s.busy) {
|
||
s.busy = true;
|
||
slot = &s;
|
||
break;
|
||
}
|
||
}
|
||
}
|
||
|
||
if (!slot) {
|
||
// All slots are in use. Enforce the capacity limit by refusing the
|
||
// request rather than crashing or waiting indefinitely.
|
||
spdlog::warn("MultiGpuEngineManager: Capacity reached — "
|
||
"all {}/{} inference slot(s) busy. "
|
||
"Request rejected; release a running inference first.",
|
||
m_activeCount.load(), m_totalCapacity);
|
||
return false;
|
||
}
|
||
|
||
++m_activeCount;
|
||
|
||
// Set the calling thread's CUDA device context to match the slot's device.
|
||
// Engine<T>::loadNetwork() already did this internally when the engine was
|
||
// created, and the CUDA streams inside are bound to that device; calling
|
||
// cudaSetDevice here ensures the calling thread's context matches so that
|
||
// stream operations and memory queries behave correctly in multi-threaded
|
||
// scenarios where threads may have previously touched a different device.
|
||
cudaSetDevice(slot->deviceIndex);
|
||
|
||
const bool result = slot->engine->runInference(inputs, featureVectors);
|
||
|
||
// ── Release the slot ──────────────────────────────────────────────────
|
||
{
|
||
std::lock_guard<std::mutex> lock(m_slotMutex);
|
||
slot->busy = false;
|
||
}
|
||
--m_activeCount;
|
||
|
||
return result;
|
||
}
|
||
|
||
// ────────────────────────────────────────────────────────────────────────────
|
||
// printCapacityReport
|
||
// ────────────────────────────────────────────────────────────────────────────
|
||
template <typename T>
|
||
void MultiGpuEngineManager<T>::printCapacityReport() const
|
||
{
|
||
spdlog::info("============================================================");
|
||
spdlog::info(" MultiGpuEngineManager — Capacity Report");
|
||
spdlog::info("============================================================");
|
||
spdlog::info(" Total inference slots : {}", m_totalCapacity);
|
||
spdlog::info(" Active inferences : {}", m_activeCount.load());
|
||
spdlog::info(" Available slots : {}",
|
||
m_totalCapacity - m_activeCount.load());
|
||
spdlog::info("------------------------------------------------------------");
|
||
for (const auto& d : m_deviceInfos) {
|
||
spdlog::info(" GPU[{:d}] {:s} | SM {:d}.{:d} | "
|
||
"Total {:6.0f} MiB | Slots: {:2d} | Mem/slot: {:6.1f} MiB",
|
||
d.index, d.name,
|
||
d.computeMajor, d.computeMinor,
|
||
d.totalMemoryBytes / 1048576.0,
|
||
d.slotsAllocated,
|
||
d.memoryPerSlotBytes / 1048576.0);
|
||
}
|
||
spdlog::info("============================================================");
|
||
}
|