Initial setup for CLion

This commit is contained in:
2026-03-28 16:54:11 +11:00
parent 239cc02591
commit 7b4134133c
1136 changed files with 811916 additions and 0 deletions

View File

@@ -0,0 +1,34 @@
#pragma once
#include <array>
#include <cstddef>
#include <string>
#include <vector>
#include "NvInfer.h"
#include "engine_api.h" // ENGINE_API (__declspec dllexport / dllimport)
// INT8 entropy calibrator (v2). TensorRT invokes the overridden interface
// methods while building an INT8 engine: getBatch() streams calibration data,
// and read/writeCalibrationCache() persist the computed scale table so later
// builds can skip the (slow) calibration pass. Implementation lives in the
// corresponding .cpp file.
class ENGINE_API Int8EntropyCalibrator2 : public nvinfer1::IInt8EntropyCalibrator2 {
public:
    // @param batchSize        Images per calibration batch.
    // @param inputW, inputH   Network input width / height.
    // @param calibDataDirPath Directory containing the calibration images.
    // @param calibTableName   File name for the cached calibration table.
    // @param inputBlobName    Name of the network input binding.
    // @param subVals, divVals Per-channel normalisation: (pixel - sub) / div.
    // @param normalize        Scale pixel values before sub/div (see .cpp).
    // @param readCache        Reuse an existing calibration table if present.
    Int8EntropyCalibrator2(int32_t batchSize, int32_t inputW, int32_t inputH, const std::string &calibDataDirPath,
                           const std::string &calibTableName, const std::string &inputBlobName,
                           const std::array<float, 3> &subVals = {0.f, 0.f, 0.f}, const std::array<float, 3> &divVals = {1.f, 1.f, 1.f},
                           bool normalize = true, bool readCache = true);
    virtual ~Int8EntropyCalibrator2();
    // Abstract base class methods which must be implemented
    int32_t getBatchSize() const noexcept override;
    // Supplies device pointers for the next calibration batch; a false return
    // tells TensorRT the calibration data set is exhausted.
    bool getBatch(void *bindings[], char const *names[], int32_t nbBindings) noexcept override;
    void const *readCalibrationCache(std::size_t &length) noexcept override;
    void writeCalibrationCache(void const *ptr, std::size_t length) noexcept override;
private:
    const int32_t m_batchSize;
    const int32_t m_inputW;
    const int32_t m_inputH;
    int32_t m_imgIdx;                     // cursor into m_imgPaths (advanced per batch — see .cpp)
    std::vector<std::string> m_imgPaths;  // calibration image file paths
    size_t m_inputCount;                  // presumably elements per input batch — confirm in .cpp
    const std::string m_calibTableName;
    const std::string m_inputBlobName;
    const std::array<float, 3> m_subVals;
    const std::array<float, 3> m_divVals;
    const bool m_normalize;
    const bool m_readCache;
    void *m_deviceInput;                  // device buffer for the current batch
    std::vector<char> m_calibCache;       // calibration table bytes read from disk
};

View File

@@ -0,0 +1,618 @@
#pragma once
// ============================================================================
// DEPRECATED — MultiGpuEngineManager.h
//
// The functionality of this class has been merged directly into Engine<T>.
// Replace any usage of MultiGpuEngineManager with Engine as follows:
//
// Before:
// MultiGpuEngineManager<float> mgr;
// mgr.initialize(opts, "model.onnx");
// mgr.runInference(inputs, outputs);
//
// After:
// Engine<float> eng;
// eng.initializePool(opts, "model.onnx");
// eng.runInference(inputs, outputs); // auto-dispatches to pool
//
// Method mapping:
// MultiGpuEngineManager::initialize() → Engine::initializePool()
// MultiGpuEngineManager::initializeFromEngine() → Engine::initializePoolFromEngine()
// MultiGpuEngineManager::runInference() → Engine::runInference() (unchanged signature)
// MultiGpuEngineManager::enumerateDevices() → Engine::enumerateDevices()
// MultiGpuEngineManager::getTotalCapacity() → Engine::getTotalCapacity()
// MultiGpuEngineManager::getAvailableSlots() → Engine::getAvailableSlots()
// MultiGpuEngineManager::getActiveInferences() → Engine::getActiveInferences()
// MultiGpuEngineManager::isAtCapacity() → Engine::isAtCapacity()
// MultiGpuEngineManager::getDeviceInfo() → Engine::getDeviceInfo()
// MultiGpuEngineManager::printCapacityReport() → Engine::printCapacityReport()
// MultiGpuEngineManager::getInputDims() → Engine::getInputDims()
// MultiGpuEngineManager::getOutputDims() → Engine::getOutputDims()
//
// GpuDeviceInfo struct is now declared in engine.h and needs no separate include.
//
// This file is kept temporarily for compatibility. It will be removed in a
// future cleanup pass. Do not add new code here.
// ============================================================================
/*
* MultiGpuEngineManager.h [DEPRECATED — see engine.h]
*
* A pool-based multi-GPU inference manager that wraps Engine<T>.
*
* Responsibilities
* ----------------
* 1. Enumerate all CUDA-capable GPUs and measure the GPU memory consumed
* by a single inference context (one Engine<T> instance).
* 2. Allocate as many Engine<T> "slots" as memory allows, filling device 0
* first, then device 1, etc.
* 3. Dispatch runInference() calls to the first idle slot (device 0 wins),
* so load naturally concentrates on the lowest-index GPU first.
* 4. Enforce capacity: if every slot is busy, return false immediately
* rather than crashing or blocking indefinitely.
*
* Assumptions
* -----------
* - All GPUs on the host are the same model (homogeneous). TRT engine files
* are therefore interchangeable across devices.
* - Engine<T> is safe to call concurrently from different threads as long as
* each instance is used by at most one thread at a time — this manager
* guarantees that through the per-slot busy flag.
*
* Usage example
* -------------
* ANSCENTER::Options opts;
* opts.precision = ANSCENTER::Precision::FP16;
* opts.maxBatchSize = 1;
*
* MultiGpuEngineManager<float> mgr;
* if (!mgr.initialize(opts, "model.onnx")) { ... error ... }
*
* mgr.printCapacityReport();
* // mgr.getTotalCapacity() tells you how many parallel tasks are possible
*
* // From any thread:
* std::vector<std::vector<cv::cuda::GpuMat>> inputs = ...;
* std::vector<std::vector<std::vector<float>>> outputs;
* if (!mgr.runInference(inputs, outputs)) {
* // All slots busy — back off and retry, or queue the request.
* }
*/
#include <algorithm>
#include <array>
#include <atomic>
#include <memory>
#include <mutex>
#include <string>
#include <vector>
#include <cuda_runtime.h>
#include <spdlog/spdlog.h>
#include "engine.h"     // Engine<T> (also pulls NvInfer, OpenCV-CUDA, etc.)
#include "ANSLicense.h" // ANSCENTER::Options, ANSCENTER::Precision
// ============================================================================
// GpuDeviceInfo — snapshot of one CUDA device captured at init time
// ============================================================================
struct GpuDeviceInfo {
    int index = 0;                     ///< CUDA device ordinal (matches cudaSetDevice index)
    std::string name;                  ///< Device name from cudaDeviceProp
    size_t totalMemoryBytes = 0;       ///< Physical VRAM
    size_t freeMemoryAtInitBytes = 0;  ///< Free VRAM when manager started
    int computeMajor = 0;              ///< Compute capability, major version
    int computeMinor = 0;              ///< Compute capability, minor version
    int slotsAllocated = 0;            ///< Engine<T> instances on this GPU
    size_t memoryPerSlotBytes = 0;     ///< Bytes each slot occupies in VRAM
};
// ============================================================================
// MultiGpuEngineManager<T>
// ============================================================================
template <typename T>
class MultiGpuEngineManager {
public:
    // ── Construction / destruction ───────────────────────────────────────────
    MultiGpuEngineManager() = default;
    ~MultiGpuEngineManager() = default;
    // Non-copyable (owning unique_ptrs inside slots)
    MultiGpuEngineManager(const MultiGpuEngineManager&) = delete;
    MultiGpuEngineManager& operator=(const MultiGpuEngineManager&) = delete;
    // Movable
    // NOTE(review): the defaulted moves move m_slotMutex's owner while leaving
    // m_activeCount defaulted-moved; only move a manager that is idle.
    MultiGpuEngineManager(MultiGpuEngineManager&&) = default;
    MultiGpuEngineManager& operator=(MultiGpuEngineManager&&) = default;
    // ── Initialization ───────────────────────────────────────────────────────
    /**
     * Initialize from an ONNX model file.
     * Builds the TRT engine on the first run and caches it; subsequent calls
     * load the cached .engine file directly (fast path).
     *
     * @param baseOptions     Configuration template. deviceIndex is ignored
     *                        and overridden per-slot; all other fields apply.
     * @param onnxModelPath   Path to the .onnx model file.
     * @param subVals         Per-channel subtraction for normalisation.
     * @param divVals         Per-channel divisor for normalisation.
     * @param normalize       Normalise pixel values to [0, 1] before inference.
     * @param maxSlotsPerGpu  Cap slots per GPU (-1 = memory-limited only).
     * @param memSafetyFactor Fraction of free VRAM to consume (default 0.80).
     * @return true on success.
     */
    bool initialize(const ANSCENTER::Options& baseOptions,
                    const std::string& onnxModelPath,
                    const std::array<float, 3>& subVals = {0.f, 0.f, 0.f},
                    const std::array<float, 3>& divVals = {1.f, 1.f, 1.f},
                    bool normalize = true,
                    int maxSlotsPerGpu = -1,
                    double memSafetyFactor = 0.80);
    /**
     * Initialize from a pre-built TRT engine file — no ONNX build step.
     * Parameters mirror initialize() with trtEnginePath replacing the ONNX path.
     */
    bool initializeFromEngine(const ANSCENTER::Options& baseOptions,
                              const std::string& trtEnginePath,
                              const std::array<float, 3>& subVals = {0.f, 0.f, 0.f},
                              const std::array<float, 3>& divVals = {1.f, 1.f, 1.f},
                              bool normalize = true,
                              int maxSlotsPerGpu = -1,
                              double memSafetyFactor = 0.80);
    // ── Inference ────────────────────────────────────────────────────────────
    /**
     * Run inference on the best available slot.
     *
     * Slot-selection order: device 0 first (slots are stored in ascending
     * device-index order, so the lowest-index idle slot always wins).
     *
     * Returns FALSE immediately if every slot is busy.
     * Does NOT block — the caller handles retry / queuing.
     *
     * Thread-safe: may be called from multiple threads simultaneously.
     *
     * @param inputs         [input_tensor][batch][GpuMat]
     * @param featureVectors [batch][output_tensor][values] — populated on return
     * @return true if inference completed successfully.
     */
    bool runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& inputs,
                      std::vector<std::vector<std::vector<T>>>& featureVectors);
    // ── Device / capacity queries ─────────────────────────────────────────────
    /**
     * Enumerate all CUDA-capable GPUs without loading any model.
     * Useful for pre-flight checks before calling initialize().
     */
    static std::vector<GpuDeviceInfo> enumerateDevices();
    /** Device snapshots captured at initialize() time. */
    const std::vector<GpuDeviceInfo>& getDeviceInfo() const { return m_deviceInfos; }
    /** Total Engine<T> slots across all GPUs. */
    int getTotalCapacity() const { return m_totalCapacity; }
    /** Slots currently executing inference (approximate, lock-free read). */
    int getActiveInferences() const { return m_activeCount.load(); }
    /** Slots not currently claimed by a running inference. */
    int getAvailableSlots() const { return m_totalCapacity - m_activeCount.load(); }
    /** Whether the engine pool is fully saturated and new work would be rejected. */
    bool isAtCapacity() const { return getAvailableSlots() <= 0; }
    /** Input tensor dimensions (same for all slots, populated after init). */
    const std::vector<nvinfer1::Dims3>& getInputDims() const { return m_inputDims; }
    /** Output tensor dimensions (same for all slots, populated after init). */
    const std::vector<nvinfer1::Dims>& getOutputDims() const { return m_outputDims; }
    /** Print a human-readable capacity and device report via spdlog. */
    void printCapacityReport() const;
private:
    // ── Internal slot descriptor ─────────────────────────────────────────────
    // One Engine<T> instance bound to one GPU; `busy` is the per-slot
    // ownership flag guarded by m_slotMutex.
    struct InferenceSlot {
        int deviceIndex = 0;
        bool busy = false;
        size_t memUsed = 0; ///< Bytes this slot holds in VRAM
        std::unique_ptr<Engine<T>> engine;
    };
    // ── Data members ──────────────────────────────────────────────────────────
    std::vector<InferenceSlot> m_slots;        // ascending device-index order
    std::vector<GpuDeviceInfo> m_deviceInfos;  // snapshot taken during init
    mutable std::mutex m_slotMutex;            // guards every slot's busy flag
    std::atomic<int> m_activeCount{0};         // slots currently running inference
    int m_totalCapacity{0};                    // == m_slots.size() after init
    // Tensor dims cached from the probe engine (identical for every slot)
    std::vector<nvinfer1::Dims3> m_inputDims;
    std::vector<nvinfer1::Dims> m_outputDims;
    // ── Private helpers ───────────────────────────────────────────────────────
    // Core allocation routine shared by both initialize() entry points;
    // fromOnnx selects buildLoadNetwork() vs loadNetwork() per slot.
    bool loadSlots(const ANSCENTER::Options& baseOptions,
                   const std::string& modelPath,
                   const std::array<float, 3>& subVals,
                   const std::array<float, 3>& divVals,
                   bool normalize,
                   bool fromOnnx,
                   int maxSlotsPerGpu,
                   double memSafetyFactor);
};
// ============================================================================
// Template implementation
// (must be in the header because Engine<T> is itself a template)
// ============================================================================
// ────────────────────────────────────────────────────────────────────────────
// enumerateDevices — static, no model loading required
// ────────────────────────────────────────────────────────────────────────────
template <typename T>
/*static*/ std::vector<GpuDeviceInfo>
MultiGpuEngineManager<T>::enumerateDevices()
{
    // Snapshot every CUDA device visible to this process. Exactly one entry
    // is produced per device index so callers (loadSlots) may use the vector
    // index and the CUDA device index interchangeably.
    //
    // Returns an empty vector when no driver / no devices are present.
    int count = 0;
    if (cudaGetDeviceCount(&count) != cudaSuccess || count <= 0) {
        return {};
    }
    // cudaSetDevice() below switches the calling thread's CUDA context;
    // remember the current device so the caller's context is restored.
    int previousDevice = 0;
    const bool restoreDevice = (cudaGetDevice(&previousDevice) == cudaSuccess);
    std::vector<GpuDeviceInfo> devices;
    devices.reserve(static_cast<size_t>(count));
    for (int i = 0; i < count; ++i) {
        GpuDeviceInfo info;
        info.index = i;
        cudaDeviceProp prop{};
        if (cudaGetDeviceProperties(&prop, i) == cudaSuccess) {
            info.name = prop.name;
            info.totalMemoryBytes = prop.totalGlobalMem;
            info.computeMajor = prop.major;
            info.computeMinor = prop.minor;
        }
        size_t freeBytes = 0, totalBytes = 0;
        if (cudaSetDevice(i) == cudaSuccess &&
            cudaMemGetInfo(&freeBytes, &totalBytes) == cudaSuccess) {
            info.freeMemoryAtInitBytes = freeBytes;
        }
        info.slotsAllocated = 0;
        info.memoryPerSlotBytes = 0;
        devices.push_back(std::move(info));
    }
    if (restoreDevice) {
        cudaSetDevice(previousDevice);
    }
    return devices;
}
// ────────────────────────────────────────────────────────────────────────────
// Public init wrappers
// ────────────────────────────────────────────────────────────────────────────
template <typename T>
bool MultiGpuEngineManager<T>::initialize(
    const ANSCENTER::Options& baseOptions,
    const std::string& onnxModelPath,
    const std::array<float, 3>& subVals,
    const std::array<float, 3>& divVals,
    bool normalize,
    int maxSlotsPerGpu,
    double memSafetyFactor)
{
    // ONNX entry point: loadSlots() builds (or loads the cached) TRT engine
    // per slot, then allocates the pool across all GPUs.
    constexpr bool kFromOnnx = true;
    return loadSlots(baseOptions, onnxModelPath, subVals, divVals,
                     normalize, kFromOnnx, maxSlotsPerGpu, memSafetyFactor);
}
template <typename T>
bool MultiGpuEngineManager<T>::initializeFromEngine(
    const ANSCENTER::Options& baseOptions,
    const std::string& trtEnginePath,
    const std::array<float, 3>& subVals,
    const std::array<float, 3>& divVals,
    bool normalize,
    int maxSlotsPerGpu,
    double memSafetyFactor)
{
    // Pre-built TRT engine entry point: skips the ONNX build step entirely.
    constexpr bool kFromOnnx = false;
    return loadSlots(baseOptions, trtEnginePath, subVals, divVals,
                     normalize, kFromOnnx, maxSlotsPerGpu, memSafetyFactor);
}
// ────────────────────────────────────────────────────────────────────────────
// loadSlots — core initialization logic
// ────────────────────────────────────────────────────────────────────────────
/**
 * Core pool-allocation routine shared by initialize() and
 * initializeFromEngine().
 *
 * Flow: enumerate GPUs → load one probe engine on GPU 0 to measure per-slot
 * VRAM usage → derive a per-GPU slot cap → fill each GPU with additional
 * engine instances until memory or the cap is exhausted.
 *
 * @param fromOnnx true → buildLoadNetwork() (ONNX build path),
 *                 false → loadNetwork() (pre-built .engine path).
 * @return true if at least one slot was allocated.
 */
template <typename T>
bool MultiGpuEngineManager<T>::loadSlots(
    const ANSCENTER::Options& baseOptions,
    const std::string& modelPath,
    const std::array<float, 3>& subVals,
    const std::array<float, 3>& divVals,
    bool normalize,
    bool fromOnnx,
    int maxSlotsPerGpu,
    double memSafetyFactor)
{
    // ──────────────────────────────────────────────────────────────────
    // 1. Enumerate GPUs
    // ──────────────────────────────────────────────────────────────────
    m_deviceInfos = enumerateDevices();
    if (m_deviceInfos.empty()) {
        spdlog::error("MultiGpuEngineManager: No CUDA-capable GPUs detected");
        return false;
    }
    spdlog::info("MultiGpuEngineManager: {} GPU(s) found:", m_deviceInfos.size());
    for (const auto& d : m_deviceInfos) {
        spdlog::info("  GPU[{}] {} | SM {}.{} | Total {:.0f} MiB | Free {:.0f} MiB",
                     d.index, d.name,
                     d.computeMajor, d.computeMinor,
                     d.totalMemoryBytes / 1048576.0,
                     d.freeMemoryAtInitBytes / 1048576.0);
    }
    // Warn if the GPUs are heterogeneous — the TRT engine may be incompatible
    for (size_t i = 1; i < m_deviceInfos.size(); ++i) {
        if (m_deviceInfos[i].name != m_deviceInfos[0].name) {
            spdlog::warn("MultiGpuEngineManager: GPU[{}] '{}' differs from GPU[0] '{}'. "
                         "TRT engine binaries may be incompatible with dissimilar GPUs.",
                         i, m_deviceInfos[i].name, m_deviceInfos[0].name);
        }
    }
    // ──────────────────────────────────────────────────────────────────
    // 2. Load ONE probe engine on GPU 0 to measure per-slot VRAM usage.
    //
    //    Memory delta = freeBeforeLoad - freeAfterLoad
    //    This includes: TRT engine buffers, CUDA context overhead, and
    //    any stream / workspace memory Engine<T> allocates.
    // ──────────────────────────────────────────────────────────────────
    spdlog::info("MultiGpuEngineManager: Loading probe engine on GPU[0] "
                 "to measure per-slot memory footprint...");
    cudaSetDevice(0);
    size_t freeBefore = 0, tmp = 0;
    cudaMemGetInfo(&freeBefore, &tmp);
    ANSCENTER::Options opts0 = baseOptions;
    opts0.deviceIndex = 0;  // probe always lives on device 0
    auto probeEngine = std::make_unique<Engine<T>>(opts0);
    const bool probeOk = fromOnnx
        ? probeEngine->buildLoadNetwork(modelPath, subVals, divVals, normalize)
        : probeEngine->loadNetwork    (modelPath, subVals, divVals, normalize);
    if (!probeOk) {
        spdlog::error("MultiGpuEngineManager: Probe engine failed to load on GPU[0]");
        return false;
    }
    size_t freeAfter = 0;
    cudaMemGetInfo(&freeAfter, &tmp);
    // Guard against measurement noise: floor at 64 MiB
    constexpr size_t kMinSlotMemBytes = 64ULL * 1024 * 1024;
    const size_t rawDelta = (freeBefore > freeAfter) ? (freeBefore - freeAfter) : 0ULL;
    const size_t memPerSlot = std::max(rawDelta, kMinSlotMemBytes);
    spdlog::info("MultiGpuEngineManager: Memory per inference slot: {:.1f} MiB "
                 "(measured delta = {:.1f} MiB)",
                 memPerSlot / 1048576.0,
                 rawDelta / 1048576.0);
    // Cache tensor dims — same for every slot since they all use the same model
    m_inputDims = probeEngine->getInputDims();
    m_outputDims = probeEngine->getOutputDims();
    // Promote the probe engine into slot 0 on device 0 (it is fully loaded,
    // so it becomes the pool's first usable slot rather than being discarded)
    {
        InferenceSlot s;
        s.deviceIndex = 0;
        s.busy = false;
        s.memUsed = memPerSlot;
        s.engine = std::move(probeEngine);
        m_slots.push_back(std::move(s));
    }
    m_deviceInfos[0].slotsAllocated = 1;
    m_deviceInfos[0].memoryPerSlotBytes = memPerSlot;
    // ──────────────────────────────────────────────────────────────────
    // 3. Auto-cap: VRAM-fraction budget per model
    //
    //    When maxSlotsPerGpu is -1 (the default), each model pool
    //    auto-limits itself to kMaxVramFractionPerModel of total GPU
    //    VRAM. This prevents the first model loaded from consuming all
    //    memory and starving subsequent models in multi-model deployments.
    //    We use *total* VRAM (not free) as the budget base so the cap is
    //    consistent regardless of load order.
    //
    //    maxSlotsPerGpu == -1 → auto-cap from VRAM fraction (default)
    //    maxSlotsPerGpu >  0 → explicit cap (user override, unchanged)
    // ──────────────────────────────────────────────────────────────────
    constexpr double kMaxVramFractionPerModel = 0.25; // 25% of total VRAM
    int effectiveMaxSlotsPerGpu = maxSlotsPerGpu;
    if (maxSlotsPerGpu <= 0 && memPerSlot > 0) {
        // NOTE: budget is derived from GPU 0's total VRAM and applied to all
        // GPUs — consistent with the homogeneous-GPU assumption above.
        const size_t totalVram = m_deviceInfos[0].totalMemoryBytes;
        const size_t vramBudget = static_cast<size_t>(
            static_cast<double>(totalVram) * kMaxVramFractionPerModel);
        const int autoCap = std::max(1, static_cast<int>(vramBudget / memPerSlot));
        effectiveMaxSlotsPerGpu = autoCap;
        spdlog::info("MultiGpuEngineManager: VRAM auto-cap = {} slot(s)/GPU "
                     "(model {} MiB/slot, budget {} MiB = {}% of {} MiB total)",
                     autoCap, memPerSlot / 1048576,
                     vramBudget / 1048576,
                     static_cast<int>(kMaxVramFractionPerModel * 100),
                     totalVram / 1048576);
    }
    // ──────────────────────────────────────────────────────────────────
    // 4. Fill remaining capacity on every GPU.
    //
    //    For GPU 0:
    //      freeNow already reflects probe usage → slotsToAdd is the count
    //      of *additional* slots that fit, beyond the probe.
    //
    //    For GPU 1+:
    //      freeNow is the original available memory on that device, so
    //      slotsToAdd is the *total* slots for that device.
    // ──────────────────────────────────────────────────────────────────
    for (int di = 0; di < static_cast<int>(m_deviceInfos.size()); ++di) {
        cudaSetDevice(di);
        size_t freeNow = 0, totalNow = 0;
        cudaMemGetInfo(&freeNow, &totalNow);
        const size_t usableBytes = static_cast<size_t>(
            static_cast<double>(freeNow) * memSafetyFactor);
        // How many new Engine<T> instances fit in the usable memory?
        int slotsToAdd = (memPerSlot > 0)
            ? static_cast<int>(usableBytes / memPerSlot)
            : 0;
        // Apply VRAM-fraction auto-cap or explicit per-GPU cap.
        // GPU 0 already has the probe slot, so subtract 1 from its budget.
        if (effectiveMaxSlotsPerGpu > 0) {
            const int budget = (di == 0)
                ? (effectiveMaxSlotsPerGpu - 1)
                : effectiveMaxSlotsPerGpu;
            slotsToAdd = std::min(slotsToAdd, budget);
        }
        m_deviceInfos[di].memoryPerSlotBytes = memPerSlot;
        spdlog::info("MultiGpuEngineManager: GPU[{}] {} — "
                     "free {:.0f} MiB, usable {:.0f} MiB → adding {} slot(s)",
                     di, m_deviceInfos[di].name,
                     freeNow / 1048576.0,
                     usableBytes / 1048576.0,
                     slotsToAdd);
        for (int s = 0; s < slotsToAdd; ++s) {
            ANSCENTER::Options opts = baseOptions;
            opts.deviceIndex = di;
            auto eng = std::make_unique<Engine<T>>(opts);
            const bool ok = fromOnnx
                ? eng->buildLoadNetwork(modelPath, subVals, divVals, normalize)
                : eng->loadNetwork    (modelPath, subVals, divVals, normalize);
            if (!ok) {
                // A failed load usually means VRAM ran out — stop adding slots
                // on this device but keep whatever loaded successfully.
                spdlog::warn("MultiGpuEngineManager: GPU[{}] — slot {}/{} load failed; "
                             "halting allocation on this device.",
                             di, s + 1, slotsToAdd);
                break;
            }
            InferenceSlot slot;
            slot.deviceIndex = di;
            slot.busy = false;
            slot.memUsed = memPerSlot;
            slot.engine = std::move(eng);
            m_slots.push_back(std::move(slot));
            m_deviceInfos[di].slotsAllocated++;
        }
    }
    m_totalCapacity = static_cast<int>(m_slots.size());
    printCapacityReport();
    if (m_totalCapacity == 0) {
        spdlog::error("MultiGpuEngineManager: Zero inference slots allocated — "
                      "check available GPU memory.");
        return false;
    }
    return true;
}
// ────────────────────────────────────────────────────────────────────────────
// runInference
// ────────────────────────────────────────────────────────────────────────────
/**
 * Run inference on the first idle slot, or reject if the pool is saturated.
 *
 * Slots are stored in ascending device-index order (all device-0 slots come
 * first), so the scan naturally prefers device 0. The mutex is held only for
 * the O(N) scan + flag flip — NOT during the GPU work — so threads using
 * different slots proceed in parallel.
 *
 * FIX: the slot is now released even if Engine<T>::runInference throws.
 * Previously an exception left the slot marked busy forever and leaked
 * m_activeCount, permanently shrinking pool capacity.
 *
 * @param inputs         [input_tensor][batch][GpuMat]
 * @param featureVectors [batch][output_tensor][values] — populated on return
 * @return true if inference completed successfully; false if every slot was
 *         busy (capacity reached) or the engine reported failure.
 */
template <typename T>
bool MultiGpuEngineManager<T>::runInference(
    const std::vector<std::vector<cv::cuda::GpuMat>>& inputs,
    std::vector<std::vector<std::vector<T>>>& featureVectors)
{
    // ── Acquire the first idle slot ───────────────────────────────────────
    InferenceSlot* slot = nullptr;
    {
        std::lock_guard<std::mutex> lock(m_slotMutex);
        for (auto& s : m_slots) {
            if (!s.busy) {
                s.busy = true;
                slot = &s;
                break;
            }
        }
    }
    if (!slot) {
        // All slots are in use. Enforce the capacity limit by refusing the
        // request rather than crashing or waiting indefinitely.
        spdlog::warn("MultiGpuEngineManager: Capacity reached — "
                     "all {}/{} inference slot(s) busy. "
                     "Request rejected; release a running inference first.",
                     m_activeCount.load(), m_totalCapacity);
        return false;
    }
    ++m_activeCount;
    // Helper that returns the slot to the pool; called on both the normal
    // and the exceptional path.
    auto releaseSlot = [this, slot]() {
        {
            std::lock_guard<std::mutex> lock(m_slotMutex);
            slot->busy = false;
        }
        --m_activeCount;
    };
    bool result = false;
    try {
        // Set the calling thread's CUDA device context to match the slot's
        // device. The engine's streams are bound to that device; calling
        // cudaSetDevice here ensures the calling thread's context matches in
        // multi-threaded scenarios where threads may have previously touched
        // a different device.
        cudaSetDevice(slot->deviceIndex);
        result = slot->engine->runInference(inputs, featureVectors);
    } catch (...) {
        releaseSlot();  // never strand a busy slot on an exception
        throw;
    }
    // ── Release the slot ──────────────────────────────────────────────────
    releaseSlot();
    return result;
}
// ────────────────────────────────────────────────────────────────────────────
// printCapacityReport
// ────────────────────────────────────────────────────────────────────────────
// ────────────────────────────────────────────────────────────────────────────
// printCapacityReport — log a pool summary followed by one line per GPU
// ────────────────────────────────────────────────────────────────────────────
template <typename T>
void MultiGpuEngineManager<T>::printCapacityReport() const
{
    spdlog::info("============================================================");
    spdlog::info("  MultiGpuEngineManager — Capacity Report");
    spdlog::info("============================================================");
    spdlog::info("  Total inference slots : {}", m_totalCapacity);
    spdlog::info("  Active inferences     : {}", m_activeCount.load());
    spdlog::info("  Available slots       : {}",
                 m_totalCapacity - m_activeCount.load());
    spdlog::info("------------------------------------------------------------");
    for (size_t idx = 0; idx < m_deviceInfos.size(); ++idx) {
        const GpuDeviceInfo& dev = m_deviceInfos[idx];
        spdlog::info("  GPU[{:d}] {:s} | SM {:d}.{:d} | "
                     "Total {:6.0f} MiB | Slots: {:2d} | Mem/slot: {:6.1f} MiB",
                     dev.index, dev.name,
                     dev.computeMajor, dev.computeMinor,
                     dev.totalMemoryBytes / 1048576.0,
                     dev.slotsAllocated,
                     dev.memoryPerSlotBytes / 1048576.0);
    }
    spdlog::info("============================================================");
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,887 @@
// ============================================================================
// EngineMultiGpu.inl
//
// Multi-GPU inference pool -- merged from MultiGpuEngineManager.h
//
// This file is #included at the bottom of engine.h and must not be compiled
// independently. It provides implementations for all pool-management methods
// declared inside Engine<T>:
//
// initializePool() -- build from ONNX, create pool
// initializePoolFromEngine() -- load pre-built TRT engine, create pool
// enumerateDevices() -- static CUDA device enumeration
// loadSlots() -- core pool allocation logic (private)
// runInferenceFromPool() -- thread-safe slot dispatch (private)
// getTotalCapacity() -- inline in engine.h
// getActiveInferences() -- inline in engine.h
// getAvailableSlots() -- inline in engine.h
// isAtCapacity() -- inline in engine.h
// printCapacityReport() -- human-readable pool status
// ============================================================================
// -- Static member definitions for global elastic slot cap --------------------
// Number of elastic slots currently alive across EVERY Engine<T> pool in the
// process; compared against s_globalElasticMax before growing.
template <typename T>
std::atomic<int> Engine<T>::s_globalElasticCount{0};
// Process-wide cap on elastic slots; recomputed from total VRAM on the first
// elastic pool init (see loadSlots), shared by all pools.
template <typename T>
std::atomic<int> Engine<T>::s_globalElasticMax{32}; // safe default, overwritten on first pool init
// Guards the one-time computation of s_globalElasticMax via std::call_once.
template <typename T>
std::once_flag Engine<T>::s_globalCapInitFlag;
// steady_clock timestamp (ms) of the most recent pool creation; elastic
// growth is deferred for a grace period after this to let other models
// create their probe engines first.
template <typename T>
std::atomic<int64_t> Engine<T>::s_lastPoolCreatedMs{0};
// ----------------------------------------------------------------------------
// enumerateDevices -- static, no model required
// ----------------------------------------------------------------------------
// ----------------------------------------------------------------------------
// enumerateDevices -- static, no model required
// ----------------------------------------------------------------------------
// Snapshot every CUDA device visible to this process. Exactly one entry is
// produced per device index so callers may use the vector index and the CUDA
// device index interchangeably. Returns empty when no driver / devices exist.
//
// FIX: CUDA error codes are now checked instead of ignored, and the calling
// thread's active device is restored afterwards (cudaSetDevice in the loop
// previously left the thread bound to the last enumerated GPU).
template <typename T>
/*static*/ std::vector<GpuDeviceInfo>
Engine<T>::enumerateDevices()
{
    int count = 0;
    if (cudaGetDeviceCount(&count) != cudaSuccess || count <= 0) {
        return {};
    }
    int previousDevice = 0;
    const bool restoreDevice = (cudaGetDevice(&previousDevice) == cudaSuccess);
    std::vector<GpuDeviceInfo> devices;
    devices.reserve(static_cast<size_t>(count));
    for (int i = 0; i < count; ++i) {
        GpuDeviceInfo info;
        info.index = i;
        cudaDeviceProp prop{};
        if (cudaGetDeviceProperties(&prop, i) == cudaSuccess) {
            info.name = prop.name;
            info.totalMemoryBytes = prop.totalGlobalMem;
            info.computeMajor = prop.major;
            info.computeMinor = prop.minor;
        }
        size_t freeBytes = 0, totalBytes = 0;
        if (cudaSetDevice(i) == cudaSuccess &&
            cudaMemGetInfo(&freeBytes, &totalBytes) == cudaSuccess) {
            info.freeMemoryAtInitBytes = freeBytes;
        }
        info.slotsAllocated = 0;
        info.memoryPerSlotBytes = 0;
        devices.push_back(std::move(info));
    }
    if (restoreDevice) {
        cudaSetDevice(previousDevice);
    }
    return devices;
}
// ----------------------------------------------------------------------------
// Public pool-init wrappers
// ----------------------------------------------------------------------------
template <typename T>
bool Engine<T>::initializePool(
    const ANSCENTER::Options& baseOptions,
    const std::string& onnxModelPath,
    const std::array<float, 3>& subVals,
    const std::array<float, 3>& divVals,
    bool normalize,
    int maxSlotsPerGpu,
    double memSafetyFactor)
{
    // Adopt baseOptions on *this* first so m_options is identical whether the
    // caller goes through initializePool() or the 6-param buildLoadNetwork().
    m_options = baseOptions;
    const bool ok = buildLoadNetwork(onnxModelPath, subVals, divVals, normalize,
                                     maxSlotsPerGpu, memSafetyFactor);
    return ok;
}
template <typename T>
bool Engine<T>::initializePoolFromEngine(
    const ANSCENTER::Options& baseOptions,
    const std::string& trtEnginePath,
    const std::array<float, 3>& subVals,
    const std::array<float, 3>& divVals,
    bool normalize,
    int maxSlotsPerGpu,
    double memSafetyFactor)
{
    // Pre-built engine path: adopt the caller's options, then defer to the
    // pool-aware loadNetwork() overload (no ONNX build step).
    m_options = baseOptions;
    const bool ok = loadNetwork(trtEnginePath, subVals, divVals, normalize,
                                maxSlotsPerGpu, memSafetyFactor);
    return ok;
}
// ----------------------------------------------------------------------------
// loadSlots -- core pool allocation logic
//
// Three modes based on maxSlotsPerGpu:
//
// 1 => ROUND-ROBIN (default)
// 1 slot per GPU, created at init. Tasks queue when all slots
// busy. Best balance of VRAM usage and multi-GPU utilisation.
// Example: 3 GPUs → 3 slots, round-robin dispatch.
//
// -1 => ELASTIC MODE
// Only the probe slot is pre-loaded. Additional slots are created
// on-demand by tryGrowPool() when concurrent requests arrive, and
// released by releaseIdleSlots() when idle. Higher throughput but
// higher VRAM usage — only recommended for large GPUs (≥ 8 GB).
//
// >1 => PRE-ALLOCATED MODE (explicit cap)
// Slots are created upfront, capped at maxSlotsPerGpu per GPU.
// Useful when the caller knows the required concurrency level.
// ----------------------------------------------------------------------------
template <typename T>
bool Engine<T>::loadSlots(
const ANSCENTER::Options& baseOptions,
const std::string& modelPath,
const std::array<float, 3>& subVals,
const std::array<float, 3>& divVals,
bool normalize,
bool fromOnnx,
int maxSlotsPerGpu,
double memSafetyFactor)
{
// -- 1. Enumerate GPUs --------------------------------------------------
m_deviceInfos = enumerateDevices();
if (m_deviceInfos.empty()) {
std::cout << "Error [Pool]: No CUDA-capable GPUs detected" << std::endl;
return false;
}
const bool elastic = (maxSlotsPerGpu <= 0);
m_elasticMode = elastic;
// Set global elastic slot cap ONCE based on total GPU VRAM.
// Budget: ~4 slots per GB. This cap is shared across ALL pools
// to prevent CUDA driver SRW lock convoy (30+ threads deadlocked).
// 4 GB → 12, 6 GB → 24, 8 GB → 32, 12 GB → 48, 24 GB → 96
if (elastic) {
std::call_once(s_globalCapInitFlag, [this]() {
int totalGB = 0;
for (const auto& dev : m_deviceInfos)
totalGB += static_cast<int>(dev.totalMemoryBytes / (1024ULL * 1024ULL * 1024ULL));
int cap = std::max(8, totalGB * 4); // minimum 8
s_globalElasticMax.store(cap);
std::cout << "Info [Pool]: Global elastic slot cap = "
<< cap << " (total " << totalGB << " GB VRAM x4)" << std::endl;
});
}
std::cout << "\n====================================================" << std::endl;
std::cout << "Engine Pool Initialization"
<< (elastic ? " [ELASTIC]" : " [PRE-ALLOCATED]") << std::endl;
std::cout << "====================================================" << std::endl;
std::cout << "Found " << m_deviceInfos.size() << " GPU(s):" << std::endl;
for (const auto& d : m_deviceInfos) {
std::cout << " GPU[" << d.index << "] " << d.name
<< " | SM " << d.computeMajor << "." << d.computeMinor
<< " | Total " << d.totalMemoryBytes / 1048576 << " MiB"
<< " | Free " << d.freeMemoryAtInitBytes / 1048576 << " MiB"
<< std::endl;
}
// Warn about heterogeneous GPUs -- TRT engine may not be compatible
for (size_t i = 1; i < m_deviceInfos.size(); ++i) {
if (m_deviceInfos[i].name != m_deviceInfos[0].name) {
std::cout << "Warning [Pool]: GPU[" << i << "] '" << m_deviceInfos[i].name
<< "' differs from GPU[0] '" << m_deviceInfos[0].name
<< "'. TRT engine binary may be incompatible with dissimilar GPUs."
<< std::endl;
}
}
// -- 2. Probe engine: measure per-slot VRAM footprint -------------------
//
// Memory delta = freeBeforeLoad - freeAfterLoad
// Includes: TRT runtime buffers, CUDA context overhead, I/O buffers,
// stream memory, and workspace allocated by Engine<T>.
//
// MULTI-GPU BALANCING: place the probe on the GPU with the most free
// VRAM. This naturally distributes engines across GPUs as each pool
// init consumes VRAM from its chosen GPU, making the *other* GPU
// the best candidate for the next pool.
int probeGpuIdx = 0;
{
size_t bestFree = 0;
for (const auto& d : m_deviceInfos) {
cudaSetDevice(d.index);
size_t freeNow = 0, totalNow = 0;
cudaMemGetInfo(&freeNow, &totalNow);
std::cout << " GPU[" << d.index << "] free VRAM: " << freeNow / 1048576 << " MiB" << std::endl;
if (freeNow > bestFree) {
bestFree = freeNow;
probeGpuIdx = d.index;
}
}
}
std::cout << "\nLoading probe engine on GPU[" << probeGpuIdx
<< "] (most free VRAM) to measure per-slot memory..." << std::endl;
cudaSetDevice(probeGpuIdx);
size_t freeBefore = 0, tmp = 0;
cudaMemGetInfo(&freeBefore, &tmp);
ANSCENTER::Options opts0 = baseOptions;
opts0.deviceIndex = probeGpuIdx;
auto probeEngine = std::make_unique<Engine<T>>(opts0);
const bool probeOk = fromOnnx
? probeEngine->buildLoadNetwork(modelPath, subVals, divVals, normalize)
: probeEngine->loadNetwork (modelPath, subVals, divVals, normalize);
if (!probeOk) {
logEngineEvent("[Engine] loadSlots FAIL: Probe engine failed on GPU["
+ std::to_string(probeGpuIdx) + "] for " + modelPath
+ " (freeVRAM before=" + std::to_string(freeBefore / 1048576) + " MiB)", true);
return false;
}
size_t freeAfter = 0;
cudaMemGetInfo(&freeAfter, &tmp);
// Floor the delta at 64 MiB to guard against measurement noise
constexpr size_t kMinSlotMemBytes = 64ULL * 1024 * 1024;
const size_t rawDelta = (freeBefore > freeAfter) ? (freeBefore - freeAfter) : 0ULL;
const size_t memPerSlot = std::max(rawDelta, kMinSlotMemBytes);
std::cout << "Info [Pool]: Memory per slot = " << memPerSlot / 1048576
<< " MiB (measured delta = " << rawDelta / 1048576 << " MiB)" << std::endl;
// Cache input/output tensor dims on *this* Engine so getInputDims() /
// getOutputDims() work correctly when the pool is the active code path.
m_inputDims = probeEngine->getInputDims();
m_outputDims = probeEngine->getOutputDims();
// Sync GPU-capped batch sizes from the probe engine. The build() function
// may have reduced maxBatchSize based on GPU VRAM tier; propagate that to
// the pool manager so callers see the actual runtime limits.
m_options.maxBatchSize = probeEngine->getOptions().maxBatchSize;
m_options.optBatchSize = probeEngine->getOptions().optBatchSize;
// Store per-slot measurement for on-demand growth
m_memPerSlot = memPerSlot;
// Promote the probe engine into the first slot on the chosen GPU
{
InferenceSlot s;
s.deviceIndex = probeGpuIdx;
s.busy = false;
s.memUsed = memPerSlot;
s.engine = std::move(probeEngine);
m_slots.push_back(std::move(s));
}
m_deviceInfos[probeGpuIdx].slotsAllocated = 1;
m_deviceInfos[probeGpuIdx].memoryPerSlotBytes = memPerSlot;
// -- 3. Store config for on-demand growth (elastic mode) -------------
m_poolModelPath = modelPath;
m_poolSubVals = subVals;
m_poolDivVals = divVals;
m_poolNormalize = normalize;
m_poolFromOnnx = fromOnnx;
m_poolSafetyFactor = memSafetyFactor;
if (elastic) {
// -- ELASTIC: only the probe slot is pre-loaded -----------------
std::cout << "Info [Pool]: Elastic mode -- starting with 1 probe slot."
<< " Additional slots will be created on-demand as concurrent"
<< " requests arrive and released when idle." << std::endl;
m_totalCapacity = 1;
// Mark creation time — elastic growth is deferred for s_elasticGraceSec
// to let other models create their probe engines first.
{
using namespace std::chrono;
auto now = duration_cast<milliseconds>(
steady_clock::now().time_since_epoch()).count();
s_lastPoolCreatedMs.store(now);
}
printCapacityReport();
startIdleTimer(); // Auto-cleanup idle slots periodically
return true;
}
// -- 4. PRE-ALLOCATED: compute per-GPU capacity, then interleave -----
//
// Phase A: determine how many slots each GPU can hold.
// Phase B: create slots in round-robin order across GPUs so that
// the linear m_nextSlotHint scan naturally distributes
// consecutive requests across GPUs:
// m_slots = [GPU0-s0, GPU1-s0, GPU2-s0, GPU0-s1, GPU1-s1, ...]
// This gives: Task1→GPU0, Task2→GPU1, Task3→GPU2, Task4→GPU0 ...
const int numGpus = static_cast<int>(m_deviceInfos.size());
// Phase A: compute slotsToAdd per GPU
std::vector<int> slotsPerGpu(numGpus, 0);
int maxSlotsAny = 0;
for (int di = 0; di < numGpus; ++di) {
cudaSetDevice(di);
size_t freeNow = 0, totalNow = 0;
cudaMemGetInfo(&freeNow, &totalNow);
const size_t usableBytes = static_cast<size_t>(
static_cast<double>(freeNow) * memSafetyFactor);
int slotsToAdd = (memPerSlot > 0)
? static_cast<int>(usableBytes / memPerSlot) : 0;
// Apply explicit per-GPU cap; the probe GPU already has the probe slot
if (maxSlotsPerGpu > 0) {
const int budget = (di == probeGpuIdx)
? (maxSlotsPerGpu - 1)
: maxSlotsPerGpu;
slotsToAdd = std::min(slotsToAdd, budget);
}
slotsPerGpu[di] = slotsToAdd;
if (slotsToAdd > maxSlotsAny) maxSlotsAny = slotsToAdd;
m_deviceInfos[di].memoryPerSlotBytes = memPerSlot;
std::cout << "Info [Pool]: GPU[" << di << "] " << m_deviceInfos[di].name
<< " -- free " << freeNow / 1048576 << " MiB"
<< ", usable " << usableBytes / 1048576 << " MiB"
<< " => will add " << slotsToAdd << " slot(s)" << std::endl;
}
// Phase B: create slots interleaved across GPUs
// Round 0: GPU0-slot0, GPU1-slot0, GPU2-slot0
// Round 1: GPU0-slot1, GPU1-slot1, GPU2-slot1
// ...
std::vector<int> slotsCreated(numGpus, 0); // track actual success per GPU
std::vector<bool> gpuFailed(numGpus, false); // stop trying failed GPUs
for (int round = 0; round < maxSlotsAny; ++round) {
for (int di = 0; di < numGpus; ++di) {
if (gpuFailed[di]) continue;
if (slotsCreated[di] >= slotsPerGpu[di]) continue;
cudaSetDevice(di);
ANSCENTER::Options opts = baseOptions;
opts.deviceIndex = di;
auto eng = std::make_unique<Engine<T>>(opts);
eng->setVerbose(false);
eng->setDisableGraphs(true); // concurrent graph captures corrupt CUDA context
eng->m_skipEngineCache = m_skipEngineCache; // propagate to pool slots
const bool ok = fromOnnx
? eng->buildLoadNetwork(modelPath, subVals, divVals, normalize)
: eng->loadNetwork (modelPath, subVals, divVals, normalize);
if (!ok) {
std::cout << "Warning [Pool]: GPU[" << di << "] slot "
<< (slotsCreated[di] + 1) << "/" << slotsPerGpu[di]
<< " failed to load; halting allocation on this device." << std::endl;
gpuFailed[di] = true;
continue;
}
InferenceSlot slot;
slot.deviceIndex = di;
slot.busy = false;
slot.memUsed = memPerSlot;
slot.engine = std::move(eng);
m_slots.push_back(std::move(slot));
m_deviceInfos[di].slotsAllocated++;
slotsCreated[di]++;
}
}
m_totalCapacity = static_cast<int>(m_slots.size());
printCapacityReport();
if (m_totalCapacity == 0) {
std::cout << "Error [Pool]: Zero inference slots allocated -- "
"check available GPU memory." << std::endl;
return false;
}
return true;
}
// ----------------------------------------------------------------------------
// tryGrowPool -- on-demand slot creation (elastic mode)
//
// Called by runInferenceFromPool when every alive slot is busy.
// Creates ONE new engine on the first GPU that has enough free VRAM.
// GPUs are scanned in order (0, 1, ...), concentrating load on GPU 0 first.
//
// Returns a pointer to the new slot (already marked busy) or nullptr if
// no GPU has enough VRAM.
//
// Thread-safety: m_growMutex serialises growth so only one thread creates
// a slot at a time. m_slotMutex is acquired briefly to push the new slot
// into the deque. The calling thread waits (engine deserialisation takes
// ~0.5-3 s), but that is far better than rejecting the request entirely.
// ----------------------------------------------------------------------------
template <typename T>
typename Engine<T>::InferenceSlot*
Engine<T>::tryGrowPool(bool bypassGrace)
{
    // Serialise growth: only one thread may create a slot at a time.
    std::lock_guard<std::mutex> growLock(m_growMutex);
    // Grace period: defer elastic growth for s_elasticGraceSec after the most
    // recent pool creation. This reserves VRAM for probe engines that haven't
    // been created yet (e.g., 10 models loading sequentially — early pools
    // shouldn't grow elastic slots while later probes still need VRAM).
    // Bypassed for demand-driven growth (a new consumer explicitly joined the
    // pool, so we KNOW more slots are needed).
    if (!bypassGrace) {
        using namespace std::chrono;
        auto now = duration_cast<milliseconds>(
            steady_clock::now().time_since_epoch()).count();
        int64_t lastCreated = s_lastPoolCreatedMs.load();
        int64_t elapsedSec = (now - lastCreated) / 1000;
        if (lastCreated > 0 && elapsedSec < s_elasticGraceSec) {
            // Silently skip — don't spam logs during grace period
            return nullptr;
        }
    }
    // Global cap: prevent too many concurrent CUDA operations across ALL pools.
    // With shared engine pools, unlimited elastic growth causes CUDA driver
    // SRW lock convoy (30+ threads all blocked on nvcuda64 internal locks).
    const int currentGlobal = s_globalElasticCount.load();
    const int maxGlobal = s_globalElasticMax.load();
    if (currentGlobal >= maxGlobal) {
        std::cout << "Info [Pool]: tryGrowPool -- global cap reached ("
                  << currentGlobal << "/" << maxGlobal
                  << " total slots), not growing" << std::endl;
        return nullptr;
    }
    // Find the GPU with the most free VRAM that has enough for one more slot.
    // This naturally balances load across GPUs instead of always filling GPU 0.
    // Dividing the measured per-slot footprint by the safety factor inflates
    // the requirement (assuming m_poolSafetyFactor <= 1), preserving the same
    // headroom the pre-allocated path reserved — TODO confirm factor range.
    const size_t requiredBytes = (m_poolSafetyFactor > 0.0)
        ? static_cast<size_t>(static_cast<double>(m_memPerSlot) / m_poolSafetyFactor)
        : m_memPerSlot;
    std::cout << "Info [Pool]: tryGrowPool called -- need " << (requiredBytes >> 20)
              << " MiB per slot, scanning " << m_deviceInfos.size() << " GPU(s)..."
              << std::endl;
    // Sort device candidates by free VRAM descending (most free first)
    std::vector<std::pair<size_t, int>> gpuByFreeVram; // {freeBytes, deviceIndex}
    for (const auto& dev : m_deviceInfos) {
        cudaSetDevice(dev.index);
        size_t freeNow = 0, totalNow = 0;
        cudaMemGetInfo(&freeNow, &totalNow);
        std::cout << "Info [Pool]: GPU[" << dev.index << "] free=" << (freeNow >> 20)
                  << " MiB, required=" << (requiredBytes >> 20) << " MiB"
                  << (freeNow >= requiredBytes ? " -> CANDIDATE" : " -> SKIP (not enough)")
                  << std::endl;
        if (freeNow >= requiredBytes) {
            gpuByFreeVram.push_back({freeNow, dev.index});
        }
    }
    std::sort(gpuByFreeVram.begin(), gpuByFreeVram.end(),
              [](const auto& a, const auto& b) { return a.first > b.first; });
    if (gpuByFreeVram.empty()) {
        std::cout << "Warning [Pool]: tryGrowPool -- no GPU has enough free VRAM ("
                  << (requiredBytes >> 20) << " MiB), cannot grow" << std::endl;
        return nullptr;
    }
    // NOTE(review): m_deviceInfos is indexed directly by device index here,
    // which assumes m_deviceInfos[i].index == i (same convention used by the
    // pool-init code) — confirm if device enumeration can ever be sparse.
    for (const auto& [freeVram, devIdx] : gpuByFreeVram) {
        auto& dev = m_deviceInfos[devIdx];
        std::cout << "Info [Pool]: Creating on-demand slot on GPU[" << dev.index
                  << "] (free=" << (freeVram >> 20) << " MiB)..." << std::endl;
        // Create a new engine on the GPU with the most free VRAM
        cudaSetDevice(dev.index);
        ANSCENTER::Options opts = m_options;
        opts.deviceIndex = dev.index;
        auto eng = std::make_unique<Engine<T>>(opts);
        eng->setVerbose(false);
        eng->setDisableGraphs(true); // concurrent graph captures corrupt CUDA context
        eng->m_skipEngineCache = m_skipEngineCache; // propagate to on-demand slots
        eng->m_skipOnnxRebuild = true; // elastic growth must NOT delete/rebuild engine files
        eng->m_skipOnnxBuild = bypassGrace; // demand-driven growth: skip ONNX→TRT if no cached engine
        const bool ok = m_poolFromOnnx
            ? eng->buildLoadNetwork(m_poolModelPath, m_poolSubVals,
                                    m_poolDivVals, m_poolNormalize)
            : eng->loadNetwork(m_poolModelPath, m_poolSubVals,
                               m_poolDivVals, m_poolNormalize);
        if (!ok) {
            std::cout << "Warning [Pool]: On-demand slot creation FAILED on GPU["
                      << dev.index << "]" << std::endl;
            continue; // try next GPU
        }
        std::cout << "Info [Pool]: On-demand slot engine loaded OK on GPU["
                  << dev.index << "]" << std::endl;
        // Check if we can reuse a dead slot entry (engine == nullptr)
        {
            std::lock_guard<std::mutex> slotLock(m_slotMutex);
            for (auto& s : m_slots) {
                if (!s.engine) { // dead entry -- recycle it
                    // The new slot is returned pre-marked busy: the caller
                    // either uses it immediately or explicitly releases it.
                    s.deviceIndex = dev.index;
                    s.busy = true;
                    s.memUsed = m_memPerSlot;
                    s.engine = std::move(eng);
                    s.lastUsedTime = std::chrono::steady_clock::now();
                    dev.slotsAllocated++;
                    // Recount alive slots
                    int alive = 0;
                    for (const auto& x : m_slots) { if (x.engine) ++alive; }
                    m_totalCapacity = alive;
                    s_globalElasticCount++;
                    std::cout << "Info [Pool]: On-demand slot recycled on GPU["
                              << dev.index << "] -- pool now " << m_totalCapacity
                              << " slot(s) (global " << s_globalElasticCount.load()
                              << "/" << s_globalElasticMax.load() << ")" << std::endl;
                    return &s;
                }
            }
            // No dead entries to recycle -- push a new one.
            // std::deque::push_back does NOT invalidate references to existing
            // elements, so pointers held by other threads remain valid.
            InferenceSlot newSlot;
            newSlot.deviceIndex = dev.index;
            newSlot.busy = true;
            newSlot.memUsed = m_memPerSlot;
            newSlot.engine = std::move(eng);
            newSlot.lastUsedTime = std::chrono::steady_clock::now();
            m_slots.push_back(std::move(newSlot));
            dev.slotsAllocated++;
            m_totalCapacity = static_cast<int>(m_slots.size()); // all alive here
            s_globalElasticCount++;
            std::cout << "Info [Pool]: On-demand slot created on GPU["
                      << dev.index << "] -- pool now " << m_totalCapacity
                      << " slot(s) (global " << s_globalElasticCount.load()
                      << "/" << s_globalElasticMax.load() << ")" << std::endl;
            return &m_slots.back();
        }
    }
    return nullptr; // every GPU is full
}
// ----------------------------------------------------------------------------
// growPool -- public demand-driven growth (bypasses grace period)
// ----------------------------------------------------------------------------
template <typename T>
int Engine<T>::growPool(int count)
{
    // Demand-driven growth entry point: bypasses the elastic grace period and
    // creates up to `count` slots, stopping at the first failure (no VRAM /
    // global cap reached). Returns how many slots were actually created.
    int numCreated = 0;
    while (numCreated < count) {
        InferenceSlot* fresh = tryGrowPool(/*bypassGrace=*/true);
        if (fresh == nullptr)
            break; // cannot grow any further
        // tryGrowPool hands the slot back pre-marked busy; release it so
        // inference threads can claim it.
        {
            std::lock_guard<std::mutex> guard(m_slotMutex);
            fresh->busy = false;
            fresh->lastUsedTime = std::chrono::steady_clock::now();
        }
        m_slotFreeCv.notify_one();
        ++numCreated;
    }
    return numCreated;
}
// ----------------------------------------------------------------------------
// runInferenceFromPool -- thread-safe slot dispatch
// ----------------------------------------------------------------------------
template <typename T>
bool Engine<T>::runInferenceFromPool(
    const std::vector<std::vector<cv::cuda::GpuMat>>& inputs,
    std::vector<std::vector<std::vector<T>>>& featureVectors)
{
    // -- 1. Acquire an idle, alive slot (round-robin) --------------------
    //
    // Round-robin starting point avoids always favouring GPU 0. Each call
    // advances m_nextSlotHint so consecutive requests spread across GPUs.
    // The mutex is held only for the O(N) scan + flag flip -- NOT during GPU
    // execution -- so threads using different slots proceed in parallel.
    //
    // PROACTIVE GROWTH (elastic mode):
    //   If all alive slots are busy when a request arrives, the pool is
    //   undersized for the current concurrency level. We kick off pool
    //   growth (tryGrowPool) in a detached background thread while we
    //   wait for the current slot to free. This ensures multi-GPU
    //   utilisation: the new slot lands on the GPU with the most free
    //   VRAM (typically GPU[1]) and is ready for the *next* request.
    //   Growth is serialised by m_growMutex so duplicate threads are
    //   harmless — the second one finds a fresh slot immediately.
    InferenceSlot* slot = nullptr;
    bool kickedGrowth = false;
    {
        std::unique_lock<std::mutex> lock(m_slotMutex);
        // Hard deadline: reject the request if no slot frees within 2 s.
        const auto deadline = std::chrono::steady_clock::now()
                            + std::chrono::milliseconds(2000);
        while (!slot) {
            const size_t n = m_slots.size();
            if (n > 0) {
                const size_t start = m_nextSlotHint.load() % n;
                for (size_t i = 0; i < n; ++i) {
                    auto& s = m_slots[(start + i) % n];
                    if (!s.busy && s.engine) {   // alive and idle
                        s.busy = true;
                        slot = &s;
                        m_nextSlotHint = (start + i + 1) % n;
                        break;
                    }
                }
            }
            if (!slot) {
                // All slots busy. In elastic mode, proactively grow the
                // pool in the background so the next request has a slot
                // on a different GPU. We only kick once per wait cycle.
                if (m_elasticMode && !kickedGrowth
                    && s_globalElasticCount.load() < s_globalElasticMax.load()) {
                    kickedGrowth = true;
                    std::cout << "Info [Pool]: All slots busy -- kicking background growth thread"
                              << std::endl;
                    // Fire-and-forget: tryGrowPool is serialised by
                    // m_growMutex, so concurrent kicks are safe.
                    // NOTE(review): the detached thread captures `this`; the
                    // Engine must outlive it (e.g. during shutdown) — confirm
                    // the destructor ordering guarantees this.
                    std::thread([this]() {
                        std::cout << "Info [Pool]: Background growth thread started" << std::endl;
                        auto* newSlot = this->tryGrowPool();
                        if (newSlot) {
                            // Slot was created pre-marked busy; release it
                            // so the next requester can claim it.
                            {
                                std::lock_guard<std::mutex> lk(m_slotMutex);
                                newSlot->busy = false;
                                newSlot->lastUsedTime = std::chrono::steady_clock::now();
                            }
                            m_slotFreeCv.notify_all();
                            std::cout << "Info [Pool]: Background growth SUCCEEDED -- new slot on GPU["
                                      << newSlot->deviceIndex << "], pool now "
                                      << m_totalCapacity << " slot(s)" << std::endl;
                        } else {
                            std::cout << "Warning [Pool]: Background growth FAILED -- no slot created"
                                      << std::endl;
                        }
                    }).detach();
                }
                // Wait for a running slot to finish and signal us
                if (m_slotFreeCv.wait_until(lock, deadline)
                        == std::cv_status::timeout) {
                    break;  // fall through to reject
                }
            }
        }
    }
    // -- 2. Still no slot => reject ---------------------------------------
    if (!slot) {
        std::string errMsg = "[Engine] runInferenceFromPool FAIL: Capacity reached -- all "
            + std::to_string(m_activeCount.load()) + "/" + std::to_string(m_totalCapacity)
            + " slot(s) busy"
            + (m_elasticMode ? " and all GPUs full" : "")
            + ". Request rejected (2s timeout).";
        std::cout << errMsg << std::endl;
        logEngineEvent(errMsg, true);
        return false;
    }
    ++m_activeCount;
    // -- RAII guard: guarantee busy-flag and activeCount are restored ----------
    // If runInference() throws (cv::Exception, std::bad_alloc, ...) the slot
    // must be released and the counter decremented -- otherwise the slot is
    // permanently lost and capacity shrinks with every exception.
    bool result = false;
    try {
        // Match the calling thread's CUDA context to the slot's device.
        // Skip the call if the thread is already on the correct device
        // (cudaSetDevice under WDDM can cost 1-5ms per call).
        int currentDev = -1;
        cudaGetDevice(&currentDev);
        if (currentDev != slot->deviceIndex) {
            cudaSetDevice(slot->deviceIndex);
        }
        result = slot->engine->runInference(inputs, featureVectors);
    }
    catch (const std::exception& ex) {
        std::cout << "Error [Pool]: runInference threw: " << ex.what() << std::endl;
    }
    catch (...) {
        std::cout << "Error [Pool]: runInference threw unknown exception" << std::endl;
    }
    // Release the slot and wake one waiter (failure paths included).
    {
        std::lock_guard<std::mutex> lock(m_slotMutex);
        slot->busy = false;
        slot->lastUsedTime = std::chrono::steady_clock::now();
    }
    --m_activeCount;
    m_slotFreeCv.notify_one();   // wake one thread waiting for a free slot
    return result;
}
// ----------------------------------------------------------------------------
// releaseIdleSlots -- VRAM reclamation for elastic pools
//
// Destroys engine instances that have been idle for at least `idleSeconds`.
// The first slot (probe, index 0) is never released so the model remains
// instantly usable without re-measurement.
//
// Dead slots are NOT erased from the deque (to avoid invalidating pointers);
// their engine is reset to nullptr and they are recycled by tryGrowPool().
//
// Call from a periodic background timer, e.g. every 10-30 seconds:
// engine->releaseIdleSlots(30.0);
// ----------------------------------------------------------------------------
template <typename T>
int Engine<T>::releaseIdleSlots(double idleSeconds)
{
    // Reclaim VRAM from slots idle for >= idleSeconds. Lock order matches
    // tryGrowPool: growth mutex first, then the slot deque guard.
    std::lock_guard<std::mutex> growGuard(m_growMutex);
    std::lock_guard<std::mutex> slotGuard(m_slotMutex);
    const auto timeNow = std::chrono::steady_clock::now();
    int freedCount = 0;
    // Index 0 is the probe slot -- it is never destroyed so the model stays
    // instantly usable without re-measuring its VRAM footprint.
    for (size_t idx = 1; idx < m_slots.size(); ++idx) {
        auto& slot = m_slots[idx];
        if (slot.busy || !slot.engine)
            continue;  // in use, or already a dead (recyclable) entry
        const double idleFor = std::chrono::duration<double>(
            timeNow - slot.lastUsedTime).count();
        if (idleFor < idleSeconds)
            continue;  // not idle long enough yet
        // Book-keeping on the owning device record
        for (auto& dev : m_deviceInfos) {
            if (dev.index == slot.deviceIndex) {
                if (dev.slotsAllocated > 0) dev.slotsAllocated--;
                break;
            }
        }
        std::cout << "Info [Pool]: Releasing idle slot on GPU["
                  << slot.deviceIndex << "] (idle "
                  << static_cast<int>(idleFor) << "s)" << std::endl;
        // Destroying the engine frees its GPU memory; the deque entry stays
        // in place (dead) so pointers held elsewhere remain valid and
        // tryGrowPool() can recycle it later.
        slot.engine.reset();
        slot.memUsed = 0;
        ++freedCount;
        s_globalElasticCount--;
    }
    // Refresh the alive-slot count after the sweep
    int aliveCount = 0;
    for (const auto& entry : m_slots) {
        if (entry.engine) ++aliveCount;
    }
    m_totalCapacity = aliveCount;
    if (freedCount > 0) {
        std::cout << "Info [Pool]: Released " << freedCount << " idle slot(s)"
                  << " -- pool now " << m_totalCapacity << " alive slot(s)"
                  << std::endl;
    }
    return freedCount;
}
// ----------------------------------------------------------------------------
// printCapacityReport
// ----------------------------------------------------------------------------
template <typename T>
void Engine<T>::printCapacityReport() const
{
    // Snapshot alive/dead counts under the slot lock so the report is
    // consistent even while tryGrowPool runs concurrently.
    std::lock_guard<std::mutex> lock(m_slotMutex);
    int aliveSlots = 0;
    int deadSlots = 0;
    for (const auto& entry : m_slots) {
        if (entry.engine) {
            ++aliveSlots;
        } else {
            ++deadSlots;
        }
    }
    const char* modeTag = m_elasticMode ? " [ELASTIC]" : " [PRE-ALLOCATED]";
    std::cout << "\n=====================================================" << std::endl;
    std::cout << " Engine Pool -- Capacity Report" << modeTag << std::endl;
    std::cout << "=====================================================" << std::endl;
    std::cout << " Alive inference slots : " << aliveSlots << std::endl;
    if (deadSlots > 0) {
        std::cout << " Dead (recyclable)   : " << deadSlots << std::endl;
    }
    std::cout << " Active inferences   : " << m_activeCount.load() << std::endl;
    std::cout << " Available slots     : "
              << (aliveSlots - m_activeCount.load())
              << (m_elasticMode ? " (+ on-demand)" : "")
              << std::endl;
    if (m_elasticMode) {
        std::cout << " Global slot usage   : "
                  << s_globalElasticCount.load() << "/" << s_globalElasticMax.load()
                  << " (across all pools)" << std::endl;
    }
    std::cout << " Memory per slot     : " << m_memPerSlot / 1048576 << " MiB" << std::endl;
    std::cout << "-----------------------------------------------------" << std::endl;
    for (const auto& d : m_deviceInfos) {
        std::cout << " GPU[" << d.index << "] " << d.name
                  << " | SM " << d.computeMajor << "." << d.computeMinor
                  << " | Total " << d.totalMemoryBytes / 1048576 << " MiB"
                  << " | Slots: " << d.slotsAllocated
                  << " | Mem/slot: " << d.memoryPerSlotBytes / 1048576 << " MiB"
                  << std::endl;
    }
    std::cout << "=====================================================" << std::endl;
}
// ----------------------------------------------------------------------------
// startIdleTimer / stopIdleTimer -- automatic idle-slot cleanup
//
// A background thread wakes every m_idleTimerIntervalSec seconds and calls
// releaseIdleSlots(m_idleTimerThresholdSec). The thread uses a
// condition_variable with a timed wait so that stopIdleTimer() can wake it
// immediately for a clean shutdown (no dangling sleeps).
//
// Only active in elastic mode -- pre-allocated pools have fixed capacity.
// ----------------------------------------------------------------------------
template <typename T>
void Engine<T>::startIdleTimer()
{
    // Launch the periodic idle-slot reaper thread (elastic pools only).
    if (!m_elasticMode) return;                // pre-allocated pools have fixed capacity
    if (m_idleTimerThread.joinable()) return;  // timer already active
    m_idleTimerStop = false;
    m_idleTimerThread = std::thread([this]() {
        std::cout << "Info [Pool]: Idle-slot cleanup timer started "
                  << "(interval=" << m_idleTimerIntervalSec << "s, threshold="
                  << m_idleTimerThresholdSec << "s)" << std::endl;
        for (;;) {
            if (m_idleTimerStop.load())
                break;
            // Timed wait -- stopIdleTimer() signals the condvar so shutdown
            // does not have to wait out the full interval.
            {
                std::unique_lock<std::mutex> guard(m_idleTimerMutex);
                m_idleTimerCv.wait_for(guard,
                    std::chrono::duration<double>(m_idleTimerIntervalSec),
                    [this]() { return m_idleTimerStop.load(); });
            }
            if (m_idleTimerStop.load())
                break;
            releaseIdleSlots(m_idleTimerThresholdSec);
        }
        std::cout << "Info [Pool]: Idle-slot cleanup timer stopped." << std::endl;
    });
}
template <typename T>
void Engine<T>::stopIdleTimer()
{
    // Signal the reaper thread to exit and reclaim it.
    if (!m_idleTimerThread.joinable()) return;  // timer never started
    m_idleTimerStop = true;
    m_idleTimerCv.notify_all();  // interrupt the timed wait immediately
    // During ExitProcess the OS has already killed worker threads; joining
    // one of those deadlocks or triggers std::terminate. Detach in that case
    // and let process teardown reclaim everything.
    const bool exiting = g_processExiting().load(std::memory_order_relaxed);
    if (exiting) {
        m_idleTimerThread.detach();
    } else {
        m_idleTimerThread.join();  // normal path: wait for clean exit
    }
}

View File

@@ -0,0 +1,431 @@
#pragma once
// EnginePoolManager.h — Process-wide cache for shared Engine<T> pool instances.
//
// When multiple AI tasks load the same model (same ONNX path + GPU + config),
// this manager ensures they share a SINGLE Engine<T> pool instead of each task
// creating its own pool with independent execution contexts and VRAM buffers.
//
// Without sharing: N tasks × ~500 MB = N × 500 MB VRAM (OOM at ~5-8 tasks on 8GB GPU)
// With sharing: 1 pool × ~500 MB = 500 MB total (unlimited tasks, slower via queuing)
//
// Lazy eviction: when refcount drops to 0, the pool is kept alive for
// kEvictGraceSec seconds. If a new task acquires it within that window,
// it gets an instant HIT without rebuilding. This handles the LabView
// edit/duplicate/create cycle (destroy → recreate) gracefully.
//
// Thread-safety: All public methods are mutex-protected.
#include <memory>
#include <mutex>
#include <string>
#include <unordered_map>
#include <array>
#include <iostream>
#include <functional>
#include <chrono>
#include <thread>
#include <atomic>
#include <cuda_runtime.h>
#include "TRTEngineCache.h" // constructor touches TRTEngineCache::instance() for destruction ordering
#ifdef _WIN32
#include <windows.h>
#endif
// Forward declare Engine<T> to avoid circular includes.
// The header that includes this must also include engine.h.
template <typename T> class Engine;
namespace ANSCENTER { struct Options; }
template <typename T>
class EnginePoolManager {
public:
    // Meyers-singleton accessor: the manager is constructed on first use and
    // is thread-safe to initialise (guaranteed since C++11).
    static EnginePoolManager& instance() {
        static EnginePoolManager s_instance;
        return s_instance;
    }
// ========================================================================
// Cache key — uniquely identifies a compatible Engine pool.
// ========================================================================
    struct PoolKey {
        std::string modelPath;  // model file path (also echoed in log lines)
        int precision = 0;      // cast from Precision enum
        int maxBatch = 1;       // maximum batch size the pool was built for
        // Keys are equal only when all three fields match, i.e. the cached
        // pool is fully compatible with the request.
        // NOTE(review): deviceIndex is not part of the key, so pools are
        // shared regardless of requested GPU -- confirm this is intended.
        bool operator==(const PoolKey& o) const {
            return modelPath == o.modelPath &&
                   precision == o.precision &&
                   maxBatch == o.maxBatch;
        }
    };
struct PoolKeyHash {
size_t operator()(const PoolKey& k) const {
size_t h = std::hash<std::string>{}(k.modelPath);
h ^= std::hash<int>{}(k.precision) << 16;
h ^= std::hash<int>{}(k.maxBatch) << 24;
return h;
}
};
// ========================================================================
// acquire() — get or create a shared Engine pool.
//
// On first call for a given key: creates a new Engine<T>, calls
// buildLoadNetwork with the provided parameters, and caches it.
//
// On subsequent calls (or within lazy-eviction grace period):
// returns the existing shared_ptr and increments refcount.
// No VRAM allocated, near-instant.
//
// Returns nullptr if engine creation/loading fails.
// ========================================================================
    std::shared_ptr<Engine<T>> acquire(
        const PoolKey& key,
        const ANSCENTER::Options& options,
        const std::string& modelPath,
        const std::array<float, 3>& subVals,
        const std::array<float, 3>& divVals,
        bool normalize,
        int maxSlotsPerGpu)
    {
        // Optimizer / temporary engines: maxSlotsPerGpu==0 means the caller
        // only needs a lightweight, non-shared engine (e.g., OptimizeModelStr).
        // Bypass the pool cache entirely:
        //   - Don't hold m_mutex (which blocks ALL other pool creation)
        //   - Don't cache the result (temporary engine is destroyed on release)
        //   - Use the simple 4-param buildLoadNetwork (no pool, no probe, no VRAM measurement)
        // Note: maxSlotsPerGpu==1 is now the normal "1 slot per GPU" multi-GPU
        // round-robin mode, so it goes through the pool path below.
        if (maxSlotsPerGpu == 0) {
            logEvent("[EnginePoolManager] BYPASS (maxSlots=0): " + key.modelPath
                     + " — creating non-shared engine");
            auto engine = std::make_shared<Engine<T>>(options);
            bool ok = engine->buildLoadNetwork(modelPath, subVals, divVals, normalize);
            return ok ? engine : nullptr;
        }
        std::unique_lock<std::mutex> lock(m_mutex);
        auto it = m_pools.find(key);
        if (it != m_pools.end()) {
            // Cache HIT: bump refcount and cancel any pending lazy eviction.
            it->second.refcount++;
            it->second.evictTime = TimePoint{};  // cancel pending eviction
            int refs = it->second.refcount;
            auto engine = it->second.engine;
            logEvent("[EnginePoolManager] HIT: " + key.modelPath
                     + " refs=" + std::to_string(refs));
            // Demand-driven growth: only in elastic mode (maxSlotsPerGpu <= 0
            // or > 1). With maxSlotsPerGpu==1 (round-robin default), the pool
            // already has the right number of slots (1 per GPU) — tasks queue
            // when all slots are busy, which is the intended behavior.
            if (maxSlotsPerGpu != 1 && refs > 1 && engine) {
                int alive = engine->getTotalCapacity();
                if (alive < refs) {
                    // Check total GPU VRAM — skip growth on small GPUs
                    size_t totalVram = 0;
                    {
                        size_t freeTmp = 0;
                        cudaSetDevice(options.deviceIndex);
                        cudaMemGetInfo(&freeTmp, &totalVram);
                    }
                    constexpr size_t kMinVramForGrowth = 6ULL * 1024 * 1024 * 1024; // 6 GB
                    if (totalVram >= kMinVramForGrowth) {
                        lock.unlock();  // release PoolManager lock before growing
                        // The detached thread captures `engine` (shared_ptr)
                        // by value, so the pool stays alive for its duration.
                        std::thread([engine, alive, refs, modelPath = key.modelPath]() {
                            int created = engine->growPool(1);
                            if (created > 0) {
                                logEngineEvent("[EnginePoolManager] DEMAND GROWTH: " + modelPath
                                    + " grew from " + std::to_string(alive)
                                    + " to " + std::to_string(engine->getTotalCapacity())
                                    + " slots (refs=" + std::to_string(refs) + ")");
                            }
                        }).detach();
                    } else {
                        logEvent("[EnginePoolManager] SKIP GROWTH: " + key.modelPath
                            + " (GPU VRAM " + std::to_string(totalVram >> 20)
                            + " MiB < 6 GB threshold, refs=" + std::to_string(refs) + ")");
                    }
                }
            }
            return engine;
        }
        // Cache miss — create new Engine pool
        logEvent("[EnginePoolManager] MISS: Creating pool for " + key.modelPath + "...");
        // Log VRAM before attempting to create probe
        {
            size_t freeMem = 0, totalMem = 0;
            cudaSetDevice(options.deviceIndex);
            cudaMemGetInfo(&freeMem, &totalMem);
            logEvent("[EnginePoolManager] GPU[" + std::to_string(options.deviceIndex)
                + "] VRAM: " + std::to_string(freeMem >> 20) + " MiB free / "
                + std::to_string(totalMem >> 20) + " MiB total (before probe)");
        }
        auto engine = std::make_shared<Engine<T>>(options);
        bool ok = engine->buildLoadNetwork(modelPath, subVals, divVals, normalize, maxSlotsPerGpu);
        if (!ok) {
            // Step 1: Force-evict all pools with refcount=0 to reclaim VRAM
            int evicted = forceEvictPending();
            if (evicted > 0) {
                size_t freeMem2 = 0, totalMem2 = 0;
                cudaSetDevice(options.deviceIndex);
                cudaMemGetInfo(&freeMem2, &totalMem2);
                logEvent("[EnginePoolManager] RETRY EVICT: Force-evicted " + std::to_string(evicted)
                    + " pending pool(s), now " + std::to_string(freeMem2 >> 20)
                    + " MiB free. Retrying " + key.modelPath + "...");
                engine = std::make_shared<Engine<T>>(options);
                ok = engine->buildLoadNetwork(modelPath, subVals, divVals, normalize, maxSlotsPerGpu);
            }
            // Step 2: If still failing, retry with lightweight mode (no elastic pool).
            // The elastic probe does heavy warmup (batch 1-8, 10+ iterations) which
            // consumes ~300-500 MB vs ~50-100 MB for a simple loadNetwork.
            // Lightweight mode: tasks queue for a single shared slot — slower but works.
            if (!ok) {
                size_t freeMem3 = 0, totalMem3 = 0;
                cudaSetDevice(options.deviceIndex);
                cudaMemGetInfo(&freeMem3, &totalMem3);
                logEvent("[EnginePoolManager] RETRY LIGHTWEIGHT: Elastic probe failed, "
                    + std::to_string(freeMem3 >> 20) + " MiB free. "
                    "Retrying with single-slot mode for " + key.modelPath + "...");
                engine = std::make_shared<Engine<T>>(options);
                ok = engine->buildLoadNetwork(modelPath, subVals, divVals, normalize);
            }
            // Step 3: If still failing, wait briefly and retry.
            // Transient failures can occur when:
            //   - TRT engine file is being written by another build (partial file)
            //   - CUDA driver has temporary resource contention during multi-pool startup
            //   - GPU memory fragmentation resolves after previous allocations settle
            // Evidence: FireSmoke/detector.onnx failed at 3740 MiB free, then
            // succeeded 4 seconds later at 3154 MiB free (less VRAM!).
            if (!ok) {
                size_t freeMem4 = 0, totalMem4 = 0;
                cudaSetDevice(options.deviceIndex);
                cudaMemGetInfo(&freeMem4, &totalMem4);
                logEvent("[EnginePoolManager] RETRY DELAYED: All attempts failed with "
                    + std::to_string(freeMem4 >> 20) + " MiB free. "
                    "Waiting 3s before final retry for " + key.modelPath + "...");
                // Release mutex during sleep so other tasks can proceed
                // (they may complete pool creation that resolves our issue)
                lock.unlock();
                std::this_thread::sleep_for(std::chrono::seconds(3));
                lock.lock();
                // Check if another thread created this pool while we slept.
                // (`it` from above may be invalidated; a fresh find is required.)
                auto it2 = m_pools.find(key);
                if (it2 != m_pools.end()) {
                    it2->second.refcount++;
                    it2->second.evictTime = TimePoint{};
                    logEvent("[EnginePoolManager] HIT (after delay): " + key.modelPath
                        + " refs=" + std::to_string(it2->second.refcount));
                    return it2->second.engine;
                }
                // Final retry — try lightweight again after delay
                cudaSetDevice(options.deviceIndex);
                cudaMemGetInfo(&freeMem4, &totalMem4);
                logEvent("[EnginePoolManager] RETRY FINAL: " + std::to_string(freeMem4 >> 20)
                    + " MiB free. Last attempt for " + key.modelPath + "...");
                engine = std::make_shared<Engine<T>>(options);
                ok = engine->buildLoadNetwork(modelPath, subVals, divVals, normalize);
            }
            if (!ok) {
                // All four strategies exhausted -- report and give up.
                size_t freeMem = 0, totalMem = 0;
                cudaMemGetInfo(&freeMem, &totalMem);
                logEvent("[EnginePoolManager] FAILED: Could not load engine for "
                    + key.modelPath + " | GPU[" + std::to_string(options.deviceIndex)
                    + "] VRAM: " + std::to_string(freeMem >> 20) + " MiB free / "
                    + std::to_string(totalMem >> 20) + " MiB total"
                    + " (after 4 attempts: elastic, evict, lightweight, delayed)", true);
                return nullptr;
            }
        }
        // Success: cache the new pool with refcount 1 and ensure the lazy
        // eviction sweeper is running.
        PoolEntry entry;
        entry.engine = engine;
        entry.refcount = 1;
        m_pools.emplace(key, std::move(entry));
        // Start the lazy-eviction sweeper if not already running
        startSweeperIfNeeded();
        logEvent("[EnginePoolManager] CREATED: " + key.modelPath + " refs=1");
        return engine;
    }
// ========================================================================
// release() — decrement refcount for a shared pool.
//
// When refcount reaches 0, the pool is NOT immediately evicted.
// Instead, it is marked for lazy eviction after kEvictGraceSec.
// This handles the LabView edit cycle (destroy → recreate within
// seconds) without rebuilding the engine from scratch.
// ========================================================================
void release(const PoolKey& key) {
std::lock_guard<std::mutex> lock(m_mutex);
auto it = m_pools.find(key);
if (it == m_pools.end()) return;
if (it->second.refcount <= 0) return;
it->second.refcount--;
logEvent("[EnginePoolManager] RELEASE: " + key.modelPath
+ " refs=" + std::to_string(it->second.refcount));
if (it->second.refcount <= 0) {
// Mark for lazy eviction — don't destroy yet
it->second.evictTime = Clock::now() + std::chrono::seconds(kEvictGraceSec);
logEvent("[EnginePoolManager] PENDING EVICT: " + key.modelPath
+ " (will evict in " + std::to_string(kEvictGraceSec) + "s if not re-acquired)");
}
}
/// Clear all cached pools (call during DLL_PROCESS_DETACH).
void clearAll() {
{
std::lock_guard<std::mutex> lock(m_mutex);
logEvent("[EnginePoolManager] CLEAR ALL (" + std::to_string(m_pools.size()) + " pools)");
m_pools.clear();
}
stopSweeper();
}
/// Number of cached pools (for diagnostics).
size_t size() const {
std::lock_guard<std::mutex> lock(m_mutex);
return m_pools.size();
}
private:
    // Private constructor — EnginePoolManager is a function-local-static singleton.
    EnginePoolManager() {
        // CRITICAL: Touch TRTEngineCache singleton to ensure it is constructed
        // BEFORE EnginePoolManager. C++ destroys function-local statics in
        // reverse construction order, so this guarantees TRTEngineCache outlives
        // EnginePoolManager. Without this, during ExitProcess the cache may be
        // destroyed first, and ~Engine calling TRTEngineCache::release() crashes
        // on a destroyed unordered_map (static destruction order fiasco).
        (void)TRTEngineCache::instance();
    }
    // Destructor — chooses one of two shutdown paths based on g_processExiting():
    // a deliberate leak on ExitProcess (OS reclaims everything) vs. orderly
    // cleanup on FreeLibrary. The ordering here is load-bearing; do not reorder.
    ~EnginePoolManager() {
        if (g_processExiting().load(std::memory_order_relaxed)) {
            // ExitProcess path: worker threads are dead, CUDA/TRT state is
            // unreliable. Don't destroy Engine objects (their destructors
            // call cudaFree, thread::join, etc. which deadlock or crash).
            // The OS reclaims all memory, VRAM, and handles at process exit.
            m_sweeperRunning.store(false);
            return;
        }
        // Normal FreeLibrary path: threads are alive, safe to clean up.
        // Explicitly clear pools before implicit member destruction.
        // This destroys Engine<T> objects (which call TRTEngineCache::release())
        // while we still hold m_mutex and can log diagnostics.
        // Swallow any exception: throwing from a destructor would terminate.
        try {
            std::lock_guard<std::mutex> lock(m_mutex);
            m_pools.clear();
        } catch (...) {}
        stopSweeper();
    }
EnginePoolManager(const EnginePoolManager&) = delete;
EnginePoolManager& operator=(const EnginePoolManager&) = delete;
// Grace period before evicting a pool with refcount=0.
// Covers LabView edit/duplicate/create cycles (destroy → recreate).
static constexpr int kEvictGraceSec = 120; // 2 minutes
// Sweeper interval — how often to check for expired pools.
static constexpr int kSweeperIntervalSec = 30;
using Clock = std::chrono::steady_clock;
using TimePoint = std::chrono::time_point<Clock>;
// Log to stdout/stderr only — no Windows Event Viewer.
// Event Viewer logging is handled by logEngineEvent() in engine.h for
// critical engine-level errors. EnginePoolManager messages are
// informational (HIT/MISS/EVICT) and don't need Event Viewer entries.
static void logEvent(const std::string& msg, bool isError = false) {
if (isError)
std::cerr << msg << std::endl;
else
std::cout << msg << std::endl;
}
    // One cached engine pool plus its sharing/eviction bookkeeping.
    struct PoolEntry {
        std::shared_ptr<Engine<T>> engine;  // the shared engine handed out to acquirers
        int refcount = 0;                   // number of live acquirers holding this pool
        TimePoint evictTime {}; // when to evict (zero = not pending)
    };
    // ========================================================================
    // Sweeper thread — periodically checks for pools whose eviction
    // grace period has expired and removes them.
    //
    // NOTE(review): the sweeper lambda captures `this` and the thread is
    // detached. If this object were ever destroyed while the sweeper is
    // mid-sleep, the next sweepExpired() call would touch freed memory.
    // This appears safe only because EnginePoolManager is a process-lifetime
    // singleton — confirm before reusing this pattern elsewhere.
    // ========================================================================
    void startSweeperIfNeeded() {
        // Called under m_mutex
        if (m_sweeperRunning.load()) return;
        m_sweeperRunning.store(true);
        m_sweeperThread = std::thread([this]() {
            while (m_sweeperRunning.load()) {
                std::this_thread::sleep_for(std::chrono::seconds(kSweeperIntervalSec));
                if (!m_sweeperRunning.load()) break;
                sweepExpired();
            }
        });
        // After detach(), m_sweeperThread can never be joined again; shutdown
        // is signalled solely via the m_sweeperRunning flag (see stopSweeper()).
        m_sweeperThread.detach();
    }
    // Request sweeper shutdown. The thread is detached, so we can only flag it;
    // the sweeper observes the flag after at most kSweeperIntervalSec and exits.
    void stopSweeper() {
        m_sweeperRunning.store(false);
    }
// Force-evict ALL pools with refcount=0 (regardless of grace period).
// Called when a new pool creation fails due to low VRAM.
// Returns number of pools evicted.
// MUST be called under m_mutex.
int forceEvictPending() {
int evicted = 0;
for (auto it = m_pools.begin(); it != m_pools.end(); ) {
if (it->second.refcount <= 0) {
logEvent("[EnginePoolManager] FORCE EVICT (VRAM recovery): " + it->first.modelPath);
it = m_pools.erase(it);
evicted++;
} else {
++it;
}
}
return evicted;
}
void sweepExpired() {
std::lock_guard<std::mutex> lock(m_mutex);
auto now = Clock::now();
for (auto it = m_pools.begin(); it != m_pools.end(); ) {
auto& entry = it->second;
// Only evict if refcount is 0 AND evictTime has passed
if (entry.refcount <= 0
&& entry.evictTime != TimePoint{}
&& now >= entry.evictTime)
{
logEvent("[EnginePoolManager] EVICT (expired): " + it->first.modelPath);
it = m_pools.erase(it);
} else {
++it;
}
}
}
std::unordered_map<PoolKey, PoolEntry, PoolKeyHash> m_pools;
mutable std::mutex m_mutex;
std::atomic<bool> m_sweeperRunning{false};
std::thread m_sweeperThread;
};

View File

@@ -0,0 +1,719 @@
#pragma once
#include <cstring>
#include <filesystem>
#include "TRTCompat.h"
// Process-wide mutex serialising CUDA graph capture.
// TRT's enqueueV3 shares internal resources (workspace, memory pools) at the
// CUDA context level, so two Engine instances capturing graphs concurrently on
// the same GPU create cross-stream dependencies that violate capture rules
// ("operation not permitted when stream is capturing"). Only the *capture*
// phase is serialised — cudaGraphLaunch calls remain fully concurrent.
static std::mutex& graphCaptureMutex() {
    static std::mutex captureMutex;
    return captureMutex;
}
/// @brief Pre-compiles CUDA kernels and pre-captures CUDA graphs by running
/// dummy inferences for every batch size this engine supports.
///
/// For dynamic-batch engines every size 1..maxBatchSize is exercised (each
/// unseen size otherwise pays a one-off kernel-compilation penalty at first
/// real inference). For fixed-batch engines only maxBatchSize is exercised.
/// Dummy inputs are mid-grey images at the opt spatial dimensions.
///
/// @param iterations Iteration count used for the optBatchSize (also used to
///                   print a steady-state latency diagnostic); all other batch
///                   sizes run 2 iterations (compile + graph capture).
template <typename T>
void Engine<T>::warmUp(int iterations) {
    if (m_verbose) {
        std::cout << "\n========================================" << std::endl;
        std::cout << "Engine Warmup" << std::endl;
        std::cout << "========================================" << std::endl;
    }
    // Determine batch sizes to warm up
    std::vector<int> batchSizes;
    if (m_options.maxBatchSize > 1) {
        if (m_verbose) {
            std::cout << "Dynamic batch engine detected (max batch: " << m_options.maxBatchSize << ")" << std::endl;
            std::cout << "Warming up common batch sizes to pre-compile kernels..." << std::endl;
        }
        // Warm up ALL batch sizes from 1 to maxBatchSize.
        // Each unseen batch size incurs a 100-300ms kernel compilation penalty
        // on first use. Warming all sizes eliminates that latency at inference
        // time and ensures every CUDA graph is pre-captured.
        for (int batch = 1; batch <= m_options.maxBatchSize; ++batch) {
            batchSizes.push_back(batch);
        }
    }
    else {
        if (m_verbose) std::cout << "Fixed batch engine detected (batch size: " << m_options.maxBatchSize << ")" << std::endl;
        batchSizes.push_back(m_options.maxBatchSize);
    }
    if (m_verbose) {
        std::cout << "Batch sizes to warm up: ";
        for (size_t i = 0; i < batchSizes.size(); ++i) {
            std::cout << batchSizes[i];
            if (i < batchSizes.size() - 1) std::cout << ", ";
        }
        std::cout << std::endl;
    }
    // Warm up each batch size.
    // The first call triggers kernel compilation; the second captures the CUDA
    // graph. Additional iterations only measure steady-state latency for the
    // optBatchSize (printed as a diagnostic).
    for (int batchSize : batchSizes) {
        const int iters = (batchSize == m_options.optBatchSize) ? iterations : 2;
        if (m_verbose) std::cout << "\nWarming up batch=" << batchSize << " (x" << iters << " iterations)..." << std::endl;
        // Create dummy inputs for this batch size
        std::vector<std::vector<cv::cuda::GpuMat>> dummyInputs;
        for (size_t i = 0; i < m_inputDims.size(); ++i) {
            const auto& dims = m_inputDims[i];
            std::vector<cv::cuda::GpuMat> batch;
            // FIXED: Create proper dummy images on GPU
            // For dynamic spatial dims, use opt dimensions for warmup
            int warmH = (dims.d[1] > 0) ? dims.d[1] : m_options.optInputHeight;
            int warmW = (dims.d[2] > 0) ? dims.d[2] : m_options.optInputWidth;
            for (int b = 0; b < batchSize; ++b) {
                // Create on CPU first — mid-grey 0.5f pixels, channel count from dims.d[0]
                cv::Mat cpuImg(warmH, warmW, CV_32FC(dims.d[0]), cv::Scalar(0.5f, 0.5f, 0.5f));
                // Upload to GPU
                cv::cuda::GpuMat gpuImg;
                gpuImg.upload(cpuImg);
                batch.push_back(gpuImg);
            }
            dummyInputs.push_back(batch);
        }
        std::vector<std::vector<std::vector<T>>> dummyOutputs;
        // Time the first iteration (kernel compilation happens here)
        auto start = std::chrono::high_resolution_clock::now();
        bool firstSuccess = runInference(dummyInputs, dummyOutputs);
        auto end = std::chrono::high_resolution_clock::now();
        auto firstTime = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
        if (!firstSuccess) {
            // Warmup is best-effort: a failure for one batch size does not
            // abort warmup for the remaining sizes.
            if (m_verbose) std::cout << "  ✗ ERROR: First iteration failed for batch=" << batchSize << std::endl;
            continue;
        }
        if (m_verbose) {
            std::cout << "  First iteration: " << firstTime << " ms";
            // >100ms heuristic: treat a slow first call as evidence of JIT kernel compilation
            if (firstTime > 100) {
                std::cout << " (kernel compilation detected)";
            }
            std::cout << std::endl;
        }
        // Run remaining iterations to measure stable performance
        if (iters > 1) {
            auto iterStart = std::chrono::high_resolution_clock::now();
            for (int i = 1; i < iters; ++i) {
                bool success = runInference(dummyInputs, dummyOutputs);
                if (!success) {
                    if (m_verbose) std::cout << "  ✗ ERROR: Iteration " << i << " failed" << std::endl;
                    break;
                }
            }
            auto iterEnd = std::chrono::high_resolution_clock::now();
            auto totalTime = std::chrono::duration_cast<std::chrono::milliseconds>(iterEnd - iterStart).count();
            float avgTime = totalTime / static_cast<float>(iters - 1);
            if (m_verbose) {
                std::cout << "  Subsequent iterations (avg): " << std::fixed << std::setprecision(1)
                          << avgTime << " ms" << std::endl;
                if (firstTime > 100 && avgTime < firstTime * 0.5f) {
                    float speedup = firstTime / avgTime;
                    std::cout << "  ✓ Speedup after warmup: " << std::fixed << std::setprecision(1)
                              << speedup << "x faster" << std::endl;
                }
            }
        }
        if (m_verbose) std::cout << "  ✓ Batch=" << batchSize << " warmed up successfully" << std::endl;
    }
    if (m_verbose) {
        std::cout << "\n========================================" << std::endl;
        std::cout << "Warmup Complete!" << std::endl;
        std::cout << "========================================" << std::endl;
        std::cout << "Kernels pre-compiled for all batch sizes." << std::endl;
        std::cout << "========================================\n" << std::endl;
    }
}
/// @brief Runs one inference pass: preprocesses inputs on the GPU, executes
/// the TRT engine (via a cached CUDA graph when possible), and copies outputs
/// back into featureVectors[batch][output][element].
///
/// @param inputs One vector of GpuMats per input tensor; inputs[i].size() is
///               the batch size and must be identical across all inputs.
/// @param featureVectors Cleared and resized to [batchSize][numOutputs][len].
/// @return true on success; false on any validation, CUDA, or TRT failure.
///
/// FIXED: the CUDA-graph fast path previously ignored the return codes of
/// cudaGraphLaunch and cudaStreamSynchronize, so a failed launch could return
/// stale pinned-buffer contents as a "successful" result. Both calls are now
/// checked and reported in the same style as the direct path.
template <typename T>
bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& inputs,std::vector<std::vector<std::vector<T>>>& featureVectors) {
    // ============================================================================
    // MULTI-GPU POOL DISPATCH
    // ============================================================================
    // If this Engine was initialised with initializePool() / initializePoolFromEngine()
    // the m_slots vector is non-empty. In that case, delegate to the pool
    // dispatcher which acquires the first idle slot and runs inference there.
    // This branch is NEVER taken for single-GPU use (buildLoadNetwork / loadNetwork).
    if (!m_slots.empty()) {
        return runInferenceFromPool(inputs, featureVectors);
    }
    // ============================================================================
    // SINGLE-ENGINE SERIALISATION
    // ============================================================================
    // The single Engine instance has shared mutable state (m_buffers, m_lastBatchSize,
    // m_inferenceStream, TRT execution context). If two LabVIEW threads call
    // runInference concurrently with different batch sizes, one will overwrite
    // the input shapes and buffers while the other is mid-inference, causing a
    // fatal "illegal memory access" that permanently corrupts the CUDA context.
    //
    // Pool-mode slots have their own busy-flag dispatch so they do NOT need this.
    std::lock_guard<std::mutex> inferenceLock(m_inferenceMutex);
    // ============================================================================
    // THREAD-SAFE GPU CONTEXT
    // ============================================================================
    // Ensure the calling thread's CUDA device matches this engine's GPU.
    // This is essential for multi-GPU round-robin: LabVIEW reuses threads
    // across tasks, so a thread that last ran inference on GPU 1 might now
    // be running a task on GPU 0. Without this, cv::cuda::GpuMat allocations
    // and kernel launches would target the wrong GPU, causing result corruption.
    // Skip cudaSetDevice if already on the correct device — under WDDM
    // with multiple GPUs each call costs 1-5ms of scheduler overhead.
    {
        int currentDev = -1;
        cudaGetDevice(&currentDev);
        if (currentDev != m_options.deviceIndex) {
            cudaSetDevice(m_options.deviceIndex);
        }
    }
    // ============================================================================
    // DEBUG: First call diagnostics (per-instance, not process-wide)
    // ============================================================================
    if (m_verbose && m_firstInferenceCall) {
        std::cout << "\n=== First runInference Call ===" << std::endl;
        std::cout << "Number of input tensors: " << inputs.size() << std::endl;
        for (size_t i = 0; i < inputs.size(); ++i) {
            std::cout << "Input " << i << " batch size: " << inputs[i].size() << std::endl;
            if (!inputs[i].empty()) {
                const auto& img = inputs[i][0];
                std::cout << "  Image shape: " << img.cols << "x" << img.rows
                          << "x" << img.channels() << " (type: " << img.type() << ")" << std::endl;
            }
        }
        // Print optimization profile information
        std::cout << "\n=== Engine Profile Information ===" << std::endl;
        std::cout << "Number of optimization profiles: "
                  << m_engine->getNbOptimizationProfiles() << std::endl;
        if (m_engine->getNbOptimizationProfiles() > 0) {
            for (int profile = 0; profile < m_engine->getNbOptimizationProfiles(); ++profile) {
                std::cout << "\n--- Profile " << profile << " ---" << std::endl;
                for (size_t i = 0; i < m_IOTensorNames.size(); ++i) {
                    const char* tensorName = m_IOTensorNames[i].c_str();
                    // Check if this is an input tensor
                    auto ioMode = m_engine->getTensorIOMode(tensorName);
                    if (ioMode != nvinfer1::TensorIOMode::kINPUT) {
                        continue;
                    }
                    auto minDims = m_engine->getProfileShape(tensorName, profile,
                                                            nvinfer1::OptProfileSelector::kMIN);
                    auto optDims = m_engine->getProfileShape(tensorName, profile,
                                                            nvinfer1::OptProfileSelector::kOPT);
                    auto maxDims = m_engine->getProfileShape(tensorName, profile,
                                                            nvinfer1::OptProfileSelector::kMAX);
                    std::cout << "Tensor '" << tensorName << "' (INPUT):" << std::endl;
                    std::cout << "  Min: [" << minDims.d[0];
                    for (int j = 1; j < minDims.nbDims; ++j) std::cout << "," << minDims.d[j];
                    std::cout << "]" << std::endl;
                    std::cout << "  Opt: [" << optDims.d[0];
                    for (int j = 1; j < optDims.nbDims; ++j) std::cout << "," << optDims.d[j];
                    std::cout << "]" << std::endl;
                    std::cout << "  Max: [" << maxDims.d[0];
                    for (int j = 1; j < maxDims.nbDims; ++j) std::cout << "," << maxDims.d[j];
                    std::cout << "]" << std::endl;
                }
            }
        }
        if (!m_context->allInputDimensionsSpecified()) {
            std::cout << "ERROR: Input dimensions not specified in context!" << std::endl;
            return false;
        }
        std::cout << "\nContext state: All dimensions specified ✓" << std::endl;
        m_firstInferenceCall = false;
    }
    // ============================================================================
    // INPUT VALIDATION
    // ============================================================================
    if (inputs.empty() || inputs[0].empty()) {
        std::cout << "Error: Empty input" << std::endl;
        return false;
    }
    const auto numInputs = m_inputDims.size();
    if (inputs.size() != numInputs) {
        std::cout << "Error: Wrong number of inputs. Expected: " << numInputs
                  << ", Got: " << inputs.size() << std::endl;
        return false;
    }
    const auto batchSize = static_cast<int32_t>(inputs[0].size());
    if (batchSize > m_options.maxBatchSize) {
        std::cout << "Error: Batch size " << batchSize << " exceeds maximum "
                  << m_options.maxBatchSize << std::endl;
        return false;
    }
    if (batchSize < 1) {
        std::cout << "Error: Batch size must be at least 1" << std::endl;
        return false;
    }
    // Validate batch size consistency across all inputs
    for (size_t i = 1; i < inputs.size(); ++i) {
        if (inputs[i].size() != static_cast<size_t>(batchSize)) {
            std::cout << "Error: Inconsistent batch sizes across inputs. Input 0: "
                      << batchSize << ", Input " << i << ": " << inputs[i].size() << std::endl;
            return false;
        }
    }
    // ============================================================================
    // STREAM GUARD
    // ============================================================================
    // m_inferenceStream is now created eagerly in loadNetwork() so it is always
    // valid here. Guard against the (unlikely) edge case where runInference is
    // called before loadNetwork succeeds.
    if (!m_streamInitialized || !m_inferenceStream) {
        std::string errMsg = "Error: Inference stream not initialised. "
            "Call loadNetwork() / buildLoadNetwork() before runInference().";
        std::cout << errMsg << std::endl;
        logEngineEvent("[Engine] runInference: " + errMsg, true);
        return false;
    }
    // ============================================================================
    // SET INPUT SHAPES (batch size changed OR dynamic spatial dims need updating)
    // ============================================================================
    // Fast path: compute desired dims first, then compare against cached dims.
    // This avoids all TRT API calls when the shape hasn't actually changed —
    // critical for the recognizer which is called ~50-100x per image with
    // dynamic width but often the same or similar widths.
    // ============================================================================
    {
        // Lazily initialise the dims cache on first call
        if (m_lastSetInputDims.empty()) {
            m_lastSetInputDims.resize(numInputs);
            for (size_t i = 0; i < numInputs; ++i) {
                m_lastSetInputDims[i].nbDims = 0; // force mismatch on first call
            }
        }
        // Build desired dims for every input tensor (cheap — no TRT API calls)
        bool anyDimChanged = (m_lastBatchSize != batchSize);
        std::vector<nvinfer1::Dims> desiredDims(numInputs);
        for (size_t i = 0; i < numInputs; ++i) {
            nvinfer1::Dims& nd = desiredDims[i];
            nd.nbDims = 4;
            nd.d[0] = batchSize;
            nd.d[1] = m_inputDims[i].d[0]; // channels
            if (m_hasDynamicSpatialDims && !inputs[i].empty()) {
                // Dynamic H/W (-1 in the engine dims) are taken from the first image
                const auto& firstImg = inputs[i][0];
                nd.d[2] = (m_inputDims[i].d[1] == -1) ? firstImg.rows : m_inputDims[i].d[1];
                nd.d[3] = (m_inputDims[i].d[2] == -1) ? firstImg.cols : m_inputDims[i].d[2];
            } else {
                nd.d[2] = m_inputDims[i].d[1];
                nd.d[3] = m_inputDims[i].d[2];
            }
            // Compare with cached
            if (!anyDimChanged) {
                const auto& cached = m_lastSetInputDims[i];
                if (cached.nbDims != nd.nbDims ||
                    cached.d[0] != nd.d[0] || cached.d[1] != nd.d[1] ||
                    cached.d[2] != nd.d[2] || cached.d[3] != nd.d[3]) {
                    anyDimChanged = true;
                }
            }
        }
        if (anyDimChanged) {
            // === First-time diagnostics (verbose, once) ===
            const bool firstTime = !m_batchShapeChangeLogged;
            if (m_verbose && firstTime) {
                std::cout << "\nInfo: Batch size change: " << m_lastBatchSize
                          << " -> " << batchSize << std::endl;
            }
            // Set optimization profile (only when truly needed)
            if (m_engine->getNbOptimizationProfiles() > 0) {
                int currentProfile = m_context->getOptimizationProfile();
                if (currentProfile != 0 || m_lastBatchSize < 0) {
                    if (m_verbose && firstTime) {
                        std::cout << "  Setting optimization profile to 0..." << std::endl;
                    }
                    if (!m_context->setOptimizationProfileAsync(0, m_inferenceStream)) {
                        std::cout << "Error: Failed to set optimization profile 0" << std::endl;
                        return false;
                    }
                    cudaError_t syncErr = cudaStreamSynchronize(m_inferenceStream);
                    if (syncErr != cudaSuccess) {
                        std::cout << "Error: Failed to sync after profile change: "
                                  << cudaGetErrorString(syncErr) << std::endl;
                        return false;
                    }
                    if (m_verbose && firstTime) {
                        std::cout << "  Optimization profile set successfully" << std::endl;
                    }
                }
            }
            // Update shapes for input tensors that actually changed
            for (size_t i = 0; i < numInputs; ++i) {
                const char* tensorName = m_IOTensorNames[i].c_str();
                // Skip non-input tensors
                auto ioMode = m_engine->getTensorIOMode(tensorName);
                if (ioMode != nvinfer1::TensorIOMode::kINPUT) continue;
                const nvinfer1::Dims& newDims = desiredDims[i];
                const nvinfer1::Dims& cached = m_lastSetInputDims[i];
                // Skip this tensor if its dims haven't changed
                if (cached.nbDims == newDims.nbDims &&
                    cached.d[0] == newDims.d[0] && cached.d[1] == newDims.d[1] &&
                    cached.d[2] == newDims.d[2] && cached.d[3] == newDims.d[3]) {
                    continue;
                }
                // First-time verbose diagnostics
                if (m_verbose && firstTime) {
                    std::cout << "\n  Processing tensor " << i << ": '" << tensorName << "'" << std::endl;
                    // Validate batch size range (first time only)
                    if (m_engine->getNbOptimizationProfiles() > 0) {
                        int profileIndex = m_context->getOptimizationProfile();
                        nvinfer1::Dims minDims = m_engine->getProfileShape(
                            tensorName, profileIndex, nvinfer1::OptProfileSelector::kMIN);
                        nvinfer1::Dims maxDims = m_engine->getProfileShape(
                            tensorName, profileIndex, nvinfer1::OptProfileSelector::kMAX);
                        std::cout << "  Profile batch range: [" << minDims.d[0]
                                  << " to " << maxDims.d[0] << "]" << std::endl;
                        if (batchSize < minDims.d[0] || batchSize > maxDims.d[0]) {
                            std::cout << "Error: Batch size " << batchSize
                                      << " outside profile range" << std::endl;
                            return false;
                        }
                    }
                    auto currentShape = m_context->getTensorShape(tensorName);
                    std::cout << "  Current context shape: [";
                    for (int j = 0; j < currentShape.nbDims; ++j) {
                        if (j > 0) std::cout << ", ";
                        std::cout << currentShape.d[j];
                    }
                    std::cout << "]" << std::endl;
                    std::cout << "  Setting new shape: [" << newDims.d[0] << ", "
                              << newDims.d[1] << ", " << newDims.d[2] << ", "
                              << newDims.d[3] << "]" << std::endl;
                }
                if (!m_context->setInputShape(tensorName, newDims)) {
                    std::cout << "Error: Failed to set input shape for '" << tensorName << "'" << std::endl;
                    return false;
                }
                // Verify shape (first time only — trust the API on hot path)
                if (firstTime) {
                    auto verifyShape = m_context->getTensorShape(tensorName);
                    if (verifyShape.d[0] != batchSize) {
                        std::cout << "Error: Shape change didn't take effect. Expected batch "
                                  << batchSize << ", got " << verifyShape.d[0] << std::endl;
                        return false;
                    }
                    if (m_verbose) {
                        std::cout << "  Shape updated successfully" << std::endl;
                    }
                }
                m_lastSetInputDims[i] = newDims;
            }
            // Verify all input dimensions specified (first time only)
            if (firstTime) {
                if (!m_context->allInputDimensionsSpecified()) {
                    std::cout << "Error: Not all input dimensions specified after shape change" << std::endl;
                    for (size_t i = 0; i < m_IOTensorNames.size(); ++i) {
                        auto shape = m_context->getTensorShape(m_IOTensorNames[i].c_str());
                        std::cout << "  " << m_IOTensorNames[i] << ": [";
                        for (int j = 0; j < shape.nbDims; ++j) {
                            if (j > 0) std::cout << ", ";
                            std::cout << shape.d[j];
                        }
                        std::cout << "]" << std::endl;
                    }
                    return false;
                }
            }
            m_lastBatchSize = batchSize;
            m_batchShapeChangeLogged = true;
            if (m_verbose && firstTime) {
                std::cout << "\nInfo: Input shapes updated successfully for batch size "
                          << batchSize << " ✓\n" << std::endl;
            }
        }
    }
    // ============================================================================
    // PREPROCESS AND COPY INPUTS TO GPU BUFFERS
    // ============================================================================
    // Pass 1: Validate all input dimensions before any GPU work.
    // Dynamic dims (-1) are skipped in validation (they accept any size).
    for (size_t i = 0; i < numInputs; ++i) {
        const auto& batchInput = inputs[i];
        const auto& dims = m_inputDims[i];
        if (!batchInput.empty()) {
            const auto& firstImg = batchInput[0];
            bool mismatch = false;
            if (dims.d[0] > 0 && firstImg.channels() != dims.d[0]) mismatch = true;
            if (dims.d[1] > 0 && firstImg.rows != dims.d[1]) mismatch = true;
            if (dims.d[2] > 0 && firstImg.cols != dims.d[2]) mismatch = true;
            if (mismatch) {
                std::cout << "Error: Input " << i << " dimension mismatch!" << std::endl;
                std::cout << "  Expected: " << dims.d[2] << "x" << dims.d[1]
                          << "x" << dims.d[0] << " (WxHxC, -1=dynamic)" << std::endl;
                std::cout << "  Got: " << firstImg.cols << "x" << firstImg.rows
                          << "x" << firstImg.channels() << " (WxHxC)" << std::endl;
                return false;
            }
        }
    }
    // Pass 2: Preprocess + D2D copies — all on m_inferenceStream (no null stream).
    //
    // All OpenCV CUDA ops (convertTo, subtract, divide, split) in blobFromGpuMats
    // now run on m_inferenceStream via the cv::cuda::Stream wrapper. This means:
    //   • No null-stream interaction — eliminates global sync barriers on WDDM
    //   • No event bridge needed — same-stream ordering guarantees correctness
    //   • CUDA graphs are safe — cv::cuda::split runs BEFORE graph capture
    //
    // GpuMat-lifetime: preprocessedBuffers keeps GpuMats alive past the final
    // cudaStreamSynchronize, so cudaFree() doesn't stall the pipeline.
    cv::cuda::Stream cvInferStream = cv::cuda::StreamAccessor::wrapStream(m_inferenceStream);
    std::vector<cv::cuda::GpuMat> preprocessedBuffers;
    preprocessedBuffers.reserve(numInputs);
    for (size_t i = 0; i < numInputs; ++i) {
        const auto& batchInput = inputs[i];
        // Preprocess on m_inferenceStream (not the null stream).
        preprocessedBuffers.push_back(
            blobFromGpuMats(batchInput, m_subVals, m_divVals, m_normalize, false, cvInferStream));
        // D2D copy: same stream as preprocessing, so ordering is guaranteed.
        const auto& blobMat = preprocessedBuffers.back();
        const size_t copySize = static_cast<size_t>(blobMat.rows) * static_cast<size_t>(blobMat.cols) * blobMat.elemSize();
        cudaError_t copyErr = cudaMemcpyAsync(
            m_buffers[i],
            preprocessedBuffers.back().ptr<void>(),
            copySize,
            cudaMemcpyDeviceToDevice,
            m_inferenceStream);
        if (copyErr != cudaSuccess) {
            std::cout << "Error: Failed to copy input " << i
                      << " to inference buffer: " << cudaGetErrorString(copyErr) << std::endl;
            return false;
        }
    }
    // ============================================================================
    // PRE-ALLOCATE OUTPUT STRUCTURE
    // ============================================================================
    const size_t numOutputs = m_outputLengths.size();
    featureVectors.clear();
    featureVectors.resize(batchSize);
    for (int batch = 0; batch < batchSize; ++batch) {
        featureVectors[batch].resize(numOutputs);
        for (size_t outputIdx = 0; outputIdx < numOutputs; ++outputIdx)
            featureVectors[batch][outputIdx].resize(m_outputLengths[outputIdx]);
    }
    // ============================================================================
    // RUN INFERENCE + COPY OUTPUTS (CUDA Graph path or direct path)
    // ============================================================================
    // CUDA Graph path
    // ---------------
    // On the first call for a given batchSize we capture enqueueV3 + D2H copies
    // into a reusable graph. Subsequent calls use cudaGraphLaunch, replacing
    // many individual kernel-submission API calls with a single launch.
    //
    // Prerequisites satisfied here:
    //   • Preprocessing + D2D copies are queued on m_inferenceStream (same-stream
    //     ordering guarantees they complete before captured kernels execute)
    //   • m_pinnedOutputBuffers has stable addresses (allocated in loadNetwork)
    //   • m_buffers (GPU outputs) have stable addresses (allocated in loadNetwork)
    //
    // Falls back to the direct path if pinned buffers are unavailable or if
    // graph capture/instantiation fails for any reason.
    // CUDA graphs capture fixed kernel sequences; incompatible with dynamic spatial dims
    // (input H/W change per inference call → different TRT kernel plans each time).
    // Disabled for pool slots — concurrent graph captures on the same GPU corrupt the
    // CUDA context ("operation not permitted when stream is capturing").
    const bool canGraph = !m_disableGraphs && !m_pinnedOutputBuffers.empty() && !m_hasDynamicSpatialDims;
    bool graphUsed = false;
    if (canGraph) {
        auto& graphExec = m_graphExecs[batchSize]; // inserts nullptr on first access
        if (!graphExec) {
            // First call for this batchSize -- capture a new graph.
            // Serialise captures across all Engine instances on this device to
            // prevent TRT's shared workspace from creating cross-stream
            // dependencies that violate CUDA graph capture rules.
            std::lock_guard<std::mutex> captureLock(graphCaptureMutex());
            // Clear any sticky CUDA error from a prior failed capture so that
            // this attempt starts clean.
            cudaGetLastError();
            cudaGraph_t graph = nullptr;
            bool captureOk = false;
            if (cudaStreamBeginCapture(m_inferenceStream,
                                       cudaStreamCaptureModeRelaxed) == cudaSuccess) {
                // Record TRT kernels into the graph (not executed yet).
                TRT_ENQUEUE(m_context.get(), m_inferenceStream, m_buffers);
                // Record D2H copies to stable pinned addresses.
                for (size_t outputIdx = 0; outputIdx < numOutputs; ++outputIdx) {
                    cudaMemcpyAsync(
                        m_pinnedOutputBuffers[outputIdx],
                        static_cast<char*>(m_buffers[numInputs + outputIdx]),
                        static_cast<size_t>(batchSize) * m_outputLengths[outputIdx] * sizeof(T),
                        cudaMemcpyDeviceToHost,
                        m_inferenceStream);
                }
                captureOk = (cudaStreamEndCapture(m_inferenceStream, &graph) == cudaSuccess
                             && graph != nullptr);
            }
            if (captureOk) {
                cudaGraphExec_t exec = nullptr;
                if (cudaGraphInstantiate(&exec, graph, nullptr, nullptr, 0) == cudaSuccess)
                    graphExec = exec;
                cudaGraphDestroy(graph);
            }
            if (!graphExec) {
                std::cout << "Warning: CUDA graph capture failed for batchSize="
                          << batchSize << " -- falling back to direct inference path." << std::endl;
                // Disable graph acceleration for this Engine instance.
                for (T* p : m_pinnedOutputBuffers) { if (p) cudaFreeHost(p); }
                m_pinnedOutputBuffers.clear();
                m_graphExecs.erase(batchSize);
            }
        }
        if (graphExec) {
            // Launch the pre-captured graph (single API call replaces many).
            // FIXED: check both the launch and the synchronize — previously
            // these were unchecked, so a failed launch would silently return
            // stale pinned-buffer contents as valid output.
            cudaError_t graphErr = cudaGraphLaunch(graphExec, m_inferenceStream);
            if (graphErr == cudaSuccess) {
                graphErr = cudaStreamSynchronize(m_inferenceStream);
            }
            if (graphErr != cudaSuccess) {
                std::string errMsg = "[Engine] runInference FAIL: CUDA graph launch/sync: "
                    + std::string(cudaGetErrorString(graphErr));
                std::cout << errMsg << std::endl;
                logEngineEvent(errMsg, true);
                return false;
            }
            // CPU memcpy: pinned buffers -> featureVectors (interleaved by batch).
            for (int batch = 0; batch < batchSize; ++batch) {
                for (size_t outputIdx = 0; outputIdx < numOutputs; ++outputIdx) {
                    std::memcpy(
                        featureVectors[batch][outputIdx].data(),
                        m_pinnedOutputBuffers[outputIdx]
                            + static_cast<size_t>(batch) * m_outputLengths[outputIdx],
                        m_outputLengths[outputIdx] * sizeof(T));
                }
            }
            graphUsed = true;
        }
    }
    // Direct path (no graph)
    // ----------------------
    // Used when pinned buffers are unavailable or graph capture failed.
    if (!graphUsed) {
        bool success = TRT_ENQUEUE(m_context.get(), m_inferenceStream, m_buffers);
        if (!success) {
            std::string debugInfo = "[Engine] runInference FAIL: enqueue returned false, batch="
                + std::to_string(batchSize)
                + ", dimsSpecified=" + (m_context->allInputDimensionsSpecified() ? "YES" : "NO");
            for (size_t i = 0; i < m_IOTensorNames.size(); ++i) {
                auto shape = m_context->getTensorShape(m_IOTensorNames[i].c_str());
                debugInfo += ", tensor'" + m_IOTensorNames[i] + "'=[";
                for (int j = 0; j < shape.nbDims; ++j) {
                    if (j > 0) debugInfo += ",";
                    debugInfo += std::to_string(shape.d[j]);
                }
                debugInfo += "]";
            }
            std::cout << debugInfo << std::endl;
            logEngineEvent(debugInfo, true);
            return false;
        }
        for (int batch = 0; batch < batchSize; ++batch) {
            for (size_t outputIdx = 0; outputIdx < numOutputs; ++outputIdx) {
                const size_t outputBinding = numInputs + outputIdx;
                const size_t offset =
                    static_cast<size_t>(batch) * m_outputLengths[outputIdx] * sizeof(T);
                cudaError_t copyErr = cudaMemcpyAsync(
                    featureVectors[batch][outputIdx].data(),
                    static_cast<char*>(m_buffers[outputBinding]) + offset,
                    m_outputLengths[outputIdx] * sizeof(T),
                    cudaMemcpyDeviceToHost,
                    m_inferenceStream);
                if (copyErr != cudaSuccess) {
                    std::string errMsg = "[Engine] runInference FAIL: cudaMemcpyAsync output "
                        + std::to_string(outputIdx) + " batch " + std::to_string(batch)
                        + ": " + cudaGetErrorString(copyErr);
                    std::cout << errMsg << std::endl;
                    logEngineEvent(errMsg, true);
                    return false;
                }
            }
        }
        cudaError_t syncErr = cudaStreamSynchronize(m_inferenceStream);
        if (syncErr != cudaSuccess) {
            std::string errMsg = "[Engine] runInference FAIL: cudaStreamSynchronize: "
                + std::string(cudaGetErrorString(syncErr));
            std::cout << errMsg << std::endl;
            logEngineEvent(errMsg, true);
            return false;
        }
    }
    return true;
}

View File

@@ -0,0 +1,250 @@
#pragma once
#include <filesystem>
#include <NvInfer.h> // NV_TENSORRT_MAJOR/MINOR/PATCH
#include <NvInferVersion.h> // also defines TRT version macros
#include <cudnn_version.h> // CUDNN_MAJOR/MINOR/PATCHLEVEL
#include <cuda_runtime.h> // cudaRuntimeGetVersion
template <typename T>
void Engine<T>::transformOutput(std::vector<std::vector<std::vector<T>>> &input, std::vector<std::vector<T>> &output) {
    // Collapse a single-batch feature vector [1][numOutputs][len] into [numOutputs][len].
    // Anything other than exactly one batch entry is reported and left untouched.
    if (input.size() != 1) {
        std::cout << "The feature vector has incorrect dimensions!";
        return;
    }
    output = std::move(input[0]);
}
template <typename T>
void Engine<T>::transformOutput(std::vector<std::vector<std::vector<T>>> &input, std::vector<T> &output) {
    // Collapse a [1][1][len] feature vector into a flat [len] vector.
    if (input.size() != 1 || input[0].size() != 1) {
        // BUG FIX: the original warned here but then fell through and executed
        // input[0][0] anyway — undefined behaviour when `input` (or input[0])
        // is empty. Bail out and leave `output` unchanged instead.
        std::cout << "The feature vector has incorrect dimensions!";
        return;
    }
    output = std::move(input[0][0]);
}
template <typename T>
cv::cuda::GpuMat Engine<T>::resizeKeepAspectRatioPadRightBottom(const cv::cuda::GpuMat& input,
                                                                size_t height, size_t width,
                                                                const cv::Scalar& bgcolor) {
    // Empty input maps to an empty output.
    if (input.empty()) {
        return cv::cuda::GpuMat();
    }
    cv::cuda::Stream stream;
    // Largest scale that fits the image inside (width x height) without distortion.
    float scale = std::min(static_cast<float>(width) / input.cols,
                           static_cast<float>(height) / input.rows);
    size_t scaledW = static_cast<size_t>(scale * input.cols);
    size_t scaledH = static_cast<size_t>(scale * input.rows);
    // Resize onto a pre-allocated buffer on the async stream.
    cv::cuda::GpuMat resized;
    resized.create(scaledH, scaledW, input.type());
    cv::cuda::resize(input, resized, resized.size(), 0, 0, cv::INTER_LINEAR, stream);
    // Paint the padded canvas with the background colour, then drop the resized
    // image into its top-left corner (padding therefore sits right/bottom).
    cv::cuda::GpuMat padded;
    padded.create(height, width, input.type());
    padded.setTo(bgcolor, stream);
    resized.copyTo(padded(cv::Rect(0, 0, resized.cols, resized.rows)), stream);
    stream.waitForCompletion();
    return padded;
}
template <typename T> void Engine<T>::getDeviceNames(std::vector<std::string> &deviceNames) {
    // Append the name of every CUDA device visible to this process.
    int deviceCount = 0;
    cudaGetDeviceCount(&deviceCount);
    for (int i = 0; i < deviceCount; ++i) {
        cudaDeviceProp props;
        cudaGetDeviceProperties(&props, i);
        deviceNames.emplace_back(props.name);
    }
}
template <typename T> int Engine<T>::getBindingIndexByName(const std::string& name) {
    // Linear scan over the engine's I/O tensors; returns -1 when none matches.
    const int tensorCount = m_engine->getNbIOTensors();
    for (int idx = 0; idx < tensorCount; ++idx) {
        if (name == m_engine->getIOTensorName(idx)) {
            return idx;
        }
    }
    return -1;
}
//template <typename T> std::string Engine<T>::serializeEngineOptions(const ANSCENTER::Options &options, const std::string &onnxModelPath) {
// const auto filenamePos = onnxModelPath.find_last_of('/') + 1;
// std::string engineName = onnxModelPath.substr(filenamePos, onnxModelPath.find_last_of('.') - filenamePos) + ".engine";
//
// // Add the GPU device name to the file to ensure that the model is only used
// // on devices with the exact same GPU
// std::vector<std::string> deviceNames;
// getDeviceNames(deviceNames);
//
// if (static_cast<size_t>(options.deviceIndex) >= deviceNames.size()) {
// auto msg = "Error, provided device index is out of range!";
// std::cout<<msg;
// return "";
// }
//
// auto deviceName = deviceNames[options.deviceIndex];
// // Remove spaces from the device name
// deviceName.erase(std::remove_if(deviceName.begin(), deviceName.end(), ::isspace), deviceName.end());
// engineName += "." + deviceName;
// // Serialize the specified options into the filename
// if (options.precision == ANSCENTER::Precision::FP16) {
// engineName += ".fp16";
// } else if (options.precision == ANSCENTER::Precision::FP32) {
// engineName += ".fp32";
// } else {
// engineName += ".int8";
// }
// if (options.maxBatchSize > 1) {
// engineName += "." + std::to_string(options.maxBatchSize);
// }
// return engineName;
//}
template <typename T>
std::string Engine<T>::serializeEngineOptions(const ANSCENTER::Options& options,
    const std::string& onnxModelPath)
{
    // -- Base name from ONNX file ---------------------------------------------
    // BUG FIX: the previous find_last_of('/') / find_last_of('.') parsing broke
    // on Windows-style paths ("C:\\models\\net.onnx" -> whole path kept) and on
    // dotted directories ("models/v1.2/net" -> truncated at the wrong dot).
    // std::filesystem::path::stem handles both separators and only strips the
    // final extension of the filename itself. (<filesystem> is already included.)
    std::string engineName =
        std::filesystem::path(onnxModelPath).stem().string() + ".engine";
    // -- GPU device name ------------------------------------------------------
    // Ensures the engine is only loaded on the exact GPU it was built for.
    std::vector<std::string> deviceNames;
    getDeviceNames(deviceNames);
    if (static_cast<size_t>(options.deviceIndex) >= deviceNames.size()) {
        std::cout << "Error, provided device index is out of range!";
        return "";
    }
    auto deviceName = deviceNames[options.deviceIndex];
    // Strip whitespace from the device name. The unsigned char cast avoids UB:
    // passing a negative char to isspace() is undefined.
    deviceName.erase(
        std::remove_if(deviceName.begin(), deviceName.end(),
                       [](unsigned char c) { return ::isspace(c) != 0; }),
        deviceName.end());
    engineName += "." + deviceName;
    // -- Precision ------------------------------------------------------------
    if (options.precision == ANSCENTER::Precision::FP16) {
        engineName += ".fp16";
    }
    else if (options.precision == ANSCENTER::Precision::FP32) {
        engineName += ".fp32";
    }
    else {
        engineName += ".int8";
    }
    // -- Batch size -----------------------------------------------------------
    if (options.maxBatchSize > 1) {
        engineName += ".b" + std::to_string(options.maxBatchSize);
    }
    // -- Max spatial dims: intentionally NOT included in the filename ----------
    // buildWithRetry() may reduce max dims (e.g. 2560→1920) when GPU memory
    // is insufficient. If the filename included .s{H}x{W}, the next launch
    // would look for .s2560x2560, miss the cached .s1920x1920, and waste
    // minutes re-attempting the doomed 2560 build before falling back.
    // Without the suffix, the cache is found immediately on the next launch.
    // The actual profile max is queried at runtime via getProfileMaxHeight/Width.
    // -- TensorRT version -----------------------------------------------------
    // Engine format changes between TensorRT minor versions -- must rebuild.
    // NV_TENSORRT_MAJOR, NV_TENSORRT_MINOR, NV_TENSORRT_PATCH are defined in
    // <NvInferVersion.h> which is included via NvInfer.h.
    engineName += ".trt"
        + std::to_string(NV_TENSORRT_MAJOR) + "."
        + std::to_string(NV_TENSORRT_MINOR) + "."
        + std::to_string(NV_TENSORRT_PATCH);
    // -- CUDA runtime version -------------------------------------------------
    // Engines built with different CUDA versions may use different PTX/cubin
    // formats and must be rebuilt.
    int cudaVersion = 0;
    cudaRuntimeGetVersion(&cudaVersion);
    const int cudaMajor = cudaVersion / 1000;
    const int cudaMinor = (cudaVersion % 1000) / 10;
    engineName += ".cuda"
        + std::to_string(cudaMajor) + "."
        + std::to_string(cudaMinor);
    // -- cuDNN version --------------------------------------------------------
    // cuDNN version affects layer implementations inside the engine.
    // CUDNN_MAJOR, CUDNN_MINOR, CUDNN_PATCHLEVEL are defined in <cudnn_version.h>.
    engineName += ".cudnn"
        + std::to_string(CUDNN_MAJOR) + "."
        + std::to_string(CUDNN_MINOR);
    return engineName;
}
template <typename T>
cv::cuda::GpuMat Engine<T>::blobFromGpuMats(const std::vector<cv::cuda::GpuMat> &batchInput, const std::array<float, 3> &subVals,
                                            const std::array<float, 3> &divVals, bool normalize, bool swapRB,
                                            cv::cuda::Stream &stream) {
    // Converts a batch of 3-channel HWC GpuMats into a single planar NCHW
    // float blob: optional /255 normalisation, then per-channel (x - sub) / div,
    // then channel split (with optional BGR->RGB reorder) directly into the blob.
    // Returns an empty GpuMat on empty input or non-3-channel images.
    // NOTE: work is queued on `stream` and NOT synchronised here — the caller
    // must sync (or keep ordering on the same stream) before consuming the blob.
    cv::cuda::GpuMat result;
    if (batchInput.empty()) return result;
    if (batchInput[0].channels() != 3) return result;
    const int H = batchInput[0].rows;
    const int W = batchInput[0].cols;
    const int batch = static_cast<int>(batchInput.size());
    const size_t planeSize = static_cast<size_t>(H) * W; // pixels per channel
    // Output blob: planar NCHW layout stored as a single-channel GpuMat.
    // Total elements = batch * 3 * H * W. A 1-row GpuMat is guaranteed
    // contiguous, so raw pointer arithmetic below is valid (no row padding).
    cv::cuda::GpuMat blob(1, batch * 3 * static_cast<int>(planeSize), CV_32FC1);
    for (int img = 0; img < batch; ++img) {
        // 1. Convert to float and normalise while still in HWC (interleaved) format.
        //    Channel-wise subtract / divide operate correctly on interleaved data.
        cv::cuda::GpuMat floatImg;
        if (normalize) {
            batchInput[img].convertTo(floatImg, CV_32FC3, 1.f / 255.f, stream);
        } else {
            batchInput[img].convertTo(floatImg, CV_32FC3, 1.0, stream);
        }
        cv::cuda::subtract(floatImg, cv::Scalar(subVals[0], subVals[1], subVals[2]), floatImg, cv::noArray(), -1, stream);
        cv::cuda::divide(floatImg, cv::Scalar(divVals[0], divVals[1], divVals[2]), floatImg, 1, -1, stream);
        // 2. Split normalised HWC image into CHW planes directly into the blob.
        //    cv::cuda::split writes source channel i into channels[i], so the
        //    destination wrappers below control where each channel lands.
        size_t offset = static_cast<size_t>(img) * 3 * planeSize;
        if (swapRB) {
            // BGR input -> RGB planes: B goes to plane 2, G to plane 1, R to plane 0
            std::vector<cv::cuda::GpuMat> channels{
                cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset + 2 * planeSize), // B -> plane 2
                cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset + planeSize),     // G -> plane 1
                cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset)};                // R -> plane 0
            cv::cuda::split(floatImg, channels, stream);
        } else {
            // BGR input -> BGR planes: keep channel order
            std::vector<cv::cuda::GpuMat> channels{
                cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset),
                cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset + planeSize),
                cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset + 2 * planeSize)};
            cv::cuda::split(floatImg, channels, stream);
        }
    }
    return blob;
}
template <typename T> void Engine<T>::clearGpuBuffers() {
    // Free every I/O GPU buffer (inputs AND outputs) allocated by loadNetwork().
    // A previous revision freed only outputs and leaked the input allocations.
    for (void* devicePtr : m_buffers) {
        if (devicePtr != nullptr) {
            Util::checkCudaErrorCode(cudaFree(devicePtr));
        }
    }
    m_buffers.clear();
}

View File

@@ -0,0 +1,9 @@
#pragma once
// ============================================================================
// Forwarding header — NvDynLoader moved to ANSLibsLoader
//
// This file is retained for backward compatibility. All consuming projects
// should update their include paths to reference ANSLibsLoader/include/
// directly. Once all projects are updated, this file can be removed.
// ============================================================================
#include "../../../ANSLibsLoader/include/NvDynLoader.h"

View File

@@ -0,0 +1,50 @@
#pragma once
// ============================================================================
// TRTCompat.h -- TensorRT version compatibility macros
//
// Centralises all TRT-version-dependent API differences so that the rest of
// the codebase can be compiled against TRT 8.x or TRT 10.x without scattering
// #if blocks everywhere.
//
// Build 1: CUDA 11.8 + cuDNN 8 + TensorRT 8.6 + OpenCV 4.10 (SM 35-86)
// Build 2: CUDA 13.1 + cuDNN 9 + TensorRT 10 + OpenCV 4.13 (SM 75-121)
// ============================================================================
#include <NvInferVersion.h>
// ---------------------------------------------------------------------------
// Network creation
// ---------------------------------------------------------------------------
// TRT 10+: kEXPLICIT_BATCH was removed (it is the only mode).
// TRT 8.x: The flag must be passed explicitly.
#if NV_TENSORRT_MAJOR >= 10
// TRT 10+: explicit batch is the only mode, so no creation flag is needed.
#define TRT_CREATE_NETWORK(builder) \
    (builder)->createNetworkV2(0)
#else
#define TRT_CREATE_NETWORK(builder) \
    (builder)->createNetworkV2( \
        1U << static_cast<uint32_t>( \
            nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH))
#endif
// ---------------------------------------------------------------------------
// Inference execution
// ---------------------------------------------------------------------------
// TRT 10+: enqueueV3(stream) — uses tensor addresses pre-bound via
//          setTensorAddress().
// TRT 8.x: enqueueV2(bindings, stream, nullptr) — uses a void** array
//          indexed by binding position.
#if NV_TENSORRT_MAJOR >= 10
// NOTE: the `buffers` argument is intentionally unused in this expansion;
// TRT 10 reads the addresses previously bound with setTensorAddress().
#define TRT_ENQUEUE(context, stream, buffers) \
    (context)->enqueueV3(stream)
#else
#define TRT_ENQUEUE(context, stream, buffers) \
    (context)->enqueueV2( \
        reinterpret_cast<void**>((buffers).data()), (stream), nullptr)
#endif
// ---------------------------------------------------------------------------
// Feature-detection helpers
// ---------------------------------------------------------------------------
// Compile-time booleans for code needing finer dispatch than the macros above.
#define TRT_HAS_ENQUEUE_V3 (NV_TENSORRT_MAJOR >= 10)
#define TRT_HAS_EXPLICIT_BATCH_FLAG (NV_TENSORRT_MAJOR < 10)

View File

@@ -0,0 +1,177 @@
#pragma once
// TRTEngineCache.h — Process-wide cache for shared TensorRT ICudaEngine instances.
//
// When multiple AI tasks load the same model (same .engine file + GPU), this cache
// ensures only ONE copy of the model weights lives in VRAM. Each task creates its
// own IExecutionContext from the shared ICudaEngine (TRT-supported pattern).
//
// Usage in loadNetwork():
// auto& cache = TRTEngineCache::instance();
// auto hit = cache.tryGet(enginePath, gpuIdx);
// if (hit.engine) {
// m_engine = hit.engine; m_runtime = hit.runtime; // cache hit
// } else {
// // ... deserialize as usual ...
// m_engine = cache.putIfAbsent(enginePath, gpuIdx, runtime, engine);
// }
//
// In ~Engine():
// cache.release(enginePath, gpuIdx);
#include <atomic>        // std::atomic — used by g_processExiting() and m_globalBypass (was missing)
#include <iostream>
#include <memory>
#include <mutex>
#include <string>
#include <unordered_map>

#include <NvInfer.h>
/// Process-wide flag: set to true during DLL_PROCESS_DETACH when ExitProcess
/// is in progress (lpReserved != NULL). Worker threads are already dead in
/// this state, so thread::join() would deadlock and CUDA/TRT calls are unsafe.
/// Checked by Engine::~Engine to skip cleanup that requires live threads or GPUs.
inline std::atomic<bool>& g_processExiting() {
    // Meyers singleton: one zero-initialised flag shared by every translation
    // unit that includes this header (inline function => single definition).
    static std::atomic<bool> s_flag{false};
    return s_flag;
}
class TRTEngineCache {
public:
    // What tryGet() hands back on a cache hit. Both pointers are shared with
    // the cache entry; the runtime must outlive the engine it deserialised.
    struct CacheHit {
        std::shared_ptr<nvinfer1::ICudaEngine> engine;
        std::shared_ptr<nvinfer1::IRuntime> runtime;
    };
    // Process-wide singleton (thread-safe since C++11 magic statics).
    static TRTEngineCache& instance() {
        static TRTEngineCache s_instance;
        return s_instance;
    }
    /// Global bypass — when true, tryGet() always returns miss, putIfAbsent()
    /// is a no-op, and buildLoadNetwork/loadNetwork force single-GPU path.
    /// Used by OptimizeModelStr to prevent inner engines (created by
    /// custom DLLs via ANSLIB.dll) from creating pools/caching.
    /// Stored as a member of the singleton to guarantee a single instance
    /// across all translation units (avoids MSVC inline static duplication).
    static std::atomic<bool>& globalBypass() {
        return instance().m_globalBypass;
    }
    std::atomic<bool> m_globalBypass{false};
    /// Try to get a cached engine. Returns {nullptr, nullptr} on miss.
    /// On hit, increments refcount.
    CacheHit tryGet(const std::string& engineFilePath, int gpuIndex) {
        // Bypass check happens before taking the lock — no contention in bypass mode.
        if (globalBypass().load(std::memory_order_relaxed)) return {nullptr, nullptr};
        std::lock_guard<std::mutex> lock(m_mutex);
        auto it = m_cache.find({engineFilePath, gpuIndex});
        if (it != m_cache.end()) {
            it->second.refcount++;
            std::cout << "[TRTEngineCache] HIT: " << engineFilePath
                      << " GPU[" << gpuIndex << "] refs=" << it->second.refcount << std::endl;
            return {it->second.engine, it->second.runtime};
        }
        return {nullptr, nullptr};
    }
    /// Store a newly deserialized engine. If another thread already stored the
    /// same key (race), returns the existing one and the caller's copy is discarded.
    /// Increments refcount for the returned engine.
    std::shared_ptr<nvinfer1::ICudaEngine> putIfAbsent(
            const std::string& engineFilePath, int gpuIndex,
            std::shared_ptr<nvinfer1::IRuntime> runtime,
            std::shared_ptr<nvinfer1::ICudaEngine> engine) {
        if (globalBypass().load(std::memory_order_relaxed)) return engine; // don't cache
        std::lock_guard<std::mutex> lock(m_mutex);
        CacheKey key{engineFilePath, gpuIndex};
        auto it = m_cache.find(key);
        if (it != m_cache.end()) {
            // Another thread beat us — use theirs, discard ours
            // (the caller's `engine` shared_ptr is simply dropped on return).
            it->second.refcount++;
            std::cout << "[TRTEngineCache] RACE: using existing for " << engineFilePath
                      << " GPU[" << gpuIndex << "] refs=" << it->second.refcount << std::endl;
            return it->second.engine;
        }
        // First to store — insert
        CachedEntry entry;
        entry.engine = std::move(engine);
        entry.runtime = std::move(runtime);
        entry.refcount = 1;
        auto inserted = m_cache.emplace(std::move(key), std::move(entry));
        std::cout << "[TRTEngineCache] STORED: " << engineFilePath
                  << " GPU[" << gpuIndex << "] refs=1" << std::endl;
        return inserted.first->second.engine;
    }
    /// Decrement refcount. When refcount reaches 0, the engine is evicted immediately
    /// to release VRAM and file handles (allows ModelOptimizer to rebuild .engine files
    /// while LabVIEW is running).
    void release(const std::string& engineFilePath, int gpuIndex) {
        std::lock_guard<std::mutex> lock(m_mutex);
        auto it = m_cache.find({engineFilePath, gpuIndex});
        // Unknown keys and already-zero refcounts are silently ignored,
        // making release() safe to call from destructors.
        if (it != m_cache.end() && it->second.refcount > 0) {
            it->second.refcount--;
            std::cout << "[TRTEngineCache] RELEASE: " << engineFilePath
                      << " GPU[" << gpuIndex << "] refs=" << it->second.refcount << std::endl;
            if (it->second.refcount <= 0) {
                std::cout << "[TRTEngineCache] EVICT (refcount=0): " << engineFilePath
                          << " GPU[" << gpuIndex << "]" << std::endl;
                m_cache.erase(it);
            }
        }
    }
    /// Remove all entries with refcount == 0 (call at shutdown or when VRAM tight).
    void evictUnused() {
        std::lock_guard<std::mutex> lock(m_mutex);
        for (auto it = m_cache.begin(); it != m_cache.end(); ) {
            if (it->second.refcount <= 0) {
                std::cout << "[TRTEngineCache] EVICT: " << it->first.path
                          << " GPU[" << it->first.gpuIndex << "]" << std::endl;
                it = m_cache.erase(it);
            } else {
                ++it;
            }
        }
    }
    /// Clear all cached engines immediately (call during DLL_PROCESS_DETACH
    /// BEFORE destroying engine handles, to avoid calling into unloaded TRT DLLs).
    void clearAll() {
        std::lock_guard<std::mutex> lock(m_mutex);
        std::cout << "[TRTEngineCache] CLEAR ALL (" << m_cache.size() << " entries)" << std::endl;
        m_cache.clear(); // shared_ptrs released — engines destroyed while TRT is still loaded
    }
    /// Number of cached engines (for diagnostics).
    size_t size() const {
        std::lock_guard<std::mutex> lock(m_mutex);
        return m_cache.size();
    }
private:
    TRTEngineCache() = default;
    TRTEngineCache(const TRTEngineCache&) = delete;
    TRTEngineCache& operator=(const TRTEngineCache&) = delete;
    // Cache entries are keyed by (engine file path, GPU index): the same file
    // loaded on two GPUs is two distinct engines.
    struct CacheKey {
        std::string path;
        int gpuIndex = 0;
        bool operator==(const CacheKey& o) const {
            return path == o.path && gpuIndex == o.gpuIndex;
        }
    };
    // Hash combine: path hash XOR shifted gpuIndex hash. Collisions are
    // harmless (unordered_map falls back to operator==).
    struct CacheKeyHash {
        size_t operator()(const CacheKey& k) const {
            return std::hash<std::string>{}(k.path) ^
                   (std::hash<int>{}(k.gpuIndex) << 16);
        }
    };
    // One entry per (path, gpu). refcount is guarded by m_mutex, not atomic.
    struct CachedEntry {
        std::shared_ptr<nvinfer1::ICudaEngine> engine;
        std::shared_ptr<nvinfer1::IRuntime> runtime;
        int refcount = 0;
    };
    std::unordered_map<CacheKey, CachedEntry, CacheKeyHash> m_cache;
    mutable std::mutex m_mutex; // mutable so size() can stay const
};

View File

@@ -0,0 +1,31 @@
#pragma once
// ============================================================================
// ENGINE_API — DLL export/import control macro
//
// HOW TO USE
// ----------
// Library project (CodeAid):
// Add ENGINE_EXPORTS to Project Properties →
// C/C++ → Preprocessor → Preprocessor Definitions.
// This causes all ENGINE_API-decorated symbols to be __declspec(dllexport).
//
// Consuming projects:
// Do NOT define ENGINE_EXPORTS. ENGINE_API becomes __declspec(dllimport),
// which tells the linker to resolve decorated symbols from the DLL's .lib
// import library. No engine.cpp needs to be added to the consuming project.
//
// WHAT IS DECORATED
// -----------------
// Logger — class ENGINE_API Logger
// GpuDeviceInfo — struct ENGINE_API GpuDeviceInfo
// Int8EntropyCalibrator2 — class ENGINE_API Int8EntropyCalibrator2
// Engine<float> — explicit instantiation at bottom of engine.h
// (and any other Engine<T> types listed there)
// ============================================================================
// ENGINE_EXPORTS is defined only by the library project itself, so decorated
// symbols are dllexport there and dllimport in every consuming project.
#ifdef ENGINE_EXPORTS
#    define ENGINE_API __declspec(dllexport)
#else
#    define ENGINE_API __declspec(dllimport)
#endif

View File

@@ -0,0 +1,20 @@
#pragma once
#include <vector>
#include <array>
#include <opencv2/core/cuda.hpp>
#include "NvInfer.h" // Include for nvinfer1::Dims and nvinfer1::Dims3
/// Abstract interface for a TensorRT inference engine whose outputs have
/// element type T. Allows consumers to hold an engine without depending on
/// the concrete Engine<T> implementation.
template <typename T>
class IEngine {
public:
    virtual ~IEngine() = default;
    /// Builds a TensorRT engine from an ONNX model (serialising it to disk)
    /// and loads it. subVals/divVals/normalize configure input preprocessing:
    /// optional /255, then per-channel (x - sub) / div.
    virtual bool buildLoadNetwork(std::string onnxModelPath, const std::array<float, 3> &subVals = {0.f, 0.f, 0.f},
                                  const std::array<float, 3> &divVals = {1.f, 1.f, 1.f}, bool normalize = true) = 0;
    /// Loads an already-serialised .engine file with the same preprocessing options.
    virtual bool loadNetwork(std::string trtModelPath, const std::array<float, 3> &subVals = {0.f, 0.f, 0.f},
                             const std::array<float, 3> &divVals = {1.f, 1.f, 1.f}, bool normalize = true) = 0;
    /// Runs inference on a batch of GPU images (inputs[input][batch]) and
    /// fills featureVectors[batch][output][element]. Returns false on failure.
    virtual bool runInference(const std::vector<std::vector<cv::cuda::GpuMat>> &inputs,
                              std::vector<std::vector<std::vector<T>>> &featureVectors) = 0;
    /// Model input dimensions (one Dims3 per input tensor).
    virtual const std::vector<nvinfer1::Dims3> &getInputDims() const = 0;
    /// Model output dimensions (one Dims per output tensor).
    virtual const std::vector<nvinfer1::Dims> &getOutputDims() const = 0;
};

View File

@@ -0,0 +1,75 @@
#pragma once
#include <iostream>
#include <string>
#include <spdlog/spdlog.h>
// Library-neutral log severity, ordered from most to least verbose.
// `Unknown` marks an unparseable LOG_LEVEL string (mapped to info downstream).
enum class LogLevel {
    Trace,
    Debug,
    Info,
    Warn,
    Error,
    Critical,
    Off,
    Unknown
};
// Get the log level string from the environment variable
// Get the log level string from the environment variable
// Returns the raw LOG_LEVEL value, or "info" (with a warning) when unset.
// NOTE(review): std::getenv is declared in <cstdlib>, which this header does
// not include directly — currently relied on transitively; confirm include list.
inline std::string getLogLevelFromEnvironment() {
    if (const char* envValue = std::getenv("LOG_LEVEL")) {
        return std::string(envValue);
    }
    spdlog::warn("LOG_LEVEL environment variable not set. Using default log level (info).");
    return "info";
}
// Convert log level string to LogLevel enum
// Convert log level string to LogLevel enum
// Accepts spdlog-style names ("warn"/"warning", "err"/"error"); anything else
// yields LogLevel::Unknown after logging a warning.
inline LogLevel parseLogLevel(const std::string& logLevelStr) {
    if (logLevelStr == "trace")    return LogLevel::Trace;
    if (logLevelStr == "debug")    return LogLevel::Debug;
    if (logLevelStr == "info")     return LogLevel::Info;
    if (logLevelStr == "warn" || logLevelStr == "warning") return LogLevel::Warn;
    if (logLevelStr == "err" || logLevelStr == "error")    return LogLevel::Error;
    if (logLevelStr == "critical") return LogLevel::Critical;
    if (logLevelStr == "off")      return LogLevel::Off;
    spdlog::warn("Unknown log level string: {}. Defaulting to 'info' level.", logLevelStr);
    return LogLevel::Unknown;
}
// Convert LogLevel enum to spdlog::level::level_enum
// Convert LogLevel enum to spdlog::level::level_enum
// Unknown (or any unexpected value) maps to info with a warning.
inline spdlog::level::level_enum toSpdlogLevel(const std::string& logLevelStr) {
    switch (parseLogLevel(logLevelStr)) {
    case LogLevel::Trace:    return spdlog::level::trace;
    case LogLevel::Debug:    return spdlog::level::debug;
    case LogLevel::Info:     return spdlog::level::info;
    case LogLevel::Warn:     return spdlog::level::warn;
    case LogLevel::Error:    return spdlog::level::err;
    case LogLevel::Critical: return spdlog::level::critical;
    case LogLevel::Off:      return spdlog::level::off;
    default:
        spdlog::warn("Unknown log level. Using default log level (info).");
        return spdlog::level::info;
    }
}

View File

@@ -0,0 +1,11 @@
#pragma once
#include <spdlog/spdlog.h>
// CHECK(cond): logs (via spdlog) and aborts the process when `cond` is false.
// Wrapped in do { } while (false) so the macro expands to a single statement
// that REQUIRES the caller's trailing semicolon.
// BUG FIX: the previous definition ended with its own ';' after while(false),
// which defeats the idiom and breaks `if (x) CHECK(y); else ...` call sites
// (the extra empty statement detaches the else).
// NOTE(review): abort() is declared in <cstdlib>, currently pulled in
// transitively via spdlog — confirm include list.
#define CHECK(condition) \
    do { \
        if (!(condition)) { \
            spdlog::error("Assertion failed: ({}), function {}, file {}, line {}.", #condition, __FUNCTION__, __FILE__, __LINE__); \
            abort(); \
        } \
    } while (false)

View File

@@ -0,0 +1,20 @@
#pragma once
#include <chrono>
// Utility Timer
template <typename Clock = std::chrono::high_resolution_clock> class Stopwatch {
typename Clock::time_point start_point;
public:
Stopwatch() : start_point(Clock::now()) {}
// Returns elapsed time
template <typename Rep = typename Clock::duration::rep, typename Units = typename Clock::duration> Rep elapsedTime() const {
std::atomic_thread_fence(std::memory_order_relaxed);
auto counted_time = std::chrono::duration_cast<Units>(Clock::now() - start_point).count();
std::atomic_thread_fence(std::memory_order_relaxed);
return static_cast<Rep>(counted_time);
}
};
using preciseStopwatch = Stopwatch<>;

View File

@@ -0,0 +1,20 @@
#pragma once
#include <fstream>
#include <string>
#include <vector>
#include <cuda_runtime.h>
#include <spdlog/spdlog.h>
namespace Util {
// Checks if a file exists at the given file path.
// Returns true only for paths that can be treated as readable regular files.
bool doesFileExist(const std::string &filepath);
// Checks and logs CUDA error codes. Logs (does not throw) on failure,
// so callers may use it for best-effort diagnostics.
void checkCudaErrorCode(cudaError_t code);
// Retrieves the full paths of the files directly inside the specified
// directory (non-recursive; regular files only).
std::vector<std::string> getFilesInDirectory(const std::string &dirPath);
}
#include "Util.inl"

View File

@@ -0,0 +1,30 @@
#pragma once
#include <filesystem>
namespace Util {
// True when `filepath` names an existing regular file.
// BUG FIX: the previous std::ifstream probe can open a directory on Linux and
// report good(), yielding a false positive; std::filesystem::is_regular_file
// (non-throwing error_code overload) answers the actual question.
inline bool doesFileExist(const std::string &filepath) {
    std::error_code ec;
    return std::filesystem::is_regular_file(filepath, ec);
}
// Logs any non-success CUDA error with its numeric code, name and message.
// Intentionally does not throw (see the commented-out line) — callers treat
// this as best-effort diagnostics rather than a hard failure.
inline void checkCudaErrorCode(cudaError_t code) {
    if (code != cudaSuccess) {
        std::string errMsg = "CUDA operation failed with code: " + std::to_string(code) + " (" + cudaGetErrorName(code) +
                             "), with message: " + cudaGetErrorString(code);
        spdlog::error(errMsg);
        // throw std::runtime_error(errMsg);
    }
}
// Returns the full paths of all regular files directly inside `dirPath`
// (non-recursive).
// BUG FIX: previously threw std::filesystem::filesystem_error when dirPath
// did not exist or was not a directory; now returns an empty list instead.
inline std::vector<std::string> getFilesInDirectory(const std::string &dirPath) {
    std::vector<std::string> fileNames;
    std::error_code ec;
    if (!std::filesystem::is_directory(dirPath, ec)) {
        return fileNames;
    }
    for (const auto &entry : std::filesystem::directory_iterator(dirPath, ec)) {
        if (entry.is_regular_file()) {
            fileNames.push_back(entry.path().string());
        }
    }
    return fileNames;
}
}