Initial setup for CLion
This commit is contained in:
2636
engines/TensorRTAPI/include/engine/EngineBuildLoadNetwork.inl
Normal file
2636
engines/TensorRTAPI/include/engine/EngineBuildLoadNetwork.inl
Normal file
File diff suppressed because it is too large
Load Diff
887
engines/TensorRTAPI/include/engine/EngineMultiGpu.inl
Normal file
887
engines/TensorRTAPI/include/engine/EngineMultiGpu.inl
Normal file
@@ -0,0 +1,887 @@
|
||||
// ============================================================================
// EngineMultiGpu.inl
//
// Multi-GPU inference pool -- merged from MultiGpuEngineManager.h
//
// This file is #included at the bottom of engine.h and must not be compiled
// independently. It provides implementations for all pool-management methods
// declared inside Engine<T>:
//
//   initializePool()           -- build from ONNX, create pool
//   initializePoolFromEngine() -- load pre-built TRT engine, create pool
//   enumerateDevices()         -- static CUDA device enumeration
//   loadSlots()                -- core pool allocation logic (private)
//   runInferenceFromPool()     -- thread-safe slot dispatch (private)
//   getTotalCapacity()         -- inline in engine.h
//   getActiveInferences()      -- inline in engine.h
//   getAvailableSlots()        -- inline in engine.h
//   isAtCapacity()             -- inline in engine.h
//   printCapacityReport()      -- human-readable pool status
// ============================================================================
|
||||
|
||||
// -- Static member definitions for global elastic slot cap --------------------
// Shared across ALL pool instances of a given Engine<T> instantiation:
//  - s_globalElasticCount / s_globalElasticMax throttle elastic slot growth
//    globally (checked in tryGrowPool / runInferenceFromPool).
//  - s_globalCapInitFlag guarantees the cap is computed exactly once.
//  - s_lastPoolCreatedMs timestamps the most recent pool creation; tryGrowPool
//    uses it to defer growth during the elastic grace period.
template <typename T>
std::atomic<int> Engine<T>::s_globalElasticCount{0};
template <typename T>
std::atomic<int> Engine<T>::s_globalElasticMax{32}; // safe default, overwritten on first pool init
template <typename T>
std::once_flag Engine<T>::s_globalCapInitFlag;
template <typename T>
std::atomic<int64_t> Engine<T>::s_lastPoolCreatedMs{0};
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
// enumerateDevices -- static, no model required
|
||||
// ----------------------------------------------------------------------------
|
||||
template <typename T>
|
||||
/*static*/ std::vector<GpuDeviceInfo>
|
||||
Engine<T>::enumerateDevices()
|
||||
{
|
||||
int count = 0;
|
||||
cudaGetDeviceCount(&count);
|
||||
|
||||
std::vector<GpuDeviceInfo> devices;
|
||||
devices.reserve(static_cast<size_t>(count));
|
||||
|
||||
for (int i = 0; i < count; ++i) {
|
||||
cudaDeviceProp prop;
|
||||
cudaGetDeviceProperties(&prop, i);
|
||||
|
||||
cudaSetDevice(i);
|
||||
size_t freeBytes = 0, totalBytes = 0;
|
||||
cudaMemGetInfo(&freeBytes, &totalBytes);
|
||||
|
||||
GpuDeviceInfo info;
|
||||
info.index = i;
|
||||
info.name = prop.name;
|
||||
info.totalMemoryBytes = prop.totalGlobalMem;
|
||||
info.freeMemoryAtInitBytes = freeBytes;
|
||||
info.computeMajor = prop.major;
|
||||
info.computeMinor = prop.minor;
|
||||
info.slotsAllocated = 0;
|
||||
info.memoryPerSlotBytes = 0;
|
||||
devices.push_back(std::move(info));
|
||||
}
|
||||
|
||||
return devices;
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
// Public pool-init wrappers
|
||||
// ----------------------------------------------------------------------------
|
||||
template <typename T>
|
||||
bool Engine<T>::initializePool(
|
||||
const ANSCENTER::Options& baseOptions,
|
||||
const std::string& onnxModelPath,
|
||||
const std::array<float, 3>& subVals,
|
||||
const std::array<float, 3>& divVals,
|
||||
bool normalize,
|
||||
int maxSlotsPerGpu,
|
||||
double memSafetyFactor)
|
||||
{
|
||||
// Apply baseOptions to *this* so that m_options is consistent whether
|
||||
// the user goes through initializePool() or the 6-param buildLoadNetwork().
|
||||
m_options = baseOptions;
|
||||
return buildLoadNetwork(onnxModelPath, subVals, divVals, normalize,
|
||||
maxSlotsPerGpu, memSafetyFactor);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
bool Engine<T>::initializePoolFromEngine(
|
||||
const ANSCENTER::Options& baseOptions,
|
||||
const std::string& trtEnginePath,
|
||||
const std::array<float, 3>& subVals,
|
||||
const std::array<float, 3>& divVals,
|
||||
bool normalize,
|
||||
int maxSlotsPerGpu,
|
||||
double memSafetyFactor)
|
||||
{
|
||||
m_options = baseOptions;
|
||||
return loadNetwork(trtEnginePath, subVals, divVals, normalize,
|
||||
maxSlotsPerGpu, memSafetyFactor);
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------------------
// loadSlots -- core pool allocation logic
//
// Three modes based on maxSlotsPerGpu:
//
//    1 => ROUND-ROBIN (default)
//         1 slot per GPU, created at init. Tasks queue when all slots
//         busy. Best balance of VRAM usage and multi-GPU utilisation.
//         Example: 3 GPUs → 3 slots, round-robin dispatch.
//
//   -1 => ELASTIC MODE
//         Only the probe slot is pre-loaded. Additional slots are created
//         on-demand by tryGrowPool() when concurrent requests arrive, and
//         released by releaseIdleSlots() when idle. Higher throughput but
//         higher VRAM usage — only recommended for large GPUs (≥ 8 GB).
//
//   >1 => PRE-ALLOCATED MODE (explicit cap)
//         Slots are created upfront, capped at maxSlotsPerGpu per GPU.
//         Useful when the caller knows the required concurrency level.
// ----------------------------------------------------------------------------
/// @param baseOptions     Options copied into each slot engine (deviceIndex
///                        is overridden per slot).
/// @param modelPath       ONNX model or serialized TRT engine path, depending
///                        on @p fromOnnx.
/// @param subVals/divVals Preprocessing normalisation constants forwarded to
///                        every slot engine.
/// @param normalize       Forwarded to buildLoadNetwork()/loadNetwork().
/// @param fromOnnx        true → buildLoadNetwork(); false → loadNetwork().
/// @param maxSlotsPerGpu  <= 0 selects elastic mode; > 0 caps slots per GPU.
/// @param memSafetyFactor Fraction of free VRAM considered usable per GPU.
/// @return false when no GPU is present, the probe engine fails to load, or
///         (pre-allocated mode) zero slots could be created.
template <typename T>
bool Engine<T>::loadSlots(
    const ANSCENTER::Options& baseOptions,
    const std::string& modelPath,
    const std::array<float, 3>& subVals,
    const std::array<float, 3>& divVals,
    bool normalize,
    bool fromOnnx,
    int maxSlotsPerGpu,
    double memSafetyFactor)
{
    // -- 1. Enumerate GPUs --------------------------------------------------
    m_deviceInfos = enumerateDevices();

    if (m_deviceInfos.empty()) {
        std::cout << "Error [Pool]: No CUDA-capable GPUs detected" << std::endl;
        return false;
    }

    const bool elastic = (maxSlotsPerGpu <= 0);
    m_elasticMode = elastic;

    // Set global elastic slot cap ONCE based on total GPU VRAM.
    // Budget: ~4 slots per GB. This cap is shared across ALL pools
    // to prevent CUDA driver SRW lock convoy (30+ threads deadlocked).
    // 4 GB → 12, 6 GB → 24, 8 GB → 32, 12 GB → 48, 24 GB → 96
    if (elastic) {
        std::call_once(s_globalCapInitFlag, [this]() {
            int totalGB = 0;
            for (const auto& dev : m_deviceInfos)
                totalGB += static_cast<int>(dev.totalMemoryBytes / (1024ULL * 1024ULL * 1024ULL));
            int cap = std::max(8, totalGB * 4); // minimum 8
            s_globalElasticMax.store(cap);
            std::cout << "Info [Pool]: Global elastic slot cap = "
                << cap << " (total " << totalGB << " GB VRAM x4)" << std::endl;
        });
    }

    std::cout << "\n====================================================" << std::endl;
    std::cout << "Engine Pool Initialization"
        << (elastic ? " [ELASTIC]" : " [PRE-ALLOCATED]") << std::endl;
    std::cout << "====================================================" << std::endl;
    std::cout << "Found " << m_deviceInfos.size() << " GPU(s):" << std::endl;

    for (const auto& d : m_deviceInfos) {
        std::cout << " GPU[" << d.index << "] " << d.name
            << " | SM " << d.computeMajor << "." << d.computeMinor
            << " | Total " << d.totalMemoryBytes / 1048576 << " MiB"
            << " | Free " << d.freeMemoryAtInitBytes / 1048576 << " MiB"
            << std::endl;
    }

    // Warn about heterogeneous GPUs -- TRT engine may not be compatible
    for (size_t i = 1; i < m_deviceInfos.size(); ++i) {
        if (m_deviceInfos[i].name != m_deviceInfos[0].name) {
            std::cout << "Warning [Pool]: GPU[" << i << "] '" << m_deviceInfos[i].name
                << "' differs from GPU[0] '" << m_deviceInfos[0].name
                << "'. TRT engine binary may be incompatible with dissimilar GPUs."
                << std::endl;
        }
    }

    // -- 2. Probe engine: measure per-slot VRAM footprint -------------------
    //
    //    Memory delta = freeBeforeLoad - freeAfterLoad
    //    Includes: TRT runtime buffers, CUDA context overhead, I/O buffers,
    //    stream memory, and workspace allocated by Engine<T>.
    //
    //    MULTI-GPU BALANCING: place the probe on the GPU with the most free
    //    VRAM. This naturally distributes engines across GPUs as each pool
    //    init consumes VRAM from its chosen GPU, making the *other* GPU
    //    the best candidate for the next pool.

    int probeGpuIdx = 0;
    {
        size_t bestFree = 0;
        for (const auto& d : m_deviceInfos) {
            cudaSetDevice(d.index);
            size_t freeNow = 0, totalNow = 0;
            cudaMemGetInfo(&freeNow, &totalNow);
            std::cout << " GPU[" << d.index << "] free VRAM: " << freeNow / 1048576 << " MiB" << std::endl;
            if (freeNow > bestFree) {
                bestFree = freeNow;
                probeGpuIdx = d.index;
            }
        }
    }
    std::cout << "\nLoading probe engine on GPU[" << probeGpuIdx
        << "] (most free VRAM) to measure per-slot memory..." << std::endl;

    cudaSetDevice(probeGpuIdx);
    size_t freeBefore = 0, tmp = 0;
    cudaMemGetInfo(&freeBefore, &tmp);

    ANSCENTER::Options opts0 = baseOptions;
    opts0.deviceIndex = probeGpuIdx;

    auto probeEngine = std::make_unique<Engine<T>>(opts0);
    const bool probeOk = fromOnnx
        ? probeEngine->buildLoadNetwork(modelPath, subVals, divVals, normalize)
        : probeEngine->loadNetwork (modelPath, subVals, divVals, normalize);

    if (!probeOk) {
        logEngineEvent("[Engine] loadSlots FAIL: Probe engine failed on GPU["
            + std::to_string(probeGpuIdx) + "] for " + modelPath
            + " (freeVRAM before=" + std::to_string(freeBefore / 1048576) + " MiB)", true);
        return false;
    }

    size_t freeAfter = 0;
    cudaMemGetInfo(&freeAfter, &tmp);

    // Floor the delta at 64 MiB to guard against measurement noise
    constexpr size_t kMinSlotMemBytes = 64ULL * 1024 * 1024;
    const size_t rawDelta = (freeBefore > freeAfter) ? (freeBefore - freeAfter) : 0ULL;
    const size_t memPerSlot = std::max(rawDelta, kMinSlotMemBytes);

    std::cout << "Info [Pool]: Memory per slot = " << memPerSlot / 1048576
        << " MiB (measured delta = " << rawDelta / 1048576 << " MiB)" << std::endl;

    // Cache input/output tensor dims on *this* Engine so getInputDims() /
    // getOutputDims() work correctly when the pool is the active code path.
    m_inputDims = probeEngine->getInputDims();
    m_outputDims = probeEngine->getOutputDims();

    // Sync GPU-capped batch sizes from the probe engine. The build() function
    // may have reduced maxBatchSize based on GPU VRAM tier; propagate that to
    // the pool manager so callers see the actual runtime limits.
    m_options.maxBatchSize = probeEngine->getOptions().maxBatchSize;
    m_options.optBatchSize = probeEngine->getOptions().optBatchSize;

    // Store per-slot measurement for on-demand growth
    m_memPerSlot = memPerSlot;

    // Promote the probe engine into the first slot on the chosen GPU
    // (slot 0 is never released by releaseIdleSlots()).
    {
        InferenceSlot s;
        s.deviceIndex = probeGpuIdx;
        s.busy = false;
        s.memUsed = memPerSlot;
        s.engine = std::move(probeEngine);
        m_slots.push_back(std::move(s));
    }
    m_deviceInfos[probeGpuIdx].slotsAllocated = 1;
    m_deviceInfos[probeGpuIdx].memoryPerSlotBytes = memPerSlot;

    // -- 3. Store config for on-demand growth (elastic mode) -------------
    // tryGrowPool() replays these exact parameters when creating new slots.
    m_poolModelPath = modelPath;
    m_poolSubVals = subVals;
    m_poolDivVals = divVals;
    m_poolNormalize = normalize;
    m_poolFromOnnx = fromOnnx;
    m_poolSafetyFactor = memSafetyFactor;

    if (elastic) {
        // -- ELASTIC: only the probe slot is pre-loaded -----------------
        std::cout << "Info [Pool]: Elastic mode -- starting with 1 probe slot."
            << " Additional slots will be created on-demand as concurrent"
            << " requests arrive and released when idle." << std::endl;

        m_totalCapacity = 1;
        // Mark creation time — elastic growth is deferred for s_elasticGraceSec
        // to let other models create their probe engines first.
        {
            using namespace std::chrono;
            auto now = duration_cast<milliseconds>(
                steady_clock::now().time_since_epoch()).count();
            s_lastPoolCreatedMs.store(now);
        }
        printCapacityReport();
        startIdleTimer(); // Auto-cleanup idle slots periodically
        return true;
    }

    // -- 4. PRE-ALLOCATED: compute per-GPU capacity, then interleave -----
    //
    //    Phase A: determine how many slots each GPU can hold.
    //    Phase B: create slots in round-robin order across GPUs so that
    //             the linear m_nextSlotHint scan naturally distributes
    //             consecutive requests across GPUs:
    //               m_slots = [GPU0-s0, GPU1-s0, GPU2-s0, GPU0-s1, GPU1-s1, ...]
    //             This gives: Task1→GPU0, Task2→GPU1, Task3→GPU2, Task4→GPU0 ...

    const int numGpus = static_cast<int>(m_deviceInfos.size());

    // Phase A: compute slotsToAdd per GPU
    // NOTE: enumerateDevices() assigns info.index == vector position, so the
    // loop counter di doubles as the CUDA device index here.
    std::vector<int> slotsPerGpu(numGpus, 0);
    int maxSlotsAny = 0;

    for (int di = 0; di < numGpus; ++di) {
        cudaSetDevice(di);
        size_t freeNow = 0, totalNow = 0;
        cudaMemGetInfo(&freeNow, &totalNow);

        const size_t usableBytes = static_cast<size_t>(
            static_cast<double>(freeNow) * memSafetyFactor);

        int slotsToAdd = (memPerSlot > 0)
            ? static_cast<int>(usableBytes / memPerSlot) : 0;

        // Apply explicit per-GPU cap; the probe GPU already has the probe slot
        if (maxSlotsPerGpu > 0) {
            const int budget = (di == probeGpuIdx)
                ? (maxSlotsPerGpu - 1)
                : maxSlotsPerGpu;
            slotsToAdd = std::min(slotsToAdd, budget);
        }

        slotsPerGpu[di] = slotsToAdd;
        if (slotsToAdd > maxSlotsAny) maxSlotsAny = slotsToAdd;
        m_deviceInfos[di].memoryPerSlotBytes = memPerSlot;

        std::cout << "Info [Pool]: GPU[" << di << "] " << m_deviceInfos[di].name
            << " -- free " << freeNow / 1048576 << " MiB"
            << ", usable " << usableBytes / 1048576 << " MiB"
            << " => will add " << slotsToAdd << " slot(s)" << std::endl;
    }

    // Phase B: create slots interleaved across GPUs
    //   Round 0: GPU0-slot0, GPU1-slot0, GPU2-slot0
    //   Round 1: GPU0-slot1, GPU1-slot1, GPU2-slot1
    //   ...
    std::vector<int> slotsCreated(numGpus, 0);  // track actual success per GPU
    std::vector<bool> gpuFailed(numGpus, false); // stop trying failed GPUs

    for (int round = 0; round < maxSlotsAny; ++round) {
        for (int di = 0; di < numGpus; ++di) {
            if (gpuFailed[di]) continue;
            if (slotsCreated[di] >= slotsPerGpu[di]) continue;

            cudaSetDevice(di);
            ANSCENTER::Options opts = baseOptions;
            opts.deviceIndex = di;

            auto eng = std::make_unique<Engine<T>>(opts);
            eng->setVerbose(false);
            eng->setDisableGraphs(true); // concurrent graph captures corrupt CUDA context
            eng->m_skipEngineCache = m_skipEngineCache; // propagate to pool slots
            const bool ok = fromOnnx
                ? eng->buildLoadNetwork(modelPath, subVals, divVals, normalize)
                : eng->loadNetwork (modelPath, subVals, divVals, normalize);

            if (!ok) {
                // A failed load usually means this GPU is out of VRAM; stop
                // allocating on it but keep filling the others.
                std::cout << "Warning [Pool]: GPU[" << di << "] slot "
                    << (slotsCreated[di] + 1) << "/" << slotsPerGpu[di]
                    << " failed to load; halting allocation on this device." << std::endl;
                gpuFailed[di] = true;
                continue;
            }

            InferenceSlot slot;
            slot.deviceIndex = di;
            slot.busy = false;
            slot.memUsed = memPerSlot;
            slot.engine = std::move(eng);
            m_slots.push_back(std::move(slot));
            m_deviceInfos[di].slotsAllocated++;
            slotsCreated[di]++;
        }
    }

    m_totalCapacity = static_cast<int>(m_slots.size());
    printCapacityReport();

    if (m_totalCapacity == 0) {
        std::cout << "Error [Pool]: Zero inference slots allocated -- "
            "check available GPU memory." << std::endl;
        return false;
    }

    return true;
}
|
||||
|
||||
// ----------------------------------------------------------------------------
// tryGrowPool -- on-demand slot creation (elastic mode)
//
// Called by runInferenceFromPool when every alive slot is busy.
// Creates ONE new engine on the first GPU that has enough free VRAM.
// Candidates are sorted by free VRAM descending, so the least-loaded GPU
// is tried first.
//
// Returns a pointer to the new slot (already marked busy) or nullptr if
// no GPU has enough VRAM, the global cap is reached, or the elastic grace
// period is still active.
//
// Thread-safety: m_growMutex serialises growth so only one thread creates
// a slot at a time. m_slotMutex is acquired briefly to push the new slot
// into the deque. The calling thread waits (engine deserialisation takes
// ~0.5-3 s), but that is far better than rejecting the request entirely.
//
// @param bypassGrace  true when a consumer explicitly requested capacity
//                     (growPool()); skips the grace-period check.
// ----------------------------------------------------------------------------
template <typename T>
typename Engine<T>::InferenceSlot*
Engine<T>::tryGrowPool(bool bypassGrace)
{
    std::lock_guard<std::mutex> growLock(m_growMutex);

    // Grace period: defer elastic growth for s_elasticGraceSec after the most
    // recent pool creation. This reserves VRAM for probe engines that haven't
    // been created yet (e.g., 10 models loading sequentially — early pools
    // shouldn't grow elastic slots while later probes still need VRAM).
    // Bypassed for demand-driven growth (a new consumer explicitly joined the
    // pool, so we KNOW more slots are needed).
    if (!bypassGrace) {
        using namespace std::chrono;
        auto now = duration_cast<milliseconds>(
            steady_clock::now().time_since_epoch()).count();
        int64_t lastCreated = s_lastPoolCreatedMs.load();
        int64_t elapsedSec = (now - lastCreated) / 1000;
        if (lastCreated > 0 && elapsedSec < s_elasticGraceSec) {
            // Silently skip — don't spam logs during grace period
            return nullptr;
        }
    }

    // Global cap: prevent too many concurrent CUDA operations across ALL pools.
    // With shared engine pools, unlimited elastic growth causes CUDA driver
    // SRW lock convoy (30+ threads all blocked on nvcuda64 internal locks).
    const int currentGlobal = s_globalElasticCount.load();
    const int maxGlobal = s_globalElasticMax.load();
    if (currentGlobal >= maxGlobal) {
        std::cout << "Info [Pool]: tryGrowPool -- global cap reached ("
            << currentGlobal << "/" << maxGlobal
            << " total slots), not growing" << std::endl;
        return nullptr;
    }

    // Find the GPU with the most free VRAM that has enough for one more slot.
    // This naturally balances load across GPUs instead of always filling GPU 0.
    // requiredBytes re-inflates the measured per-slot footprint by the safety
    // factor so growth uses the same headroom rule as initial allocation.
    const size_t requiredBytes = (m_poolSafetyFactor > 0.0)
        ? static_cast<size_t>(static_cast<double>(m_memPerSlot) / m_poolSafetyFactor)
        : m_memPerSlot;

    std::cout << "Info [Pool]: tryGrowPool called -- need " << (requiredBytes >> 20)
        << " MiB per slot, scanning " << m_deviceInfos.size() << " GPU(s)..."
        << std::endl;

    // Sort device candidates by free VRAM descending (most free first)
    std::vector<std::pair<size_t, int>> gpuByFreeVram; // {freeBytes, deviceIndex}
    for (const auto& dev : m_deviceInfos) {
        cudaSetDevice(dev.index);
        size_t freeNow = 0, totalNow = 0;
        cudaMemGetInfo(&freeNow, &totalNow);
        std::cout << "Info [Pool]: GPU[" << dev.index << "] free=" << (freeNow >> 20)
            << " MiB, required=" << (requiredBytes >> 20) << " MiB"
            << (freeNow >= requiredBytes ? " -> CANDIDATE" : " -> SKIP (not enough)")
            << std::endl;
        if (freeNow >= requiredBytes) {
            gpuByFreeVram.push_back({freeNow, dev.index});
        }
    }
    std::sort(gpuByFreeVram.begin(), gpuByFreeVram.end(),
        [](const auto& a, const auto& b) { return a.first > b.first; });

    if (gpuByFreeVram.empty()) {
        std::cout << "Warning [Pool]: tryGrowPool -- no GPU has enough free VRAM ("
            << (requiredBytes >> 20) << " MiB), cannot grow" << std::endl;
        return nullptr;
    }

    for (const auto& [freeVram, devIdx] : gpuByFreeVram) {
        auto& dev = m_deviceInfos[devIdx];

        std::cout << "Info [Pool]: Creating on-demand slot on GPU[" << dev.index
            << "] (free=" << (freeVram >> 20) << " MiB)..." << std::endl;

        // Create a new engine on the GPU with the most free VRAM,
        // replaying the configuration captured by loadSlots().
        cudaSetDevice(dev.index);
        ANSCENTER::Options opts = m_options;
        opts.deviceIndex = dev.index;

        auto eng = std::make_unique<Engine<T>>(opts);
        eng->setVerbose(false);
        eng->setDisableGraphs(true); // concurrent graph captures corrupt CUDA context
        eng->m_skipEngineCache = m_skipEngineCache; // propagate to on-demand slots
        eng->m_skipOnnxRebuild = true; // elastic growth must NOT delete/rebuild engine files
        eng->m_skipOnnxBuild = bypassGrace; // demand-driven growth: skip ONNX→TRT if no cached engine
        const bool ok = m_poolFromOnnx
            ? eng->buildLoadNetwork(m_poolModelPath, m_poolSubVals,
                                    m_poolDivVals, m_poolNormalize)
            : eng->loadNetwork(m_poolModelPath, m_poolSubVals,
                               m_poolDivVals, m_poolNormalize);

        if (!ok) {
            std::cout << "Warning [Pool]: On-demand slot creation FAILED on GPU["
                << dev.index << "]" << std::endl;
            continue; // try next GPU
        }
        std::cout << "Info [Pool]: On-demand slot engine loaded OK on GPU["
            << dev.index << "]" << std::endl;

        // Check if we can reuse a dead slot entry (engine == nullptr)
        {
            std::lock_guard<std::mutex> slotLock(m_slotMutex);

            for (auto& s : m_slots) {
                if (!s.engine) { // dead entry -- recycle it
                    s.deviceIndex = dev.index;
                    s.busy = true; // pre-claimed for the caller
                    s.memUsed = m_memPerSlot;
                    s.engine = std::move(eng);
                    s.lastUsedTime = std::chrono::steady_clock::now();
                    dev.slotsAllocated++;
                    // Recount alive slots
                    int alive = 0;
                    for (const auto& x : m_slots) { if (x.engine) ++alive; }
                    m_totalCapacity = alive;
                    s_globalElasticCount++;
                    std::cout << "Info [Pool]: On-demand slot recycled on GPU["
                        << dev.index << "] -- pool now " << m_totalCapacity
                        << " slot(s) (global " << s_globalElasticCount.load()
                        << "/" << s_globalElasticMax.load() << ")" << std::endl;
                    return &s;
                }
            }

            // No dead entries to recycle -- push a new one.
            // std::deque::push_back does NOT invalidate references to existing
            // elements, so pointers held by other threads remain valid.
            InferenceSlot newSlot;
            newSlot.deviceIndex = dev.index;
            newSlot.busy = true; // pre-claimed for the caller
            newSlot.memUsed = m_memPerSlot;
            newSlot.engine = std::move(eng);
            newSlot.lastUsedTime = std::chrono::steady_clock::now();
            m_slots.push_back(std::move(newSlot));
            dev.slotsAllocated++;
            m_totalCapacity = static_cast<int>(m_slots.size()); // all alive here
            s_globalElasticCount++;

            std::cout << "Info [Pool]: On-demand slot created on GPU["
                << dev.index << "] -- pool now " << m_totalCapacity
                << " slot(s) (global " << s_globalElasticCount.load()
                << "/" << s_globalElasticMax.load() << ")" << std::endl;

            return &m_slots.back();
        }
    }

    return nullptr; // every GPU is full
}
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
// growPool -- public demand-driven growth (bypasses grace period)
|
||||
// ----------------------------------------------------------------------------
|
||||
template <typename T>
|
||||
int Engine<T>::growPool(int count)
|
||||
{
|
||||
int created = 0;
|
||||
for (int i = 0; i < count; ++i) {
|
||||
auto* slot = tryGrowPool(/*bypassGrace=*/true);
|
||||
if (!slot) break;
|
||||
// Release so inference threads can use it
|
||||
{
|
||||
std::lock_guard<std::mutex> lk(m_slotMutex);
|
||||
slot->busy = false;
|
||||
slot->lastUsedTime = std::chrono::steady_clock::now();
|
||||
}
|
||||
m_slotFreeCv.notify_one();
|
||||
++created;
|
||||
}
|
||||
return created;
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------------------
// runInferenceFromPool -- thread-safe slot dispatch
//
// @param inputs          Per-batch, per-input GPU image tensors forwarded to
//                        the chosen slot's runInference().
// @param featureVectors  Output feature vectors filled by the slot engine.
// @return the slot engine's runInference() result, or false when no slot
//         became free within the 2 s timeout or runInference threw.
// ----------------------------------------------------------------------------
template <typename T>
bool Engine<T>::runInferenceFromPool(
    const std::vector<std::vector<cv::cuda::GpuMat>>& inputs,
    std::vector<std::vector<std::vector<T>>>& featureVectors)
{
    // -- 1. Acquire an idle, alive slot (round-robin) --------------------
    //
    // Round-robin starting point avoids always favouring GPU 0. Each call
    // advances m_nextSlotHint so consecutive requests spread across GPUs.
    // The mutex is held only for the O(N) scan + flag flip -- NOT during GPU
    // execution -- so threads using different slots proceed in parallel.
    //
    // PROACTIVE GROWTH (elastic mode):
    //   If all alive slots are busy when a request arrives, the pool is
    //   undersized for the current concurrency level. We kick off pool
    //   growth (tryGrowPool) in a detached background thread while we
    //   wait for the current slot to free. This ensures multi-GPU
    //   utilisation: the new slot lands on the GPU with the most free
    //   VRAM (typically GPU[1]) and is ready for the *next* request.
    //   Growth is serialised by m_growMutex so duplicate threads are
    //   harmless — the second one finds a fresh slot immediately.
    InferenceSlot* slot = nullptr;
    bool kickedGrowth = false;

    {
        std::unique_lock<std::mutex> lock(m_slotMutex);

        // Hard deadline for the whole acquisition attempt.
        const auto deadline = std::chrono::steady_clock::now()
            + std::chrono::milliseconds(2000);

        // Spurious CV wakeups are harmless: the loop simply re-scans.
        while (!slot) {
            const size_t n = m_slots.size();
            if (n > 0) {
                const size_t start = m_nextSlotHint.load() % n;
                for (size_t i = 0; i < n; ++i) {
                    auto& s = m_slots[(start + i) % n];
                    if (!s.busy && s.engine) { // alive and idle
                        s.busy = true;
                        slot = &s;
                        m_nextSlotHint = (start + i + 1) % n;
                        break;
                    }
                }
            }

            if (!slot) {
                // All slots busy. In elastic mode, proactively grow the
                // pool in the background so the next request has a slot
                // on a different GPU. We only kick once per wait cycle.
                if (m_elasticMode && !kickedGrowth
                    && s_globalElasticCount.load() < s_globalElasticMax.load()) {
                    kickedGrowth = true;
                    std::cout << "Info [Pool]: All slots busy -- kicking background growth thread"
                        << std::endl;
                    // Fire-and-forget: tryGrowPool is serialised by
                    // m_growMutex, so concurrent kicks are safe.
                    std::thread([this]() {
                        std::cout << "Info [Pool]: Background growth thread started" << std::endl;
                        auto* newSlot = this->tryGrowPool();
                        if (newSlot) {
                            // Slot was created pre-marked busy; release it
                            // so the next requester can claim it.
                            {
                                std::lock_guard<std::mutex> lk(m_slotMutex);
                                newSlot->busy = false;
                                newSlot->lastUsedTime = std::chrono::steady_clock::now();
                            }
                            m_slotFreeCv.notify_all();
                            std::cout << "Info [Pool]: Background growth SUCCEEDED -- new slot on GPU["
                                << newSlot->deviceIndex << "], pool now "
                                << m_totalCapacity << " slot(s)" << std::endl;
                        } else {
                            std::cout << "Warning [Pool]: Background growth FAILED -- no slot created"
                                << std::endl;
                        }
                    }).detach();
                }

                // Wait for a running slot to finish and signal us
                if (m_slotFreeCv.wait_until(lock, deadline)
                    == std::cv_status::timeout) {
                    break; // fall through to reject
                }
            }
        }
    }

    // -- 3. Still no slot => reject ---------------------------------------
    if (!slot) {
        std::string errMsg = "[Engine] runInferenceFromPool FAIL: Capacity reached -- all "
            + std::to_string(m_activeCount.load()) + "/" + std::to_string(m_totalCapacity)
            + " slot(s) busy"
            + (m_elasticMode ? " and all GPUs full" : "")
            + ". Request rejected (2s timeout).";
        std::cout << errMsg << std::endl;
        logEngineEvent(errMsg, true);
        return false;
    }

    ++m_activeCount;

    // -- Exception guard: guarantee busy-flag and activeCount are restored ----
    // If runInference() throws (cv::Exception, std::bad_alloc, ...) the slot
    // must be released and the counter decremented -- otherwise the slot is
    // permanently lost and capacity shrinks with every exception. The manual
    // try/catch below implements that guarantee; release happens after it
    // regardless of outcome.
    bool result = false;
    try {
        // Match the calling thread's CUDA context to the slot's device.
        // Skip the call if the thread is already on the correct device
        // (cudaSetDevice under WDDM can cost 1-5ms per call).
        int currentDev = -1;
        cudaGetDevice(&currentDev);
        if (currentDev != slot->deviceIndex) {
            cudaSetDevice(slot->deviceIndex);
        }
        result = slot->engine->runInference(inputs, featureVectors);
    }
    catch (const std::exception& ex) {
        std::cout << "Error [Pool]: runInference threw: " << ex.what() << std::endl;
    }
    catch (...) {
        std::cout << "Error [Pool]: runInference threw unknown exception" << std::endl;
    }

    // Release the slot under the lock, then wake one waiter.
    {
        std::lock_guard<std::mutex> lock(m_slotMutex);
        slot->busy = false;
        slot->lastUsedTime = std::chrono::steady_clock::now();
    }
    --m_activeCount;
    m_slotFreeCv.notify_one(); // wake one thread waiting for a free slot

    return result;
}
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
// releaseIdleSlots -- VRAM reclamation for elastic pools
|
||||
//
|
||||
// Destroys engine instances that have been idle for at least `idleSeconds`.
|
||||
// The first slot (probe, index 0) is never released so the model remains
|
||||
// instantly usable without re-measurement.
|
||||
//
|
||||
// Dead slots are NOT erased from the deque (to avoid invalidating pointers);
|
||||
// their engine is reset to nullptr and they are recycled by tryGrowPool().
|
||||
//
|
||||
// Call from a periodic background timer, e.g. every 10-30 seconds:
|
||||
// engine->releaseIdleSlots(30.0);
|
||||
// ----------------------------------------------------------------------------
|
||||
template <typename T>
|
||||
int Engine<T>::releaseIdleSlots(double idleSeconds)
|
||||
{
|
||||
std::lock_guard<std::mutex> growLock(m_growMutex);
|
||||
std::lock_guard<std::mutex> slotLock(m_slotMutex);
|
||||
|
||||
const auto now = std::chrono::steady_clock::now();
|
||||
int released = 0;
|
||||
|
||||
// Skip index 0 -- that's the probe slot, always kept alive
|
||||
for (size_t i = 1; i < m_slots.size(); ++i) {
|
||||
auto& s = m_slots[i];
|
||||
if (!s.busy && s.engine) { // alive and idle
|
||||
const double idle = std::chrono::duration<double>(
|
||||
now - s.lastUsedTime).count();
|
||||
if (idle >= idleSeconds) {
|
||||
// Update device info
|
||||
for (auto& dev : m_deviceInfos) {
|
||||
if (dev.index == s.deviceIndex) {
|
||||
if (dev.slotsAllocated > 0) dev.slotsAllocated--;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
std::cout << "Info [Pool]: Releasing idle slot on GPU["
|
||||
<< s.deviceIndex << "] (idle "
|
||||
<< static_cast<int>(idle) << "s)" << std::endl;
|
||||
|
||||
// Destroy engine -- frees GPU memory.
|
||||
// The InferenceSlot entry stays in the deque (dead) for reuse.
|
||||
s.engine.reset();
|
||||
s.memUsed = 0;
|
||||
released++;
|
||||
s_globalElasticCount--;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Recount alive slots
|
||||
int alive = 0;
|
||||
for (const auto& s : m_slots) { if (s.engine) ++alive; }
|
||||
m_totalCapacity = alive;
|
||||
|
||||
if (released > 0) {
|
||||
std::cout << "Info [Pool]: Released " << released << " idle slot(s)"
|
||||
<< " -- pool now " << m_totalCapacity << " alive slot(s)"
|
||||
<< std::endl;
|
||||
}
|
||||
|
||||
return released;
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
// printCapacityReport
|
||||
// ----------------------------------------------------------------------------
|
||||
template <typename T>
|
||||
void Engine<T>::printCapacityReport() const
|
||||
{
|
||||
// Count alive vs dead -- lock protects against concurrent tryGrowPool
|
||||
std::lock_guard<std::mutex> lock(m_slotMutex);
|
||||
int alive = 0, dead = 0;
|
||||
for (const auto& s : m_slots) {
|
||||
if (s.engine) ++alive; else ++dead;
|
||||
}
|
||||
|
||||
std::cout << "\n=====================================================" << std::endl;
|
||||
std::cout << " Engine Pool -- Capacity Report"
|
||||
<< (m_elasticMode ? " [ELASTIC]" : " [PRE-ALLOCATED]") << std::endl;
|
||||
std::cout << "=====================================================" << std::endl;
|
||||
std::cout << " Alive inference slots : " << alive << std::endl;
|
||||
if (dead > 0)
|
||||
std::cout << " Dead (recyclable) : " << dead << std::endl;
|
||||
std::cout << " Active inferences : " << m_activeCount.load() << std::endl;
|
||||
std::cout << " Available slots : "
|
||||
<< (alive - m_activeCount.load())
|
||||
<< (m_elasticMode ? " (+ on-demand)" : "")
|
||||
<< std::endl;
|
||||
if (m_elasticMode) {
|
||||
std::cout << " Global slot usage : "
|
||||
<< s_globalElasticCount.load() << "/" << s_globalElasticMax.load()
|
||||
<< " (across all pools)" << std::endl;
|
||||
}
|
||||
std::cout << " Memory per slot : " << m_memPerSlot / 1048576 << " MiB" << std::endl;
|
||||
std::cout << "-----------------------------------------------------" << std::endl;
|
||||
for (const auto& d : m_deviceInfos) {
|
||||
std::cout << " GPU[" << d.index << "] " << d.name
|
||||
<< " | SM " << d.computeMajor << "." << d.computeMinor
|
||||
<< " | Total " << d.totalMemoryBytes / 1048576 << " MiB"
|
||||
<< " | Slots: " << d.slotsAllocated
|
||||
<< " | Mem/slot: " << d.memoryPerSlotBytes / 1048576 << " MiB"
|
||||
<< std::endl;
|
||||
}
|
||||
std::cout << "=====================================================" << std::endl;
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
// startIdleTimer / stopIdleTimer -- automatic idle-slot cleanup
|
||||
//
|
||||
// A background thread wakes every m_idleTimerIntervalSec seconds and calls
|
||||
// releaseIdleSlots(m_idleTimerThresholdSec). The thread uses a
|
||||
// condition_variable with a timed wait so that stopIdleTimer() can wake it
|
||||
// immediately for a clean shutdown (no dangling sleeps).
|
||||
//
|
||||
// Only active in elastic mode -- pre-allocated pools have fixed capacity.
|
||||
// ----------------------------------------------------------------------------
|
||||
template <typename T>
|
||||
void Engine<T>::startIdleTimer()
|
||||
{
|
||||
if (!m_elasticMode) return; // no-op for pre-allocated pools
|
||||
if (m_idleTimerThread.joinable()) return; // already running
|
||||
|
||||
m_idleTimerStop = false;
|
||||
|
||||
m_idleTimerThread = std::thread([this]() {
|
||||
std::cout << "Info [Pool]: Idle-slot cleanup timer started "
|
||||
<< "(interval=" << m_idleTimerIntervalSec << "s, threshold="
|
||||
<< m_idleTimerThresholdSec << "s)" << std::endl;
|
||||
|
||||
while (!m_idleTimerStop.load()) {
|
||||
// Sleep for the interval, but wake early if stop is signalled
|
||||
{
|
||||
std::unique_lock<std::mutex> lk(m_idleTimerMutex);
|
||||
m_idleTimerCv.wait_for(lk,
|
||||
std::chrono::duration<double>(m_idleTimerIntervalSec),
|
||||
[this]() { return m_idleTimerStop.load(); });
|
||||
}
|
||||
|
||||
if (m_idleTimerStop.load()) break;
|
||||
|
||||
releaseIdleSlots(m_idleTimerThresholdSec);
|
||||
}
|
||||
|
||||
std::cout << "Info [Pool]: Idle-slot cleanup timer stopped." << std::endl;
|
||||
});
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void Engine<T>::stopIdleTimer()
|
||||
{
|
||||
if (!m_idleTimerThread.joinable()) return; // not running
|
||||
|
||||
m_idleTimerStop = true;
|
||||
m_idleTimerCv.notify_all(); // wake the sleeping thread immediately
|
||||
|
||||
// During ExitProcess, worker threads are already killed by the OS.
|
||||
// Calling join() on a dead thread deadlocks or causes std::terminate.
|
||||
// Detach instead — the OS will reclaim everything momentarily.
|
||||
if (g_processExiting().load(std::memory_order_relaxed)) {
|
||||
m_idleTimerThread.detach();
|
||||
} else {
|
||||
m_idleTimerThread.join(); // normal path: wait for clean exit
|
||||
}
|
||||
}
|
||||
431
engines/TensorRTAPI/include/engine/EnginePoolManager.h
Normal file
431
engines/TensorRTAPI/include/engine/EnginePoolManager.h
Normal file
@@ -0,0 +1,431 @@
|
||||
#pragma once
|
||||
// EnginePoolManager.h — Process-wide cache for shared Engine<T> pool instances.
|
||||
//
|
||||
// When multiple AI tasks load the same model (same ONNX path + GPU + config),
|
||||
// this manager ensures they share a SINGLE Engine<T> pool instead of each task
|
||||
// creating its own pool with independent execution contexts and VRAM buffers.
|
||||
//
|
||||
// Without sharing: N tasks × ~500 MB = N × 500 MB VRAM (OOM at ~5-8 tasks on 8GB GPU)
|
||||
// With sharing: 1 pool × ~500 MB = 500 MB total (unlimited tasks, slower via queuing)
|
||||
//
|
||||
// Lazy eviction: when refcount drops to 0, the pool is kept alive for
|
||||
// kEvictGraceSec seconds. If a new task acquires it within that window,
|
||||
// it gets an instant HIT without rebuilding. This handles the LabView
|
||||
// edit/duplicate/create cycle (destroy → recreate) gracefully.
|
||||
//
|
||||
// Thread-safety: All public methods are mutex-protected.
|
||||
|
||||
#include <memory>
|
||||
#include <mutex>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <array>
|
||||
#include <iostream>
|
||||
#include <functional>
|
||||
#include <chrono>
|
||||
#include <thread>
|
||||
#include <atomic>
|
||||
#include <cuda_runtime.h>
|
||||
#include "TRTEngineCache.h" // constructor touches TRTEngineCache::instance() for destruction ordering
|
||||
#ifdef _WIN32
|
||||
#include <windows.h>
|
||||
#endif
|
||||
|
||||
// Forward declare Engine<T> to avoid circular includes.
|
||||
// The header that includes this must also include engine.h.
|
||||
template <typename T> class Engine;
|
||||
|
||||
namespace ANSCENTER { struct Options; }
|
||||
|
||||
// A process-wide, reference-counted cache of shared Engine<T> pools, keyed by
// (model path, precision, max batch). All public methods lock m_mutex; the
// class is reached only through the Meyers singleton instance().
template <typename T>
class EnginePoolManager {
public:
    /// Meyers singleton -- constructed on first use, destroyed during static
    /// teardown (after TRTEngineCache; see the constructor note below).
    static EnginePoolManager& instance() {
        static EnginePoolManager s_instance;
        return s_instance;
    }

    // ========================================================================
    // Cache key — uniquely identifies a compatible Engine pool.
    // ========================================================================
    struct PoolKey {
        std::string modelPath;  // path of the model this pool was built from
        int precision = 0;      // cast from Precision enum
        int maxBatch = 1;       // maximum batch size the pool supports

        bool operator==(const PoolKey& o) const {
            return modelPath == o.modelPath &&
                   precision == o.precision &&
                   maxBatch == o.maxBatch;
        }
    };

    // Hash functor so PoolKey can be an unordered_map key.
    // NOTE(review): the XOR-with-shift mixing is weak (precision/maxBatch
    // only perturb a few bits), but collisions only slow lookup -- the map
    // still resolves them via operator==.
    struct PoolKeyHash {
        size_t operator()(const PoolKey& k) const {
            size_t h = std::hash<std::string>{}(k.modelPath);
            h ^= std::hash<int>{}(k.precision) << 16;
            h ^= std::hash<int>{}(k.maxBatch) << 24;
            return h;
        }
    };

    // ========================================================================
    // acquire() — get or create a shared Engine pool.
    //
    // On first call for a given key: creates a new Engine<T>, calls
    // buildLoadNetwork with the provided parameters, and caches it.
    //
    // On subsequent calls (or within lazy-eviction grace period):
    //   returns the existing shared_ptr and increments refcount.
    //   No VRAM allocated, near-instant.
    //
    // Returns nullptr if engine creation/loading fails.
    //
    // Failure handling is a 4-step retry ladder (see inline comments):
    //   elastic build -> force-evict + retry -> lightweight retry ->
    //   3s delayed final lightweight retry.
    // ========================================================================
    std::shared_ptr<Engine<T>> acquire(
        const PoolKey& key,
        const ANSCENTER::Options& options,
        const std::string& modelPath,
        const std::array<float, 3>& subVals,
        const std::array<float, 3>& divVals,
        bool normalize,
        int maxSlotsPerGpu)
    {
        // Optimizer / temporary engines: maxSlotsPerGpu==0 means the caller
        // only needs a lightweight, non-shared engine (e.g., OptimizeModelStr).
        // Bypass the pool cache entirely:
        //   - Don't hold m_mutex (which blocks ALL other pool creation)
        //   - Don't cache the result (temporary engine is destroyed on release)
        //   - Use the simple 4-param buildLoadNetwork (no pool, no probe, no VRAM measurement)
        // Note: maxSlotsPerGpu==1 is now the normal "1 slot per GPU" multi-GPU
        // round-robin mode, so it goes through the pool path below.
        if (maxSlotsPerGpu == 0) {
            logEvent("[EnginePoolManager] BYPASS (maxSlots=0): " + key.modelPath
                + " — creating non-shared engine");
            auto engine = std::make_shared<Engine<T>>(options);
            bool ok = engine->buildLoadNetwork(modelPath, subVals, divVals, normalize);
            return ok ? engine : nullptr;
        }

        std::unique_lock<std::mutex> lock(m_mutex);

        auto it = m_pools.find(key);
        if (it != m_pools.end()) {
            it->second.refcount++;
            it->second.evictTime = TimePoint{}; // cancel pending eviction
            int refs = it->second.refcount;
            auto engine = it->second.engine;
            logEvent("[EnginePoolManager] HIT: " + key.modelPath
                + " refs=" + std::to_string(refs));

            // Demand-driven growth: only in elastic mode (maxSlotsPerGpu <= 0
            // or > 1). With maxSlotsPerGpu==1 (round-robin default), the pool
            // already has the right number of slots (1 per GPU) — tasks queue
            // when all slots are busy, which is the intended behavior.
            if (maxSlotsPerGpu != 1 && refs > 1 && engine) {
                int alive = engine->getTotalCapacity();
                if (alive < refs) {
                    // Check total GPU VRAM — skip growth on small GPUs
                    size_t totalVram = 0;
                    {
                        size_t freeTmp = 0;
                        cudaSetDevice(options.deviceIndex);
                        cudaMemGetInfo(&freeTmp, &totalVram);
                    }
                    constexpr size_t kMinVramForGrowth = 6ULL * 1024 * 1024 * 1024; // 6 GB
                    if (totalVram >= kMinVramForGrowth) {
                        lock.unlock(); // release PoolManager lock before growing
                        // Grow on a detached thread so acquire() returns
                        // immediately; the lambda owns a shared_ptr copy so
                        // the engine outlives the growth even if released.
                        std::thread([engine, alive, refs, modelPath = key.modelPath]() {
                            int created = engine->growPool(1);
                            if (created > 0) {
                                logEngineEvent("[EnginePoolManager] DEMAND GROWTH: " + modelPath
                                    + " grew from " + std::to_string(alive)
                                    + " to " + std::to_string(engine->getTotalCapacity())
                                    + " slots (refs=" + std::to_string(refs) + ")");
                            }
                        }).detach();
                    } else {
                        logEvent("[EnginePoolManager] SKIP GROWTH: " + key.modelPath
                            + " (GPU VRAM " + std::to_string(totalVram >> 20)
                            + " MiB < 6 GB threshold, refs=" + std::to_string(refs) + ")");
                    }
                }
            }

            return engine;
        }

        // Cache miss — create new Engine pool
        logEvent("[EnginePoolManager] MISS: Creating pool for " + key.modelPath + "...");

        // Log VRAM before attempting to create probe
        {
            size_t freeMem = 0, totalMem = 0;
            cudaSetDevice(options.deviceIndex);
            cudaMemGetInfo(&freeMem, &totalMem);
            logEvent("[EnginePoolManager] GPU[" + std::to_string(options.deviceIndex)
                + "] VRAM: " + std::to_string(freeMem >> 20) + " MiB free / "
                + std::to_string(totalMem >> 20) + " MiB total (before probe)");
        }

        auto engine = std::make_shared<Engine<T>>(options);
        bool ok = engine->buildLoadNetwork(modelPath, subVals, divVals, normalize, maxSlotsPerGpu);
        if (!ok) {
            // Step 1: Force-evict all pools with refcount=0 to reclaim VRAM
            int evicted = forceEvictPending();
            if (evicted > 0) {
                size_t freeMem2 = 0, totalMem2 = 0;
                cudaSetDevice(options.deviceIndex);
                cudaMemGetInfo(&freeMem2, &totalMem2);
                logEvent("[EnginePoolManager] RETRY EVICT: Force-evicted " + std::to_string(evicted)
                    + " pending pool(s), now " + std::to_string(freeMem2 >> 20)
                    + " MiB free. Retrying " + key.modelPath + "...");

                engine = std::make_shared<Engine<T>>(options);
                ok = engine->buildLoadNetwork(modelPath, subVals, divVals, normalize, maxSlotsPerGpu);
            }

            // Step 2: If still failing, retry with lightweight mode (no elastic pool).
            // The elastic probe does heavy warmup (batch 1-8, 10+ iterations) which
            // consumes ~300-500 MB vs ~50-100 MB for a simple loadNetwork.
            // Lightweight mode: tasks queue for a single shared slot — slower but works.
            if (!ok) {
                size_t freeMem3 = 0, totalMem3 = 0;
                cudaSetDevice(options.deviceIndex);
                cudaMemGetInfo(&freeMem3, &totalMem3);
                logEvent("[EnginePoolManager] RETRY LIGHTWEIGHT: Elastic probe failed, "
                    + std::to_string(freeMem3 >> 20) + " MiB free. "
                    "Retrying with single-slot mode for " + key.modelPath + "...");

                engine = std::make_shared<Engine<T>>(options);
                ok = engine->buildLoadNetwork(modelPath, subVals, divVals, normalize);
            }

            // Step 3: If still failing, wait briefly and retry.
            // Transient failures can occur when:
            //   - TRT engine file is being written by another build (partial file)
            //   - CUDA driver has temporary resource contention during multi-pool startup
            //   - GPU memory fragmentation resolves after previous allocations settle
            // Evidence: FireSmoke/detector.onnx failed at 3740 MiB free, then
            // succeeded 4 seconds later at 3154 MiB free (less VRAM!).
            if (!ok) {
                size_t freeMem4 = 0, totalMem4 = 0;
                cudaSetDevice(options.deviceIndex);
                cudaMemGetInfo(&freeMem4, &totalMem4);
                logEvent("[EnginePoolManager] RETRY DELAYED: All attempts failed with "
                    + std::to_string(freeMem4 >> 20) + " MiB free. "
                    "Waiting 3s before final retry for " + key.modelPath + "...");

                // Release mutex during sleep so other tasks can proceed
                // (they may complete pool creation that resolves our issue)
                lock.unlock();
                std::this_thread::sleep_for(std::chrono::seconds(3));
                lock.lock();

                // Check if another thread created this pool while we slept
                auto it2 = m_pools.find(key);
                if (it2 != m_pools.end()) {
                    it2->second.refcount++;
                    it2->second.evictTime = TimePoint{};
                    logEvent("[EnginePoolManager] HIT (after delay): " + key.modelPath
                        + " refs=" + std::to_string(it2->second.refcount));
                    return it2->second.engine;
                }

                // Final retry — try lightweight again after delay
                cudaSetDevice(options.deviceIndex);
                cudaMemGetInfo(&freeMem4, &totalMem4);
                logEvent("[EnginePoolManager] RETRY FINAL: " + std::to_string(freeMem4 >> 20)
                    + " MiB free. Last attempt for " + key.modelPath + "...");

                engine = std::make_shared<Engine<T>>(options);
                ok = engine->buildLoadNetwork(modelPath, subVals, divVals, normalize);
            }

            if (!ok) {
                // NOTE(review): no cudaSetDevice here before cudaMemGetInfo —
                // the numbers logged are for whatever device is current on
                // this thread, which may not be options.deviceIndex. Confirm
                // whether that is intentional.
                size_t freeMem = 0, totalMem = 0;
                cudaMemGetInfo(&freeMem, &totalMem);
                logEvent("[EnginePoolManager] FAILED: Could not load engine for "
                    + key.modelPath + " | GPU[" + std::to_string(options.deviceIndex)
                    + "] VRAM: " + std::to_string(freeMem >> 20) + " MiB free / "
                    + std::to_string(totalMem >> 20) + " MiB total"
                    + " (after 4 attempts: elastic, evict, lightweight, delayed)", true);
                return nullptr;
            }
        }

        // Success: register the new pool with an initial refcount of 1.
        PoolEntry entry;
        entry.engine = engine;
        entry.refcount = 1;
        m_pools.emplace(key, std::move(entry));

        // Start the lazy-eviction sweeper if not already running
        startSweeperIfNeeded();

        logEvent("[EnginePoolManager] CREATED: " + key.modelPath + " refs=1");
        return engine;
    }

    // ========================================================================
    // release() — decrement refcount for a shared pool.
    //
    // When refcount reaches 0, the pool is NOT immediately evicted.
    // Instead, it is marked for lazy eviction after kEvictGraceSec.
    // This handles the LabView edit cycle (destroy → recreate within
    // seconds) without rebuilding the engine from scratch.
    //
    // Unknown keys and already-zero refcounts are silently ignored.
    // ========================================================================
    void release(const PoolKey& key) {
        std::lock_guard<std::mutex> lock(m_mutex);
        auto it = m_pools.find(key);
        if (it == m_pools.end()) return;
        if (it->second.refcount <= 0) return;

        it->second.refcount--;
        logEvent("[EnginePoolManager] RELEASE: " + key.modelPath
            + " refs=" + std::to_string(it->second.refcount));

        if (it->second.refcount <= 0) {
            // Mark for lazy eviction — don't destroy yet
            it->second.evictTime = Clock::now() + std::chrono::seconds(kEvictGraceSec);
            logEvent("[EnginePoolManager] PENDING EVICT: " + key.modelPath
                + " (will evict in " + std::to_string(kEvictGraceSec) + "s if not re-acquired)");
        }
    }

    /// Clear all cached pools (call during DLL_PROCESS_DETACH).
    /// Destroys every Engine<T> regardless of refcount, then stops the sweeper.
    void clearAll() {
        {
            std::lock_guard<std::mutex> lock(m_mutex);
            logEvent("[EnginePoolManager] CLEAR ALL (" + std::to_string(m_pools.size()) + " pools)");
            m_pools.clear();
        }
        stopSweeper();
    }

    /// Number of cached pools (for diagnostics).
    size_t size() const {
        std::lock_guard<std::mutex> lock(m_mutex);
        return m_pools.size();
    }

private:
    EnginePoolManager() {
        // CRITICAL: Touch TRTEngineCache singleton to ensure it is constructed
        // BEFORE EnginePoolManager. C++ destroys function-local statics in
        // reverse construction order, so this guarantees TRTEngineCache outlives
        // EnginePoolManager. Without this, during ExitProcess the cache may be
        // destroyed first, and ~Engine calling TRTEngineCache::release() crashes
        // on a destroyed unordered_map (static destruction order fiasco).
        (void)TRTEngineCache::instance();
    }
    ~EnginePoolManager() {
        if (g_processExiting().load(std::memory_order_relaxed)) {
            // ExitProcess path: worker threads are dead, CUDA/TRT state is
            // unreliable. Don't destroy Engine objects (their destructors
            // call cudaFree, thread::join, etc. which deadlock or crash).
            // The OS reclaims all memory, VRAM, and handles at process exit.
            m_sweeperRunning.store(false);
            return;
        }
        // Normal FreeLibrary path: threads are alive, safe to clean up.
        // Explicitly clear pools before implicit member destruction.
        // This destroys Engine<T> objects (which call TRTEngineCache::release())
        // while we still hold m_mutex and can log diagnostics.
        try {
            std::lock_guard<std::mutex> lock(m_mutex);
            m_pools.clear();
        } catch (...) {}
        stopSweeper();
    }
    EnginePoolManager(const EnginePoolManager&) = delete;
    EnginePoolManager& operator=(const EnginePoolManager&) = delete;

    // Grace period before evicting a pool with refcount=0.
    // Covers LabView edit/duplicate/create cycles (destroy → recreate).
    static constexpr int kEvictGraceSec = 120; // 2 minutes

    // Sweeper interval — how often to check for expired pools.
    static constexpr int kSweeperIntervalSec = 30;

    using Clock = std::chrono::steady_clock;
    using TimePoint = std::chrono::time_point<Clock>;

    // Log to stdout/stderr only — no Windows Event Viewer.
    // Event Viewer logging is handled by logEngineEvent() in engine.h for
    // critical engine-level errors. EnginePoolManager messages are
    // informational (HIT/MISS/EVICT) and don't need Event Viewer entries.
    static void logEvent(const std::string& msg, bool isError = false) {
        if (isError)
            std::cerr << msg << std::endl;
        else
            std::cout << msg << std::endl;
    }

    // One cache entry: the shared pool, its current owner count, and the
    // deadline after which a zero-refcount pool may be destroyed.
    struct PoolEntry {
        std::shared_ptr<Engine<T>> engine;
        int refcount = 0;
        TimePoint evictTime {}; // when to evict (zero = not pending)
    };

    // ========================================================================
    // Sweeper thread — periodically checks for pools whose eviction
    // grace period has expired and removes them.
    //
    // NOTE(review): the sweeper thread is detached and captures `this`. It
    // only re-checks m_sweeperRunning AFTER each 30s sleep, so there is a
    // window at static teardown where the singleton is destroyed while the
    // thread is asleep and it then touches destroyed members (m_mutex,
    // m_pools). The ExitProcess path sidesteps this via g_processExiting(),
    // but the FreeLibrary path should be verified — consider a joinable
    // thread with an interruptible wait (like Engine's idle timer).
    // ========================================================================
    void startSweeperIfNeeded() {
        // Called under m_mutex
        if (m_sweeperRunning.load()) return;
        m_sweeperRunning.store(true);
        m_sweeperThread = std::thread([this]() {
            while (m_sweeperRunning.load()) {
                std::this_thread::sleep_for(std::chrono::seconds(kSweeperIntervalSec));
                if (!m_sweeperRunning.load()) break;
                sweepExpired();
            }
        });
        m_sweeperThread.detach();
    }

    // Request sweeper shutdown; the detached thread notices the flag after
    // its current sleep expires (no join — see NOTE above).
    void stopSweeper() {
        m_sweeperRunning.store(false);
    }

    // Force-evict ALL pools with refcount=0 (regardless of grace period).
    // Called when a new pool creation fails due to low VRAM.
    // Returns number of pools evicted.
    // MUST be called under m_mutex.
    int forceEvictPending() {
        int evicted = 0;
        for (auto it = m_pools.begin(); it != m_pools.end(); ) {
            if (it->second.refcount <= 0) {
                logEvent("[EnginePoolManager] FORCE EVICT (VRAM recovery): " + it->first.modelPath);
                it = m_pools.erase(it);
                evicted++;
            } else {
                ++it;
            }
        }
        return evicted;
    }

    // Periodic sweep: destroy pools whose refcount is 0 and whose eviction
    // deadline has passed. Called from the sweeper thread.
    void sweepExpired() {
        std::lock_guard<std::mutex> lock(m_mutex);
        auto now = Clock::now();
        for (auto it = m_pools.begin(); it != m_pools.end(); ) {
            auto& entry = it->second;
            // Only evict if refcount is 0 AND evictTime has passed
            if (entry.refcount <= 0
                && entry.evictTime != TimePoint{}
                && now >= entry.evictTime)
            {
                logEvent("[EnginePoolManager] EVICT (expired): " + it->first.modelPath);
                it = m_pools.erase(it);
            } else {
                ++it;
            }
        }
    }

    std::unordered_map<PoolKey, PoolEntry, PoolKeyHash> m_pools; // keyed cache, guarded by m_mutex
    mutable std::mutex m_mutex;                                  // protects m_pools (mutable for size() const)
    std::atomic<bool> m_sweeperRunning{false};                   // sweeper lifetime flag
    std::thread m_sweeperThread;                                 // detached immediately after start
};
|
||||
719
engines/TensorRTAPI/include/engine/EngineRunInference.inl
Normal file
719
engines/TensorRTAPI/include/engine/EngineRunInference.inl
Normal file
@@ -0,0 +1,719 @@
|
||||
#pragma once
|
||||
#include <cstring>
|
||||
#include <filesystem>
|
||||
#include "TRTCompat.h"
|
||||
|
||||
// Per-device mutex for CUDA graph capture.
// TRT's enqueueV3 uses shared internal resources (workspace, memory pools)
// at the CUDA context level. When two Engine instances on the same GPU
// capture graphs concurrently, these cross-stream dependencies violate
// graph capture rules ("operation not permitted when stream is capturing").
// This mutex serialises graph captures across all Engine<T> instances on
// the same device — subsequent cudaGraphLaunch calls are still concurrent.
//
// Fixes vs. the previous version:
//  * `inline` instead of `static`: with internal linkage every translation
//    unit including this header got its OWN mutex, so captures issued from
//    different TUs were never actually serialised. `inline` guarantees one
//    definition (and one mutex table) process-wide.
//  * One mutex per device index, matching the "same device" contract above.
//    The old single mutex serialised captures across ALL GPUs. Existing
//    callers that pass no argument keep the previous behaviour (they all
//    share slot 0); callers may pass the device index to serialise only
//    captures on that GPU.
inline std::mutex& graphCaptureMutex(int deviceIndex = 0) {
    static constexpr int kMaxDevices = 64; // generous upper bound on CUDA device count
    static std::mutex s_mutexes[kMaxDevices];
    // Clamp out-of-range indices to slot 0 instead of indexing out of bounds.
    if (deviceIndex < 0 || deviceIndex >= kMaxDevices) deviceIndex = 0;
    return s_mutexes[deviceIndex];
}
|
||||
|
||||
// ----------------------------------------------------------------------------
// warmUp -- pre-compile kernels and pre-capture CUDA graphs
//
// Runs dummy inferences through runInference() so the expensive first-call
// work (TRT kernel compilation, CUDA graph capture) happens up front instead
// of on the first real frame.
//
//   iterations : iteration count used for the optimal batch size
//                (m_options.optBatchSize). Every other batch size runs only
//                2 iterations -- one to compile kernels, one to capture the
//                CUDA graph.
//
// Dynamic-batch engines (maxBatchSize > 1) are warmed for every batch size
// in [1, maxBatchSize]; fixed-batch engines only for their single size.
// A failed iteration is logged (verbose mode) and the next batch size is
// attempted -- warmUp never throws or aborts early.
// ----------------------------------------------------------------------------
template <typename T>
void Engine<T>::warmUp(int iterations) {
    if (m_verbose) {
        std::cout << "\n========================================" << std::endl;
        std::cout << "Engine Warmup" << std::endl;
        std::cout << "========================================" << std::endl;
    }

    // Determine batch sizes to warm up
    std::vector<int> batchSizes;

    if (m_options.maxBatchSize > 1) {
        if (m_verbose) {
            std::cout << "Dynamic batch engine detected (max batch: " << m_options.maxBatchSize << ")" << std::endl;
            std::cout << "Warming up common batch sizes to pre-compile kernels..." << std::endl;
        }

        // Warm up ALL batch sizes from 1 to maxBatchSize.
        // Each unseen batch size incurs a 100-300ms kernel compilation penalty
        // on first use. Warming all sizes eliminates that latency at inference
        // time and ensures every CUDA graph is pre-captured.
        for (int batch = 1; batch <= m_options.maxBatchSize; ++batch) {
            batchSizes.push_back(batch);
        }
    }
    else {
        if (m_verbose) std::cout << "Fixed batch engine detected (batch size: " << m_options.maxBatchSize << ")" << std::endl;
        batchSizes.push_back(m_options.maxBatchSize);
    }

    if (m_verbose) {
        std::cout << "Batch sizes to warm up: ";
        for (size_t i = 0; i < batchSizes.size(); ++i) {
            std::cout << batchSizes[i];
            if (i < batchSizes.size() - 1) std::cout << ", ";
        }
        std::cout << std::endl;
    }

    // Warm up each batch size.
    // The first call triggers kernel compilation; the second captures the CUDA
    // graph. Additional iterations only measure steady-state latency for the
    // optBatchSize (printed as a diagnostic).
    for (int batchSize : batchSizes) {
        const int iters = (batchSize == m_options.optBatchSize) ? iterations : 2;
        if (m_verbose) std::cout << "\nWarming up batch=" << batchSize << " (x" << iters << " iterations)..." << std::endl;

        // Create dummy inputs for this batch size
        std::vector<std::vector<cv::cuda::GpuMat>> dummyInputs;

        for (size_t i = 0; i < m_inputDims.size(); ++i) {
            const auto& dims = m_inputDims[i];
            std::vector<cv::cuda::GpuMat> batch;

            // FIXED: Create proper dummy images on GPU
            // For dynamic spatial dims, use opt dimensions for warmup
            // NOTE(review): dims.d[0] is used as the channel count and
            // d[1]/d[2] as H/W below, i.e. this assumes 3-dim CHW input dims
            // (no batch dim) -- confirm against the m_inputDims producer.
            int warmH = (dims.d[1] > 0) ? dims.d[1] : m_options.optInputHeight;
            int warmW = (dims.d[2] > 0) ? dims.d[2] : m_options.optInputWidth;
            for (int b = 0; b < batchSize; ++b) {
                // Create on CPU first
                cv::Mat cpuImg(warmH, warmW, CV_32FC(dims.d[0]), cv::Scalar(0.5f, 0.5f, 0.5f));

                // Upload to GPU
                cv::cuda::GpuMat gpuImg;
                gpuImg.upload(cpuImg);

                batch.push_back(gpuImg);
            }

            dummyInputs.push_back(batch);
        }

        std::vector<std::vector<std::vector<T>>> dummyOutputs;

        // Time the first iteration (kernel compilation happens here)
        auto start = std::chrono::high_resolution_clock::now();

        bool firstSuccess = runInference(dummyInputs, dummyOutputs);

        auto end = std::chrono::high_resolution_clock::now();
        auto firstTime = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();

        if (!firstSuccess) {
            if (m_verbose) std::cout << " ✗ ERROR: First iteration failed for batch=" << batchSize << std::endl;
            continue;
        }

        if (m_verbose) {
            std::cout << " First iteration: " << firstTime << " ms";
            if (firstTime > 100) {
                std::cout << " (kernel compilation detected)";
            }
            std::cout << std::endl;
        }

        // Run remaining iterations to measure stable performance
        if (iters > 1) {
            auto iterStart = std::chrono::high_resolution_clock::now();

            for (int i = 1; i < iters; ++i) {
                bool success = runInference(dummyInputs, dummyOutputs);
                if (!success) {
                    if (m_verbose) std::cout << " ✗ ERROR: Iteration " << i << " failed" << std::endl;
                    break;
                }
            }

            auto iterEnd = std::chrono::high_resolution_clock::now();
            auto totalTime = std::chrono::duration_cast<std::chrono::milliseconds>(iterEnd - iterStart).count();
            // The timed loop runs iterations 1..iters-1 (iters-1 passes), so
            // dividing by (iters - 1) excludes the first (compile) pass.
            float avgTime = totalTime / static_cast<float>(iters - 1);

            if (m_verbose) {
                std::cout << " Subsequent iterations (avg): " << std::fixed << std::setprecision(1)
                          << avgTime << " ms" << std::endl;

                if (firstTime > 100 && avgTime < firstTime * 0.5f) {
                    float speedup = firstTime / avgTime;
                    std::cout << " ✓ Speedup after warmup: " << std::fixed << std::setprecision(1)
                              << speedup << "x faster" << std::endl;
                }
            }
        }

        if (m_verbose) std::cout << " ✓ Batch=" << batchSize << " warmed up successfully" << std::endl;
    }

    if (m_verbose) {
        std::cout << "\n========================================" << std::endl;
        std::cout << "Warmup Complete!" << std::endl;
        std::cout << "========================================" << std::endl;
        std::cout << "Kernels pre-compiled for all batch sizes." << std::endl;
        std::cout << "========================================\n" << std::endl;
    }
}
|
||||
template <typename T>
|
||||
bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& inputs,std::vector<std::vector<std::vector<T>>>& featureVectors) {
|
||||
|
||||
// ============================================================================
|
||||
// MULTI-GPU POOL DISPATCH
|
||||
// ============================================================================
|
||||
// If this Engine was initialised with initializePool() / initializePoolFromEngine()
|
||||
// the m_slots vector is non-empty. In that case, delegate to the pool
|
||||
// dispatcher which acquires the first idle slot and runs inference there.
|
||||
// This branch is NEVER taken for single-GPU use (buildLoadNetwork / loadNetwork).
|
||||
if (!m_slots.empty()) {
|
||||
return runInferenceFromPool(inputs, featureVectors);
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// SINGLE-ENGINE SERIALISATION
|
||||
// ============================================================================
|
||||
// The single Engine instance has shared mutable state (m_buffers, m_lastBatchSize,
|
||||
// m_inferenceStream, TRT execution context). If two LabVIEW threads call
|
||||
// runInference concurrently with different batch sizes, one will overwrite
|
||||
// the input shapes and buffers while the other is mid-inference, causing a
|
||||
// fatal "illegal memory access" that permanently corrupts the CUDA context.
|
||||
//
|
||||
// Pool-mode slots have their own busy-flag dispatch so they do NOT need this.
|
||||
std::lock_guard<std::mutex> inferenceLock(m_inferenceMutex);
|
||||
|
||||
// ============================================================================
|
||||
// THREAD-SAFE GPU CONTEXT
|
||||
// ============================================================================
|
||||
// Ensure the calling thread's CUDA device matches this engine's GPU.
|
||||
// This is essential for multi-GPU round-robin: LabVIEW reuses threads
|
||||
// across tasks, so a thread that last ran inference on GPU 1 might now
|
||||
// be running a task on GPU 0. Without this, cv::cuda::GpuMat allocations
|
||||
// and kernel launches would target the wrong GPU, causing result corruption.
|
||||
// Skip cudaSetDevice if already on the correct device — under WDDM
|
||||
// with multiple GPUs each call costs 1-5ms of scheduler overhead.
|
||||
{
|
||||
int currentDev = -1;
|
||||
cudaGetDevice(¤tDev);
|
||||
if (currentDev != m_options.deviceIndex) {
|
||||
cudaSetDevice(m_options.deviceIndex);
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// DEBUG: First call diagnostics (per-instance, not process-wide)
|
||||
// ============================================================================
|
||||
|
||||
if (m_verbose && m_firstInferenceCall) {
|
||||
std::cout << "\n=== First runInference Call ===" << std::endl;
|
||||
std::cout << "Number of input tensors: " << inputs.size() << std::endl;
|
||||
for (size_t i = 0; i < inputs.size(); ++i) {
|
||||
std::cout << "Input " << i << " batch size: " << inputs[i].size() << std::endl;
|
||||
if (!inputs[i].empty()) {
|
||||
const auto& img = inputs[i][0];
|
||||
std::cout << " Image shape: " << img.cols << "x" << img.rows
|
||||
<< "x" << img.channels() << " (type: " << img.type() << ")" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
// Print optimization profile information
|
||||
std::cout << "\n=== Engine Profile Information ===" << std::endl;
|
||||
std::cout << "Number of optimization profiles: "
|
||||
<< m_engine->getNbOptimizationProfiles() << std::endl;
|
||||
|
||||
if (m_engine->getNbOptimizationProfiles() > 0) {
|
||||
for (int profile = 0; profile < m_engine->getNbOptimizationProfiles(); ++profile) {
|
||||
std::cout << "\n--- Profile " << profile << " ---" << std::endl;
|
||||
|
||||
for (size_t i = 0; i < m_IOTensorNames.size(); ++i) {
|
||||
const char* tensorName = m_IOTensorNames[i].c_str();
|
||||
|
||||
// Check if this is an input tensor
|
||||
auto ioMode = m_engine->getTensorIOMode(tensorName);
|
||||
if (ioMode != nvinfer1::TensorIOMode::kINPUT) {
|
||||
continue;
|
||||
}
|
||||
|
||||
auto minDims = m_engine->getProfileShape(tensorName, profile,
|
||||
nvinfer1::OptProfileSelector::kMIN);
|
||||
auto optDims = m_engine->getProfileShape(tensorName, profile,
|
||||
nvinfer1::OptProfileSelector::kOPT);
|
||||
auto maxDims = m_engine->getProfileShape(tensorName, profile,
|
||||
nvinfer1::OptProfileSelector::kMAX);
|
||||
|
||||
std::cout << "Tensor '" << tensorName << "' (INPUT):" << std::endl;
|
||||
std::cout << " Min: [" << minDims.d[0];
|
||||
for (int j = 1; j < minDims.nbDims; ++j) std::cout << "," << minDims.d[j];
|
||||
std::cout << "]" << std::endl;
|
||||
|
||||
std::cout << " Opt: [" << optDims.d[0];
|
||||
for (int j = 1; j < optDims.nbDims; ++j) std::cout << "," << optDims.d[j];
|
||||
std::cout << "]" << std::endl;
|
||||
|
||||
std::cout << " Max: [" << maxDims.d[0];
|
||||
for (int j = 1; j < maxDims.nbDims; ++j) std::cout << "," << maxDims.d[j];
|
||||
std::cout << "]" << std::endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!m_context->allInputDimensionsSpecified()) {
|
||||
std::cout << "ERROR: Input dimensions not specified in context!" << std::endl;
|
||||
return false;
|
||||
}
|
||||
|
||||
std::cout << "\nContext state: All dimensions specified ✓" << std::endl;
|
||||
m_firstInferenceCall = false;
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// INPUT VALIDATION
|
||||
// ============================================================================
|
||||
|
||||
if (inputs.empty() || inputs[0].empty()) {
|
||||
std::cout << "Error: Empty input" << std::endl;
|
||||
return false;
|
||||
}
|
||||
|
||||
const auto numInputs = m_inputDims.size();
|
||||
if (inputs.size() != numInputs) {
|
||||
std::cout << "Error: Wrong number of inputs. Expected: " << numInputs
|
||||
<< ", Got: " << inputs.size() << std::endl;
|
||||
return false;
|
||||
}
|
||||
|
||||
const auto batchSize = static_cast<int32_t>(inputs[0].size());
|
||||
|
||||
if (batchSize > m_options.maxBatchSize) {
|
||||
std::cout << "Error: Batch size " << batchSize << " exceeds maximum "
|
||||
<< m_options.maxBatchSize << std::endl;
|
||||
return false;
|
||||
}
|
||||
|
||||
if (batchSize < 1) {
|
||||
std::cout << "Error: Batch size must be at least 1" << std::endl;
|
||||
return false;
|
||||
}
|
||||
|
||||
// Validate batch size consistency across all inputs
|
||||
for (size_t i = 1; i < inputs.size(); ++i) {
|
||||
if (inputs[i].size() != static_cast<size_t>(batchSize)) {
|
||||
std::cout << "Error: Inconsistent batch sizes across inputs. Input 0: "
|
||||
<< batchSize << ", Input " << i << ": " << inputs[i].size() << std::endl;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// STREAM GUARD
|
||||
// ============================================================================
|
||||
// m_inferenceStream is now created eagerly in loadNetwork() so it is always
|
||||
// valid here. Guard against the (unlikely) edge case where runInference is
|
||||
// called before loadNetwork succeeds.
|
||||
if (!m_streamInitialized || !m_inferenceStream) {
|
||||
std::string errMsg = "Error: Inference stream not initialised. "
|
||||
"Call loadNetwork() / buildLoadNetwork() before runInference().";
|
||||
std::cout << errMsg << std::endl;
|
||||
logEngineEvent("[Engine] runInference: " + errMsg, true);
|
||||
return false;
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// SET INPUT SHAPES (batch size changed OR dynamic spatial dims need updating)
|
||||
// ============================================================================
|
||||
// Fast path: compute desired dims first, then compare against cached dims.
|
||||
// This avoids all TRT API calls when the shape hasn't actually changed —
|
||||
// critical for the recognizer which is called ~50-100x per image with
|
||||
// dynamic width but often the same or similar widths.
|
||||
// ============================================================================
|
||||
|
||||
{
|
||||
// Lazily initialise the dims cache on first call
|
||||
if (m_lastSetInputDims.empty()) {
|
||||
m_lastSetInputDims.resize(numInputs);
|
||||
for (size_t i = 0; i < numInputs; ++i) {
|
||||
m_lastSetInputDims[i].nbDims = 0; // force mismatch on first call
|
||||
}
|
||||
}
|
||||
|
||||
// Build desired dims for every input tensor (cheap — no TRT API calls)
|
||||
bool anyDimChanged = (m_lastBatchSize != batchSize);
|
||||
std::vector<nvinfer1::Dims> desiredDims(numInputs);
|
||||
for (size_t i = 0; i < numInputs; ++i) {
|
||||
nvinfer1::Dims& nd = desiredDims[i];
|
||||
nd.nbDims = 4;
|
||||
nd.d[0] = batchSize;
|
||||
nd.d[1] = m_inputDims[i].d[0]; // channels
|
||||
if (m_hasDynamicSpatialDims && !inputs[i].empty()) {
|
||||
const auto& firstImg = inputs[i][0];
|
||||
nd.d[2] = (m_inputDims[i].d[1] == -1) ? firstImg.rows : m_inputDims[i].d[1];
|
||||
nd.d[3] = (m_inputDims[i].d[2] == -1) ? firstImg.cols : m_inputDims[i].d[2];
|
||||
} else {
|
||||
nd.d[2] = m_inputDims[i].d[1];
|
||||
nd.d[3] = m_inputDims[i].d[2];
|
||||
}
|
||||
// Compare with cached
|
||||
if (!anyDimChanged) {
|
||||
const auto& cached = m_lastSetInputDims[i];
|
||||
if (cached.nbDims != nd.nbDims ||
|
||||
cached.d[0] != nd.d[0] || cached.d[1] != nd.d[1] ||
|
||||
cached.d[2] != nd.d[2] || cached.d[3] != nd.d[3]) {
|
||||
anyDimChanged = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (anyDimChanged) {
|
||||
// === First-time diagnostics (verbose, once) ===
|
||||
const bool firstTime = !m_batchShapeChangeLogged;
|
||||
|
||||
if (m_verbose && firstTime) {
|
||||
std::cout << "\nInfo: Batch size change: " << m_lastBatchSize
|
||||
<< " -> " << batchSize << std::endl;
|
||||
}
|
||||
|
||||
// Set optimization profile (only when truly needed)
|
||||
if (m_engine->getNbOptimizationProfiles() > 0) {
|
||||
int currentProfile = m_context->getOptimizationProfile();
|
||||
if (currentProfile != 0 || m_lastBatchSize < 0) {
|
||||
if (m_verbose && firstTime) {
|
||||
std::cout << " Setting optimization profile to 0..." << std::endl;
|
||||
}
|
||||
if (!m_context->setOptimizationProfileAsync(0, m_inferenceStream)) {
|
||||
std::cout << "Error: Failed to set optimization profile 0" << std::endl;
|
||||
return false;
|
||||
}
|
||||
cudaError_t syncErr = cudaStreamSynchronize(m_inferenceStream);
|
||||
if (syncErr != cudaSuccess) {
|
||||
std::cout << "Error: Failed to sync after profile change: "
|
||||
<< cudaGetErrorString(syncErr) << std::endl;
|
||||
return false;
|
||||
}
|
||||
if (m_verbose && firstTime) {
|
||||
std::cout << " Optimization profile set successfully" << std::endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Update shapes for input tensors that actually changed
|
||||
for (size_t i = 0; i < numInputs; ++i) {
|
||||
const char* tensorName = m_IOTensorNames[i].c_str();
|
||||
|
||||
// Skip non-input tensors
|
||||
auto ioMode = m_engine->getTensorIOMode(tensorName);
|
||||
if (ioMode != nvinfer1::TensorIOMode::kINPUT) continue;
|
||||
|
||||
const nvinfer1::Dims& newDims = desiredDims[i];
|
||||
const nvinfer1::Dims& cached = m_lastSetInputDims[i];
|
||||
|
||||
// Skip this tensor if its dims haven't changed
|
||||
if (cached.nbDims == newDims.nbDims &&
|
||||
cached.d[0] == newDims.d[0] && cached.d[1] == newDims.d[1] &&
|
||||
cached.d[2] == newDims.d[2] && cached.d[3] == newDims.d[3]) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// First-time verbose diagnostics
|
||||
if (m_verbose && firstTime) {
|
||||
std::cout << "\n Processing tensor " << i << ": '" << tensorName << "'" << std::endl;
|
||||
// Validate batch size range (first time only)
|
||||
if (m_engine->getNbOptimizationProfiles() > 0) {
|
||||
int profileIndex = m_context->getOptimizationProfile();
|
||||
nvinfer1::Dims minDims = m_engine->getProfileShape(
|
||||
tensorName, profileIndex, nvinfer1::OptProfileSelector::kMIN);
|
||||
nvinfer1::Dims maxDims = m_engine->getProfileShape(
|
||||
tensorName, profileIndex, nvinfer1::OptProfileSelector::kMAX);
|
||||
std::cout << " Profile batch range: [" << minDims.d[0]
|
||||
<< " to " << maxDims.d[0] << "]" << std::endl;
|
||||
if (batchSize < minDims.d[0] || batchSize > maxDims.d[0]) {
|
||||
std::cout << "Error: Batch size " << batchSize
|
||||
<< " outside profile range" << std::endl;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
auto currentShape = m_context->getTensorShape(tensorName);
|
||||
std::cout << " Current context shape: [";
|
||||
for (int j = 0; j < currentShape.nbDims; ++j) {
|
||||
if (j > 0) std::cout << ", ";
|
||||
std::cout << currentShape.d[j];
|
||||
}
|
||||
std::cout << "]" << std::endl;
|
||||
std::cout << " Setting new shape: [" << newDims.d[0] << ", "
|
||||
<< newDims.d[1] << ", " << newDims.d[2] << ", "
|
||||
<< newDims.d[3] << "]" << std::endl;
|
||||
}
|
||||
|
||||
if (!m_context->setInputShape(tensorName, newDims)) {
|
||||
std::cout << "Error: Failed to set input shape for '" << tensorName << "'" << std::endl;
|
||||
return false;
|
||||
}
|
||||
|
||||
// Verify shape (first time only — trust the API on hot path)
|
||||
if (firstTime) {
|
||||
auto verifyShape = m_context->getTensorShape(tensorName);
|
||||
if (verifyShape.d[0] != batchSize) {
|
||||
std::cout << "Error: Shape change didn't take effect. Expected batch "
|
||||
<< batchSize << ", got " << verifyShape.d[0] << std::endl;
|
||||
return false;
|
||||
}
|
||||
if (m_verbose) {
|
||||
std::cout << " Shape updated successfully" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
m_lastSetInputDims[i] = newDims;
|
||||
}
|
||||
|
||||
// Verify all input dimensions specified (first time only)
|
||||
if (firstTime) {
|
||||
if (!m_context->allInputDimensionsSpecified()) {
|
||||
std::cout << "Error: Not all input dimensions specified after shape change" << std::endl;
|
||||
for (size_t i = 0; i < m_IOTensorNames.size(); ++i) {
|
||||
auto shape = m_context->getTensorShape(m_IOTensorNames[i].c_str());
|
||||
std::cout << " " << m_IOTensorNames[i] << ": [";
|
||||
for (int j = 0; j < shape.nbDims; ++j) {
|
||||
if (j > 0) std::cout << ", ";
|
||||
std::cout << shape.d[j];
|
||||
}
|
||||
std::cout << "]" << std::endl;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
m_lastBatchSize = batchSize;
|
||||
m_batchShapeChangeLogged = true;
|
||||
if (m_verbose && firstTime) {
|
||||
std::cout << "\nInfo: Input shapes updated successfully for batch size "
|
||||
<< batchSize << " ✓\n" << std::endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// PREPROCESS AND COPY INPUTS TO GPU BUFFERS
|
||||
// ============================================================================
|
||||
|
||||
// Pass 1: Validate all input dimensions before any GPU work.
|
||||
// Dynamic dims (-1) are skipped in validation (they accept any size).
|
||||
for (size_t i = 0; i < numInputs; ++i) {
|
||||
const auto& batchInput = inputs[i];
|
||||
const auto& dims = m_inputDims[i];
|
||||
if (!batchInput.empty()) {
|
||||
const auto& firstImg = batchInput[0];
|
||||
bool mismatch = false;
|
||||
if (dims.d[0] > 0 && firstImg.channels() != dims.d[0]) mismatch = true;
|
||||
if (dims.d[1] > 0 && firstImg.rows != dims.d[1]) mismatch = true;
|
||||
if (dims.d[2] > 0 && firstImg.cols != dims.d[2]) mismatch = true;
|
||||
if (mismatch) {
|
||||
std::cout << "Error: Input " << i << " dimension mismatch!" << std::endl;
|
||||
std::cout << " Expected: " << dims.d[2] << "x" << dims.d[1]
|
||||
<< "x" << dims.d[0] << " (WxHxC, -1=dynamic)" << std::endl;
|
||||
std::cout << " Got: " << firstImg.cols << "x" << firstImg.rows
|
||||
<< "x" << firstImg.channels() << " (WxHxC)" << std::endl;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Pass 2: Preprocess + D2D copies — all on m_inferenceStream (no null stream).
|
||||
//
|
||||
// All OpenCV CUDA ops (convertTo, subtract, divide, split) in blobFromGpuMats
|
||||
// now run on m_inferenceStream via the cv::cuda::Stream wrapper. This means:
|
||||
// • No null-stream interaction — eliminates global sync barriers on WDDM
|
||||
// • No event bridge needed — same-stream ordering guarantees correctness
|
||||
// • CUDA graphs are safe — cv::cuda::split runs BEFORE graph capture
|
||||
//
|
||||
// GpuMat-lifetime: preprocessedBuffers keeps GpuMats alive past the final
|
||||
// cudaStreamSynchronize, so cudaFree() doesn't stall the pipeline.
|
||||
cv::cuda::Stream cvInferStream = cv::cuda::StreamAccessor::wrapStream(m_inferenceStream);
|
||||
std::vector<cv::cuda::GpuMat> preprocessedBuffers;
|
||||
preprocessedBuffers.reserve(numInputs);
|
||||
|
||||
for (size_t i = 0; i < numInputs; ++i) {
|
||||
const auto& batchInput = inputs[i];
|
||||
|
||||
// Preprocess on m_inferenceStream (not the null stream).
|
||||
preprocessedBuffers.push_back(
|
||||
blobFromGpuMats(batchInput, m_subVals, m_divVals, m_normalize, false, cvInferStream));
|
||||
|
||||
// D2D copy: same stream as preprocessing, so ordering is guaranteed.
|
||||
const auto& blobMat = preprocessedBuffers.back();
|
||||
const size_t copySize = static_cast<size_t>(blobMat.rows) * static_cast<size_t>(blobMat.cols) * blobMat.elemSize();
|
||||
cudaError_t copyErr = cudaMemcpyAsync(
|
||||
m_buffers[i],
|
||||
preprocessedBuffers.back().ptr<void>(),
|
||||
copySize,
|
||||
cudaMemcpyDeviceToDevice,
|
||||
m_inferenceStream);
|
||||
|
||||
if (copyErr != cudaSuccess) {
|
||||
std::cout << "Error: Failed to copy input " << i
|
||||
<< " to inference buffer: " << cudaGetErrorString(copyErr) << std::endl;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// PRE-ALLOCATE OUTPUT STRUCTURE
|
||||
// ============================================================================
|
||||
|
||||
const size_t numOutputs = m_outputLengths.size();
|
||||
|
||||
featureVectors.clear();
|
||||
featureVectors.resize(batchSize);
|
||||
for (int batch = 0; batch < batchSize; ++batch) {
|
||||
featureVectors[batch].resize(numOutputs);
|
||||
for (size_t outputIdx = 0; outputIdx < numOutputs; ++outputIdx)
|
||||
featureVectors[batch][outputIdx].resize(m_outputLengths[outputIdx]);
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// RUN INFERENCE + COPY OUTPUTS (CUDA Graph path or direct path)
|
||||
// ============================================================================
|
||||
|
||||
// CUDA Graph path
|
||||
// ---------------
|
||||
// On the first call for a given batchSize we capture enqueueV3 + D2H copies
|
||||
// into a reusable graph. Subsequent calls use cudaGraphLaunch, replacing
|
||||
// many individual kernel-submission API calls with a single launch.
|
||||
//
|
||||
// Prerequisites satisfied here:
|
||||
// • Preprocessing + D2D copies are queued on m_inferenceStream (same-stream
|
||||
// ordering guarantees they complete before captured kernels execute)
|
||||
// • m_pinnedOutputBuffers has stable addresses (allocated in loadNetwork)
|
||||
// • m_buffers (GPU outputs) have stable addresses (allocated in loadNetwork)
|
||||
//
|
||||
// Falls back to the direct path if pinned buffers are unavailable or if
|
||||
// graph capture/instantiation fails for any reason.
|
||||
|
||||
// CUDA graphs capture fixed kernel sequences; incompatible with dynamic spatial dims
|
||||
// (input H/W change per inference call → different TRT kernel plans each time).
|
||||
// Disabled for pool slots — concurrent graph captures on the same GPU corrupt the
|
||||
// CUDA context ("operation not permitted when stream is capturing").
|
||||
const bool canGraph = !m_disableGraphs && !m_pinnedOutputBuffers.empty() && !m_hasDynamicSpatialDims;
|
||||
bool graphUsed = false;
|
||||
|
||||
if (canGraph) {
|
||||
auto& graphExec = m_graphExecs[batchSize]; // inserts nullptr on first access
|
||||
if (!graphExec) {
|
||||
// First call for this batchSize -- capture a new graph.
|
||||
// Serialise captures across all Engine instances on this device to
|
||||
// prevent TRT's shared workspace from creating cross-stream
|
||||
// dependencies that violate CUDA graph capture rules.
|
||||
std::lock_guard<std::mutex> captureLock(graphCaptureMutex());
|
||||
|
||||
// Clear any sticky CUDA error from a prior failed capture so that
|
||||
// this attempt starts clean.
|
||||
cudaGetLastError();
|
||||
|
||||
cudaGraph_t graph = nullptr;
|
||||
bool captureOk = false;
|
||||
|
||||
if (cudaStreamBeginCapture(m_inferenceStream,
|
||||
cudaStreamCaptureModeRelaxed) == cudaSuccess) {
|
||||
// Record TRT kernels into the graph (not executed yet).
|
||||
TRT_ENQUEUE(m_context.get(), m_inferenceStream, m_buffers);
|
||||
|
||||
// Record D2H copies to stable pinned addresses.
|
||||
for (size_t outputIdx = 0; outputIdx < numOutputs; ++outputIdx) {
|
||||
cudaMemcpyAsync(
|
||||
m_pinnedOutputBuffers[outputIdx],
|
||||
static_cast<char*>(m_buffers[numInputs + outputIdx]),
|
||||
static_cast<size_t>(batchSize) * m_outputLengths[outputIdx] * sizeof(T),
|
||||
cudaMemcpyDeviceToHost,
|
||||
m_inferenceStream);
|
||||
}
|
||||
|
||||
captureOk = (cudaStreamEndCapture(m_inferenceStream, &graph) == cudaSuccess
|
||||
&& graph != nullptr);
|
||||
}
|
||||
|
||||
if (captureOk) {
|
||||
cudaGraphExec_t exec = nullptr;
|
||||
if (cudaGraphInstantiate(&exec, graph, nullptr, nullptr, 0) == cudaSuccess)
|
||||
graphExec = exec;
|
||||
cudaGraphDestroy(graph);
|
||||
}
|
||||
|
||||
if (!graphExec) {
|
||||
std::cout << "Warning: CUDA graph capture failed for batchSize="
|
||||
<< batchSize << " -- falling back to direct inference path." << std::endl;
|
||||
// Disable graph acceleration for this Engine instance.
|
||||
for (T* p : m_pinnedOutputBuffers) { if (p) cudaFreeHost(p); }
|
||||
m_pinnedOutputBuffers.clear();
|
||||
m_graphExecs.erase(batchSize);
|
||||
}
|
||||
}
|
||||
|
||||
if (graphExec) {
|
||||
// Launch the pre-captured graph (single API call replaces many).
|
||||
cudaGraphLaunch(graphExec, m_inferenceStream);
|
||||
cudaStreamSynchronize(m_inferenceStream);
|
||||
|
||||
// CPU memcpy: pinned buffers -> featureVectors (interleaved by batch).
|
||||
for (int batch = 0; batch < batchSize; ++batch) {
|
||||
for (size_t outputIdx = 0; outputIdx < numOutputs; ++outputIdx) {
|
||||
std::memcpy(
|
||||
featureVectors[batch][outputIdx].data(),
|
||||
m_pinnedOutputBuffers[outputIdx]
|
||||
+ static_cast<size_t>(batch) * m_outputLengths[outputIdx],
|
||||
m_outputLengths[outputIdx] * sizeof(T));
|
||||
}
|
||||
}
|
||||
graphUsed = true;
|
||||
}
|
||||
}
|
||||
|
||||
// Direct path (no graph)
|
||||
// ----------------------
|
||||
// Used when pinned buffers are unavailable or graph capture failed.
|
||||
if (!graphUsed) {
|
||||
bool success = TRT_ENQUEUE(m_context.get(), m_inferenceStream, m_buffers);
|
||||
if (!success) {
|
||||
std::string debugInfo = "[Engine] runInference FAIL: enqueue returned false, batch="
|
||||
+ std::to_string(batchSize)
|
||||
+ ", dimsSpecified=" + (m_context->allInputDimensionsSpecified() ? "YES" : "NO");
|
||||
for (size_t i = 0; i < m_IOTensorNames.size(); ++i) {
|
||||
auto shape = m_context->getTensorShape(m_IOTensorNames[i].c_str());
|
||||
debugInfo += ", tensor'" + m_IOTensorNames[i] + "'=[";
|
||||
for (int j = 0; j < shape.nbDims; ++j) {
|
||||
if (j > 0) debugInfo += ",";
|
||||
debugInfo += std::to_string(shape.d[j]);
|
||||
}
|
||||
debugInfo += "]";
|
||||
}
|
||||
std::cout << debugInfo << std::endl;
|
||||
logEngineEvent(debugInfo, true);
|
||||
return false;
|
||||
}
|
||||
|
||||
for (int batch = 0; batch < batchSize; ++batch) {
|
||||
for (size_t outputIdx = 0; outputIdx < numOutputs; ++outputIdx) {
|
||||
const size_t outputBinding = numInputs + outputIdx;
|
||||
const size_t offset =
|
||||
static_cast<size_t>(batch) * m_outputLengths[outputIdx] * sizeof(T);
|
||||
|
||||
cudaError_t copyErr = cudaMemcpyAsync(
|
||||
featureVectors[batch][outputIdx].data(),
|
||||
static_cast<char*>(m_buffers[outputBinding]) + offset,
|
||||
m_outputLengths[outputIdx] * sizeof(T),
|
||||
cudaMemcpyDeviceToHost,
|
||||
m_inferenceStream);
|
||||
|
||||
if (copyErr != cudaSuccess) {
|
||||
std::string errMsg = "[Engine] runInference FAIL: cudaMemcpyAsync output "
|
||||
+ std::to_string(outputIdx) + " batch " + std::to_string(batch)
|
||||
+ ": " + cudaGetErrorString(copyErr);
|
||||
std::cout << errMsg << std::endl;
|
||||
logEngineEvent(errMsg, true);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
cudaError_t syncErr = cudaStreamSynchronize(m_inferenceStream);
|
||||
if (syncErr != cudaSuccess) {
|
||||
std::string errMsg = "[Engine] runInference FAIL: cudaStreamSynchronize: "
|
||||
+ std::string(cudaGetErrorString(syncErr));
|
||||
std::cout << errMsg << std::endl;
|
||||
logEngineEvent(errMsg, true);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
250
engines/TensorRTAPI/include/engine/EngineUtilities.inl
Normal file
250
engines/TensorRTAPI/include/engine/EngineUtilities.inl
Normal file
@@ -0,0 +1,250 @@
|
||||
#pragma once
|
||||
#include <filesystem>
|
||||
#include <NvInfer.h> // NV_TENSORRT_MAJOR/MINOR/PATCH
|
||||
#include <NvInferVersion.h> // also defines TRT version macros
|
||||
#include <cudnn_version.h> // CUDNN_MAJOR/MINOR/PATCHLEVEL
|
||||
#include <cuda_runtime.h> // cudaRuntimeGetVersion
|
||||
|
||||
template <typename T>
|
||||
void Engine<T>::transformOutput(std::vector<std::vector<std::vector<T>>> &input, std::vector<std::vector<T>> &output) {
|
||||
if (input.size() == 1) {
|
||||
output = std::move(input[0]);
|
||||
}
|
||||
else {
|
||||
auto msg = "The feature vector has incorrect dimensions!";
|
||||
std::cout<<msg;
|
||||
}
|
||||
}
|
||||
template <typename T>
|
||||
void Engine<T>::transformOutput(std::vector<std::vector<std::vector<T>>> &input, std::vector<T> &output) {
|
||||
if (input.size() != 1 || input[0].size() != 1) {
|
||||
auto msg = "The feature vector has incorrect dimensions!";
|
||||
std::cout<<msg;
|
||||
}
|
||||
output = std::move(input[0][0]);
|
||||
}
|
||||
template <typename T>
|
||||
cv::cuda::GpuMat Engine<T>::resizeKeepAspectRatioPadRightBottom(const cv::cuda::GpuMat& input,
|
||||
size_t height, size_t width,
|
||||
const cv::Scalar& bgcolor) {
|
||||
// Ensure input is valid
|
||||
if (input.empty()) {
|
||||
return cv::cuda::GpuMat();
|
||||
}
|
||||
// Create a CUDA stream
|
||||
cv::cuda::Stream stream;
|
||||
// Calculate aspect ratio and unpadded dimensions
|
||||
float r = std::min(static_cast<float>(width) / input.cols, static_cast<float>(height) / input.rows);
|
||||
size_t unpad_w = static_cast<size_t>(r * input.cols);
|
||||
size_t unpad_h = static_cast<size_t>(r * input.rows);
|
||||
// Resize the input image
|
||||
cv::cuda::GpuMat re;
|
||||
re.create(unpad_h, unpad_w, input.type());
|
||||
cv::cuda::resize(input, re, re.size(), 0, 0, cv::INTER_LINEAR, stream);
|
||||
// Create the output image and fill with the background color
|
||||
cv::cuda::GpuMat out;
|
||||
out.create(height, width, input.type());
|
||||
out.setTo(bgcolor, stream);
|
||||
// Copy the resized content into the top-left corner of the output image
|
||||
re.copyTo(out(cv::Rect(0, 0, re.cols, re.rows)), stream);
|
||||
stream.waitForCompletion();
|
||||
return out;
|
||||
}
|
||||
|
||||
template <typename T> void Engine<T>::getDeviceNames(std::vector<std::string> &deviceNames) {
    // Appends one entry per CUDA device (in device-index order) containing the
    // driver-reported device name. Callers index this list by deviceIndex, so
    // the one-entry-per-device invariant must hold.
    //
    // BUG FIX: numGPUs was previously uninitialized and the return code of
    // cudaGetDeviceCount was ignored — on a machine without a usable CUDA
    // runtime the loop bound was garbage (UB).
    int numGPUs = 0;
    if (cudaGetDeviceCount(&numGPUs) != cudaSuccess) {
        return; // no usable CUDA runtime/driver — leave the list unchanged
    }
    for (int device = 0; device < numGPUs; device++) {
        cudaDeviceProp prop{}; // zero-init: name stays empty if the query fails
        cudaGetDeviceProperties(&prop, device);
        deviceNames.emplace_back(prop.name);
    }
}
|
||||
template <typename T> int Engine<T>::getBindingIndexByName(const std::string& name) {
    // Linear scan over the engine's IO tensors: returns the index of the
    // tensor whose name matches `name` exactly, or -1 when no tensor matches.
    const int tensorCount = m_engine->getNbIOTensors();
    for (int idx = 0; idx < tensorCount; ++idx) {
        if (name == m_engine->getIOTensorName(idx)) {
            return idx;
        }
    }
    return -1;
}
|
||||
|
||||
|
||||
//template <typename T> std::string Engine<T>::serializeEngineOptions(const ANSCENTER::Options &options, const std::string &onnxModelPath) {
|
||||
// const auto filenamePos = onnxModelPath.find_last_of('/') + 1;
|
||||
// std::string engineName = onnxModelPath.substr(filenamePos, onnxModelPath.find_last_of('.') - filenamePos) + ".engine";
|
||||
//
|
||||
// // Add the GPU device name to the file to ensure that the model is only used
|
||||
// // on devices with the exact same GPU
|
||||
// std::vector<std::string> deviceNames;
|
||||
// getDeviceNames(deviceNames);
|
||||
//
|
||||
// if (static_cast<size_t>(options.deviceIndex) >= deviceNames.size()) {
|
||||
// auto msg = "Error, provided device index is out of range!";
|
||||
// std::cout<<msg;
|
||||
// return "";
|
||||
// }
|
||||
//
|
||||
// auto deviceName = deviceNames[options.deviceIndex];
|
||||
// // Remove spaces from the device name
|
||||
// deviceName.erase(std::remove_if(deviceName.begin(), deviceName.end(), ::isspace), deviceName.end());
|
||||
// engineName += "." + deviceName;
|
||||
// // Serialize the specified options into the filename
|
||||
// if (options.precision == ANSCENTER::Precision::FP16) {
|
||||
// engineName += ".fp16";
|
||||
// } else if (options.precision == ANSCENTER::Precision::FP32) {
|
||||
// engineName += ".fp32";
|
||||
// } else {
|
||||
// engineName += ".int8";
|
||||
// }
|
||||
// if (options.maxBatchSize > 1) {
|
||||
// engineName += "." + std::to_string(options.maxBatchSize);
|
||||
// }
|
||||
// return engineName;
|
||||
//}
|
||||
|
||||
template <typename T>
|
||||
std::string Engine<T>::serializeEngineOptions(const ANSCENTER::Options& options,
|
||||
const std::string& onnxModelPath)
|
||||
{
|
||||
// -- Base name from ONNX file ---------------------------------------------
|
||||
const auto filenamePos = onnxModelPath.find_last_of('/') + 1;
|
||||
std::string engineName = onnxModelPath.substr(
|
||||
filenamePos,
|
||||
onnxModelPath.find_last_of('.') - filenamePos) + ".engine";
|
||||
|
||||
// -- GPU device name ------------------------------------------------------
|
||||
// Ensures the engine is only loaded on the exact GPU it was built for.
|
||||
std::vector<std::string> deviceNames;
|
||||
getDeviceNames(deviceNames);
|
||||
if (static_cast<size_t>(options.deviceIndex) >= deviceNames.size()) {
|
||||
std::cout << "Error, provided device index is out of range!";
|
||||
return "";
|
||||
}
|
||||
auto deviceName = deviceNames[options.deviceIndex];
|
||||
deviceName.erase(
|
||||
std::remove_if(deviceName.begin(), deviceName.end(), ::isspace),
|
||||
deviceName.end());
|
||||
engineName += "." + deviceName;
|
||||
|
||||
// -- Precision ------------------------------------------------------------
|
||||
if (options.precision == ANSCENTER::Precision::FP16) {
|
||||
engineName += ".fp16";
|
||||
}
|
||||
else if (options.precision == ANSCENTER::Precision::FP32) {
|
||||
engineName += ".fp32";
|
||||
}
|
||||
else {
|
||||
engineName += ".int8";
|
||||
}
|
||||
|
||||
// -- Batch size -----------------------------------------------------------
|
||||
if (options.maxBatchSize > 1) {
|
||||
engineName += ".b" + std::to_string(options.maxBatchSize);
|
||||
}
|
||||
|
||||
// -- Max spatial dims: intentionally NOT included in the filename ----------
|
||||
// buildWithRetry() may reduce max dims (e.g. 2560→1920) when GPU memory
|
||||
// is insufficient. If the filename included .s{H}x{W}, the next launch
|
||||
// would look for .s2560x2560, miss the cached .s1920x1920, and waste
|
||||
// minutes re-attempting the doomed 2560 build before falling back.
|
||||
// Without the suffix, the cache is found immediately on the next launch.
|
||||
// The actual profile max is queried at runtime via getProfileMaxHeight/Width.
|
||||
|
||||
// -- TensorRT version -----------------------------------------------------
|
||||
// Engine format changes between TensorRT minor versions -- must rebuild.
|
||||
// NV_TENSORRT_MAJOR, NV_TENSORRT_MINOR, NV_TENSORRT_PATCH are defined in
|
||||
// <NvInferVersion.h> which is included via NvInfer.h.
|
||||
engineName += ".trt"
|
||||
+ std::to_string(NV_TENSORRT_MAJOR) + "."
|
||||
+ std::to_string(NV_TENSORRT_MINOR) + "."
|
||||
+ std::to_string(NV_TENSORRT_PATCH);
|
||||
|
||||
// -- CUDA runtime version -------------------------------------------------
|
||||
// Engines built with different CUDA versions may use different PTX/cubin
|
||||
// formats and must be rebuilt.
|
||||
int cudaVersion = 0;
|
||||
cudaRuntimeGetVersion(&cudaVersion);
|
||||
const int cudaMajor = cudaVersion / 1000;
|
||||
const int cudaMinor = (cudaVersion % 1000) / 10;
|
||||
engineName += ".cuda"
|
||||
+ std::to_string(cudaMajor) + "."
|
||||
+ std::to_string(cudaMinor);
|
||||
|
||||
// -- cuDNN version --------------------------------------------------------
|
||||
// cuDNN version affects layer implementations inside the engine.
|
||||
// CUDNN_MAJOR, CUDNN_MINOR, CUDNN_PATCHLEVEL are defined in <cudnn_version.h>.
|
||||
engineName += ".cudnn"
|
||||
+ std::to_string(CUDNN_MAJOR) + "."
|
||||
+ std::to_string(CUDNN_MINOR);
|
||||
|
||||
return engineName;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
cv::cuda::GpuMat Engine<T>::blobFromGpuMats(const std::vector<cv::cuda::GpuMat> &batchInput, const std::array<float, 3> &subVals,
|
||||
const std::array<float, 3> &divVals, bool normalize, bool swapRB,
|
||||
cv::cuda::Stream &stream) {
|
||||
cv::cuda::GpuMat result;
|
||||
if (batchInput.empty()) return result;
|
||||
if (batchInput[0].channels() != 3) return result;
|
||||
|
||||
const int H = batchInput[0].rows;
|
||||
const int W = batchInput[0].cols;
|
||||
const int batch = static_cast<int>(batchInput.size());
|
||||
const size_t planeSize = static_cast<size_t>(H) * W; // pixels per channel
|
||||
|
||||
// Output blob: planar NCHW layout stored as a single-channel GpuMat.
|
||||
// Total elements = batch * 3 * H * W.
|
||||
cv::cuda::GpuMat blob(1, batch * 3 * static_cast<int>(planeSize), CV_32FC1);
|
||||
|
||||
for (int img = 0; img < batch; ++img) {
|
||||
// 1. Convert to float and normalise while still in HWC (interleaved) format.
|
||||
// Channel-wise subtract / divide operate correctly on interleaved data.
|
||||
cv::cuda::GpuMat floatImg;
|
||||
if (normalize) {
|
||||
batchInput[img].convertTo(floatImg, CV_32FC3, 1.f / 255.f, stream);
|
||||
} else {
|
||||
batchInput[img].convertTo(floatImg, CV_32FC3, 1.0, stream);
|
||||
}
|
||||
|
||||
cv::cuda::subtract(floatImg, cv::Scalar(subVals[0], subVals[1], subVals[2]), floatImg, cv::noArray(), -1, stream);
|
||||
cv::cuda::divide(floatImg, cv::Scalar(divVals[0], divVals[1], divVals[2]), floatImg, 1, -1, stream);
|
||||
|
||||
// 2. Split normalised HWC image into CHW planes directly into the blob.
|
||||
size_t offset = static_cast<size_t>(img) * 3 * planeSize;
|
||||
|
||||
if (swapRB) {
|
||||
// BGR input -> RGB planes: B goes to plane 2, G to plane 1, R to plane 0
|
||||
std::vector<cv::cuda::GpuMat> channels{
|
||||
cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset + 2 * planeSize), // B -> plane 2
|
||||
cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset + planeSize), // G -> plane 1
|
||||
cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset)}; // R -> plane 0
|
||||
cv::cuda::split(floatImg, channels, stream);
|
||||
} else {
|
||||
// BGR input -> BGR planes: keep channel order
|
||||
std::vector<cv::cuda::GpuMat> channels{
|
||||
cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset),
|
||||
cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset + planeSize),
|
||||
cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset + 2 * planeSize)};
|
||||
cv::cuda::split(floatImg, channels, stream);
|
||||
}
|
||||
}
|
||||
|
||||
return blob;
|
||||
}
|
||||
|
||||
template <typename T> void Engine<T>::clearGpuBuffers() {
    // Release every I/O device buffer (inputs AND outputs). An earlier
    // revision freed only the output buffers, leaking the input allocations
    // made in loadNetwork(); walking the whole vector fixes that.
    // Iterating an empty vector is a no-op, so no emptiness guard is needed.
    for (void *devPtr : m_buffers) {
        if (devPtr != nullptr) {
            Util::checkCudaErrorCode(cudaFree(devPtr));
        }
    }
    m_buffers.clear();
}
|
||||
9
engines/TensorRTAPI/include/engine/NvDynLoader.h
Normal file
9
engines/TensorRTAPI/include/engine/NvDynLoader.h
Normal file
@@ -0,0 +1,9 @@
|
||||
#pragma once
|
||||
// ============================================================================
|
||||
// Forwarding header — NvDynLoader moved to ANSLibsLoader
|
||||
//
|
||||
// This file is retained for backward compatibility. All consuming projects
|
||||
// should update their include paths to reference ANSLibsLoader/include/
|
||||
// directly. Once all projects are updated, this file can be removed.
|
||||
// ============================================================================
|
||||
#include "../../../ANSLibsLoader/include/NvDynLoader.h"
|
||||
50
engines/TensorRTAPI/include/engine/TRTCompat.h
Normal file
50
engines/TensorRTAPI/include/engine/TRTCompat.h
Normal file
@@ -0,0 +1,50 @@
|
||||
#pragma once
// ============================================================================
// TRTCompat.h -- TensorRT version compatibility macros
//
// Centralises all TRT-version-dependent API differences so that the rest of
// the codebase can be compiled against TRT 8.x or TRT 10.x without scattering
// #if blocks everywhere.
//
// Build 1: CUDA 11.8 + cuDNN 8 + TensorRT 8.6 + OpenCV 4.10 (SM 35-86)
// Build 2: CUDA 13.1 + cuDNN 9 + TensorRT 10 + OpenCV 4.13 (SM 75-121)
// ============================================================================

#include <NvInferVersion.h>

// ---------------------------------------------------------------------------
// Network creation
// ---------------------------------------------------------------------------
// TRT 10+: kEXPLICIT_BATCH was removed (explicit batch is the only mode), so
//          the flags argument to createNetworkV2() is simply 0.
// TRT 8.x: The explicit-batch flag must be passed explicitly.
// `builder` is an IBuilder*; the macro argument is parenthesised so any
// pointer expression may be passed.
#if NV_TENSORRT_MAJOR >= 10
#define TRT_CREATE_NETWORK(builder) \
    (builder)->createNetworkV2(0)
#else
#define TRT_CREATE_NETWORK(builder) \
    (builder)->createNetworkV2( \
        1U << static_cast<uint32_t>( \
            nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH))
#endif

// ---------------------------------------------------------------------------
// Inference execution
// ---------------------------------------------------------------------------
// TRT 10+: enqueueV3(stream) — uses tensor addresses pre-bound via
//          setTensorAddress(). NOTE: the `buffers` argument is NOT evaluated
//          on this path; callers must have bound tensor addresses beforehand.
// TRT 8.x: enqueueV2(bindings, stream, nullptr) — uses a void** array
//          indexed by binding position. `buffers` must be a container with
//          .data() yielding the binding pointers in binding-index order.
#if NV_TENSORRT_MAJOR >= 10
#define TRT_ENQUEUE(context, stream, buffers) \
    (context)->enqueueV3(stream)
#else
#define TRT_ENQUEUE(context, stream, buffers) \
    (context)->enqueueV2( \
        reinterpret_cast<void**>((buffers).data()), (stream), nullptr)
#endif

// ---------------------------------------------------------------------------
// Feature-detection helpers
// ---------------------------------------------------------------------------
// Usable in both #if directives and ordinary code (constant expressions).
#define TRT_HAS_ENQUEUE_V3 (NV_TENSORRT_MAJOR >= 10)
#define TRT_HAS_EXPLICIT_BATCH_FLAG (NV_TENSORRT_MAJOR < 10)
|
||||
177
engines/TensorRTAPI/include/engine/TRTEngineCache.h
Normal file
177
engines/TensorRTAPI/include/engine/TRTEngineCache.h
Normal file
@@ -0,0 +1,177 @@
|
||||
#pragma once
// TRTEngineCache.h — Process-wide cache for shared TensorRT ICudaEngine instances.
//
// When multiple AI tasks load the same model (same .engine file + GPU), this cache
// ensures only ONE copy of the model weights lives in VRAM. Each task creates its
// own IExecutionContext from the shared ICudaEngine (TRT-supported pattern).
//
// Usage in loadNetwork():
//   auto& cache = TRTEngineCache::instance();
//   auto hit = cache.tryGet(enginePath, gpuIdx);
//   if (hit.engine) {
//       m_engine = hit.engine; m_runtime = hit.runtime;   // cache hit
//   } else {
//       // ... deserialize as usual ...
//       m_engine = cache.putIfAbsent(enginePath, gpuIdx, runtime, engine);
//   }
//
// In ~Engine():
//   cache.release(enginePath, gpuIdx);

#include <atomic>      // std::atomic — was missing; previously compiled only via transitive includes
#include <functional>  // std::hash<int> for CacheKeyHash — was relied on transitively
#include <memory>
#include <mutex>
#include <string>
#include <unordered_map>
#include <iostream>
#include <NvInfer.h>

/// Process-wide flag: set to true during DLL_PROCESS_DETACH when ExitProcess
/// is in progress (lpReserved != NULL). Worker threads are already dead in
/// this state, so thread::join() would deadlock and CUDA/TRT calls are unsafe.
/// Checked by Engine::~Engine to skip cleanup that requires live threads or GPUs.
inline std::atomic<bool>& g_processExiting() {
    static std::atomic<bool> s_flag{false};
    return s_flag;
}

/// Singleton, mutex-protected map of (engine file path, GPU index) ->
/// refcounted {ICudaEngine, IRuntime} pair. All public methods are
/// thread-safe; diagnostics are printed to stdout on every state change.
class TRTEngineCache {
  public:
    /// Result of tryGet(): both pointers null on miss.
    struct CacheHit {
        std::shared_ptr<nvinfer1::ICudaEngine> engine;
        std::shared_ptr<nvinfer1::IRuntime> runtime;
    };

    /// Meyers-singleton accessor.
    static TRTEngineCache& instance() {
        static TRTEngineCache s_instance;
        return s_instance;
    }

    /// Global bypass — when true, tryGet() always returns miss, putIfAbsent()
    /// is a no-op, and buildLoadNetwork/loadNetwork force single-GPU path.
    /// Used by OptimizeModelStr to prevent inner engines (created by
    /// custom DLLs via ANSLIB.dll) from creating pools/caching.
    /// Stored as a member of the singleton to guarantee a single instance
    /// across all translation units (avoids MSVC inline static duplication).
    static std::atomic<bool>& globalBypass() {
        return instance().m_globalBypass;
    }

    std::atomic<bool> m_globalBypass{false};

    /// Try to get a cached engine. Returns {nullptr, nullptr} on miss.
    /// On hit, increments refcount.
    CacheHit tryGet(const std::string& engineFilePath, int gpuIndex) {
        if (globalBypass().load(std::memory_order_relaxed)) return {nullptr, nullptr};
        std::lock_guard<std::mutex> lock(m_mutex);
        auto it = m_cache.find({engineFilePath, gpuIndex});
        if (it != m_cache.end()) {
            it->second.refcount++;
            std::cout << "[TRTEngineCache] HIT: " << engineFilePath
                      << " GPU[" << gpuIndex << "] refs=" << it->second.refcount << std::endl;
            return {it->second.engine, it->second.runtime};
        }
        return {nullptr, nullptr};
    }

    /// Store a newly deserialized engine. If another thread already stored the
    /// same key (race), returns the existing one and the caller's copy is discarded.
    /// Increments refcount for the returned engine.
    std::shared_ptr<nvinfer1::ICudaEngine> putIfAbsent(
            const std::string& engineFilePath, int gpuIndex,
            std::shared_ptr<nvinfer1::IRuntime> runtime,
            std::shared_ptr<nvinfer1::ICudaEngine> engine) {
        if (globalBypass().load(std::memory_order_relaxed)) return engine; // don't cache
        std::lock_guard<std::mutex> lock(m_mutex);
        CacheKey key{engineFilePath, gpuIndex};
        auto it = m_cache.find(key);
        if (it != m_cache.end()) {
            // Another thread beat us — use theirs, discard ours
            it->second.refcount++;
            std::cout << "[TRTEngineCache] RACE: using existing for " << engineFilePath
                      << " GPU[" << gpuIndex << "] refs=" << it->second.refcount << std::endl;
            return it->second.engine;
        }
        // First to store — insert
        CachedEntry entry;
        entry.engine = std::move(engine);
        entry.runtime = std::move(runtime);
        entry.refcount = 1;
        auto inserted = m_cache.emplace(std::move(key), std::move(entry));
        std::cout << "[TRTEngineCache] STORED: " << engineFilePath
                  << " GPU[" << gpuIndex << "] refs=1" << std::endl;
        return inserted.first->second.engine;
    }

    /// Decrement refcount. When refcount reaches 0, the engine is evicted immediately
    /// to release VRAM and file handles (allows ModelOptimizer to rebuild .engine files
    /// while LabVIEW is running).
    void release(const std::string& engineFilePath, int gpuIndex) {
        std::lock_guard<std::mutex> lock(m_mutex);
        auto it = m_cache.find({engineFilePath, gpuIndex});
        if (it != m_cache.end() && it->second.refcount > 0) {
            it->second.refcount--;
            std::cout << "[TRTEngineCache] RELEASE: " << engineFilePath
                      << " GPU[" << gpuIndex << "] refs=" << it->second.refcount << std::endl;
            if (it->second.refcount <= 0) {
                std::cout << "[TRTEngineCache] EVICT (refcount=0): " << engineFilePath
                          << " GPU[" << gpuIndex << "]" << std::endl;
                m_cache.erase(it);
            }
        }
    }

    /// Remove all entries with refcount == 0 (call at shutdown or when VRAM tight).
    void evictUnused() {
        std::lock_guard<std::mutex> lock(m_mutex);
        for (auto it = m_cache.begin(); it != m_cache.end(); ) {
            if (it->second.refcount <= 0) {
                std::cout << "[TRTEngineCache] EVICT: " << it->first.path
                          << " GPU[" << it->first.gpuIndex << "]" << std::endl;
                it = m_cache.erase(it);
            } else {
                ++it;
            }
        }
    }

    /// Clear all cached engines immediately (call during DLL_PROCESS_DETACH
    /// BEFORE destroying engine handles, to avoid calling into unloaded TRT DLLs).
    void clearAll() {
        std::lock_guard<std::mutex> lock(m_mutex);
        std::cout << "[TRTEngineCache] CLEAR ALL (" << m_cache.size() << " entries)" << std::endl;
        m_cache.clear(); // shared_ptrs released — engines destroyed while TRT is still loaded
    }

    /// Number of cached engines (for diagnostics).
    size_t size() const {
        std::lock_guard<std::mutex> lock(m_mutex);
        return m_cache.size();
    }

  private:
    TRTEngineCache() = default;
    TRTEngineCache(const TRTEngineCache&) = delete;
    TRTEngineCache& operator=(const TRTEngineCache&) = delete;

    /// Cache key: engine file path + GPU index (same file on two GPUs is two entries).
    struct CacheKey {
        std::string path;
        int gpuIndex = 0;
        bool operator==(const CacheKey& o) const {
            return path == o.path && gpuIndex == o.gpuIndex;
        }
    };
    /// Hash combiner: string hash XOR shifted int hash (collisions are cheap —
    /// the map resolves them via operator==).
    struct CacheKeyHash {
        size_t operator()(const CacheKey& k) const {
            return std::hash<std::string>{}(k.path) ^
                   (std::hash<int>{}(k.gpuIndex) << 16);
        }
    };
    /// Refcounted entry; runtime must outlive the engine it deserialized.
    struct CachedEntry {
        std::shared_ptr<nvinfer1::ICudaEngine> engine;
        std::shared_ptr<nvinfer1::IRuntime> runtime;
        int refcount = 0;
    };

    std::unordered_map<CacheKey, CachedEntry, CacheKeyHash> m_cache;
    mutable std::mutex m_mutex;
};
|
||||
Reference in New Issue
Block a user