Files
ANSCORE/engines/TensorRTAPI/include/engine/EnginePoolManager.h
Tuan Nghia Nguyen 8d2db78cee Remove [Engine] and [EnginePoolManager] debug log messages
Cleaned up verbose engine telemetry emitted to stdout/stderr and the
Windows Event Viewer. Removes logEngineEvent/logEvent calls (and their
diagnostic-only locals) across the TensorRT engine load, build, run,
multi-GPU, and pool-manager paths, plus the now-unused logEvent helper
in EnginePoolManager.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-16 18:09:08 +10:00

364 lines
15 KiB
C++
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#pragma once
// EnginePoolManager.h — Process-wide cache for shared Engine<T> pool instances.
//
// When multiple AI tasks load the same model (same ONNX path + GPU + config),
// this manager ensures they share a SINGLE Engine<T> pool instead of each task
// creating its own pool with independent execution contexts and VRAM buffers.
//
// Without sharing: N tasks × ~500 MB = N × 500 MB VRAM (OOM at ~5-8 tasks on 8GB GPU)
// With sharing: 1 pool × ~500 MB = 500 MB total (unlimited tasks, slower via queuing)
//
// Lazy eviction: when refcount drops to 0, the pool is kept alive for
// kEvictGraceSec seconds. If a new task acquires it within that window,
// it gets an instant HIT without rebuilding. This handles the LabView
// edit/duplicate/create cycle (destroy → recreate) gracefully.
//
// Thread-safety: All public methods are mutex-protected.
#include <memory>
#include <mutex>
#include <string>
#include <unordered_map>
#include <array>
#include <iostream>
#include <functional>
#include <chrono>
#include <thread>
#include <atomic>
#include <cuda_runtime.h>
#include "TRTEngineCache.h" // constructor touches TRTEngineCache::instance() for destruction ordering
#ifdef _WIN32
#include <windows.h>
#endif
// Forward declare Engine<T> to avoid circular includes.
// The header that includes this must also include engine.h.
template <typename T> class Engine;
namespace ANSCENTER { struct Options; }
template <typename T>
class EnginePoolManager {
public:
    /// Meyers singleton: one manager per process (per template instantiation T).
    /// Function-local static guarantees thread-safe lazy construction (C++11).
    static EnginePoolManager& instance() {
        static EnginePoolManager s_instance;
        return s_instance;
    }

    // ========================================================================
    // Cache key — uniquely identifies a compatible Engine pool.
    // Two acquire() calls share a pool only if model path, precision, and
    // max batch size all match. Note: GPU/device index is NOT part of the
    // key here — presumably it is encoded in Options and/or the pool spans
    // GPUs internally; TODO(review) confirm two tasks on different GPUs
    // with the same model are intended to share one pool.
    // ========================================================================
    struct PoolKey {
        std::string modelPath;
        int precision = 0; // cast from Precision enum
        int maxBatch = 1;

        bool operator==(const PoolKey& o) const {
            return modelPath == o.modelPath &&
                   precision == o.precision &&
                   maxBatch == o.maxBatch;
        }
    };

    /// Hash combiner for PoolKey (XOR of shifted member hashes).
    /// Not a high-quality mix, but adequate for the handful of distinct
    /// models a process typically loads.
    struct PoolKeyHash {
        size_t operator()(const PoolKey& k) const {
            size_t h = std::hash<std::string>{}(k.modelPath);
            h ^= std::hash<int>{}(k.precision) << 16;
            h ^= std::hash<int>{}(k.maxBatch) << 24;
            return h;
        }
    };

    // ========================================================================
    // acquire() — get or create a shared Engine pool.
    //
    // On first call for a given key: creates a new Engine<T>, calls
    // buildLoadNetwork with the provided parameters, and caches it.
    //
    // On subsequent calls (or within lazy-eviction grace period):
    // returns the existing shared_ptr and increments refcount.
    // No VRAM allocated, near-instant.
    //
    // Parameters:
    //   key            — cache identity (model path + precision + max batch)
    //   options        — engine construction options (includes deviceIndex)
    //   modelPath      — ONNX model path forwarded to buildLoadNetwork
    //   subVals/divVals/normalize — preprocessing constants forwarded as-is
    //   maxSlotsPerGpu — 0: bypass cache entirely (temporary engine);
    //                    1: round-robin, one slot per GPU, no growth;
    //                    other: elastic mode, pool may grow on demand
    //
    // Returns nullptr if engine creation/loading fails after all retries.
    // ========================================================================
    std::shared_ptr<Engine<T>> acquire(
        const PoolKey& key,
        const ANSCENTER::Options& options,
        const std::string& modelPath,
        const std::array<float, 3>& subVals,
        const std::array<float, 3>& divVals,
        bool normalize,
        int maxSlotsPerGpu)
    {
        // Optimizer / temporary engines: maxSlotsPerGpu==0 means the caller
        // only needs a lightweight, non-shared engine (e.g., OptimizeModelStr).
        // Bypass the pool cache entirely:
        //  - Don't hold m_mutex (which blocks ALL other pool creation)
        //  - Don't cache the result (temporary engine is destroyed on release)
        //  - Use the simple 4-param buildLoadNetwork (no pool, no probe, no VRAM measurement)
        // Note: maxSlotsPerGpu==1 is now the normal "1 slot per GPU" multi-GPU
        // round-robin mode, so it goes through the pool path below.
        if (maxSlotsPerGpu == 0) {
            auto engine = std::make_shared<Engine<T>>(options);
            bool ok = engine->buildLoadNetwork(modelPath, subVals, divVals, normalize);
            return ok ? engine : nullptr;
        }

        // unique_lock (not lock_guard) because the retry paths below must
        // temporarily release the mutex while sleeping/growing.
        std::unique_lock<std::mutex> lock(m_mutex);

        auto it = m_pools.find(key);
        if (it != m_pools.end()) {
            // Cache HIT: take a reference and cancel any pending lazy eviction
            // (a zero-valued TimePoint means "not scheduled for eviction").
            it->second.refcount++;
            it->second.evictTime = TimePoint{}; // cancel pending eviction
            int refs = it->second.refcount;
            auto engine = it->second.engine;

            // Demand-driven growth: only in elastic mode (maxSlotsPerGpu <= 0
            // or > 1). With maxSlotsPerGpu==1 (round-robin default), the pool
            // already has the right number of slots (1 per GPU) — tasks queue
            // when all slots are busy, which is the intended behavior.
            if (maxSlotsPerGpu != 1 && refs > 1 && engine) {
                int alive = engine->getTotalCapacity();
                if (alive < refs) {
                    // Check total GPU VRAM — skip growth on small GPUs.
                    // NOTE(review): cudaSetDevice changes this thread's current
                    // CUDA device as a side effect and is called while holding
                    // m_mutex — confirm callers don't rely on the previous
                    // device selection after acquire() returns.
                    size_t totalVram = 0;
                    {
                        size_t freeTmp = 0;
                        cudaSetDevice(options.deviceIndex);
                        cudaMemGetInfo(&freeTmp, &totalVram);
                    }
                    constexpr size_t kMinVramForGrowth = 6ULL * 1024 * 1024 * 1024; // 6 GB
                    if (totalVram >= kMinVramForGrowth) {
                        lock.unlock(); // release PoolManager lock before growing
                        // Detached grower thread: capturing the shared_ptr by
                        // value keeps the Engine alive for the thread's lifetime.
                        std::thread([engine]() {
                            engine->growPool(1);
                        }).detach();
                    }
                }
            }
            return engine;
        }

        // Cache miss — create new Engine pool
        auto engine = std::make_shared<Engine<T>>(options);
        bool ok = engine->buildLoadNetwork(modelPath, subVals, divVals, normalize, maxSlotsPerGpu);
        if (!ok) {
            // Step 1: Force-evict all pools with refcount=0 to reclaim VRAM,
            // then retry the full (elastic) build once.
            int evicted = forceEvictPending();
            if (evicted > 0) {
                engine = std::make_shared<Engine<T>>(options);
                ok = engine->buildLoadNetwork(modelPath, subVals, divVals, normalize, maxSlotsPerGpu);
            }
            // Step 2: If still failing, retry with lightweight mode (no elastic pool).
            // The elastic probe does heavy warmup (batch 1-8, 10+ iterations) which
            // consumes ~300-500 MB vs ~50-100 MB for a simple loadNetwork.
            // Lightweight mode: tasks queue for a single shared slot — slower but works.
            if (!ok) {
                engine = std::make_shared<Engine<T>>(options);
                ok = engine->buildLoadNetwork(modelPath, subVals, divVals, normalize);
            }
            // Step 3: If still failing, wait briefly and retry.
            // Transient failures can occur when:
            //  - TRT engine file is being written by another build (partial file)
            //  - CUDA driver has temporary resource contention during multi-pool startup
            //  - GPU memory fragmentation resolves after previous allocations settle
            // Evidence: FireSmoke/detector.onnx failed at 3740 MiB free, then
            // succeeded 4 seconds later at 3154 MiB free (less VRAM!).
            if (!ok) {
                // Release mutex during sleep so other tasks can proceed
                // (they may complete pool creation that resolves our issue)
                lock.unlock();
                std::this_thread::sleep_for(std::chrono::seconds(3));
                lock.lock();
                // Check if another thread created this pool while we slept —
                // if so, share it instead of building a duplicate.
                auto it2 = m_pools.find(key);
                if (it2 != m_pools.end()) {
                    it2->second.refcount++;
                    it2->second.evictTime = TimePoint{};
                    return it2->second.engine;
                }
                // Final retry — try lightweight again after delay
                engine = std::make_shared<Engine<T>>(options);
                ok = engine->buildLoadNetwork(modelPath, subVals, divVals, normalize);
            }
            if (!ok) {
                return nullptr;
            }
        }

        // Cache the freshly built pool with an initial refcount of 1.
        PoolEntry entry;
        entry.engine = engine;
        entry.refcount = 1;
        m_pools.emplace(key, std::move(entry));
        // Start the lazy-eviction sweeper if not already running
        startSweeperIfNeeded();
        return engine;
    }

    // ========================================================================
    // release() — decrement refcount for a shared pool.
    //
    // When refcount reaches 0, the pool is NOT immediately evicted.
    // Instead, it is marked for lazy eviction after kEvictGraceSec.
    // This handles the LabView edit cycle (destroy → recreate within
    // seconds) without rebuilding the engine from scratch.
    //
    // Releasing an unknown key, or over-releasing past zero, is a no-op.
    // ========================================================================
    void release(const PoolKey& key) {
        std::lock_guard<std::mutex> lock(m_mutex);
        auto it = m_pools.find(key);
        if (it == m_pools.end()) return;
        if (it->second.refcount <= 0) return;
        it->second.refcount--;
        if (it->second.refcount <= 0) {
            // Mark for lazy eviction — don't destroy yet
            it->second.evictTime = Clock::now() + std::chrono::seconds(kEvictGraceSec);
        }
    }

    /// Clear all cached pools (call during DLL_PROCESS_DETACH).
    /// Destroys every Engine regardless of refcount, then signals the
    /// sweeper thread to stop on its next wakeup.
    void clearAll() {
        {
            std::lock_guard<std::mutex> lock(m_mutex);
            m_pools.clear();
        }
        stopSweeper();
    }

    /// Number of cached pools (for diagnostics).
    size_t size() const {
        std::lock_guard<std::mutex> lock(m_mutex);
        return m_pools.size();
    }

private:
    EnginePoolManager() {
        // CRITICAL: Touch TRTEngineCache singleton to ensure it is constructed
        // BEFORE EnginePoolManager. C++ destroys function-local statics in
        // reverse construction order, so this guarantees TRTEngineCache outlives
        // EnginePoolManager. Without this, during ExitProcess the cache may be
        // destroyed first, and ~Engine calling TRTEngineCache::release() crashes
        // on a destroyed unordered_map (static destruction order fiasco).
        (void)TRTEngineCache::instance();
    }

    ~EnginePoolManager() {
        if (g_processExiting().load(std::memory_order_relaxed)) {
            // ExitProcess path: worker threads are dead, CUDA/TRT state is
            // unreliable. Don't destroy Engine objects (their destructors
            // call cudaFree, thread::join, etc. which deadlock or crash).
            // The OS reclaims all memory, VRAM, and handles at process exit.
            //
            // Intentionally leak Engine shared_ptrs: after the explicit dtor
            // body returns, the compiler still runs implicit member dtors for
            // m_pools. That would destroy shared_ptr<Engine<T>>, triggering
            // TRT/CUDA cleanup on a dead context. Detach the shared_ptrs
            // first so the map destructor only frees empty entries.
            m_sweeperRunning.store(false);
            // Move each Engine's shared_ptr into a heap-allocated shared_ptr
            // that is never deleted: the ownership (and its refcount) lives on
            // in the leaked object, so the map's entries hold empty shared_ptrs
            // and destroy nothing. OS reclaims everything at process exit.
            for (auto& [_, entry] : m_pools) {
                auto* leaked = new std::shared_ptr<Engine<T>>(std::move(entry.engine));
                (void)leaked; // intentional leak — OS reclaims at exit
            }
            return;
        }
        // Normal FreeLibrary path: threads are alive, safe to clean up.
        // Explicitly clear pools before implicit member destruction.
        // This destroys Engine<T> objects (which call TRTEngineCache::release())
        // while we still hold m_mutex, keeping the teardown ordered.
        try {
            std::lock_guard<std::mutex> lock(m_mutex);
            m_pools.clear();
        } catch (...) {}
        stopSweeper();
    }

    EnginePoolManager(const EnginePoolManager&) = delete;
    EnginePoolManager& operator=(const EnginePoolManager&) = delete;

    // Grace period before evicting a pool with refcount=0.
    // Covers LabView edit/duplicate/create cycles (destroy → recreate).
    static constexpr int kEvictGraceSec = 120; // 2 minutes

    // Sweeper interval — how often to check for expired pools.
    static constexpr int kSweeperIntervalSec = 30;

    using Clock = std::chrono::steady_clock;
    using TimePoint = std::chrono::time_point<Clock>;

    /// One cached pool: the shared engine, how many tasks hold it, and
    /// (if refcount is 0) when the sweeper may evict it.
    struct PoolEntry {
        std::shared_ptr<Engine<T>> engine;
        int refcount = 0;
        TimePoint evictTime {}; // when to evict (zero = not pending)
    };

    // ========================================================================
    // Sweeper thread — periodically checks for pools whose eviction
    // grace period has expired and removes them.
    //
    // NOTE(review): the sweeper is detached and captures `this`. If the
    // singleton is destroyed while the thread is inside its sleep, the next
    // sweepExpired() touches freed m_mutex/m_pools — a latent use-after-free
    // on the FreeLibrary path (stopSweeper only flags, it cannot join a
    // detached thread). Confirm shutdown ordering guarantees the thread has
    // observed the flag before the DLL unmaps, or keep the thread joinable.
    // ========================================================================
    void startSweeperIfNeeded() {
        // Called under m_mutex — the running-flag check/set is race-free.
        if (m_sweeperRunning.load()) return;
        m_sweeperRunning.store(true);
        m_sweeperThread = std::thread([this]() {
            while (m_sweeperRunning.load()) {
                std::this_thread::sleep_for(std::chrono::seconds(kSweeperIntervalSec));
                // Re-check after the sleep so a stop requested mid-sleep
                // skips the final sweep.
                if (!m_sweeperRunning.load()) break;
                sweepExpired();
            }
        });
        m_sweeperThread.detach();
    }

    /// Request sweeper shutdown. Takes effect on the thread's next wakeup
    /// (up to kSweeperIntervalSec later); does not wait for it.
    void stopSweeper() {
        m_sweeperRunning.store(false);
    }

    // Force-evict ALL pools with refcount=0 (regardless of grace period).
    // Called when a new pool creation fails due to low VRAM.
    // Returns number of pools evicted.
    // MUST be called under m_mutex.
    int forceEvictPending() {
        int evicted = 0;
        for (auto it = m_pools.begin(); it != m_pools.end(); ) {
            if (it->second.refcount <= 0) {
                // erase() destroys the Engine (frees its VRAM) and returns
                // the next valid iterator.
                it = m_pools.erase(it);
                evicted++;
            } else {
                ++it;
            }
        }
        return evicted;
    }

    /// Sweeper body: evict pools whose grace period has elapsed.
    /// A zero-valued evictTime means eviction was never scheduled (or was
    /// cancelled by a re-acquire), so such entries are always kept.
    void sweepExpired() {
        std::lock_guard<std::mutex> lock(m_mutex);
        auto now = Clock::now();
        for (auto it = m_pools.begin(); it != m_pools.end(); ) {
            auto& entry = it->second;
            // Only evict if refcount is 0 AND evictTime has passed
            if (entry.refcount <= 0
                && entry.evictTime != TimePoint{}
                && now >= entry.evictTime)
            {
                it = m_pools.erase(it);
            } else {
                ++it;
            }
        }
    }

    std::unordered_map<PoolKey, PoolEntry, PoolKeyHash> m_pools; // guarded by m_mutex
    mutable std::mutex m_mutex;                // mutable so size() can be const
    std::atomic<bool> m_sweeperRunning{false}; // sweeper lifecycle flag
    std::thread m_sweeperThread;               // detached immediately after start
};