#pragma once
// TRTEngineCache.h — Process-wide cache for shared TensorRT ICudaEngine instances.
//
// When multiple AI tasks load the same model (same .engine file + GPU), this cache
// ensures only ONE copy of the model weights lives in VRAM. Each task creates its
// own IExecutionContext from the shared ICudaEngine (TRT-supported pattern).
//
// Usage in loadNetwork():
//     auto& cache = TRTEngineCache::instance();
//     auto hit = cache.tryGet(enginePath, gpuIdx);
//     if (hit.engine) {
//         m_engine = hit.engine; m_runtime = hit.runtime;   // cache hit
//     } else {
//         // ... deserialize as usual ...
//         m_engine = cache.putIfAbsent(enginePath, gpuIdx, runtime, engine);
//     }
//
// In ~Engine():
//     cache.release(enginePath, gpuIdx);

#include <atomic>
#include <iostream>
#include <memory>
#include <mutex>
#include <string>
#include <unordered_map>

// Forward declarations keep this header free of a hard NvInfer.h dependency;
// std::shared_ptr<T> is fully usable with an incomplete T (the deleter is
// type-erased at construction time).
// NOTE(review): if the original header included "NvInfer.h" directly, restore
// that include instead — confirm against the build.
namespace nvinfer1 {
class ICudaEngine;
class IRuntime;
}  // namespace nvinfer1

/// Process-wide flag: set to true during DLL_PROCESS_DETACH when ExitProcess
/// is in progress (lpReserved != NULL). Worker threads are already dead in
/// this state, so thread::join() would deadlock and CUDA/TRT calls are unsafe.
/// Checked by Engine::~Engine to skip cleanup that requires live threads or GPUs.
inline std::atomic<bool>& g_processExiting() {
    static std::atomic<bool> s_flag{false};
    return s_flag;
}

class TRTEngineCache {
public:
    /// Result of a cache lookup. Both pointers are null on a miss.
    struct CacheHit {
        std::shared_ptr<nvinfer1::ICudaEngine> engine;
        std::shared_ptr<nvinfer1::IRuntime> runtime;
    };

    /// Meyers singleton — one cache for the whole process.
    static TRTEngineCache& instance() {
        static TRTEngineCache s_instance;
        return s_instance;
    }

    /// Global bypass — when true, tryGet() always returns miss, putIfAbsent()
    /// is a no-op, and buildLoadNetwork/loadNetwork force single-GPU path.
    /// Used by OptimizeModelStr to prevent inner engines (created by
    /// custom DLLs via ANSLIB.dll) from creating pools/caching.
    /// Stored as a member of the singleton to guarantee a single instance
    /// across all translation units (avoids MSVC inline static duplication).
    static std::atomic<bool>& globalBypass() { return instance().m_globalBypass; }
    std::atomic<bool> m_globalBypass{false};

    /// Try to get a cached engine. Returns {nullptr, nullptr} on miss.
    /// On hit, increments refcount.
    CacheHit tryGet(const std::string& engineFilePath, int gpuIndex) {
        if (globalBypass().load(std::memory_order_relaxed))
            return {nullptr, nullptr};
        std::lock_guard<std::mutex> lock(m_mutex);
        auto it = m_cache.find({engineFilePath, gpuIndex});
        if (it != m_cache.end()) {
            it->second.refcount++;
            std::cout << "[TRTEngineCache] HIT: " << engineFilePath
                      << " GPU[" << gpuIndex << "] refs=" << it->second.refcount << std::endl;
            return {it->second.engine, it->second.runtime};
        }
        return {nullptr, nullptr};
    }

    /// Store a newly deserialized engine. If another thread already stored the
    /// same key (race), returns the existing one and the caller's copy is discarded.
    /// Increments refcount for the returned engine.
    std::shared_ptr<nvinfer1::ICudaEngine> putIfAbsent(
        const std::string& engineFilePath, int gpuIndex,
        std::shared_ptr<nvinfer1::IRuntime> runtime,
        std::shared_ptr<nvinfer1::ICudaEngine> engine) {
        if (globalBypass().load(std::memory_order_relaxed))
            return engine;  // don't cache
        std::lock_guard<std::mutex> lock(m_mutex);
        CacheKey key{engineFilePath, gpuIndex};
        auto it = m_cache.find(key);
        if (it != m_cache.end()) {
            // Another thread beat us — use theirs, discard ours
            it->second.refcount++;
            std::cout << "[TRTEngineCache] RACE: using existing for " << engineFilePath
                      << " GPU[" << gpuIndex << "] refs=" << it->second.refcount << std::endl;
            return it->second.engine;
        }
        // First to store — insert
        CachedEntry entry;
        entry.engine = std::move(engine);
        entry.runtime = std::move(runtime);
        entry.refcount = 1;
        auto inserted = m_cache.emplace(std::move(key), std::move(entry));
        std::cout << "[TRTEngineCache] STORED: " << engineFilePath
                  << " GPU[" << gpuIndex << "] refs=1" << std::endl;
        return inserted.first->second.engine;
    }

    /// Decrement refcount. When refcount reaches 0, the engine is evicted immediately
    /// to release VRAM and file handles (allows ModelOptimizer to rebuild .engine files
    /// while LabVIEW is running).
    void release(const std::string& engineFilePath, int gpuIndex) {
        std::lock_guard<std::mutex> lock(m_mutex);
        auto it = m_cache.find({engineFilePath, gpuIndex});
        if (it != m_cache.end() && it->second.refcount > 0) {
            it->second.refcount--;
            std::cout << "[TRTEngineCache] RELEASE: " << engineFilePath
                      << " GPU[" << gpuIndex << "] refs=" << it->second.refcount << std::endl;
            if (it->second.refcount <= 0) {
                std::cout << "[TRTEngineCache] EVICT (refcount=0): " << engineFilePath
                          << " GPU[" << gpuIndex << "]" << std::endl;
                m_cache.erase(it);
            }
        }
    }

    /// Remove all entries with refcount == 0 (call at shutdown or when VRAM tight).
    void evictUnused() {
        std::lock_guard<std::mutex> lock(m_mutex);
        for (auto it = m_cache.begin(); it != m_cache.end(); ) {
            if (it->second.refcount <= 0) {
                std::cout << "[TRTEngineCache] EVICT: " << it->first.path
                          << " GPU[" << it->first.gpuIndex << "]" << std::endl;
                it = m_cache.erase(it);
            } else {
                ++it;
            }
        }
    }

    /// Clear all cached engines immediately (call during DLL_PROCESS_DETACH
    /// BEFORE destroying engine handles, to avoid calling into unloaded TRT DLLs).
    void clearAll() {
        std::lock_guard<std::mutex> lock(m_mutex);
        std::cout << "[TRTEngineCache] CLEAR ALL (" << m_cache.size() << " entries)" << std::endl;
        m_cache.clear();  // shared_ptrs released — engines destroyed while TRT is still loaded
    }

    /// Number of cached engines (for diagnostics).
    size_t size() const {
        std::lock_guard<std::mutex> lock(m_mutex);
        return m_cache.size();
    }

private:
    TRTEngineCache() = default;

    ~TRTEngineCache() {
        if (g_processExiting().load(std::memory_order_relaxed)) {
            // ExitProcess path: CUDA context is dead. Leak ICudaEngine and
            // IRuntime shared_ptrs so their destructors don't call into a
            // destroyed CUDA driver. The OS reclaims everything at exit.
            for (auto& [key, entry] : m_cache) {
                (void)key;
                auto* le = new std::shared_ptr<nvinfer1::ICudaEngine>(std::move(entry.engine));
                auto* lr = new std::shared_ptr<nvinfer1::IRuntime>(std::move(entry.runtime));
                (void)le; (void)lr;  // intentional leak
            }
        }
    }

    TRTEngineCache(const TRTEngineCache&) = delete;
    TRTEngineCache& operator=(const TRTEngineCache&) = delete;

    /// Map key: engine file path + GPU index (same file on two GPUs = two entries).
    struct CacheKey {
        std::string path;
        int gpuIndex = 0;
        bool operator==(const CacheKey& o) const {
            return path == o.path && gpuIndex == o.gpuIndex;
        }
    };

    struct CacheKeyHash {
        size_t operator()(const CacheKey& k) const {
            // Shift the (small) GPU index so it perturbs higher bits of the path hash.
            return std::hash<std::string>{}(k.path) ^ (std::hash<int>{}(k.gpuIndex) << 16);
        }
    };

    /// One cached model: shared engine + the runtime that owns it + manual refcount.
    struct CachedEntry {
        std::shared_ptr<nvinfer1::ICudaEngine> engine;
        std::shared_ptr<nvinfer1::IRuntime> runtime;
        int refcount = 0;
    };

    std::unordered_map<CacheKey, CachedEntry, CacheKeyHash> m_cache;
    mutable std::mutex m_mutex;  // guards m_cache and refcounts
};