Initial setup for CLion
This commit is contained in:
177
engines/TensorRTAPI/include/engine/TRTEngineCache.h
Normal file
177
engines/TensorRTAPI/include/engine/TRTEngineCache.h
Normal file
@@ -0,0 +1,177 @@
|
||||
#pragma once
|
||||
// TRTEngineCache.h — Process-wide cache for shared TensorRT ICudaEngine instances.
|
||||
//
|
||||
// When multiple AI tasks load the same model (same .engine file + GPU), this cache
|
||||
// ensures only ONE copy of the model weights lives in VRAM. Each task creates its
|
||||
// own IExecutionContext from the shared ICudaEngine (TRT-supported pattern).
|
||||
//
|
||||
// Usage in loadNetwork():
|
||||
// auto& cache = TRTEngineCache::instance();
|
||||
// auto hit = cache.tryGet(enginePath, gpuIdx);
|
||||
// if (hit.engine) {
|
||||
// m_engine = hit.engine; m_runtime = hit.runtime; // cache hit
|
||||
// } else {
|
||||
// // ... deserialize as usual ...
|
||||
// m_engine = cache.putIfAbsent(enginePath, gpuIdx, runtime, engine);
|
||||
// }
|
||||
//
|
||||
// In ~Engine():
|
||||
// cache.release(enginePath, gpuIdx);
|
||||
|
||||
#include <atomic>
#include <iostream>
#include <memory>
#include <mutex>
#include <string>
#include <unordered_map>

#include <NvInfer.h>
|
||||
/// Process-wide flag set during DLL_PROCESS_DETACH when ExitProcess is in
/// progress (lpReserved != NULL). At that point worker threads are already
/// dead, so thread::join() would deadlock and CUDA/TRT calls are unsafe.
/// Engine::~Engine checks this to skip cleanup that needs live threads/GPUs.
/// Function-local static + inline gives one flag per process, safely
/// initialized, usable from any translation unit including this header.
inline std::atomic<bool>& g_processExiting() {
    static std::atomic<bool> exiting{false};
    return exiting;
}
|
||||
|
||||
class TRTEngineCache {
|
||||
public:
|
||||
struct CacheHit {
|
||||
std::shared_ptr<nvinfer1::ICudaEngine> engine;
|
||||
std::shared_ptr<nvinfer1::IRuntime> runtime;
|
||||
};
|
||||
|
||||
static TRTEngineCache& instance() {
|
||||
static TRTEngineCache s_instance;
|
||||
return s_instance;
|
||||
}
|
||||
|
||||
/// Global bypass — when true, tryGet() always returns miss, putIfAbsent()
|
||||
/// is a no-op, and buildLoadNetwork/loadNetwork force single-GPU path.
|
||||
/// Used by OptimizeModelStr to prevent inner engines (created by
|
||||
/// custom DLLs via ANSLIB.dll) from creating pools/caching.
|
||||
/// Stored as a member of the singleton to guarantee a single instance
|
||||
/// across all translation units (avoids MSVC inline static duplication).
|
||||
static std::atomic<bool>& globalBypass() {
|
||||
return instance().m_globalBypass;
|
||||
}
|
||||
|
||||
std::atomic<bool> m_globalBypass{false};
|
||||
|
||||
/// Try to get a cached engine. Returns {nullptr, nullptr} on miss.
|
||||
/// On hit, increments refcount.
|
||||
CacheHit tryGet(const std::string& engineFilePath, int gpuIndex) {
|
||||
if (globalBypass().load(std::memory_order_relaxed)) return {nullptr, nullptr};
|
||||
std::lock_guard<std::mutex> lock(m_mutex);
|
||||
auto it = m_cache.find({engineFilePath, gpuIndex});
|
||||
if (it != m_cache.end()) {
|
||||
it->second.refcount++;
|
||||
std::cout << "[TRTEngineCache] HIT: " << engineFilePath
|
||||
<< " GPU[" << gpuIndex << "] refs=" << it->second.refcount << std::endl;
|
||||
return {it->second.engine, it->second.runtime};
|
||||
}
|
||||
return {nullptr, nullptr};
|
||||
}
|
||||
|
||||
/// Store a newly deserialized engine. If another thread already stored the
|
||||
/// same key (race), returns the existing one and the caller's copy is discarded.
|
||||
/// Increments refcount for the returned engine.
|
||||
std::shared_ptr<nvinfer1::ICudaEngine> putIfAbsent(
|
||||
const std::string& engineFilePath, int gpuIndex,
|
||||
std::shared_ptr<nvinfer1::IRuntime> runtime,
|
||||
std::shared_ptr<nvinfer1::ICudaEngine> engine) {
|
||||
if (globalBypass().load(std::memory_order_relaxed)) return engine; // don't cache
|
||||
std::lock_guard<std::mutex> lock(m_mutex);
|
||||
CacheKey key{engineFilePath, gpuIndex};
|
||||
auto it = m_cache.find(key);
|
||||
if (it != m_cache.end()) {
|
||||
// Another thread beat us — use theirs, discard ours
|
||||
it->second.refcount++;
|
||||
std::cout << "[TRTEngineCache] RACE: using existing for " << engineFilePath
|
||||
<< " GPU[" << gpuIndex << "] refs=" << it->second.refcount << std::endl;
|
||||
return it->second.engine;
|
||||
}
|
||||
// First to store — insert
|
||||
CachedEntry entry;
|
||||
entry.engine = std::move(engine);
|
||||
entry.runtime = std::move(runtime);
|
||||
entry.refcount = 1;
|
||||
auto inserted = m_cache.emplace(std::move(key), std::move(entry));
|
||||
std::cout << "[TRTEngineCache] STORED: " << engineFilePath
|
||||
<< " GPU[" << gpuIndex << "] refs=1" << std::endl;
|
||||
return inserted.first->second.engine;
|
||||
}
|
||||
|
||||
/// Decrement refcount. When refcount reaches 0, the engine is evicted immediately
|
||||
/// to release VRAM and file handles (allows ModelOptimizer to rebuild .engine files
|
||||
/// while LabVIEW is running).
|
||||
void release(const std::string& engineFilePath, int gpuIndex) {
|
||||
std::lock_guard<std::mutex> lock(m_mutex);
|
||||
auto it = m_cache.find({engineFilePath, gpuIndex});
|
||||
if (it != m_cache.end() && it->second.refcount > 0) {
|
||||
it->second.refcount--;
|
||||
std::cout << "[TRTEngineCache] RELEASE: " << engineFilePath
|
||||
<< " GPU[" << gpuIndex << "] refs=" << it->second.refcount << std::endl;
|
||||
if (it->second.refcount <= 0) {
|
||||
std::cout << "[TRTEngineCache] EVICT (refcount=0): " << engineFilePath
|
||||
<< " GPU[" << gpuIndex << "]" << std::endl;
|
||||
m_cache.erase(it);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Remove all entries with refcount == 0 (call at shutdown or when VRAM tight).
|
||||
void evictUnused() {
|
||||
std::lock_guard<std::mutex> lock(m_mutex);
|
||||
for (auto it = m_cache.begin(); it != m_cache.end(); ) {
|
||||
if (it->second.refcount <= 0) {
|
||||
std::cout << "[TRTEngineCache] EVICT: " << it->first.path
|
||||
<< " GPU[" << it->first.gpuIndex << "]" << std::endl;
|
||||
it = m_cache.erase(it);
|
||||
} else {
|
||||
++it;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Clear all cached engines immediately (call during DLL_PROCESS_DETACH
|
||||
/// BEFORE destroying engine handles, to avoid calling into unloaded TRT DLLs).
|
||||
void clearAll() {
|
||||
std::lock_guard<std::mutex> lock(m_mutex);
|
||||
std::cout << "[TRTEngineCache] CLEAR ALL (" << m_cache.size() << " entries)" << std::endl;
|
||||
m_cache.clear(); // shared_ptrs released — engines destroyed while TRT is still loaded
|
||||
}
|
||||
|
||||
/// Number of cached engines (for diagnostics).
|
||||
size_t size() const {
|
||||
std::lock_guard<std::mutex> lock(m_mutex);
|
||||
return m_cache.size();
|
||||
}
|
||||
|
||||
private:
|
||||
TRTEngineCache() = default;
|
||||
TRTEngineCache(const TRTEngineCache&) = delete;
|
||||
TRTEngineCache& operator=(const TRTEngineCache&) = delete;
|
||||
|
||||
struct CacheKey {
|
||||
std::string path;
|
||||
int gpuIndex = 0;
|
||||
bool operator==(const CacheKey& o) const {
|
||||
return path == o.path && gpuIndex == o.gpuIndex;
|
||||
}
|
||||
};
|
||||
struct CacheKeyHash {
|
||||
size_t operator()(const CacheKey& k) const {
|
||||
return std::hash<std::string>{}(k.path) ^
|
||||
(std::hash<int>{}(k.gpuIndex) << 16);
|
||||
}
|
||||
};
|
||||
struct CachedEntry {
|
||||
std::shared_ptr<nvinfer1::ICudaEngine> engine;
|
||||
std::shared_ptr<nvinfer1::IRuntime> runtime;
|
||||
int refcount = 0;
|
||||
};
|
||||
|
||||
std::unordered_map<CacheKey, CachedEntry, CacheKeyHash> m_cache;
|
||||
mutable std::mutex m_mutex;
|
||||
};
|
||||
Reference in New Issue
Block a user