Files

847 lines
41 KiB
C++
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#pragma once
#include "engine_api.h" // ENGINE_API (__declspec dllexport / dllimport)
// -- TRT / ONNX-parser API decoration override -----------------------------
// Must appear BEFORE the first inclusion of NvInfer.h / NvOnnxParser.h.
//
// By default these macros expand to __declspec(dllimport) for consumers of
// the TRT DLL, which causes the compiler to emit indirect import-table calls.
// Overriding them to empty makes all calls direct, so the linker resolves
// them against the extern "C" stub definitions in NvDynLoader.cpp instead of
// against nvinfer_XX.lib / nvonnxparser_XX.lib.
//
// Remove nvinfer_XX.lib and nvonnxparser_XX.lib from Linker -> Input.
// Link ANSLibsLoader.lib instead. See NvDynLoader.h for full details.
#ifndef TENSORRTAPI
# define TENSORRTAPI
#endif
#ifndef NVONNXPARSER_API
# define NVONNXPARSER_API
#endif
#include "NvOnnxParser.h"
// C++ standard library (include-what-you-use: <iostream> for std::cout/cerr,
// <array>/<vector>/<cstdint> were previously pulled in only transitively)
#include <array>
#include <atomic>
#include <chrono>
#include <condition_variable>
#include <cstdint>
#include <deque>
#include <filesystem>
#include <fstream>
#include <iostream>
#include <memory>
#include <mutex>
#include <string>
#include <thread>
#include <unordered_map>
#include <vector>
// CUDA
#include <cuda_fp16.h>
#include <cuda_runtime.h>
// OpenCV (CUDA modules)
#include <opencv2/core/cuda.hpp>
#include <opencv2/core/cuda_stream_accessor.hpp>
#include <opencv2/cudaarithm.hpp>
#include <opencv2/cudaimgproc.hpp>
#include <opencv2/cudawarping.hpp>
#include <opencv2/opencv.hpp>
// Project headers (forward slashes: portable, accepted by MSVC as well)
#include "engine/TRTEngineCache.h"
#include "interfaces/IEngine.h"
#include "ANSLicense.h"
#include "Int8Calibrator.h"
#include "NvDynLoader.h" // from ANSLibsLoader/include/
#ifdef _WIN32
#include <windows.h>
#endif
// Class to extend TensorRT logger.
// Declaration only — the log() definition lives in the .cpp translation unit,
// which decides how TRT's internal messages are filtered/forwarded.
class ENGINE_API Logger : public nvinfer1::ILogger {
    void log(Severity severity, const char *msg) noexcept override;
};
// ============================================================================
// Diagnostic logging to Windows Event Viewer (ANSLogger source).
// Allows internal engine failure points to be traced even when running
// under LabView which swallows stdout/stderr.
// ============================================================================
/// Log @p msg to the Windows Event Log (source "ANSLogger") and to the
/// console streams.
/// @param msg     Message text (written verbatim).
/// @param isError true  -> EVENTLOG_ERROR_TYPE, event id 1004, stderr;
///                false -> EVENTLOG_INFORMATION_TYPE, event id 1000, stdout.
inline void logEngineEvent(const std::string& msg, bool isError = false) {
#ifdef _WIN32
    // Cache the event source handle to avoid RegisterEventSourceA/
    // DeregisterEventSource churn. Under heavy load (10+ AI tasks),
    // the per-message register/deregister cycle overwhelms the Windows
    // Event Log service, causing ERROR_UNRECOGNIZED_VOLUME (1005) floods
    // that freeze the system. (Handle is intentionally leaked at exit.)
    static HANDLE hLog = RegisterEventSourceA(nullptr, "ANSLogger");
    if (hLog) {
        const char* msgs[1] = { msg.c_str() };
        ReportEventA(hLog,
            isError ? EVENTLOG_ERROR_TYPE : EVENTLOG_INFORMATION_TYPE,
            0, isError ? 1004 : 1000, nullptr,
            1, 0, msgs, nullptr);
    }
#endif
    if (isError) {
        // Errors: keep std::endl so the message is flushed immediately even
        // if the process dies right after.
        std::cerr << msg << std::endl;
    } else {
        // Info: '\n' instead of std::endl — avoids a flush per message, which
        // matters under the same heavy load described above.
        std::cout << msg << '\n';
    }
}
// ============================================================================
// GpuDeviceInfo -- snapshot of one CUDA device captured at pool-init time.
// Returned by Engine<T>::enumerateDevices() and stored inside the pool.
// NOTE: exported across the DLL boundary (ENGINE_API) — member order/layout
// must stay stable.
// ============================================================================
struct ENGINE_API GpuDeviceInfo {
    int index = 0;                       ///< CUDA device ordinal
    std::string name;                    ///< Device name string
    size_t totalMemoryBytes = 0;         ///< Physical VRAM
    size_t freeMemoryAtInitBytes = 0;    ///< Free VRAM when pool started
    int computeMajor = 0;                ///< Compute capability (major)
    int computeMinor = 0;                ///< Compute capability (minor)
    int slotsAllocated = 0;              ///< Engine<T> instances on this GPU
    size_t memoryPerSlotBytes = 0;       ///< VRAM bytes each slot occupies
};
// ============================================================================
// Engine<T> — TensorRT inference engine with an optional multi-GPU slot pool.
// T is the output element type. Single-GPU callers use buildLoadNetwork() /
// loadNetwork() + runInference(); pool mode is opted into either via the
// extra maxSlotsPerGpu argument or via initializePool*(). Method bodies live
// in the EngineXxx.inl files referenced in the comments below.
// ============================================================================
template <typename T>
class Engine : public IEngine<T> {
public:
    Engine();
    Engine(const ANSCENTER::Options &options);
    ~Engine();
    // Build the onnx model into a TensorRT engine file, cache the model to disk
    // (to avoid rebuilding in future), and then load the model into memory The
    // default implementation will normalize values between [0.f, 1.f] Setting the
    // normalize flag to false will leave values between [0.f, 255.f] (some
    // converted models may require this). If the model requires values to be
    // normalized between [-1.f, 1.f], use the following params:
    // subVals = {0.5f, 0.5f, 0.5f};
    // divVals = {0.5f, 0.5f, 0.5f};
    // normalize = true;
    bool buildLoadNetwork(std::string onnxModelPath, const std::array<float, 3> &subVals = {0.f, 0.f, 0.f},
                          const std::array<float, 3> &divVals = {1.f, 1.f, 1.f}, bool normalize = true) override;
    // Load a TensorRT engine file from disk into memory
    // The default implementation will normalize values between [0.f, 1.f]
    // Setting the normalize flag to false will leave values between [0.f, 255.f]
    // (some converted models may require this). If the model requires values to
    // be normalized between [-1.f, 1.f], use the following params:
    // subVals = {0.5f, 0.5f, 0.5f};
    // divVals = {0.5f, 0.5f, 0.5f};
    // normalize = true;
    bool loadNetwork(std::string trtModelPath, const std::array<float, 3> &subVals = {0.f, 0.f, 0.f},
                     const std::array<float, 3> &divVals = {1.f, 1.f, 1.f}, bool normalize = true) override;
    // -- Multi-GPU pool overloads (non-virtual; call through Engine<T>* or
    // unique_ptr<Engine<T>>, not through IEngine<T>*).
    //
    // maxSlotsPerGpu == 0  -> BYPASS: identical to the 4-param single-GPU path
    //                         above. Used by OptimizeModelStr for temporary engines.
    // maxSlotsPerGpu == 1  -> ROUND-ROBIN (default): 1 slot per GPU, multi-GPU
    //                         round-robin dispatch. Tasks queue when all slots
    //                         busy. Best balance of VRAM usage and multi-GPU
    //                         utilisation.
    // maxSlotsPerGpu == -1 -> ELASTIC pool: 1 probe slot at init, additional
    //                         slots created on-demand when concurrent requests
    //                         arrive, released when idle via releaseIdleSlots().
    //                         Higher throughput but higher VRAM usage — only for
    //                         large GPUs (≥ 8 GB).
    // maxSlotsPerGpu > 1   -> PRE-ALLOCATED pool: cap at that many slots/GPU,
    //                         all created at init time.
    //
    // Usage -- opt into pool mode by adding ONE extra argument:
    //   m_trtEngine->buildLoadNetwork(path, sub, div, norm);      // single-GPU (unchanged)
    //   m_trtEngine->buildLoadNetwork(path, sub, div, norm, 1);   // round-robin (default)
    //   m_trtEngine->buildLoadNetwork(path, sub, div, norm, -1);  // elastic pool
    //   m_trtEngine->buildLoadNetwork(path, sub, div, norm, 4);   // pre-allocated, 4 slots/GPU
    bool buildLoadNetwork(std::string onnxModelPath,
                          const std::array<float, 3> &subVals,
                          const std::array<float, 3> &divVals,
                          bool normalize,
                          int maxSlotsPerGpu,
                          double memSafetyFactor = 0.80);
    bool loadNetwork(std::string trtModelPath,
                     const std::array<float, 3> &subVals,
                     const std::array<float, 3> &divVals,
                     bool normalize,
                     int maxSlotsPerGpu,
                     double memSafetyFactor = 0.80);
    // Run inference.
    // Input format [input][batch][cv::cuda::GpuMat]
    // Output format [batch][output][feature_vector]
    bool runInference(const std::vector<std::vector<cv::cuda::GpuMat>> &inputs, std::vector<std::vector<std::vector<T>>> &featureVectors) override;
    // Utility method for resizing an image while maintaining the aspect ratio by
    // adding padding to smaller dimension after scaling While letterbox padding
    // normally adds padding to top & bottom, or left & right sides, this
    // implementation only adds padding to the right or bottom side This is done
    // so that it's easier to convert detected coordinates (ex. YOLO model) back
    // to the original reference frame.
    static cv::cuda::GpuMat resizeKeepAspectRatioPadRightBottom(const cv::cuda::GpuMat &input, size_t height, size_t width,
                                                                const cv::Scalar &bgcolor = cv::Scalar(0, 0, 0));
    static cv::Mat cpuResizeKeepAspectRatioPadRightBottom(const cv::Mat &input, size_t height, size_t width,
                                                          const cv::Scalar &bgcolor = cv::Scalar(114, 114, 114));
    [[nodiscard]] const std::vector<nvinfer1::Dims3> &getInputDims() const override { return m_inputDims; };
    [[nodiscard]] const std::vector<nvinfer1::Dims> &getOutputDims() const override { return m_outputDims; };
    [[nodiscard]] const ANSCENTER::Options& getOptions() const { return m_options; }
    /// Return the CUDA device index that preprocessing should target.
    /// In pool mode this returns the device of the first alive slot;
    /// in single-engine mode it returns m_options.deviceIndex.
    /// Call cudaSetDevice(engine->getPreferredDeviceIndex()) before
    /// any cv::cuda::GpuMat operations to ensure multi-GPU correctness.
    [[nodiscard]] int getPreferredDeviceIndex() const {
        // Pool mode: use the first alive slot's device.
        // NOTE(review): m_slots is scanned WITHOUT taking m_slotMutex while
        // tryGrowPool may push_back concurrently. deque::push_back does not
        // invalidate references to existing elements, but this read is still
        // formally unsynchronized — confirm it is an intentional lock-free
        // fast path (locking here could deadlock if a .inl caller already
        // holds m_slotMutex).
        if (!m_slots.empty()) {
            for (const auto& s : m_slots) {
                if (s.engine) return s.deviceIndex;
            }
        }
        return m_options.deviceIndex;
    }
    /// Convenience: set the calling thread's CUDA device to match this engine.
    /// Skips the call if already on the correct device (WDDM overhead avoidance).
    void setDeviceContext() const {
        const int target = getPreferredDeviceIndex();
        int current = -1;
        cudaGetDevice(&current);
        if (current != target) {
            cudaSetDevice(target);
        }
    }
    // Utility method for transforming triple nested output array into 2D array
    // Should be used when the output batch size is 1, but there are multiple
    // output feature vectors
    static void transformOutput(std::vector<std::vector<std::vector<T>>> &input, std::vector<std::vector<T>> &output);
    // Utility method for transforming triple nested output array into single
    // array Should be used when the output batch size is 1, and there is only a
    // single output feature vector
    static void transformOutput(std::vector<std::vector<std::vector<T>>> &input, std::vector<T> &output);
    // Convert NHWC to NCHW and apply scaling and mean subtraction.
    // Pass a cv::cuda::Stream to run all GPU kernels on that stream instead of
    // the CUDA null (default) stream. This eliminates global sync barriers and
    // enables CUDA-graph-safe preprocessing on OpenCV 4.13+.
    static cv::cuda::GpuMat blobFromGpuMats(const std::vector<cv::cuda::GpuMat> &batchInput, const std::array<float, 3> &subVals,
                                            const std::array<float, 3> &divVals, bool normalize, bool swapRB = false,
                                            cv::cuda::Stream &stream = cv::cuda::Stream::Null());
    // Converts the engine options into a string
    std::string serializeEngineOptions(const ANSCENTER::Options& options, const std::string& onnxModelPath);
    // Build the network (single attempt)
    bool build(std::string onnxModelPath, const std::array<float, 3>& subVals, const std::array<float, 3>& divVals, bool normalize);
    // SEH-safe wrapper around build(). Catches access violations that corrupt
    // the CUDA context (e.g. opset 19+ ONNX models crash TRT's parser).
    // Returns false and sets *outSehCode on crash.
    bool buildSafe(std::string onnxModelPath, const std::array<float, 3>& subVals, const std::array<float, 3>& divVals, bool normalize, unsigned long* outSehCode);
    // Build with auto-retry for dynamic spatial dimension models.
    // Pre-analyzes the ONNX model; if H/W are dynamic and the max profile
    // is too large for GPU VRAM, automatically reduces max spatial dims and
    // retries until build succeeds or all candidates are exhausted.
    // Fixed-spatial models fall through to a single build() call.
    bool buildWithRetry(std::string onnxModelPath, const std::array<float, 3>& subVals, const std::array<float, 3>& divVals, bool normalize);
    void getDeviceNames(std::vector<std::string>& deviceNames);
    void clearGpuBuffers();
    int getBindingIndexByName(const std::string& name);
    void UpdateOptions(const ANSCENTER::Options& options) { m_options = options; }
    void warmUp(int iterations = 5);
    void setVerbose(bool v) { m_verbose = v; }
    void setDisableGraphs(bool v) { m_disableGraphs = v; }
    // -- Multi-GPU pool management (replaces MultiGpuEngineManager<T>) ---------
    //
    // Call ONE of the two initialize methods below instead of
    // buildLoadNetwork() / loadNetwork(), then use runInference() exactly as
    // you would for single-GPU. runInference() dispatches automatically to
    // the first idle slot across all GPUs.
    //
    // Single-GPU code that calls buildLoadNetwork() + runInference() directly
    // is completely unaffected -- it follows the existing code path unchanged.
    /**
     * Build from an ONNX file and create an Engine pool across all GPUs.
     *
     * @param baseOptions     Configuration template; deviceIndex is ignored
     *                        and overridden per slot.
     * @param onnxModelPath   Path to the .onnx model.
     * @param subVals         Per-channel subtraction for normalisation.
     * @param divVals         Per-channel divisor for normalisation.
     * @param normalize       Scale pixel values to [0, 1] before inference.
     * @param maxSlotsPerGpu  Cap slots per GPU (-1 = memory-limited only).
     * @param memSafetyFactor Fraction of free VRAM to consume (default 0.80).
     */
    bool initializePool(const ANSCENTER::Options& baseOptions,
                        const std::string& onnxModelPath,
                        const std::array<float, 3>& subVals = {0.f, 0.f, 0.f},
                        const std::array<float, 3>& divVals = {1.f, 1.f, 1.f},
                        bool normalize = true,
                        int maxSlotsPerGpu = -1,
                        double memSafetyFactor = 0.80);
    /**
     * Load a pre-built TRT engine and create a pool -- no ONNX build step.
     */
    bool initializePoolFromEngine(const ANSCENTER::Options& baseOptions,
                                  const std::string& trtEnginePath,
                                  const std::array<float, 3>& subVals = {0.f, 0.f, 0.f},
                                  const std::array<float, 3>& divVals = {1.f, 1.f, 1.f},
                                  bool normalize = true,
                                  int maxSlotsPerGpu = -1,
                                  double memSafetyFactor = 0.80);
    /** Enumerate all CUDA-capable GPUs without loading any model. */
    static std::vector<GpuDeviceInfo> enumerateDevices();
    /** Total Engine<T> slots across all GPUs (valid after initializePool). */
    int getTotalCapacity() const { return m_totalCapacity; }
    /** Slots currently executing inference (approximate, lock-free). */
    int getActiveInferences() const { return m_activeCount.load(); }
    /** Idle slots not currently running inference. */
    int getAvailableSlots() const { return m_totalCapacity - m_activeCount.load(); }
    /** True when every alive slot is busy.
     *  In elastic mode, new work will trigger on-demand slot creation
     *  rather than being rejected (unless all GPUs are physically full). */
    bool isAtCapacity() const { return getAvailableSlots() <= 0; }
    /** Device snapshots captured at initializePool() time. */
    const std::vector<GpuDeviceInfo>& getDeviceInfo() const { return m_deviceInfos; }
    /** Print a human-readable pool capacity and device report. */
    void printCapacityReport() const;
    /**
     * Release idle slots whose last inference was more than @p idleSeconds
     * ago. The probe slot (first slot) is never released so the model
     * remains instantly usable. Returns the number of slots released.
     *
     * Typical usage -- call periodically from a background timer:
     * @code
     *   int freed = engine->releaseIdleSlots(30.0);
     * @endcode
     */
    int releaseIdleSlots(double idleSeconds = 30.0);
    /** Actual max spatial dims from the loaded engine's optimization profile.
     *  Returns -1 if the engine is not loaded yet or dim is fixed. */
    int getProfileMaxHeight() const { return m_profileMaxHeight; }
    int getProfileMaxWidth() const { return m_profileMaxWidth; }
    /// Grow the pool by up to @p count slots (demand-driven, bypasses grace period).
    /// Returns the number of slots actually created.
    int growPool(int count);
private:
    // Serialises concurrent runInference() calls on the SAME Engine instance.
    // Without this, two threads with different batch sizes corrupt m_buffers,
    // m_lastBatchSize, and the TRT execution context — causing fatal "illegal
    // memory access" errors that permanently destroy the CUDA context.
    // Pool-mode slots have their own busy-flag dispatch, so this only matters
    // for the single-engine path (m_slots.empty()).
    mutable std::mutex m_inferenceMutex;
    int32_t m_lastBatchSize = -1;
    // BATCH STATE MANAGEMENT (THREAD-SAFE)
    //mutable std::mutex m_batchStateMutex; // Protects all batch-related state
    //size_t m_targetStableBatchSize = 4; // Default optimal batch size
    //int m_batchSizeChangeCounter = 0;
    //const int BATCH_STABILITY_THRESHOLD = 15; // Frames before allowing downsize
    //std::deque<size_t> m_recentBatchSizes;
    //const size_t BATCH_HISTORY_SIZE = 30;
    //std::chrono::steady_clock::time_point m_lastBatchSizeChange;
    //// Context reshape mutex (separate from batch state)
    //mutable std::mutex m_contextReshapeMutex;
    //// Helper methods
    //size_t DetermineOptimalBatchSize(size_t requestedSize);
    //std::vector<std::vector<cv::cuda::GpuMat>> PadBatchToSize(const std::vector<std::vector<cv::cuda::GpuMat>>& inputs,size_t targetSize);
    // Normalization, scaling, and mean subtraction of inputs
    std::array<float, 3> m_subVals{};
    std::array<float, 3> m_divVals{};
    // NOTE(review): no in-class initializer; assigned by the load paths and
    // the default ctor — verify the Options ctor path also sets it.
    bool m_normalize;
    // Holds pointers to the input and output GPU buffers
    std::vector<void *> m_buffers;
    std::vector<uint32_t> m_outputLengths{};
    std::vector<nvinfer1::Dims3> m_inputDims;
    std::vector<nvinfer1::Dims> m_outputDims;
    std::vector<std::string> m_IOTensorNames;
    int32_t m_inputBatchSize = 0;
    bool m_firstInferenceCall = true; // Per-instance first-call diagnostics flag
    bool m_batchShapeChangeLogged = false; // Suppress per-tensor shape dump after first batch-size change
    bool m_hasDynamicSpatialDims = false; // True if any input has dynamic H or W (-1)
    std::vector<nvinfer1::Dims> m_lastSetInputDims; // Cache of last setInputShape() dims per tensor — skip if unchanged
    int m_profileMaxHeight = -1; // Max H from optimization profile (after load)
    int m_profileMaxWidth = -1; // Max W from optimization profile (after load)
    // Must keep IRuntime around for inference, see:
    // https://forums.developer.nvidia.com/t/is-it-safe-to-deallocate-nvinfer1-iruntime-after-creating-an-nvinfer1-icudaengine-but-before-running-inference-with-said-icudaengine/255381/2?u=cyruspk4w6
    //
    // m_runtime and m_engine are shared_ptr to support TRTEngineCache:
    // multiple Engine<T> instances loading the same model share one ICudaEngine
    // in VRAM, each with its own IExecutionContext.
    std::shared_ptr<nvinfer1::IRuntime> m_runtime = nullptr;
    std::unique_ptr<Int8EntropyCalibrator2> m_calibrator = nullptr;
    std::shared_ptr<nvinfer1::ICudaEngine> m_engine = nullptr;
    std::unique_ptr<nvinfer1::IExecutionContext> m_context = nullptr;
    // TRTEngineCache tracking — used by destructor to release shared engine
    std::string m_cachedEnginePath;
    int m_cachedGpuIndex = -1;
    bool m_usingCachedEngine = false;
    bool m_skipEngineCache = false; // true = bypass TRTEngineCache (for OptimizeModelStr)
    bool m_skipOnnxRebuild = false; // true = don't delete/rebuild corrupt engine files (elastic growth)
    bool m_skipOnnxBuild = false; // true = skip ONNX→TRT build if no cached engine (demand growth)
    bool m_lastLoadFailedVRAM = false; // set by loadNetwork when failure is due to insufficient VRAM
    bool m_forceNoPool = false; // true = force single-GPU path, skip multi-GPU pool
    ANSCENTER::Options m_options;
    cudaStream_t m_cudaStream = nullptr;
    cudaStream_t m_inferenceStream;
    bool m_streamInitialized;
    //int32_t m_lastBatchSize;
    cudaStream_t m_memoryStream; // ADD THIS - separate stream for memory operations
    std::vector<cv::cuda::GpuMat> m_preprocessedInputs; // Keep inputs alive
    // Note: blobFromGpuMats and resizeKeepAspectRatioPadRightBottom are static,
    // so cached buffers use thread_local inside the functions themselves.
    // Thermal management (ADD THESE)
    //int m_consecutiveInferences;
    //int m_thermalCooldownThreshold;
    //int m_thermalCooldownDelayMs;
    bool m_profileInitialized;
    // -- GPU clock lock (NVML) -------------------------------------------------
    // Dynamically loaded NVML to lock GPU clocks and prevent power throttling.
    // No build-time dependency on nvml.lib — loads nvml.dll at runtime.
    bool m_clocksLocked = false;
    unsigned int m_nvmlDeviceIdx = 0;
    void* m_nvmlLib = nullptr; // LibHandle (HMODULE on Windows)
    void lockGpuClocks(int deviceIndex, int requestedMHz);
    void unlockGpuClocks();
    // -- CUDA Graph acceleration ---------------------------------------------
    // Pinned (page-locked) host buffers -- stable addresses required for graph
    // D2H copies. One pointer per output tensor, sized for maxBatchSize elements.
    std::vector<T*> m_pinnedOutputBuffers;
    std::vector<size_t> m_pinnedOutputBufElems; // elements per buffer
    // Cached executable graphs, keyed by batch size. A new graph is captured
    // the first time each batch size is seen; subsequent calls reuse it.
    std::unordered_map<int, cudaGraphExec_t> m_graphExecs;
    // Leak diagnostics — per-engine-instance counters for CUDA graph
    // create/destroy balance. Incremented in EngineRunInference.inl and
    // EngineBuildLoadNetwork.inl. Read by the [TRT_Leak] heartbeat in
    // runInference (fires ≤1×/60s per engine instance).
    // m_trtLeakNextLogTick stores a steady_clock epoch count for lock-free
    // compare_exchange window claim across concurrent inference threads.
    std::atomic<int64_t> m_trtGraphCreates{0};
    std::atomic<int64_t> m_trtGraphDestroys{0};
    std::atomic<long long> m_trtLeakNextLogTick{0};
    Logger m_logger;
    bool m_verbose{ true }; // false for non-probe pool slots
    bool m_disableGraphs{ true }; // DISABLED by default — concurrent graph launches + uploads cause GPU deadlock on WDDM
    // -- Multi-GPU pool data ---------------------------------------------------
    // Per-slot descriptor. Slot engines are separate Engine<T> instances;
    // they are never pooled recursively (their own m_slots deques are empty).
    //
    // Lifecycle in elastic mode:
    //   alive (engine != nullptr, busy == false) -- idle, ready for inference
    //   alive (engine != nullptr, busy == true)  -- running inference
    //   dead  (engine == nullptr)                -- released, VRAM freed
    // Dead entries stay in the deque to avoid invalidating pointers
    // held by other threads. tryGrowPool() reuses them.
    struct InferenceSlot {
        int deviceIndex = 0;
        bool busy = false;
        size_t memUsed = 0; ///< VRAM bytes this slot holds
        std::unique_ptr<Engine<T>> engine;
        std::chrono::steady_clock::time_point lastUsedTime
            = std::chrono::steady_clock::now(); ///< For idle-release
    };
    // Internal helpers -- implemented in EngineMultiGpu.inl
    bool loadSlots(const ANSCENTER::Options& baseOptions,
                   const std::string& modelPath,
                   const std::array<float, 3>& subVals,
                   const std::array<float, 3>& divVals,
                   bool normalize,
                   bool fromOnnx,
                   int maxSlotsPerGpu,
                   double memSafetyFactor);
    bool runInferenceFromPool(const std::vector<std::vector<cv::cuda::GpuMat>>& inputs,
                              std::vector<std::vector<std::vector<T>>>& featureVectors);
    InferenceSlot* tryGrowPool(bool bypassGrace = false);
    // -- Elastic pool storage ----------------------------------------------
    //
    // std::deque is used instead of std::vector because push_back on a deque
    // does NOT invalidate references to existing elements. This is critical
    // because runInferenceFromPool holds a raw InferenceSlot* across the GPU
    // call while another thread may call tryGrowPool -> push_back concurrently.
    std::deque<InferenceSlot> m_slots; ///< All managed Engine<T> instances
    std::vector<GpuDeviceInfo> m_deviceInfos; ///< Per-device info from init
    mutable std::mutex m_slotMutex; ///< Protects m_slots busy flags
    std::condition_variable m_slotFreeCv; ///< Notified when any slot becomes idle
    std::mutex m_growMutex; ///< Serialises slot creation
    std::atomic<int> m_activeCount{0}; ///< Running inference count
    std::atomic<int> m_totalCapacity{0}; ///< Alive slot count (atomic -- read lock-free by public API)
    std::atomic<size_t> m_nextSlotHint{0}; ///< Round-robin start index for slot scan
    // -- Elastic pool config (stored from loadSlots for on-demand growth) --
    size_t m_memPerSlot{0};
    std::string m_poolModelPath;
    std::array<float, 3> m_poolSubVals{};
    std::array<float, 3> m_poolDivVals{};
    bool m_poolNormalize{true};
    bool m_poolFromOnnx{false};
    double m_poolSafetyFactor{0.80};
    bool m_elasticMode{false}; ///< true when maxSlotsPerGpu == -1
    // -- Global elastic slot cap (shared across ALL pools on ALL GPUs) --
    // Prevents CUDA driver SRW lock convoy when too many concurrent contexts exist.
    // Budget: ~4 slots per GB of total GPU VRAM (computed once on first pool init).
    //   4 GB → 12, 6 GB → 24, 8 GB → 32, 12 GB → 48, 24 GB → 96
    static std::atomic<int> s_globalElasticCount; ///< Current total elastic slots
    static std::atomic<int> s_globalElasticMax; ///< Max total slots
    static std::once_flag s_globalCapInitFlag; ///< Ensures max is computed once
    // -- Elastic growth grace period --
    // Delays elastic growth for N seconds after the most recent pool creation,
    // giving later models time to create their probe engines before elastic
    // growth from earlier pools consumes all free VRAM.
    static constexpr int s_elasticGraceSec{5}; ///< Grace period in seconds (reduced from 30s: shared pools need fewer probes)
    static std::atomic<int64_t> s_lastPoolCreatedMs; ///< Timestamp of last pool creation (ms since epoch)
    // -- Automatic idle-slot cleanup timer (elastic mode only) --
    std::thread m_idleTimerThread;
    std::condition_variable m_idleTimerCv;
    std::mutex m_idleTimerMutex;
    std::atomic<bool> m_idleTimerStop{false};
    double m_idleTimerIntervalSec{60.0}; ///< How often to check
    double m_idleTimerThresholdSec{600.0}; ///< Idle time (seconds) before releasing — 10 minutes to avoid churn
    void startIdleTimer();
    void stopIdleTimer();
};
/// Construct with a caller-supplied configuration. Mirrors the default ctor's
/// member resets so both construction paths leave the object in one state.
template <typename T> Engine<T>::Engine(const ANSCENTER::Options &options) : m_options(options) {
    NvDynLoader::Initialize(); // no-op after first call; safe to call repeatedly
    m_streamInitialized = false;
    m_inferenceStream = nullptr;
    m_lastBatchSize = -1;
    m_inputBatchSize = 0;
    m_memoryStream = nullptr;
    // FIX: m_normalize has no in-class initializer and was only assigned by the
    // default ctor — this overload left it uninitialized (UB if read before a
    // load call). Match the default ctor's value.
    m_normalize = true;
    m_firstInferenceCall = true;
    m_batchShapeChangeLogged = false;
    m_hasDynamicSpatialDims = false;
    m_profileInitialized = false;
    m_lastSetInputDims.clear();
    m_preprocessedInputs.clear();
}
/// Default-construct an engine; m_options keeps its own defaults.
/// Resets all per-instance bookkeeping to a known-clean state.
template <typename T> Engine<T>::Engine() {
    NvDynLoader::Initialize(); // idempotent — safe on every construction
    // CUDA stream state
    m_inferenceStream = nullptr;
    m_memoryStream = nullptr;
    m_streamInitialized = false;
    // Batch / inference bookkeeping
    m_inputBatchSize = 0;
    m_lastBatchSize = -1;
    m_firstInferenceCall = true;
    m_batchShapeChangeLogged = false;
    // Model-shape state
    m_hasDynamicSpatialDims = false;
    m_profileInitialized = false;
    m_lastSetInputDims.clear();
    // Default preprocessing behaviour: scale pixels into [0, 1]
    m_normalize = true;
}
template <typename T> Engine<T>::~Engine() {
    // =========================================================================
    // CRITICAL: Destructors must NEVER throw. Every CUDA/TRT call below is
    // wrapped in try/catch so that a failure in one step does not skip the
    // remaining cleanup (especially m_engine / m_runtime reset, which release
    // TensorRT file handles and VRAM).
    //
    // Previous bug: clearGpuBuffers() called Util::checkCudaErrorCode() which
    // throws on cudaFree failure. That aborted the destructor mid-way,
    // leaving ICudaEngine/IRuntime alive → model folder locked on disk.
    //
    // Cleanup order is deliberate: timer → graphs → host/GPU buffers →
    // streams → clock lock → context → cache release → engine → runtime.
    // =========================================================================
    // Stop the idle-slot cleanup timer (if running) before touching pool state
    try { stopIdleTimer(); } catch (...) {}
    // Destroy cached CUDA graphs (one exec per batch size seen at runtime)
    try {
        for (auto& [bs, ge] : m_graphExecs) {
            if (ge) {
                cudaGraphExecDestroy(ge);
                // Keep the create/destroy leak counters balanced
                m_trtGraphDestroys.fetch_add(1, std::memory_order_relaxed);
            }
        }
        m_graphExecs.clear();
    } catch (...) {}
    // Free pinned output buffers
    try {
        for (T* p : m_pinnedOutputBuffers) { if (p) cudaFreeHost(p); }
        m_pinnedOutputBuffers.clear();
    } catch (...) {}
    // Free GPU I/O buffers — use cudaFree directly (NOT checkCudaErrorCode which throws)
    try {
        if (!m_buffers.empty()) {
            for (void* ptr : m_buffers) {
                if (ptr) cudaFree(ptr); // ignore errors — destructor must not throw
            }
            m_buffers.clear();
        }
    } catch (...) {}
    try {
        if (m_cudaStream) {
            cudaStreamDestroy(m_cudaStream);
            m_cudaStream = nullptr;
        }
    } catch (...) {}
    try {
        if (m_streamInitialized && m_inferenceStream) {
            cudaStreamDestroy(m_inferenceStream);
            m_inferenceStream = nullptr;
            m_streamInitialized = false;
        }
    } catch (...) {}
    try {
        if (m_memoryStream) {
            cudaStreamDestroy(m_memoryStream);
            m_memoryStream = nullptr;
        }
    } catch (...) {}
    // Release GPU clock lock if we hold one
    try { unlockGpuClocks(); } catch (...) {}
    // TRT destruction ordering: context MUST be destroyed before engine.
    // With shared_ptr, m_engine may outlive this instance (other tasks share it).
    // Explicit reset ensures correct order.
    try { m_context.reset(); } catch (...) {}
    // Release from TRTEngineCache (decrements refcount; engine freed when last user releases)
    try {
        if (m_usingCachedEngine && !m_cachedEnginePath.empty()) {
            TRTEngineCache::instance().release(m_cachedEnginePath, m_cachedGpuIndex);
        }
    } catch (...) {}
    // These MUST execute to release TensorRT file handles and VRAM.
    // Previously, a throw in clearGpuBuffers() skipped these lines,
    // leaving the model folder locked on disk.
    try { m_engine.reset(); } catch (...) {}
    try { m_runtime.reset(); } catch (...) {}
}
// ============================================================================
// NVML GPU Clock Lock — dynamically loaded, no build-time nvml.lib needed
// ============================================================================
// NVML type / constant definitions (subset — avoids #include <nvml.h>)
// NOTE: the enum underlying types mirror the real NVML ABI; these types appear
// in the function-pointer signatures resolved from nvml.dll, so do not change
// them.
namespace nvml_types {
    using Return_t = unsigned int; // nvmlReturn_t
    using Device_t = void*; // nvmlDevice_t (opaque pointer)
    enum : unsigned int { SUCCESS = 0 }; // NVML_SUCCESS
    enum ClockType : unsigned int { CLOCK_GRAPHICS = 0 }; // NVML_CLOCK_GRAPHICS
}
/// Lock the GPU graphics clock via NVML so power management cannot down-clock
/// the card between inferences.
/// @param deviceIndex  CUDA/NVML device ordinal to lock.
/// @param requestedMHz 0 = feature disabled; <0 = auto (~80% of max graphics
///                     clock); >0 = explicit target frequency in MHz.
template <typename T>
void Engine<T>::lockGpuClocks(int deviceIndex, int requestedMHz) {
    if (requestedMHz == 0) return; // disabled
#ifdef _WIN32
    // Load nvml.dll from System32 (installed by NVIDIA driver)
    auto hLib = LoadLibraryA("nvml.dll");
    if (!hLib) {
        if (m_verbose) std::cout << "Warning: nvml.dll not found — GPU clock lock unavailable" << std::endl;
        return;
    }
    m_nvmlLib = static_cast<void*>(hLib);
    // Resolve NVML function pointers
    using fn_Init = nvml_types::Return_t(*)();
    using fn_Shutdown = nvml_types::Return_t(*)();
    using fn_GetHandle = nvml_types::Return_t(*)(unsigned int, nvml_types::Device_t*);
    using fn_GetMaxClock = nvml_types::Return_t(*)(nvml_types::Device_t, nvml_types::ClockType, unsigned int*);
    using fn_SetLocked = nvml_types::Return_t(*)(nvml_types::Device_t, unsigned int, unsigned int);
    using fn_ErrorString = const char*(*)(nvml_types::Return_t);
    auto pInit = reinterpret_cast<fn_Init>(GetProcAddress(hLib, "nvmlInit_v2"));
    auto pShutdown = reinterpret_cast<fn_Shutdown>(GetProcAddress(hLib, "nvmlShutdown"));
    auto pGetHandle = reinterpret_cast<fn_GetHandle>(GetProcAddress(hLib, "nvmlDeviceGetHandleByIndex_v2"));
    auto pGetMaxClk = reinterpret_cast<fn_GetMaxClock>(GetProcAddress(hLib, "nvmlDeviceGetMaxClockInfo"));
    auto pSetLocked = reinterpret_cast<fn_SetLocked>(GetProcAddress(hLib, "nvmlDeviceSetGpuLockedClocks"));
    auto pErrStr = reinterpret_cast<fn_ErrorString>(GetProcAddress(hLib, "nvmlErrorString"));
    // FIX: pShutdown is called unconditionally on every error path below, so
    // it must be part of the mandatory symbol check. Previously a missing
    // nvmlShutdown export meant a null-pointer call on the first failure.
    if (!pInit || !pShutdown || !pGetHandle || !pSetLocked) {
        if (m_verbose) std::cout << "Warning: NVML symbols not found — GPU clock lock unavailable" << std::endl;
        FreeLibrary(hLib); m_nvmlLib = nullptr;
        return;
    }
    auto errName = [&](nvml_types::Return_t r) -> const char* {
        return pErrStr ? pErrStr(r) : "unknown";
    };
    nvml_types::Return_t rc = pInit();
    if (rc != nvml_types::SUCCESS) {
        if (m_verbose) std::cout << "Warning: nvmlInit failed: " << errName(rc) << std::endl;
        FreeLibrary(hLib); m_nvmlLib = nullptr;
        return;
    }
    nvml_types::Device_t nvmlDev = nullptr;
    rc = pGetHandle(static_cast<unsigned int>(deviceIndex), &nvmlDev);
    if (rc != nvml_types::SUCCESS) {
        if (m_verbose) std::cout << "Warning: nvmlDeviceGetHandleByIndex(" << deviceIndex << ") failed: " << errName(rc) << std::endl;
        pShutdown(); FreeLibrary(hLib); m_nvmlLib = nullptr;
        return;
    }
    unsigned int targetMHz = 0;
    if (requestedMHz < 0) {
        // Auto mode: query max graphics clock and use ~80%
        // FIX: previously, when nvmlDeviceGetMaxClockInfo was not exported,
        // execution fell through with targetMHz == 0 and asked the driver to
        // lock the GPU at 0 MHz. Bail out instead when the query is
        // unavailable or fails.
        unsigned int maxClock = 0;
        if (pGetMaxClk) {
            rc = pGetMaxClk(nvmlDev, nvml_types::CLOCK_GRAPHICS, &maxClock);
        }
        if (!pGetMaxClk || rc != nvml_types::SUCCESS || maxClock == 0) {
            if (m_verbose) std::cout << "Warning: Could not query max GPU clock — skipping clock lock" << std::endl;
            pShutdown(); FreeLibrary(hLib); m_nvmlLib = nullptr;
            return;
        }
        targetMHz = static_cast<unsigned int>(maxClock * 0.80);
        targetMHz = (targetMHz / 15) * 15; // Round down to 15 MHz step
        if (targetMHz < 210) targetMHz = 210; // keep a sane floor
    } else {
        targetMHz = static_cast<unsigned int>(requestedMHz);
    }
    rc = pSetLocked(nvmlDev, targetMHz, targetMHz);
    if (rc == nvml_types::SUCCESS) {
        m_clocksLocked = true;
        m_nvmlDeviceIdx = static_cast<unsigned int>(deviceIndex);
        // Always emit to DebugView so operators can confirm the lock took
        // effect without needing to read engine-level verbose output.
        // (nvml.dll stays loaded and NVML stays initialized on purpose —
        // unlockGpuClocks() needs both to undo the lock later.)
        ANS_DBG("TRT_Clock",
                "GPU clocks LOCKED at %u MHz (device %d) — P-state will stay high, "
                "no WDDM down-clock between inferences",
                targetMHz, deviceIndex);
        if (m_verbose) std::cout << "Info: GPU clocks locked at " << targetMHz << " MHz (device " << deviceIndex << ")" << std::endl;
    } else {
        // Surface the failure reason + remediation in DebugView. Most common
        // failure is access-denied (requires Administrator) or the driver
        // refusing the requested frequency. Users see this in the log and
        // know to elevate, set NVCP 'Prefer maximum performance', or run
        // `nvidia-smi -lgc <MHz>,<MHz>` before launching.
        ANS_DBG("TRT_Clock",
                "GPU clock lock FAILED (nvml rc=%s) — expect 2-3x inference latency from "
                "WDDM down-clocking. Fix: run as Admin, OR set NVCP 'Prefer maximum "
                "performance' for this app, OR: nvidia-smi -lgc %u,%u",
                errName(rc), targetMHz, targetMHz);
        if (m_verbose) {
            std::cout << "Warning: nvmlDeviceSetGpuLockedClocks failed: " << errName(rc) << std::endl;
            std::cout << " (Run as Administrator, or use: nvidia-smi -lgc " << targetMHz << "," << targetMHz << ")" << std::endl;
        }
        pShutdown(); FreeLibrary(hLib); m_nvmlLib = nullptr;
    }
#else
    // Linux: similar pattern with dlopen("libnvidia-ml.so.1", RTLD_LAZY)
    // Left as TODO — nvidia-smi -lgc works without elevation on Linux.
    if (m_verbose) std::cout << "Info: GPU clock lock not implemented on this platform — use nvidia-smi -lgc" << std::endl;
#endif
}
template <typename T>
void Engine<T>::unlockGpuClocks() {
    // No-op unless lockGpuClocks() previously succeeded and NVML is still
    // loaded; both flags are cleared together on every exit path below.
    if (!m_clocksLocked || !m_nvmlLib) return;
#ifdef _WIN32
    const auto hNvml = static_cast<HMODULE>(m_nvmlLib);
    // Function-pointer signatures for the three NVML entry points we need.
    using fn_GetHandle = nvml_types::Return_t(*)(unsigned int, nvml_types::Device_t*);
    using fn_ResetClk  = nvml_types::Return_t(*)(nvml_types::Device_t);
    using fn_Shutdown  = nvml_types::Return_t(*)();
    const auto pGetHandle = reinterpret_cast<fn_GetHandle>(GetProcAddress(hNvml, "nvmlDeviceGetHandleByIndex_v2"));
    const auto pResetClk  = reinterpret_cast<fn_ResetClk>(GetProcAddress(hNvml, "nvmlDeviceResetGpuLockedClocks"));
    const auto pShutdown  = reinterpret_cast<fn_Shutdown>(GetProcAddress(hNvml, "nvmlShutdown"));
    // Best-effort reset: if either symbol is missing we skip the reset but
    // still shut NVML down and release the library so state stays consistent.
    if (pGetHandle != nullptr && pResetClk != nullptr) {
        nvml_types::Device_t nvmlDev = nullptr;
        const bool gotHandle = (pGetHandle(m_nvmlDeviceIdx, &nvmlDev) == nvml_types::SUCCESS);
        if (gotHandle) {
            pResetClk(nvmlDev);
            if (m_verbose) std::cout << "Info: GPU clocks unlocked (device " << m_nvmlDeviceIdx << ")" << std::endl;
        }
    }
    if (pShutdown) pShutdown();
    FreeLibrary(hNvml);
#endif
    m_nvmlLib = nullptr;
    m_clocksLocked = false;
}
namespace Util {
    /// Returns true iff `filepath` names an existing regular file.
    ///
    /// Fixed: the previous implementation probed existence by opening the
    /// path with std::ifstream, which (a) reports true for directories on
    /// POSIX, (b) reports false for files that exist but are unreadable, and
    /// (c) pays the cost of actually opening the file. A non-throwing
    /// std::filesystem query answers the question directly.
    inline bool doesFileExist(const std::string& filepath) {
        std::error_code ec; // swallow stat errors: "cannot stat" => "does not exist"
        return std::filesystem::is_regular_file(filepath, ec);
    }
    /// Logs a diagnostic when a CUDA runtime call did not return cudaSuccess.
    /// Intentionally non-throwing (best-effort logging only) — callers of the
    /// original also did not expect an exception from this helper.
    inline void checkCudaErrorCode(cudaError_t code) {
        if (code != cudaSuccess) {
            std::string errMsg = "CUDA operation failed with code: " + std::to_string(code) + " (" + cudaGetErrorName(code) +
                                 "), with message: " + cudaGetErrorString(code);
            // Fixed: write to stderr (not stdout) and terminate/flush the
            // line so the message is not lost in buffered or interleaved
            // output if the process dies shortly after.
            std::cerr << errMsg << std::endl;
        }
    }
    /// Returns the full paths of all regular files directly inside `dirPath`
    /// (non-recursive).
    ///
    /// Fixed: directory_iterator throws std::filesystem::filesystem_error on
    /// a missing/non-directory path; guard first and return an empty list in
    /// that case instead of propagating an exception.
    inline std::vector<std::string> getFilesInDirectory(const std::string& dirPath) {
        std::vector<std::string> fileNames;
        std::error_code ec;
        if (!std::filesystem::is_directory(dirPath, ec)) {
            return fileNames; // invalid or absent path yields no entries
        }
        for (const auto& entry : std::filesystem::directory_iterator(dirPath, ec)) {
            if (entry.is_regular_file()) {
                fileNames.push_back(entry.path().string());
            }
        }
        return fileNames;
    }
}
#include "engine/EngineRunInference.inl"
#include "engine/EngineUtilities.inl"
#include "engine/EngineBuildLoadNetwork.inl"
#include "engine/EngineMultiGpu.inl"
// ============================================================================
// Explicit template instantiation declarations -- DLL consumers only
//
// These tell each consuming translation unit: "Engine<float> is already fully
// compiled and exported by the DLL. Do NOT re-instantiate it here; resolve
// the symbols at link time from the DLL's import library instead."
//
// This suppresses redundant code generation in every consuming TU and
// guarantees that only ONE definition of each Engine<T> specialisation exists
// across the whole program (in the DLL).
//
// To add a new exported instantiation (e.g. Engine<uint8_t>):
// 1. Add a matching template class ENGINE_API Engine<uint8_t>; in engine.cpp
// 2. Add the corresponding extern declaration below.
// ============================================================================
#ifndef ENGINE_EXPORTS
// Consumer-side declaration only: the one definition is produced inside the
// DLL, where engine.cpp builds with ENGINE_EXPORTS and provides the matching
// explicit instantiation.
extern template class ENGINE_API Engine<float>;
#endif