// File: ANSCORE/modules/ANSODEngine/engine.h
// (816 lines, 39 KiB, C++)
#pragma once
#include "engine_api.h" // ENGINE_API (__declspec dllexport / dllimport)
// -- TRT / ONNX-parser API decoration override -----------------------------
// Must appear BEFORE the first inclusion of NvInfer.h / NvOnnxParser.h.
//
// By default these macros expand to __declspec(dllimport) for consumers of
// the TRT DLL, which causes the compiler to emit indirect import-table calls.
// Overriding them to empty makes all calls direct, so the linker resolves
// them against the extern "C" stub definitions in NvDynLoader.cpp instead of
// against nvinfer_XX.lib / nvonnxparser_XX.lib.
//
// Remove nvinfer_XX.lib and nvonnxparser_XX.lib from Linker -> Input.
// Link ANSLibsLoader.lib instead. See NvDynLoader.h for full details.
#ifndef TENSORRTAPI
# define TENSORRTAPI
#endif
#ifndef NVONNXPARSER_API
# define NVONNXPARSER_API
#endif
#include "NvOnnxParser.h"
// -- C++ standard library --------------------------------------------------
#include <atomic>
#include <chrono>
#include <condition_variable>
#include <deque>
#include <filesystem>
#include <fstream>
#include <iostream>   // std::cout / std::cerr — used by logEngineEvent and Util::checkCudaErrorCode
#include <memory>
#include <mutex>
#include <string>
#include <thread>
#include <unordered_map>
// -- CUDA runtime ----------------------------------------------------------
#include <cuda_fp16.h>
#include <cuda_runtime.h>
// -- OpenCV (CUDA modules) -------------------------------------------------
#include <opencv2/core/cuda.hpp>
#include <opencv2/core/cuda_stream_accessor.hpp>
#include <opencv2/cudaarithm.hpp>
#include <opencv2/cudaimgproc.hpp>
#include <opencv2/cudawarping.hpp>
#include <opencv2/opencv.hpp>
// -- Project headers -------------------------------------------------------
#include "engine/TRTEngineCache.h"
#include "interfaces\IEngine.h"
#include "ANSLicense.h"
#include "Int8Calibrator.h"
#include "NvDynLoader.h" // from ANSLibsLoader/include/
#ifdef _WIN32
#include <windows.h>
#endif
// Class to extend TensorRT logger
// Class to extend TensorRT logger.
// TensorRT invokes log() for every build/runtime diagnostic; the
// implementation (in the DLL) decides how messages are filtered/emitted.
class ENGINE_API Logger : public nvinfer1::ILogger {
    // Callback required by nvinfer1::ILogger; must not throw (noexcept).
    void log(Severity severity, const char *msg) noexcept override;
};
// ============================================================================
// Diagnostic logging to Windows Event Viewer (ANSLogger source).
// Allows internal engine failure points to be traced even when running
// under LabView which swallows stdout/stderr.
// ============================================================================
/// Write a diagnostic message to the Windows Event Viewer (source "ANSLogger")
/// and mirror it on the console. On non-Windows builds only the console
/// output happens.
/// @param msg      Text to log.
/// @param isError  true -> error severity (event id 1004, stderr);
///                 false -> informational (event id 1000, stdout).
inline void logEngineEvent(const std::string& msg, bool isError = false) {
#ifdef _WIN32
    // Register the event source exactly once for the process lifetime.
    // Re-registering per message overwhelms the Event Log service under
    // heavy load (10+ AI tasks), producing ERROR_UNRECOGNIZED_VOLUME (1005)
    // floods that freeze the system — hence the cached static handle.
    static HANDLE s_eventSource = RegisterEventSourceA(nullptr, "ANSLogger");
    if (s_eventSource != nullptr) {
        const char* strings[1];
        strings[0] = msg.c_str();
        const WORD  type    = isError ? EVENTLOG_ERROR_TYPE : EVENTLOG_INFORMATION_TYPE;
        const DWORD eventId = isError ? 1004 : 1000;
        ReportEventA(s_eventSource, type, 0, eventId, nullptr, 1, 0, strings, nullptr);
    }
#endif
    // Mirror to the console so interactive runs still see the message.
    std::ostream& console = isError ? std::cerr : std::cout;
    console << msg << std::endl;
}
// ============================================================================
// GpuDeviceInfo -- snapshot of one CUDA device captured at pool-init time.
// Returned by Engine<T>::enumerateDevices() and stored inside the pool.
// ============================================================================
struct ENGINE_API GpuDeviceInfo {
    int index = 0;                      ///< CUDA device ordinal
    std::string name;                   ///< Device name reported by the driver
    size_t totalMemoryBytes = 0;        ///< Physical VRAM
    size_t freeMemoryAtInitBytes = 0;   ///< Free VRAM when pool started
    int computeMajor = 0;               ///< Compute capability, major
    int computeMinor = 0;               ///< Compute capability, minor
    int slotsAllocated = 0;             ///< Engine<T> instances on this GPU
    size_t memoryPerSlotBytes = 0;      ///< VRAM bytes each slot occupies
};
// Templated TensorRT engine wrapper. T is the element type of the output
// feature vectors (Engine<float> is the exported instantiation).
// One instance is either a single-GPU engine OR a multi-GPU pool of slot
// engines (see m_slots) — never both.
template <typename T>
class Engine : public IEngine<T> {
public:
    Engine();
    Engine(const ANSCENTER::Options &options);
    ~Engine();
    // Build the onnx model into a TensorRT engine file, cache the model to disk
    // (to avoid rebuilding in future), and then load the model into memory The
    // default implementation will normalize values between [0.f, 1.f] Setting the
    // normalize flag to false will leave values between [0.f, 255.f] (some
    // converted models may require this). If the model requires values to be
    // normalized between [-1.f, 1.f], use the following params:
    // subVals = {0.5f, 0.5f, 0.5f};
    // divVals = {0.5f, 0.5f, 0.5f};
    // normalize = true;
    bool buildLoadNetwork(std::string onnxModelPath, const std::array<float, 3> &subVals = {0.f, 0.f, 0.f},
                          const std::array<float, 3> &divVals = {1.f, 1.f, 1.f}, bool normalize = true) override;
    // Load a TensorRT engine file from disk into memory
    // The default implementation will normalize values between [0.f, 1.f]
    // Setting the normalize flag to false will leave values between [0.f, 255.f]
    // (some converted models may require this). If the model requires values to
    // be normalized between [-1.f, 1.f], use the following params:
    // subVals = {0.5f, 0.5f, 0.5f};
    // divVals = {0.5f, 0.5f, 0.5f};
    // normalize = true;
    bool loadNetwork(std::string trtModelPath, const std::array<float, 3> &subVals = {0.f, 0.f, 0.f},
                     const std::array<float, 3> &divVals = {1.f, 1.f, 1.f}, bool normalize = true) override;
    // -- Multi-GPU pool overloads (non-virtual; call through Engine<T>* or
    // unique_ptr<Engine<T>>, not through IEngine<T>*).
    //
    // maxSlotsPerGpu == 0  -> BYPASS: identical to the 4-param single-GPU path
    //                         above. Used by OptimizeModelStr for temporary engines.
    // maxSlotsPerGpu == 1  -> ROUND-ROBIN (default): 1 slot per GPU, multi-GPU
    //                         round-robin dispatch. Tasks queue when all slots
    //                         busy. Best balance of VRAM usage and multi-GPU
    //                         utilisation.
    // maxSlotsPerGpu == -1 -> ELASTIC pool: 1 probe slot at init, additional
    //                         slots created on-demand when concurrent requests
    //                         arrive, released when idle via releaseIdleSlots().
    //                         Higher throughput but higher VRAM usage — only for
    //                         large GPUs (≥ 8 GB).
    // maxSlotsPerGpu > 1   -> PRE-ALLOCATED pool: cap at that many slots/GPU,
    //                         all created at init time.
    //
    // Usage -- opt into pool mode by adding ONE extra argument:
    //   m_trtEngine->buildLoadNetwork(path, sub, div, norm);      // single-GPU (unchanged)
    //   m_trtEngine->buildLoadNetwork(path, sub, div, norm, 1);   // round-robin (default)
    //   m_trtEngine->buildLoadNetwork(path, sub, div, norm, -1);  // elastic pool
    //   m_trtEngine->buildLoadNetwork(path, sub, div, norm, 4);   // pre-allocated, 4 slots/GPU
    bool buildLoadNetwork(std::string onnxModelPath,
                          const std::array<float, 3> &subVals,
                          const std::array<float, 3> &divVals,
                          bool normalize,
                          int maxSlotsPerGpu,
                          double memSafetyFactor = 0.80);
    bool loadNetwork(std::string trtModelPath,
                     const std::array<float, 3> &subVals,
                     const std::array<float, 3> &divVals,
                     bool normalize,
                     int maxSlotsPerGpu,
                     double memSafetyFactor = 0.80);
    // Run inference.
    // Input format [input][batch][cv::cuda::GpuMat]
    // Output format [batch][output][feature_vector]
    bool runInference(const std::vector<std::vector<cv::cuda::GpuMat>> &inputs, std::vector<std::vector<std::vector<T>>> &featureVectors) override;
    // Utility method for resizing an image while maintaining the aspect ratio by
    // adding padding to smaller dimension after scaling While letterbox padding
    // normally adds padding to top & bottom, or left & right sides, this
    // implementation only adds padding to the right or bottom side This is done
    // so that it's easier to convert detected coordinates (ex. YOLO model) back
    // to the original reference frame.
    static cv::cuda::GpuMat resizeKeepAspectRatioPadRightBottom(const cv::cuda::GpuMat &input, size_t height, size_t width,
                                                                const cv::Scalar &bgcolor = cv::Scalar(0, 0, 0));
    // CPU variant of the above (note the different default pad colour: 114 is
    // the conventional YOLO letterbox grey).
    static cv::Mat cpuResizeKeepAspectRatioPadRightBottom(const cv::Mat &input, size_t height, size_t width,
                                                          const cv::Scalar &bgcolor = cv::Scalar(114, 114, 114));
    [[nodiscard]] const std::vector<nvinfer1::Dims3> &getInputDims() const override { return m_inputDims; };
    [[nodiscard]] const std::vector<nvinfer1::Dims> &getOutputDims() const override { return m_outputDims; };
    [[nodiscard]] const ANSCENTER::Options& getOptions() const { return m_options; }
    /// Return the CUDA device index that preprocessing should target.
    /// In pool mode this returns the device of the first alive slot;
    /// in single-engine mode it returns m_options.deviceIndex.
    /// Call cudaSetDevice(engine->getPreferredDeviceIndex()) before
    /// any cv::cuda::GpuMat operations to ensure multi-GPU correctness.
    [[nodiscard]] int getPreferredDeviceIndex() const {
        // Pool mode: use the first alive slot's device.
        // NOTE(review): m_slots is scanned without taking m_slotMutex. Deque
        // growth keeps references stable, but a concurrent slot release could
        // race with this read — confirm this is benign.
        if (!m_slots.empty()) {
            for (const auto& s : m_slots) {
                if (s.engine) return s.deviceIndex;
            }
        }
        return m_options.deviceIndex;
    }
    /// Convenience: set the calling thread's CUDA device to match this engine.
    /// Skips the call if already on the correct device (WDDM overhead avoidance).
    void setDeviceContext() const {
        const int target = getPreferredDeviceIndex();
        int current = -1;
        cudaGetDevice(&current);
        if (current != target) {
            cudaSetDevice(target);
        }
    }
    // Utility method for transforming triple nested output array into 2D array
    // Should be used when the output batch size is 1, but there are multiple
    // output feature vectors
    static void transformOutput(std::vector<std::vector<std::vector<T>>> &input, std::vector<std::vector<T>> &output);
    // Utility method for transforming triple nested output array into single
    // array Should be used when the output batch size is 1, and there is only a
    // single output feature vector
    static void transformOutput(std::vector<std::vector<std::vector<T>>> &input, std::vector<T> &output);
    // Convert NHWC to NCHW and apply scaling and mean subtraction.
    // Pass a cv::cuda::Stream to run all GPU kernels on that stream instead of
    // the CUDA null (default) stream. This eliminates global sync barriers and
    // enables CUDA-graph-safe preprocessing on OpenCV 4.13+.
    static cv::cuda::GpuMat blobFromGpuMats(const std::vector<cv::cuda::GpuMat> &batchInput, const std::array<float, 3> &subVals,
                                            const std::array<float, 3> &divVals, bool normalize, bool swapRB = false,
                                            cv::cuda::Stream &stream = cv::cuda::Stream::Null());
    // Converts the engine options into a string (used to derive the on-disk
    // engine-cache file name for a given model + option set).
    std::string serializeEngineOptions(const ANSCENTER::Options& options, const std::string& onnxModelPath);
    // Build the network (single attempt)
    bool build(std::string onnxModelPath, const std::array<float, 3>& subVals, const std::array<float, 3>& divVals, bool normalize);
    // SEH-safe wrapper around build(). Catches access violations that corrupt
    // the CUDA context (e.g. opset 19+ ONNX models crash TRT's parser).
    // Returns false and sets *outSehCode on crash.
    bool buildSafe(std::string onnxModelPath, const std::array<float, 3>& subVals, const std::array<float, 3>& divVals, bool normalize, unsigned long* outSehCode);
    // Build with auto-retry for dynamic spatial dimension models.
    // Pre-analyzes the ONNX model; if H/W are dynamic and the max profile
    // is too large for GPU VRAM, automatically reduces max spatial dims and
    // retries until build succeeds or all candidates are exhausted.
    // Fixed-spatial models fall through to a single build() call.
    bool buildWithRetry(std::string onnxModelPath, const std::array<float, 3>& subVals, const std::array<float, 3>& divVals, bool normalize);
    // Fill deviceNames with the names of all CUDA devices on this machine.
    void getDeviceNames(std::vector<std::string>& deviceNames);
    // Free the GPU I/O buffers held in m_buffers.
    void clearGpuBuffers();
    // Map a TRT I/O tensor name to its binding index (-1 style lookup lives in the .inl).
    int getBindingIndexByName(const std::string& name);
    // Replace the stored options wholesale (no re-load is triggered).
    void UpdateOptions(const ANSCENTER::Options& options) { m_options = options; }
    // Run a few dummy inferences to stabilise clocks/allocations before timing.
    void warmUp(int iterations = 5);
    void setVerbose(bool v) { m_verbose = v; }
    void setDisableGraphs(bool v) { m_disableGraphs = v; }
    // -- Multi-GPU pool management (replaces MultiGpuEngineManager<T>) ---------
    //
    // Call ONE of the two initialize methods below instead of
    // buildLoadNetwork() / loadNetwork(), then use runInference() exactly as
    // you would for single-GPU. runInference() dispatches automatically to
    // the first idle slot across all GPUs.
    //
    // Single-GPU code that calls buildLoadNetwork() + runInference() directly
    // is completely unaffected -- it follows the existing code path unchanged.
    /**
     * Build from an ONNX file and create an Engine pool across all GPUs.
     *
     * @param baseOptions     Configuration template; deviceIndex is ignored
     *                        and overridden per slot.
     * @param onnxModelPath   Path to the .onnx model.
     * @param subVals         Per-channel subtraction for normalisation.
     * @param divVals         Per-channel divisor for normalisation.
     * @param normalize       Scale pixel values to [0, 1] before inference.
     * @param maxSlotsPerGpu  Cap slots per GPU (-1 = memory-limited only).
     * @param memSafetyFactor Fraction of free VRAM to consume (default 0.80).
     */
    bool initializePool(const ANSCENTER::Options& baseOptions,
                        const std::string& onnxModelPath,
                        const std::array<float, 3>& subVals = {0.f, 0.f, 0.f},
                        const std::array<float, 3>& divVals = {1.f, 1.f, 1.f},
                        bool normalize = true,
                        int maxSlotsPerGpu = -1,
                        double memSafetyFactor = 0.80);
    /**
     * Load a pre-built TRT engine and create a pool -- no ONNX build step.
     */
    bool initializePoolFromEngine(const ANSCENTER::Options& baseOptions,
                                  const std::string& trtEnginePath,
                                  const std::array<float, 3>& subVals = {0.f, 0.f, 0.f},
                                  const std::array<float, 3>& divVals = {1.f, 1.f, 1.f},
                                  bool normalize = true,
                                  int maxSlotsPerGpu = -1,
                                  double memSafetyFactor = 0.80);
    /** Enumerate all CUDA-capable GPUs without loading any model. */
    static std::vector<GpuDeviceInfo> enumerateDevices();
    /** Total Engine<T> slots across all GPUs (valid after initializePool). */
    int getTotalCapacity() const { return m_totalCapacity; }
    /** Slots currently executing inference (approximate, lock-free). */
    int getActiveInferences() const { return m_activeCount.load(); }
    /** Idle slots not currently running inference. */
    int getAvailableSlots() const { return m_totalCapacity - m_activeCount.load(); }
    /** True when every alive slot is busy.
     *  In elastic mode, new work will trigger on-demand slot creation
     *  rather than being rejected (unless all GPUs are physically full). */
    bool isAtCapacity() const { return getAvailableSlots() <= 0; }
    /** Device snapshots captured at initializePool() time. */
    const std::vector<GpuDeviceInfo>& getDeviceInfo() const { return m_deviceInfos; }
    /** Print a human-readable pool capacity and device report. */
    void printCapacityReport() const;
    /**
     * Release idle slots whose last inference was more than @p idleSeconds
     * ago. The probe slot (first slot) is never released so the model
     * remains instantly usable. Returns the number of slots released.
     *
     * Typical usage -- call periodically from a background timer:
     * @code
     *   int freed = engine->releaseIdleSlots(30.0);
     * @endcode
     */
    int releaseIdleSlots(double idleSeconds = 30.0);
    /** Actual max spatial dims from the loaded engine's optimization profile.
     *  Returns -1 if the engine is not loaded yet or dim is fixed. */
    int getProfileMaxHeight() const { return m_profileMaxHeight; }
    int getProfileMaxWidth() const { return m_profileMaxWidth; }
    /// Grow the pool by up to @p count slots (demand-driven, bypasses grace period).
    /// Returns the number of slots actually created.
    int growPool(int count);
private:
    // Serialises concurrent runInference() calls on the SAME Engine instance.
    // Without this, two threads with different batch sizes corrupt m_buffers,
    // m_lastBatchSize, and the TRT execution context — causing fatal "illegal
    // memory access" errors that permanently destroy the CUDA context.
    // Pool-mode slots have their own busy-flag dispatch, so this only matters
    // for the single-engine path (m_slots.empty()).
    mutable std::mutex m_inferenceMutex;
    int32_t m_lastBatchSize = -1;  // Last batch size seen by runInference (-1 = none yet)
    // BATCH STATE MANAGEMENT (THREAD-SAFE) — disabled experiment, kept for reference
    //mutable std::mutex m_batchStateMutex; // Protects all batch-related state
    //size_t m_targetStableBatchSize = 4; // Default optimal batch size
    //int m_batchSizeChangeCounter = 0;
    //const int BATCH_STABILITY_THRESHOLD = 15; // Frames before allowing downsize
    //std::deque<size_t> m_recentBatchSizes;
    //const size_t BATCH_HISTORY_SIZE = 30;
    //std::chrono::steady_clock::time_point m_lastBatchSizeChange;
    //// Context reshape mutex (separate from batch state)
    //mutable std::mutex m_contextReshapeMutex;
    //// Helper methods
    //size_t DetermineOptimalBatchSize(size_t requestedSize);
    //std::vector<std::vector<cv::cuda::GpuMat>> PadBatchToSize(const std::vector<std::vector<cv::cuda::GpuMat>>& inputs,size_t targetSize);
    // Normalization, scaling, and mean subtraction of inputs
    std::array<float, 3> m_subVals{};
    std::array<float, 3> m_divVals{};
    bool m_normalize;  // NOTE(review): no in-class initializer — set by constructors/loadNetwork
    // Holds pointers to the input and output GPU buffers
    std::vector<void *> m_buffers;
    std::vector<uint32_t> m_outputLengths{};       // Element count of each output tensor
    std::vector<nvinfer1::Dims3> m_inputDims;      // Per-input CHW dims
    std::vector<nvinfer1::Dims> m_outputDims;      // Per-output dims
    std::vector<std::string> m_IOTensorNames;      // TRT I/O tensor names, binding order
    int32_t m_inputBatchSize = 0;
    bool m_firstInferenceCall = true; // Per-instance first-call diagnostics flag
    bool m_batchShapeChangeLogged = false; // Suppress per-tensor shape dump after first batch-size change
    bool m_hasDynamicSpatialDims = false; // True if any input has dynamic H or W (-1)
    std::vector<nvinfer1::Dims> m_lastSetInputDims; // Cache of last setInputShape() dims per tensor — skip if unchanged
    int m_profileMaxHeight = -1; // Max H from optimization profile (after load)
    int m_profileMaxWidth = -1;  // Max W from optimization profile (after load)
    // Must keep IRuntime around for inference, see:
    // https://forums.developer.nvidia.com/t/is-it-safe-to-deallocate-nvinfer1-iruntime-after-creating-an-nvinfer1-icudaengine-but-before-running-inference-with-said-icudaengine/255381/2?u=cyruspk4w6
    //
    // m_runtime and m_engine are shared_ptr to support TRTEngineCache:
    // multiple Engine<T> instances loading the same model share one ICudaEngine
    // in VRAM, each with its own IExecutionContext.
    std::shared_ptr<nvinfer1::IRuntime> m_runtime = nullptr;
    std::unique_ptr<Int8EntropyCalibrator2> m_calibrator = nullptr;
    std::shared_ptr<nvinfer1::ICudaEngine> m_engine = nullptr;
    std::unique_ptr<nvinfer1::IExecutionContext> m_context = nullptr;
    // TRTEngineCache tracking — used by destructor to release shared engine
    std::string m_cachedEnginePath;
    int m_cachedGpuIndex = -1;
    bool m_usingCachedEngine = false;
    bool m_skipEngineCache = false; // true = bypass TRTEngineCache (for OptimizeModelStr)
    bool m_skipOnnxRebuild = false; // true = don't delete/rebuild corrupt engine files (elastic growth)
    bool m_skipOnnxBuild = false;   // true = skip ONNX→TRT build if no cached engine (demand growth)
    bool m_lastLoadFailedVRAM = false; // set by loadNetwork when failure is due to insufficient VRAM
    bool m_forceNoPool = false;     // true = force single-GPU path, skip multi-GPU pool
    ANSCENTER::Options m_options;   // Active configuration (copied at construction / UpdateOptions)
    cudaStream_t m_cudaStream = nullptr;   // Primary stream for inference kernels
    cudaStream_t m_inferenceStream;        // Created lazily; guarded by m_streamInitialized
    bool m_streamInitialized;              // True once m_inferenceStream exists
    //int32_t m_lastBatchSize;
    cudaStream_t m_memoryStream; // ADD THIS - separate stream for memory operations
    std::vector<cv::cuda::GpuMat> m_preprocessedInputs; // Keep inputs alive
    // Note: blobFromGpuMats and resizeKeepAspectRatioPadRightBottom are static,
    // so cached buffers use thread_local inside the functions themselves.
    // Thermal management (ADD THESE) — disabled experiment, kept for reference
    //int m_consecutiveInferences;
    //int m_thermalCooldownThreshold;
    //int m_thermalCooldownDelayMs;
    bool m_profileInitialized;  // True after the optimization profile has been inspected
    // -- GPU clock lock (NVML) -------------------------------------------------
    // Dynamically loaded NVML to lock GPU clocks and prevent power throttling.
    // No build-time dependency on nvml.lib — loads nvml.dll at runtime.
    bool m_clocksLocked = false;
    unsigned int m_nvmlDeviceIdx = 0;
    void* m_nvmlLib = nullptr; // LibHandle (HMODULE on Windows)
    void lockGpuClocks(int deviceIndex, int requestedMHz);
    void unlockGpuClocks();
    // -- CUDA Graph acceleration ---------------------------------------------
    // Pinned (page-locked) host buffers -- stable addresses required for graph
    // D2H copies. One pointer per output tensor, sized for maxBatchSize elements.
    std::vector<T*> m_pinnedOutputBuffers;
    std::vector<size_t> m_pinnedOutputBufElems; // elements per buffer
    // Cached executable graphs, keyed by batch size. A new graph is captured
    // the first time each batch size is seen; subsequent calls reuse it.
    std::unordered_map<int, cudaGraphExec_t> m_graphExecs;
    Logger m_logger;  // TRT logger instance handed to builder/runtime
    bool m_verbose{ true };       // false for non-probe pool slots
    bool m_disableGraphs{ true }; // DISABLED by default — concurrent graph launches + uploads cause GPU deadlock on WDDM
    // -- Multi-GPU pool data ---------------------------------------------------
    // Per-slot descriptor. Slot engines are separate Engine<T> instances;
    // they are never pooled recursively (their own m_slots deques are empty).
    //
    // Lifecycle in elastic mode:
    //   alive (engine != nullptr, busy == false) -- idle, ready for inference
    //   alive (engine != nullptr, busy == true)  -- running inference
    //   dead  (engine == nullptr)                -- released, VRAM freed
    // Dead entries stay in the deque to avoid invalidating pointers
    // held by other threads. tryGrowPool() reuses them.
    struct InferenceSlot {
        int deviceIndex = 0;
        bool busy = false;
        size_t memUsed = 0; ///< VRAM bytes this slot holds
        std::unique_ptr<Engine<T>> engine;
        std::chrono::steady_clock::time_point lastUsedTime
            = std::chrono::steady_clock::now(); ///< For idle-release
    };
    // Internal helpers -- implemented in EngineMultiGpu.inl
    bool loadSlots(const ANSCENTER::Options& baseOptions,
                   const std::string& modelPath,
                   const std::array<float, 3>& subVals,
                   const std::array<float, 3>& divVals,
                   bool normalize,
                   bool fromOnnx,
                   int maxSlotsPerGpu,
                   double memSafetyFactor);
    bool runInferenceFromPool(const std::vector<std::vector<cv::cuda::GpuMat>>& inputs,
                              std::vector<std::vector<std::vector<T>>>& featureVectors);
    InferenceSlot* tryGrowPool(bool bypassGrace = false);
    // -- Elastic pool storage ----------------------------------------------
    //
    // std::deque is used instead of std::vector because push_back on a deque
    // does NOT invalidate references to existing elements. This is critical
    // because runInferenceFromPool holds a raw InferenceSlot* across the GPU
    // call while another thread may call tryGrowPool -> push_back concurrently.
    std::deque<InferenceSlot> m_slots;         ///< All managed Engine<T> instances
    std::vector<GpuDeviceInfo> m_deviceInfos;  ///< Per-device info from init
    mutable std::mutex m_slotMutex;            ///< Protects m_slots busy flags
    std::condition_variable m_slotFreeCv;      ///< Notified when any slot becomes idle
    std::mutex m_growMutex;                    ///< Serialises slot creation
    std::atomic<int> m_activeCount{0};         ///< Running inference count
    std::atomic<int> m_totalCapacity{0};       ///< Alive slot count (atomic -- read lock-free by public API)
    std::atomic<size_t> m_nextSlotHint{0};     ///< Round-robin start index for slot scan
    // -- Elastic pool config (stored from loadSlots for on-demand growth) --
    size_t m_memPerSlot{0};
    std::string m_poolModelPath;
    std::array<float, 3> m_poolSubVals{};
    std::array<float, 3> m_poolDivVals{};
    bool m_poolNormalize{true};
    bool m_poolFromOnnx{false};
    double m_poolSafetyFactor{0.80};
    bool m_elasticMode{false}; ///< true when maxSlotsPerGpu == -1
    // -- Global elastic slot cap (shared across ALL pools on ALL GPUs) --
    // Prevents CUDA driver SRW lock convoy when too many concurrent contexts exist.
    // Budget: ~4 slots per GB of total GPU VRAM (computed once on first pool init).
    //   4 GB → 12, 6 GB → 24, 8 GB → 32, 12 GB → 48, 24 GB → 96
    static std::atomic<int> s_globalElasticCount; ///< Current total elastic slots
    static std::atomic<int> s_globalElasticMax;   ///< Max total slots
    static std::once_flag s_globalCapInitFlag;    ///< Ensures max is computed once
    // -- Elastic growth grace period --
    // Delays elastic growth for N seconds after the most recent pool creation,
    // giving later models time to create their probe engines before elastic
    // growth from earlier pools consumes all free VRAM.
    static constexpr int s_elasticGraceSec{5}; ///< Grace period in seconds (reduced from 30s: shared pools need fewer probes)
    static std::atomic<int64_t> s_lastPoolCreatedMs; ///< Timestamp of last pool creation (ms since epoch)
    // -- Automatic idle-slot cleanup timer (elastic mode only) --
    std::thread m_idleTimerThread;
    std::condition_variable m_idleTimerCv;
    std::mutex m_idleTimerMutex;
    std::atomic<bool> m_idleTimerStop{false};
    double m_idleTimerIntervalSec{60.0};   ///< How often to check
    double m_idleTimerThresholdSec{600.0}; ///< Idle time (seconds) before releasing — 10 minutes to avoid churn
    void startIdleTimer();
    void stopIdleTimer();
};
/// Construct an engine with an explicit option set.
/// @param options Configuration copied into m_options (deviceIndex, precision, …).
template <typename T> Engine<T>::Engine(const ANSCENTER::Options &options) : m_options(options) {
    NvDynLoader::Initialize(); // no-op after first call; safe to call repeatedly
    m_streamInitialized = false;
    m_inferenceStream = nullptr;
    m_lastBatchSize = -1;
    m_inputBatchSize = 0;
    m_memoryStream = nullptr;
    // BUGFIX: m_normalize has no in-class initializer and this overload
    // previously never set it, leaving an indeterminate bool (UB if read
    // before loadNetwork overwrites it). Initialize to true to match the
    // default constructor.
    m_normalize = true;
    m_firstInferenceCall = true;
    m_batchShapeChangeLogged = false;
    m_hasDynamicSpatialDims = false;
    m_profileInitialized = false;
    m_lastSetInputDims.clear();
    m_preprocessedInputs.clear();
}
/// Default constructor: bind the dynamically-loaded TensorRT entry points,
/// then reset all per-instance state to its "nothing loaded yet" values.
/// m_options keeps its own default construction.
template <typename T> Engine<T>::Engine() {
    NvDynLoader::Initialize();
    // Streams are created lazily on first inference.
    m_inferenceStream = nullptr;
    m_memoryStream = nullptr;
    m_streamInitialized = false;
    // Batch / shape tracking: no batch has been seen yet.
    m_inputBatchSize = 0;
    m_lastBatchSize = -1;
    m_lastSetInputDims.clear();
    m_hasDynamicSpatialDims = false;
    m_batchShapeChangeLogged = false;
    // Behavioural defaults until loadNetwork() provides real values.
    m_normalize = true;
    m_firstInferenceCall = true;
    m_profileInitialized = false;
}
template <typename T> Engine<T>::~Engine() {
    // =========================================================================
    // CRITICAL: Destructors must NEVER throw. Every CUDA/TRT call below is
    // wrapped in try/catch so that a failure in one step does not skip the
    // remaining cleanup (especially m_engine / m_runtime reset, which release
    // TensorRT file handles and VRAM).
    //
    // Previous bug: clearGpuBuffers() called Util::checkCudaErrorCode() which
    // throws on cudaFree failure. That aborted the destructor mid-way,
    // leaving ICudaEngine/IRuntime alive → model folder locked on disk.
    //
    // The cleanup ORDER below is deliberate:
    //   timer → graphs → pinned host bufs → GPU bufs → streams → clock lock
    //   → context → cache release → engine → runtime
    // =========================================================================
    // Stop the idle-slot cleanup timer (if running) before touching pool state
    try { stopIdleTimer(); } catch (...) {}
    // Destroy cached CUDA graphs
    try {
        for (auto& [bs, ge] : m_graphExecs) { if (ge) cudaGraphExecDestroy(ge); }
        m_graphExecs.clear();
    } catch (...) {}
    // Free pinned output buffers
    try {
        for (T* p : m_pinnedOutputBuffers) { if (p) cudaFreeHost(p); }
        m_pinnedOutputBuffers.clear();
    } catch (...) {}
    // Free GPU I/O buffers — use cudaFree directly (NOT checkCudaErrorCode which throws)
    try {
        if (!m_buffers.empty()) {
            for (void* ptr : m_buffers) {
                if (ptr) cudaFree(ptr); // ignore errors — destructor must not throw
            }
            m_buffers.clear();
        }
    } catch (...) {}
    // Destroy the three CUDA streams (each guarded so a failure cannot
    // prevent the later TRT object resets).
    try {
        if (m_cudaStream) {
            cudaStreamDestroy(m_cudaStream);
            m_cudaStream = nullptr;
        }
    } catch (...) {}
    try {
        if (m_streamInitialized && m_inferenceStream) {
            cudaStreamDestroy(m_inferenceStream);
            m_inferenceStream = nullptr;
            m_streamInitialized = false;
        }
    } catch (...) {}
    try {
        if (m_memoryStream) {
            cudaStreamDestroy(m_memoryStream);
            m_memoryStream = nullptr;
        }
    } catch (...) {}
    // Release GPU clock lock if we hold one
    try { unlockGpuClocks(); } catch (...) {}
    // TRT destruction ordering: context MUST be destroyed before engine.
    // With shared_ptr, m_engine may outlive this instance (other tasks share it).
    // Explicit reset ensures correct order.
    try { m_context.reset(); } catch (...) {}
    // Release from TRTEngineCache (decrements refcount; engine freed when last user releases)
    try {
        if (m_usingCachedEngine && !m_cachedEnginePath.empty()) {
            TRTEngineCache::instance().release(m_cachedEnginePath, m_cachedGpuIndex);
        }
    } catch (...) {}
    // These MUST execute to release TensorRT file handles and VRAM.
    // Previously, a throw in clearGpuBuffers() skipped these lines,
    // leaving the model folder locked on disk.
    try { m_engine.reset(); } catch (...) {}
    try { m_runtime.reset(); } catch (...) {}
}
// ============================================================================
// NVML GPU Clock Lock — dynamically loaded, no build-time nvml.lib needed
// ============================================================================
// NVML type / constant definitions (subset — avoids #include <nvml.h>)
// Minimal hand-declared NVML vocabulary so lockGpuClocks()/unlockGpuClocks()
// can call into nvml.dll without a build-time dependency on <nvml.h>.
namespace nvml_types {
    using Return_t = unsigned int; // nvmlReturn_t
    using Device_t = void*;        // nvmlDevice_t (opaque pointer)
    // NVML_SUCCESS status code.
    enum : unsigned int { SUCCESS = 0 };
    // nvmlClockType_t — only the graphics clock domain is used here.
    enum ClockType : unsigned int { CLOCK_GRAPHICS = 0 };
}
/// Lock the GPU graphics clock via NVML to prevent power/thermal throttling.
/// @param deviceIndex  CUDA device ordinal whose clocks to lock.
/// @param requestedMHz 0 = disabled (no-op); <0 = auto (~80% of max graphics
///                     clock, rounded down to a 15 MHz step, min 210 MHz);
///                     >0 = lock at exactly that frequency.
/// nvml.dll is loaded at runtime — no build-time nvml.lib dependency. On
/// success the library handle is kept in m_nvmlLib for unlockGpuClocks().
template <typename T>
void Engine<T>::lockGpuClocks(int deviceIndex, int requestedMHz) {
    if (requestedMHz == 0) return; // disabled
#ifdef _WIN32
    // Load nvml.dll from System32 (installed by NVIDIA driver)
    auto hLib = LoadLibraryA("nvml.dll");
    if (!hLib) {
        if (m_verbose) std::cout << "Warning: nvml.dll not found — GPU clock lock unavailable" << std::endl;
        return;
    }
    m_nvmlLib = static_cast<void*>(hLib);
    // Resolve NVML function pointers
    using fn_Init = nvml_types::Return_t(*)();
    using fn_Shutdown = nvml_types::Return_t(*)();
    using fn_GetHandle = nvml_types::Return_t(*)(unsigned int, nvml_types::Device_t*);
    using fn_GetMaxClock = nvml_types::Return_t(*)(nvml_types::Device_t, nvml_types::ClockType, unsigned int*);
    using fn_SetLocked = nvml_types::Return_t(*)(nvml_types::Device_t, unsigned int, unsigned int);
    using fn_ErrorString = const char*(*)(nvml_types::Return_t);
    auto pInit = reinterpret_cast<fn_Init>(GetProcAddress(hLib, "nvmlInit_v2"));
    auto pShutdown = reinterpret_cast<fn_Shutdown>(GetProcAddress(hLib, "nvmlShutdown"));
    auto pGetHandle = reinterpret_cast<fn_GetHandle>(GetProcAddress(hLib, "nvmlDeviceGetHandleByIndex_v2"));
    auto pGetMaxClk = reinterpret_cast<fn_GetMaxClock>(GetProcAddress(hLib, "nvmlDeviceGetMaxClockInfo"));
    auto pSetLocked = reinterpret_cast<fn_SetLocked>(GetProcAddress(hLib, "nvmlDeviceSetGpuLockedClocks"));
    auto pErrStr = reinterpret_cast<fn_ErrorString>(GetProcAddress(hLib, "nvmlErrorString"));
    if (!pInit || !pGetHandle || !pSetLocked) {
        if (m_verbose) std::cout << "Warning: NVML symbols not found — GPU clock lock unavailable" << std::endl;
        FreeLibrary(hLib); m_nvmlLib = nullptr;
        return;
    }
    // BUGFIX: pShutdown is NOT part of the mandatory-symbol check above, but
    // was previously called unchecked on every failure path — a null deref if
    // GetProcAddress("nvmlShutdown") ever failed. Guard every call.
    auto nvmlShutdownSafe = [&]() { if (pShutdown) pShutdown(); };
    auto errName = [&](nvml_types::Return_t r) -> const char* {
        return pErrStr ? pErrStr(r) : "unknown";
    };
    nvml_types::Return_t rc = pInit();
    if (rc != nvml_types::SUCCESS) {
        if (m_verbose) std::cout << "Warning: nvmlInit failed: " << errName(rc) << std::endl;
        FreeLibrary(hLib); m_nvmlLib = nullptr;
        return;
    }
    nvml_types::Device_t nvmlDev = nullptr;
    rc = pGetHandle(static_cast<unsigned int>(deviceIndex), &nvmlDev);
    if (rc != nvml_types::SUCCESS) {
        if (m_verbose) std::cout << "Warning: nvmlDeviceGetHandleByIndex(" << deviceIndex << ") failed: " << errName(rc) << std::endl;
        nvmlShutdownSafe(); FreeLibrary(hLib); m_nvmlLib = nullptr;
        return;
    }
    unsigned int targetMHz = 0;
    if (requestedMHz < 0) {
        // Auto mode: query max graphics clock and use ~80%
        unsigned int maxClock = 0;
        if (pGetMaxClk) {
            rc = pGetMaxClk(nvmlDev, nvml_types::CLOCK_GRAPHICS, &maxClock);
        }
        if (pGetMaxClk && rc == nvml_types::SUCCESS && maxClock > 0) {
            targetMHz = static_cast<unsigned int>(maxClock * 0.80);
            targetMHz = (targetMHz / 15) * 15; // Round down to 15 MHz step
            if (targetMHz < 210) targetMHz = 210;
        } else {
            // BUGFIX: previously, when pGetMaxClk could not be resolved this
            // branch was skipped entirely, targetMHz stayed 0, and the code
            // fell through to nvmlDeviceSetGpuLockedClocks(dev, 0, 0) —
            // attempting to lock the GPU at 0 MHz. Bail out instead.
            if (m_verbose) std::cout << "Warning: Could not query max GPU clock — skipping clock lock" << std::endl;
            nvmlShutdownSafe(); FreeLibrary(hLib); m_nvmlLib = nullptr;
            return;
        }
    } else {
        targetMHz = static_cast<unsigned int>(requestedMHz);
    }
    rc = pSetLocked(nvmlDev, targetMHz, targetMHz);
    if (rc == nvml_types::SUCCESS) {
        // Success: keep nvml.dll loaded (handle in m_nvmlLib) so
        // unlockGpuClocks() can restore defaults later.
        m_clocksLocked = true;
        m_nvmlDeviceIdx = static_cast<unsigned int>(deviceIndex);
        if (m_verbose) std::cout << "Info: GPU clocks locked at " << targetMHz << " MHz (device " << deviceIndex << ")" << std::endl;
    } else {
        if (m_verbose) {
            std::cout << "Warning: nvmlDeviceSetGpuLockedClocks failed: " << errName(rc) << std::endl;
            std::cout << "         (Run as Administrator, or use: nvidia-smi -lgc " << targetMHz << "," << targetMHz << ")" << std::endl;
        }
        nvmlShutdownSafe(); FreeLibrary(hLib); m_nvmlLib = nullptr;
    }
#else
    // Linux: similar pattern with dlopen("libnvidia-ml.so.1", RTLD_LAZY)
    // Left as TODO — nvidia-smi -lgc works without elevation on Linux.
    if (m_verbose) std::cout << "Info: GPU clock lock not implemented on this platform — use nvidia-smi -lgc" << std::endl;
#endif
}
/// Restore default GPU clock behaviour if lockGpuClocks() previously
/// succeeded, then shut NVML down and release nvml.dll.
/// Safe to call unconditionally — returns immediately when no lock is held.
template <typename T>
void Engine<T>::unlockGpuClocks() {
    if (!m_clocksLocked || !m_nvmlLib) return;
#ifdef _WIN32
    HMODULE nvmlModule = static_cast<HMODULE>(m_nvmlLib);
    using fn_GetHandle = nvml_types::Return_t(*)(unsigned int, nvml_types::Device_t*);
    using fn_ResetClk = nvml_types::Return_t(*)(nvml_types::Device_t);
    using fn_Shutdown = nvml_types::Return_t(*)();
    const auto getHandleFn = reinterpret_cast<fn_GetHandle>(GetProcAddress(nvmlModule, "nvmlDeviceGetHandleByIndex_v2"));
    const auto resetClocksFn = reinterpret_cast<fn_ResetClk>(GetProcAddress(nvmlModule, "nvmlDeviceResetGpuLockedClocks"));
    const auto shutdownFn = reinterpret_cast<fn_Shutdown>(GetProcAddress(nvmlModule, "nvmlShutdown"));
    if (getHandleFn && resetClocksFn) {
        nvml_types::Device_t device = nullptr;
        // Re-acquire the device handle; reset only when the lookup succeeds.
        if (getHandleFn(m_nvmlDeviceIdx, &device) == nvml_types::SUCCESS) {
            resetClocksFn(device);
            if (m_verbose) std::cout << "Info: GPU clocks unlocked (device " << m_nvmlDeviceIdx << ")" << std::endl;
        }
    }
    if (shutdownFn) shutdownFn();
    FreeLibrary(nvmlModule);
#endif
    // Clear state on all platforms so repeated calls stay no-ops.
    m_nvmlLib = nullptr;
    m_clocksLocked = false;
}
// Small header-only helpers shared by the engine implementation files.
namespace Util {
// Returns true if @p filepath exists and can be opened for reading.
inline bool doesFileExist(const std::string& filepath) {
    std::ifstream f(filepath.c_str());
    return f.good();
}
// Log a CUDA error code (if any). Deliberately does NOT throw: this helper
// is called from destructor/cleanup paths where throwing previously aborted
// teardown mid-way (see ~Engine() notes).
inline void checkCudaErrorCode(cudaError_t code) {
    if (code != cudaSuccess) {
        std::string errMsg = "CUDA operation failed with code: " + std::to_string(code) + " (" + cudaGetErrorName(code) +
                             "), with message: " + cudaGetErrorString(code);
        // BUGFIX: previously written to std::cout with no newline or flush,
        // so the message could be glued to other output or lost on crash.
        // Route errors to std::cerr and flush immediately.
        std::cerr << errMsg << std::endl;
    }
}
// Return the full paths of all regular files directly inside @p dirPath
// (non-recursive; subdirectories are skipped).
inline std::vector<std::string> getFilesInDirectory(const std::string& dirPath) {
    std::vector<std::string> fileNames;
    for (const auto& entry : std::filesystem::directory_iterator(dirPath)) {
        if (entry.is_regular_file()) {
            fileNames.push_back(entry.path().string());
        }
    }
    return fileNames;
}
}
#include "engine/EngineRunInference.inl"
#include "engine/EngineUtilities.inl"
#include "engine/EngineBuildLoadNetwork.inl"
#include "engine/EngineMultiGpu.inl"
// ============================================================================
// Explicit template instantiation declarations -- DLL consumers only
//
// These tell each consuming translation unit: "Engine<float> is already fully
// compiled and exported by the DLL. Do NOT re-instantiate it here; resolve
// the symbols at link time from the DLL's import library instead."
//
// This suppresses redundant code generation in every consuming TU and
// guarantees that only ONE definition of each Engine<T> specialisation exists
// across the whole program (in the DLL).
//
// To add a new exported instantiation (e.g. Engine<uint8_t>):
// 1. Add a matching template class ENGINE_API Engine<uint8_t>; in engine.cpp
// 2. Add the corresponding extern declaration below.
// ============================================================================
#ifndef ENGINE_EXPORTS
extern template class ENGINE_API Engine<float>;
#endif