#pragma once #include "engine_api.h" // ENGINE_API (__declspec dllexport / dllimport) // -- TRT / ONNX-parser API decoration override ----------------------------- // Must appear BEFORE the first inclusion of NvInfer.h / NvOnnxParser.h. // // By default these macros expand to __declspec(dllimport) for consumers of // the TRT DLL, which causes the compiler to emit indirect import-table calls. // Overriding them to empty makes all calls direct, so the linker resolves // them against the extern "C" stub definitions in NvDynLoader.cpp instead of // against nvinfer_XX.lib / nvonnxparser_XX.lib. // // Remove nvinfer_XX.lib and nvonnxparser_XX.lib from Linker -> Input. // Link ANSLibsLoader.lib instead. See NvDynLoader.h for full details. #ifndef TENSORRTAPI # define TENSORRTAPI #endif #ifndef NVONNXPARSER_API # define NVONNXPARSER_API #endif #include "NvOnnxParser.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "engine/TRTEngineCache.h" #include #include #include #include "interfaces\IEngine.h" #include "ANSLicense.h" #include "Int8Calibrator.h" #include "NvDynLoader.h" // from ANSLibsLoader/include/ #ifdef _WIN32 #include #endif // Class to extend TensorRT logger class ENGINE_API Logger : public nvinfer1::ILogger { void log(Severity severity, const char *msg) noexcept override; }; // ============================================================================ // Diagnostic logging to Windows Event Viewer (ANSLogger source). // Allows internal engine failure points to be traced even when running // under LabView which swallows stdout/stderr. // ============================================================================ inline void logEngineEvent(const std::string& msg, bool isError = false) { #ifdef _WIN32 // Cache the event source handle to avoid RegisterEventSourceA/ // DeregisterEventSource churn. Under heavy load (10+ AI tasks), // the per-message register/deregister cycle overwhelms the Windows // Event Log service, causing ERROR_UNRECOGNIZED_VOLUME (1005) floods // that freeze the system. static HANDLE hLog = RegisterEventSourceA(nullptr, "ANSLogger"); if (hLog) { const char* msgs[1] = { msg.c_str() }; ReportEventA(hLog, isError ? EVENTLOG_ERROR_TYPE : EVENTLOG_INFORMATION_TYPE, 0, isError ? 1004 : 1000, nullptr, 1, 0, msgs, nullptr); } #endif if (isError) std::cerr << msg << std::endl; else std::cout << msg << std::endl; } // ============================================================================ // GpuDeviceInfo -- snapshot of one CUDA device captured at pool-init time. // Returned by Engine::enumerateDevices() and stored inside the pool. // ============================================================================ struct ENGINE_API GpuDeviceInfo { int index = 0; std::string name; size_t totalMemoryBytes = 0; ///< Physical VRAM size_t freeMemoryAtInitBytes = 0; ///< Free VRAM when pool started int computeMajor = 0; int computeMinor = 0; int slotsAllocated = 0; ///< Engine instances on this GPU size_t memoryPerSlotBytes = 0; ///< VRAM bytes each slot occupies }; template class Engine : public IEngine { public: Engine(); Engine(const ANSCENTER::Options &options); ~Engine(); // Build the onnx model into a TensorRT engine file, cache the model to disk // (to avoid rebuilding in future), and then load the model into memory The // default implementation will normalize values between [0.f, 1.f] Setting the // normalize flag to false will leave values between [0.f, 255.f] (some // converted models may require this). If the model requires values to be // normalized between [-1.f, 1.f], use the following params: // subVals = {0.5f, 0.5f, 0.5f}; // divVals = {0.5f, 0.5f, 0.5f}; // normalize = true; bool buildLoadNetwork(std::string onnxModelPath, const std::array &subVals = {0.f, 0.f, 0.f}, const std::array &divVals = {1.f, 1.f, 1.f}, bool normalize = true) override; // Load a TensorRT engine file from disk into memory // The default implementation will normalize values between [0.f, 1.f] // Setting the normalize flag to false will leave values between [0.f, 255.f] // (some converted models may require this). If the model requires values to // be normalized between [-1.f, 1.f], use the following params: // subVals = {0.5f, 0.5f, 0.5f}; // divVals = {0.5f, 0.5f, 0.5f}; // normalize = true; bool loadNetwork(std::string trtModelPath, const std::array &subVals = {0.f, 0.f, 0.f}, const std::array &divVals = {1.f, 1.f, 1.f}, bool normalize = true) override; // -- Multi-GPU pool overloads (non-virtual; call through Engine* or // unique_ptr>, not through IEngine*). // // maxSlotsPerGpu == 0 -> BYPASS: identical to the 4-param single-GPU path // above. Used by OptimizeModelStr for temporary engines. // maxSlotsPerGpu == 1 -> ROUND-ROBIN (default): 1 slot per GPU, multi-GPU // round-robin dispatch. Tasks queue when all slots // busy. Best balance of VRAM usage and multi-GPU // utilisation. // maxSlotsPerGpu == -1 -> ELASTIC pool: 1 probe slot at init, additional // slots created on-demand when concurrent requests // arrive, released when idle via releaseIdleSlots(). // Higher throughput but higher VRAM usage — only for // large GPUs (≥ 8 GB). // maxSlotsPerGpu > 1 -> PRE-ALLOCATED pool: cap at that many slots/GPU, // all created at init time. // // Usage -- opt into pool mode by adding ONE extra argument: // m_trtEngine->buildLoadNetwork(path, sub, div, norm); // single-GPU (unchanged) // m_trtEngine->buildLoadNetwork(path, sub, div, norm, 1); // round-robin (default) // m_trtEngine->buildLoadNetwork(path, sub, div, norm, -1); // elastic pool // m_trtEngine->buildLoadNetwork(path, sub, div, norm, 4); // pre-allocated, 4 slots/GPU bool buildLoadNetwork(std::string onnxModelPath, const std::array &subVals, const std::array &divVals, bool normalize, int maxSlotsPerGpu, double memSafetyFactor = 0.80); bool loadNetwork(std::string trtModelPath, const std::array &subVals, const std::array &divVals, bool normalize, int maxSlotsPerGpu, double memSafetyFactor = 0.80); // Run inference. // Input format [input][batch][cv::cuda::GpuMat] // Output format [batch][output][feature_vector] bool runInference(const std::vector> &inputs, std::vector>> &featureVectors) override; // Utility method for resizing an image while maintaining the aspect ratio by // adding padding to smaller dimension after scaling While letterbox padding // normally adds padding to top & bottom, or left & right sides, this // implementation only adds padding to the right or bottom side This is done // so that it's easier to convert detected coordinates (ex. YOLO model) back // to the original reference frame. static cv::cuda::GpuMat resizeKeepAspectRatioPadRightBottom(const cv::cuda::GpuMat &input, size_t height, size_t width, const cv::Scalar &bgcolor = cv::Scalar(0, 0, 0)); [[nodiscard]] const std::vector &getInputDims() const override { return m_inputDims; }; [[nodiscard]] const std::vector &getOutputDims() const override { return m_outputDims; }; [[nodiscard]] const ANSCENTER::Options& getOptions() const { return m_options; } /// Return the CUDA device index that preprocessing should target. /// In pool mode this returns the device of the first alive slot; /// in single-engine mode it returns m_options.deviceIndex. /// Call cudaSetDevice(engine->getPreferredDeviceIndex()) before /// any cv::cuda::GpuMat operations to ensure multi-GPU correctness. [[nodiscard]] int getPreferredDeviceIndex() const { // Pool mode: use the first alive slot's device if (!m_slots.empty()) { for (const auto& s : m_slots) { if (s.engine) return s.deviceIndex; } } return m_options.deviceIndex; } /// Convenience: set the calling thread's CUDA device to match this engine. /// Skips the call if already on the correct device (WDDM overhead avoidance). void setDeviceContext() const { const int target = getPreferredDeviceIndex(); int current = -1; cudaGetDevice(¤t); if (current != target) { cudaSetDevice(target); } } // Utility method for transforming triple nested output array into 2D array // Should be used when the output batch size is 1, but there are multiple // output feature vectors static void transformOutput(std::vector>> &input, std::vector> &output); // Utility method for transforming triple nested output array into single // array Should be used when the output batch size is 1, and there is only a // single output feature vector static void transformOutput(std::vector>> &input, std::vector &output); // Convert NHWC to NCHW and apply scaling and mean subtraction. // Pass a cv::cuda::Stream to run all GPU kernels on that stream instead of // the CUDA null (default) stream. This eliminates global sync barriers and // enables CUDA-graph-safe preprocessing on OpenCV 4.13+. static cv::cuda::GpuMat blobFromGpuMats(const std::vector &batchInput, const std::array &subVals, const std::array &divVals, bool normalize, bool swapRB = false, cv::cuda::Stream &stream = cv::cuda::Stream::Null()); // Converts the engine options into a string std::string serializeEngineOptions(const ANSCENTER::Options& options, const std::string& onnxModelPath); // Build the network (single attempt) bool build(std::string onnxModelPath, const std::array& subVals, const std::array& divVals, bool normalize); // SEH-safe wrapper around build(). Catches access violations that corrupt // the CUDA context (e.g. opset 19+ ONNX models crash TRT's parser). // Returns false and sets *outSehCode on crash. bool buildSafe(std::string onnxModelPath, const std::array& subVals, const std::array& divVals, bool normalize, unsigned long* outSehCode); // Build with auto-retry for dynamic spatial dimension models. // Pre-analyzes the ONNX model; if H/W are dynamic and the max profile // is too large for GPU VRAM, automatically reduces max spatial dims and // retries until build succeeds or all candidates are exhausted. // Fixed-spatial models fall through to a single build() call. bool buildWithRetry(std::string onnxModelPath, const std::array& subVals, const std::array& divVals, bool normalize); void getDeviceNames(std::vector& deviceNames); void clearGpuBuffers(); int getBindingIndexByName(const std::string& name); void UpdateOptions(const ANSCENTER::Options& options) { m_options = options; } void warmUp(int iterations = 5); void setVerbose(bool v) { m_verbose = v; } void setDisableGraphs(bool v) { m_disableGraphs = v; } // -- Multi-GPU pool management (replaces MultiGpuEngineManager) --------- // // Call ONE of the two initialize methods below instead of // buildLoadNetwork() / loadNetwork(), then use runInference() exactly as // you would for single-GPU. runInference() dispatches automatically to // the first idle slot across all GPUs. // // Single-GPU code that calls buildLoadNetwork() + runInference() directly // is completely unaffected -- it follows the existing code path unchanged. /** * Build from an ONNX file and create an Engine pool across all GPUs. * * @param baseOptions Configuration template; deviceIndex is ignored * and overridden per slot. * @param onnxModelPath Path to the .onnx model. * @param subVals Per-channel subtraction for normalisation. * @param divVals Per-channel divisor for normalisation. * @param normalize Scale pixel values to [0, 1] before inference. * @param maxSlotsPerGpu Cap slots per GPU (-1 = memory-limited only). * @param memSafetyFactor Fraction of free VRAM to consume (default 0.80). */ bool initializePool(const ANSCENTER::Options& baseOptions, const std::string& onnxModelPath, const std::array& subVals = {0.f, 0.f, 0.f}, const std::array& divVals = {1.f, 1.f, 1.f}, bool normalize = true, int maxSlotsPerGpu = -1, double memSafetyFactor = 0.80); /** * Load a pre-built TRT engine and create a pool -- no ONNX build step. */ bool initializePoolFromEngine(const ANSCENTER::Options& baseOptions, const std::string& trtEnginePath, const std::array& subVals = {0.f, 0.f, 0.f}, const std::array& divVals = {1.f, 1.f, 1.f}, bool normalize = true, int maxSlotsPerGpu = -1, double memSafetyFactor = 0.80); /** Enumerate all CUDA-capable GPUs without loading any model. */ static std::vector enumerateDevices(); /** Total Engine slots across all GPUs (valid after initializePool). */ int getTotalCapacity() const { return m_totalCapacity; } /** Slots currently executing inference (approximate, lock-free). */ int getActiveInferences() const { return m_activeCount.load(); } /** Idle slots not currently running inference. */ int getAvailableSlots() const { return m_totalCapacity - m_activeCount.load(); } /** True when every alive slot is busy. * In elastic mode, new work will trigger on-demand slot creation * rather than being rejected (unless all GPUs are physically full). */ bool isAtCapacity() const { return getAvailableSlots() <= 0; } /** Device snapshots captured at initializePool() time. */ const std::vector& getDeviceInfo() const { return m_deviceInfos; } /** Print a human-readable pool capacity and device report. */ void printCapacityReport() const; /** * Release idle slots whose last inference was more than @p idleSeconds * ago. The probe slot (first slot) is never released so the model * remains instantly usable. Returns the number of slots released. * * Typical usage -- call periodically from a background timer: * @code * int freed = engine->releaseIdleSlots(30.0); * @endcode */ int releaseIdleSlots(double idleSeconds = 30.0); /** Actual max spatial dims from the loaded engine's optimization profile. * Returns -1 if the engine is not loaded yet or dim is fixed. */ int getProfileMaxHeight() const { return m_profileMaxHeight; } int getProfileMaxWidth() const { return m_profileMaxWidth; } /// Grow the pool by up to @p count slots (demand-driven, bypasses grace period). /// Returns the number of slots actually created. int growPool(int count); private: // Serialises concurrent runInference() calls on the SAME Engine instance. // Without this, two threads with different batch sizes corrupt m_buffers, // m_lastBatchSize, and the TRT execution context — causing fatal "illegal // memory access" errors that permanently destroy the CUDA context. // Pool-mode slots have their own busy-flag dispatch, so this only matters // for the single-engine path (m_slots.empty()). mutable std::mutex m_inferenceMutex; int32_t m_lastBatchSize = -1; // BATCH STATE MANAGEMENT (THREAD-SAFE) //mutable std::mutex m_batchStateMutex; // Protects all batch-related state //size_t m_targetStableBatchSize = 4; // Default optimal batch size //int m_batchSizeChangeCounter = 0; //const int BATCH_STABILITY_THRESHOLD = 15; // Frames before allowing downsize //std::deque m_recentBatchSizes; //const size_t BATCH_HISTORY_SIZE = 30; //std::chrono::steady_clock::time_point m_lastBatchSizeChange; //// Context reshape mutex (separate from batch state) //mutable std::mutex m_contextReshapeMutex; //// Helper methods //size_t DetermineOptimalBatchSize(size_t requestedSize); //std::vector> PadBatchToSize(const std::vector>& inputs,size_t targetSize); // Normalization, scaling, and mean subtraction of inputs std::array m_subVals{}; std::array m_divVals{}; bool m_normalize; // Holds pointers to the input and output GPU buffers std::vector m_buffers; std::vector m_outputLengths{}; std::vector m_inputDims; std::vector m_outputDims; std::vector m_IOTensorNames; int32_t m_inputBatchSize = 0; bool m_firstInferenceCall = true; // Per-instance first-call diagnostics flag bool m_batchShapeChangeLogged = false; // Suppress per-tensor shape dump after first batch-size change bool m_hasDynamicSpatialDims = false; // True if any input has dynamic H or W (-1) std::vector m_lastSetInputDims; // Cache of last setInputShape() dims per tensor — skip if unchanged int m_profileMaxHeight = -1; // Max H from optimization profile (after load) int m_profileMaxWidth = -1; // Max W from optimization profile (after load) // Must keep IRuntime around for inference, see: // https://forums.developer.nvidia.com/t/is-it-safe-to-deallocate-nvinfer1-iruntime-after-creating-an-nvinfer1-icudaengine-but-before-running-inference-with-said-icudaengine/255381/2?u=cyruspk4w6 // // m_runtime and m_engine are shared_ptr to support TRTEngineCache: // multiple Engine instances loading the same model share one ICudaEngine // in VRAM, each with its own IExecutionContext. std::shared_ptr m_runtime = nullptr; std::unique_ptr m_calibrator = nullptr; std::shared_ptr m_engine = nullptr; std::unique_ptr m_context = nullptr; // TRTEngineCache tracking — used by destructor to release shared engine std::string m_cachedEnginePath; int m_cachedGpuIndex = -1; bool m_usingCachedEngine = false; bool m_skipEngineCache = false; // true = bypass TRTEngineCache (for OptimizeModelStr) bool m_skipOnnxRebuild = false; // true = don't delete/rebuild corrupt engine files (elastic growth) bool m_skipOnnxBuild = false; // true = skip ONNX→TRT build if no cached engine (demand growth) bool m_lastLoadFailedVRAM = false; // set by loadNetwork when failure is due to insufficient VRAM bool m_forceNoPool = false; // true = force single-GPU path, skip multi-GPU pool ANSCENTER::Options m_options; cudaStream_t m_cudaStream = nullptr; cudaStream_t m_inferenceStream; bool m_streamInitialized; //int32_t m_lastBatchSize; cudaStream_t m_memoryStream; // ADD THIS - separate stream for memory operations std::vector m_preprocessedInputs; // Keep inputs alive // Note: blobFromGpuMats and resizeKeepAspectRatioPadRightBottom are static, // so cached buffers use thread_local inside the functions themselves. // Thermal management (ADD THESE) //int m_consecutiveInferences; //int m_thermalCooldownThreshold; //int m_thermalCooldownDelayMs; bool m_profileInitialized; // -- GPU clock lock (NVML) ------------------------------------------------- // Dynamically loaded NVML to lock GPU clocks and prevent power throttling. // No build-time dependency on nvml.lib — loads nvml.dll at runtime. bool m_clocksLocked = false; unsigned int m_nvmlDeviceIdx = 0; void* m_nvmlLib = nullptr; // LibHandle (HMODULE on Windows) void lockGpuClocks(int deviceIndex, int requestedMHz); void unlockGpuClocks(); // -- CUDA Graph acceleration --------------------------------------------- // Pinned (page-locked) host buffers -- stable addresses required for graph // D2H copies. One pointer per output tensor, sized for maxBatchSize elements. std::vector m_pinnedOutputBuffers; std::vector m_pinnedOutputBufElems; // elements per buffer // Cached executable graphs, keyed by batch size. A new graph is captured // the first time each batch size is seen; subsequent calls reuse it. std::unordered_map m_graphExecs; Logger m_logger; bool m_verbose{ true }; // false for non-probe pool slots bool m_disableGraphs{ true }; // DISABLED by default — concurrent graph launches + uploads cause GPU deadlock on WDDM // -- Multi-GPU pool data --------------------------------------------------- // Per-slot descriptor. Slot engines are separate Engine instances; // they are never pooled recursively (their own m_slots deques are empty). // // Lifecycle in elastic mode: // alive (engine != nullptr, busy == false) -- idle, ready for inference // alive (engine != nullptr, busy == true) -- running inference // dead (engine == nullptr) -- released, VRAM freed // Dead entries stay in the deque to avoid invalidating pointers // held by other threads. tryGrowPool() reuses them. struct InferenceSlot { int deviceIndex = 0; bool busy = false; size_t memUsed = 0; ///< VRAM bytes this slot holds std::unique_ptr> engine; std::chrono::steady_clock::time_point lastUsedTime = std::chrono::steady_clock::now(); ///< For idle-release }; // Internal helpers -- implemented in EngineMultiGpu.inl bool loadSlots(const ANSCENTER::Options& baseOptions, const std::string& modelPath, const std::array& subVals, const std::array& divVals, bool normalize, bool fromOnnx, int maxSlotsPerGpu, double memSafetyFactor); bool runInferenceFromPool(const std::vector>& inputs, std::vector>>& featureVectors); InferenceSlot* tryGrowPool(bool bypassGrace = false); // -- Elastic pool storage ---------------------------------------------- // // std::deque is used instead of std::vector because push_back on a deque // does NOT invalidate references to existing elements. This is critical // because runInferenceFromPool holds a raw InferenceSlot* across the GPU // call while another thread may call tryGrowPool -> push_back concurrently. std::deque m_slots; ///< All managed Engine instances std::vector m_deviceInfos; ///< Per-device info from init mutable std::mutex m_slotMutex; ///< Protects m_slots busy flags std::condition_variable m_slotFreeCv; ///< Notified when any slot becomes idle std::mutex m_growMutex; ///< Serialises slot creation std::atomic m_activeCount{0}; ///< Running inference count std::atomic m_totalCapacity{0}; ///< Alive slot count (atomic -- read lock-free by public API) std::atomic m_nextSlotHint{0}; ///< Round-robin start index for slot scan // -- Elastic pool config (stored from loadSlots for on-demand growth) -- size_t m_memPerSlot{0}; std::string m_poolModelPath; std::array m_poolSubVals{}; std::array m_poolDivVals{}; bool m_poolNormalize{true}; bool m_poolFromOnnx{false}; double m_poolSafetyFactor{0.80}; bool m_elasticMode{false}; ///< true when maxSlotsPerGpu == -1 // -- Global elastic slot cap (shared across ALL pools on ALL GPUs) -- // Prevents CUDA driver SRW lock convoy when too many concurrent contexts exist. // Budget: ~4 slots per GB of total GPU VRAM (computed once on first pool init). // 4 GB → 12, 6 GB → 24, 8 GB → 32, 12 GB → 48, 24 GB → 96 static std::atomic s_globalElasticCount; ///< Current total elastic slots static std::atomic s_globalElasticMax; ///< Max total slots static std::once_flag s_globalCapInitFlag; ///< Ensures max is computed once // -- Elastic growth grace period -- // Delays elastic growth for N seconds after the most recent pool creation, // giving later models time to create their probe engines before elastic // growth from earlier pools consumes all free VRAM. static constexpr int s_elasticGraceSec{5}; ///< Grace period in seconds (reduced from 30s: shared pools need fewer probes) static std::atomic s_lastPoolCreatedMs; ///< Timestamp of last pool creation (ms since epoch) // -- Automatic idle-slot cleanup timer (elastic mode only) -- std::thread m_idleTimerThread; std::condition_variable m_idleTimerCv; std::mutex m_idleTimerMutex; std::atomic m_idleTimerStop{false}; double m_idleTimerIntervalSec{60.0}; ///< How often to check double m_idleTimerThresholdSec{600.0}; ///< Idle time (seconds) before releasing — 10 minutes to avoid churn void startIdleTimer(); void stopIdleTimer(); }; template Engine::Engine(const ANSCENTER::Options &options) : m_options(options) { NvDynLoader::Initialize(); // no-op after first call; safe to call repeatedly m_streamInitialized = false; m_inferenceStream = nullptr; m_lastBatchSize = -1; m_inputBatchSize = 0; m_memoryStream = nullptr; m_firstInferenceCall = true; m_batchShapeChangeLogged = false; m_hasDynamicSpatialDims = false; m_profileInitialized = false; m_lastSetInputDims.clear(); m_preprocessedInputs.clear(); } template Engine::Engine() { NvDynLoader::Initialize(); m_streamInitialized = false; m_inferenceStream = nullptr; m_lastBatchSize = -1; m_inputBatchSize = 0; m_memoryStream = nullptr; m_normalize = true; m_firstInferenceCall = true; m_batchShapeChangeLogged = false; m_hasDynamicSpatialDims = false; m_profileInitialized = false; m_lastSetInputDims.clear(); } template Engine::~Engine() { // ========================================================================= // CRITICAL: Destructors must NEVER throw. Every CUDA/TRT call below is // wrapped in try/catch so that a failure in one step does not skip the // remaining cleanup (especially m_engine / m_runtime reset, which release // TensorRT file handles and VRAM). // // Previous bug: clearGpuBuffers() called Util::checkCudaErrorCode() which // throws on cudaFree failure. That aborted the destructor mid-way, // leaving ICudaEngine/IRuntime alive → model folder locked on disk. // ========================================================================= // Stop the idle-slot cleanup timer (if running) before touching pool state try { stopIdleTimer(); } catch (...) {} // Destroy cached CUDA graphs try { for (auto& [bs, ge] : m_graphExecs) { if (ge) cudaGraphExecDestroy(ge); } m_graphExecs.clear(); } catch (...) {} // Free pinned output buffers try { for (T* p : m_pinnedOutputBuffers) { if (p) cudaFreeHost(p); } m_pinnedOutputBuffers.clear(); } catch (...) {} // Free GPU I/O buffers — use cudaFree directly (NOT checkCudaErrorCode which throws) try { if (!m_buffers.empty()) { for (void* ptr : m_buffers) { if (ptr) cudaFree(ptr); // ignore errors — destructor must not throw } m_buffers.clear(); } } catch (...) {} try { if (m_cudaStream) { cudaStreamDestroy(m_cudaStream); m_cudaStream = nullptr; } } catch (...) {} try { if (m_streamInitialized && m_inferenceStream) { cudaStreamDestroy(m_inferenceStream); m_inferenceStream = nullptr; m_streamInitialized = false; } } catch (...) {} try { if (m_memoryStream) { cudaStreamDestroy(m_memoryStream); m_memoryStream = nullptr; } } catch (...) {} // Release GPU clock lock if we hold one try { unlockGpuClocks(); } catch (...) {} // TRT destruction ordering: context MUST be destroyed before engine. // With shared_ptr, m_engine may outlive this instance (other tasks share it). // Explicit reset ensures correct order. try { m_context.reset(); } catch (...) {} // Release from TRTEngineCache (decrements refcount; engine freed when last user releases) try { if (m_usingCachedEngine && !m_cachedEnginePath.empty()) { TRTEngineCache::instance().release(m_cachedEnginePath, m_cachedGpuIndex); } } catch (...) {} // These MUST execute to release TensorRT file handles and VRAM. // Previously, a throw in clearGpuBuffers() skipped these lines, // leaving the model folder locked on disk. try { m_engine.reset(); } catch (...) {} try { m_runtime.reset(); } catch (...) {} } // ============================================================================ // NVML GPU Clock Lock — dynamically loaded, no build-time nvml.lib needed // ============================================================================ // NVML type / constant definitions (subset — avoids #include ) namespace nvml_types { using Return_t = unsigned int; // nvmlReturn_t using Device_t = void*; // nvmlDevice_t (opaque pointer) enum : unsigned int { SUCCESS = 0 }; enum ClockType : unsigned int { CLOCK_GRAPHICS = 0 }; } template void Engine::lockGpuClocks(int deviceIndex, int requestedMHz) { if (requestedMHz == 0) return; // disabled #ifdef _WIN32 // Load nvml.dll from System32 (installed by NVIDIA driver) auto hLib = LoadLibraryA("nvml.dll"); if (!hLib) { if (m_verbose) std::cout << "Warning: nvml.dll not found — GPU clock lock unavailable" << std::endl; return; } m_nvmlLib = static_cast(hLib); // Resolve NVML function pointers using fn_Init = nvml_types::Return_t(*)(); using fn_Shutdown = nvml_types::Return_t(*)(); using fn_GetHandle = nvml_types::Return_t(*)(unsigned int, nvml_types::Device_t*); using fn_GetMaxClock = nvml_types::Return_t(*)(nvml_types::Device_t, nvml_types::ClockType, unsigned int*); using fn_SetLocked = nvml_types::Return_t(*)(nvml_types::Device_t, unsigned int, unsigned int); using fn_ErrorString = const char*(*)(nvml_types::Return_t); auto pInit = reinterpret_cast(GetProcAddress(hLib, "nvmlInit_v2")); auto pShutdown = reinterpret_cast(GetProcAddress(hLib, "nvmlShutdown")); auto pGetHandle = reinterpret_cast(GetProcAddress(hLib, "nvmlDeviceGetHandleByIndex_v2")); auto pGetMaxClk = reinterpret_cast(GetProcAddress(hLib, "nvmlDeviceGetMaxClockInfo")); auto pSetLocked = reinterpret_cast(GetProcAddress(hLib, "nvmlDeviceSetGpuLockedClocks")); auto pErrStr = reinterpret_cast(GetProcAddress(hLib, "nvmlErrorString")); if (!pInit || !pGetHandle || !pSetLocked) { if (m_verbose) std::cout << "Warning: NVML symbols not found — GPU clock lock unavailable" << std::endl; FreeLibrary(hLib); m_nvmlLib = nullptr; return; } auto errName = [&](nvml_types::Return_t r) -> const char* { return pErrStr ? pErrStr(r) : "unknown"; }; nvml_types::Return_t rc = pInit(); if (rc != nvml_types::SUCCESS) { if (m_verbose) std::cout << "Warning: nvmlInit failed: " << errName(rc) << std::endl; FreeLibrary(hLib); m_nvmlLib = nullptr; return; } nvml_types::Device_t nvmlDev = nullptr; rc = pGetHandle(static_cast(deviceIndex), &nvmlDev); if (rc != nvml_types::SUCCESS) { if (m_verbose) std::cout << "Warning: nvmlDeviceGetHandleByIndex(" << deviceIndex << ") failed: " << errName(rc) << std::endl; pShutdown(); FreeLibrary(hLib); m_nvmlLib = nullptr; return; } unsigned int targetMHz = 0; if (requestedMHz < 0) { // Auto mode: query max graphics clock and use ~80% unsigned int maxClock = 0; if (pGetMaxClk) { rc = pGetMaxClk(nvmlDev, nvml_types::CLOCK_GRAPHICS, &maxClock); if (rc == nvml_types::SUCCESS && maxClock > 0) { targetMHz = static_cast(maxClock * 0.80); targetMHz = (targetMHz / 15) * 15; // Round down to 15 MHz step if (targetMHz < 210) targetMHz = 210; } else { if (m_verbose) std::cout << "Warning: Could not query max GPU clock — skipping clock lock" << std::endl; pShutdown(); FreeLibrary(hLib); m_nvmlLib = nullptr; return; } } } else { targetMHz = static_cast(requestedMHz); } rc = pSetLocked(nvmlDev, targetMHz, targetMHz); if (rc == nvml_types::SUCCESS) { m_clocksLocked = true; m_nvmlDeviceIdx = static_cast(deviceIndex); if (m_verbose) std::cout << "Info: GPU clocks locked at " << targetMHz << " MHz (device " << deviceIndex << ")" << std::endl; } else { if (m_verbose) { std::cout << "Warning: nvmlDeviceSetGpuLockedClocks failed: " << errName(rc) << std::endl; std::cout << " (Run as Administrator, or use: nvidia-smi -lgc " << targetMHz << "," << targetMHz << ")" << std::endl; } pShutdown(); FreeLibrary(hLib); m_nvmlLib = nullptr; } #else // Linux: similar pattern with dlopen("libnvidia-ml.so.1", RTLD_LAZY) // Left as TODO — nvidia-smi -lgc works without elevation on Linux. if (m_verbose) std::cout << "Info: GPU clock lock not implemented on this platform — use nvidia-smi -lgc" << std::endl; #endif } template void Engine::unlockGpuClocks() { if (!m_clocksLocked || !m_nvmlLib) return; #ifdef _WIN32 auto hLib = static_cast(m_nvmlLib); using fn_GetHandle = nvml_types::Return_t(*)(unsigned int, nvml_types::Device_t*); using fn_ResetClk = nvml_types::Return_t(*)(nvml_types::Device_t); using fn_Shutdown = nvml_types::Return_t(*)(); auto pGetHandle = reinterpret_cast(GetProcAddress(hLib, "nvmlDeviceGetHandleByIndex_v2")); auto pResetClk = reinterpret_cast(GetProcAddress(hLib, "nvmlDeviceResetGpuLockedClocks")); auto pShutdown = reinterpret_cast(GetProcAddress(hLib, "nvmlShutdown")); if (pGetHandle && pResetClk) { nvml_types::Device_t nvmlDev = nullptr; if (pGetHandle(m_nvmlDeviceIdx, &nvmlDev) == nvml_types::SUCCESS) { pResetClk(nvmlDev); if (m_verbose) std::cout << "Info: GPU clocks unlocked (device " << m_nvmlDeviceIdx << ")" << std::endl; } } if (pShutdown) pShutdown(); FreeLibrary(hLib); #endif m_nvmlLib = nullptr; m_clocksLocked = false; } namespace Util { inline bool doesFileExist(const std::string& filepath) { std::ifstream f(filepath.c_str()); return f.good(); } inline void checkCudaErrorCode(cudaError_t code) { if (code != cudaSuccess) { std::string errMsg = "CUDA operation failed with code: " + std::to_string(code) + " (" + cudaGetErrorName(code) + "), with message: " + cudaGetErrorString(code); std::cout< getFilesInDirectory(const std::string& dirPath) { std::vector fileNames; for (const auto& entry : std::filesystem::directory_iterator(dirPath)) { if (entry.is_regular_file()) { fileNames.push_back(entry.path().string()); } } return fileNames; } } #include "engine/EngineRunInference.inl" #include "engine/EngineUtilities.inl" #include "engine/EngineBuildLoadNetwork.inl" #include "engine/EngineMultiGpu.inl" // ============================================================================ // Explicit template instantiation declarations -- DLL consumers only // // These tell each consuming translation unit: "Engine is already fully // compiled and exported by the DLL. Do NOT re-instantiate it here; resolve // the symbols at link time from the DLL's import library instead." // // This suppresses redundant code generation in every consuming TU and // guarantees that only ONE definition of each Engine specialisation exists // across the whole program (in the DLL). // // To add a new exported instantiation (e.g. Engine): // 1. Add a matching template class ENGINE_API Engine; in engine.cpp // 2. Add the corresponding extern declaration below. // ============================================================================ #ifndef ENGINE_EXPORTS extern template class ENGINE_API Engine; #endif