// ============================================================================
// Engine.h — templated TensorRT inference engine with optional multi-GPU
// slot pooling. See the Engine<T> class declaration below for details.
// ============================================================================
#pragma once

#include "engine_api.h" // ENGINE_API (__declspec dllexport / dllimport)

// -- TRT / ONNX-parser API decoration override -----------------------------
// Must appear BEFORE the first inclusion of NvInfer.h / NvOnnxParser.h.
//
// By default these macros expand to __declspec(dllimport) for consumers of
// the TRT DLL, which causes the compiler to emit indirect import-table calls.
// Overriding them to empty makes all calls direct, so the linker resolves
// them against the extern "C" stub definitions in NvDynLoader.cpp instead of
// against nvinfer_XX.lib / nvonnxparser_XX.lib.
//
// Remove nvinfer_XX.lib and nvonnxparser_XX.lib from Linker -> Input.
// Link ANSLibsLoader.lib instead. See NvDynLoader.h for full details.
#ifndef TENSORRTAPI
#    define TENSORRTAPI
#endif
#ifndef NVONNXPARSER_API
#    define NVONNXPARSER_API
#endif

#include "NvOnnxParser.h"

// C++ standard library
#include <array>              // std::array in network-config signatures (was transitive)
#include <atomic>
#include <chrono>
#include <condition_variable>
#include <cstdint>            // int32_t / int64_t members (was transitive)
#include <deque>
#include <filesystem>
#include <fstream>
#include <iostream>           // std::cout / std::cerr in logEngineEvent (was transitive)
#include <memory>
#include <mutex>
#include <string>
#include <thread>
#include <unordered_map>
#include <vector>             // std::vector throughout the public API (was transitive)

// CUDA
#include <cuda_fp16.h>
#include <cuda_runtime.h>

// OpenCV
#include <opencv2/core/cuda.hpp>
#include <opencv2/core/cuda_stream_accessor.hpp>
#include <opencv2/cudaarithm.hpp>
#include <opencv2/cudaimgproc.hpp>
#include <opencv2/cudawarping.hpp>
#include <opencv2/opencv.hpp>

// Project
#include "ANSLicense.h"
#include "Int8Calibrator.h"
#include "NvDynLoader.h" // from ANSLibsLoader/include/
#include "engine/TRTEngineCache.h"
#include "interfaces/IEngine.h" // forward slash: portable (backslash is MSVC-only)

#ifdef _WIN32
#include <windows.h>
#endif
|
// Class to extend TensorRT logger.
// Thin nvinfer1::ILogger implementation handed to the TensorRT builder and
// runtime; TRT calls log() for every diagnostic message it emits.
class ENGINE_API Logger : public nvinfer1::ILogger {
    // Callback invoked by TensorRT for each message. Severity filtering and
    // the output destination are decided in the .cpp definition (not visible
    // in this header).
    void log(Severity severity, const char *msg) noexcept override;
};
|
// ============================================================================
// Diagnostic logging to the Windows Event Viewer ("ANSLogger" source).
// Lets internal engine failure points be traced even when the host process
// (e.g. LabView) swallows stdout/stderr. Always mirrors to the console too.
// ============================================================================
inline void logEngineEvent(const std::string& msg, bool isError = false) {
#ifdef _WIN32
    // Register the event source exactly once and keep the handle for the
    // lifetime of the process. A per-message RegisterEventSourceA /
    // DeregisterEventSource cycle overwhelms the Windows Event Log service
    // under heavy load (10+ AI tasks), producing ERROR_UNRECOGNIZED_VOLUME
    // (1005) floods that freeze the system.
    static HANDLE hEventSource = RegisterEventSourceA(nullptr, "ANSLogger");
    if (hEventSource) {
        const char* strings[1] = { msg.c_str() };
        ReportEventA(hEventSource,
                     isError ? EVENTLOG_ERROR_TYPE : EVENTLOG_INFORMATION_TYPE,
                     0,                        // category
                     isError ? 1004 : 1000,    // event ID
                     nullptr,                  // no user SID
                     1, 0, strings, nullptr);  // one string, no raw data
    }
#endif
    // Console mirror — errors to stderr, everything else to stdout.
    (isError ? std::cerr : std::cout) << msg << std::endl;
}
|
// ============================================================================
// GpuDeviceInfo -- snapshot of one CUDA device captured at pool-init time.
// Returned by Engine<T>::enumerateDevices() and stored inside the pool.
// ============================================================================
struct ENGINE_API GpuDeviceInfo {
    int index = 0;                      ///< CUDA device ordinal
    std::string name;                   ///< Device name reported by the driver
    size_t totalMemoryBytes = 0;        ///< Physical VRAM
    size_t freeMemoryAtInitBytes = 0;   ///< Free VRAM when pool started
    int computeMajor = 0;               ///< Compute capability, major version
    int computeMinor = 0;               ///< Compute capability, minor version
    int slotsAllocated = 0;             ///< Engine<T> instances on this GPU
    size_t memoryPerSlotBytes = 0;      ///< VRAM bytes each slot occupies
};
|
template <typename T>
|
||
|
|
class Engine : public IEngine<T> {
|
||
|
|
public:
|
||
|
|
Engine();
|
||
|
|
Engine(const ANSCENTER::Options &options);
|
||
|
|
~Engine();
|
||
|
|
|
||
|
|
// Build the onnx model into a TensorRT engine file, cache the model to disk
|
||
|
|
// (to avoid rebuilding in future), and then load the model into memory The
|
||
|
|
// default implementation will normalize values between [0.f, 1.f] Setting the
|
||
|
|
// normalize flag to false will leave values between [0.f, 255.f] (some
|
||
|
|
// converted models may require this). If the model requires values to be
|
||
|
|
// normalized between [-1.f, 1.f], use the following params:
|
||
|
|
// subVals = {0.5f, 0.5f, 0.5f};
|
||
|
|
// divVals = {0.5f, 0.5f, 0.5f};
|
||
|
|
// normalize = true;
|
||
|
|
bool buildLoadNetwork(std::string onnxModelPath, const std::array<float, 3> &subVals = {0.f, 0.f, 0.f},
|
||
|
|
const std::array<float, 3> &divVals = {1.f, 1.f, 1.f}, bool normalize = true) override;
|
||
|
|
|
||
|
|
// Load a TensorRT engine file from disk into memory
|
||
|
|
// The default implementation will normalize values between [0.f, 1.f]
|
||
|
|
// Setting the normalize flag to false will leave values between [0.f, 255.f]
|
||
|
|
// (some converted models may require this). If the model requires values to
|
||
|
|
// be normalized between [-1.f, 1.f], use the following params:
|
||
|
|
// subVals = {0.5f, 0.5f, 0.5f};
|
||
|
|
// divVals = {0.5f, 0.5f, 0.5f};
|
||
|
|
// normalize = true;
|
||
|
|
bool loadNetwork(std::string trtModelPath, const std::array<float, 3> &subVals = {0.f, 0.f, 0.f},
|
||
|
|
const std::array<float, 3> &divVals = {1.f, 1.f, 1.f}, bool normalize = true) override;
|
||
|
|
|
||
|
|
// -- Multi-GPU pool overloads (non-virtual; call through Engine<T>* or
|
||
|
|
// unique_ptr<Engine<T>>, not through IEngine<T>*).
|
||
|
|
//
|
||
|
|
// maxSlotsPerGpu == 0 -> BYPASS: identical to the 4-param single-GPU path
|
||
|
|
// above. Used by OptimizeModelStr for temporary engines.
|
||
|
|
// maxSlotsPerGpu == 1 -> ROUND-ROBIN (default): 1 slot per GPU, multi-GPU
|
||
|
|
// round-robin dispatch. Tasks queue when all slots
|
||
|
|
// busy. Best balance of VRAM usage and multi-GPU
|
||
|
|
// utilisation.
|
||
|
|
// maxSlotsPerGpu == -1 -> ELASTIC pool: 1 probe slot at init, additional
|
||
|
|
// slots created on-demand when concurrent requests
|
||
|
|
// arrive, released when idle via releaseIdleSlots().
|
||
|
|
// Higher throughput but higher VRAM usage — only for
|
||
|
|
// large GPUs (≥ 8 GB).
|
||
|
|
// maxSlotsPerGpu > 1 -> PRE-ALLOCATED pool: cap at that many slots/GPU,
|
||
|
|
// all created at init time.
|
||
|
|
//
|
||
|
|
// Usage -- opt into pool mode by adding ONE extra argument:
|
||
|
|
// m_trtEngine->buildLoadNetwork(path, sub, div, norm); // single-GPU (unchanged)
|
||
|
|
// m_trtEngine->buildLoadNetwork(path, sub, div, norm, 1); // round-robin (default)
|
||
|
|
// m_trtEngine->buildLoadNetwork(path, sub, div, norm, -1); // elastic pool
|
||
|
|
// m_trtEngine->buildLoadNetwork(path, sub, div, norm, 4); // pre-allocated, 4 slots/GPU
|
||
|
|
bool buildLoadNetwork(std::string onnxModelPath,
|
||
|
|
const std::array<float, 3> &subVals,
|
||
|
|
const std::array<float, 3> &divVals,
|
||
|
|
bool normalize,
|
||
|
|
int maxSlotsPerGpu,
|
||
|
|
double memSafetyFactor = 0.80);
|
||
|
|
bool loadNetwork(std::string trtModelPath,
|
||
|
|
const std::array<float, 3> &subVals,
|
||
|
|
const std::array<float, 3> &divVals,
|
||
|
|
bool normalize,
|
||
|
|
int maxSlotsPerGpu,
|
||
|
|
double memSafetyFactor = 0.80);
|
||
|
|
|
||
|
|
// Run inference.
|
||
|
|
// Input format [input][batch][cv::cuda::GpuMat]
|
||
|
|
// Output format [batch][output][feature_vector]
|
||
|
|
bool runInference(const std::vector<std::vector<cv::cuda::GpuMat>> &inputs, std::vector<std::vector<std::vector<T>>> &featureVectors) override;
|
||
|
|
|
||
|
|
// Utility method for resizing an image while maintaining the aspect ratio by
|
||
|
|
// adding padding to smaller dimension after scaling While letterbox padding
|
||
|
|
// normally adds padding to top & bottom, or left & right sides, this
|
||
|
|
// implementation only adds padding to the right or bottom side This is done
|
||
|
|
// so that it's easier to convert detected coordinates (ex. YOLO model) back
|
||
|
|
// to the original reference frame.
|
||
|
|
static cv::cuda::GpuMat resizeKeepAspectRatioPadRightBottom(const cv::cuda::GpuMat &input, size_t height, size_t width,
|
||
|
|
const cv::Scalar &bgcolor = cv::Scalar(0, 0, 0));
|
||
|
|
|
||
|
|
[[nodiscard]] const std::vector<nvinfer1::Dims3> &getInputDims() const override { return m_inputDims; };
|
||
|
|
[[nodiscard]] const std::vector<nvinfer1::Dims> &getOutputDims() const override { return m_outputDims; };
|
||
|
|
[[nodiscard]] const ANSCENTER::Options& getOptions() const { return m_options; }
|
||
|
|
|
||
|
|
/// Return the CUDA device index that preprocessing should target.
|
||
|
|
/// In pool mode this returns the device of the first alive slot;
|
||
|
|
/// in single-engine mode it returns m_options.deviceIndex.
|
||
|
|
/// Call cudaSetDevice(engine->getPreferredDeviceIndex()) before
|
||
|
|
/// any cv::cuda::GpuMat operations to ensure multi-GPU correctness.
|
||
|
|
[[nodiscard]] int getPreferredDeviceIndex() const {
|
||
|
|
// Pool mode: use the first alive slot's device
|
||
|
|
if (!m_slots.empty()) {
|
||
|
|
for (const auto& s : m_slots) {
|
||
|
|
if (s.engine) return s.deviceIndex;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
return m_options.deviceIndex;
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Convenience: set the calling thread's CUDA device to match this engine.
|
||
|
|
/// Skips the call if already on the correct device (WDDM overhead avoidance).
|
||
|
|
void setDeviceContext() const {
|
||
|
|
const int target = getPreferredDeviceIndex();
|
||
|
|
int current = -1;
|
||
|
|
cudaGetDevice(¤t);
|
||
|
|
if (current != target) {
|
||
|
|
cudaSetDevice(target);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
// Utility method for transforming triple nested output array into 2D array
|
||
|
|
// Should be used when the output batch size is 1, but there are multiple
|
||
|
|
// output feature vectors
|
||
|
|
static void transformOutput(std::vector<std::vector<std::vector<T>>> &input, std::vector<std::vector<T>> &output);
|
||
|
|
|
||
|
|
// Utility method for transforming triple nested output array into single
|
||
|
|
// array Should be used when the output batch size is 1, and there is only a
|
||
|
|
// single output feature vector
|
||
|
|
static void transformOutput(std::vector<std::vector<std::vector<T>>> &input, std::vector<T> &output);
|
||
|
|
// Convert NHWC to NCHW and apply scaling and mean subtraction.
|
||
|
|
// Pass a cv::cuda::Stream to run all GPU kernels on that stream instead of
|
||
|
|
// the CUDA null (default) stream. This eliminates global sync barriers and
|
||
|
|
// enables CUDA-graph-safe preprocessing on OpenCV 4.13+.
|
||
|
|
static cv::cuda::GpuMat blobFromGpuMats(const std::vector<cv::cuda::GpuMat> &batchInput, const std::array<float, 3> &subVals,
|
||
|
|
const std::array<float, 3> &divVals, bool normalize, bool swapRB = false,
|
||
|
|
cv::cuda::Stream &stream = cv::cuda::Stream::Null());
|
||
|
|
// Converts the engine options into a string
|
||
|
|
std::string serializeEngineOptions(const ANSCENTER::Options& options, const std::string& onnxModelPath);
|
||
|
|
// Build the network (single attempt)
|
||
|
|
bool build(std::string onnxModelPath, const std::array<float, 3>& subVals, const std::array<float, 3>& divVals, bool normalize);
|
||
|
|
// SEH-safe wrapper around build(). Catches access violations that corrupt
|
||
|
|
// the CUDA context (e.g. opset 19+ ONNX models crash TRT's parser).
|
||
|
|
// Returns false and sets *outSehCode on crash.
|
||
|
|
bool buildSafe(std::string onnxModelPath, const std::array<float, 3>& subVals, const std::array<float, 3>& divVals, bool normalize, unsigned long* outSehCode);
|
||
|
|
// Build with auto-retry for dynamic spatial dimension models.
|
||
|
|
// Pre-analyzes the ONNX model; if H/W are dynamic and the max profile
|
||
|
|
// is too large for GPU VRAM, automatically reduces max spatial dims and
|
||
|
|
// retries until build succeeds or all candidates are exhausted.
|
||
|
|
// Fixed-spatial models fall through to a single build() call.
|
||
|
|
bool buildWithRetry(std::string onnxModelPath, const std::array<float, 3>& subVals, const std::array<float, 3>& divVals, bool normalize);
|
||
|
|
void getDeviceNames(std::vector<std::string>& deviceNames);
|
||
|
|
void clearGpuBuffers();
|
||
|
|
int getBindingIndexByName(const std::string& name);
|
||
|
|
void UpdateOptions(const ANSCENTER::Options& options) { m_options = options; }
|
||
|
|
void warmUp(int iterations = 5);
|
||
|
|
void setVerbose(bool v) { m_verbose = v; }
|
||
|
|
void setDisableGraphs(bool v) { m_disableGraphs = v; }
|
||
|
|
|
||
|
|
// -- Multi-GPU pool management (replaces MultiGpuEngineManager<T>) ---------
|
||
|
|
//
|
||
|
|
// Call ONE of the two initialize methods below instead of
|
||
|
|
// buildLoadNetwork() / loadNetwork(), then use runInference() exactly as
|
||
|
|
// you would for single-GPU. runInference() dispatches automatically to
|
||
|
|
// the first idle slot across all GPUs.
|
||
|
|
//
|
||
|
|
// Single-GPU code that calls buildLoadNetwork() + runInference() directly
|
||
|
|
// is completely unaffected -- it follows the existing code path unchanged.
|
||
|
|
|
||
|
|
/**
|
||
|
|
* Build from an ONNX file and create an Engine pool across all GPUs.
|
||
|
|
*
|
||
|
|
* @param baseOptions Configuration template; deviceIndex is ignored
|
||
|
|
* and overridden per slot.
|
||
|
|
* @param onnxModelPath Path to the .onnx model.
|
||
|
|
* @param subVals Per-channel subtraction for normalisation.
|
||
|
|
* @param divVals Per-channel divisor for normalisation.
|
||
|
|
* @param normalize Scale pixel values to [0, 1] before inference.
|
||
|
|
* @param maxSlotsPerGpu Cap slots per GPU (-1 = memory-limited only).
|
||
|
|
* @param memSafetyFactor Fraction of free VRAM to consume (default 0.80).
|
||
|
|
*/
|
||
|
|
bool initializePool(const ANSCENTER::Options& baseOptions,
|
||
|
|
const std::string& onnxModelPath,
|
||
|
|
const std::array<float, 3>& subVals = {0.f, 0.f, 0.f},
|
||
|
|
const std::array<float, 3>& divVals = {1.f, 1.f, 1.f},
|
||
|
|
bool normalize = true,
|
||
|
|
int maxSlotsPerGpu = -1,
|
||
|
|
double memSafetyFactor = 0.80);
|
||
|
|
|
||
|
|
/**
|
||
|
|
* Load a pre-built TRT engine and create a pool -- no ONNX build step.
|
||
|
|
*/
|
||
|
|
bool initializePoolFromEngine(const ANSCENTER::Options& baseOptions,
|
||
|
|
const std::string& trtEnginePath,
|
||
|
|
const std::array<float, 3>& subVals = {0.f, 0.f, 0.f},
|
||
|
|
const std::array<float, 3>& divVals = {1.f, 1.f, 1.f},
|
||
|
|
bool normalize = true,
|
||
|
|
int maxSlotsPerGpu = -1,
|
||
|
|
double memSafetyFactor = 0.80);
|
||
|
|
|
||
|
|
/** Enumerate all CUDA-capable GPUs without loading any model. */
|
||
|
|
static std::vector<GpuDeviceInfo> enumerateDevices();
|
||
|
|
|
||
|
|
/** Total Engine<T> slots across all GPUs (valid after initializePool). */
|
||
|
|
int getTotalCapacity() const { return m_totalCapacity; }
|
||
|
|
|
||
|
|
/** Slots currently executing inference (approximate, lock-free). */
|
||
|
|
int getActiveInferences() const { return m_activeCount.load(); }
|
||
|
|
|
||
|
|
/** Idle slots not currently running inference. */
|
||
|
|
int getAvailableSlots() const { return m_totalCapacity - m_activeCount.load(); }
|
||
|
|
|
||
|
|
/** True when every alive slot is busy.
|
||
|
|
* In elastic mode, new work will trigger on-demand slot creation
|
||
|
|
* rather than being rejected (unless all GPUs are physically full). */
|
||
|
|
bool isAtCapacity() const { return getAvailableSlots() <= 0; }
|
||
|
|
|
||
|
|
/** Device snapshots captured at initializePool() time. */
|
||
|
|
const std::vector<GpuDeviceInfo>& getDeviceInfo() const { return m_deviceInfos; }
|
||
|
|
|
||
|
|
/** Print a human-readable pool capacity and device report. */
|
||
|
|
void printCapacityReport() const;
|
||
|
|
|
||
|
|
/**
|
||
|
|
* Release idle slots whose last inference was more than @p idleSeconds
|
||
|
|
* ago. The probe slot (first slot) is never released so the model
|
||
|
|
* remains instantly usable. Returns the number of slots released.
|
||
|
|
*
|
||
|
|
* Typical usage -- call periodically from a background timer:
|
||
|
|
* @code
|
||
|
|
* int freed = engine->releaseIdleSlots(30.0);
|
||
|
|
* @endcode
|
||
|
|
*/
|
||
|
|
int releaseIdleSlots(double idleSeconds = 30.0);
|
||
|
|
|
||
|
|
/** Actual max spatial dims from the loaded engine's optimization profile.
|
||
|
|
* Returns -1 if the engine is not loaded yet or dim is fixed. */
|
||
|
|
int getProfileMaxHeight() const { return m_profileMaxHeight; }
|
||
|
|
int getProfileMaxWidth() const { return m_profileMaxWidth; }
|
||
|
|
|
||
|
|
/// Grow the pool by up to @p count slots (demand-driven, bypasses grace period).
|
||
|
|
/// Returns the number of slots actually created.
|
||
|
|
int growPool(int count);
|
||
|
|
|
||
|
|
private:
|
||
|
|
// Serialises concurrent runInference() calls on the SAME Engine instance.
|
||
|
|
// Without this, two threads with different batch sizes corrupt m_buffers,
|
||
|
|
// m_lastBatchSize, and the TRT execution context — causing fatal "illegal
|
||
|
|
// memory access" errors that permanently destroy the CUDA context.
|
||
|
|
// Pool-mode slots have their own busy-flag dispatch, so this only matters
|
||
|
|
// for the single-engine path (m_slots.empty()).
|
||
|
|
mutable std::mutex m_inferenceMutex;
|
||
|
|
|
||
|
|
int32_t m_lastBatchSize = -1;
|
||
|
|
|
||
|
|
// BATCH STATE MANAGEMENT (THREAD-SAFE)
|
||
|
|
//mutable std::mutex m_batchStateMutex; // Protects all batch-related state
|
||
|
|
//size_t m_targetStableBatchSize = 4; // Default optimal batch size
|
||
|
|
//int m_batchSizeChangeCounter = 0;
|
||
|
|
//const int BATCH_STABILITY_THRESHOLD = 15; // Frames before allowing downsize
|
||
|
|
//std::deque<size_t> m_recentBatchSizes;
|
||
|
|
//const size_t BATCH_HISTORY_SIZE = 30;
|
||
|
|
//std::chrono::steady_clock::time_point m_lastBatchSizeChange;
|
||
|
|
//// Context reshape mutex (separate from batch state)
|
||
|
|
//mutable std::mutex m_contextReshapeMutex;
|
||
|
|
|
||
|
|
//// Helper methods
|
||
|
|
//size_t DetermineOptimalBatchSize(size_t requestedSize);
|
||
|
|
//std::vector<std::vector<cv::cuda::GpuMat>> PadBatchToSize(const std::vector<std::vector<cv::cuda::GpuMat>>& inputs,size_t targetSize);
|
||
|
|
|
||
|
|
// Normalization, scaling, and mean subtraction of inputs
|
||
|
|
std::array<float, 3> m_subVals{};
|
||
|
|
std::array<float, 3> m_divVals{};
|
||
|
|
bool m_normalize;
|
||
|
|
|
||
|
|
// Holds pointers to the input and output GPU buffers
|
||
|
|
std::vector<void *> m_buffers;
|
||
|
|
std::vector<uint32_t> m_outputLengths{};
|
||
|
|
std::vector<nvinfer1::Dims3> m_inputDims;
|
||
|
|
std::vector<nvinfer1::Dims> m_outputDims;
|
||
|
|
std::vector<std::string> m_IOTensorNames;
|
||
|
|
int32_t m_inputBatchSize = 0;
|
||
|
|
bool m_firstInferenceCall = true; // Per-instance first-call diagnostics flag
|
||
|
|
bool m_batchShapeChangeLogged = false; // Suppress per-tensor shape dump after first batch-size change
|
||
|
|
bool m_hasDynamicSpatialDims = false; // True if any input has dynamic H or W (-1)
|
||
|
|
std::vector<nvinfer1::Dims> m_lastSetInputDims; // Cache of last setInputShape() dims per tensor — skip if unchanged
|
||
|
|
int m_profileMaxHeight = -1; // Max H from optimization profile (after load)
|
||
|
|
int m_profileMaxWidth = -1; // Max W from optimization profile (after load)
|
||
|
|
|
||
|
|
// Must keep IRuntime around for inference, see:
|
||
|
|
// https://forums.developer.nvidia.com/t/is-it-safe-to-deallocate-nvinfer1-iruntime-after-creating-an-nvinfer1-icudaengine-but-before-running-inference-with-said-icudaengine/255381/2?u=cyruspk4w6
|
||
|
|
//
|
||
|
|
// m_runtime and m_engine are shared_ptr to support TRTEngineCache:
|
||
|
|
// multiple Engine<T> instances loading the same model share one ICudaEngine
|
||
|
|
// in VRAM, each with its own IExecutionContext.
|
||
|
|
std::shared_ptr<nvinfer1::IRuntime> m_runtime = nullptr;
|
||
|
|
std::unique_ptr<Int8EntropyCalibrator2> m_calibrator = nullptr;
|
||
|
|
std::shared_ptr<nvinfer1::ICudaEngine> m_engine = nullptr;
|
||
|
|
std::unique_ptr<nvinfer1::IExecutionContext> m_context = nullptr;
|
||
|
|
|
||
|
|
// TRTEngineCache tracking — used by destructor to release shared engine
|
||
|
|
std::string m_cachedEnginePath;
|
||
|
|
int m_cachedGpuIndex = -1;
|
||
|
|
bool m_usingCachedEngine = false;
|
||
|
|
bool m_skipEngineCache = false; // true = bypass TRTEngineCache (for OptimizeModelStr)
|
||
|
|
bool m_skipOnnxRebuild = false; // true = don't delete/rebuild corrupt engine files (elastic growth)
|
||
|
|
bool m_skipOnnxBuild = false; // true = skip ONNX→TRT build if no cached engine (demand growth)
|
||
|
|
bool m_lastLoadFailedVRAM = false; // set by loadNetwork when failure is due to insufficient VRAM
|
||
|
|
bool m_forceNoPool = false; // true = force single-GPU path, skip multi-GPU pool
|
||
|
|
|
||
|
|
ANSCENTER::Options m_options;
|
||
|
|
cudaStream_t m_cudaStream = nullptr;
|
||
|
|
|
||
|
|
cudaStream_t m_inferenceStream;
|
||
|
|
bool m_streamInitialized;
|
||
|
|
//int32_t m_lastBatchSize;
|
||
|
|
cudaStream_t m_memoryStream; // ADD THIS - separate stream for memory operations
|
||
|
|
std::vector<cv::cuda::GpuMat> m_preprocessedInputs; // Keep inputs alive
|
||
|
|
|
||
|
|
|
||
|
|
// Thermal management (ADD THESE)
|
||
|
|
//int m_consecutiveInferences;
|
||
|
|
//int m_thermalCooldownThreshold;
|
||
|
|
//int m_thermalCooldownDelayMs;
|
||
|
|
|
||
|
|
bool m_profileInitialized;
|
||
|
|
|
||
|
|
// -- GPU clock lock (NVML) -------------------------------------------------
|
||
|
|
// Dynamically loaded NVML to lock GPU clocks and prevent power throttling.
|
||
|
|
// No build-time dependency on nvml.lib — loads nvml.dll at runtime.
|
||
|
|
bool m_clocksLocked = false;
|
||
|
|
unsigned int m_nvmlDeviceIdx = 0;
|
||
|
|
void* m_nvmlLib = nullptr; // LibHandle (HMODULE on Windows)
|
||
|
|
|
||
|
|
void lockGpuClocks(int deviceIndex, int requestedMHz);
|
||
|
|
void unlockGpuClocks();
|
||
|
|
|
||
|
|
// -- CUDA Graph acceleration ---------------------------------------------
|
||
|
|
// Pinned (page-locked) host buffers -- stable addresses required for graph
|
||
|
|
// D2H copies. One pointer per output tensor, sized for maxBatchSize elements.
|
||
|
|
std::vector<T*> m_pinnedOutputBuffers;
|
||
|
|
std::vector<size_t> m_pinnedOutputBufElems; // elements per buffer
|
||
|
|
// Cached executable graphs, keyed by batch size. A new graph is captured
|
||
|
|
// the first time each batch size is seen; subsequent calls reuse it.
|
||
|
|
std::unordered_map<int, cudaGraphExec_t> m_graphExecs;
|
||
|
|
|
||
|
|
Logger m_logger;
|
||
|
|
bool m_verbose{ true }; // false for non-probe pool slots
|
||
|
|
bool m_disableGraphs{ false }; // true for pool slots — concurrent graph captures corrupt CUDA context
|
||
|
|
|
||
|
|
// -- Multi-GPU pool data ---------------------------------------------------
|
||
|
|
|
||
|
|
// Per-slot descriptor. Slot engines are separate Engine<T> instances;
|
||
|
|
// they are never pooled recursively (their own m_slots deques are empty).
|
||
|
|
//
|
||
|
|
// Lifecycle in elastic mode:
|
||
|
|
// alive (engine != nullptr, busy == false) -- idle, ready for inference
|
||
|
|
// alive (engine != nullptr, busy == true) -- running inference
|
||
|
|
// dead (engine == nullptr) -- released, VRAM freed
|
||
|
|
// Dead entries stay in the deque to avoid invalidating pointers
|
||
|
|
// held by other threads. tryGrowPool() reuses them.
|
||
|
|
struct InferenceSlot {
|
||
|
|
int deviceIndex = 0;
|
||
|
|
bool busy = false;
|
||
|
|
size_t memUsed = 0; ///< VRAM bytes this slot holds
|
||
|
|
std::unique_ptr<Engine<T>> engine;
|
||
|
|
std::chrono::steady_clock::time_point lastUsedTime
|
||
|
|
= std::chrono::steady_clock::now(); ///< For idle-release
|
||
|
|
};
|
||
|
|
|
||
|
|
// Internal helpers -- implemented in EngineMultiGpu.inl
|
||
|
|
bool loadSlots(const ANSCENTER::Options& baseOptions,
|
||
|
|
const std::string& modelPath,
|
||
|
|
const std::array<float, 3>& subVals,
|
||
|
|
const std::array<float, 3>& divVals,
|
||
|
|
bool normalize,
|
||
|
|
bool fromOnnx,
|
||
|
|
int maxSlotsPerGpu,
|
||
|
|
double memSafetyFactor);
|
||
|
|
|
||
|
|
bool runInferenceFromPool(const std::vector<std::vector<cv::cuda::GpuMat>>& inputs,
|
||
|
|
std::vector<std::vector<std::vector<T>>>& featureVectors);
|
||
|
|
|
||
|
|
InferenceSlot* tryGrowPool(bool bypassGrace = false);
|
||
|
|
|
||
|
|
// -- Elastic pool storage ----------------------------------------------
|
||
|
|
//
|
||
|
|
// std::deque is used instead of std::vector because push_back on a deque
|
||
|
|
// does NOT invalidate references to existing elements. This is critical
|
||
|
|
// because runInferenceFromPool holds a raw InferenceSlot* across the GPU
|
||
|
|
// call while another thread may call tryGrowPool -> push_back concurrently.
|
||
|
|
std::deque<InferenceSlot> m_slots; ///< All managed Engine<T> instances
|
||
|
|
std::vector<GpuDeviceInfo> m_deviceInfos; ///< Per-device info from init
|
||
|
|
mutable std::mutex m_slotMutex; ///< Protects m_slots busy flags
|
||
|
|
std::condition_variable m_slotFreeCv; ///< Notified when any slot becomes idle
|
||
|
|
std::mutex m_growMutex; ///< Serialises slot creation
|
||
|
|
std::atomic<int> m_activeCount{0}; ///< Running inference count
|
||
|
|
std::atomic<int> m_totalCapacity{0}; ///< Alive slot count (atomic -- read lock-free by public API)
|
||
|
|
std::atomic<size_t> m_nextSlotHint{0}; ///< Round-robin start index for slot scan
|
||
|
|
|
||
|
|
// -- Elastic pool config (stored from loadSlots for on-demand growth) --
|
||
|
|
size_t m_memPerSlot{0};
|
||
|
|
std::string m_poolModelPath;
|
||
|
|
std::array<float, 3> m_poolSubVals{};
|
||
|
|
std::array<float, 3> m_poolDivVals{};
|
||
|
|
bool m_poolNormalize{true};
|
||
|
|
bool m_poolFromOnnx{false};
|
||
|
|
double m_poolSafetyFactor{0.80};
|
||
|
|
bool m_elasticMode{false}; ///< true when maxSlotsPerGpu == -1
|
||
|
|
|
||
|
|
// -- Global elastic slot cap (shared across ALL pools on ALL GPUs) --
|
||
|
|
// Prevents CUDA driver SRW lock convoy when too many concurrent contexts exist.
|
||
|
|
// Budget: ~4 slots per GB of total GPU VRAM (computed once on first pool init).
|
||
|
|
// 4 GB → 12, 6 GB → 24, 8 GB → 32, 12 GB → 48, 24 GB → 96
|
||
|
|
static std::atomic<int> s_globalElasticCount; ///< Current total elastic slots
|
||
|
|
static std::atomic<int> s_globalElasticMax; ///< Max total slots
|
||
|
|
static std::once_flag s_globalCapInitFlag; ///< Ensures max is computed once
|
||
|
|
|
||
|
|
// -- Elastic growth grace period --
|
||
|
|
// Delays elastic growth for N seconds after the most recent pool creation,
|
||
|
|
// giving later models time to create their probe engines before elastic
|
||
|
|
// growth from earlier pools consumes all free VRAM.
|
||
|
|
static constexpr int s_elasticGraceSec{5}; ///< Grace period in seconds (reduced from 30s: shared pools need fewer probes)
|
||
|
|
static std::atomic<int64_t> s_lastPoolCreatedMs; ///< Timestamp of last pool creation (ms since epoch)
|
||
|
|
|
||
|
|
// -- Automatic idle-slot cleanup timer (elastic mode only) --
|
||
|
|
std::thread m_idleTimerThread;
|
||
|
|
std::condition_variable m_idleTimerCv;
|
||
|
|
std::mutex m_idleTimerMutex;
|
||
|
|
std::atomic<bool> m_idleTimerStop{false};
|
||
|
|
double m_idleTimerIntervalSec{60.0}; ///< How often to check
|
||
|
|
double m_idleTimerThresholdSec{600.0}; ///< Idle time (seconds) before releasing — 10 minutes to avoid churn
|
||
|
|
|
||
|
|
void startIdleTimer();
|
||
|
|
void stopIdleTimer();
|
||
|
|
};
|
||
|
|
// Construct an engine bound to a specific configuration.
// Initializes the dynamic TRT loader and resets all per-instance runtime
// state to its "never ran" defaults.
template <typename T> Engine<T>::Engine(const ANSCENTER::Options &options) : m_options(options) {
    NvDynLoader::Initialize(); // no-op after first call; safe to call repeatedly
    m_streamInitialized = false;
    m_inferenceStream = nullptr;
    m_lastBatchSize = -1;
    m_inputBatchSize = 0;
    m_memoryStream = nullptr;
    m_normalize = true; // FIX: was left uninitialized on this ctor path (default ctor sets it)
    m_firstInferenceCall = true;
    m_batchShapeChangeLogged = false;
    m_hasDynamicSpatialDims = false;
    m_profileInitialized = false;
    m_lastSetInputDims.clear();
    m_preprocessedInputs.clear();
}
|
// Default-construct an engine with no configuration bound yet.
// Performs the same state reset as the Options constructor.
template <typename T> Engine<T>::Engine() {
    NvDynLoader::Initialize(); // idempotent — cheap to call more than once

    // CUDA stream state: streams are created lazily on first use.
    m_streamInitialized = false;
    m_inferenceStream = nullptr;
    m_memoryStream = nullptr;

    // Batch bookkeeping starts in the "never ran" state.
    m_lastBatchSize = -1;
    m_inputBatchSize = 0;

    // Preprocessing default: scale pixels into [0, 1].
    m_normalize = true;

    // Diagnostics / dynamic-shape flags.
    m_firstInferenceCall = true;
    m_batchShapeChangeLogged = false;
    m_hasDynamicSpatialDims = false;
    m_profileInitialized = false;
    m_lastSetInputDims.clear();
}
|
template <typename T> Engine<T>::~Engine() {
    // =========================================================================
    // CRITICAL: Destructors must NEVER throw. Every CUDA/TRT call below is
    // wrapped in try/catch so that a failure in one step does not skip the
    // remaining cleanup (especially m_engine / m_runtime reset, which release
    // TensorRT file handles and VRAM).
    //
    // Previous bug: clearGpuBuffers() called Util::checkCudaErrorCode() which
    // throws on cudaFree failure. That aborted the destructor mid-way,
    // leaving ICudaEngine/IRuntime alive → model folder locked on disk.
    //
    // NOTE(review): the cleanup ORDER below is load-bearing — timer first,
    // then graphs/buffers/streams, then context BEFORE engine BEFORE runtime.
    // =========================================================================

    // Stop the idle-slot cleanup timer (if running) before touching pool state
    try { stopIdleTimer(); } catch (...) {}

    // Destroy cached CUDA graphs (one per previously-seen batch size)
    try {
        for (auto& [bs, ge] : m_graphExecs) { if (ge) cudaGraphExecDestroy(ge); }
        m_graphExecs.clear();
    } catch (...) {}

    // Free pinned (page-locked) host output buffers
    try {
        for (T* p : m_pinnedOutputBuffers) { if (p) cudaFreeHost(p); }
        m_pinnedOutputBuffers.clear();
    } catch (...) {}

    // Free GPU I/O buffers — use cudaFree directly (NOT checkCudaErrorCode which throws)
    try {
        if (!m_buffers.empty()) {
            for (void* ptr : m_buffers) {
                if (ptr) cudaFree(ptr); // ignore errors — destructor must not throw
            }
            m_buffers.clear();
        }
    } catch (...) {}

    // Destroy the three CUDA streams this instance may have created.
    try {
        if (m_cudaStream) {
            cudaStreamDestroy(m_cudaStream);
            m_cudaStream = nullptr;
        }
    } catch (...) {}
    try {
        if (m_streamInitialized && m_inferenceStream) {
            cudaStreamDestroy(m_inferenceStream);
            m_inferenceStream = nullptr;
            m_streamInitialized = false;
        }
    } catch (...) {}
    try {
        if (m_memoryStream) {
            cudaStreamDestroy(m_memoryStream);
            m_memoryStream = nullptr;
        }
    } catch (...) {}

    // Release GPU clock lock if we hold one
    try { unlockGpuClocks(); } catch (...) {}

    // TRT destruction ordering: context MUST be destroyed before engine.
    // With shared_ptr, m_engine may outlive this instance (other tasks share it).
    // Explicit reset ensures correct order.
    try { m_context.reset(); } catch (...) {}

    // Release from TRTEngineCache (decrements refcount; engine freed when last user releases)
    try {
        if (m_usingCachedEngine && !m_cachedEnginePath.empty()) {
            TRTEngineCache::instance().release(m_cachedEnginePath, m_cachedGpuIndex);
        }
    } catch (...) {}

    // These MUST execute to release TensorRT file handles and VRAM.
    // Previously, a throw in clearGpuBuffers() skipped these lines,
    // leaving the model folder locked on disk.
    try { m_engine.reset(); } catch (...) {}
    try { m_runtime.reset(); } catch (...) {}
}
|
|
||
|
|
// ============================================================================
|
||
|
|
// NVML GPU Clock Lock — dynamically loaded, no build-time nvml.lib needed
|
||
|
|
// ============================================================================
|
||
|
|
|
||
|
|
// Minimal mirror of the NVML ABI — just enough of nvml.h to call the handful
// of entry points resolved dynamically via GetProcAddress, without taking a
// build-time dependency on the CUDA toolkit headers.
namespace nvml_types {
    // nvmlReturn_t: status code returned by every NVML entry point.
    using Return_t = unsigned int;
    // nvmlDevice_t: opaque device handle (NVML hands back a pointer-sized token).
    using Device_t = void*;

    // NVML_SUCCESS — the only return value the callers here test against.
    enum : unsigned int { SUCCESS = 0 };
    // nvmlClockType_t — only the graphics clock domain is used in this file.
    enum ClockType : unsigned int { CLOCK_GRAPHICS = 0 };
} // namespace nvml_types
|
||
|
|
|
||
|
|
template <typename T>
|
||
|
|
void Engine<T>::lockGpuClocks(int deviceIndex, int requestedMHz) {
|
||
|
|
if (requestedMHz == 0) return; // disabled
|
||
|
|
|
||
|
|
#ifdef _WIN32
|
||
|
|
// Load nvml.dll from System32 (installed by NVIDIA driver)
|
||
|
|
auto hLib = LoadLibraryA("nvml.dll");
|
||
|
|
if (!hLib) {
|
||
|
|
if (m_verbose) std::cout << "Warning: nvml.dll not found — GPU clock lock unavailable" << std::endl;
|
||
|
|
return;
|
||
|
|
}
|
||
|
|
m_nvmlLib = static_cast<void*>(hLib);
|
||
|
|
|
||
|
|
// Resolve NVML function pointers
|
||
|
|
using fn_Init = nvml_types::Return_t(*)();
|
||
|
|
using fn_Shutdown = nvml_types::Return_t(*)();
|
||
|
|
using fn_GetHandle = nvml_types::Return_t(*)(unsigned int, nvml_types::Device_t*);
|
||
|
|
using fn_GetMaxClock = nvml_types::Return_t(*)(nvml_types::Device_t, nvml_types::ClockType, unsigned int*);
|
||
|
|
using fn_SetLocked = nvml_types::Return_t(*)(nvml_types::Device_t, unsigned int, unsigned int);
|
||
|
|
using fn_ErrorString = const char*(*)(nvml_types::Return_t);
|
||
|
|
|
||
|
|
auto pInit = reinterpret_cast<fn_Init>(GetProcAddress(hLib, "nvmlInit_v2"));
|
||
|
|
auto pShutdown = reinterpret_cast<fn_Shutdown>(GetProcAddress(hLib, "nvmlShutdown"));
|
||
|
|
auto pGetHandle = reinterpret_cast<fn_GetHandle>(GetProcAddress(hLib, "nvmlDeviceGetHandleByIndex_v2"));
|
||
|
|
auto pGetMaxClk = reinterpret_cast<fn_GetMaxClock>(GetProcAddress(hLib, "nvmlDeviceGetMaxClockInfo"));
|
||
|
|
auto pSetLocked = reinterpret_cast<fn_SetLocked>(GetProcAddress(hLib, "nvmlDeviceSetGpuLockedClocks"));
|
||
|
|
auto pErrStr = reinterpret_cast<fn_ErrorString>(GetProcAddress(hLib, "nvmlErrorString"));
|
||
|
|
|
||
|
|
if (!pInit || !pGetHandle || !pSetLocked) {
|
||
|
|
if (m_verbose) std::cout << "Warning: NVML symbols not found — GPU clock lock unavailable" << std::endl;
|
||
|
|
FreeLibrary(hLib); m_nvmlLib = nullptr;
|
||
|
|
return;
|
||
|
|
}
|
||
|
|
|
||
|
|
auto errName = [&](nvml_types::Return_t r) -> const char* {
|
||
|
|
return pErrStr ? pErrStr(r) : "unknown";
|
||
|
|
};
|
||
|
|
|
||
|
|
nvml_types::Return_t rc = pInit();
|
||
|
|
if (rc != nvml_types::SUCCESS) {
|
||
|
|
if (m_verbose) std::cout << "Warning: nvmlInit failed: " << errName(rc) << std::endl;
|
||
|
|
FreeLibrary(hLib); m_nvmlLib = nullptr;
|
||
|
|
return;
|
||
|
|
}
|
||
|
|
|
||
|
|
nvml_types::Device_t nvmlDev = nullptr;
|
||
|
|
rc = pGetHandle(static_cast<unsigned int>(deviceIndex), &nvmlDev);
|
||
|
|
if (rc != nvml_types::SUCCESS) {
|
||
|
|
if (m_verbose) std::cout << "Warning: nvmlDeviceGetHandleByIndex(" << deviceIndex << ") failed: " << errName(rc) << std::endl;
|
||
|
|
pShutdown(); FreeLibrary(hLib); m_nvmlLib = nullptr;
|
||
|
|
return;
|
||
|
|
}
|
||
|
|
|
||
|
|
unsigned int targetMHz = 0;
|
||
|
|
if (requestedMHz < 0) {
|
||
|
|
// Auto mode: query max graphics clock and use ~80%
|
||
|
|
unsigned int maxClock = 0;
|
||
|
|
if (pGetMaxClk) {
|
||
|
|
rc = pGetMaxClk(nvmlDev, nvml_types::CLOCK_GRAPHICS, &maxClock);
|
||
|
|
if (rc == nvml_types::SUCCESS && maxClock > 0) {
|
||
|
|
targetMHz = static_cast<unsigned int>(maxClock * 0.80);
|
||
|
|
targetMHz = (targetMHz / 15) * 15; // Round down to 15 MHz step
|
||
|
|
if (targetMHz < 210) targetMHz = 210;
|
||
|
|
} else {
|
||
|
|
if (m_verbose) std::cout << "Warning: Could not query max GPU clock — skipping clock lock" << std::endl;
|
||
|
|
pShutdown(); FreeLibrary(hLib); m_nvmlLib = nullptr;
|
||
|
|
return;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
} else {
|
||
|
|
targetMHz = static_cast<unsigned int>(requestedMHz);
|
||
|
|
}
|
||
|
|
|
||
|
|
rc = pSetLocked(nvmlDev, targetMHz, targetMHz);
|
||
|
|
if (rc == nvml_types::SUCCESS) {
|
||
|
|
m_clocksLocked = true;
|
||
|
|
m_nvmlDeviceIdx = static_cast<unsigned int>(deviceIndex);
|
||
|
|
if (m_verbose) std::cout << "Info: GPU clocks locked at " << targetMHz << " MHz (device " << deviceIndex << ")" << std::endl;
|
||
|
|
} else {
|
||
|
|
if (m_verbose) {
|
||
|
|
std::cout << "Warning: nvmlDeviceSetGpuLockedClocks failed: " << errName(rc) << std::endl;
|
||
|
|
std::cout << " (Run as Administrator, or use: nvidia-smi -lgc " << targetMHz << "," << targetMHz << ")" << std::endl;
|
||
|
|
}
|
||
|
|
pShutdown(); FreeLibrary(hLib); m_nvmlLib = nullptr;
|
||
|
|
}
|
||
|
|
#else
|
||
|
|
// Linux: similar pattern with dlopen("libnvidia-ml.so.1", RTLD_LAZY)
|
||
|
|
// Left as TODO — nvidia-smi -lgc works without elevation on Linux.
|
||
|
|
if (m_verbose) std::cout << "Info: GPU clock lock not implemented on this platform — use nvidia-smi -lgc" << std::endl;
|
||
|
|
#endif
|
||
|
|
}
|
||
|
|
|
||
|
|
template <typename T>
|
||
|
|
void Engine<T>::unlockGpuClocks() {
|
||
|
|
if (!m_clocksLocked || !m_nvmlLib) return;
|
||
|
|
|
||
|
|
#ifdef _WIN32
|
||
|
|
auto hLib = static_cast<HMODULE>(m_nvmlLib);
|
||
|
|
|
||
|
|
using fn_GetHandle = nvml_types::Return_t(*)(unsigned int, nvml_types::Device_t*);
|
||
|
|
using fn_ResetClk = nvml_types::Return_t(*)(nvml_types::Device_t);
|
||
|
|
using fn_Shutdown = nvml_types::Return_t(*)();
|
||
|
|
|
||
|
|
auto pGetHandle = reinterpret_cast<fn_GetHandle>(GetProcAddress(hLib, "nvmlDeviceGetHandleByIndex_v2"));
|
||
|
|
auto pResetClk = reinterpret_cast<fn_ResetClk>(GetProcAddress(hLib, "nvmlDeviceResetGpuLockedClocks"));
|
||
|
|
auto pShutdown = reinterpret_cast<fn_Shutdown>(GetProcAddress(hLib, "nvmlShutdown"));
|
||
|
|
|
||
|
|
if (pGetHandle && pResetClk) {
|
||
|
|
nvml_types::Device_t nvmlDev = nullptr;
|
||
|
|
if (pGetHandle(m_nvmlDeviceIdx, &nvmlDev) == nvml_types::SUCCESS) {
|
||
|
|
pResetClk(nvmlDev);
|
||
|
|
if (m_verbose) std::cout << "Info: GPU clocks unlocked (device " << m_nvmlDeviceIdx << ")" << std::endl;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
if (pShutdown) pShutdown();
|
||
|
|
FreeLibrary(hLib);
|
||
|
|
#endif
|
||
|
|
|
||
|
|
m_nvmlLib = nullptr;
|
||
|
|
m_clocksLocked = false;
|
||
|
|
}
|
||
|
|
|
||
|
|
namespace Util {
|
||
|
|
|
||
|
|
inline bool doesFileExist(const std::string& filepath) {
|
||
|
|
std::ifstream f(filepath.c_str());
|
||
|
|
return f.good();
|
||
|
|
}
|
||
|
|
|
||
|
|
inline void checkCudaErrorCode(cudaError_t code) {
|
||
|
|
if (code != cudaSuccess) {
|
||
|
|
std::string errMsg = "CUDA operation failed with code: " + std::to_string(code) + " (" + cudaGetErrorName(code) +
|
||
|
|
"), with message: " + cudaGetErrorString(code);
|
||
|
|
std::cout<<errMsg;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
inline std::vector<std::string> getFilesInDirectory(const std::string& dirPath) {
|
||
|
|
std::vector<std::string> fileNames;
|
||
|
|
for (const auto& entry : std::filesystem::directory_iterator(dirPath)) {
|
||
|
|
if (entry.is_regular_file()) {
|
||
|
|
fileNames.push_back(entry.path().string());
|
||
|
|
}
|
||
|
|
}
|
||
|
|
return fileNames;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
#include "engine/EngineRunInference.inl"
|
||
|
|
#include "engine/EngineUtilities.inl"
|
||
|
|
#include "engine/EngineBuildLoadNetwork.inl"
|
||
|
|
#include "engine/EngineMultiGpu.inl"
|
||
|
|
|
||
|
|
// ============================================================================
|
||
|
|
// Explicit template instantiation declarations -- DLL consumers only
|
||
|
|
//
|
||
|
|
// These tell each consuming translation unit: "Engine<float> is already fully
|
||
|
|
// compiled and exported by the DLL. Do NOT re-instantiate it here; resolve
|
||
|
|
// the symbols at link time from the DLL's import library instead."
|
||
|
|
//
|
||
|
|
// This suppresses redundant code generation in every consuming TU and
|
||
|
|
// guarantees that only ONE definition of each Engine<T> specialisation exists
|
||
|
|
// across the whole program (in the DLL).
|
||
|
|
//
|
||
|
|
// To add a new exported instantiation (e.g. Engine<uint8_t>):
|
||
|
|
// 1. Add a matching template class ENGINE_API Engine<uint8_t>; in engine.cpp
|
||
|
|
// 2. Add the corresponding extern declaration below.
|
||
|
|
// ============================================================================
|
||
|
|
#ifndef ENGINE_EXPORTS
|
||
|
|
extern template class ENGINE_API Engine<float>;
|
||
|
|
#endif
|