Files
ANSCORE/core/ANSLibsLoader/NvDynLoader.cpp

647 lines
36 KiB
C++
Raw Normal View History

2026-03-28 16:54:11 +11:00
// ============================================================================
// NvDynLoader.cpp
//
// Moved from TensorRTAPI/ to ANSLibsLoader/ for centralized library management.
// Now compiled into ANSLibsLoader.dll — all extern "C" stubs are exported via
// ANSLIBS_API so consuming projects can link ANSLibsLoader.lib instead of
// nvinfer_XX.lib / nvonnxparser_XX.lib.
//
// Two responsibilities:
//
// 1. Provide extern "C" stub DEFINITIONS for every symbol that TRT / ONNX
// parser inline wrappers reference. This satisfies the linker without
// linking nvinfer_XX.lib / nvonnxparser_XX.lib. Each stub forwards the
// call through a function pointer that Initialize() populates at runtime.
//
// 2. Implement NvDynLoader::Initialize() which searches for installed NVIDIA
// DLLs (trying multiple major versions, newest first) and binds all
// required function pointers.
// ============================================================================
#include "NvDynLoader.h"
#include "DynLibUtils.h"
#include <iostream>
#include <cassert>
#include <mutex>
using namespace ANSCENTER::DynLib;
// ─────────────────────────────────────────────────────────────────────────────
// Static member definitions
// ─────────────────────────────────────────────────────────────────────────────
// True only after a fully-successful Initialize(); cleared by Shutdown().
bool NvDynLoader::s_initialized = false;
// TRT major version parsed from the loaded nvinfer library name (0 = unknown).
int NvDynLoader::s_trtMajor = 0;
// Paths of the libraries actually loaded; empty until FindAndLoad() succeeds.
std::string NvDynLoader::s_trtPath;
std::string NvDynLoader::s_onnxPath;
std::string NvDynLoader::s_cudaPath;
std::string NvDynLoader::s_cudnnPath;
// OS library handles; non-null only while the corresponding library is loaded.
LibHandle NvDynLoader::s_hTrt = nullptr;
LibHandle NvDynLoader::s_hOnnx = nullptr;
LibHandle NvDynLoader::s_hCuda = nullptr;
LibHandle NvDynLoader::s_hCudnn = nullptr;
// TRT / ONNX-parser entry points bound by Initialize(); the extern "C" stubs
// in this file forward through these pointers.
NvDynLoader::PfnBuilder* NvDynLoader::pfn_createInferBuilder_INTERNAL = nullptr;
NvDynLoader::PfnRuntime* NvDynLoader::pfn_createInferRuntime_INTERNAL = nullptr;
NvDynLoader::PfnRefitter* NvDynLoader::pfn_createInferRefitter_INTERNAL = nullptr;
NvDynLoader::PfnParser* NvDynLoader::pfn_createNvOnnxParser_INTERNAL = nullptr;
NvDynLoader::PfnParserRefitter* NvDynLoader::pfn_createNvOnnxParserRefitter_INTERNAL = nullptr;
NvDynLoader::PfnGetParserVersion* NvDynLoader::pfn_getNvOnnxParserVersion = nullptr;
#ifdef NV_DYNAMIC_CUDA
// CUDA Runtime entry points (bound only when CUDA RT is loaded dynamically
// instead of linking cudart_static.lib).
cudaError_t (*NvDynLoader::pfn_cudaGetDeviceCount) (int*) = nullptr;
cudaError_t (*NvDynLoader::pfn_cudaSetDevice) (int) = nullptr;
cudaError_t (*NvDynLoader::pfn_cudaGetDeviceProperties) (cudaDeviceProp*, int) = nullptr;
cudaError_t (*NvDynLoader::pfn_cudaDeviceSetLimit) (cudaLimit, size_t) = nullptr;
cudaError_t (*NvDynLoader::pfn_cudaDeviceSynchronize) () = nullptr;
cudaError_t (*NvDynLoader::pfn_cudaDeviceGetStreamPriorityRange)(int*, int*) = nullptr;
cudaError_t (*NvDynLoader::pfn_cudaMalloc) (void**, size_t) = nullptr;
cudaError_t (*NvDynLoader::pfn_cudaFree) (void*) = nullptr;
cudaError_t (*NvDynLoader::pfn_cudaMemset) (void*, int, size_t) = nullptr;
cudaError_t (*NvDynLoader::pfn_cudaMemGetInfo) (size_t*, size_t*) = nullptr;
cudaError_t (*NvDynLoader::pfn_cudaMemcpy) (void*, const void*, size_t, cudaMemcpyKind) = nullptr;
cudaError_t (*NvDynLoader::pfn_cudaMemcpyAsync) (void*, const void*, size_t, cudaMemcpyKind, cudaStream_t) = nullptr;
cudaError_t (*NvDynLoader::pfn_cudaStreamCreate) (cudaStream_t*) = nullptr;
cudaError_t (*NvDynLoader::pfn_cudaStreamCreateWithPriority) (cudaStream_t*, unsigned int, int) = nullptr;
cudaError_t (*NvDynLoader::pfn_cudaStreamDestroy) (cudaStream_t) = nullptr;
cudaError_t (*NvDynLoader::pfn_cudaStreamSynchronize) (cudaStream_t) = nullptr;
cudaError_t (*NvDynLoader::pfn_cudaStreamWaitEvent) (cudaStream_t, cudaEvent_t, unsigned int) = nullptr;
cudaError_t (*NvDynLoader::pfn_cudaEventCreate) (cudaEvent_t*) = nullptr;
cudaError_t (*NvDynLoader::pfn_cudaEventCreateWithFlags) (cudaEvent_t*, unsigned int) = nullptr;
cudaError_t (*NvDynLoader::pfn_cudaEventRecord) (cudaEvent_t, cudaStream_t) = nullptr;
cudaError_t (*NvDynLoader::pfn_cudaEventDestroy) (cudaEvent_t) = nullptr;
const char* (*NvDynLoader::pfn_cudaGetErrorString) (cudaError_t) = nullptr;
cudaError_t (*NvDynLoader::pfn_cudaGetLastError) () = nullptr;
#endif // NV_DYNAMIC_CUDA
// ─────────────────────────────────────────────────────────────────────────────
// extern "C" stubs (exported from ANSLibsLoader.dll via .def file)
//
// These provide the linker symbols that TRT / ONNX-parser inline wrappers
// reference. TENSORRTAPI / NVONNXPARSER_API are overridden to empty in
// NvDynLoader.h, so the NvInfer.h / NvOnnxParser.h declarations are plain
// extern "C" (no __declspec attribute). Our definitions below are also
// plain extern "C" — no __declspec attribute — so the linkage matches.
//
// The symbols are exported from the DLL via ANSLibsLoader.def, NOT via
// __declspec(dllexport). This avoids the C2375 "different linkage" error
// that occurs when NvInfer.h and our definitions carry different attributes.
// ─────────────────────────────────────────────────────────────────────────────
extern "C" {
// Each stub forwards through the pointer bound by NvDynLoader::Initialize().
// The assert documents (and, in debug builds, enforces) the "Initialize()
// before first use" contract. In release builds (NDEBUG) the assert compiles
// away, so every stub ALSO null-checks the pointer and returns a failure
// value (nullptr / 0) instead of calling through a null function pointer,
// which would be undefined behaviour and an unconditional crash.
void* createInferBuilder_INTERNAL(void* logger, int32_t version) noexcept
{
    assert(NvDynLoader::pfn_createInferBuilder_INTERNAL &&
        "NvDynLoader::Initialize() has not been called before the first TRT use.");
    if (!NvDynLoader::pfn_createInferBuilder_INTERNAL) return nullptr;
    return NvDynLoader::pfn_createInferBuilder_INTERNAL(logger, version);
}
void* createInferRuntime_INTERNAL(void* logger, int32_t version) noexcept
{
    assert(NvDynLoader::pfn_createInferRuntime_INTERNAL &&
        "NvDynLoader::Initialize() has not been called before the first TRT use.");
    if (!NvDynLoader::pfn_createInferRuntime_INTERNAL) return nullptr;
    return NvDynLoader::pfn_createInferRuntime_INTERNAL(logger, version);
}
// createInferRefitter_INTERNAL — only needed if engine refitting is used;
// guarded so a missing symbol is a no-op rather than an assert failure.
void* createInferRefitter_INTERNAL(void* engine, void* logger, int32_t version) noexcept
{
    if (!NvDynLoader::pfn_createInferRefitter_INTERNAL) return nullptr;
    return NvDynLoader::pfn_createInferRefitter_INTERNAL(engine, logger, version);
}
void* createNvOnnxParser_INTERNAL(void* network, void* logger, int32_t version) noexcept
{
    assert(NvDynLoader::pfn_createNvOnnxParser_INTERNAL &&
        "NvDynLoader::Initialize() has not been called before the first ONNX-parser use.");
    if (!NvDynLoader::pfn_createNvOnnxParser_INTERNAL) return nullptr;
    return NvDynLoader::pfn_createNvOnnxParser_INTERNAL(network, logger, version);
}
// createNvOnnxParserRefitter_INTERNAL — TRT 10+ symbol; absent on TRT 8/9.
// Guarded rather than asserted so older TRT versions continue to work.
void* createNvOnnxParserRefitter_INTERNAL(void* refitter, void* logger, int32_t version) noexcept
{
    if (!NvDynLoader::pfn_createNvOnnxParserRefitter_INTERNAL) return nullptr;
    return NvDynLoader::pfn_createNvOnnxParserRefitter_INTERNAL(refitter, logger, version);
}
// getNvOnnxParserVersion — optional version query; returns 0 if not available.
int getNvOnnxParserVersion() noexcept
{
    if (!NvDynLoader::pfn_getNvOnnxParserVersion) return 0;
    return NvDynLoader::pfn_getNvOnnxParserVersion();
}
} // extern "C"
// ─────────────────────────────────────────────────────────────────────────────
// CUDA stubs (only when NV_DYNAMIC_CUDA is defined)
// Removes the cudart_static.lib dependency; CUDA RT is loaded dynamically.
//
// WHY extern "C":
// cuda_runtime.h wraps every public API in `extern "C"` when compiled as C++,
// giving each function C linkage (no name mangling). Our stub DEFINITIONS
// must use the same linkage — otherwise the linker sees a C-linkage declaration
// (from the header) paired with a C++-mangled definition (from our .cpp), which
// is an unresolved-symbol error on both MSVC and GCC/Clang.
//
// WHY cudaDeviceSynchronize / cudaGetLastError are NOT in the macro list:
// CUDA_STUB_N macros require at least one parameter. These two functions take
// zero arguments; they are defined manually in the block below. Do NOT add
// them back to the macro invocation list — it would create duplicate definitions.
// ─────────────────────────────────────────────────────────────────────────────
#ifdef NV_DYNAMIC_CUDA
extern "C" { // ← must match the extern "C" in cuda_runtime.h
// CUDA_STUB_N(ret, name, t0,a0, …) expands to one forwarding definition:
//     ret name(t0 a0, …) { assert(pfn_name); return pfn_name(a0, …); }
// The assert fires (debug builds only) when a CUDA call is made before
// Initialize() has bound the pointer, or after Shutdown() has nulled it.
#define CUDA_STUB_1(ret, name, t0, a0) \
ret name(t0 a0) { \
assert(NvDynLoader::pfn_##name); return NvDynLoader::pfn_##name(a0); }
#define CUDA_STUB_2(ret, name, t0,a0, t1,a1) \
ret name(t0 a0, t1 a1) { \
assert(NvDynLoader::pfn_##name); return NvDynLoader::pfn_##name(a0,a1); }
#define CUDA_STUB_3(ret, name, t0,a0, t1,a1, t2,a2) \
ret name(t0 a0, t1 a1, t2 a2) { \
assert(NvDynLoader::pfn_##name); return NvDynLoader::pfn_##name(a0,a1,a2); }
#define CUDA_STUB_4(ret, name, t0,a0, t1,a1, t2,a2, t3,a3) \
ret name(t0 a0, t1 a1, t2 a2, t3 a3) { \
assert(NvDynLoader::pfn_##name); return NvDynLoader::pfn_##name(a0,a1,a2,a3); }
#define CUDA_STUB_5(ret, name, t0,a0, t1,a1, t2,a2, t3,a3, t4,a4) \
ret name(t0 a0, t1 a1, t2 a2, t3 a3, t4 a4) { \
assert(NvDynLoader::pfn_##name); return NvDynLoader::pfn_##name(a0,a1,a2,a3,a4); }
// One stub per CUDA Runtime API used by the engine code.
CUDA_STUB_1(cudaError_t, cudaGetDeviceCount, int*, count)
CUDA_STUB_1(cudaError_t, cudaSetDevice, int, dev)
CUDA_STUB_2(cudaError_t, cudaGetDeviceProperties, cudaDeviceProp*, prop, int, dev)
CUDA_STUB_2(cudaError_t, cudaDeviceSetLimit, cudaLimit, limit, size_t, value)
CUDA_STUB_2(cudaError_t, cudaDeviceGetStreamPriorityRange, int*, leastPriority, int*, greatestPriority)
CUDA_STUB_2(cudaError_t, cudaMalloc, void**, devPtr, size_t, size)
CUDA_STUB_1(cudaError_t, cudaFree, void*, devPtr)
CUDA_STUB_3(cudaError_t, cudaMemset, void*, devPtr, int, value, size_t, count)
CUDA_STUB_2(cudaError_t, cudaMemGetInfo, size_t*, free, size_t*, total)
CUDA_STUB_4(cudaError_t, cudaMemcpy, void*, dst, const void*, src, size_t, count, cudaMemcpyKind, kind)
CUDA_STUB_5(cudaError_t, cudaMemcpyAsync, void*, dst, const void*, src, size_t, count, cudaMemcpyKind, kind, cudaStream_t, stream)
CUDA_STUB_1(cudaError_t, cudaStreamCreate, cudaStream_t*, stream)
CUDA_STUB_3(cudaError_t, cudaStreamCreateWithPriority, cudaStream_t*, stream, unsigned int, flags, int, priority)
CUDA_STUB_1(cudaError_t, cudaStreamDestroy, cudaStream_t, stream)
CUDA_STUB_1(cudaError_t, cudaStreamSynchronize, cudaStream_t, stream)
CUDA_STUB_3(cudaError_t, cudaStreamWaitEvent, cudaStream_t, stream, cudaEvent_t, event, unsigned int, flags)
CUDA_STUB_1(cudaError_t, cudaEventCreate, cudaEvent_t*, event)
CUDA_STUB_2(cudaError_t, cudaEventCreateWithFlags, cudaEvent_t*, event, unsigned int, flags)
CUDA_STUB_2(cudaError_t, cudaEventRecord, cudaEvent_t, event, cudaStream_t, stream)
CUDA_STUB_1(cudaError_t, cudaEventDestroy, cudaEvent_t, event)
CUDA_STUB_1(const char*, cudaGetErrorString, cudaError_t, error)
// 0-argument stubs — defined manually; NOT in the macro list above:
cudaError_t cudaDeviceSynchronize() {
assert(NvDynLoader::pfn_cudaDeviceSynchronize);
return NvDynLoader::pfn_cudaDeviceSynchronize();
}
cudaError_t cudaGetLastError() {
assert(NvDynLoader::pfn_cudaGetLastError);
return NvDynLoader::pfn_cudaGetLastError();
}
// Keep the helper macros scoped to this block.
#undef CUDA_STUB_1
#undef CUDA_STUB_2
#undef CUDA_STUB_3
#undef CUDA_STUB_4
#undef CUDA_STUB_5
} // extern "C"
#endif // NV_DYNAMIC_CUDA
// ─────────────────────────────────────────────────────────────────────────────
// Helper — shared directory path with trailing separator
// ─────────────────────────────────────────────────────────────────────────────
// Returns DEFAULT_SHARED_DIR guaranteed to end with the platform's path
// separator, so callers can append a bare file name directly.
static std::string SharedDirWithSep()
{
#ifdef _WIN32
    constexpr char kSep = '\\';
#else
    constexpr char kSep = '/';
#endif
    std::string dir = DEFAULT_SHARED_DIR;
    if (!dir.empty() && dir.back() != kSep) dir.push_back(kSep);
    return dir;
}
// ─────────────────────────────────────────────────────────────────────────────
// DLL candidate lists
// Priority 1 — ANSCENTER shared directory (full absolute path, newest first)
// Priority 2 — System PATH / LD_LIBRARY_PATH fallback (bare name, newest first)
// ─────────────────────────────────────────────────────────────────────────────
// Builds the ordered list of TensorRT core library names to try.
// Candidate order: ANSCENTER shared directory (absolute paths) first, then
// bare names resolved via system PATH / LD_LIBRARY_PATH; within each group,
// newest major version (20) down to oldest supported (8).
std::vector<std::string> NvDynLoader::TrtCandidates()
{
    std::vector<std::string> names;
    const std::string prefixes[] = { SharedDirWithSep(), std::string() };
    for (const std::string& prefix : prefixes) {
        for (int major = 20; major >= 8; --major) {
#ifdef _WIN32
            names.push_back(prefix + "nvinfer_" + std::to_string(major) + ".dll");
#else
            names.push_back(prefix + "libnvinfer.so." + std::to_string(major));
#endif
        }
    }
    return names;
}
// Builds the ordered list of ONNX-parser library names to try.
// Candidate order: ANSCENTER shared directory (absolute paths) first, then
// bare names resolved via system PATH / LD_LIBRARY_PATH; within each group,
// newest major version (20) down to oldest supported (8).
std::vector<std::string> NvDynLoader::OnnxCandidates()
{
    std::vector<std::string> names;
    const std::string prefixes[] = { SharedDirWithSep(), std::string() };
    for (const std::string& prefix : prefixes) {
        for (int major = 20; major >= 8; --major) {
#ifdef _WIN32
            names.push_back(prefix + "nvonnxparser_" + std::to_string(major) + ".dll");
#else
            names.push_back(prefix + "libnvonnxparser.so." + std::to_string(major));
#endif
        }
    }
    return names;
}
// Builds the ordered list of cuDNN library names to try.
//
// Naming conventions:
//   Windows: cudnn64_<major>.dll (e.g. cudnn64_8.dll, cudnn64_9.dll)
//   Linux:   libcudnn.so.<major> (e.g. libcudnn.so.8, libcudnn.so.9)
//
// Note: cuDNN 9 on Windows ships as multiple split DLLs (cudnn_ops.dll,
// cudnn_cnn.dll, etc.) but cudnn64_9.dll remains the main entry-point /
// compatibility shim. The split DLLs are resolved automatically once the
// ANSCENTER directory is added to the search path via
// LOAD_WITH_ALTERED_SEARCH_PATH (Windows) or RTLD_GLOBAL (Linux).
//
// Candidate order: ANSCENTER shared directory (absolute paths) first, then
// bare names resolved via system PATH / LD_LIBRARY_PATH; within each group,
// newest major version (12) down to oldest supported (7).
std::vector<std::string> NvDynLoader::CudnnCandidates()
{
    std::vector<std::string> names;
    const std::string prefixes[] = { SharedDirWithSep(), std::string() };
    for (const std::string& prefix : prefixes) {
        for (int major = 12; major >= 7; --major) {
#ifdef _WIN32
            names.push_back(prefix + "cudnn64_" + std::to_string(major) + ".dll");
#else
            names.push_back(prefix + "libcudnn.so." + std::to_string(major));
#endif
        }
    }
    return names;
}
#ifdef NV_DYNAMIC_CUDA
// Builds the ordered list of CUDA runtime library names to try.
//
// Naming conventions:
//   Windows — NVIDIA changed naming at CUDA 12:
//     CUDA 11.x : cudart64_110.dll … cudart64_118.dll (major + minor digit)
//     CUDA 12+  : cudart64_12.dll, cudart64_13.dll …  (major-only)
//   Linux — the major-only symlink is the stable name on all versions:
//     libcudart.so.11, libcudart.so.12, libcudart.so.13 …
//     (major.minor symlinks also tried as fallback for CUDA 11.x)
//
// Candidate order: ANSCENTER shared directory (absolute paths) first, then
// bare names resolved via system PATH / LD_LIBRARY_PATH; within each group,
// newest version first.
std::vector<std::string> NvDynLoader::CudaRtCandidates()
{
    std::vector<std::string> names;
    const std::string prefixes[] = { SharedDirWithSep(), std::string() };
    for (const std::string& prefix : prefixes) {
#ifdef _WIN32
        for (int major = 15; major >= 12; --major)     // CUDA 12+ : major-only
            names.push_back(prefix + "cudart64_" + std::to_string(major) + ".dll");
        for (int minor = 9; minor >= 0; --minor)       // CUDA 11.x: major+minor
            names.push_back(prefix + "cudart64_11" + std::to_string(minor) + ".dll");
        names.push_back(prefix + "cudart64.dll");      // non-versioned alias
#else
        for (int major = 15; major >= 11; --major)     // all versions: major-only
            names.push_back(prefix + "libcudart.so." + std::to_string(major));
        for (int minor = 9; minor >= 0; --minor)       // CUDA 11.x: major.minor fallback
            names.push_back(prefix + "libcudart.so.11." + std::to_string(minor));
#endif
    }
    return names;
}
#endif
// ─────────────────────────────────────────────────────────────────────────────
// Initialize
// ─────────────────────────────────────────────────────────────────────────────
static std::mutex g_initMutex;
// ExtractMajorVersionFromPath — parses the major version embedded in a loaded
// library's file name:
//   Windows : "C:\...\nvinfer_10.dll" → digits between the last '_' and '.'
//   Linux   : ".../libnvinfer.so.10"  → digits after the last '.'
// Returns 0 when no numeric version can be extracted. Shared by the TRT and
// ONNX-parser checks in Initialize() so the two parse rules cannot drift.
static int ExtractMajorVersionFromPath(const std::string& path)
{
#ifdef _WIN32
    const size_t under = path.find_last_of('_');
    const size_t dot = path.find_last_of('.');
    if (under != std::string::npos && dot != std::string::npos && dot > under) {
        try { return std::stoi(path.substr(under + 1, dot - under - 1)); }
        catch (...) { /* non-numeric — treated as unknown */ }
    }
#else
    const size_t dot = path.find_last_of('.');
    if (dot != std::string::npos && dot + 1 < path.size()) {
        try { return std::stoi(path.substr(dot + 1)); }
        catch (...) { /* non-numeric — treated as unknown */ }
    }
#endif
    return 0;
}
// Initialize — discovers the installed NVIDIA libraries (CUDA RT when
// NV_DYNAMIC_CUDA is defined, cuDNN, TensorRT core, ONNX parser) and binds
// every function pointer the extern "C" stubs forward through.
//
// @param verbose  print discovery progress and version info to stdout
//                 (error messages are printed regardless of this flag).
// @return true when every REQUIRED library and entry point was bound;
//         false otherwise.
//
// Serialised by g_initMutex and retryable: a failed attempt's partially
// loaded handles are released at the start of the next call, so no handle
// leaks across retries.
bool NvDynLoader::Initialize(bool verbose)
{
    std::lock_guard<std::mutex> lock(g_initMutex);
    if (s_initialized) return true;
    const std::string ansDir = SharedDirWithSep();
    // If a previous Initialize() call partially loaded some libraries before
    // failing (s_initialized stays false but handles may be non-null), release
    // them now so we can retry cleanly without leaking handles.
    if (s_hOnnx) { FreeLib(s_hOnnx); s_hOnnx = nullptr; }
    if (s_hTrt) { FreeLib(s_hTrt); s_hTrt = nullptr; }
    if (s_hCudnn) { FreeLib(s_hCudnn); s_hCudnn = nullptr; }
#ifdef NV_DYNAMIC_CUDA
    if (s_hCuda) { FreeLib(s_hCuda); s_hCuda = nullptr; }
#endif
    s_trtMajor = 0; // re-derived below; never keep a stale value from a failed attempt
    bool ok = true;
    if (verbose) {
        std::cout << "==================================================" << std::endl;
        std::cout << "NvDynLoader: discovering NVIDIA libraries..." << std::endl;
        std::cout << " Shared dir : " << ansDir << std::endl;
        std::cout << "==================================================" << std::endl;
    }
    // ── STEP 1: CUDA Runtime (NV_DYNAMIC_CUDA only) ───────────────────────────
    // Loaded first so cuDNN and TRT find CUDA symbols already resident in the
    // process image when they are opened.
    // Linux: globalLoad=true (RTLD_GLOBAL) makes cudart symbols process-wide.
    // Windows: LoadLibraryExA is always process-global; globalLoad is a no-op.
#ifdef NV_DYNAMIC_CUDA
    s_hCuda = FindAndLoad(CudaRtCandidates(), s_cudaPath, verbose, /*globalLoad=*/true);
    if (!s_hCuda) {
        std::cout << "NvDynLoader ERROR: No CUDA runtime DLL found. "
            "Searched " << ansDir << " and system PATH." << std::endl;
        ok = false;
    } else {
        bool b = true;
        b &= Bind(s_hCuda, "cudaGetDeviceCount", pfn_cudaGetDeviceCount);
        b &= Bind(s_hCuda, "cudaSetDevice", pfn_cudaSetDevice);
        // cudaGetDeviceProperties: CUDA 12 renamed the export to '_v2'; CUDA 11
        // uses the plain name. Try the CUDA 12+ symbol first, fall back to
        // the CUDA 11 name. IMPORTANT: do NOT `b &=` the first attempt before
        // the fallback — that would set b=false on CUDA 11 even though the
        // plain-name fallback succeeds, causing a spurious init failure.
        Bind(s_hCuda, "cudaGetDeviceProperties_v2", pfn_cudaGetDeviceProperties); // CUDA 12+
        if (!pfn_cudaGetDeviceProperties)
            Bind(s_hCuda, "cudaGetDeviceProperties", pfn_cudaGetDeviceProperties); // CUDA 11
        b &= (pfn_cudaGetDeviceProperties != nullptr);
        b &= Bind(s_hCuda, "cudaDeviceSetLimit", pfn_cudaDeviceSetLimit);
        b &= Bind(s_hCuda, "cudaDeviceSynchronize", pfn_cudaDeviceSynchronize);
        b &= Bind(s_hCuda, "cudaDeviceGetStreamPriorityRange", pfn_cudaDeviceGetStreamPriorityRange);
        b &= Bind(s_hCuda, "cudaMalloc", pfn_cudaMalloc);
        b &= Bind(s_hCuda, "cudaFree", pfn_cudaFree);
        b &= Bind(s_hCuda, "cudaMemset", pfn_cudaMemset);
        b &= Bind(s_hCuda, "cudaMemGetInfo", pfn_cudaMemGetInfo);
        b &= Bind(s_hCuda, "cudaMemcpy", pfn_cudaMemcpy);
        b &= Bind(s_hCuda, "cudaMemcpyAsync", pfn_cudaMemcpyAsync);
        b &= Bind(s_hCuda, "cudaStreamCreate", pfn_cudaStreamCreate);
        b &= Bind(s_hCuda, "cudaStreamCreateWithPriority", pfn_cudaStreamCreateWithPriority);
        b &= Bind(s_hCuda, "cudaStreamDestroy", pfn_cudaStreamDestroy);
        b &= Bind(s_hCuda, "cudaStreamSynchronize", pfn_cudaStreamSynchronize);
        b &= Bind(s_hCuda, "cudaStreamWaitEvent", pfn_cudaStreamWaitEvent);
        b &= Bind(s_hCuda, "cudaEventCreate", pfn_cudaEventCreate);
        b &= Bind(s_hCuda, "cudaEventCreateWithFlags", pfn_cudaEventCreateWithFlags);
        b &= Bind(s_hCuda, "cudaEventRecord", pfn_cudaEventRecord);
        b &= Bind(s_hCuda, "cudaEventDestroy", pfn_cudaEventDestroy);
        b &= Bind(s_hCuda, "cudaGetErrorString", pfn_cudaGetErrorString);
        b &= Bind(s_hCuda, "cudaGetLastError", pfn_cudaGetLastError);
        if (!b) {
            std::cout << "NvDynLoader ERROR: One or more CUDA entry points not found in "
                << s_cudaPath << std::endl;
            ok = false;
        }
    }
#endif // NV_DYNAMIC_CUDA
    // ── STEP 2: cuDNN pre-load ────────────────────────────────────────────────
    // Must be loaded BEFORE TRT so the OS can resolve TRT's cuDNN dependency:
    //
    // Windows LoadLibraryExA + LOAD_WITH_ALTERED_SEARCH_PATH causes Windows
    //         to use the DLL's own directory when resolving its transitive
    //         dependencies. Pre-loading cuDNN from ANSCENTER dir also puts
    //         it in the process DLL table so TRT's import loader finds it.
    //
    // Linux   RTLD_GLOBAL exports cuDNN symbols process-wide so TRT's
    //         DT_NEEDED resolution succeeds without LD_LIBRARY_PATH changes.
    //         cuDNN 9 also ships split DLLs (libcudnn_ops.so.9, etc.); those
    //         are found automatically once libcudnn.so.9 is globally loaded.
    //
    // Not finding cuDNN is a WARNING, not a fatal error — TRT may still locate
    // it via LD_LIBRARY_PATH / system PATH at load time.
    s_hCudnn = FindAndLoad(CudnnCandidates(), s_cudnnPath, verbose, /*globalLoad=*/true);
    if (!s_hCudnn && verbose) {
        std::cout << "NvDynLoader: WARNING — cuDNN not found in " << ansDir
            << "; TRT will search system PATH / LD_LIBRARY_PATH." << std::endl;
    }
    // ── STEP 3: TensorRT core ─────────────────────────────────────────────────
    s_hTrt = FindAndLoad(TrtCandidates(), s_trtPath, verbose);
    if (!s_hTrt) {
        std::cout << "NvDynLoader ERROR: No TensorRT DLL found. "
            "Searched " << ansDir << " and system PATH." << std::endl;
        ok = false;
    } else {
        s_trtMajor = ExtractMajorVersionFromPath(s_trtPath);
        bool b = true;
        b &= Bind(s_hTrt, "createInferBuilder_INTERNAL", pfn_createInferBuilder_INTERNAL);
        b &= Bind(s_hTrt, "createInferRuntime_INTERNAL", pfn_createInferRuntime_INTERNAL);
        Bind(s_hTrt, "createInferRefitter_INTERNAL", pfn_createInferRefitter_INTERNAL); // optional
        if (!b) {
            std::cout << "NvDynLoader ERROR: Required TRT entry points not found in "
                << s_trtPath << std::endl;
            ok = false;
        } else if (verbose) {
            std::cout << "NvDynLoader: TRT major version = " << s_trtMajor << std::endl;
        }
    }
    // ── STEP 4: ONNX parser ───────────────────────────────────────────────────
    s_hOnnx = FindAndLoad(OnnxCandidates(), s_onnxPath, verbose);
    if (!s_hOnnx) {
        std::cout << "NvDynLoader ERROR: No nvonnxparser DLL found. "
            "Searched " << ansDir << " and system PATH." << std::endl;
        ok = false;
    } else {
        if (!Bind(s_hOnnx, "createNvOnnxParser_INTERNAL", pfn_createNvOnnxParser_INTERNAL)) {
            std::cout << "NvDynLoader ERROR: createNvOnnxParser_INTERNAL not found in "
                << s_onnxPath << std::endl;
            ok = false;
        } else {
            // TRT 10+ optional symbols — absent on TRT 8/9; missing is not an error.
            Bind(s_hOnnx, "createNvOnnxParserRefitter_INTERNAL", pfn_createNvOnnxParserRefitter_INTERNAL);
            Bind(s_hOnnx, "getNvOnnxParserVersion", pfn_getNvOnnxParserVersion);
            // Cross-check: TRT and ONNX-parser must share the same major version.
            // Mismatched majors (e.g. nvinfer_10 + nvonnxparser_11) compile fine
            // but crash at runtime when the ONNX parser tries to populate a TRT
            // network object built by a different major version.
            const int onnxMajor = ExtractMajorVersionFromPath(s_onnxPath);
            if (onnxMajor > 0 && s_trtMajor > 0 && onnxMajor != s_trtMajor) {
                std::cout << "NvDynLoader ERROR: TRT major version (" << s_trtMajor
                    << ") != ONNX-parser major version (" << onnxMajor << ").\n"
                    << " TRT : " << s_trtPath << "\n"
                    << " ONNX : " << s_onnxPath << "\n"
                    << " Replace both DLLs with matching versions in "
                    << ansDir << std::endl;
                ok = false;
            } else if (verbose && onnxMajor > 0) {
                std::cout << "NvDynLoader: ONNX-parser major version = " << onnxMajor << std::endl;
            }
        }
    }
    s_initialized = ok;
    if (verbose) {
        std::cout << "NvDynLoader: "
            << (ok ? "initialised successfully." : "FAILED — check errors above.")
            << std::endl;
        std::cout << "==================================================" << std::endl;
    }
    return ok;
}
// ─────────────────────────────────────────────────────────────────────────────
// Shutdown
// ─────────────────────────────────────────────────────────────────────────────
void NvDynLoader::Shutdown()
{
std::lock_guard<std::mutex> lock(g_initMutex);
// NOTE: No early-return guard on s_initialized here.
//
// Shutdown() must always release whatever handles are currently open,
// including handles left behind by a *partially*-successful Initialize()
// call (where s_initialized == false but some DLLs were already opened).
// Every FreeLib call is guarded by if(h), so this function is safe to call
// on a fully- or partially-initialised instance, and also safe to call
// multiple times (idempotent).
// ── Null all entry-point pointers first ───────────────────────────────────
// Prevents stubs from jumping into freed library memory in the window
// between Shutdown() and a subsequent Initialize() call.
pfn_createInferBuilder_INTERNAL = nullptr;
pfn_createInferRuntime_INTERNAL = nullptr;
pfn_createInferRefitter_INTERNAL = nullptr;
pfn_createNvOnnxParser_INTERNAL = nullptr;
pfn_createNvOnnxParserRefitter_INTERNAL = nullptr;
pfn_getNvOnnxParserVersion = nullptr;
#ifdef NV_DYNAMIC_CUDA
// Null every CUDA function pointer so the assert() in each stub fires
// immediately if a CUDA call is made after Shutdown() without a new
// Initialize(), rather than silently calling into freed DLL memory.
pfn_cudaGetDeviceCount = nullptr;
pfn_cudaSetDevice = nullptr;
pfn_cudaGetDeviceProperties = nullptr;
pfn_cudaDeviceSetLimit = nullptr;
pfn_cudaDeviceSynchronize = nullptr;
pfn_cudaDeviceGetStreamPriorityRange = nullptr;
pfn_cudaMalloc = nullptr;
pfn_cudaFree = nullptr;
pfn_cudaMemset = nullptr;
pfn_cudaMemGetInfo = nullptr;
pfn_cudaMemcpy = nullptr;
pfn_cudaMemcpyAsync = nullptr;
pfn_cudaStreamCreate = nullptr;
pfn_cudaStreamCreateWithPriority = nullptr;
pfn_cudaStreamDestroy = nullptr;
pfn_cudaStreamSynchronize = nullptr;
pfn_cudaStreamWaitEvent = nullptr;
pfn_cudaEventCreate = nullptr;
pfn_cudaEventCreateWithFlags = nullptr;
pfn_cudaEventRecord = nullptr;
pfn_cudaEventDestroy = nullptr;
pfn_cudaGetErrorString = nullptr;
pfn_cudaGetLastError = nullptr;
#endif // NV_DYNAMIC_CUDA
// ── Release handles in reverse load order ─────────────────────────────────
// ONNX parser → depends on TRT
// TRT → depends on cuDNN + CUDA RT
// cuDNN → depends on CUDA RT
// CUDA RT → base dependency
if (s_hOnnx) { FreeLib(s_hOnnx); s_hOnnx = nullptr; }
if (s_hTrt) { FreeLib(s_hTrt); s_hTrt = nullptr; }
if (s_hCudnn) { FreeLib(s_hCudnn); s_hCudnn = nullptr; }
#ifdef NV_DYNAMIC_CUDA
if (s_hCuda) { FreeLib(s_hCuda); s_hCuda = nullptr; }
#endif
// ── Reset version / path state ────────────────────────────────────────────
// A subsequent Initialize() call will re-discover fresh paths and versions.
s_trtMajor = 0;
s_trtPath.clear();
s_onnxPath.clear();
s_cudaPath.clear();
s_cudnnPath.clear();
s_initialized = false;
}