Files
ANSCORE/core/ANSLibsLoader/NvDynLoader.cpp

647 lines
36 KiB
C++
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
// ============================================================================
// NvDynLoader.cpp
//
// Moved from TensorRTAPI/ to ANSLibsLoader/ for centralized library management.
// Now compiled into ANSLibsLoader.dll — all extern "C" stubs are exported via
// ANSLIBS_API so consuming projects can link ANSLibsLoader.lib instead of
// nvinfer_XX.lib / nvonnxparser_XX.lib.
//
// Two responsibilities:
//
// 1. Provide extern "C" stub DEFINITIONS for every symbol that TRT / ONNX
// parser inline wrappers reference. This satisfies the linker without
// linking nvinfer_XX.lib / nvonnxparser_XX.lib. Each stub forwards the
// call through a function pointer that Initialize() populates at runtime.
//
// 2. Implement NvDynLoader::Initialize() which searches for installed NVIDIA
// DLLs (trying multiple major versions, newest first) and binds all
// required function pointers.
// ============================================================================
#include "NvDynLoader.h"
#include "DynLibUtils.h"
#include <iostream>
#include <cassert>
#include <mutex>
using namespace ANSCENTER::DynLib;
// ─────────────────────────────────────────────────────────────────────────────
// Static member definitions
//
// All loader state is class-static: one set of library handles and entry-point
// pointers per process. Everything starts zeroed; Initialize() populates it
// and Shutdown() resets it.
// ─────────────────────────────────────────────────────────────────────────────
bool NvDynLoader::s_initialized = false;   // set true only by a fully successful Initialize()
int NvDynLoader::s_trtMajor = 0;           // TRT major version parsed from the loaded library name (0 = unknown)
std::string NvDynLoader::s_trtPath;        // full path of the TensorRT library actually loaded
std::string NvDynLoader::s_onnxPath;       // full path of the ONNX-parser library actually loaded
std::string NvDynLoader::s_cudaPath;       // full path of the CUDA runtime (NV_DYNAMIC_CUDA builds)
std::string NvDynLoader::s_cudnnPath;      // full path of cuDNN, if the pre-load found it
LibHandle NvDynLoader::s_hTrt = nullptr;   // OS handle: TensorRT core
LibHandle NvDynLoader::s_hOnnx = nullptr;  // OS handle: ONNX parser
LibHandle NvDynLoader::s_hCuda = nullptr;  // OS handle: CUDA runtime (used only when NV_DYNAMIC_CUDA)
LibHandle NvDynLoader::s_hCudnn = nullptr; // OS handle: cuDNN pre-load (may legitimately stay null)
// TRT / ONNX-parser factory entry points — bound by Initialize(), forwarded to
// by the extern "C" stubs below. Null until bound; nulled again by Shutdown().
NvDynLoader::PfnBuilder* NvDynLoader::pfn_createInferBuilder_INTERNAL = nullptr;
NvDynLoader::PfnRuntime* NvDynLoader::pfn_createInferRuntime_INTERNAL = nullptr;
NvDynLoader::PfnRefitter* NvDynLoader::pfn_createInferRefitter_INTERNAL = nullptr;
NvDynLoader::PfnParser* NvDynLoader::pfn_createNvOnnxParser_INTERNAL = nullptr;
NvDynLoader::PfnParserRefitter* NvDynLoader::pfn_createNvOnnxParserRefitter_INTERNAL = nullptr;
NvDynLoader::PfnGetParserVersion* NvDynLoader::pfn_getNvOnnxParserVersion = nullptr;
#ifdef NV_DYNAMIC_CUDA
// CUDA runtime entry points — one pointer per stub in the CUDA_STUB_N block
// below. Signatures mirror cuda_runtime.h exactly.
cudaError_t (*NvDynLoader::pfn_cudaGetDeviceCount) (int*) = nullptr;
cudaError_t (*NvDynLoader::pfn_cudaSetDevice) (int) = nullptr;
cudaError_t (*NvDynLoader::pfn_cudaGetDeviceProperties) (cudaDeviceProp*, int) = nullptr;
cudaError_t (*NvDynLoader::pfn_cudaDeviceSetLimit) (cudaLimit, size_t) = nullptr;
cudaError_t (*NvDynLoader::pfn_cudaDeviceSynchronize) () = nullptr;
cudaError_t (*NvDynLoader::pfn_cudaDeviceGetStreamPriorityRange)(int*, int*) = nullptr;
cudaError_t (*NvDynLoader::pfn_cudaMalloc) (void**, size_t) = nullptr;
cudaError_t (*NvDynLoader::pfn_cudaFree) (void*) = nullptr;
cudaError_t (*NvDynLoader::pfn_cudaMemset) (void*, int, size_t) = nullptr;
cudaError_t (*NvDynLoader::pfn_cudaMemGetInfo) (size_t*, size_t*) = nullptr;
cudaError_t (*NvDynLoader::pfn_cudaMemcpy) (void*, const void*, size_t, cudaMemcpyKind) = nullptr;
cudaError_t (*NvDynLoader::pfn_cudaMemcpyAsync) (void*, const void*, size_t, cudaMemcpyKind, cudaStream_t) = nullptr;
cudaError_t (*NvDynLoader::pfn_cudaStreamCreate) (cudaStream_t*) = nullptr;
cudaError_t (*NvDynLoader::pfn_cudaStreamCreateWithPriority) (cudaStream_t*, unsigned int, int) = nullptr;
cudaError_t (*NvDynLoader::pfn_cudaStreamDestroy) (cudaStream_t) = nullptr;
cudaError_t (*NvDynLoader::pfn_cudaStreamSynchronize) (cudaStream_t) = nullptr;
cudaError_t (*NvDynLoader::pfn_cudaStreamWaitEvent) (cudaStream_t, cudaEvent_t, unsigned int) = nullptr;
cudaError_t (*NvDynLoader::pfn_cudaEventCreate) (cudaEvent_t*) = nullptr;
cudaError_t (*NvDynLoader::pfn_cudaEventCreateWithFlags) (cudaEvent_t*, unsigned int) = nullptr;
cudaError_t (*NvDynLoader::pfn_cudaEventRecord) (cudaEvent_t, cudaStream_t) = nullptr;
cudaError_t (*NvDynLoader::pfn_cudaEventDestroy) (cudaEvent_t) = nullptr;
const char* (*NvDynLoader::pfn_cudaGetErrorString) (cudaError_t) = nullptr;
cudaError_t (*NvDynLoader::pfn_cudaGetLastError) () = nullptr;
#endif // NV_DYNAMIC_CUDA
// ─────────────────────────────────────────────────────────────────────────────
// extern "C" stubs (exported from ANSLibsLoader.dll via .def file)
//
// These provide the linker symbols that TRT / ONNX-parser inline wrappers
// reference. TENSORRTAPI / NVONNXPARSER_API are overridden to empty in
// NvDynLoader.h, so the NvInfer.h / NvOnnxParser.h declarations are plain
// extern "C" (no __declspec attribute). Our definitions below are also
// plain extern "C" — no __declspec attribute — so the linkage matches.
//
// The symbols are exported from the DLL via ANSLibsLoader.def, NOT via
// __declspec(dllexport). This avoids the C2375 "different linkage" error
// that occurs when NvInfer.h and our definitions carry different attributes.
// ─────────────────────────────────────────────────────────────────────────────
extern "C" {
// createInferBuilder_INTERNAL — factory behind nvinfer1::createInferBuilder().
// Required symbol: the assert fires (debug builds) if Initialize() has not
// bound the pointer before the first TRT use.
void* createInferBuilder_INTERNAL(void* logger, int32_t version) noexcept
{
    assert(NvDynLoader::pfn_createInferBuilder_INTERNAL &&
        "NvDynLoader::Initialize() has not been called before the first TRT use.");
    return NvDynLoader::pfn_createInferBuilder_INTERNAL(logger, version);
}
// createInferRuntime_INTERNAL — factory behind nvinfer1::createInferRuntime().
// Required symbol; same Initialize()-first contract as the builder factory.
void* createInferRuntime_INTERNAL(void* logger, int32_t version) noexcept
{
    assert(NvDynLoader::pfn_createInferRuntime_INTERNAL &&
        "NvDynLoader::Initialize() has not been called before the first TRT use.");
    return NvDynLoader::pfn_createInferRuntime_INTERNAL(logger, version);
}
// createInferRefitter_INTERNAL — only needed if engine refitting is used;
// guarded so a missing symbol is a no-op (returns nullptr) rather than an
// assert failure.
void* createInferRefitter_INTERNAL(void* engine, void* logger, int32_t version) noexcept
{
    if (!NvDynLoader::pfn_createInferRefitter_INTERNAL) return nullptr;
    return NvDynLoader::pfn_createInferRefitter_INTERNAL(engine, logger, version);
}
// createNvOnnxParser_INTERNAL — factory behind nvonnxparser::createParser().
// Required symbol; asserted like the TRT factories above.
void* createNvOnnxParser_INTERNAL(void* network, void* logger, int32_t version) noexcept
{
    assert(NvDynLoader::pfn_createNvOnnxParser_INTERNAL &&
        "NvDynLoader::Initialize() has not been called before the first ONNX-parser use.");
    return NvDynLoader::pfn_createNvOnnxParser_INTERNAL(network, logger, version);
}
// createNvOnnxParserRefitter_INTERNAL — TRT 10+ symbol; absent on TRT 8/9.
// Guarded rather than asserted so older TRT versions continue to work.
void* createNvOnnxParserRefitter_INTERNAL(void* refitter, void* logger, int32_t version) noexcept
{
    if (!NvDynLoader::pfn_createNvOnnxParserRefitter_INTERNAL) return nullptr;
    return NvDynLoader::pfn_createNvOnnxParserRefitter_INTERNAL(refitter, logger, version);
}
// getNvOnnxParserVersion — optional version query; returns 0 if not available.
int getNvOnnxParserVersion() noexcept
{
    if (!NvDynLoader::pfn_getNvOnnxParserVersion) return 0;
    return NvDynLoader::pfn_getNvOnnxParserVersion();
}
} // extern "C"
// ─────────────────────────────────────────────────────────────────────────────
// CUDA stubs (only when NV_DYNAMIC_CUDA is defined)
// Removes the cudart_static.lib dependency; CUDA RT is loaded dynamically.
//
// WHY extern "C":
// cuda_runtime.h wraps every public API in `extern "C"` when compiled as C++,
// giving each function C linkage (no name mangling). Our stub DEFINITIONS
// must use the same linkage — otherwise the linker sees a C-linkage declaration
// (from the header) paired with a C++-mangled definition (from our .cpp), which
// is an unresolved-symbol error on both MSVC and GCC/Clang.
//
// WHY cudaDeviceSynchronize / cudaGetLastError are NOT in the macro list:
// CUDA_STUB_N macros require at least one parameter. These two functions take
// zero arguments; they are defined manually in the block below. Do NOT add
// them back to the macro invocation list — it would create duplicate definitions.
// ─────────────────────────────────────────────────────────────────────────────
#ifdef NV_DYNAMIC_CUDA
extern "C" { // ← must match the extern "C" in cuda_runtime.h
// CUDA_STUB_N(ret, name, t0, a0, ..., tN-1, aN-1) expands to one forwarding
// stub: it asserts that Initialize() bound pfn_<name>, then tail-calls it.
// N is the parameter count; each parameter contributes a (type, name) pair.
// The stub name must match the cuda_runtime.h declaration exactly so the
// linker resolves the header's extern "C" declaration against our definition.
#define CUDA_STUB_1(ret, name, t0, a0) \
ret name(t0 a0) { \
assert(NvDynLoader::pfn_##name); return NvDynLoader::pfn_##name(a0); }
#define CUDA_STUB_2(ret, name, t0,a0, t1,a1) \
ret name(t0 a0, t1 a1) { \
assert(NvDynLoader::pfn_##name); return NvDynLoader::pfn_##name(a0,a1); }
#define CUDA_STUB_3(ret, name, t0,a0, t1,a1, t2,a2) \
ret name(t0 a0, t1 a1, t2 a2) { \
assert(NvDynLoader::pfn_##name); return NvDynLoader::pfn_##name(a0,a1,a2); }
#define CUDA_STUB_4(ret, name, t0,a0, t1,a1, t2,a2, t3,a3) \
ret name(t0 a0, t1 a1, t2 a2, t3 a3) { \
assert(NvDynLoader::pfn_##name); return NvDynLoader::pfn_##name(a0,a1,a2,a3); }
#define CUDA_STUB_5(ret, name, t0,a0, t1,a1, t2,a2, t3,a3, t4,a4) \
ret name(t0 a0, t1 a1, t2 a2, t3 a3, t4 a4) { \
assert(NvDynLoader::pfn_##name); return NvDynLoader::pfn_##name(a0,a1,a2,a3,a4); }
// One invocation per CUDA runtime function this project uses. Keep this list
// in sync with the pfn_ static members above and the Bind() calls in
// Initialize().
CUDA_STUB_1(cudaError_t, cudaGetDeviceCount, int*, count)
CUDA_STUB_1(cudaError_t, cudaSetDevice, int, dev)
CUDA_STUB_2(cudaError_t, cudaGetDeviceProperties, cudaDeviceProp*, prop, int, dev)
CUDA_STUB_2(cudaError_t, cudaDeviceSetLimit, cudaLimit, limit, size_t, value)
CUDA_STUB_2(cudaError_t, cudaDeviceGetStreamPriorityRange, int*, leastPriority, int*, greatestPriority)
CUDA_STUB_2(cudaError_t, cudaMalloc, void**, devPtr, size_t, size)
CUDA_STUB_1(cudaError_t, cudaFree, void*, devPtr)
CUDA_STUB_3(cudaError_t, cudaMemset, void*, devPtr, int, value, size_t, count)
CUDA_STUB_2(cudaError_t, cudaMemGetInfo, size_t*, free, size_t*, total)
CUDA_STUB_4(cudaError_t, cudaMemcpy, void*, dst, const void*, src, size_t, count, cudaMemcpyKind, kind)
CUDA_STUB_5(cudaError_t, cudaMemcpyAsync, void*, dst, const void*, src, size_t, count, cudaMemcpyKind, kind, cudaStream_t, stream)
CUDA_STUB_1(cudaError_t, cudaStreamCreate, cudaStream_t*, stream)
CUDA_STUB_3(cudaError_t, cudaStreamCreateWithPriority, cudaStream_t*, stream, unsigned int, flags, int, priority)
CUDA_STUB_1(cudaError_t, cudaStreamDestroy, cudaStream_t, stream)
CUDA_STUB_1(cudaError_t, cudaStreamSynchronize, cudaStream_t, stream)
CUDA_STUB_3(cudaError_t, cudaStreamWaitEvent, cudaStream_t, stream, cudaEvent_t, event, unsigned int, flags)
CUDA_STUB_1(cudaError_t, cudaEventCreate, cudaEvent_t*, event)
CUDA_STUB_2(cudaError_t, cudaEventCreateWithFlags, cudaEvent_t*, event, unsigned int, flags)
CUDA_STUB_2(cudaError_t, cudaEventRecord, cudaEvent_t, event, cudaStream_t, stream)
CUDA_STUB_1(cudaError_t, cudaEventDestroy, cudaEvent_t, event)
CUDA_STUB_1(const char*, cudaGetErrorString, cudaError_t, error)
// 0-argument stubs — defined manually; NOT in the macro list above
// (CUDA_STUB_N macros require at least one parameter — see header comment):
cudaError_t cudaDeviceSynchronize() {
    assert(NvDynLoader::pfn_cudaDeviceSynchronize);
    return NvDynLoader::pfn_cudaDeviceSynchronize();
}
cudaError_t cudaGetLastError() {
    assert(NvDynLoader::pfn_cudaGetLastError);
    return NvDynLoader::pfn_cudaGetLastError();
}
// Undefine the helper macros so they cannot leak into other translation units
// via unity builds or accidental includes.
#undef CUDA_STUB_1
#undef CUDA_STUB_2
#undef CUDA_STUB_3
#undef CUDA_STUB_4
#undef CUDA_STUB_5
} // extern "C"
#endif // NV_DYNAMIC_CUDA
// ─────────────────────────────────────────────────────────────────────────────
// Helper — shared directory path with trailing separator
// ─────────────────────────────────────────────────────────────────────────────
static std::string SharedDirWithSep()
{
std::string dir = DEFAULT_SHARED_DIR;
#ifdef _WIN32
if (!dir.empty() && dir.back() != '\\') dir += '\\';
#else
if (!dir.empty() && dir.back() != '/') dir += '/';
#endif
return dir;
}
// ─────────────────────────────────────────────────────────────────────────────
// DLL candidate lists
// Priority 1 — ANSCENTER shared directory (full absolute path, newest first)
// Priority 2 — System PATH / LD_LIBRARY_PATH fallback (bare name, newest first)
// ─────────────────────────────────────────────────────────────────────────────
std::vector<std::string> NvDynLoader::TrtCandidates()
{
    // Two-tier candidate list, newest major version first within each tier:
    //   tier 1 — absolute paths into the ANSCENTER shared directory
    //   tier 2 — bare names resolved via system PATH / LD_LIBRARY_PATH
    const std::string ansDir = SharedDirWithSep();
    std::vector<std::string> names;
    auto appendMajors = [&names](const std::string& prefix) {
        for (int major = 20; major >= 8; --major) {
#ifdef _WIN32
            names.push_back(prefix + "nvinfer_" + std::to_string(major) + ".dll");
#else
            names.push_back(prefix + "libnvinfer.so." + std::to_string(major));
#endif
        }
    };
    appendMajors(ansDir);        // Priority 1: ANSCENTER shared directory
    appendMajors(std::string()); // Priority 2: system search path
    return names;
}
std::vector<std::string> NvDynLoader::OnnxCandidates()
{
    // Same two-tier search as TrtCandidates(): ANSCENTER shared directory
    // first (absolute paths), then bare names for the system search path —
    // newest major version first within each tier.
    const std::string ansDir = SharedDirWithSep();
    std::vector<std::string> names;
    auto appendMajors = [&names](const std::string& prefix) {
        for (int major = 20; major >= 8; --major) {
#ifdef _WIN32
            names.push_back(prefix + "nvonnxparser_" + std::to_string(major) + ".dll");
#else
            names.push_back(prefix + "libnvonnxparser.so." + std::to_string(major));
#endif
        }
    };
    appendMajors(ansDir);        // Priority 1: ANSCENTER shared directory
    appendMajors(std::string()); // Priority 2: system search path
    return names;
}
std::vector<std::string> NvDynLoader::CudnnCandidates()
{
    // cuDNN naming conventions:
    //   Windows: cudnn64_<major>.dll  (e.g. cudnn64_8.dll, cudnn64_9.dll)
    //   Linux:   libcudnn.so.<major>  (e.g. libcudnn.so.8, libcudnn.so.9)
    //
    // Note: cuDNN 9 on Windows ships as multiple split DLLs (cudnn_ops.dll,
    // cudnn_cnn.dll, etc.) but cudnn64_9.dll remains the main entry-point /
    // compatibility shim. The split DLLs resolve automatically once the
    // ANSCENTER directory is on the search path via
    // LOAD_WITH_ALTERED_SEARCH_PATH (Windows) or RTLD_GLOBAL (Linux).
    //
    // Two-tier search, newest major first in each tier.
    const std::string ansDir = SharedDirWithSep();
    std::vector<std::string> names;
    auto appendMajors = [&names](const std::string& prefix) {
        for (int major = 12; major >= 7; --major) {
#ifdef _WIN32
            names.push_back(prefix + "cudnn64_" + std::to_string(major) + ".dll");
#else
            names.push_back(prefix + "libcudnn.so." + std::to_string(major));
#endif
        }
    };
    appendMajors(ansDir);        // Priority 1: ANSCENTER shared directory
    appendMajors(std::string()); // Priority 2: system search path
    return names;
}
#ifdef NV_DYNAMIC_CUDA
std::vector<std::string> NvDynLoader::CudaRtCandidates()
{
    // CUDA runtime naming conventions:
    //
    // Windows — NVIDIA changed naming at CUDA 12:
    //   CUDA 11.x : cudart64_110.dll … cudart64_118.dll (major + minor digit)
    //   CUDA 12+  : cudart64_12.dll, cudart64_13.dll …  (major-only)
    //
    // Linux — the major-only symlink is the stable name on all versions:
    //   libcudart.so.11, libcudart.so.12, libcudart.so.13 …
    //   (major.minor symlinks also tried as fallback for CUDA 11.x)
    //
    // Two-tier search: ANSCENTER shared directory first, then bare names for
    // the system search path — newest versions first within each tier.
    const std::string ansDir = SharedDirWithSep();
    std::vector<std::string> names;
    auto appendNames = [&names](const std::string& prefix) {
#ifdef _WIN32
        for (int major = 15; major >= 12; --major)  // CUDA 12+ : major-only
            names.push_back(prefix + "cudart64_" + std::to_string(major) + ".dll");
        for (int minor = 9; minor >= 0; --minor)    // CUDA 11.x: major+minor
            names.push_back(prefix + "cudart64_11" + std::to_string(minor) + ".dll");
        names.push_back(prefix + "cudart64.dll");   // non-versioned alias
#else
        for (int major = 15; major >= 11; --major)  // all versions: major-only
            names.push_back(prefix + "libcudart.so." + std::to_string(major));
        for (int minor = 9; minor >= 0; --minor)    // CUDA 11.x: major.minor fallback
            names.push_back(prefix + "libcudart.so.11." + std::to_string(minor));
#endif
    };
    appendNames(ansDir);        // Priority 1: ANSCENTER shared directory
    appendNames(std::string()); // Priority 2: system search path
    return names;
}
#endif
// ─────────────────────────────────────────────────────────────────────────────
// Initialize
// ─────────────────────────────────────────────────────────────────────────────
// Serialises Initialize() / Shutdown(); file-local so it is not exposed in the
// header.
static std::mutex g_initMutex;
// Initialize — discover and load the NVIDIA library stack, then bind every
// function pointer the extern "C" stubs above forward through.
//
// Load order (rationale inline at each step):
//   1. CUDA runtime  (NV_DYNAMIC_CUDA builds only; required there)
//   2. cuDNN         (pre-load; missing is only a WARNING)
//   3. TensorRT core (required)
//   4. ONNX parser   (required; its major version must match TRT's)
//
// Thread-safe via g_initMutex; returns early (true) if already initialised.
// Partially-loaded handles from a previous failed attempt are released before
// retrying. Error messages always go to stdout; `verbose` additionally prints
// discovery progress.
//
// @param verbose  print discovery/progress banners to stdout
// @return true only if every REQUIRED library and entry point was resolved
bool NvDynLoader::Initialize(bool verbose)
{
    std::lock_guard<std::mutex> lock(g_initMutex);
    if (s_initialized) return true;
    const std::string ansDir = SharedDirWithSep();
    // If a previous Initialize() call partially loaded some libraries before
    // failing (s_initialized stays false but handles may be non-null), release
    // them now so we can retry cleanly without leaking handles.
    if (s_hOnnx) { FreeLib(s_hOnnx); s_hOnnx = nullptr; }
    if (s_hTrt) { FreeLib(s_hTrt); s_hTrt = nullptr; }
    if (s_hCudnn) { FreeLib(s_hCudnn); s_hCudnn = nullptr; }
#ifdef NV_DYNAMIC_CUDA
    if (s_hCuda) { FreeLib(s_hCuda); s_hCuda = nullptr; }
#endif
    // ok accumulates across all steps: every step runs even after a failure so
    // the log shows ALL problems in one pass, not just the first.
    bool ok = true;
    if (verbose) {
        std::cout << "==================================================" << std::endl;
        std::cout << "NvDynLoader: discovering NVIDIA libraries..." << std::endl;
        std::cout << " Shared dir : " << ansDir << std::endl;
        std::cout << "==================================================" << std::endl;
    }
    // ── STEP 1: CUDA Runtime (NV_DYNAMIC_CUDA only) ───────────────────────────
    // Loaded first so cuDNN and TRT find CUDA symbols already resident in the
    // process image when they are opened.
    // Linux: globalLoad=true (RTLD_GLOBAL) makes cudart symbols process-wide.
    // Windows: LoadLibraryExA is always process-global; globalLoad is a no-op.
#ifdef NV_DYNAMIC_CUDA
    s_hCuda = FindAndLoad(CudaRtCandidates(), s_cudaPath, verbose, /*globalLoad=*/true);
    if (!s_hCuda) {
        std::cout << "NvDynLoader ERROR: No CUDA runtime DLL found. "
            "Searched " << ansDir << " and system PATH." << std::endl;
        ok = false;
    } else {
        // b tracks symbol binding only within this step; any single missing
        // required symbol fails the whole step.
        bool b = true;
        b &= Bind(s_hCuda, "cudaGetDeviceCount", pfn_cudaGetDeviceCount);
        b &= Bind(s_hCuda, "cudaSetDevice", pfn_cudaSetDevice);
        // cudaGetDeviceProperties: CUDA 12 renamed the export to '_v2'; CUDA 11
        // uses the plain name. Try the CUDA 12+ symbol first, fall back to
        // the CUDA 11 name. IMPORTANT: do NOT `b &=` the first attempt before
        // the fallback — that would set b=false on CUDA 11 even though the
        // plain-name fallback succeeds, causing a spurious init failure.
        Bind(s_hCuda, "cudaGetDeviceProperties_v2", pfn_cudaGetDeviceProperties); // CUDA 12+
        if (!pfn_cudaGetDeviceProperties)
            Bind(s_hCuda, "cudaGetDeviceProperties", pfn_cudaGetDeviceProperties); // CUDA 11
        b &= (pfn_cudaGetDeviceProperties != nullptr);
        b &= Bind(s_hCuda, "cudaDeviceSetLimit", pfn_cudaDeviceSetLimit);
        b &= Bind(s_hCuda, "cudaDeviceSynchronize", pfn_cudaDeviceSynchronize);
        b &= Bind(s_hCuda, "cudaDeviceGetStreamPriorityRange", pfn_cudaDeviceGetStreamPriorityRange);
        b &= Bind(s_hCuda, "cudaMalloc", pfn_cudaMalloc);
        b &= Bind(s_hCuda, "cudaFree", pfn_cudaFree);
        b &= Bind(s_hCuda, "cudaMemset", pfn_cudaMemset);
        b &= Bind(s_hCuda, "cudaMemGetInfo", pfn_cudaMemGetInfo);
        b &= Bind(s_hCuda, "cudaMemcpy", pfn_cudaMemcpy);
        b &= Bind(s_hCuda, "cudaMemcpyAsync", pfn_cudaMemcpyAsync);
        b &= Bind(s_hCuda, "cudaStreamCreate", pfn_cudaStreamCreate);
        b &= Bind(s_hCuda, "cudaStreamCreateWithPriority", pfn_cudaStreamCreateWithPriority);
        b &= Bind(s_hCuda, "cudaStreamDestroy", pfn_cudaStreamDestroy);
        b &= Bind(s_hCuda, "cudaStreamSynchronize", pfn_cudaStreamSynchronize);
        b &= Bind(s_hCuda, "cudaStreamWaitEvent", pfn_cudaStreamWaitEvent);
        b &= Bind(s_hCuda, "cudaEventCreate", pfn_cudaEventCreate);
        b &= Bind(s_hCuda, "cudaEventCreateWithFlags", pfn_cudaEventCreateWithFlags);
        b &= Bind(s_hCuda, "cudaEventRecord", pfn_cudaEventRecord);
        b &= Bind(s_hCuda, "cudaEventDestroy", pfn_cudaEventDestroy);
        b &= Bind(s_hCuda, "cudaGetErrorString", pfn_cudaGetErrorString);
        b &= Bind(s_hCuda, "cudaGetLastError", pfn_cudaGetLastError);
        if (!b) {
            std::cout << "NvDynLoader ERROR: One or more CUDA entry points not found in "
                << s_cudaPath << std::endl;
            ok = false;
        }
    }
#endif // NV_DYNAMIC_CUDA
    // ── STEP 2: cuDNN pre-load ────────────────────────────────────────────────
    // Must be loaded BEFORE TRT so the OS can resolve TRT's cuDNN dependency:
    //
    // Windows LoadLibraryExA + LOAD_WITH_ALTERED_SEARCH_PATH causes Windows
    // to use the DLL's own directory when resolving its transitive
    // dependencies. Pre-loading cuDNN from ANSCENTER dir also puts
    // it in the process DLL table so TRT's import loader finds it.
    //
    // Linux RTLD_GLOBAL exports cuDNN symbols process-wide so TRT's
    // DT_NEEDED resolution succeeds without LD_LIBRARY_PATH changes.
    // cuDNN 9 also ships split DLLs (libcudnn_ops.so.9, etc.); those
    // are found automatically once libcudnn.so.9 is globally loaded.
    //
    // Not finding cuDNN is a WARNING, not a fatal error — TRT may still locate
    // it via LD_LIBRARY_PATH / system PATH at load time.
    s_hCudnn = FindAndLoad(CudnnCandidates(), s_cudnnPath, verbose, /*globalLoad=*/true);
    if (!s_hCudnn && verbose) {
        std::cout << "NvDynLoader: WARNING — cuDNN not found in " << ansDir
            << "; TRT will search system PATH / LD_LIBRARY_PATH." << std::endl;
    }
    // ── STEP 3: TensorRT core ─────────────────────────────────────────────────
    s_hTrt = FindAndLoad(TrtCandidates(), s_trtPath, verbose);
    if (!s_hTrt) {
        std::cout << "NvDynLoader ERROR: No TensorRT DLL found. "
            "Searched " << ansDir << " and system PATH." << std::endl;
        ok = false;
    } else {
        // Extract major version from the loaded library path.
        // Windows : "C:\...\nvinfer_10.dll" → digits between last '_' and last '.'
        // Linux   : ".../libnvinfer.so.10"  → digits after last '.'
        // A non-numeric result leaves s_trtMajor at 0 ("unknown"), which also
        // disables the TRT/ONNX major-version cross-check in STEP 4.
#ifdef _WIN32
        {
            const size_t under = s_trtPath.find_last_of('_');
            const size_t dot = s_trtPath.find_last_of('.');
            if (under != std::string::npos && dot != std::string::npos && dot > under) {
                try { s_trtMajor = std::stoi(s_trtPath.substr(under + 1, dot - under - 1)); }
                catch (...) { /* non-numeric — leave s_trtMajor at 0 */ }
            }
        }
#else
        {
            const size_t dot = s_trtPath.find_last_of('.');
            if (dot != std::string::npos && dot + 1 < s_trtPath.size()) {
                try { s_trtMajor = std::stoi(s_trtPath.substr(dot + 1)); }
                catch (...) { /* non-numeric — leave s_trtMajor at 0 */ }
            }
        }
#endif
        bool b = true;
        b &= Bind(s_hTrt, "createInferBuilder_INTERNAL", pfn_createInferBuilder_INTERNAL);
        b &= Bind(s_hTrt, "createInferRuntime_INTERNAL", pfn_createInferRuntime_INTERNAL);
        Bind(s_hTrt, "createInferRefitter_INTERNAL", pfn_createInferRefitter_INTERNAL); // optional
        if (!b) {
            std::cout << "NvDynLoader ERROR: Required TRT entry points not found in "
                << s_trtPath << std::endl;
            ok = false;
        } else if (verbose) {
            std::cout << "NvDynLoader: TRT major version = " << s_trtMajor << std::endl;
        }
    }
    // ── STEP 4: ONNX parser ───────────────────────────────────────────────────
    s_hOnnx = FindAndLoad(OnnxCandidates(), s_onnxPath, verbose);
    if (!s_hOnnx) {
        std::cout << "NvDynLoader ERROR: No nvonnxparser DLL found. "
            "Searched " << ansDir << " and system PATH." << std::endl;
        ok = false;
    } else {
        if (!Bind(s_hOnnx, "createNvOnnxParser_INTERNAL", pfn_createNvOnnxParser_INTERNAL)) {
            std::cout << "NvDynLoader ERROR: createNvOnnxParser_INTERNAL not found in "
                << s_onnxPath << std::endl;
            ok = false;
        } else {
            // TRT 10+ optional symbols — absent on TRT 8/9; missing is not an error.
            Bind(s_hOnnx, "createNvOnnxParserRefitter_INTERNAL", pfn_createNvOnnxParserRefitter_INTERNAL);
            Bind(s_hOnnx, "getNvOnnxParserVersion", pfn_getNvOnnxParserVersion);
            // Cross-check: TRT and ONNX-parser must share the same major version.
            // Mismatched majors (e.g. nvinfer_10 + nvonnxparser_11) compile fine
            // but crash at runtime when the ONNX parser tries to populate a TRT
            // network object built by a different major version.
            // Same parse logic as the TRT block above; 0 = could not determine.
            int onnxMajor = 0;
#ifdef _WIN32
            {
                const size_t under = s_onnxPath.find_last_of('_');
                const size_t dot = s_onnxPath.find_last_of('.');
                if (under != std::string::npos && dot != std::string::npos && dot > under)
                    try { onnxMajor = std::stoi(s_onnxPath.substr(under + 1, dot - under - 1)); }
                catch (...) {}
            }
#else
            {
                const size_t dot = s_onnxPath.find_last_of('.');
                if (dot != std::string::npos && dot + 1 < s_onnxPath.size())
                    try { onnxMajor = std::stoi(s_onnxPath.substr(dot + 1)); }
                catch (...) {}
            }
#endif
            // The mismatch check only fires when BOTH majors were determined;
            // an unknown version (0) is tolerated rather than rejected.
            if (onnxMajor > 0 && s_trtMajor > 0 && onnxMajor != s_trtMajor) {
                std::cout << "NvDynLoader ERROR: TRT major version (" << s_trtMajor
                    << ") != ONNX-parser major version (" << onnxMajor << ").\n"
                    << " TRT : " << s_trtPath << "\n"
                    << " ONNX : " << s_onnxPath << "\n"
                    << " Replace both DLLs with matching versions in "
                    << ansDir << std::endl;
                ok = false;
            } else if (verbose && onnxMajor > 0) {
                std::cout << "NvDynLoader: ONNX-parser major version = " << onnxMajor << std::endl;
            }
        }
    }
    // s_initialized is only set on full success; on failure the next call's
    // cleanup pass (top of this function) releases any handles opened above.
    s_initialized = ok;
    if (verbose) {
        std::cout << "NvDynLoader: "
            << (ok ? "initialised successfully." : "FAILED — check errors above.")
            << std::endl;
        std::cout << "==================================================" << std::endl;
    }
    return ok;
}
// ─────────────────────────────────────────────────────────────────────────────
// Shutdown
// ─────────────────────────────────────────────────────────────────────────────
void NvDynLoader::Shutdown()
{
std::lock_guard<std::mutex> lock(g_initMutex);
// NOTE: No early-return guard on s_initialized here.
//
// Shutdown() must always release whatever handles are currently open,
// including handles left behind by a *partially*-successful Initialize()
// call (where s_initialized == false but some DLLs were already opened).
// Every FreeLib call is guarded by if(h), so this function is safe to call
// on a fully- or partially-initialised instance, and also safe to call
// multiple times (idempotent).
// ── Null all entry-point pointers first ───────────────────────────────────
// Prevents stubs from jumping into freed library memory in the window
// between Shutdown() and a subsequent Initialize() call.
pfn_createInferBuilder_INTERNAL = nullptr;
pfn_createInferRuntime_INTERNAL = nullptr;
pfn_createInferRefitter_INTERNAL = nullptr;
pfn_createNvOnnxParser_INTERNAL = nullptr;
pfn_createNvOnnxParserRefitter_INTERNAL = nullptr;
pfn_getNvOnnxParserVersion = nullptr;
#ifdef NV_DYNAMIC_CUDA
// Null every CUDA function pointer so the assert() in each stub fires
// immediately if a CUDA call is made after Shutdown() without a new
// Initialize(), rather than silently calling into freed DLL memory.
pfn_cudaGetDeviceCount = nullptr;
pfn_cudaSetDevice = nullptr;
pfn_cudaGetDeviceProperties = nullptr;
pfn_cudaDeviceSetLimit = nullptr;
pfn_cudaDeviceSynchronize = nullptr;
pfn_cudaDeviceGetStreamPriorityRange = nullptr;
pfn_cudaMalloc = nullptr;
pfn_cudaFree = nullptr;
pfn_cudaMemset = nullptr;
pfn_cudaMemGetInfo = nullptr;
pfn_cudaMemcpy = nullptr;
pfn_cudaMemcpyAsync = nullptr;
pfn_cudaStreamCreate = nullptr;
pfn_cudaStreamCreateWithPriority = nullptr;
pfn_cudaStreamDestroy = nullptr;
pfn_cudaStreamSynchronize = nullptr;
pfn_cudaStreamWaitEvent = nullptr;
pfn_cudaEventCreate = nullptr;
pfn_cudaEventCreateWithFlags = nullptr;
pfn_cudaEventRecord = nullptr;
pfn_cudaEventDestroy = nullptr;
pfn_cudaGetErrorString = nullptr;
pfn_cudaGetLastError = nullptr;
#endif // NV_DYNAMIC_CUDA
// ── Release handles in reverse load order ─────────────────────────────────
// ONNX parser → depends on TRT
// TRT → depends on cuDNN + CUDA RT
// cuDNN → depends on CUDA RT
// CUDA RT → base dependency
if (s_hOnnx) { FreeLib(s_hOnnx); s_hOnnx = nullptr; }
if (s_hTrt) { FreeLib(s_hTrt); s_hTrt = nullptr; }
if (s_hCudnn) { FreeLib(s_hCudnn); s_hCudnn = nullptr; }
#ifdef NV_DYNAMIC_CUDA
if (s_hCuda) { FreeLib(s_hCuda); s_hCuda = nullptr; }
#endif
// ── Reset version / path state ────────────────────────────────────────────
// A subsequent Initialize() call will re-discover fresh paths and versions.
s_trtMajor = 0;
s_trtPath.clear();
s_onnxPath.clear();
s_cudaPath.clear();
s_cudnnPath.clear();
s_initialized = false;
}