// ============================================================================ // NvDynLoader.cpp // // Two responsibilities: // // 1. Provide extern "C" stub DEFINITIONS for every symbol that TRT / ONNX // parser inline wrappers reference. This satisfies the linker without // linking nvinfer_XX.lib / nvonnxparser_XX.lib. Each stub forwards the // call through a function pointer that Initialize() populates at runtime. // // 2. Implement NvDynLoader::Initialize() which searches for installed NVIDIA // DLLs (trying multiple major versions, newest first) and binds all // required function pointers. // ============================================================================ #include "include/engine/NvDynLoader.h" #include #include #include // ───────────────────────────────────────────────────────────────────────────── // Static member definitions // ───────────────────────────────────────────────────────────────────────────── bool NvDynLoader::s_initialized = false; int NvDynLoader::s_trtMajor = 0; std::string NvDynLoader::s_trtPath; std::string NvDynLoader::s_onnxPath; std::string NvDynLoader::s_cudaPath; NvLibHandle NvDynLoader::s_hTrt = nullptr; NvLibHandle NvDynLoader::s_hOnnx = nullptr; NvLibHandle NvDynLoader::s_hCuda = nullptr; NvLibHandle NvDynLoader::s_hCudnn = nullptr; std::string NvDynLoader::s_cudnnPath; NvDynLoader::PfnBuilder* NvDynLoader::pfn_createInferBuilder_INTERNAL = nullptr; NvDynLoader::PfnRuntime* NvDynLoader::pfn_createInferRuntime_INTERNAL = nullptr; NvDynLoader::PfnRefitter* NvDynLoader::pfn_createInferRefitter_INTERNAL = nullptr; NvDynLoader::PfnParser* NvDynLoader::pfn_createNvOnnxParser_INTERNAL = nullptr; NvDynLoader::PfnParserRefitter* NvDynLoader::pfn_createNvOnnxParserRefitter_INTERNAL = nullptr; NvDynLoader::PfnGetParserVersion* NvDynLoader::pfn_getNvOnnxParserVersion = nullptr; #ifdef NV_DYNAMIC_CUDA cudaError_t (*NvDynLoader::pfn_cudaGetDeviceCount) (int*) = nullptr; cudaError_t (*NvDynLoader::pfn_cudaSetDevice) (int) = 
nullptr; cudaError_t (*NvDynLoader::pfn_cudaGetDeviceProperties) (cudaDeviceProp*, int) = nullptr; cudaError_t (*NvDynLoader::pfn_cudaDeviceSetLimit) (cudaLimit, size_t) = nullptr; cudaError_t (*NvDynLoader::pfn_cudaDeviceSynchronize) () = nullptr; cudaError_t (*NvDynLoader::pfn_cudaDeviceGetStreamPriorityRange)(int*, int*) = nullptr; cudaError_t (*NvDynLoader::pfn_cudaMalloc) (void**, size_t) = nullptr; cudaError_t (*NvDynLoader::pfn_cudaFree) (void*) = nullptr; cudaError_t (*NvDynLoader::pfn_cudaMemset) (void*, int, size_t) = nullptr; cudaError_t (*NvDynLoader::pfn_cudaMemGetInfo) (size_t*, size_t*) = nullptr; cudaError_t (*NvDynLoader::pfn_cudaMemcpy) (void*, const void*, size_t, cudaMemcpyKind) = nullptr; cudaError_t (*NvDynLoader::pfn_cudaMemcpyAsync) (void*, const void*, size_t, cudaMemcpyKind, cudaStream_t) = nullptr; cudaError_t (*NvDynLoader::pfn_cudaStreamCreate) (cudaStream_t*) = nullptr; cudaError_t (*NvDynLoader::pfn_cudaStreamCreateWithPriority) (cudaStream_t*, unsigned int, int) = nullptr; cudaError_t (*NvDynLoader::pfn_cudaStreamDestroy) (cudaStream_t) = nullptr; cudaError_t (*NvDynLoader::pfn_cudaStreamSynchronize) (cudaStream_t) = nullptr; cudaError_t (*NvDynLoader::pfn_cudaStreamWaitEvent) (cudaStream_t, cudaEvent_t, unsigned int) = nullptr; cudaError_t (*NvDynLoader::pfn_cudaEventCreate) (cudaEvent_t*) = nullptr; cudaError_t (*NvDynLoader::pfn_cudaEventCreateWithFlags) (cudaEvent_t*, unsigned int) = nullptr; cudaError_t (*NvDynLoader::pfn_cudaEventRecord) (cudaEvent_t, cudaStream_t) = nullptr; cudaError_t (*NvDynLoader::pfn_cudaEventDestroy) (cudaEvent_t) = nullptr; const char* (*NvDynLoader::pfn_cudaGetErrorString) (cudaError_t) = nullptr; cudaError_t (*NvDynLoader::pfn_cudaGetLastError) () = nullptr; #endif // NV_DYNAMIC_CUDA // ───────────────────────────────────────────────────────────────────────────── // extern "C" stubs // // These provide the linker symbols that TRT / ONNX-parser inline wrappers // reference. 
With TENSORRTAPI / NVONNXPARSER_API overridden to empty (done // at the top of engine.h, before TRT headers are first included), the inline // wrappers compile to direct calls — not indirect import-table calls — so the // linker resolves them to these definitions rather than to a .lib file. // ───────────────────────────────────────────────────────────────────────────── extern "C" { void* createInferBuilder_INTERNAL(void* logger, int32_t version) noexcept { assert(NvDynLoader::pfn_createInferBuilder_INTERNAL && "NvDynLoader::Initialize() has not been called before the first TRT use."); return NvDynLoader::pfn_createInferBuilder_INTERNAL(logger, version); } void* createInferRuntime_INTERNAL(void* logger, int32_t version) noexcept { assert(NvDynLoader::pfn_createInferRuntime_INTERNAL && "NvDynLoader::Initialize() has not been called before the first TRT use."); return NvDynLoader::pfn_createInferRuntime_INTERNAL(logger, version); } // createInferRefitter_INTERNAL — only needed if engine refitting is used; // guarded so a missing symbol is a no-op rather than an assert failure. void* createInferRefitter_INTERNAL(void* engine, void* logger, int32_t version) noexcept { if (!NvDynLoader::pfn_createInferRefitter_INTERNAL) return nullptr; return NvDynLoader::pfn_createInferRefitter_INTERNAL(engine, logger, version); } void* createNvOnnxParser_INTERNAL(void* network, void* logger, int32_t version) noexcept { assert(NvDynLoader::pfn_createNvOnnxParser_INTERNAL && "NvDynLoader::Initialize() has not been called before the first ONNX-parser use."); return NvDynLoader::pfn_createNvOnnxParser_INTERNAL(network, logger, version); } // createNvOnnxParserRefitter_INTERNAL — TRT 10+ symbol; absent on TRT 8/9. // Guarded rather than asserted so older TRT versions continue to work. 
void* createNvOnnxParserRefitter_INTERNAL(void* refitter, void* logger, int32_t version) noexcept { if (!NvDynLoader::pfn_createNvOnnxParserRefitter_INTERNAL) return nullptr; return NvDynLoader::pfn_createNvOnnxParserRefitter_INTERNAL(refitter, logger, version); } // getNvOnnxParserVersion — optional version query; returns 0 if not available. int getNvOnnxParserVersion() noexcept { if (!NvDynLoader::pfn_getNvOnnxParserVersion) return 0; return NvDynLoader::pfn_getNvOnnxParserVersion(); } } // extern "C" // ───────────────────────────────────────────────────────────────────────────── // CUDA stubs (only when NV_DYNAMIC_CUDA is defined) // Removes the cudart_static.lib dependency; CUDA RT is loaded dynamically. // // WHY extern "C": // cuda_runtime.h wraps every public API in `extern "C"` when compiled as C++, // giving each function C linkage (no name mangling). Our stub DEFINITIONS // must use the same linkage — otherwise the linker sees a C-linkage declaration // (from the header) paired with a C++-mangled definition (from our .cpp), which // is an unresolved-symbol error on both MSVC and GCC/Clang. // // WHY cudaDeviceSynchronize / cudaGetLastError are NOT in the macro list: // CUDA_STUB_N macros require at least one parameter. These two functions take // zero arguments; they are defined manually in the block below. Do NOT add // them back to the macro invocation list — it would create duplicate definitions. 
// ───────────────────────────────────────────────────────────────────────────── #ifdef NV_DYNAMIC_CUDA extern "C" { // ← must match the extern "C" in cuda_runtime.h #define CUDA_STUB_1(ret, name, t0, a0) \ ret name(t0 a0) { \ assert(NvDynLoader::pfn_##name); return NvDynLoader::pfn_##name(a0); } #define CUDA_STUB_2(ret, name, t0,a0, t1,a1) \ ret name(t0 a0, t1 a1) { \ assert(NvDynLoader::pfn_##name); return NvDynLoader::pfn_##name(a0,a1); } #define CUDA_STUB_3(ret, name, t0,a0, t1,a1, t2,a2) \ ret name(t0 a0, t1 a1, t2 a2) { \ assert(NvDynLoader::pfn_##name); return NvDynLoader::pfn_##name(a0,a1,a2); } #define CUDA_STUB_4(ret, name, t0,a0, t1,a1, t2,a2, t3,a3) \ ret name(t0 a0, t1 a1, t2 a2, t3 a3) { \ assert(NvDynLoader::pfn_##name); return NvDynLoader::pfn_##name(a0,a1,a2,a3); } #define CUDA_STUB_5(ret, name, t0,a0, t1,a1, t2,a2, t3,a3, t4,a4) \ ret name(t0 a0, t1 a1, t2 a2, t3 a3, t4 a4) { \ assert(NvDynLoader::pfn_##name); return NvDynLoader::pfn_##name(a0,a1,a2,a3,a4); } CUDA_STUB_1(cudaError_t, cudaGetDeviceCount, int*, count) CUDA_STUB_1(cudaError_t, cudaSetDevice, int, dev) CUDA_STUB_2(cudaError_t, cudaGetDeviceProperties, cudaDeviceProp*, prop, int, dev) CUDA_STUB_2(cudaError_t, cudaDeviceSetLimit, cudaLimit, limit, size_t, value) CUDA_STUB_2(cudaError_t, cudaDeviceGetStreamPriorityRange, int*, leastPriority, int*, greatestPriority) CUDA_STUB_2(cudaError_t, cudaMalloc, void**, devPtr, size_t, size) CUDA_STUB_1(cudaError_t, cudaFree, void*, devPtr) CUDA_STUB_3(cudaError_t, cudaMemset, void*, devPtr, int, value, size_t, count) CUDA_STUB_2(cudaError_t, cudaMemGetInfo, size_t*, free, size_t*, total) CUDA_STUB_4(cudaError_t, cudaMemcpy, void*, dst, const void*, src, size_t, count, cudaMemcpyKind, kind) CUDA_STUB_5(cudaError_t, cudaMemcpyAsync, void*, dst, const void*, src, size_t, count, cudaMemcpyKind, kind, cudaStream_t, stream) CUDA_STUB_1(cudaError_t, cudaStreamCreate, cudaStream_t*, stream) CUDA_STUB_3(cudaError_t, cudaStreamCreateWithPriority, 
cudaStream_t*, stream, unsigned int, flags, int, priority) CUDA_STUB_1(cudaError_t, cudaStreamDestroy, cudaStream_t, stream) CUDA_STUB_1(cudaError_t, cudaStreamSynchronize, cudaStream_t, stream) CUDA_STUB_3(cudaError_t, cudaStreamWaitEvent, cudaStream_t, stream, cudaEvent_t, event, unsigned int, flags) CUDA_STUB_1(cudaError_t, cudaEventCreate, cudaEvent_t*, event) CUDA_STUB_2(cudaError_t, cudaEventCreateWithFlags, cudaEvent_t*, event, unsigned int, flags) CUDA_STUB_2(cudaError_t, cudaEventRecord, cudaEvent_t, event, cudaStream_t, stream) CUDA_STUB_1(cudaError_t, cudaEventDestroy, cudaEvent_t, event) CUDA_STUB_1(const char*, cudaGetErrorString, cudaError_t, error) // 0-argument stubs — defined manually; NOT in the macro list above: cudaError_t cudaDeviceSynchronize() { assert(NvDynLoader::pfn_cudaDeviceSynchronize); return NvDynLoader::pfn_cudaDeviceSynchronize(); } cudaError_t cudaGetLastError() { assert(NvDynLoader::pfn_cudaGetLastError); return NvDynLoader::pfn_cudaGetLastError(); } #undef CUDA_STUB_1 #undef CUDA_STUB_2 #undef CUDA_STUB_3 #undef CUDA_STUB_4 #undef CUDA_STUB_5 } // extern "C" #endif // NV_DYNAMIC_CUDA // ───────────────────────────────────────────────────────────────────────────── // Template helper — type-safe GetProcAddress // ───────────────────────────────────────────────────────────────────────────── template bool NvDynLoader::Bind(NvLibHandle h, const char* sym, Fn*& pfn) { pfn = reinterpret_cast(NV_GET_PROC(h, sym)); return pfn != nullptr; } // ───────────────────────────────────────────────────────────────────────────── // ANSCENTER shared directory — always checked first, ahead of system PATH. // // Windows : C:\ProgramData\ANSCENTER\Shared\ // Linux : /usr/share/ANSCENTER/Shared/ // // Drop any versioned NVIDIA DLL into this directory and it will be discovered // automatically on the next application launch — no project rebuild required. 
// ───────────────────────────────────────────────────────────────────────────── #ifdef _WIN32 static const std::string kAnsDir = "C:\\ProgramData\\ANSCENTER\\Shared\\"; #else static const std::string kAnsDir = "/usr/share/ANSCENTER/Shared/"; #endif // ───────────────────────────────────────────────────────────────────────────── // DLL candidate lists // Priority 1 — ANSCENTER shared directory (full absolute path, newest first) // Priority 2 — System PATH / LD_LIBRARY_PATH fallback (bare name, newest first) // ───────────────────────────────────────────────────────────────────────────── std::vector NvDynLoader::TrtCandidates() { std::vector v; // ── Priority 1: ANSCENTER shared directory ──────────────────────────────── for (int major = 20; major >= 8; --major) { #ifdef _WIN32 v.push_back(kAnsDir + "nvinfer_" + std::to_string(major) + ".dll"); #else v.push_back(kAnsDir + "libnvinfer.so." + std::to_string(major)); #endif } // ── Priority 2: System PATH / LD_LIBRARY_PATH ───────────────────────────── for (int major = 20; major >= 8; --major) { #ifdef _WIN32 v.push_back("nvinfer_" + std::to_string(major) + ".dll"); #else v.push_back("libnvinfer.so." + std::to_string(major)); #endif } return v; } std::vector NvDynLoader::OnnxCandidates() { std::vector v; // ── Priority 1: ANSCENTER shared directory ──────────────────────────────── for (int major = 20; major >= 8; --major) { #ifdef _WIN32 v.push_back(kAnsDir + "nvonnxparser_" + std::to_string(major) + ".dll"); #else v.push_back(kAnsDir + "libnvonnxparser.so." + std::to_string(major)); #endif } // ── Priority 2: System PATH / LD_LIBRARY_PATH ───────────────────────────── for (int major = 20; major >= 8; --major) { #ifdef _WIN32 v.push_back("nvonnxparser_" + std::to_string(major) + ".dll"); #else v.push_back("libnvonnxparser.so." 
+ std::to_string(major)); #endif } return v; } std::vector NvDynLoader::CudnnCandidates() { std::vector v; // ── cuDNN DLL/SO naming conventions ─────────────────────────────────────── // Windows: cudnn64_.dll (e.g. cudnn64_8.dll, cudnn64_9.dll) // Linux: libcudnn.so. (e.g. libcudnn.so.8, libcudnn.so.9) // // Note: cuDNN 9 on Windows ships as multiple split DLLs // (cudnn_ops.dll, cudnn_cnn.dll, etc.) but cudnn64_9.dll remains the // main entry-point / compatibility shim. The split DLLs are resolved // automatically once the ANSCENTER directory is added to the search path // via LOAD_WITH_ALTERED_SEARCH_PATH (Windows) or RTLD_GLOBAL (Linux). // // ── Priority 1: ANSCENTER shared directory ──────────────────────────────── for (int major = 12; major >= 7; --major) { #ifdef _WIN32 v.push_back(kAnsDir + "cudnn64_" + std::to_string(major) + ".dll"); #else v.push_back(kAnsDir + "libcudnn.so." + std::to_string(major)); #endif } // ── Priority 2: System PATH / LD_LIBRARY_PATH ───────────────────────────── for (int major = 12; major >= 7; --major) { #ifdef _WIN32 v.push_back("cudnn64_" + std::to_string(major) + ".dll"); #else v.push_back("libcudnn.so." 
+ std::to_string(major)); #endif } return v; } #ifdef NV_DYNAMIC_CUDA std::vector NvDynLoader::CudaRtCandidates() { std::vector v; // ── CUDA runtime DLL/SO naming conventions ──────────────────────────────── // // Windows — NVIDIA changed naming at CUDA 12: // CUDA 11.x : cudart64_110.dll … cudart64_118.dll (major + minor digit) // CUDA 12+ : cudart64_12.dll, cudart64_13.dll … (major-only) // // Linux — major-only symlink is the stable name on all versions: // libcudart.so.11, libcudart.so.12, libcudart.so.13 … // (major.minor symlinks also tried as fallback for CUDA 11.x) // // ── Priority 1: ANSCENTER shared directory ──────────────────────────────── #ifdef _WIN32 for (int major = 15; major >= 12; --major) // CUDA 12+ : major-only v.push_back(kAnsDir + "cudart64_" + std::to_string(major) + ".dll"); for (int minor = 9; minor >= 0; --minor) // CUDA 11.x: major+minor v.push_back(kAnsDir + "cudart64_11" + std::to_string(minor) + ".dll"); v.push_back(kAnsDir + "cudart64.dll"); // non-versioned alias #else for (int major = 15; major >= 11; --major) // all versions: major-only v.push_back(kAnsDir + "libcudart.so." + std::to_string(major)); for (int minor = 9; minor >= 0; --minor) // CUDA 11.x: major.minor fallback v.push_back(kAnsDir + "libcudart.so.11." + std::to_string(minor)); #endif // ── Priority 2: System PATH / LD_LIBRARY_PATH ───────────────────────────── #ifdef _WIN32 for (int major = 15; major >= 12; --major) v.push_back("cudart64_" + std::to_string(major) + ".dll"); for (int minor = 9; minor >= 0; --minor) v.push_back("cudart64_11" + std::to_string(minor) + ".dll"); v.push_back("cudart64.dll"); #else for (int major = 15; major >= 11; --major) v.push_back("libcudart.so." + std::to_string(major)); for (int minor = 9; minor >= 0; --minor) v.push_back("libcudart.so.11." 
+ std::to_string(minor)); #endif return v; } #endif // ───────────────────────────────────────────────────────────────────────────── // FindAndLoad — iterate candidates, return first successful handle // ───────────────────────────────────────────────────────────────────────────── NvLibHandle NvDynLoader::FindAndLoad(const std::vector& candidates, std::string& loadedPath, bool verbose, bool globalLoad) { for (const auto& name : candidates) { #ifdef _WIN32 // LoadLibraryExA with LOAD_WITH_ALTERED_SEARCH_PATH: // • For absolute paths (ANSCENTER dir): searches the DLL's own directory // for its transitive dependencies (cuDNN, cudart, etc.). // • For bare names (PATH fallback): flag is ignored, standard search applies. // globalLoad is irrelevant on Windows — LoadLibraryExA is always process-global. (void)globalLoad; NvLibHandle h = NV_LOAD_LIB(name.c_str()); #else // RTLD_GLOBAL: exports the loaded library's symbols into the process-wide // symbol table so subsequently loaded libraries (e.g. TRT needs cuDNN // symbols) can resolve them. // RTLD_LOCAL: keeps symbols private — used for TRT / ONNX where we call // through function pointers and don't want symbol collisions. NvLibHandle h = globalLoad ? ::dlopen(name.c_str(), RTLD_LAZY | RTLD_GLOBAL) : ::dlopen(name.c_str(), RTLD_LAZY | RTLD_LOCAL); #endif if (h) { loadedPath = name; if (verbose) std::cout << "NvDynLoader: loaded " << name << std::endl; return h; } } return nullptr; } // ───────────────────────────────────────────────────────────────────────────── // Initialize // ───────────────────────────────────────────────────────────────────────────── static std::mutex g_initMutex; bool NvDynLoader::Initialize(bool verbose) { std::lock_guard lock(g_initMutex); if (s_initialized) return true; // If a previous Initialize() call partially loaded some libraries before // failing (s_initialized stays false but handles may be non-null), release // them now so we can retry cleanly without leaking handles. 
    // The if(h) guards make this a safe no-op on the very first call.
    if (s_hOnnx)  { NV_FREE_LIB(s_hOnnx);  s_hOnnx  = nullptr; }
    if (s_hTrt)   { NV_FREE_LIB(s_hTrt);   s_hTrt   = nullptr; }
    if (s_hCudnn) { NV_FREE_LIB(s_hCudnn); s_hCudnn = nullptr; }
#ifdef NV_DYNAMIC_CUDA
    if (s_hCuda)  { NV_FREE_LIB(s_hCuda);  s_hCuda  = nullptr; }
#endif

    // ok accumulates overall success; a failed step records its error and
    // clears ok, but later steps still run so all problems are reported.
    bool ok = true;

    if (verbose) {
        std::cout << "==================================================" << std::endl;
        std::cout << "NvDynLoader: discovering NVIDIA libraries..." << std::endl;
        std::cout << " Shared dir : " << kAnsDir << std::endl;
        std::cout << "==================================================" << std::endl;
    }

    // ── STEP 1: CUDA Runtime (NV_DYNAMIC_CUDA only) ──────────────────────────
    // Loaded first so cuDNN and TRT find CUDA symbols already resident in the
    // process image when they are opened.
    //   Linux:   globalLoad=true (RTLD_GLOBAL) makes cudart symbols process-wide.
    //   Windows: loading is always process-global; globalLoad is a no-op.
#ifdef NV_DYNAMIC_CUDA
    s_hCuda = FindAndLoad(CudaRtCandidates(), s_cudaPath, verbose, /*globalLoad=*/true);
    if (!s_hCuda) {
        std::cout << "NvDynLoader ERROR: No CUDA runtime DLL found. "
                     "Searched " << kAnsDir << " and system PATH." << std::endl;
        ok = false;
    } else {
        bool b = true;
        b &= Bind(s_hCuda, "cudaGetDeviceCount", pfn_cudaGetDeviceCount);
        b &= Bind(s_hCuda, "cudaSetDevice", pfn_cudaSetDevice);
        // cudaGetDeviceProperties: CUDA 12 renamed the export to '_v2'; CUDA 11
        // uses the plain name. Try the CUDA 12+ symbol first, fall back to the
        // CUDA 11 name. IMPORTANT: do NOT `b &=` the first attempt before the
        // fallback — that would set b=false on CUDA 11 even though the
        // plain-name fallback succeeds, causing a spurious init failure.
        Bind(s_hCuda, "cudaGetDeviceProperties_v2", pfn_cudaGetDeviceProperties); // CUDA 12+
        if (!pfn_cudaGetDeviceProperties)
            Bind(s_hCuda, "cudaGetDeviceProperties", pfn_cudaGetDeviceProperties); // CUDA 11
        b &= (pfn_cudaGetDeviceProperties != nullptr);
        b &= Bind(s_hCuda, "cudaDeviceSetLimit", pfn_cudaDeviceSetLimit);
        b &= Bind(s_hCuda, "cudaDeviceSynchronize", pfn_cudaDeviceSynchronize);
        b &= Bind(s_hCuda, "cudaDeviceGetStreamPriorityRange", pfn_cudaDeviceGetStreamPriorityRange);
        b &= Bind(s_hCuda, "cudaMalloc", pfn_cudaMalloc);
        b &= Bind(s_hCuda, "cudaFree", pfn_cudaFree);
        b &= Bind(s_hCuda, "cudaMemset", pfn_cudaMemset);
        b &= Bind(s_hCuda, "cudaMemGetInfo", pfn_cudaMemGetInfo);
        b &= Bind(s_hCuda, "cudaMemcpy", pfn_cudaMemcpy);
        b &= Bind(s_hCuda, "cudaMemcpyAsync", pfn_cudaMemcpyAsync);
        b &= Bind(s_hCuda, "cudaStreamCreate", pfn_cudaStreamCreate);
        b &= Bind(s_hCuda, "cudaStreamCreateWithPriority", pfn_cudaStreamCreateWithPriority);
        b &= Bind(s_hCuda, "cudaStreamDestroy", pfn_cudaStreamDestroy);
        b &= Bind(s_hCuda, "cudaStreamSynchronize", pfn_cudaStreamSynchronize);
        b &= Bind(s_hCuda, "cudaStreamWaitEvent", pfn_cudaStreamWaitEvent);
        b &= Bind(s_hCuda, "cudaEventCreate", pfn_cudaEventCreate);
        b &= Bind(s_hCuda, "cudaEventCreateWithFlags", pfn_cudaEventCreateWithFlags);
        b &= Bind(s_hCuda, "cudaEventRecord", pfn_cudaEventRecord);
        b &= Bind(s_hCuda, "cudaEventDestroy", pfn_cudaEventDestroy);
        b &= Bind(s_hCuda, "cudaGetErrorString", pfn_cudaGetErrorString);
        b &= Bind(s_hCuda, "cudaGetLastError", pfn_cudaGetLastError);
        if (!b) {
            std::cout << "NvDynLoader ERROR: One or more CUDA entry points not found in "
                      << s_cudaPath << std::endl;
            ok = false;
        }
    }
#endif // NV_DYNAMIC_CUDA

    // ── STEP 2: cuDNN pre-load ───────────────────────────────────────────────
    // Must be loaded BEFORE TRT so the OS can resolve TRT's cuDNN dependency:
    //   Windows – LoadLibraryExA + LOAD_WITH_ALTERED_SEARCH_PATH causes Windows
    //             to use the DLL's own directory when resolving its transitive
    //             dependencies. Pre-loading cuDNN from the ANSCENTER dir also
    //             puts it in the process DLL table so TRT's import loader
    //             finds it.
    //   Linux   – RTLD_GLOBAL exports cuDNN symbols process-wide so TRT's
    //             DT_NEEDED resolution succeeds without LD_LIBRARY_PATH
    //             changes. cuDNN 9 also ships split DLLs (libcudnn_ops.so.9,
    //             etc.); those are found automatically once libcudnn.so.9 is
    //             globally loaded.
    // Not finding cuDNN is a WARNING, not a fatal error — TRT may still locate
    // it via LD_LIBRARY_PATH / system PATH at load time.
    s_hCudnn = FindAndLoad(CudnnCandidates(), s_cudnnPath, verbose, /*globalLoad=*/true);
    if (!s_hCudnn && verbose) {
        std::cout << "NvDynLoader: WARNING — cuDNN not found in " << kAnsDir
                  << "; TRT will search system PATH / LD_LIBRARY_PATH." << std::endl;
    }

    // ── STEP 3: TensorRT core ────────────────────────────────────────────────
    s_hTrt = FindAndLoad(TrtCandidates(), s_trtPath, verbose);
    if (!s_hTrt) {
        std::cout << "NvDynLoader ERROR: No TensorRT DLL found. "
                     "Searched " << kAnsDir << " and system PATH." << std::endl;
        ok = false;
    } else {
        // Extract the major version from the loaded library path.
        //   Windows : "C:\...\nvinfer_10.dll"  → digits between last '_' and last '.'
        //   Linux   : ".../libnvinfer.so.10"   → digits after last '.'
#ifdef _WIN32
        {
            const size_t under = s_trtPath.find_last_of('_');
            const size_t dot   = s_trtPath.find_last_of('.');
            if (under != std::string::npos && dot != std::string::npos && dot > under) {
                try { s_trtMajor = std::stoi(s_trtPath.substr(under + 1, dot - under - 1)); }
                catch (...) { /* non-numeric — leave s_trtMajor at 0 */ }
            }
        }
#else
        {
            const size_t dot = s_trtPath.find_last_of('.');
            if (dot != std::string::npos && dot + 1 < s_trtPath.size()) {
                try { s_trtMajor = std::stoi(s_trtPath.substr(dot + 1)); }
                catch (...) { /* non-numeric — leave s_trtMajor at 0 */ }
            }
        }
#endif
        bool b = true;
        b &= Bind(s_hTrt, "createInferBuilder_INTERNAL", pfn_createInferBuilder_INTERNAL);
        b &= Bind(s_hTrt, "createInferRuntime_INTERNAL", pfn_createInferRuntime_INTERNAL);
        Bind(s_hTrt, "createInferRefitter_INTERNAL", pfn_createInferRefitter_INTERNAL); // optional
        if (!b) {
            std::cout << "NvDynLoader ERROR: Required TRT entry points not found in "
                      << s_trtPath << std::endl;
            ok = false;
        } else if (verbose) {
            std::cout << "NvDynLoader: TRT major version = " << s_trtMajor << std::endl;
        }
    }

    // ── STEP 4: ONNX parser ──────────────────────────────────────────────────
    s_hOnnx = FindAndLoad(OnnxCandidates(), s_onnxPath, verbose);
    if (!s_hOnnx) {
        std::cout << "NvDynLoader ERROR: No nvonnxparser DLL found. "
                     "Searched " << kAnsDir << " and system PATH." << std::endl;
        ok = false;
    } else {
        if (!Bind(s_hOnnx, "createNvOnnxParser_INTERNAL", pfn_createNvOnnxParser_INTERNAL)) {
            std::cout << "NvDynLoader ERROR: createNvOnnxParser_INTERNAL not found in "
                      << s_onnxPath << std::endl;
            ok = false;
        } else {
            // TRT 10+ optional symbols — absent on TRT 8/9; missing is not an error.
            Bind(s_hOnnx, "createNvOnnxParserRefitter_INTERNAL", pfn_createNvOnnxParserRefitter_INTERNAL);
            Bind(s_hOnnx, "getNvOnnxParserVersion", pfn_getNvOnnxParserVersion);

            // Cross-check: TRT and ONNX-parser must share the same major version.
            // Mismatched majors (e.g. nvinfer_10 + nvonnxparser_11) compile fine
            // but crash at runtime when the ONNX parser tries to populate a TRT
            // network object built by a different major version.
            int onnxMajor = 0;
#ifdef _WIN32
            {
                const size_t under = s_onnxPath.find_last_of('_');
                const size_t dot   = s_onnxPath.find_last_of('.');
                if (under != std::string::npos && dot != std::string::npos && dot > under)
                    try { onnxMajor = std::stoi(s_onnxPath.substr(under + 1, dot - under - 1)); }
                    catch (...) {}
            }
#else
            {
                const size_t dot = s_onnxPath.find_last_of('.');
                if (dot != std::string::npos && dot + 1 < s_onnxPath.size())
                    try { onnxMajor = std::stoi(s_onnxPath.substr(dot + 1)); }
                    catch (...) {}
            }
#endif
            if (onnxMajor > 0 && s_trtMajor > 0 && onnxMajor != s_trtMajor) {
                std::cout << "NvDynLoader ERROR: TRT major version (" << s_trtMajor
                          << ") != ONNX-parser major version (" << onnxMajor << ").\n"
                          << " TRT : " << s_trtPath << "\n"
                          << " ONNX : " << s_onnxPath << "\n"
                          << " Replace both DLLs with matching versions in " << kAnsDir << std::endl;
                ok = false;
            } else if (verbose && onnxMajor > 0) {
                std::cout << "NvDynLoader: ONNX-parser major version = " << onnxMajor << std::endl;
            }
        }
    }

    // Only a fully successful run marks the loader initialised; a partial
    // failure leaves s_initialized false so the next call retries.
    s_initialized = ok;
    if (verbose) {
        std::cout << "NvDynLoader: " << (ok ? "initialised successfully." : "FAILED — check errors above.") << std::endl;
        std::cout << "==================================================" << std::endl;
    }
    return ok;
}

// ─────────────────────────────────────────────────────────────────────────────
// Shutdown
// ─────────────────────────────────────────────────────────────────────────────
void NvDynLoader::Shutdown()
{
    std::lock_guard lock(g_initMutex);

    // NOTE: No early-return guard on s_initialized here.
    //
    // Shutdown() must always release whatever handles are currently open,
    // including handles left behind by a *partially*-successful Initialize()
    // call (where s_initialized == false but some DLLs were already opened).
    // Every FreeLibrary / dlclose call is guarded by if(h), so this function
    // is safe to call on a fully- or partially-initialised instance, and also
    // safe to call multiple times (idempotent).

    // ── Null all entry-point pointers first ──────────────────────────────────
    // Prevents stubs from jumping into freed library memory in the window
    // between Shutdown() and a subsequent Initialize() call.
    pfn_createInferBuilder_INTERNAL = nullptr;
    pfn_createInferRuntime_INTERNAL = nullptr;
    pfn_createInferRefitter_INTERNAL = nullptr;
    pfn_createNvOnnxParser_INTERNAL = nullptr;
    pfn_createNvOnnxParserRefitter_INTERNAL = nullptr;
    pfn_getNvOnnxParserVersion = nullptr;

#ifdef NV_DYNAMIC_CUDA
    // Null every CUDA function pointer so the assert() in each stub fires
    // immediately if a CUDA call is made after Shutdown() without a new
    // Initialize(), rather than silently calling into freed DLL memory.
    pfn_cudaGetDeviceCount = nullptr;
    pfn_cudaSetDevice = nullptr;
    pfn_cudaGetDeviceProperties = nullptr;
    pfn_cudaDeviceSetLimit = nullptr;
    pfn_cudaDeviceSynchronize = nullptr;
    pfn_cudaDeviceGetStreamPriorityRange = nullptr;
    pfn_cudaMalloc = nullptr;
    pfn_cudaFree = nullptr;
    pfn_cudaMemset = nullptr;
    pfn_cudaMemGetInfo = nullptr;
    pfn_cudaMemcpy = nullptr;
    pfn_cudaMemcpyAsync = nullptr;
    pfn_cudaStreamCreate = nullptr;
    pfn_cudaStreamCreateWithPriority = nullptr;
    pfn_cudaStreamDestroy = nullptr;
    pfn_cudaStreamSynchronize = nullptr;
    pfn_cudaStreamWaitEvent = nullptr;
    pfn_cudaEventCreate = nullptr;
    pfn_cudaEventCreateWithFlags = nullptr;
    pfn_cudaEventRecord = nullptr;
    pfn_cudaEventDestroy = nullptr;
    pfn_cudaGetErrorString = nullptr;
    pfn_cudaGetLastError = nullptr;
#endif // NV_DYNAMIC_CUDA

    // ── Release handles in reverse load order ────────────────────────────────
    //   ONNX parser → depends on TRT
    //   TRT         → depends on cuDNN + CUDA RT
    //   cuDNN       → depends on CUDA RT
    //   CUDA RT     → base dependency
    if (s_hOnnx)  { NV_FREE_LIB(s_hOnnx);  s_hOnnx  = nullptr; }
    if (s_hTrt)   { NV_FREE_LIB(s_hTrt);   s_hTrt   = nullptr; }
    if (s_hCudnn) { NV_FREE_LIB(s_hCudnn); s_hCudnn = nullptr; }
#ifdef NV_DYNAMIC_CUDA
    if (s_hCuda)  { NV_FREE_LIB(s_hCuda);  s_hCuda  = nullptr; }
#endif

    // ── Reset version / path state ───────────────────────────────────────────
    // A subsequent Initialize() call will re-discover fresh paths and versions.
    s_trtMajor = 0;
    s_trtPath.clear();
    s_onnxPath.clear();
    s_cudaPath.clear();
    s_cudnnPath.clear();
    s_initialized = false;
}