// engines/TensorRTAPI/NvDynLoader.cpp

// ============================================================================
// NvDynLoader.cpp
//
// Two responsibilities:
//
//  1. Provide extern "C" stub DEFINITIONS for every symbol that TRT / ONNX
//     parser inline wrappers reference.  This satisfies the linker without
//     linking nvinfer_XX.lib / nvonnxparser_XX.lib.  Each stub forwards the
//     call through a function pointer that Initialize() populates at runtime.
//
//  2. Implement NvDynLoader::Initialize() which searches for installed NVIDIA
//     DLLs (trying multiple major versions, newest first) and binds all
//     required function pointers.
// ============================================================================

#include "include/engine/NvDynLoader.h"
#include <iostream>
#include <cassert>
#include <mutex>

// ─────────────────────────────────────────────────────────────────────────────
// Static member definitions
// ─────────────────────────────────────────────────────────────────────────────
// Loader state.  All of these are written by Initialize() and reset by
// Shutdown().

// Set to the overall result of Initialize(); cleared by Shutdown().
bool        NvDynLoader::s_initialized = false;
// Major version parsed from the file name of the loaded TensorRT library
// (0 when nothing is loaded or the name carried no parsable number).
int         NvDynLoader::s_trtMajor    = 0;
// Candidate string (absolute path or bare file name) that loaded successfully
// for each library; filled in by FindAndLoad().
std::string NvDynLoader::s_trtPath;
std::string NvDynLoader::s_onnxPath;
std::string NvDynLoader::s_cudaPath;
// Library handles; nullptr means "not loaded".
NvLibHandle NvDynLoader::s_hTrt   = nullptr;
NvLibHandle NvDynLoader::s_hOnnx  = nullptr;
NvLibHandle NvDynLoader::s_hCuda  = nullptr;
NvLibHandle NvDynLoader::s_hCudnn = nullptr;
std::string NvDynLoader::s_cudnnPath;

// Entry points resolved from the TensorRT core library.  The extern "C" stubs
// in this file forward through these pointers.
NvDynLoader::PfnBuilder*  NvDynLoader::pfn_createInferBuilder_INTERNAL  = nullptr;
NvDynLoader::PfnRuntime*  NvDynLoader::pfn_createInferRuntime_INTERNAL  = nullptr;
NvDynLoader::PfnRefitter* NvDynLoader::pfn_createInferRefitter_INTERNAL = nullptr;
// Entry points resolved from the ONNX-parser library.
NvDynLoader::PfnParser*         NvDynLoader::pfn_createNvOnnxParser_INTERNAL          = nullptr;
NvDynLoader::PfnParserRefitter*    NvDynLoader::pfn_createNvOnnxParserRefitter_INTERNAL = nullptr;
NvDynLoader::PfnGetParserVersion*  NvDynLoader::pfn_getNvOnnxParserVersion             = nullptr;

#ifdef NV_DYNAMIC_CUDA
// CUDA runtime entry points, one pointer per stubbed API.  Each is bound by
// name from the CUDA runtime library in Initialize() and reset to nullptr in
// Shutdown().  The signatures here must agree with the corresponding stub
// definitions further down in this file.
cudaError_t (*NvDynLoader::pfn_cudaGetDeviceCount)             (int*)                                               = nullptr;
cudaError_t (*NvDynLoader::pfn_cudaSetDevice)                  (int)                                                = nullptr;
// Bound to "cudaGetDeviceProperties_v2" when that export exists, otherwise to
// "cudaGetDeviceProperties" (see Initialize()).
cudaError_t (*NvDynLoader::pfn_cudaGetDeviceProperties)        (cudaDeviceProp*, int)                               = nullptr;
cudaError_t (*NvDynLoader::pfn_cudaDeviceSetLimit)             (cudaLimit, size_t)                                  = nullptr;
cudaError_t (*NvDynLoader::pfn_cudaDeviceSynchronize)          ()                                                   = nullptr;
cudaError_t (*NvDynLoader::pfn_cudaDeviceGetStreamPriorityRange)(int*, int*)                                        = nullptr;
cudaError_t (*NvDynLoader::pfn_cudaMalloc)                     (void**, size_t)                                     = nullptr;
cudaError_t (*NvDynLoader::pfn_cudaFree)                       (void*)                                              = nullptr;
cudaError_t (*NvDynLoader::pfn_cudaMemset)                     (void*, int, size_t)                                 = nullptr;
cudaError_t (*NvDynLoader::pfn_cudaMemGetInfo)                 (size_t*, size_t*)                                   = nullptr;
cudaError_t (*NvDynLoader::pfn_cudaMemcpy)                     (void*, const void*, size_t, cudaMemcpyKind)         = nullptr;
cudaError_t (*NvDynLoader::pfn_cudaMemcpyAsync)                (void*, const void*, size_t, cudaMemcpyKind, cudaStream_t) = nullptr;
cudaError_t (*NvDynLoader::pfn_cudaStreamCreate)               (cudaStream_t*)                                      = nullptr;
cudaError_t (*NvDynLoader::pfn_cudaStreamCreateWithPriority)   (cudaStream_t*, unsigned int, int)                   = nullptr;
cudaError_t (*NvDynLoader::pfn_cudaStreamDestroy)              (cudaStream_t)                                       = nullptr;
cudaError_t (*NvDynLoader::pfn_cudaStreamSynchronize)          (cudaStream_t)                                       = nullptr;
cudaError_t (*NvDynLoader::pfn_cudaStreamWaitEvent)            (cudaStream_t, cudaEvent_t, unsigned int)            = nullptr;
cudaError_t (*NvDynLoader::pfn_cudaEventCreate)                (cudaEvent_t*)                                       = nullptr;
cudaError_t (*NvDynLoader::pfn_cudaEventCreateWithFlags)       (cudaEvent_t*, unsigned int)                         = nullptr;
cudaError_t (*NvDynLoader::pfn_cudaEventRecord)                (cudaEvent_t, cudaStream_t)                          = nullptr;
cudaError_t (*NvDynLoader::pfn_cudaEventDestroy)               (cudaEvent_t)                                        = nullptr;
const char* (*NvDynLoader::pfn_cudaGetErrorString)             (cudaError_t)                                        = nullptr;
cudaError_t (*NvDynLoader::pfn_cudaGetLastError)               ()                                                   = nullptr;
#endif // NV_DYNAMIC_CUDA

// ─────────────────────────────────────────────────────────────────────────────
// extern "C" stubs
//
// These provide the linker symbols that TRT / ONNX-parser inline wrappers
// reference.  With TENSORRTAPI / NVONNXPARSER_API overridden to empty (done
// at the top of engine.h, before TRT headers are first included), the inline
// wrappers compile to direct calls — not indirect import-table calls — so the
// linker resolves them to these definitions rather than to a .lib file.
// ─────────────────────────────────────────────────────────────────────────────
extern "C" {

    // Linker-visible stand-in for TensorRT's createInferBuilder_INTERNAL export.
    // Forwards (logger, version) to the entry point bound by
    // NvDynLoader::Initialize() and returns whatever that entry point returns.
    //
    // Returns nullptr when the entry point is unbound (Initialize() not yet
    // called, failed, or Shutdown() already ran).  Debug builds additionally
    // trip the assert so the misuse is caught early.
    void* createInferBuilder_INTERNAL(void* logger, int32_t version) noexcept
    {
        assert(NvDynLoader::pfn_createInferBuilder_INTERNAL &&
               "NvDynLoader::Initialize() has not been called before the first TRT use.");
        // assert() is compiled out under NDEBUG; without this guard a release
        // build would call through a null function pointer (undefined behaviour).
        if (!NvDynLoader::pfn_createInferBuilder_INTERNAL) return nullptr;
        return NvDynLoader::pfn_createInferBuilder_INTERNAL(logger, version);
    }

    // Linker-visible stand-in for TensorRT's createInferRuntime_INTERNAL export.
    // Forwards (logger, version) to the entry point bound by
    // NvDynLoader::Initialize() and returns whatever that entry point returns.
    //
    // Returns nullptr when the entry point is unbound (Initialize() not yet
    // called, failed, or Shutdown() already ran).  Debug builds additionally
    // trip the assert so the misuse is caught early.
    void* createInferRuntime_INTERNAL(void* logger, int32_t version) noexcept
    {
        assert(NvDynLoader::pfn_createInferRuntime_INTERNAL &&
               "NvDynLoader::Initialize() has not been called before the first TRT use.");
        // assert() is compiled out under NDEBUG; without this guard a release
        // build would call through a null function pointer (undefined behaviour).
        if (!NvDynLoader::pfn_createInferRuntime_INTERNAL) return nullptr;
        return NvDynLoader::pfn_createInferRuntime_INTERNAL(logger, version);
    }

    // Stand-in for TensorRT's createInferRefitter_INTERNAL export.
    // The entry point is optional (Initialize() does not fail when it is
    // missing), so an unbound pointer yields nullptr instead of asserting.
    void* createInferRefitter_INTERNAL(void* engine, void* logger, int32_t version) noexcept
    {
        auto* const createRefitter = NvDynLoader::pfn_createInferRefitter_INTERNAL;
        return createRefitter ? createRefitter(engine, logger, version) : nullptr;
    }

    // Linker-visible stand-in for the ONNX parser's createNvOnnxParser_INTERNAL
    // export.  Forwards (network, logger, version) to the entry point bound by
    // NvDynLoader::Initialize() and returns whatever that entry point returns.
    //
    // Returns nullptr when the entry point is unbound (Initialize() not yet
    // called, failed, or Shutdown() already ran).  Debug builds additionally
    // trip the assert so the misuse is caught early.
    void* createNvOnnxParser_INTERNAL(void* network, void* logger, int32_t version) noexcept
    {
        assert(NvDynLoader::pfn_createNvOnnxParser_INTERNAL &&
               "NvDynLoader::Initialize() has not been called before the first ONNX-parser use.");
        // assert() is compiled out under NDEBUG; without this guard a release
        // build would call through a null function pointer (undefined behaviour).
        if (!NvDynLoader::pfn_createNvOnnxParser_INTERNAL) return nullptr;
        return NvDynLoader::pfn_createNvOnnxParser_INTERNAL(network, logger, version);
    }

    // Stand-in for the ONNX parser's createNvOnnxParserRefitter_INTERNAL export.
    // Initialize() treats this symbol as optional (noted there as TRT 10+ only),
    // so an unbound pointer yields nullptr rather than an assert failure.
    void* createNvOnnxParserRefitter_INTERNAL(void* refitter, void* logger, int32_t version) noexcept
    {
        auto* const createParserRefitter = NvDynLoader::pfn_createNvOnnxParserRefitter_INTERNAL;
        return createParserRefitter ? createParserRefitter(refitter, logger, version)
                                    : nullptr;
    }

    // Stand-in for the ONNX parser's getNvOnnxParserVersion export.
    // The symbol is optional; when it was not bound the stub reports 0.
    int getNvOnnxParserVersion() noexcept
    {
        auto* const queryVersion = NvDynLoader::pfn_getNvOnnxParserVersion;
        return queryVersion ? queryVersion() : 0;
    }

} // extern "C"

// ─────────────────────────────────────────────────────────────────────────────
// CUDA stubs (only when NV_DYNAMIC_CUDA is defined)
// Removes the cudart_static.lib dependency; CUDA RT is loaded dynamically.
//
// WHY extern "C":
//   cuda_runtime.h wraps every public API in `extern "C"` when compiled as C++,
//   giving each function C linkage (no name mangling).  Our stub DEFINITIONS
//   must have that same linkage so they define the symbols that callers
//   reference.  A definition that follows a visible C-linkage declaration
//   inherits that linkage, but wrapping the stubs in `extern "C"` states it
//   explicitly and protects any stub whose declaration is not visible at this
//   point — such a stub would otherwise be emitted under a C++-mangled name and
//   the C-linkage callers would fail to link.
//
// WHY cudaDeviceSynchronize / cudaGetLastError are NOT in the macro list:
//   CUDA_STUB_N macros require at least one parameter.  These two functions take
//   zero arguments; they are defined manually in the block below.  Do NOT add
//   them back to the macro invocation list — it would create duplicate definitions.
// ─────────────────────────────────────────────────────────────────────────────
#ifdef NV_DYNAMIC_CUDA

extern "C" {   // ← must match the extern "C" in cuda_runtime.h

// CUDA_STUB_<N>(ret, name, t0, a0, …) expands to the definition
//     ret name(t0 a0, …)
// whose body asserts that NvDynLoader::pfn_<name> is non-null and then forwards
// every argument to it, returning the result.  <N> is the parameter count.
//
// NOTE(review): the only null check is assert(), which is compiled out when
// NDEBUG is defined — in such builds a CUDA call made before Initialize() (or
// after Shutdown()) goes through a null function pointer.
#define CUDA_STUB_1(ret, name, t0, a0)                                        \
    ret name(t0 a0) {                                                         \
        assert(NvDynLoader::pfn_##name); return NvDynLoader::pfn_##name(a0); }

#define CUDA_STUB_2(ret, name, t0,a0, t1,a1)                                  \
    ret name(t0 a0, t1 a1) {                                                  \
        assert(NvDynLoader::pfn_##name); return NvDynLoader::pfn_##name(a0,a1); }

#define CUDA_STUB_3(ret, name, t0,a0, t1,a1, t2,a2)                           \
    ret name(t0 a0, t1 a1, t2 a2) {                                           \
        assert(NvDynLoader::pfn_##name); return NvDynLoader::pfn_##name(a0,a1,a2); }

#define CUDA_STUB_4(ret, name, t0,a0, t1,a1, t2,a2, t3,a3)                   \
    ret name(t0 a0, t1 a1, t2 a2, t3 a3) {                                   \
        assert(NvDynLoader::pfn_##name); return NvDynLoader::pfn_##name(a0,a1,a2,a3); }

#define CUDA_STUB_5(ret, name, t0,a0, t1,a1, t2,a2, t3,a3, t4,a4)            \
    ret name(t0 a0, t1 a1, t2 a2, t3 a3, t4 a4) {                            \
        assert(NvDynLoader::pfn_##name); return NvDynLoader::pfn_##name(a0,a1,a2,a3,a4); }

// One stub per CUDA runtime API that has a matching NvDynLoader::pfn_<name>
// pointer.  Columns: return type, API name, then (type, name) per parameter.
// Adding an API here also requires a pfn_ definition, a Bind() call in
// Initialize() and a reset in Shutdown().
CUDA_STUB_1(cudaError_t, cudaGetDeviceCount,               int*,            count)
CUDA_STUB_1(cudaError_t, cudaSetDevice,                    int,             dev)
CUDA_STUB_2(cudaError_t, cudaGetDeviceProperties,          cudaDeviceProp*, prop,    int, dev)
CUDA_STUB_2(cudaError_t, cudaDeviceSetLimit,               cudaLimit,       limit,   size_t, value)
CUDA_STUB_2(cudaError_t, cudaDeviceGetStreamPriorityRange, int*,            leastPriority, int*, greatestPriority)
CUDA_STUB_2(cudaError_t, cudaMalloc,                       void**,          devPtr,  size_t, size)
CUDA_STUB_1(cudaError_t, cudaFree,                         void*,           devPtr)
CUDA_STUB_3(cudaError_t, cudaMemset,                       void*,           devPtr,  int, value, size_t, count)
CUDA_STUB_2(cudaError_t, cudaMemGetInfo,                   size_t*,         free,    size_t*, total)
CUDA_STUB_4(cudaError_t, cudaMemcpy,                       void*,           dst,     const void*, src, size_t, count, cudaMemcpyKind, kind)
CUDA_STUB_5(cudaError_t, cudaMemcpyAsync,                  void*,           dst,     const void*, src, size_t, count, cudaMemcpyKind, kind, cudaStream_t, stream)
CUDA_STUB_1(cudaError_t, cudaStreamCreate,                 cudaStream_t*,   stream)
CUDA_STUB_3(cudaError_t, cudaStreamCreateWithPriority,     cudaStream_t*,   stream,  unsigned int, flags, int, priority)
CUDA_STUB_1(cudaError_t, cudaStreamDestroy,                cudaStream_t,    stream)
CUDA_STUB_1(cudaError_t, cudaStreamSynchronize,            cudaStream_t,    stream)
CUDA_STUB_3(cudaError_t, cudaStreamWaitEvent,              cudaStream_t,    stream,  cudaEvent_t, event, unsigned int, flags)
CUDA_STUB_1(cudaError_t, cudaEventCreate,                  cudaEvent_t*,    event)
CUDA_STUB_2(cudaError_t, cudaEventCreateWithFlags,         cudaEvent_t*,    event,   unsigned int, flags)
CUDA_STUB_2(cudaError_t, cudaEventRecord,                  cudaEvent_t,     event,   cudaStream_t, stream)
CUDA_STUB_1(cudaError_t, cudaEventDestroy,                 cudaEvent_t,     event)
CUDA_STUB_1(const char*, cudaGetErrorString,               cudaError_t,     error)

// 0-argument stubs — defined manually; NOT in the macro list above.
// Same contract as the macro-generated stubs: assert, then forward.
cudaError_t cudaDeviceSynchronize() {
    assert(NvDynLoader::pfn_cudaDeviceSynchronize);
    return NvDynLoader::pfn_cudaDeviceSynchronize();
}
cudaError_t cudaGetLastError() {
    assert(NvDynLoader::pfn_cudaGetLastError);
    return NvDynLoader::pfn_cudaGetLastError();
}

// The helper macros are local to this block; undefine them so they cannot
// leak into the rest of the translation unit.
#undef CUDA_STUB_1
#undef CUDA_STUB_2
#undef CUDA_STUB_3
#undef CUDA_STUB_4
#undef CUDA_STUB_5

} // extern "C"
#endif // NV_DYNAMIC_CUDA

// ─────────────────────────────────────────────────────────────────────────────
// Template helper — type-safe GetProcAddress
// ─────────────────────────────────────────────────────────────────────────────
// Resolves the exported symbol `sym` in library `h` and stores its address in
// `pfn`, cast to the caller's function type.
//
//   h    – handle of an already-loaded library
//   sym  – exported symbol name
//   pfn  – receives the resolved address; overwritten with nullptr when the
//          lookup fails, so any previous binding is discarded
//
// Returns true when the symbol was found.
template<typename Fn>
bool NvDynLoader::Bind(NvLibHandle h, const char* sym, Fn*& pfn)
{
    Fn* const resolved = reinterpret_cast<Fn*>(NV_GET_PROC(h, sym));
    pfn = resolved;
    return resolved != nullptr;
}

// ─────────────────────────────────────────────────────────────────────────────
// ANSCENTER shared directory — always checked first, ahead of system PATH.
//
//   Windows : C:\ProgramData\ANSCENTER\Shared\
//   Linux   : /usr/share/ANSCENTER/Shared/
//
// Drop any versioned NVIDIA DLL into this directory and it will be discovered
// automatically on the next application launch — no project rebuild required.
// ─────────────────────────────────────────────────────────────────────────────
// The trailing path separator is required: the candidate builders append the
// library file name directly to this string.
// NOTE(review): this is a dynamically-initialised namespace-scope std::string;
// if Initialize() can ever run from another translation unit's static
// initialiser it may be read before construction — confirm against callers.
#ifdef _WIN32
static const std::string kAnsDir = "C:\\ProgramData\\ANSCENTER\\Shared\\";
#else
static const std::string kAnsDir = "/usr/share/ANSCENTER/Shared/";
#endif

// ─────────────────────────────────────────────────────────────────────────────
// DLL candidate lists
//   Priority 1 — ANSCENTER shared directory  (full absolute path, newest first)
//   Priority 2 — System PATH / LD_LIBRARY_PATH fallback (bare name, newest first)
// ─────────────────────────────────────────────────────────────────────────────
// Builds the ordered list of TensorRT core library names to try.
//
// Two tiers, each running from major version 20 down to 8:
//   1. absolute paths inside the ANSCENTER shared directory (kAnsDir)
//   2. bare file names, left to the platform loader's default search
//
// File-name pattern:  nvinfer_<major>.dll (Windows) / libnvinfer.so.<major>.
std::vector<std::string> NvDynLoader::TrtCandidates()
{
    std::vector<std::string> names;

    // An empty prefix yields the bare-name (tier 2) entries.
    const std::string searchRoots[] = { kAnsDir, std::string() };
    for (const std::string& root : searchRoots) {
        int major = 20;
        while (major >= 8) {
            const std::string ver = std::to_string(major);
#ifdef _WIN32
            names.push_back(root + "nvinfer_" + ver + ".dll");
#else
            names.push_back(root + "libnvinfer.so." + ver);
#endif
            --major;
        }
    }
    return names;
}

// Builds the ordered list of ONNX-parser library names to try.
//
// Two tiers, each running from major version 20 down to 8:
//   1. absolute paths inside the ANSCENTER shared directory (kAnsDir)
//   2. bare file names, left to the platform loader's default search
//
// File-name pattern:  nvonnxparser_<major>.dll (Windows) /
//                     libnvonnxparser.so.<major>.
std::vector<std::string> NvDynLoader::OnnxCandidates()
{
    std::vector<std::string> names;

    // An empty prefix yields the bare-name (tier 2) entries.
    const std::string searchRoots[] = { kAnsDir, std::string() };
    for (const std::string& root : searchRoots) {
        int major = 20;
        while (major >= 8) {
            const std::string ver = std::to_string(major);
#ifdef _WIN32
            names.push_back(root + "nvonnxparser_" + ver + ".dll");
#else
            names.push_back(root + "libnvonnxparser.so." + ver);
#endif
            --major;
        }
    }
    return names;
}

// Builds the ordered list of cuDNN library names to try.
//
// Two tiers, each running from major version 12 down to 7:
//   1. absolute paths inside the ANSCENTER shared directory (kAnsDir)
//   2. bare file names, left to the platform loader's default search
//
// File-name pattern:
//   Windows: cudnn64_<major>.dll   (e.g. cudnn64_8.dll, cudnn64_9.dll)
//   Linux:   libcudnn.so.<major>   (e.g. libcudnn.so.8, libcudnn.so.9)
//
// NOTE(review): earlier notes in this file state that cuDNN 9 ships as several
// split libraries with cudnn64_9.dll / libcudnn.so.9 remaining the entry point,
// and that the split pieces are picked up once the main library is loaded.
// Nothing in this function depends on that; verify against the cuDNN release
// actually deployed.
std::vector<std::string> NvDynLoader::CudnnCandidates()
{
    std::vector<std::string> names;

    // An empty prefix yields the bare-name (tier 2) entries.
    const std::string searchRoots[] = { kAnsDir, std::string() };
    for (const std::string& root : searchRoots) {
        int major = 12;
        while (major >= 7) {
            const std::string ver = std::to_string(major);
#ifdef _WIN32
            names.push_back(root + "cudnn64_" + ver + ".dll");
#else
            names.push_back(root + "libcudnn.so." + ver);
#endif
            --major;
        }
    }
    return names;
}

#ifdef NV_DYNAMIC_CUDA
// Builds the ordered list of CUDA runtime library names to try.
//
// Two tiers with identical contents:
//   1. absolute paths inside the ANSCENTER shared directory (kAnsDir)
//   2. bare file names, left to the platform loader's default search
//
// Names generated per tier, in order:
//   Windows : cudart64_15.dll … cudart64_12.dll      (major-only form)
//             cudart64_119.dll … cudart64_110.dll    ("11" + minor digit)
//             cudart64.dll                           (non-versioned alias)
//   Linux   : libcudart.so.15 … libcudart.so.11      (major-only form)
//             libcudart.so.11.9 … libcudart.so.11.0  (major.minor fallback)
std::vector<std::string> NvDynLoader::CudaRtCandidates()
{
    std::vector<std::string> names;

    // An empty prefix yields the bare-name (tier 2) entries.
    const std::string searchRoots[] = { kAnsDir, std::string() };
    for (const std::string& root : searchRoots) {
#ifdef _WIN32
        int major = 15;
        while (major >= 12) {
            names.push_back(root + "cudart64_" + std::to_string(major) + ".dll");
            --major;
        }
        int minor = 9;
        while (minor >= 0) {
            names.push_back(root + "cudart64_11" + std::to_string(minor) + ".dll");
            --minor;
        }
        names.push_back(root + "cudart64.dll");
#else
        int major = 15;
        while (major >= 11) {
            names.push_back(root + "libcudart.so." + std::to_string(major));
            --major;
        }
        int minor = 9;
        while (minor >= 0) {
            names.push_back(root + "libcudart.so.11." + std::to_string(minor));
            --minor;
        }
#endif
    }
    return names;
}
#endif

// ─────────────────────────────────────────────────────────────────────────────
// FindAndLoad — iterate candidates, return first successful handle
// ─────────────────────────────────────────────────────────────────────────────
// Tries each candidate in order and returns the handle of the first library
// that loads.
//
//   candidates – library names to try, highest priority first
//   loadedPath – receives the candidate string that loaded; left untouched
//                when nothing loads
//   verbose    – when true, prints the name of the library that was loaded
//   globalLoad – Linux only: true opens with RTLD_GLOBAL (symbols become
//                visible to libraries loaded afterwards), false with
//                RTLD_LOCAL.  Unused on Windows.
//
// Returns nullptr when no candidate could be loaded.
//
// NOTE(review): on Windows the load goes through NV_LOAD_LIB, defined outside
// this file; earlier notes here say it wraps LoadLibraryExA with
// LOAD_WITH_ALTERED_SEARCH_PATH.  Microsoft documents that flag as undefined
// behaviour for relative paths, which the bare-name candidates are — confirm
// the macro's definition.
NvLibHandle NvDynLoader::FindAndLoad(const std::vector<std::string>& candidates,
                                     std::string& loadedPath, bool verbose,
                                     bool globalLoad)
{
#ifdef _WIN32
    (void)globalLoad;
#else
    const int dlFlags = RTLD_LAZY | (globalLoad ? RTLD_GLOBAL : RTLD_LOCAL);
#endif

    for (auto it = candidates.begin(); it != candidates.end(); ++it) {
        const std::string& candidate = *it;
#ifdef _WIN32
        NvLibHandle handle = NV_LOAD_LIB(candidate.c_str());
#else
        NvLibHandle handle = ::dlopen(candidate.c_str(), dlFlags);
#endif
        if (!handle)
            continue;   // not present / not loadable — try the next name

        loadedPath = candidate;
        if (verbose)
            std::cout << "NvDynLoader: loaded " << candidate << std::endl;
        return handle;
    }
    return nullptr;
}

// ─────────────────────────────────────────────────────────────────────────────
// Initialize
// ─────────────────────────────────────────────────────────────────────────────
static std::mutex g_initMutex;

bool NvDynLoader::Initialize(bool verbose)
{
    std::lock_guard<std::mutex> lock(g_initMutex);
    if (s_initialized) return true;

    // If a previous Initialize() call partially loaded some libraries before
    // failing (s_initialized stays false but handles may be non-null), release
    // them now so we can retry cleanly without leaking handles.
    // The if(h) guards make this a safe no-op on the very first call.
    if (s_hOnnx)  { NV_FREE_LIB(s_hOnnx);  s_hOnnx  = nullptr; }
    if (s_hTrt)   { NV_FREE_LIB(s_hTrt);   s_hTrt   = nullptr; }
    if (s_hCudnn) { NV_FREE_LIB(s_hCudnn); s_hCudnn = nullptr; }
#ifdef NV_DYNAMIC_CUDA
    if (s_hCuda)  { NV_FREE_LIB(s_hCuda);  s_hCuda  = nullptr; }
#endif

    bool ok = true;

    if (verbose) {
        std::cout << "==================================================" << std::endl;
        std::cout << "NvDynLoader: discovering NVIDIA libraries..."       << std::endl;
        std::cout << "  Shared dir : " << kAnsDir                         << std::endl;
        std::cout << "==================================================" << std::endl;
    }

    // ── STEP 1: CUDA Runtime (NV_DYNAMIC_CUDA only) ───────────────────────────
    // Loaded first so cuDNN and TRT find CUDA symbols already resident in the
    // process image when they are opened.
    // Linux: globalLoad=true (RTLD_GLOBAL) makes cudart symbols process-wide.
    // Windows: LoadLibraryExA is always process-global; globalLoad is a no-op.
#ifdef NV_DYNAMIC_CUDA
    s_hCuda = FindAndLoad(CudaRtCandidates(), s_cudaPath, verbose, /*globalLoad=*/true);
    if (!s_hCuda) {
        std::cout << "NvDynLoader ERROR: No CUDA runtime DLL found. "
                     "Searched " << kAnsDir << " and system PATH." << std::endl;
        ok = false;
    } else {
        bool b = true;
        b &= Bind(s_hCuda, "cudaGetDeviceCount",               pfn_cudaGetDeviceCount);
        b &= Bind(s_hCuda, "cudaSetDevice",                    pfn_cudaSetDevice);
        // cudaGetDeviceProperties: CUDA 12 renamed the export to '_v2'; CUDA 11
        // uses the plain name.  Try the CUDA 12+ symbol first, fall back to
        // the CUDA 11 name.  IMPORTANT: do NOT `b &=` the first attempt before
        // the fallback — that would set b=false on CUDA 11 even though the
        // plain-name fallback succeeds, causing a spurious init failure.
        Bind(s_hCuda, "cudaGetDeviceProperties_v2", pfn_cudaGetDeviceProperties); // CUDA 12+
        if (!pfn_cudaGetDeviceProperties)
            Bind(s_hCuda, "cudaGetDeviceProperties", pfn_cudaGetDeviceProperties); // CUDA 11
        b &= (pfn_cudaGetDeviceProperties != nullptr);
        b &= Bind(s_hCuda, "cudaDeviceSetLimit",               pfn_cudaDeviceSetLimit);
        b &= Bind(s_hCuda, "cudaDeviceSynchronize",            pfn_cudaDeviceSynchronize);
        b &= Bind(s_hCuda, "cudaDeviceGetStreamPriorityRange", pfn_cudaDeviceGetStreamPriorityRange);
        b &= Bind(s_hCuda, "cudaMalloc",                       pfn_cudaMalloc);
        b &= Bind(s_hCuda, "cudaFree",                         pfn_cudaFree);
        b &= Bind(s_hCuda, "cudaMemset",                       pfn_cudaMemset);
        b &= Bind(s_hCuda, "cudaMemGetInfo",                   pfn_cudaMemGetInfo);
        b &= Bind(s_hCuda, "cudaMemcpy",                       pfn_cudaMemcpy);
        b &= Bind(s_hCuda, "cudaMemcpyAsync",                  pfn_cudaMemcpyAsync);
        b &= Bind(s_hCuda, "cudaStreamCreate",                 pfn_cudaStreamCreate);
        b &= Bind(s_hCuda, "cudaStreamCreateWithPriority",     pfn_cudaStreamCreateWithPriority);
        b &= Bind(s_hCuda, "cudaStreamDestroy",                pfn_cudaStreamDestroy);
        b &= Bind(s_hCuda, "cudaStreamSynchronize",            pfn_cudaStreamSynchronize);
        b &= Bind(s_hCuda, "cudaStreamWaitEvent",              pfn_cudaStreamWaitEvent);
        b &= Bind(s_hCuda, "cudaEventCreate",                  pfn_cudaEventCreate);
        b &= Bind(s_hCuda, "cudaEventCreateWithFlags",         pfn_cudaEventCreateWithFlags);
        b &= Bind(s_hCuda, "cudaEventRecord",                  pfn_cudaEventRecord);
        b &= Bind(s_hCuda, "cudaEventDestroy",                 pfn_cudaEventDestroy);
        b &= Bind(s_hCuda, "cudaGetErrorString",               pfn_cudaGetErrorString);
        b &= Bind(s_hCuda, "cudaGetLastError",                 pfn_cudaGetLastError);
        if (!b) {
            std::cout << "NvDynLoader ERROR: One or more CUDA entry points not found in "
                      << s_cudaPath << std::endl;
            ok = false;
        }
    }
#endif // NV_DYNAMIC_CUDA

    // ── STEP 2: cuDNN pre-load ────────────────────────────────────────────────
    // Must be loaded BEFORE TRT so the OS can resolve TRT's cuDNN dependency:
    //
    //  Windows  – LoadLibraryExA + LOAD_WITH_ALTERED_SEARCH_PATH causes Windows
    //             to use the DLL's own directory when resolving its transitive
    //             dependencies.  Pre-loading cuDNN from ANSCENTER dir also puts
    //             it in the process DLL table so TRT's import loader finds it.
    //
    //  Linux    – RTLD_GLOBAL exports cuDNN symbols process-wide so TRT's
    //             DT_NEEDED resolution succeeds without LD_LIBRARY_PATH changes.
    //             cuDNN 9 also ships split DLLs (libcudnn_ops.so.9, etc.); those
    //             are found automatically once libcudnn.so.9 is globally loaded.
    //
    // Not finding cuDNN is a WARNING, not a fatal error — TRT may still locate
    // it via LD_LIBRARY_PATH / system PATH at load time.
    s_hCudnn = FindAndLoad(CudnnCandidates(), s_cudnnPath, verbose, /*globalLoad=*/true);
    if (!s_hCudnn && verbose) {
        std::cout << "NvDynLoader: WARNING — cuDNN not found in " << kAnsDir
                  << "; TRT will search system PATH / LD_LIBRARY_PATH." << std::endl;
    }

    // ── STEP 3: TensorRT core ─────────────────────────────────────────────────
    s_hTrt = FindAndLoad(TrtCandidates(), s_trtPath, verbose);
    if (!s_hTrt) {
        std::cout << "NvDynLoader ERROR: No TensorRT DLL found. "
                     "Searched " << kAnsDir << " and system PATH." << std::endl;
        ok = false;
    } else {
        // Extract major version from the loaded library path.
        //   Windows : "C:\...\nvinfer_10.dll"     → digits between last '_' and last '.'
        //   Linux   : ".../libnvinfer.so.10"      → digits after last '.'
#ifdef _WIN32
        {
            const size_t under = s_trtPath.find_last_of('_');
            const size_t dot   = s_trtPath.find_last_of('.');
            if (under != std::string::npos && dot != std::string::npos && dot > under) {
                try { s_trtMajor = std::stoi(s_trtPath.substr(under + 1, dot - under - 1)); }
                catch (...) { /* non-numeric — leave s_trtMajor at 0 */ }
            }
        }
#else
        {
            const size_t dot = s_trtPath.find_last_of('.');
            if (dot != std::string::npos && dot + 1 < s_trtPath.size()) {
                try { s_trtMajor = std::stoi(s_trtPath.substr(dot + 1)); }
                catch (...) { /* non-numeric — leave s_trtMajor at 0 */ }
            }
        }
#endif

        bool b = true;
        b &= Bind(s_hTrt, "createInferBuilder_INTERNAL",  pfn_createInferBuilder_INTERNAL);
        b &= Bind(s_hTrt, "createInferRuntime_INTERNAL",  pfn_createInferRuntime_INTERNAL);
        Bind(s_hTrt, "createInferRefitter_INTERNAL", pfn_createInferRefitter_INTERNAL); // optional
        if (!b) {
            std::cout << "NvDynLoader ERROR: Required TRT entry points not found in "
                      << s_trtPath << std::endl;
            ok = false;
        } else if (verbose) {
            std::cout << "NvDynLoader: TRT major version = " << s_trtMajor << std::endl;
        }
    }

    // ── STEP 4: ONNX parser ───────────────────────────────────────────────────
    s_hOnnx = FindAndLoad(OnnxCandidates(), s_onnxPath, verbose);
    if (!s_hOnnx) {
        std::cout << "NvDynLoader ERROR: No nvonnxparser DLL found. "
                     "Searched " << kAnsDir << " and system PATH." << std::endl;
        ok = false;
    } else {
        if (!Bind(s_hOnnx, "createNvOnnxParser_INTERNAL", pfn_createNvOnnxParser_INTERNAL)) {
            std::cout << "NvDynLoader ERROR: createNvOnnxParser_INTERNAL not found in "
                      << s_onnxPath << std::endl;
            ok = false;
        } else {
            // TRT 10+ optional symbols — absent on TRT 8/9; missing is not an error.
            Bind(s_hOnnx, "createNvOnnxParserRefitter_INTERNAL", pfn_createNvOnnxParserRefitter_INTERNAL);
            Bind(s_hOnnx, "getNvOnnxParserVersion",              pfn_getNvOnnxParserVersion);

            // Cross-check: TRT and ONNX-parser must share the same major version.
            // Mismatched majors (e.g. nvinfer_10 + nvonnxparser_11) compile fine
            // but crash at runtime when the ONNX parser tries to populate a TRT
            // network object built by a different major version.
            int onnxMajor = 0;
#ifdef _WIN32
            {
                const size_t under = s_onnxPath.find_last_of('_');
                const size_t dot   = s_onnxPath.find_last_of('.');
                if (under != std::string::npos && dot != std::string::npos && dot > under)
                    try { onnxMajor = std::stoi(s_onnxPath.substr(under + 1, dot - under - 1)); }
                    catch (...) {}
            }
#else
            {
                const size_t dot = s_onnxPath.find_last_of('.');
                if (dot != std::string::npos && dot + 1 < s_onnxPath.size())
                    try { onnxMajor = std::stoi(s_onnxPath.substr(dot + 1)); }
                    catch (...) {}
            }
#endif
            if (onnxMajor > 0 && s_trtMajor > 0 && onnxMajor != s_trtMajor) {
                std::cout << "NvDynLoader ERROR: TRT major version (" << s_trtMajor
                          << ") != ONNX-parser major version (" << onnxMajor << ").\n"
                          << "  TRT  : " << s_trtPath  << "\n"
                          << "  ONNX : " << s_onnxPath << "\n"
                          << "  Replace both DLLs with matching versions in "
                          << kAnsDir << std::endl;
                ok = false;
            } else if (verbose && onnxMajor > 0) {
                std::cout << "NvDynLoader: ONNX-parser major version = " << onnxMajor << std::endl;
            }
        }
    }

    s_initialized = ok;

    if (verbose) {
        std::cout << "NvDynLoader: "
                  << (ok ? "initialised successfully." : "FAILED — check errors above.")
                  << std::endl;
        std::cout << "==================================================" << std::endl;
    }
    return ok;
}

// ─────────────────────────────────────────────────────────────────────────────
// Shutdown
// ─────────────────────────────────────────────────────────────────────────────
void NvDynLoader::Shutdown()
{
    std::lock_guard<std::mutex> lock(g_initMutex);

    // NOTE: No early-return guard on s_initialized here.
    //
    // Shutdown() must always release whatever handles are currently open,
    // including handles left behind by a *partially*-successful Initialize()
    // call (where s_initialized == false but some DLLs were already opened).
    // Every FreeLibrary / dlclose call is guarded by if(h), so this function
    // is safe to call on a fully- or partially-initialised instance, and also
    // safe to call multiple times (idempotent).

    // ── Null all entry-point pointers first ───────────────────────────────────
    // Prevents stubs from jumping into freed library memory in the window
    // between Shutdown() and a subsequent Initialize() call.
    pfn_createInferBuilder_INTERNAL          = nullptr;
    pfn_createInferRuntime_INTERNAL          = nullptr;
    pfn_createInferRefitter_INTERNAL         = nullptr;
    pfn_createNvOnnxParser_INTERNAL          = nullptr;
    pfn_createNvOnnxParserRefitter_INTERNAL  = nullptr;
    pfn_getNvOnnxParserVersion               = nullptr;

#ifdef NV_DYNAMIC_CUDA
    // Null every CUDA function pointer so the assert() in each stub fires
    // immediately if a CUDA call is made after Shutdown() without a new
    // Initialize(), rather than silently calling into freed DLL memory.
    pfn_cudaGetDeviceCount               = nullptr;
    pfn_cudaSetDevice                    = nullptr;
    pfn_cudaGetDeviceProperties          = nullptr;
    pfn_cudaDeviceSetLimit               = nullptr;
    pfn_cudaDeviceSynchronize            = nullptr;
    pfn_cudaDeviceGetStreamPriorityRange = nullptr;
    pfn_cudaMalloc                       = nullptr;
    pfn_cudaFree                         = nullptr;
    pfn_cudaMemset                       = nullptr;
    pfn_cudaMemGetInfo                   = nullptr;
    pfn_cudaMemcpy                       = nullptr;
    pfn_cudaMemcpyAsync                  = nullptr;
    pfn_cudaStreamCreate                 = nullptr;
    pfn_cudaStreamCreateWithPriority     = nullptr;
    pfn_cudaStreamDestroy                = nullptr;
    pfn_cudaStreamSynchronize            = nullptr;
    pfn_cudaStreamWaitEvent              = nullptr;
    pfn_cudaEventCreate                  = nullptr;
    pfn_cudaEventCreateWithFlags         = nullptr;
    pfn_cudaEventRecord                  = nullptr;
    pfn_cudaEventDestroy                 = nullptr;
    pfn_cudaGetErrorString               = nullptr;
    pfn_cudaGetLastError                 = nullptr;
#endif // NV_DYNAMIC_CUDA

    // ── Release handles in reverse load order ─────────────────────────────────
    //   ONNX parser  → depends on TRT
    //   TRT          → depends on cuDNN + CUDA RT
    //   cuDNN        → depends on CUDA RT
    //   CUDA RT      → base dependency
    if (s_hOnnx)  { NV_FREE_LIB(s_hOnnx);  s_hOnnx  = nullptr; }
    if (s_hTrt)   { NV_FREE_LIB(s_hTrt);   s_hTrt   = nullptr; }
    if (s_hCudnn) { NV_FREE_LIB(s_hCudnn); s_hCudnn = nullptr; }
#ifdef NV_DYNAMIC_CUDA
    if (s_hCuda)  { NV_FREE_LIB(s_hCuda);  s_hCuda  = nullptr; }
#endif

    // ── Reset version / path state ────────────────────────────────────────────
    // A subsequent Initialize() call will re-discover fresh paths and versions.
    s_trtMajor = 0;
    s_trtPath.clear();
    s_onnxPath.clear();
    s_cudaPath.clear();
    s_cudnnPath.clear();
    s_initialized = false;
}