#pragma once
// ============================================================================
// NvDynLoader -- Runtime discovery and loading of NVIDIA TensorRT / CUDA DLLs
//
// Moved from TensorRTAPI/ to ANSLibsLoader/ for centralized library management.
// Now exported via ANSLIBS_API from ANSLibsLoader.dll.
//
//  PROBLEM SOLVED
//  --------------
//  Linking against nvinfer_10.lib / nvonnxparser_10.lib hard-codes the major
//  version into the binary's import table.  Upgrading TRT 10 -> 11 then forces
//  every project to update its .lib references and relink.
//
//  SOLUTION
//  --------
//  NvDynLoader provides the extern "C" factory symbols that the TRT /
//  ONNX-parser inline wrappers call (createInferBuilder_INTERNAL, etc.) as thin
//  stubs compiled directly into ANSLibsLoader.dll.  At runtime the stubs call
//  through function pointers into whichever DLL version is actually installed.
//
//  All C++ vtable dispatch (IBuilder, IRuntime, IParser methods) continues to
//  work correctly because the objects are created by the dynamically loaded
//  DLL, so their vtables point into that same DLL.
//
//  REQUIRED PROJECT CHANGES
//  ------------------------
//  Consuming projects:
//    REMOVE:  nvinfer_10.lib / nvonnxparser_10.lib from linker input
//    ADD:     ANSLibsLoader.lib
//    KEEP:    cudart_static.lib  (or use NV_DYNAMIC_CUDA for dynamic CUDA RT)
// ============================================================================

#include "ANSLibsLoader.h"   // ANSLIBS_API
#include "DynLibUtils.h"     // LibHandle

// -- TRT / ONNX-parser API decoration override --------------------------------
// Must appear BEFORE including NvInfer.h / NvOnnxParser.h.
// By default these macros expand to __declspec(dllimport), which would conflict
// with our extern "C" stub definitions in NvDynLoader.cpp.  Setting them to
// empty makes all TRT inline-wrapper calls direct, so the linker resolves them
// against our stubs rather than against nvinfer_XX.lib.
//
// The stubs are exported from ANSLibsLoader.dll via the .def file
// (ANSLibsLoader.def), NOT via __declspec(dllexport), to avoid C2375
// linkage conflicts between the NvInfer.h declarations and our definitions.
// Define the TRT decoration macro as empty unless the build already set it.
#if !defined(TENSORRTAPI)
#  define TENSORRTAPI
#endif
// Same treatment for the ONNX-parser decoration macro.
#if !defined(NVONNXPARSER_API)
#  define NVONNXPARSER_API
#endif

#include <cuda_runtime.h>   // CUDA types (cudaStream_t, cudaDeviceProp, ...)
#include <NvInfer.h>        // TRT types (IBuilder, IRuntime, ...)
#include <NvOnnxParser.h>   // nvonnxparser types

#include <string>
#include <vector>

// ============================================================================
class ANSLIBS_API NvDynLoader
{
public:
    // ------------------------------------------------------------------------
    // Lifecycle
    // ------------------------------------------------------------------------

    /// Finds the NVIDIA DLLs at runtime and loads them.
    /// Repeated calls are safe: every call after the first is a no-op.
    /// @param verbose  When true, discovery results are printed to stdout.
    /// @returns false when a critical library (TRT or CUDA) failed to load.
    [[nodiscard]] static bool Initialize(bool verbose = true);

    /// Releases every library handle that was loaded.  Intended to be called
    /// at application exit.
    static void Shutdown();

    /// Reports the current value of the s_initialized flag.
    [[nodiscard]] static bool IsInitialized() noexcept
    {
        return s_initialized;
    }

    // ------------------------------------------------------------------------
    // Informational accessors
    // Each one simply hands back the matching private static member.
    // ------------------------------------------------------------------------

    [[nodiscard]] static const std::string& TrtDllPath() noexcept
    {
        return s_trtPath;
    }

    [[nodiscard]] static const std::string& OnnxDllPath() noexcept
    {
        return s_onnxPath;
    }

    [[nodiscard]] static const std::string& CudaDllPath() noexcept
    {
        return s_cudaPath;
    }

    [[nodiscard]] static const std::string& CudnnDllPath() noexcept
    {
        return s_cudnnPath;
    }

    [[nodiscard]] static int TrtMajor() noexcept
    {
        return s_trtMajor;
    }

    // ------------------------------------------------------------------------
    // TRT / ONNX-parser factory entry points
    // Function types first, then one static pointer per entry point.
    // ------------------------------------------------------------------------

    using PfnBuilder = void*(void* logger, int32_t version) noexcept;
    using PfnRuntime = void*(void* logger, int32_t version) noexcept;
    using PfnRefitter = void*(void* engine, void* logger, int32_t version) noexcept;
    using PfnParser = void*(void* network, void* logger, int32_t version) noexcept;
    using PfnParserRefitter = void*(void* refitter, void* logger, int32_t version) noexcept;
    using PfnGetParserVersion = int() noexcept;

    static PfnBuilder* pfn_createInferBuilder_INTERNAL;
    static PfnRuntime* pfn_createInferRuntime_INTERNAL;
    static PfnRefitter* pfn_createInferRefitter_INTERNAL;
    static PfnParser* pfn_createNvOnnxParser_INTERNAL;
    static PfnParserRefitter* pfn_createNvOnnxParserRefitter_INTERNAL;
    static PfnGetParserVersion* pfn_getNvOnnxParserVersion;

    // ------------------------------------------------------------------------
    // CUDA runtime function pointers
    // Declared only when NV_DYNAMIC_CUDA is defined.
    // ------------------------------------------------------------------------
#ifdef NV_DYNAMIC_CUDA
    // Device management
    static cudaError_t (*pfn_cudaGetDeviceCount)(int*);
    static cudaError_t (*pfn_cudaSetDevice)(int);
    static cudaError_t (*pfn_cudaGetDeviceProperties)(cudaDeviceProp*, int);
    static cudaError_t (*pfn_cudaDeviceSetLimit)(cudaLimit, size_t);
    static cudaError_t (*pfn_cudaDeviceSynchronize)();
    static cudaError_t (*pfn_cudaDeviceGetStreamPriorityRange)(int*, int*);

    // Memory
    static cudaError_t (*pfn_cudaMalloc)(void**, size_t);
    static cudaError_t (*pfn_cudaFree)(void*);
    static cudaError_t (*pfn_cudaMemset)(void*, int, size_t);
    static cudaError_t (*pfn_cudaMemGetInfo)(size_t*, size_t*);
    static cudaError_t (*pfn_cudaMemcpy)(void*, const void*, size_t, cudaMemcpyKind);
    static cudaError_t (*pfn_cudaMemcpyAsync)(void*, const void*, size_t, cudaMemcpyKind, cudaStream_t);

    // Streams
    static cudaError_t (*pfn_cudaStreamCreate)(cudaStream_t*);
    static cudaError_t (*pfn_cudaStreamCreateWithPriority)(cudaStream_t*, unsigned int, int);
    static cudaError_t (*pfn_cudaStreamDestroy)(cudaStream_t);
    static cudaError_t (*pfn_cudaStreamSynchronize)(cudaStream_t);
    static cudaError_t (*pfn_cudaStreamWaitEvent)(cudaStream_t, cudaEvent_t, unsigned int);

    // Events
    static cudaError_t (*pfn_cudaEventCreate)(cudaEvent_t*);
    static cudaError_t (*pfn_cudaEventCreateWithFlags)(cudaEvent_t*, unsigned int);
    static cudaError_t (*pfn_cudaEventRecord)(cudaEvent_t, cudaStream_t);
    static cudaError_t (*pfn_cudaEventDestroy)(cudaEvent_t);

    // Error reporting
    static const char* (*pfn_cudaGetErrorString)(cudaError_t);
    static cudaError_t (*pfn_cudaGetLastError)();
#endif // NV_DYNAMIC_CUDA

private:
    // Loader state.
    static bool s_initialized;
    static int  s_trtMajor;

    // Per-library bookkeeping: the path string exposed by the public accessor
    // and the handle of the loaded library.
    static std::string s_trtPath;
    static LibHandle   s_hTrt;

    static std::string s_onnxPath;
    static LibHandle   s_hOnnx;

    static std::string s_cudaPath;
    static LibHandle   s_hCuda;

    static std::string s_cudnnPath;
    static LibHandle   s_hCudnn;

    // Builders for the lists of candidate DLL / SO names, one per library.
    static std::vector<std::string> TrtCandidates();
    static std::vector<std::string> OnnxCandidates();
    static std::vector<std::string> CudnnCandidates();
    static std::vector<std::string> CudaRtCandidates();
};