Files

847 lines
41 KiB
C++
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#pragma once
#include "engine_api.h" // ENGINE_API (__declspec dllexport / dllimport)
// -- TRT / ONNX-parser API decoration override -----------------------------
// Must appear BEFORE the first inclusion of NvInfer.h / NvOnnxParser.h.
//
// By default these macros expand to __declspec(dllimport) for consumers of
// the TRT DLL, which causes the compiler to emit indirect import-table calls.
// Overriding them to empty makes all calls direct, so the linker resolves
// them against the extern "C" stub definitions in NvDynLoader.cpp instead of
// against nvinfer_XX.lib / nvonnxparser_XX.lib.
//
// Remove nvinfer_XX.lib and nvonnxparser_XX.lib from Linker -> Input.
// Link ANSLibsLoader.lib instead. See NvDynLoader.h for full details.
#ifndef TENSORRTAPI
# define TENSORRTAPI
#endif
#ifndef NVONNXPARSER_API
# define NVONNXPARSER_API
#endif
#include "NvOnnxParser.h"
// C++ standard library (include-what-you-use: <iostream> for std::cout/cerr,
// <array>/<vector>/<cstdint> were previously pulled in only transitively)
#include <array>
#include <atomic>
#include <chrono>
#include <condition_variable>
#include <cstdint>
#include <deque>
#include <filesystem>
#include <fstream>
#include <iostream>
#include <memory>
#include <mutex>
#include <string>
#include <thread>
#include <unordered_map>
#include <vector>
// CUDA
#include <cuda_fp16.h>
#include <cuda_runtime.h>
// OpenCV (CUDA modules)
#include <opencv2/core/cuda.hpp>
#include <opencv2/core/cuda_stream_accessor.hpp>
#include <opencv2/cudaarithm.hpp>
#include <opencv2/cudaimgproc.hpp>
#include <opencv2/cudawarping.hpp>
#include <opencv2/opencv.hpp>
// Project headers (forward slashes: portable, accepted by MSVC as well)
#include "engine/TRTEngineCache.h"
#include "interfaces/IEngine.h"
#include "ANSLicense.h"
#include "Int8Calibrator.h"
#include "NvDynLoader.h" // from ANSLibsLoader/include/
#ifdef _WIN32
#include <windows.h>
#endif
// Class to extend TensorRT logger.
// Declaration only — the log() definition lives in the .cpp translation unit,
// which decides how TRT's internal messages are filtered/forwarded.
class ENGINE_API Logger : public nvinfer1::ILogger {
    void log(Severity severity, const char *msg) noexcept override;
};
// ============================================================================
// Diagnostic logging to Windows Event Viewer (ANSLogger source).
// Allows internal engine failure points to be traced even when running
// under LabView which swallows stdout/stderr.
// ============================================================================
/// Log @p msg to the Windows Event Log (source "ANSLogger") and to the
/// console streams.
/// @param msg     Message text (written verbatim).
/// @param isError true  -> EVENTLOG_ERROR_TYPE, event id 1004, stderr;
///                false -> EVENTLOG_INFORMATION_TYPE, event id 1000, stdout.
inline void logEngineEvent(const std::string& msg, bool isError = false) {
#ifdef _WIN32
    // Cache the event source handle to avoid RegisterEventSourceA/
    // DeregisterEventSource churn. Under heavy load (10+ AI tasks),
    // the per-message register/deregister cycle overwhelms the Windows
    // Event Log service, causing ERROR_UNRECOGNIZED_VOLUME (1005) floods
    // that freeze the system. (Handle is intentionally leaked at exit.)
    static HANDLE hLog = RegisterEventSourceA(nullptr, "ANSLogger");
    if (hLog) {
        const char* msgs[1] = { msg.c_str() };
        ReportEventA(hLog,
            isError ? EVENTLOG_ERROR_TYPE : EVENTLOG_INFORMATION_TYPE,
            0, isError ? 1004 : 1000, nullptr,
            1, 0, msgs, nullptr);
    }
#endif
    if (isError) {
        // Errors: keep std::endl so the message is flushed immediately even
        // if the process dies right after.
        std::cerr << msg << std::endl;
    } else {
        // Info: '\n' instead of std::endl — avoids a flush per message, which
        // matters under the same heavy load described above.
        std::cout << msg << '\n';
    }
}
// ============================================================================
// GpuDeviceInfo -- snapshot of one CUDA device captured at pool-init time.
// Returned by Engine<T>::enumerateDevices() and stored inside the pool.
// NOTE: exported across the DLL boundary (ENGINE_API) — member order/layout
// must stay stable.
// ============================================================================
struct ENGINE_API GpuDeviceInfo {
    int index = 0;                       ///< CUDA device ordinal
    std::string name;                    ///< Device name string
    size_t totalMemoryBytes = 0;         ///< Physical VRAM
    size_t freeMemoryAtInitBytes = 0;    ///< Free VRAM when pool started
    int computeMajor = 0;                ///< Compute capability (major)
    int computeMinor = 0;                ///< Compute capability (minor)
    int slotsAllocated = 0;              ///< Engine<T> instances on this GPU
    size_t memoryPerSlotBytes = 0;       ///< VRAM bytes each slot occupies
};
// ============================================================================
// Engine<T> — TensorRT inference engine with an optional multi-GPU slot pool.
// T is the output element type. Single-GPU callers use buildLoadNetwork() /
// loadNetwork() + runInference(); pool mode is opted into either via the
// extra maxSlotsPerGpu argument or via initializePool*(). Method bodies live
// in the EngineXxx.inl files referenced in the comments below.
// ============================================================================
template <typename T>
class Engine : public IEngine<T> {
public:
    Engine();
    Engine(const ANSCENTER::Options &options);
    ~Engine();
    // Build the onnx model into a TensorRT engine file, cache the model to disk
    // (to avoid rebuilding in future), and then load the model into memory The
    // default implementation will normalize values between [0.f, 1.f] Setting the
    // normalize flag to false will leave values between [0.f, 255.f] (some
    // converted models may require this). If the model requires values to be
    // normalized between [-1.f, 1.f], use the following params:
    // subVals = {0.5f, 0.5f, 0.5f};
    // divVals = {0.5f, 0.5f, 0.5f};
    // normalize = true;
    bool buildLoadNetwork(std::string onnxModelPath, const std::array<float, 3> &subVals = {0.f, 0.f, 0.f},
                          const std::array<float, 3> &divVals = {1.f, 1.f, 1.f}, bool normalize = true) override;
    // Load a TensorRT engine file from disk into memory
    // The default implementation will normalize values between [0.f, 1.f]
    // Setting the normalize flag to false will leave values between [0.f, 255.f]
    // (some converted models may require this). If the model requires values to
    // be normalized between [-1.f, 1.f], use the following params:
    // subVals = {0.5f, 0.5f, 0.5f};
    // divVals = {0.5f, 0.5f, 0.5f};
    // normalize = true;
    bool loadNetwork(std::string trtModelPath, const std::array<float, 3> &subVals = {0.f, 0.f, 0.f},
                     const std::array<float, 3> &divVals = {1.f, 1.f, 1.f}, bool normalize = true) override;
    // -- Multi-GPU pool overloads (non-virtual; call through Engine<T>* or
    // unique_ptr<Engine<T>>, not through IEngine<T>*).
    //
    // maxSlotsPerGpu == 0  -> BYPASS: identical to the 4-param single-GPU path
    //                         above. Used by OptimizeModelStr for temporary engines.
    // maxSlotsPerGpu == 1  -> ROUND-ROBIN (default): 1 slot per GPU, multi-GPU
    //                         round-robin dispatch. Tasks queue when all slots
    //                         busy. Best balance of VRAM usage and multi-GPU
    //                         utilisation.
    // maxSlotsPerGpu == -1 -> ELASTIC pool: 1 probe slot at init, additional
    //                         slots created on-demand when concurrent requests
    //                         arrive, released when idle via releaseIdleSlots().
    //                         Higher throughput but higher VRAM usage — only for
    //                         large GPUs (≥ 8 GB).
    // maxSlotsPerGpu > 1   -> PRE-ALLOCATED pool: cap at that many slots/GPU,
    //                         all created at init time.
    //
    // Usage -- opt into pool mode by adding ONE extra argument:
    //   m_trtEngine->buildLoadNetwork(path, sub, div, norm);      // single-GPU (unchanged)
    //   m_trtEngine->buildLoadNetwork(path, sub, div, norm, 1);   // round-robin (default)
    //   m_trtEngine->buildLoadNetwork(path, sub, div, norm, -1);  // elastic pool
    //   m_trtEngine->buildLoadNetwork(path, sub, div, norm, 4);   // pre-allocated, 4 slots/GPU
    bool buildLoadNetwork(std::string onnxModelPath,
                          const std::array<float, 3> &subVals,
                          const std::array<float, 3> &divVals,
                          bool normalize,
                          int maxSlotsPerGpu,
                          double memSafetyFactor = 0.80);
    bool loadNetwork(std::string trtModelPath,
                     const std::array<float, 3> &subVals,
                     const std::array<float, 3> &divVals,
                     bool normalize,
                     int maxSlotsPerGpu,
                     double memSafetyFactor = 0.80);
    // Run inference.
    // Input format [input][batch][cv::cuda::GpuMat]
    // Output format [batch][output][feature_vector]
    bool runInference(const std::vector<std::vector<cv::cuda::GpuMat>> &inputs, std::vector<std::vector<std::vector<T>>> &featureVectors) override;
    // Utility method for resizing an image while maintaining the aspect ratio by
    // adding padding to smaller dimension after scaling While letterbox padding
    // normally adds padding to top & bottom, or left & right sides, this
    // implementation only adds padding to the right or bottom side This is done
    // so that it's easier to convert detected coordinates (ex. YOLO model) back
    // to the original reference frame.
    static cv::cuda::GpuMat resizeKeepAspectRatioPadRightBottom(const cv::cuda::GpuMat &input, size_t height, size_t width,
                                                                const cv::Scalar &bgcolor = cv::Scalar(0, 0, 0));
    static cv::Mat cpuResizeKeepAspectRatioPadRightBottom(const cv::Mat &input, size_t height, size_t width,
                                                          const cv::Scalar &bgcolor = cv::Scalar(114, 114, 114));
    [[nodiscard]] const std::vector<nvinfer1::Dims3> &getInputDims() const override { return m_inputDims; };
    [[nodiscard]] const std::vector<nvinfer1::Dims> &getOutputDims() const override { return m_outputDims; };
    [[nodiscard]] const ANSCENTER::Options& getOptions() const { return m_options; }
    /// Return the CUDA device index that preprocessing should target.
    /// In pool mode this returns the device of the first alive slot;
    /// in single-engine mode it returns m_options.deviceIndex.
    /// Call cudaSetDevice(engine->getPreferredDeviceIndex()) before
    /// any cv::cuda::GpuMat operations to ensure multi-GPU correctness.
    [[nodiscard]] int getPreferredDeviceIndex() const {
        // Pool mode: use the first alive slot's device.
        // NOTE(review): m_slots is scanned WITHOUT taking m_slotMutex while
        // tryGrowPool may push_back concurrently. deque::push_back does not
        // invalidate references to existing elements, but this read is still
        // formally unsynchronized — confirm it is an intentional lock-free
        // fast path (locking here could deadlock if a .inl caller already
        // holds m_slotMutex).
        if (!m_slots.empty()) {
            for (const auto& s : m_slots) {
                if (s.engine) return s.deviceIndex;
            }
        }
        return m_options.deviceIndex;
    }
    /// Convenience: set the calling thread's CUDA device to match this engine.
    /// Skips the call if already on the correct device (WDDM overhead avoidance).
    void setDeviceContext() const {
        const int target = getPreferredDeviceIndex();
        int current = -1;
        cudaGetDevice(&current);
        if (current != target) {
            cudaSetDevice(target);
        }
    }
    // Utility method for transforming triple nested output array into 2D array
    // Should be used when the output batch size is 1, but there are multiple
    // output feature vectors
    static void transformOutput(std::vector<std::vector<std::vector<T>>> &input, std::vector<std::vector<T>> &output);
    // Utility method for transforming triple nested output array into single
    // array Should be used when the output batch size is 1, and there is only a
    // single output feature vector
    static void transformOutput(std::vector<std::vector<std::vector<T>>> &input, std::vector<T> &output);
    // Convert NHWC to NCHW and apply scaling and mean subtraction.
    // Pass a cv::cuda::Stream to run all GPU kernels on that stream instead of
    // the CUDA null (default) stream. This eliminates global sync barriers and
    // enables CUDA-graph-safe preprocessing on OpenCV 4.13+.
    static cv::cuda::GpuMat blobFromGpuMats(const std::vector<cv::cuda::GpuMat> &batchInput, const std::array<float, 3> &subVals,
                                            const std::array<float, 3> &divVals, bool normalize, bool swapRB = false,
                                            cv::cuda::Stream &stream = cv::cuda::Stream::Null());
    // Converts the engine options into a string
    std::string serializeEngineOptions(const ANSCENTER::Options& options, const std::string& onnxModelPath);
    // Build the network (single attempt)
    bool build(std::string onnxModelPath, const std::array<float, 3>& subVals, const std::array<float, 3>& divVals, bool normalize);
    // SEH-safe wrapper around build(). Catches access violations that corrupt
    // the CUDA context (e.g. opset 19+ ONNX models crash TRT's parser).
    // Returns false and sets *outSehCode on crash.
    bool buildSafe(std::string onnxModelPath, const std::array<float, 3>& subVals, const std::array<float, 3>& divVals, bool normalize, unsigned long* outSehCode);
    // Build with auto-retry for dynamic spatial dimension models.
    // Pre-analyzes the ONNX model; if H/W are dynamic and the max profile
    // is too large for GPU VRAM, automatically reduces max spatial dims and
    // retries until build succeeds or all candidates are exhausted.
    // Fixed-spatial models fall through to a single build() call.
    bool buildWithRetry(std::string onnxModelPath, const std::array<float, 3>& subVals, const std::array<float, 3>& divVals, bool normalize);
    void getDeviceNames(std::vector<std::string>& deviceNames);
    void clearGpuBuffers();
    int getBindingIndexByName(const std::string& name);
    void UpdateOptions(const ANSCENTER::Options& options) { m_options = options; }
    void warmUp(int iterations = 5);
    void setVerbose(bool v) { m_verbose = v; }
    void setDisableGraphs(bool v) { m_disableGraphs = v; }
    // -- Multi-GPU pool management (replaces MultiGpuEngineManager<T>) ---------
    //
    // Call ONE of the two initialize methods below instead of
    // buildLoadNetwork() / loadNetwork(), then use runInference() exactly as
    // you would for single-GPU. runInference() dispatches automatically to
    // the first idle slot across all GPUs.
    //
    // Single-GPU code that calls buildLoadNetwork() + runInference() directly
    // is completely unaffected -- it follows the existing code path unchanged.
    /**
     * Build from an ONNX file and create an Engine pool across all GPUs.
     *
     * @param baseOptions     Configuration template; deviceIndex is ignored
     *                        and overridden per slot.
     * @param onnxModelPath   Path to the .onnx model.
     * @param subVals         Per-channel subtraction for normalisation.
     * @param divVals         Per-channel divisor for normalisation.
     * @param normalize       Scale pixel values to [0, 1] before inference.
     * @param maxSlotsPerGpu  Cap slots per GPU (-1 = memory-limited only).
     * @param memSafetyFactor Fraction of free VRAM to consume (default 0.80).
     */
    bool initializePool(const ANSCENTER::Options& baseOptions,
                        const std::string& onnxModelPath,
                        const std::array<float, 3>& subVals = {0.f, 0.f, 0.f},
                        const std::array<float, 3>& divVals = {1.f, 1.f, 1.f},
                        bool normalize = true,
                        int maxSlotsPerGpu = -1,
                        double memSafetyFactor = 0.80);
    /**
     * Load a pre-built TRT engine and create a pool -- no ONNX build step.
     */
    bool initializePoolFromEngine(const ANSCENTER::Options& baseOptions,
                                  const std::string& trtEnginePath,
                                  const std::array<float, 3>& subVals = {0.f, 0.f, 0.f},
                                  const std::array<float, 3>& divVals = {1.f, 1.f, 1.f},
                                  bool normalize = true,
                                  int maxSlotsPerGpu = -1,
                                  double memSafetyFactor = 0.80);
    /** Enumerate all CUDA-capable GPUs without loading any model. */
    static std::vector<GpuDeviceInfo> enumerateDevices();
    /** Total Engine<T> slots across all GPUs (valid after initializePool). */
    int getTotalCapacity() const { return m_totalCapacity; }
    /** Slots currently executing inference (approximate, lock-free). */
    int getActiveInferences() const { return m_activeCount.load(); }
    /** Idle slots not currently running inference. */
    int getAvailableSlots() const { return m_totalCapacity - m_activeCount.load(); }
    /** True when every alive slot is busy.
     *  In elastic mode, new work will trigger on-demand slot creation
     *  rather than being rejected (unless all GPUs are physically full). */
    bool isAtCapacity() const { return getAvailableSlots() <= 0; }
    /** Device snapshots captured at initializePool() time. */
    const std::vector<GpuDeviceInfo>& getDeviceInfo() const { return m_deviceInfos; }
    /** Print a human-readable pool capacity and device report. */
    void printCapacityReport() const;
    /**
     * Release idle slots whose last inference was more than @p idleSeconds
     * ago. The probe slot (first slot) is never released so the model
     * remains instantly usable. Returns the number of slots released.
     *
     * Typical usage -- call periodically from a background timer:
     * @code
     *   int freed = engine->releaseIdleSlots(30.0);
     * @endcode
     */
    int releaseIdleSlots(double idleSeconds = 30.0);
    /** Actual max spatial dims from the loaded engine's optimization profile.
     *  Returns -1 if the engine is not loaded yet or dim is fixed. */
    int getProfileMaxHeight() const { return m_profileMaxHeight; }
    int getProfileMaxWidth() const { return m_profileMaxWidth; }
    /// Grow the pool by up to @p count slots (demand-driven, bypasses grace period).
    /// Returns the number of slots actually created.
    int growPool(int count);
private:
    // Serialises concurrent runInference() calls on the SAME Engine instance.
    // Without this, two threads with different batch sizes corrupt m_buffers,
    // m_lastBatchSize, and the TRT execution context — causing fatal "illegal
    // memory access" errors that permanently destroy the CUDA context.
    // Pool-mode slots have their own busy-flag dispatch, so this only matters
    // for the single-engine path (m_slots.empty()).
    mutable std::mutex m_inferenceMutex;
    int32_t m_lastBatchSize = -1;
    // BATCH STATE MANAGEMENT (THREAD-SAFE)
    //mutable std::mutex m_batchStateMutex; // Protects all batch-related state
    //size_t m_targetStableBatchSize = 4; // Default optimal batch size
    //int m_batchSizeChangeCounter = 0;
    //const int BATCH_STABILITY_THRESHOLD = 15; // Frames before allowing downsize
    //std::deque<size_t> m_recentBatchSizes;
    //const size_t BATCH_HISTORY_SIZE = 30;
    //std::chrono::steady_clock::time_point m_lastBatchSizeChange;
    //// Context reshape mutex (separate from batch state)
    //mutable std::mutex m_contextReshapeMutex;
    //// Helper methods
    //size_t DetermineOptimalBatchSize(size_t requestedSize);
    //std::vector<std::vector<cv::cuda::GpuMat>> PadBatchToSize(const std::vector<std::vector<cv::cuda::GpuMat>>& inputs,size_t targetSize);
    // Normalization, scaling, and mean subtraction of inputs
    std::array<float, 3> m_subVals{};
    std::array<float, 3> m_divVals{};
    // NOTE(review): no in-class initializer; assigned by the load paths and
    // the default ctor — verify the Options ctor path also sets it.
    bool m_normalize;
    // Holds pointers to the input and output GPU buffers
    std::vector<void *> m_buffers;
    std::vector<uint32_t> m_outputLengths{};
    std::vector<nvinfer1::Dims3> m_inputDims;
    std::vector<nvinfer1::Dims> m_outputDims;
    std::vector<std::string> m_IOTensorNames;
    int32_t m_inputBatchSize = 0;
    bool m_firstInferenceCall = true; // Per-instance first-call diagnostics flag
    bool m_batchShapeChangeLogged = false; // Suppress per-tensor shape dump after first batch-size change
    bool m_hasDynamicSpatialDims = false; // True if any input has dynamic H or W (-1)
    std::vector<nvinfer1::Dims> m_lastSetInputDims; // Cache of last setInputShape() dims per tensor — skip if unchanged
    int m_profileMaxHeight = -1; // Max H from optimization profile (after load)
    int m_profileMaxWidth = -1; // Max W from optimization profile (after load)
    // Must keep IRuntime around for inference, see:
    // https://forums.developer.nvidia.com/t/is-it-safe-to-deallocate-nvinfer1-iruntime-after-creating-an-nvinfer1-icudaengine-but-before-running-inference-with-said-icudaengine/255381/2?u=cyruspk4w6
    //
    // m_runtime and m_engine are shared_ptr to support TRTEngineCache:
    // multiple Engine<T> instances loading the same model share one ICudaEngine
    // in VRAM, each with its own IExecutionContext.
    std::shared_ptr<nvinfer1::IRuntime> m_runtime = nullptr;
    std::unique_ptr<Int8EntropyCalibrator2> m_calibrator = nullptr;
    std::shared_ptr<nvinfer1::ICudaEngine> m_engine = nullptr;
    std::unique_ptr<nvinfer1::IExecutionContext> m_context = nullptr;
    // TRTEngineCache tracking — used by destructor to release shared engine
    std::string m_cachedEnginePath;
    int m_cachedGpuIndex = -1;
    bool m_usingCachedEngine = false;
    bool m_skipEngineCache = false; // true = bypass TRTEngineCache (for OptimizeModelStr)
    bool m_skipOnnxRebuild = false; // true = don't delete/rebuild corrupt engine files (elastic growth)
    bool m_skipOnnxBuild = false; // true = skip ONNX→TRT build if no cached engine (demand growth)
    bool m_lastLoadFailedVRAM = false; // set by loadNetwork when failure is due to insufficient VRAM
    bool m_forceNoPool = false; // true = force single-GPU path, skip multi-GPU pool
    ANSCENTER::Options m_options;
    cudaStream_t m_cudaStream = nullptr;
    cudaStream_t m_inferenceStream;
    bool m_streamInitialized;
    //int32_t m_lastBatchSize;
    cudaStream_t m_memoryStream; // ADD THIS - separate stream for memory operations
    std::vector<cv::cuda::GpuMat> m_preprocessedInputs; // Keep inputs alive
    // Note: blobFromGpuMats and resizeKeepAspectRatioPadRightBottom are static,
    // so cached buffers use thread_local inside the functions themselves.
    // Thermal management (ADD THESE)
    //int m_consecutiveInferences;
    //int m_thermalCooldownThreshold;
    //int m_thermalCooldownDelayMs;
    bool m_profileInitialized;
    // -- GPU clock lock (NVML) -------------------------------------------------
    // Dynamically loaded NVML to lock GPU clocks and prevent power throttling.
    // No build-time dependency on nvml.lib — loads nvml.dll at runtime.
    bool m_clocksLocked = false;
    unsigned int m_nvmlDeviceIdx = 0;
    void* m_nvmlLib = nullptr; // LibHandle (HMODULE on Windows)
    void lockGpuClocks(int deviceIndex, int requestedMHz);
    void unlockGpuClocks();
    // -- CUDA Graph acceleration ---------------------------------------------
    // Pinned (page-locked) host buffers -- stable addresses required for graph
    // D2H copies. One pointer per output tensor, sized for maxBatchSize elements.
    std::vector<T*> m_pinnedOutputBuffers;
    std::vector<size_t> m_pinnedOutputBufElems; // elements per buffer
    // Cached executable graphs, keyed by batch size. A new graph is captured
    // the first time each batch size is seen; subsequent calls reuse it.
    std::unordered_map<int, cudaGraphExec_t> m_graphExecs;
    // Leak diagnostics — per-engine-instance counters for CUDA graph
    // create/destroy balance. Incremented in EngineRunInference.inl and
    // EngineBuildLoadNetwork.inl. Read by the [TRT_Leak] heartbeat in
    // runInference (fires ≤1×/60s per engine instance).
    // m_trtLeakNextLogTick stores a steady_clock epoch count for lock-free
    // compare_exchange window claim across concurrent inference threads.
    std::atomic<int64_t> m_trtGraphCreates{0};
    std::atomic<int64_t> m_trtGraphDestroys{0};
    std::atomic<long long> m_trtLeakNextLogTick{0};
    Logger m_logger;
    bool m_verbose{ true }; // false for non-probe pool slots
    bool m_disableGraphs{ true }; // DISABLED by default — concurrent graph launches + uploads cause GPU deadlock on WDDM
    // -- Multi-GPU pool data ---------------------------------------------------
    // Per-slot descriptor. Slot engines are separate Engine<T> instances;
    // they are never pooled recursively (their own m_slots deques are empty).
    //
    // Lifecycle in elastic mode:
    //   alive (engine != nullptr, busy == false) -- idle, ready for inference
    //   alive (engine != nullptr, busy == true)  -- running inference
    //   dead  (engine == nullptr)                -- released, VRAM freed
    // Dead entries stay in the deque to avoid invalidating pointers
    // held by other threads. tryGrowPool() reuses them.
    struct InferenceSlot {
        int deviceIndex = 0;
        bool busy = false;
        size_t memUsed = 0; ///< VRAM bytes this slot holds
        std::unique_ptr<Engine<T>> engine;
        std::chrono::steady_clock::time_point lastUsedTime
            = std::chrono::steady_clock::now(); ///< For idle-release
    };
    // Internal helpers -- implemented in EngineMultiGpu.inl
    bool loadSlots(const ANSCENTER::Options& baseOptions,
                   const std::string& modelPath,
                   const std::array<float, 3>& subVals,
                   const std::array<float, 3>& divVals,
                   bool normalize,
                   bool fromOnnx,
                   int maxSlotsPerGpu,
                   double memSafetyFactor);
    bool runInferenceFromPool(const std::vector<std::vector<cv::cuda::GpuMat>>& inputs,
                              std::vector<std::vector<std::vector<T>>>& featureVectors);
    InferenceSlot* tryGrowPool(bool bypassGrace = false);
    // -- Elastic pool storage ----------------------------------------------
    //
    // std::deque is used instead of std::vector because push_back on a deque
    // does NOT invalidate references to existing elements. This is critical
    // because runInferenceFromPool holds a raw InferenceSlot* across the GPU
    // call while another thread may call tryGrowPool -> push_back concurrently.
    std::deque<InferenceSlot> m_slots; ///< All managed Engine<T> instances
    std::vector<GpuDeviceInfo> m_deviceInfos; ///< Per-device info from init
    mutable std::mutex m_slotMutex; ///< Protects m_slots busy flags
    std::condition_variable m_slotFreeCv; ///< Notified when any slot becomes idle
    std::mutex m_growMutex; ///< Serialises slot creation
    std::atomic<int> m_activeCount{0}; ///< Running inference count
    std::atomic<int> m_totalCapacity{0}; ///< Alive slot count (atomic -- read lock-free by public API)
    std::atomic<size_t> m_nextSlotHint{0}; ///< Round-robin start index for slot scan
    // -- Elastic pool config (stored from loadSlots for on-demand growth) --
    size_t m_memPerSlot{0};
    std::string m_poolModelPath;
    std::array<float, 3> m_poolSubVals{};
    std::array<float, 3> m_poolDivVals{};
    bool m_poolNormalize{true};
    bool m_poolFromOnnx{false};
    double m_poolSafetyFactor{0.80};
    bool m_elasticMode{false}; ///< true when maxSlotsPerGpu == -1
    // -- Global elastic slot cap (shared across ALL pools on ALL GPUs) --
    // Prevents CUDA driver SRW lock convoy when too many concurrent contexts exist.
    // Budget: ~4 slots per GB of total GPU VRAM (computed once on first pool init).
    //   4 GB → 12, 6 GB → 24, 8 GB → 32, 12 GB → 48, 24 GB → 96
    static std::atomic<int> s_globalElasticCount; ///< Current total elastic slots
    static std::atomic<int> s_globalElasticMax; ///< Max total slots
    static std::once_flag s_globalCapInitFlag; ///< Ensures max is computed once
    // -- Elastic growth grace period --
    // Delays elastic growth for N seconds after the most recent pool creation,
    // giving later models time to create their probe engines before elastic
    // growth from earlier pools consumes all free VRAM.
    static constexpr int s_elasticGraceSec{5}; ///< Grace period in seconds (reduced from 30s: shared pools need fewer probes)
    static std::atomic<int64_t> s_lastPoolCreatedMs; ///< Timestamp of last pool creation (ms since epoch)
    // -- Automatic idle-slot cleanup timer (elastic mode only) --
    std::thread m_idleTimerThread;
    std::condition_variable m_idleTimerCv;
    std::mutex m_idleTimerMutex;
    std::atomic<bool> m_idleTimerStop{false};
    double m_idleTimerIntervalSec{60.0}; ///< How often to check
    double m_idleTimerThresholdSec{600.0}; ///< Idle time (seconds) before releasing — 10 minutes to avoid churn
    void startIdleTimer();
    void stopIdleTimer();
};
/// Construct with a caller-supplied configuration. Mirrors the default ctor's
/// member resets so both construction paths leave the object in one state.
template <typename T> Engine<T>::Engine(const ANSCENTER::Options &options) : m_options(options) {
    NvDynLoader::Initialize(); // no-op after first call; safe to call repeatedly
    m_streamInitialized = false;
    m_inferenceStream = nullptr;
    m_lastBatchSize = -1;
    m_inputBatchSize = 0;
    m_memoryStream = nullptr;
    // FIX: m_normalize has no in-class initializer and was only assigned by the
    // default ctor — this overload left it uninitialized (UB if read before a
    // load call). Match the default ctor's value.
    m_normalize = true;
    m_firstInferenceCall = true;
    m_batchShapeChangeLogged = false;
    m_hasDynamicSpatialDims = false;
    m_profileInitialized = false;
    m_lastSetInputDims.clear();
    m_preprocessedInputs.clear();
}
/// Default-construct an engine; m_options keeps its own defaults.
/// Resets all per-instance bookkeeping to a known-clean state.
template <typename T> Engine<T>::Engine() {
    NvDynLoader::Initialize(); // idempotent — safe on every construction
    // CUDA stream state
    m_inferenceStream = nullptr;
    m_memoryStream = nullptr;
    m_streamInitialized = false;
    // Batch / inference bookkeeping
    m_inputBatchSize = 0;
    m_lastBatchSize = -1;
    m_firstInferenceCall = true;
    m_batchShapeChangeLogged = false;
    // Model-shape state
    m_hasDynamicSpatialDims = false;
    m_profileInitialized = false;
    m_lastSetInputDims.clear();
    // Default preprocessing behaviour: scale pixels into [0, 1]
    m_normalize = true;
}
template <typename T> Engine<T>::~Engine() {
    // =========================================================================
    // CRITICAL: Destructors must NEVER throw. Every CUDA/TRT call below is
    // wrapped in try/catch so that a failure in one step does not skip the
    // remaining cleanup (especially m_engine / m_runtime reset, which release
    // TensorRT file handles and VRAM).
    //
    // Previous bug: clearGpuBuffers() called Util::checkCudaErrorCode() which
    // throws on cudaFree failure. That aborted the destructor mid-way,
    // leaving ICudaEngine/IRuntime alive → model folder locked on disk.
    //
    // Cleanup order is deliberate: timer → graphs → host/GPU buffers →
    // streams → clock lock → context → cache release → engine → runtime.
    // =========================================================================
    // Stop the idle-slot cleanup timer (if running) before touching pool state
    try { stopIdleTimer(); } catch (...) {}
    // Destroy cached CUDA graphs (one exec per batch size seen at runtime)
    try {
        for (auto& [bs, ge] : m_graphExecs) {
            if (ge) {
                cudaGraphExecDestroy(ge);
                // Keep the create/destroy leak counters balanced
                m_trtGraphDestroys.fetch_add(1, std::memory_order_relaxed);
            }
        }
        m_graphExecs.clear();
    } catch (...) {}
    // Free pinned output buffers
    try {
        for (T* p : m_pinnedOutputBuffers) { if (p) cudaFreeHost(p); }
        m_pinnedOutputBuffers.clear();
    } catch (...) {}
    // Free GPU I/O buffers — use cudaFree directly (NOT checkCudaErrorCode which throws)
    try {
        if (!m_buffers.empty()) {
            for (void* ptr : m_buffers) {
                if (ptr) cudaFree(ptr); // ignore errors — destructor must not throw
            }
            m_buffers.clear();
        }
    } catch (...) {}
    try {
        if (m_cudaStream) {
            cudaStreamDestroy(m_cudaStream);
            m_cudaStream = nullptr;
        }
    } catch (...) {}
    try {
        if (m_streamInitialized && m_inferenceStream) {
            cudaStreamDestroy(m_inferenceStream);
            m_inferenceStream = nullptr;
            m_streamInitialized = false;
        }
    } catch (...) {}
    try {
        if (m_memoryStream) {
            cudaStreamDestroy(m_memoryStream);
            m_memoryStream = nullptr;
        }
    } catch (...) {}
    // Release GPU clock lock if we hold one
    try { unlockGpuClocks(); } catch (...) {}
    // TRT destruction ordering: context MUST be destroyed before engine.
    // With shared_ptr, m_engine may outlive this instance (other tasks share it).
    // Explicit reset ensures correct order.
    try { m_context.reset(); } catch (...) {}
    // Release from TRTEngineCache (decrements refcount; engine freed when last user releases)
    try {
        if (m_usingCachedEngine && !m_cachedEnginePath.empty()) {
            TRTEngineCache::instance().release(m_cachedEnginePath, m_cachedGpuIndex);
        }
    } catch (...) {}
    // These MUST execute to release TensorRT file handles and VRAM.
    // Previously, a throw in clearGpuBuffers() skipped these lines,
    // leaving the model folder locked on disk.
    try { m_engine.reset(); } catch (...) {}
    try { m_runtime.reset(); } catch (...) {}
}
// ============================================================================
// NVML GPU Clock Lock — dynamically loaded, no build-time nvml.lib needed
// ============================================================================
// NVML type / constant definitions (subset — avoids #include <nvml.h>)
// NOTE: the enum underlying types mirror the real NVML ABI; these types appear
// in the function-pointer signatures resolved from nvml.dll, so do not change
// them.
namespace nvml_types {
    using Return_t = unsigned int; // nvmlReturn_t
    using Device_t = void*; // nvmlDevice_t (opaque pointer)
    enum : unsigned int { SUCCESS = 0 }; // NVML_SUCCESS
    enum ClockType : unsigned int { CLOCK_GRAPHICS = 0 }; // NVML_CLOCK_GRAPHICS
}
/// Lock the GPU graphics clock via NVML so power management cannot down-clock
/// the card between inferences.
/// @param deviceIndex  CUDA/NVML device ordinal to lock.
/// @param requestedMHz 0 = feature disabled; <0 = auto (~80% of max graphics
///                     clock); >0 = explicit target frequency in MHz.
template <typename T>
void Engine<T>::lockGpuClocks(int deviceIndex, int requestedMHz) {
    if (requestedMHz == 0) return; // disabled
#ifdef _WIN32
    // Load nvml.dll from System32 (installed by NVIDIA driver)
    auto hLib = LoadLibraryA("nvml.dll");
    if (!hLib) {
        if (m_verbose) std::cout << "Warning: nvml.dll not found — GPU clock lock unavailable" << std::endl;
        return;
    }
    m_nvmlLib = static_cast<void*>(hLib);
    // Resolve NVML function pointers
    using fn_Init = nvml_types::Return_t(*)();
    using fn_Shutdown = nvml_types::Return_t(*)();
    using fn_GetHandle = nvml_types::Return_t(*)(unsigned int, nvml_types::Device_t*);
    using fn_GetMaxClock = nvml_types::Return_t(*)(nvml_types::Device_t, nvml_types::ClockType, unsigned int*);
    using fn_SetLocked = nvml_types::Return_t(*)(nvml_types::Device_t, unsigned int, unsigned int);
    using fn_ErrorString = const char*(*)(nvml_types::Return_t);
    auto pInit = reinterpret_cast<fn_Init>(GetProcAddress(hLib, "nvmlInit_v2"));
    auto pShutdown = reinterpret_cast<fn_Shutdown>(GetProcAddress(hLib, "nvmlShutdown"));
    auto pGetHandle = reinterpret_cast<fn_GetHandle>(GetProcAddress(hLib, "nvmlDeviceGetHandleByIndex_v2"));
    auto pGetMaxClk = reinterpret_cast<fn_GetMaxClock>(GetProcAddress(hLib, "nvmlDeviceGetMaxClockInfo"));
    auto pSetLocked = reinterpret_cast<fn_SetLocked>(GetProcAddress(hLib, "nvmlDeviceSetGpuLockedClocks"));
    auto pErrStr = reinterpret_cast<fn_ErrorString>(GetProcAddress(hLib, "nvmlErrorString"));
    // FIX: pShutdown is called unconditionally on every error path below, so
    // it must be part of the mandatory symbol check. Previously a missing
    // nvmlShutdown export meant a null-pointer call on the first failure.
    if (!pInit || !pShutdown || !pGetHandle || !pSetLocked) {
        if (m_verbose) std::cout << "Warning: NVML symbols not found — GPU clock lock unavailable" << std::endl;
        FreeLibrary(hLib); m_nvmlLib = nullptr;
        return;
    }
    auto errName = [&](nvml_types::Return_t r) -> const char* {
        return pErrStr ? pErrStr(r) : "unknown";
    };
    nvml_types::Return_t rc = pInit();
    if (rc != nvml_types::SUCCESS) {
        if (m_verbose) std::cout << "Warning: nvmlInit failed: " << errName(rc) << std::endl;
        FreeLibrary(hLib); m_nvmlLib = nullptr;
        return;
    }
    nvml_types::Device_t nvmlDev = nullptr;
    rc = pGetHandle(static_cast<unsigned int>(deviceIndex), &nvmlDev);
    if (rc != nvml_types::SUCCESS) {
        if (m_verbose) std::cout << "Warning: nvmlDeviceGetHandleByIndex(" << deviceIndex << ") failed: " << errName(rc) << std::endl;
        pShutdown(); FreeLibrary(hLib); m_nvmlLib = nullptr;
        return;
    }
    unsigned int targetMHz = 0;
    if (requestedMHz < 0) {
        // Auto mode: query max graphics clock and use ~80%
        // FIX: previously, when nvmlDeviceGetMaxClockInfo was not exported,
        // execution fell through with targetMHz == 0 and asked the driver to
        // lock the GPU at 0 MHz. Bail out instead when the query is
        // unavailable or fails.
        unsigned int maxClock = 0;
        if (pGetMaxClk) {
            rc = pGetMaxClk(nvmlDev, nvml_types::CLOCK_GRAPHICS, &maxClock);
        }
        if (!pGetMaxClk || rc != nvml_types::SUCCESS || maxClock == 0) {
            if (m_verbose) std::cout << "Warning: Could not query max GPU clock — skipping clock lock" << std::endl;
            pShutdown(); FreeLibrary(hLib); m_nvmlLib = nullptr;
            return;
        }
        targetMHz = static_cast<unsigned int>(maxClock * 0.80);
        targetMHz = (targetMHz / 15) * 15; // Round down to 15 MHz step
        if (targetMHz < 210) targetMHz = 210; // keep a sane floor
    } else {
        targetMHz = static_cast<unsigned int>(requestedMHz);
    }
    rc = pSetLocked(nvmlDev, targetMHz, targetMHz);
    if (rc == nvml_types::SUCCESS) {
        m_clocksLocked = true;
        m_nvmlDeviceIdx = static_cast<unsigned int>(deviceIndex);
        // Always emit to DebugView so operators can confirm the lock took
        // effect without needing to read engine-level verbose output.
        // (nvml.dll stays loaded and NVML stays initialized on purpose —
        // unlockGpuClocks() needs both to undo the lock later.)
        ANS_DBG("TRT_Clock",
                "GPU clocks LOCKED at %u MHz (device %d) — P-state will stay high, "
                "no WDDM down-clock between inferences",
                targetMHz, deviceIndex);
        if (m_verbose) std::cout << "Info: GPU clocks locked at " << targetMHz << " MHz (device " << deviceIndex << ")" << std::endl;
    } else {
        // Surface the failure reason + remediation in DebugView. Most common
        // failure is access-denied (requires Administrator) or the driver
        // refusing the requested frequency. Users see this in the log and
        // know to elevate, set NVCP 'Prefer maximum performance', or run
        // `nvidia-smi -lgc <MHz>,<MHz>` before launching.
        ANS_DBG("TRT_Clock",
                "GPU clock lock FAILED (nvml rc=%s) — expect 2-3x inference latency from "
                "WDDM down-clocking. Fix: run as Admin, OR set NVCP 'Prefer maximum "
                "performance' for this app, OR: nvidia-smi -lgc %u,%u",
                errName(rc), targetMHz, targetMHz);
        if (m_verbose) {
            std::cout << "Warning: nvmlDeviceSetGpuLockedClocks failed: " << errName(rc) << std::endl;
            std::cout << " (Run as Administrator, or use: nvidia-smi -lgc " << targetMHz << "," << targetMHz << ")" << std::endl;
        }
        pShutdown(); FreeLibrary(hLib); m_nvmlLib = nullptr;
    }
#else
    // Linux: similar pattern with dlopen("libnvidia-ml.so.1", RTLD_LAZY)
    // Left as TODO — nvidia-smi -lgc works without elevation on Linux.
    if (m_verbose) std::cout << "Info: GPU clock lock not implemented on this platform — use nvidia-smi -lgc" << std::endl;
#endif
}
template <typename T>
void Engine<T>::unlockGpuClocks() {
    // No-op unless lockGpuClocks() previously succeeded and NVML is still
    // loaded; both flags are cleared together on every exit path below.
    if (!m_clocksLocked || !m_nvmlLib) return;
#ifdef _WIN32
    const auto hNvml = static_cast<HMODULE>(m_nvmlLib);
    // Function-pointer signatures for the three NVML entry points we need.
    using fn_GetHandle = nvml_types::Return_t(*)(unsigned int, nvml_types::Device_t*);
    using fn_ResetClk  = nvml_types::Return_t(*)(nvml_types::Device_t);
    using fn_Shutdown  = nvml_types::Return_t(*)();
    const auto pGetHandle = reinterpret_cast<fn_GetHandle>(GetProcAddress(hNvml, "nvmlDeviceGetHandleByIndex_v2"));
    const auto pResetClk  = reinterpret_cast<fn_ResetClk>(GetProcAddress(hNvml, "nvmlDeviceResetGpuLockedClocks"));
    const auto pShutdown  = reinterpret_cast<fn_Shutdown>(GetProcAddress(hNvml, "nvmlShutdown"));
    // Best-effort reset: if either symbol is missing we skip the reset but
    // still shut NVML down and release the library so state stays consistent.
    if (pGetHandle != nullptr && pResetClk != nullptr) {
        nvml_types::Device_t nvmlDev = nullptr;
        const bool gotHandle = (pGetHandle(m_nvmlDeviceIdx, &nvmlDev) == nvml_types::SUCCESS);
        if (gotHandle) {
            pResetClk(nvmlDev);
            if (m_verbose) std::cout << "Info: GPU clocks unlocked (device " << m_nvmlDeviceIdx << ")" << std::endl;
        }
    }
    if (pShutdown) pShutdown();
    FreeLibrary(hNvml);
#endif
    m_nvmlLib = nullptr;
    m_clocksLocked = false;
}
namespace Util {
    /// Returns true iff `filepath` names an existing regular file.
    ///
    /// Fixed: the previous implementation probed existence by opening the
    /// path with std::ifstream, which (a) reports true for directories on
    /// POSIX, (b) reports false for files that exist but are unreadable, and
    /// (c) pays the cost of actually opening the file. A non-throwing
    /// std::filesystem query answers the question directly.
    inline bool doesFileExist(const std::string& filepath) {
        std::error_code ec; // swallow stat errors: "cannot stat" => "does not exist"
        return std::filesystem::is_regular_file(filepath, ec);
    }
    /// Logs a diagnostic when a CUDA runtime call did not return cudaSuccess.
    /// Intentionally non-throwing (best-effort logging only) — callers of the
    /// original also did not expect an exception from this helper.
    inline void checkCudaErrorCode(cudaError_t code) {
        if (code != cudaSuccess) {
            std::string errMsg = "CUDA operation failed with code: " + std::to_string(code) + " (" + cudaGetErrorName(code) +
                                 "), with message: " + cudaGetErrorString(code);
            // Fixed: write to stderr (not stdout) and terminate/flush the
            // line so the message is not lost in buffered or interleaved
            // output if the process dies shortly after.
            std::cerr << errMsg << std::endl;
        }
    }
    /// Returns the full paths of all regular files directly inside `dirPath`
    /// (non-recursive).
    ///
    /// Fixed: directory_iterator throws std::filesystem::filesystem_error on
    /// a missing/non-directory path; guard first and return an empty list in
    /// that case instead of propagating an exception.
    inline std::vector<std::string> getFilesInDirectory(const std::string& dirPath) {
        std::vector<std::string> fileNames;
        std::error_code ec;
        if (!std::filesystem::is_directory(dirPath, ec)) {
            return fileNames; // invalid or absent path yields no entries
        }
        for (const auto& entry : std::filesystem::directory_iterator(dirPath, ec)) {
            if (entry.is_regular_file()) {
                fileNames.push_back(entry.path().string());
            }
        }
        return fileNames;
    }
}
#include "engine/EngineRunInference.inl"
#include "engine/EngineUtilities.inl"
#include "engine/EngineBuildLoadNetwork.inl"
#include "engine/EngineMultiGpu.inl"
// ============================================================================
// Explicit template instantiation declarations -- DLL consumers only
//
// These tell each consuming translation unit: "Engine<float> is already fully
// compiled and exported by the DLL. Do NOT re-instantiate it here; resolve
// the symbols at link time from the DLL's import library instead."
//
// This suppresses redundant code generation in every consuming TU and
// guarantees that only ONE definition of each Engine<T> specialisation exists
// across the whole program (in the DLL).
//
// To add a new exported instantiation (e.g. Engine<uint8_t>):
// 1. Add a matching template class ENGINE_API Engine<uint8_t>; in engine.cpp
// 2. Add the corresponding extern declaration below.
// ============================================================================
#ifndef ENGINE_EXPORTS
// Consumer-side declaration only: the one definition is produced inside the
// DLL, where engine.cpp builds with ENGINE_EXPORTS and provides the matching
// explicit instantiation.
extern template class ENGINE_API Engine<float>;
#endif