Initial setup for CLion
This commit is contained in:
2636
engines/TensorRTAPI/include/engine/EngineBuildLoadNetwork.inl
Normal file
2636
engines/TensorRTAPI/include/engine/EngineBuildLoadNetwork.inl
Normal file
File diff suppressed because it is too large
Load Diff
887
engines/TensorRTAPI/include/engine/EngineMultiGpu.inl
Normal file
887
engines/TensorRTAPI/include/engine/EngineMultiGpu.inl
Normal file
@@ -0,0 +1,887 @@
|
||||
// ============================================================================
// EngineMultiGpu.inl
//
// Multi-GPU inference pool -- merged from MultiGpuEngineManager.h
//
// This file is #included at the bottom of engine.h and must not be compiled
// independently. It provides implementations for all pool-management methods
// declared inside Engine<T>:
//
//   initializePool()           -- build from ONNX, create pool
//   initializePoolFromEngine() -- load pre-built TRT engine, create pool
//   enumerateDevices()         -- static CUDA device enumeration
//   loadSlots()                -- core pool allocation logic (private)
//   runInferenceFromPool()     -- thread-safe slot dispatch (private)
//   getTotalCapacity()         -- inline in engine.h
//   getActiveInferences()      -- inline in engine.h
//   getAvailableSlots()        -- inline in engine.h
//   isAtCapacity()             -- inline in engine.h
//   printCapacityReport()      -- human-readable pool status
// ============================================================================
|
||||
|
||||
// -- Static member definitions for global elastic slot cap --------------------
// Shared across ALL pool instances of a given Engine<T> instantiation:
//  - s_globalElasticCount / s_globalElasticMax throttle elastic slot growth
//    globally (checked in tryGrowPool / runInferenceFromPool).
//  - s_globalCapInitFlag guarantees the cap is computed exactly once.
//  - s_lastPoolCreatedMs timestamps the most recent pool creation; tryGrowPool
//    uses it to defer growth during the elastic grace period.
template <typename T>
std::atomic<int> Engine<T>::s_globalElasticCount{0};
template <typename T>
std::atomic<int> Engine<T>::s_globalElasticMax{32}; // safe default, overwritten on first pool init
template <typename T>
std::once_flag Engine<T>::s_globalCapInitFlag;
template <typename T>
std::atomic<int64_t> Engine<T>::s_lastPoolCreatedMs{0};
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
// enumerateDevices -- static, no model required
|
||||
// ----------------------------------------------------------------------------
|
||||
template <typename T>
|
||||
/*static*/ std::vector<GpuDeviceInfo>
|
||||
Engine<T>::enumerateDevices()
|
||||
{
|
||||
int count = 0;
|
||||
cudaGetDeviceCount(&count);
|
||||
|
||||
std::vector<GpuDeviceInfo> devices;
|
||||
devices.reserve(static_cast<size_t>(count));
|
||||
|
||||
for (int i = 0; i < count; ++i) {
|
||||
cudaDeviceProp prop;
|
||||
cudaGetDeviceProperties(&prop, i);
|
||||
|
||||
cudaSetDevice(i);
|
||||
size_t freeBytes = 0, totalBytes = 0;
|
||||
cudaMemGetInfo(&freeBytes, &totalBytes);
|
||||
|
||||
GpuDeviceInfo info;
|
||||
info.index = i;
|
||||
info.name = prop.name;
|
||||
info.totalMemoryBytes = prop.totalGlobalMem;
|
||||
info.freeMemoryAtInitBytes = freeBytes;
|
||||
info.computeMajor = prop.major;
|
||||
info.computeMinor = prop.minor;
|
||||
info.slotsAllocated = 0;
|
||||
info.memoryPerSlotBytes = 0;
|
||||
devices.push_back(std::move(info));
|
||||
}
|
||||
|
||||
return devices;
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
// Public pool-init wrappers
|
||||
// ----------------------------------------------------------------------------
|
||||
template <typename T>
|
||||
bool Engine<T>::initializePool(
|
||||
const ANSCENTER::Options& baseOptions,
|
||||
const std::string& onnxModelPath,
|
||||
const std::array<float, 3>& subVals,
|
||||
const std::array<float, 3>& divVals,
|
||||
bool normalize,
|
||||
int maxSlotsPerGpu,
|
||||
double memSafetyFactor)
|
||||
{
|
||||
// Apply baseOptions to *this* so that m_options is consistent whether
|
||||
// the user goes through initializePool() or the 6-param buildLoadNetwork().
|
||||
m_options = baseOptions;
|
||||
return buildLoadNetwork(onnxModelPath, subVals, divVals, normalize,
|
||||
maxSlotsPerGpu, memSafetyFactor);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
bool Engine<T>::initializePoolFromEngine(
|
||||
const ANSCENTER::Options& baseOptions,
|
||||
const std::string& trtEnginePath,
|
||||
const std::array<float, 3>& subVals,
|
||||
const std::array<float, 3>& divVals,
|
||||
bool normalize,
|
||||
int maxSlotsPerGpu,
|
||||
double memSafetyFactor)
|
||||
{
|
||||
m_options = baseOptions;
|
||||
return loadNetwork(trtEnginePath, subVals, divVals, normalize,
|
||||
maxSlotsPerGpu, memSafetyFactor);
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------------------
// loadSlots -- core pool allocation logic
//
// Three modes based on maxSlotsPerGpu:
//
//    1 => ROUND-ROBIN (default)
//         1 slot per GPU, created at init. Tasks queue when all slots
//         busy. Best balance of VRAM usage and multi-GPU utilisation.
//         Example: 3 GPUs → 3 slots, round-robin dispatch.
//
//   -1 => ELASTIC MODE
//         Only the probe slot is pre-loaded. Additional slots are created
//         on-demand by tryGrowPool() when concurrent requests arrive, and
//         released by releaseIdleSlots() when idle. Higher throughput but
//         higher VRAM usage — only recommended for large GPUs (≥ 8 GB).
//
//   >1 => PRE-ALLOCATED MODE (explicit cap)
//         Slots are created upfront, capped at maxSlotsPerGpu per GPU.
//         Useful when the caller knows the required concurrency level.
// ----------------------------------------------------------------------------
/// @param baseOptions     Options copied into each slot engine (deviceIndex
///                        is overridden per slot).
/// @param modelPath       ONNX model or serialized TRT engine path, depending
///                        on @p fromOnnx.
/// @param subVals/divVals Preprocessing normalisation constants forwarded to
///                        every slot engine.
/// @param normalize       Forwarded to buildLoadNetwork()/loadNetwork().
/// @param fromOnnx        true → buildLoadNetwork(); false → loadNetwork().
/// @param maxSlotsPerGpu  <= 0 selects elastic mode; > 0 caps slots per GPU.
/// @param memSafetyFactor Fraction of free VRAM considered usable per GPU.
/// @return false when no GPU is present, the probe engine fails to load, or
///         (pre-allocated mode) zero slots could be created.
template <typename T>
bool Engine<T>::loadSlots(
    const ANSCENTER::Options& baseOptions,
    const std::string& modelPath,
    const std::array<float, 3>& subVals,
    const std::array<float, 3>& divVals,
    bool normalize,
    bool fromOnnx,
    int maxSlotsPerGpu,
    double memSafetyFactor)
{
    // -- 1. Enumerate GPUs --------------------------------------------------
    m_deviceInfos = enumerateDevices();

    if (m_deviceInfos.empty()) {
        std::cout << "Error [Pool]: No CUDA-capable GPUs detected" << std::endl;
        return false;
    }

    const bool elastic = (maxSlotsPerGpu <= 0);
    m_elasticMode = elastic;

    // Set global elastic slot cap ONCE based on total GPU VRAM.
    // Budget: ~4 slots per GB. This cap is shared across ALL pools
    // to prevent CUDA driver SRW lock convoy (30+ threads deadlocked).
    // 4 GB → 12, 6 GB → 24, 8 GB → 32, 12 GB → 48, 24 GB → 96
    if (elastic) {
        std::call_once(s_globalCapInitFlag, [this]() {
            int totalGB = 0;
            for (const auto& dev : m_deviceInfos)
                totalGB += static_cast<int>(dev.totalMemoryBytes / (1024ULL * 1024ULL * 1024ULL));
            int cap = std::max(8, totalGB * 4); // minimum 8
            s_globalElasticMax.store(cap);
            std::cout << "Info [Pool]: Global elastic slot cap = "
                << cap << " (total " << totalGB << " GB VRAM x4)" << std::endl;
        });
    }

    std::cout << "\n====================================================" << std::endl;
    std::cout << "Engine Pool Initialization"
        << (elastic ? " [ELASTIC]" : " [PRE-ALLOCATED]") << std::endl;
    std::cout << "====================================================" << std::endl;
    std::cout << "Found " << m_deviceInfos.size() << " GPU(s):" << std::endl;

    for (const auto& d : m_deviceInfos) {
        std::cout << " GPU[" << d.index << "] " << d.name
            << " | SM " << d.computeMajor << "." << d.computeMinor
            << " | Total " << d.totalMemoryBytes / 1048576 << " MiB"
            << " | Free " << d.freeMemoryAtInitBytes / 1048576 << " MiB"
            << std::endl;
    }

    // Warn about heterogeneous GPUs -- TRT engine may not be compatible
    for (size_t i = 1; i < m_deviceInfos.size(); ++i) {
        if (m_deviceInfos[i].name != m_deviceInfos[0].name) {
            std::cout << "Warning [Pool]: GPU[" << i << "] '" << m_deviceInfos[i].name
                << "' differs from GPU[0] '" << m_deviceInfos[0].name
                << "'. TRT engine binary may be incompatible with dissimilar GPUs."
                << std::endl;
        }
    }

    // -- 2. Probe engine: measure per-slot VRAM footprint -------------------
    //
    //    Memory delta = freeBeforeLoad - freeAfterLoad
    //    Includes: TRT runtime buffers, CUDA context overhead, I/O buffers,
    //    stream memory, and workspace allocated by Engine<T>.
    //
    //    MULTI-GPU BALANCING: place the probe on the GPU with the most free
    //    VRAM. This naturally distributes engines across GPUs as each pool
    //    init consumes VRAM from its chosen GPU, making the *other* GPU
    //    the best candidate for the next pool.

    int probeGpuIdx = 0;
    {
        size_t bestFree = 0;
        for (const auto& d : m_deviceInfos) {
            cudaSetDevice(d.index);
            size_t freeNow = 0, totalNow = 0;
            cudaMemGetInfo(&freeNow, &totalNow);
            std::cout << " GPU[" << d.index << "] free VRAM: " << freeNow / 1048576 << " MiB" << std::endl;
            if (freeNow > bestFree) {
                bestFree = freeNow;
                probeGpuIdx = d.index;
            }
        }
    }
    std::cout << "\nLoading probe engine on GPU[" << probeGpuIdx
        << "] (most free VRAM) to measure per-slot memory..." << std::endl;

    cudaSetDevice(probeGpuIdx);
    size_t freeBefore = 0, tmp = 0;
    cudaMemGetInfo(&freeBefore, &tmp);

    ANSCENTER::Options opts0 = baseOptions;
    opts0.deviceIndex = probeGpuIdx;

    auto probeEngine = std::make_unique<Engine<T>>(opts0);
    const bool probeOk = fromOnnx
        ? probeEngine->buildLoadNetwork(modelPath, subVals, divVals, normalize)
        : probeEngine->loadNetwork (modelPath, subVals, divVals, normalize);

    if (!probeOk) {
        logEngineEvent("[Engine] loadSlots FAIL: Probe engine failed on GPU["
            + std::to_string(probeGpuIdx) + "] for " + modelPath
            + " (freeVRAM before=" + std::to_string(freeBefore / 1048576) + " MiB)", true);
        return false;
    }

    size_t freeAfter = 0;
    cudaMemGetInfo(&freeAfter, &tmp);

    // Floor the delta at 64 MiB to guard against measurement noise
    constexpr size_t kMinSlotMemBytes = 64ULL * 1024 * 1024;
    const size_t rawDelta = (freeBefore > freeAfter) ? (freeBefore - freeAfter) : 0ULL;
    const size_t memPerSlot = std::max(rawDelta, kMinSlotMemBytes);

    std::cout << "Info [Pool]: Memory per slot = " << memPerSlot / 1048576
        << " MiB (measured delta = " << rawDelta / 1048576 << " MiB)" << std::endl;

    // Cache input/output tensor dims on *this* Engine so getInputDims() /
    // getOutputDims() work correctly when the pool is the active code path.
    m_inputDims = probeEngine->getInputDims();
    m_outputDims = probeEngine->getOutputDims();

    // Sync GPU-capped batch sizes from the probe engine. The build() function
    // may have reduced maxBatchSize based on GPU VRAM tier; propagate that to
    // the pool manager so callers see the actual runtime limits.
    m_options.maxBatchSize = probeEngine->getOptions().maxBatchSize;
    m_options.optBatchSize = probeEngine->getOptions().optBatchSize;

    // Store per-slot measurement for on-demand growth
    m_memPerSlot = memPerSlot;

    // Promote the probe engine into the first slot on the chosen GPU
    // (slot 0 is never released by releaseIdleSlots()).
    {
        InferenceSlot s;
        s.deviceIndex = probeGpuIdx;
        s.busy = false;
        s.memUsed = memPerSlot;
        s.engine = std::move(probeEngine);
        m_slots.push_back(std::move(s));
    }
    m_deviceInfos[probeGpuIdx].slotsAllocated = 1;
    m_deviceInfos[probeGpuIdx].memoryPerSlotBytes = memPerSlot;

    // -- 3. Store config for on-demand growth (elastic mode) -------------
    // tryGrowPool() replays these exact parameters when creating new slots.
    m_poolModelPath = modelPath;
    m_poolSubVals = subVals;
    m_poolDivVals = divVals;
    m_poolNormalize = normalize;
    m_poolFromOnnx = fromOnnx;
    m_poolSafetyFactor = memSafetyFactor;

    if (elastic) {
        // -- ELASTIC: only the probe slot is pre-loaded -----------------
        std::cout << "Info [Pool]: Elastic mode -- starting with 1 probe slot."
            << " Additional slots will be created on-demand as concurrent"
            << " requests arrive and released when idle." << std::endl;

        m_totalCapacity = 1;
        // Mark creation time — elastic growth is deferred for s_elasticGraceSec
        // to let other models create their probe engines first.
        {
            using namespace std::chrono;
            auto now = duration_cast<milliseconds>(
                steady_clock::now().time_since_epoch()).count();
            s_lastPoolCreatedMs.store(now);
        }
        printCapacityReport();
        startIdleTimer(); // Auto-cleanup idle slots periodically
        return true;
    }

    // -- 4. PRE-ALLOCATED: compute per-GPU capacity, then interleave -----
    //
    //    Phase A: determine how many slots each GPU can hold.
    //    Phase B: create slots in round-robin order across GPUs so that
    //             the linear m_nextSlotHint scan naturally distributes
    //             consecutive requests across GPUs:
    //               m_slots = [GPU0-s0, GPU1-s0, GPU2-s0, GPU0-s1, GPU1-s1, ...]
    //             This gives: Task1→GPU0, Task2→GPU1, Task3→GPU2, Task4→GPU0 ...

    const int numGpus = static_cast<int>(m_deviceInfos.size());

    // Phase A: compute slotsToAdd per GPU
    // NOTE: enumerateDevices() assigns info.index == vector position, so the
    // loop counter di doubles as the CUDA device index here.
    std::vector<int> slotsPerGpu(numGpus, 0);
    int maxSlotsAny = 0;

    for (int di = 0; di < numGpus; ++di) {
        cudaSetDevice(di);
        size_t freeNow = 0, totalNow = 0;
        cudaMemGetInfo(&freeNow, &totalNow);

        const size_t usableBytes = static_cast<size_t>(
            static_cast<double>(freeNow) * memSafetyFactor);

        int slotsToAdd = (memPerSlot > 0)
            ? static_cast<int>(usableBytes / memPerSlot) : 0;

        // Apply explicit per-GPU cap; the probe GPU already has the probe slot
        if (maxSlotsPerGpu > 0) {
            const int budget = (di == probeGpuIdx)
                ? (maxSlotsPerGpu - 1)
                : maxSlotsPerGpu;
            slotsToAdd = std::min(slotsToAdd, budget);
        }

        slotsPerGpu[di] = slotsToAdd;
        if (slotsToAdd > maxSlotsAny) maxSlotsAny = slotsToAdd;
        m_deviceInfos[di].memoryPerSlotBytes = memPerSlot;

        std::cout << "Info [Pool]: GPU[" << di << "] " << m_deviceInfos[di].name
            << " -- free " << freeNow / 1048576 << " MiB"
            << ", usable " << usableBytes / 1048576 << " MiB"
            << " => will add " << slotsToAdd << " slot(s)" << std::endl;
    }

    // Phase B: create slots interleaved across GPUs
    //   Round 0: GPU0-slot0, GPU1-slot0, GPU2-slot0
    //   Round 1: GPU0-slot1, GPU1-slot1, GPU2-slot1
    //   ...
    std::vector<int> slotsCreated(numGpus, 0);  // track actual success per GPU
    std::vector<bool> gpuFailed(numGpus, false); // stop trying failed GPUs

    for (int round = 0; round < maxSlotsAny; ++round) {
        for (int di = 0; di < numGpus; ++di) {
            if (gpuFailed[di]) continue;
            if (slotsCreated[di] >= slotsPerGpu[di]) continue;

            cudaSetDevice(di);
            ANSCENTER::Options opts = baseOptions;
            opts.deviceIndex = di;

            auto eng = std::make_unique<Engine<T>>(opts);
            eng->setVerbose(false);
            eng->setDisableGraphs(true); // concurrent graph captures corrupt CUDA context
            eng->m_skipEngineCache = m_skipEngineCache; // propagate to pool slots
            const bool ok = fromOnnx
                ? eng->buildLoadNetwork(modelPath, subVals, divVals, normalize)
                : eng->loadNetwork (modelPath, subVals, divVals, normalize);

            if (!ok) {
                // A failed load usually means this GPU is out of VRAM; stop
                // allocating on it but keep filling the others.
                std::cout << "Warning [Pool]: GPU[" << di << "] slot "
                    << (slotsCreated[di] + 1) << "/" << slotsPerGpu[di]
                    << " failed to load; halting allocation on this device." << std::endl;
                gpuFailed[di] = true;
                continue;
            }

            InferenceSlot slot;
            slot.deviceIndex = di;
            slot.busy = false;
            slot.memUsed = memPerSlot;
            slot.engine = std::move(eng);
            m_slots.push_back(std::move(slot));
            m_deviceInfos[di].slotsAllocated++;
            slotsCreated[di]++;
        }
    }

    m_totalCapacity = static_cast<int>(m_slots.size());
    printCapacityReport();

    if (m_totalCapacity == 0) {
        std::cout << "Error [Pool]: Zero inference slots allocated -- "
            "check available GPU memory." << std::endl;
        return false;
    }

    return true;
}
|
||||
|
||||
// ----------------------------------------------------------------------------
// tryGrowPool -- on-demand slot creation (elastic mode)
//
// Called by runInferenceFromPool when every alive slot is busy.
// Creates ONE new engine on the first GPU that has enough free VRAM.
// Candidates are sorted by free VRAM descending, so the least-loaded GPU
// is tried first.
//
// Returns a pointer to the new slot (already marked busy) or nullptr if
// no GPU has enough VRAM, the global cap is reached, or the elastic grace
// period is still active.
//
// Thread-safety: m_growMutex serialises growth so only one thread creates
// a slot at a time. m_slotMutex is acquired briefly to push the new slot
// into the deque. The calling thread waits (engine deserialisation takes
// ~0.5-3 s), but that is far better than rejecting the request entirely.
//
// @param bypassGrace  true when a consumer explicitly requested capacity
//                     (growPool()); skips the grace-period check.
// ----------------------------------------------------------------------------
template <typename T>
typename Engine<T>::InferenceSlot*
Engine<T>::tryGrowPool(bool bypassGrace)
{
    std::lock_guard<std::mutex> growLock(m_growMutex);

    // Grace period: defer elastic growth for s_elasticGraceSec after the most
    // recent pool creation. This reserves VRAM for probe engines that haven't
    // been created yet (e.g., 10 models loading sequentially — early pools
    // shouldn't grow elastic slots while later probes still need VRAM).
    // Bypassed for demand-driven growth (a new consumer explicitly joined the
    // pool, so we KNOW more slots are needed).
    if (!bypassGrace) {
        using namespace std::chrono;
        auto now = duration_cast<milliseconds>(
            steady_clock::now().time_since_epoch()).count();
        int64_t lastCreated = s_lastPoolCreatedMs.load();
        int64_t elapsedSec = (now - lastCreated) / 1000;
        if (lastCreated > 0 && elapsedSec < s_elasticGraceSec) {
            // Silently skip — don't spam logs during grace period
            return nullptr;
        }
    }

    // Global cap: prevent too many concurrent CUDA operations across ALL pools.
    // With shared engine pools, unlimited elastic growth causes CUDA driver
    // SRW lock convoy (30+ threads all blocked on nvcuda64 internal locks).
    const int currentGlobal = s_globalElasticCount.load();
    const int maxGlobal = s_globalElasticMax.load();
    if (currentGlobal >= maxGlobal) {
        std::cout << "Info [Pool]: tryGrowPool -- global cap reached ("
            << currentGlobal << "/" << maxGlobal
            << " total slots), not growing" << std::endl;
        return nullptr;
    }

    // Find the GPU with the most free VRAM that has enough for one more slot.
    // This naturally balances load across GPUs instead of always filling GPU 0.
    // requiredBytes re-inflates the measured per-slot footprint by the safety
    // factor so growth uses the same headroom rule as initial allocation.
    const size_t requiredBytes = (m_poolSafetyFactor > 0.0)
        ? static_cast<size_t>(static_cast<double>(m_memPerSlot) / m_poolSafetyFactor)
        : m_memPerSlot;

    std::cout << "Info [Pool]: tryGrowPool called -- need " << (requiredBytes >> 20)
        << " MiB per slot, scanning " << m_deviceInfos.size() << " GPU(s)..."
        << std::endl;

    // Sort device candidates by free VRAM descending (most free first)
    std::vector<std::pair<size_t, int>> gpuByFreeVram; // {freeBytes, deviceIndex}
    for (const auto& dev : m_deviceInfos) {
        cudaSetDevice(dev.index);
        size_t freeNow = 0, totalNow = 0;
        cudaMemGetInfo(&freeNow, &totalNow);
        std::cout << "Info [Pool]: GPU[" << dev.index << "] free=" << (freeNow >> 20)
            << " MiB, required=" << (requiredBytes >> 20) << " MiB"
            << (freeNow >= requiredBytes ? " -> CANDIDATE" : " -> SKIP (not enough)")
            << std::endl;
        if (freeNow >= requiredBytes) {
            gpuByFreeVram.push_back({freeNow, dev.index});
        }
    }
    std::sort(gpuByFreeVram.begin(), gpuByFreeVram.end(),
        [](const auto& a, const auto& b) { return a.first > b.first; });

    if (gpuByFreeVram.empty()) {
        std::cout << "Warning [Pool]: tryGrowPool -- no GPU has enough free VRAM ("
            << (requiredBytes >> 20) << " MiB), cannot grow" << std::endl;
        return nullptr;
    }

    for (const auto& [freeVram, devIdx] : gpuByFreeVram) {
        auto& dev = m_deviceInfos[devIdx];

        std::cout << "Info [Pool]: Creating on-demand slot on GPU[" << dev.index
            << "] (free=" << (freeVram >> 20) << " MiB)..." << std::endl;

        // Create a new engine on the GPU with the most free VRAM,
        // replaying the configuration captured by loadSlots().
        cudaSetDevice(dev.index);
        ANSCENTER::Options opts = m_options;
        opts.deviceIndex = dev.index;

        auto eng = std::make_unique<Engine<T>>(opts);
        eng->setVerbose(false);
        eng->setDisableGraphs(true); // concurrent graph captures corrupt CUDA context
        eng->m_skipEngineCache = m_skipEngineCache; // propagate to on-demand slots
        eng->m_skipOnnxRebuild = true; // elastic growth must NOT delete/rebuild engine files
        eng->m_skipOnnxBuild = bypassGrace; // demand-driven growth: skip ONNX→TRT if no cached engine
        const bool ok = m_poolFromOnnx
            ? eng->buildLoadNetwork(m_poolModelPath, m_poolSubVals,
                                    m_poolDivVals, m_poolNormalize)
            : eng->loadNetwork(m_poolModelPath, m_poolSubVals,
                               m_poolDivVals, m_poolNormalize);

        if (!ok) {
            std::cout << "Warning [Pool]: On-demand slot creation FAILED on GPU["
                << dev.index << "]" << std::endl;
            continue; // try next GPU
        }
        std::cout << "Info [Pool]: On-demand slot engine loaded OK on GPU["
            << dev.index << "]" << std::endl;

        // Check if we can reuse a dead slot entry (engine == nullptr)
        {
            std::lock_guard<std::mutex> slotLock(m_slotMutex);

            for (auto& s : m_slots) {
                if (!s.engine) { // dead entry -- recycle it
                    s.deviceIndex = dev.index;
                    s.busy = true; // pre-claimed for the caller
                    s.memUsed = m_memPerSlot;
                    s.engine = std::move(eng);
                    s.lastUsedTime = std::chrono::steady_clock::now();
                    dev.slotsAllocated++;
                    // Recount alive slots
                    int alive = 0;
                    for (const auto& x : m_slots) { if (x.engine) ++alive; }
                    m_totalCapacity = alive;
                    s_globalElasticCount++;
                    std::cout << "Info [Pool]: On-demand slot recycled on GPU["
                        << dev.index << "] -- pool now " << m_totalCapacity
                        << " slot(s) (global " << s_globalElasticCount.load()
                        << "/" << s_globalElasticMax.load() << ")" << std::endl;
                    return &s;
                }
            }

            // No dead entries to recycle -- push a new one.
            // std::deque::push_back does NOT invalidate references to existing
            // elements, so pointers held by other threads remain valid.
            InferenceSlot newSlot;
            newSlot.deviceIndex = dev.index;
            newSlot.busy = true; // pre-claimed for the caller
            newSlot.memUsed = m_memPerSlot;
            newSlot.engine = std::move(eng);
            newSlot.lastUsedTime = std::chrono::steady_clock::now();
            m_slots.push_back(std::move(newSlot));
            dev.slotsAllocated++;
            m_totalCapacity = static_cast<int>(m_slots.size()); // all alive here
            s_globalElasticCount++;

            std::cout << "Info [Pool]: On-demand slot created on GPU["
                << dev.index << "] -- pool now " << m_totalCapacity
                << " slot(s) (global " << s_globalElasticCount.load()
                << "/" << s_globalElasticMax.load() << ")" << std::endl;

            return &m_slots.back();
        }
    }

    return nullptr; // every GPU is full
}
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
// growPool -- public demand-driven growth (bypasses grace period)
|
||||
// ----------------------------------------------------------------------------
|
||||
template <typename T>
|
||||
int Engine<T>::growPool(int count)
|
||||
{
|
||||
int created = 0;
|
||||
for (int i = 0; i < count; ++i) {
|
||||
auto* slot = tryGrowPool(/*bypassGrace=*/true);
|
||||
if (!slot) break;
|
||||
// Release so inference threads can use it
|
||||
{
|
||||
std::lock_guard<std::mutex> lk(m_slotMutex);
|
||||
slot->busy = false;
|
||||
slot->lastUsedTime = std::chrono::steady_clock::now();
|
||||
}
|
||||
m_slotFreeCv.notify_one();
|
||||
++created;
|
||||
}
|
||||
return created;
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------------------
// runInferenceFromPool -- thread-safe slot dispatch
//
// @param inputs          Per-batch, per-input GPU image tensors forwarded to
//                        the chosen slot's runInference().
// @param featureVectors  Output feature vectors filled by the slot engine.
// @return the slot engine's runInference() result, or false when no slot
//         became free within the 2 s timeout or runInference threw.
// ----------------------------------------------------------------------------
template <typename T>
bool Engine<T>::runInferenceFromPool(
    const std::vector<std::vector<cv::cuda::GpuMat>>& inputs,
    std::vector<std::vector<std::vector<T>>>& featureVectors)
{
    // -- 1. Acquire an idle, alive slot (round-robin) --------------------
    //
    // Round-robin starting point avoids always favouring GPU 0. Each call
    // advances m_nextSlotHint so consecutive requests spread across GPUs.
    // The mutex is held only for the O(N) scan + flag flip -- NOT during GPU
    // execution -- so threads using different slots proceed in parallel.
    //
    // PROACTIVE GROWTH (elastic mode):
    //   If all alive slots are busy when a request arrives, the pool is
    //   undersized for the current concurrency level. We kick off pool
    //   growth (tryGrowPool) in a detached background thread while we
    //   wait for the current slot to free. This ensures multi-GPU
    //   utilisation: the new slot lands on the GPU with the most free
    //   VRAM (typically GPU[1]) and is ready for the *next* request.
    //   Growth is serialised by m_growMutex so duplicate threads are
    //   harmless — the second one finds a fresh slot immediately.
    InferenceSlot* slot = nullptr;
    bool kickedGrowth = false;

    {
        std::unique_lock<std::mutex> lock(m_slotMutex);

        // Hard deadline for the whole acquisition attempt.
        const auto deadline = std::chrono::steady_clock::now()
            + std::chrono::milliseconds(2000);

        // Spurious CV wakeups are harmless: the loop simply re-scans.
        while (!slot) {
            const size_t n = m_slots.size();
            if (n > 0) {
                const size_t start = m_nextSlotHint.load() % n;
                for (size_t i = 0; i < n; ++i) {
                    auto& s = m_slots[(start + i) % n];
                    if (!s.busy && s.engine) { // alive and idle
                        s.busy = true;
                        slot = &s;
                        m_nextSlotHint = (start + i + 1) % n;
                        break;
                    }
                }
            }

            if (!slot) {
                // All slots busy. In elastic mode, proactively grow the
                // pool in the background so the next request has a slot
                // on a different GPU. We only kick once per wait cycle.
                if (m_elasticMode && !kickedGrowth
                    && s_globalElasticCount.load() < s_globalElasticMax.load()) {
                    kickedGrowth = true;
                    std::cout << "Info [Pool]: All slots busy -- kicking background growth thread"
                        << std::endl;
                    // Fire-and-forget: tryGrowPool is serialised by
                    // m_growMutex, so concurrent kicks are safe.
                    std::thread([this]() {
                        std::cout << "Info [Pool]: Background growth thread started" << std::endl;
                        auto* newSlot = this->tryGrowPool();
                        if (newSlot) {
                            // Slot was created pre-marked busy; release it
                            // so the next requester can claim it.
                            {
                                std::lock_guard<std::mutex> lk(m_slotMutex);
                                newSlot->busy = false;
                                newSlot->lastUsedTime = std::chrono::steady_clock::now();
                            }
                            m_slotFreeCv.notify_all();
                            std::cout << "Info [Pool]: Background growth SUCCEEDED -- new slot on GPU["
                                << newSlot->deviceIndex << "], pool now "
                                << m_totalCapacity << " slot(s)" << std::endl;
                        } else {
                            std::cout << "Warning [Pool]: Background growth FAILED -- no slot created"
                                << std::endl;
                        }
                    }).detach();
                }

                // Wait for a running slot to finish and signal us
                if (m_slotFreeCv.wait_until(lock, deadline)
                    == std::cv_status::timeout) {
                    break; // fall through to reject
                }
            }
        }
    }

    // -- 3. Still no slot => reject ---------------------------------------
    if (!slot) {
        std::string errMsg = "[Engine] runInferenceFromPool FAIL: Capacity reached -- all "
            + std::to_string(m_activeCount.load()) + "/" + std::to_string(m_totalCapacity)
            + " slot(s) busy"
            + (m_elasticMode ? " and all GPUs full" : "")
            + ". Request rejected (2s timeout).";
        std::cout << errMsg << std::endl;
        logEngineEvent(errMsg, true);
        return false;
    }

    ++m_activeCount;

    // -- Exception guard: guarantee busy-flag and activeCount are restored ----
    // If runInference() throws (cv::Exception, std::bad_alloc, ...) the slot
    // must be released and the counter decremented -- otherwise the slot is
    // permanently lost and capacity shrinks with every exception. The manual
    // try/catch below implements that guarantee; release happens after it
    // regardless of outcome.
    bool result = false;
    try {
        // Match the calling thread's CUDA context to the slot's device.
        // Skip the call if the thread is already on the correct device
        // (cudaSetDevice under WDDM can cost 1-5ms per call).
        int currentDev = -1;
        cudaGetDevice(&currentDev);
        if (currentDev != slot->deviceIndex) {
            cudaSetDevice(slot->deviceIndex);
        }
        result = slot->engine->runInference(inputs, featureVectors);
    }
    catch (const std::exception& ex) {
        std::cout << "Error [Pool]: runInference threw: " << ex.what() << std::endl;
    }
    catch (...) {
        std::cout << "Error [Pool]: runInference threw unknown exception" << std::endl;
    }

    // Release the slot under the lock, then wake one waiter.
    {
        std::lock_guard<std::mutex> lock(m_slotMutex);
        slot->busy = false;
        slot->lastUsedTime = std::chrono::steady_clock::now();
    }
    --m_activeCount;
    m_slotFreeCv.notify_one(); // wake one thread waiting for a free slot

    return result;
}
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
// releaseIdleSlots -- VRAM reclamation for elastic pools
|
||||
//
|
||||
// Destroys engine instances that have been idle for at least `idleSeconds`.
|
||||
// The first slot (probe, index 0) is never released so the model remains
|
||||
// instantly usable without re-measurement.
|
||||
//
|
||||
// Dead slots are NOT erased from the deque (to avoid invalidating pointers);
|
||||
// their engine is reset to nullptr and they are recycled by tryGrowPool().
|
||||
//
|
||||
// Call from a periodic background timer, e.g. every 10-30 seconds:
|
||||
// engine->releaseIdleSlots(30.0);
|
||||
// ----------------------------------------------------------------------------
|
||||
template <typename T>
|
||||
int Engine<T>::releaseIdleSlots(double idleSeconds)
|
||||
{
|
||||
std::lock_guard<std::mutex> growLock(m_growMutex);
|
||||
std::lock_guard<std::mutex> slotLock(m_slotMutex);
|
||||
|
||||
const auto now = std::chrono::steady_clock::now();
|
||||
int released = 0;
|
||||
|
||||
// Skip index 0 -- that's the probe slot, always kept alive
|
||||
for (size_t i = 1; i < m_slots.size(); ++i) {
|
||||
auto& s = m_slots[i];
|
||||
if (!s.busy && s.engine) { // alive and idle
|
||||
const double idle = std::chrono::duration<double>(
|
||||
now - s.lastUsedTime).count();
|
||||
if (idle >= idleSeconds) {
|
||||
// Update device info
|
||||
for (auto& dev : m_deviceInfos) {
|
||||
if (dev.index == s.deviceIndex) {
|
||||
if (dev.slotsAllocated > 0) dev.slotsAllocated--;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
std::cout << "Info [Pool]: Releasing idle slot on GPU["
|
||||
<< s.deviceIndex << "] (idle "
|
||||
<< static_cast<int>(idle) << "s)" << std::endl;
|
||||
|
||||
// Destroy engine -- frees GPU memory.
|
||||
// The InferenceSlot entry stays in the deque (dead) for reuse.
|
||||
s.engine.reset();
|
||||
s.memUsed = 0;
|
||||
released++;
|
||||
s_globalElasticCount--;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Recount alive slots
|
||||
int alive = 0;
|
||||
for (const auto& s : m_slots) { if (s.engine) ++alive; }
|
||||
m_totalCapacity = alive;
|
||||
|
||||
if (released > 0) {
|
||||
std::cout << "Info [Pool]: Released " << released << " idle slot(s)"
|
||||
<< " -- pool now " << m_totalCapacity << " alive slot(s)"
|
||||
<< std::endl;
|
||||
}
|
||||
|
||||
return released;
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
// printCapacityReport
|
||||
// ----------------------------------------------------------------------------
|
||||
template <typename T>
|
||||
void Engine<T>::printCapacityReport() const
|
||||
{
|
||||
// Count alive vs dead -- lock protects against concurrent tryGrowPool
|
||||
std::lock_guard<std::mutex> lock(m_slotMutex);
|
||||
int alive = 0, dead = 0;
|
||||
for (const auto& s : m_slots) {
|
||||
if (s.engine) ++alive; else ++dead;
|
||||
}
|
||||
|
||||
std::cout << "\n=====================================================" << std::endl;
|
||||
std::cout << " Engine Pool -- Capacity Report"
|
||||
<< (m_elasticMode ? " [ELASTIC]" : " [PRE-ALLOCATED]") << std::endl;
|
||||
std::cout << "=====================================================" << std::endl;
|
||||
std::cout << " Alive inference slots : " << alive << std::endl;
|
||||
if (dead > 0)
|
||||
std::cout << " Dead (recyclable) : " << dead << std::endl;
|
||||
std::cout << " Active inferences : " << m_activeCount.load() << std::endl;
|
||||
std::cout << " Available slots : "
|
||||
<< (alive - m_activeCount.load())
|
||||
<< (m_elasticMode ? " (+ on-demand)" : "")
|
||||
<< std::endl;
|
||||
if (m_elasticMode) {
|
||||
std::cout << " Global slot usage : "
|
||||
<< s_globalElasticCount.load() << "/" << s_globalElasticMax.load()
|
||||
<< " (across all pools)" << std::endl;
|
||||
}
|
||||
std::cout << " Memory per slot : " << m_memPerSlot / 1048576 << " MiB" << std::endl;
|
||||
std::cout << "-----------------------------------------------------" << std::endl;
|
||||
for (const auto& d : m_deviceInfos) {
|
||||
std::cout << " GPU[" << d.index << "] " << d.name
|
||||
<< " | SM " << d.computeMajor << "." << d.computeMinor
|
||||
<< " | Total " << d.totalMemoryBytes / 1048576 << " MiB"
|
||||
<< " | Slots: " << d.slotsAllocated
|
||||
<< " | Mem/slot: " << d.memoryPerSlotBytes / 1048576 << " MiB"
|
||||
<< std::endl;
|
||||
}
|
||||
std::cout << "=====================================================" << std::endl;
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
// startIdleTimer / stopIdleTimer -- automatic idle-slot cleanup
|
||||
//
|
||||
// A background thread wakes every m_idleTimerIntervalSec seconds and calls
|
||||
// releaseIdleSlots(m_idleTimerThresholdSec). The thread uses a
|
||||
// condition_variable with a timed wait so that stopIdleTimer() can wake it
|
||||
// immediately for a clean shutdown (no dangling sleeps).
|
||||
//
|
||||
// Only active in elastic mode -- pre-allocated pools have fixed capacity.
|
||||
// ----------------------------------------------------------------------------
|
||||
template <typename T>
|
||||
void Engine<T>::startIdleTimer()
|
||||
{
|
||||
if (!m_elasticMode) return; // no-op for pre-allocated pools
|
||||
if (m_idleTimerThread.joinable()) return; // already running
|
||||
|
||||
m_idleTimerStop = false;
|
||||
|
||||
m_idleTimerThread = std::thread([this]() {
|
||||
std::cout << "Info [Pool]: Idle-slot cleanup timer started "
|
||||
<< "(interval=" << m_idleTimerIntervalSec << "s, threshold="
|
||||
<< m_idleTimerThresholdSec << "s)" << std::endl;
|
||||
|
||||
while (!m_idleTimerStop.load()) {
|
||||
// Sleep for the interval, but wake early if stop is signalled
|
||||
{
|
||||
std::unique_lock<std::mutex> lk(m_idleTimerMutex);
|
||||
m_idleTimerCv.wait_for(lk,
|
||||
std::chrono::duration<double>(m_idleTimerIntervalSec),
|
||||
[this]() { return m_idleTimerStop.load(); });
|
||||
}
|
||||
|
||||
if (m_idleTimerStop.load()) break;
|
||||
|
||||
releaseIdleSlots(m_idleTimerThresholdSec);
|
||||
}
|
||||
|
||||
std::cout << "Info [Pool]: Idle-slot cleanup timer stopped." << std::endl;
|
||||
});
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void Engine<T>::stopIdleTimer()
|
||||
{
|
||||
if (!m_idleTimerThread.joinable()) return; // not running
|
||||
|
||||
m_idleTimerStop = true;
|
||||
m_idleTimerCv.notify_all(); // wake the sleeping thread immediately
|
||||
|
||||
// During ExitProcess, worker threads are already killed by the OS.
|
||||
// Calling join() on a dead thread deadlocks or causes std::terminate.
|
||||
// Detach instead — the OS will reclaim everything momentarily.
|
||||
if (g_processExiting().load(std::memory_order_relaxed)) {
|
||||
m_idleTimerThread.detach();
|
||||
} else {
|
||||
m_idleTimerThread.join(); // normal path: wait for clean exit
|
||||
}
|
||||
}
|
||||
431
engines/TensorRTAPI/include/engine/EnginePoolManager.h
Normal file
431
engines/TensorRTAPI/include/engine/EnginePoolManager.h
Normal file
@@ -0,0 +1,431 @@
|
||||
#pragma once
|
||||
// EnginePoolManager.h — Process-wide cache for shared Engine<T> pool instances.
|
||||
//
|
||||
// When multiple AI tasks load the same model (same ONNX path + GPU + config),
|
||||
// this manager ensures they share a SINGLE Engine<T> pool instead of each task
|
||||
// creating its own pool with independent execution contexts and VRAM buffers.
|
||||
//
|
||||
// Without sharing: N tasks × ~500 MB = N × 500 MB VRAM (OOM at ~5-8 tasks on 8GB GPU)
|
||||
// With sharing: 1 pool × ~500 MB = 500 MB total (unlimited tasks, slower via queuing)
|
||||
//
|
||||
// Lazy eviction: when refcount drops to 0, the pool is kept alive for
|
||||
// kEvictGraceSec seconds. If a new task acquires it within that window,
|
||||
// it gets an instant HIT without rebuilding. This handles the LabView
|
||||
// edit/duplicate/create cycle (destroy → recreate) gracefully.
|
||||
//
|
||||
// Thread-safety: All public methods are mutex-protected.
|
||||
|
||||
#include <memory>
|
||||
#include <mutex>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <array>
|
||||
#include <iostream>
|
||||
#include <functional>
|
||||
#include <chrono>
|
||||
#include <thread>
|
||||
#include <atomic>
|
||||
#include <cuda_runtime.h>
|
||||
#include "TRTEngineCache.h" // constructor touches TRTEngineCache::instance() for destruction ordering
|
||||
#ifdef _WIN32
|
||||
#include <windows.h>
|
||||
#endif
|
||||
|
||||
// Forward declare Engine<T> to avoid circular includes.
|
||||
// The header that includes this must also include engine.h.
|
||||
template <typename T> class Engine;
|
||||
|
||||
namespace ANSCENTER { struct Options; }
|
||||
|
||||
// A process-wide, reference-counted cache of shared Engine<T> pools, keyed by
// (model path, precision, max batch). All public methods lock m_mutex; the
// class is reached only through the Meyers singleton instance().
template <typename T>
class EnginePoolManager {
public:
    /// Meyers singleton -- constructed on first use, destroyed during static
    /// teardown (after TRTEngineCache; see the constructor note below).
    static EnginePoolManager& instance() {
        static EnginePoolManager s_instance;
        return s_instance;
    }

    // ========================================================================
    // Cache key — uniquely identifies a compatible Engine pool.
    // ========================================================================
    struct PoolKey {
        std::string modelPath;  // path of the model this pool was built from
        int precision = 0;      // cast from Precision enum
        int maxBatch = 1;       // maximum batch size the pool supports

        bool operator==(const PoolKey& o) const {
            return modelPath == o.modelPath &&
                   precision == o.precision &&
                   maxBatch == o.maxBatch;
        }
    };

    // Hash functor so PoolKey can be an unordered_map key.
    // NOTE(review): the XOR-with-shift mixing is weak (precision/maxBatch
    // only perturb a few bits), but collisions only slow lookup -- the map
    // still resolves them via operator==.
    struct PoolKeyHash {
        size_t operator()(const PoolKey& k) const {
            size_t h = std::hash<std::string>{}(k.modelPath);
            h ^= std::hash<int>{}(k.precision) << 16;
            h ^= std::hash<int>{}(k.maxBatch) << 24;
            return h;
        }
    };

    // ========================================================================
    // acquire() — get or create a shared Engine pool.
    //
    // On first call for a given key: creates a new Engine<T>, calls
    // buildLoadNetwork with the provided parameters, and caches it.
    //
    // On subsequent calls (or within lazy-eviction grace period):
    //   returns the existing shared_ptr and increments refcount.
    //   No VRAM allocated, near-instant.
    //
    // Returns nullptr if engine creation/loading fails.
    //
    // Failure handling is a 4-step retry ladder (see inline comments):
    //   elastic build -> force-evict + retry -> lightweight retry ->
    //   3s delayed final lightweight retry.
    // ========================================================================
    std::shared_ptr<Engine<T>> acquire(
        const PoolKey& key,
        const ANSCENTER::Options& options,
        const std::string& modelPath,
        const std::array<float, 3>& subVals,
        const std::array<float, 3>& divVals,
        bool normalize,
        int maxSlotsPerGpu)
    {
        // Optimizer / temporary engines: maxSlotsPerGpu==0 means the caller
        // only needs a lightweight, non-shared engine (e.g., OptimizeModelStr).
        // Bypass the pool cache entirely:
        //   - Don't hold m_mutex (which blocks ALL other pool creation)
        //   - Don't cache the result (temporary engine is destroyed on release)
        //   - Use the simple 4-param buildLoadNetwork (no pool, no probe, no VRAM measurement)
        // Note: maxSlotsPerGpu==1 is now the normal "1 slot per GPU" multi-GPU
        // round-robin mode, so it goes through the pool path below.
        if (maxSlotsPerGpu == 0) {
            logEvent("[EnginePoolManager] BYPASS (maxSlots=0): " + key.modelPath
                + " — creating non-shared engine");
            auto engine = std::make_shared<Engine<T>>(options);
            bool ok = engine->buildLoadNetwork(modelPath, subVals, divVals, normalize);
            return ok ? engine : nullptr;
        }

        std::unique_lock<std::mutex> lock(m_mutex);

        auto it = m_pools.find(key);
        if (it != m_pools.end()) {
            it->second.refcount++;
            it->second.evictTime = TimePoint{}; // cancel pending eviction
            int refs = it->second.refcount;
            auto engine = it->second.engine;
            logEvent("[EnginePoolManager] HIT: " + key.modelPath
                + " refs=" + std::to_string(refs));

            // Demand-driven growth: only in elastic mode (maxSlotsPerGpu <= 0
            // or > 1). With maxSlotsPerGpu==1 (round-robin default), the pool
            // already has the right number of slots (1 per GPU) — tasks queue
            // when all slots are busy, which is the intended behavior.
            if (maxSlotsPerGpu != 1 && refs > 1 && engine) {
                int alive = engine->getTotalCapacity();
                if (alive < refs) {
                    // Check total GPU VRAM — skip growth on small GPUs
                    size_t totalVram = 0;
                    {
                        size_t freeTmp = 0;
                        cudaSetDevice(options.deviceIndex);
                        cudaMemGetInfo(&freeTmp, &totalVram);
                    }
                    constexpr size_t kMinVramForGrowth = 6ULL * 1024 * 1024 * 1024; // 6 GB
                    if (totalVram >= kMinVramForGrowth) {
                        lock.unlock(); // release PoolManager lock before growing
                        // Grow on a detached thread so acquire() returns
                        // immediately; the lambda owns a shared_ptr copy so
                        // the engine outlives the growth even if released.
                        std::thread([engine, alive, refs, modelPath = key.modelPath]() {
                            int created = engine->growPool(1);
                            if (created > 0) {
                                logEngineEvent("[EnginePoolManager] DEMAND GROWTH: " + modelPath
                                    + " grew from " + std::to_string(alive)
                                    + " to " + std::to_string(engine->getTotalCapacity())
                                    + " slots (refs=" + std::to_string(refs) + ")");
                            }
                        }).detach();
                    } else {
                        logEvent("[EnginePoolManager] SKIP GROWTH: " + key.modelPath
                            + " (GPU VRAM " + std::to_string(totalVram >> 20)
                            + " MiB < 6 GB threshold, refs=" + std::to_string(refs) + ")");
                    }
                }
            }

            return engine;
        }

        // Cache miss — create new Engine pool
        logEvent("[EnginePoolManager] MISS: Creating pool for " + key.modelPath + "...");

        // Log VRAM before attempting to create probe
        {
            size_t freeMem = 0, totalMem = 0;
            cudaSetDevice(options.deviceIndex);
            cudaMemGetInfo(&freeMem, &totalMem);
            logEvent("[EnginePoolManager] GPU[" + std::to_string(options.deviceIndex)
                + "] VRAM: " + std::to_string(freeMem >> 20) + " MiB free / "
                + std::to_string(totalMem >> 20) + " MiB total (before probe)");
        }

        auto engine = std::make_shared<Engine<T>>(options);
        bool ok = engine->buildLoadNetwork(modelPath, subVals, divVals, normalize, maxSlotsPerGpu);
        if (!ok) {
            // Step 1: Force-evict all pools with refcount=0 to reclaim VRAM
            int evicted = forceEvictPending();
            if (evicted > 0) {
                size_t freeMem2 = 0, totalMem2 = 0;
                cudaSetDevice(options.deviceIndex);
                cudaMemGetInfo(&freeMem2, &totalMem2);
                logEvent("[EnginePoolManager] RETRY EVICT: Force-evicted " + std::to_string(evicted)
                    + " pending pool(s), now " + std::to_string(freeMem2 >> 20)
                    + " MiB free. Retrying " + key.modelPath + "...");

                engine = std::make_shared<Engine<T>>(options);
                ok = engine->buildLoadNetwork(modelPath, subVals, divVals, normalize, maxSlotsPerGpu);
            }

            // Step 2: If still failing, retry with lightweight mode (no elastic pool).
            // The elastic probe does heavy warmup (batch 1-8, 10+ iterations) which
            // consumes ~300-500 MB vs ~50-100 MB for a simple loadNetwork.
            // Lightweight mode: tasks queue for a single shared slot — slower but works.
            if (!ok) {
                size_t freeMem3 = 0, totalMem3 = 0;
                cudaSetDevice(options.deviceIndex);
                cudaMemGetInfo(&freeMem3, &totalMem3);
                logEvent("[EnginePoolManager] RETRY LIGHTWEIGHT: Elastic probe failed, "
                    + std::to_string(freeMem3 >> 20) + " MiB free. "
                    "Retrying with single-slot mode for " + key.modelPath + "...");

                engine = std::make_shared<Engine<T>>(options);
                ok = engine->buildLoadNetwork(modelPath, subVals, divVals, normalize);
            }

            // Step 3: If still failing, wait briefly and retry.
            // Transient failures can occur when:
            //   - TRT engine file is being written by another build (partial file)
            //   - CUDA driver has temporary resource contention during multi-pool startup
            //   - GPU memory fragmentation resolves after previous allocations settle
            // Evidence: FireSmoke/detector.onnx failed at 3740 MiB free, then
            // succeeded 4 seconds later at 3154 MiB free (less VRAM!).
            if (!ok) {
                size_t freeMem4 = 0, totalMem4 = 0;
                cudaSetDevice(options.deviceIndex);
                cudaMemGetInfo(&freeMem4, &totalMem4);
                logEvent("[EnginePoolManager] RETRY DELAYED: All attempts failed with "
                    + std::to_string(freeMem4 >> 20) + " MiB free. "
                    "Waiting 3s before final retry for " + key.modelPath + "...");

                // Release mutex during sleep so other tasks can proceed
                // (they may complete pool creation that resolves our issue)
                lock.unlock();
                std::this_thread::sleep_for(std::chrono::seconds(3));
                lock.lock();

                // Check if another thread created this pool while we slept
                auto it2 = m_pools.find(key);
                if (it2 != m_pools.end()) {
                    it2->second.refcount++;
                    it2->second.evictTime = TimePoint{};
                    logEvent("[EnginePoolManager] HIT (after delay): " + key.modelPath
                        + " refs=" + std::to_string(it2->second.refcount));
                    return it2->second.engine;
                }

                // Final retry — try lightweight again after delay
                cudaSetDevice(options.deviceIndex);
                cudaMemGetInfo(&freeMem4, &totalMem4);
                logEvent("[EnginePoolManager] RETRY FINAL: " + std::to_string(freeMem4 >> 20)
                    + " MiB free. Last attempt for " + key.modelPath + "...");

                engine = std::make_shared<Engine<T>>(options);
                ok = engine->buildLoadNetwork(modelPath, subVals, divVals, normalize);
            }

            if (!ok) {
                // NOTE(review): no cudaSetDevice here before cudaMemGetInfo —
                // the numbers logged are for whatever device is current on
                // this thread, which may not be options.deviceIndex. Confirm
                // whether that is intentional.
                size_t freeMem = 0, totalMem = 0;
                cudaMemGetInfo(&freeMem, &totalMem);
                logEvent("[EnginePoolManager] FAILED: Could not load engine for "
                    + key.modelPath + " | GPU[" + std::to_string(options.deviceIndex)
                    + "] VRAM: " + std::to_string(freeMem >> 20) + " MiB free / "
                    + std::to_string(totalMem >> 20) + " MiB total"
                    + " (after 4 attempts: elastic, evict, lightweight, delayed)", true);
                return nullptr;
            }
        }

        // Success: register the new pool with an initial refcount of 1.
        PoolEntry entry;
        entry.engine = engine;
        entry.refcount = 1;
        m_pools.emplace(key, std::move(entry));

        // Start the lazy-eviction sweeper if not already running
        startSweeperIfNeeded();

        logEvent("[EnginePoolManager] CREATED: " + key.modelPath + " refs=1");
        return engine;
    }

    // ========================================================================
    // release() — decrement refcount for a shared pool.
    //
    // When refcount reaches 0, the pool is NOT immediately evicted.
    // Instead, it is marked for lazy eviction after kEvictGraceSec.
    // This handles the LabView edit cycle (destroy → recreate within
    // seconds) without rebuilding the engine from scratch.
    //
    // Unknown keys and already-zero refcounts are silently ignored.
    // ========================================================================
    void release(const PoolKey& key) {
        std::lock_guard<std::mutex> lock(m_mutex);
        auto it = m_pools.find(key);
        if (it == m_pools.end()) return;
        if (it->second.refcount <= 0) return;

        it->second.refcount--;
        logEvent("[EnginePoolManager] RELEASE: " + key.modelPath
            + " refs=" + std::to_string(it->second.refcount));

        if (it->second.refcount <= 0) {
            // Mark for lazy eviction — don't destroy yet
            it->second.evictTime = Clock::now() + std::chrono::seconds(kEvictGraceSec);
            logEvent("[EnginePoolManager] PENDING EVICT: " + key.modelPath
                + " (will evict in " + std::to_string(kEvictGraceSec) + "s if not re-acquired)");
        }
    }

    /// Clear all cached pools (call during DLL_PROCESS_DETACH).
    /// Destroys every Engine<T> regardless of refcount, then stops the sweeper.
    void clearAll() {
        {
            std::lock_guard<std::mutex> lock(m_mutex);
            logEvent("[EnginePoolManager] CLEAR ALL (" + std::to_string(m_pools.size()) + " pools)");
            m_pools.clear();
        }
        stopSweeper();
    }

    /// Number of cached pools (for diagnostics).
    size_t size() const {
        std::lock_guard<std::mutex> lock(m_mutex);
        return m_pools.size();
    }

private:
    EnginePoolManager() {
        // CRITICAL: Touch TRTEngineCache singleton to ensure it is constructed
        // BEFORE EnginePoolManager. C++ destroys function-local statics in
        // reverse construction order, so this guarantees TRTEngineCache outlives
        // EnginePoolManager. Without this, during ExitProcess the cache may be
        // destroyed first, and ~Engine calling TRTEngineCache::release() crashes
        // on a destroyed unordered_map (static destruction order fiasco).
        (void)TRTEngineCache::instance();
    }
    ~EnginePoolManager() {
        if (g_processExiting().load(std::memory_order_relaxed)) {
            // ExitProcess path: worker threads are dead, CUDA/TRT state is
            // unreliable. Don't destroy Engine objects (their destructors
            // call cudaFree, thread::join, etc. which deadlock or crash).
            // The OS reclaims all memory, VRAM, and handles at process exit.
            m_sweeperRunning.store(false);
            return;
        }
        // Normal FreeLibrary path: threads are alive, safe to clean up.
        // Explicitly clear pools before implicit member destruction.
        // This destroys Engine<T> objects (which call TRTEngineCache::release())
        // while we still hold m_mutex and can log diagnostics.
        try {
            std::lock_guard<std::mutex> lock(m_mutex);
            m_pools.clear();
        } catch (...) {}
        stopSweeper();
    }
    EnginePoolManager(const EnginePoolManager&) = delete;
    EnginePoolManager& operator=(const EnginePoolManager&) = delete;

    // Grace period before evicting a pool with refcount=0.
    // Covers LabView edit/duplicate/create cycles (destroy → recreate).
    static constexpr int kEvictGraceSec = 120; // 2 minutes

    // Sweeper interval — how often to check for expired pools.
    static constexpr int kSweeperIntervalSec = 30;

    using Clock = std::chrono::steady_clock;
    using TimePoint = std::chrono::time_point<Clock>;

    // Log to stdout/stderr only — no Windows Event Viewer.
    // Event Viewer logging is handled by logEngineEvent() in engine.h for
    // critical engine-level errors. EnginePoolManager messages are
    // informational (HIT/MISS/EVICT) and don't need Event Viewer entries.
    static void logEvent(const std::string& msg, bool isError = false) {
        if (isError)
            std::cerr << msg << std::endl;
        else
            std::cout << msg << std::endl;
    }

    // One cache entry: the shared pool, its current owner count, and the
    // deadline after which a zero-refcount pool may be destroyed.
    struct PoolEntry {
        std::shared_ptr<Engine<T>> engine;
        int refcount = 0;
        TimePoint evictTime {}; // when to evict (zero = not pending)
    };

    // ========================================================================
    // Sweeper thread — periodically checks for pools whose eviction
    // grace period has expired and removes them.
    //
    // NOTE(review): the sweeper thread is detached and captures `this`. It
    // only re-checks m_sweeperRunning AFTER each 30s sleep, so there is a
    // window at static teardown where the singleton is destroyed while the
    // thread is asleep and it then touches destroyed members (m_mutex,
    // m_pools). The ExitProcess path sidesteps this via g_processExiting(),
    // but the FreeLibrary path should be verified — consider a joinable
    // thread with an interruptible wait (like Engine's idle timer).
    // ========================================================================
    void startSweeperIfNeeded() {
        // Called under m_mutex
        if (m_sweeperRunning.load()) return;
        m_sweeperRunning.store(true);
        m_sweeperThread = std::thread([this]() {
            while (m_sweeperRunning.load()) {
                std::this_thread::sleep_for(std::chrono::seconds(kSweeperIntervalSec));
                if (!m_sweeperRunning.load()) break;
                sweepExpired();
            }
        });
        m_sweeperThread.detach();
    }

    // Request sweeper shutdown; the detached thread notices the flag after
    // its current sleep expires (no join — see NOTE above).
    void stopSweeper() {
        m_sweeperRunning.store(false);
    }

    // Force-evict ALL pools with refcount=0 (regardless of grace period).
    // Called when a new pool creation fails due to low VRAM.
    // Returns number of pools evicted.
    // MUST be called under m_mutex.
    int forceEvictPending() {
        int evicted = 0;
        for (auto it = m_pools.begin(); it != m_pools.end(); ) {
            if (it->second.refcount <= 0) {
                logEvent("[EnginePoolManager] FORCE EVICT (VRAM recovery): " + it->first.modelPath);
                it = m_pools.erase(it);
                evicted++;
            } else {
                ++it;
            }
        }
        return evicted;
    }

    // Periodic sweep: destroy pools whose refcount is 0 and whose eviction
    // deadline has passed. Called from the sweeper thread.
    void sweepExpired() {
        std::lock_guard<std::mutex> lock(m_mutex);
        auto now = Clock::now();
        for (auto it = m_pools.begin(); it != m_pools.end(); ) {
            auto& entry = it->second;
            // Only evict if refcount is 0 AND evictTime has passed
            if (entry.refcount <= 0
                && entry.evictTime != TimePoint{}
                && now >= entry.evictTime)
            {
                logEvent("[EnginePoolManager] EVICT (expired): " + it->first.modelPath);
                it = m_pools.erase(it);
            } else {
                ++it;
            }
        }
    }

    std::unordered_map<PoolKey, PoolEntry, PoolKeyHash> m_pools; // keyed cache, guarded by m_mutex
    mutable std::mutex m_mutex;                                  // protects m_pools (mutable for size() const)
    std::atomic<bool> m_sweeperRunning{false};                   // sweeper lifetime flag
    std::thread m_sweeperThread;                                 // detached immediately after start
};
|
||||
719
engines/TensorRTAPI/include/engine/EngineRunInference.inl
Normal file
719
engines/TensorRTAPI/include/engine/EngineRunInference.inl
Normal file
@@ -0,0 +1,719 @@
|
||||
#pragma once
|
||||
#include <cstring>
|
||||
#include <filesystem>
|
||||
#include "TRTCompat.h"
|
||||
|
||||
// Per-device mutex for CUDA graph capture.
// TRT's enqueueV3 uses shared internal resources (workspace, memory pools)
// at the CUDA context level. When two Engine instances on the same GPU
// capture graphs concurrently, these cross-stream dependencies violate
// graph capture rules ("operation not permitted when stream is capturing").
// This mutex serialises graph captures across all Engine<T> instances on
// the same device — subsequent cudaGraphLaunch calls are still concurrent.
//
// Fixes vs. the previous version:
//  * `inline` instead of `static`: with internal linkage every translation
//    unit including this header got its OWN mutex, so captures issued from
//    different TUs were never actually serialised. `inline` guarantees one
//    definition (and one mutex table) process-wide.
//  * One mutex per device index, matching the "same device" contract above.
//    The old single mutex serialised captures across ALL GPUs. Existing
//    callers that pass no argument keep the previous behaviour (they all
//    share slot 0); callers may pass the device index to serialise only
//    captures on that GPU.
inline std::mutex& graphCaptureMutex(int deviceIndex = 0) {
    static constexpr int kMaxDevices = 64; // generous upper bound on CUDA device count
    static std::mutex s_mutexes[kMaxDevices];
    // Clamp out-of-range indices to slot 0 instead of indexing out of bounds.
    if (deviceIndex < 0 || deviceIndex >= kMaxDevices) deviceIndex = 0;
    return s_mutexes[deviceIndex];
}
|
||||
|
||||
// ----------------------------------------------------------------------------
// warmUp -- pre-compile kernels and pre-capture CUDA graphs
//
// Runs dummy inferences through runInference() so the expensive first-call
// work (TRT kernel compilation, CUDA graph capture) happens up front instead
// of on the first real frame.
//
//   iterations : iteration count used for the optimal batch size
//                (m_options.optBatchSize). Every other batch size runs only
//                2 iterations -- one to compile kernels, one to capture the
//                CUDA graph.
//
// Dynamic-batch engines (maxBatchSize > 1) are warmed for every batch size
// in [1, maxBatchSize]; fixed-batch engines only for their single size.
// A failed iteration is logged (verbose mode) and the next batch size is
// attempted -- warmUp never throws or aborts early.
// ----------------------------------------------------------------------------
template <typename T>
void Engine<T>::warmUp(int iterations) {
    if (m_verbose) {
        std::cout << "\n========================================" << std::endl;
        std::cout << "Engine Warmup" << std::endl;
        std::cout << "========================================" << std::endl;
    }

    // Determine batch sizes to warm up
    std::vector<int> batchSizes;

    if (m_options.maxBatchSize > 1) {
        if (m_verbose) {
            std::cout << "Dynamic batch engine detected (max batch: " << m_options.maxBatchSize << ")" << std::endl;
            std::cout << "Warming up common batch sizes to pre-compile kernels..." << std::endl;
        }

        // Warm up ALL batch sizes from 1 to maxBatchSize.
        // Each unseen batch size incurs a 100-300ms kernel compilation penalty
        // on first use. Warming all sizes eliminates that latency at inference
        // time and ensures every CUDA graph is pre-captured.
        for (int batch = 1; batch <= m_options.maxBatchSize; ++batch) {
            batchSizes.push_back(batch);
        }
    }
    else {
        if (m_verbose) std::cout << "Fixed batch engine detected (batch size: " << m_options.maxBatchSize << ")" << std::endl;
        batchSizes.push_back(m_options.maxBatchSize);
    }

    if (m_verbose) {
        std::cout << "Batch sizes to warm up: ";
        for (size_t i = 0; i < batchSizes.size(); ++i) {
            std::cout << batchSizes[i];
            if (i < batchSizes.size() - 1) std::cout << ", ";
        }
        std::cout << std::endl;
    }

    // Warm up each batch size.
    // The first call triggers kernel compilation; the second captures the CUDA
    // graph. Additional iterations only measure steady-state latency for the
    // optBatchSize (printed as a diagnostic).
    for (int batchSize : batchSizes) {
        const int iters = (batchSize == m_options.optBatchSize) ? iterations : 2;
        if (m_verbose) std::cout << "\nWarming up batch=" << batchSize << " (x" << iters << " iterations)..." << std::endl;

        // Create dummy inputs for this batch size
        std::vector<std::vector<cv::cuda::GpuMat>> dummyInputs;

        for (size_t i = 0; i < m_inputDims.size(); ++i) {
            const auto& dims = m_inputDims[i];
            std::vector<cv::cuda::GpuMat> batch;

            // FIXED: Create proper dummy images on GPU
            // For dynamic spatial dims, use opt dimensions for warmup
            // NOTE(review): dims.d[0] is used as the channel count and
            // d[1]/d[2] as H/W below, i.e. this assumes 3-dim CHW input dims
            // (no batch dim) -- confirm against the m_inputDims producer.
            int warmH = (dims.d[1] > 0) ? dims.d[1] : m_options.optInputHeight;
            int warmW = (dims.d[2] > 0) ? dims.d[2] : m_options.optInputWidth;
            for (int b = 0; b < batchSize; ++b) {
                // Create on CPU first
                cv::Mat cpuImg(warmH, warmW, CV_32FC(dims.d[0]), cv::Scalar(0.5f, 0.5f, 0.5f));

                // Upload to GPU
                cv::cuda::GpuMat gpuImg;
                gpuImg.upload(cpuImg);

                batch.push_back(gpuImg);
            }

            dummyInputs.push_back(batch);
        }

        std::vector<std::vector<std::vector<T>>> dummyOutputs;

        // Time the first iteration (kernel compilation happens here)
        auto start = std::chrono::high_resolution_clock::now();

        bool firstSuccess = runInference(dummyInputs, dummyOutputs);

        auto end = std::chrono::high_resolution_clock::now();
        auto firstTime = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();

        if (!firstSuccess) {
            if (m_verbose) std::cout << " ✗ ERROR: First iteration failed for batch=" << batchSize << std::endl;
            continue;
        }

        if (m_verbose) {
            std::cout << " First iteration: " << firstTime << " ms";
            if (firstTime > 100) {
                std::cout << " (kernel compilation detected)";
            }
            std::cout << std::endl;
        }

        // Run remaining iterations to measure stable performance
        if (iters > 1) {
            auto iterStart = std::chrono::high_resolution_clock::now();

            for (int i = 1; i < iters; ++i) {
                bool success = runInference(dummyInputs, dummyOutputs);
                if (!success) {
                    if (m_verbose) std::cout << " ✗ ERROR: Iteration " << i << " failed" << std::endl;
                    break;
                }
            }

            auto iterEnd = std::chrono::high_resolution_clock::now();
            auto totalTime = std::chrono::duration_cast<std::chrono::milliseconds>(iterEnd - iterStart).count();
            // The timed loop runs iterations 1..iters-1 (iters-1 passes), so
            // dividing by (iters - 1) excludes the first (compile) pass.
            float avgTime = totalTime / static_cast<float>(iters - 1);

            if (m_verbose) {
                std::cout << " Subsequent iterations (avg): " << std::fixed << std::setprecision(1)
                          << avgTime << " ms" << std::endl;

                if (firstTime > 100 && avgTime < firstTime * 0.5f) {
                    float speedup = firstTime / avgTime;
                    std::cout << " ✓ Speedup after warmup: " << std::fixed << std::setprecision(1)
                              << speedup << "x faster" << std::endl;
                }
            }
        }

        if (m_verbose) std::cout << " ✓ Batch=" << batchSize << " warmed up successfully" << std::endl;
    }

    if (m_verbose) {
        std::cout << "\n========================================" << std::endl;
        std::cout << "Warmup Complete!" << std::endl;
        std::cout << "========================================" << std::endl;
        std::cout << "Kernels pre-compiled for all batch sizes." << std::endl;
        std::cout << "========================================\n" << std::endl;
    }
}
|
||||
template <typename T>
|
||||
bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& inputs,std::vector<std::vector<std::vector<T>>>& featureVectors) {
|
||||
|
||||
// ============================================================================
|
||||
// MULTI-GPU POOL DISPATCH
|
||||
// ============================================================================
|
||||
// If this Engine was initialised with initializePool() / initializePoolFromEngine()
|
||||
// the m_slots vector is non-empty. In that case, delegate to the pool
|
||||
// dispatcher which acquires the first idle slot and runs inference there.
|
||||
// This branch is NEVER taken for single-GPU use (buildLoadNetwork / loadNetwork).
|
||||
if (!m_slots.empty()) {
|
||||
return runInferenceFromPool(inputs, featureVectors);
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// SINGLE-ENGINE SERIALISATION
|
||||
// ============================================================================
|
||||
// The single Engine instance has shared mutable state (m_buffers, m_lastBatchSize,
|
||||
// m_inferenceStream, TRT execution context). If two LabVIEW threads call
|
||||
// runInference concurrently with different batch sizes, one will overwrite
|
||||
// the input shapes and buffers while the other is mid-inference, causing a
|
||||
// fatal "illegal memory access" that permanently corrupts the CUDA context.
|
||||
//
|
||||
// Pool-mode slots have their own busy-flag dispatch so they do NOT need this.
|
||||
std::lock_guard<std::mutex> inferenceLock(m_inferenceMutex);
|
||||
|
||||
// ============================================================================
|
||||
// THREAD-SAFE GPU CONTEXT
|
||||
// ============================================================================
|
||||
// Ensure the calling thread's CUDA device matches this engine's GPU.
|
||||
// This is essential for multi-GPU round-robin: LabVIEW reuses threads
|
||||
// across tasks, so a thread that last ran inference on GPU 1 might now
|
||||
// be running a task on GPU 0. Without this, cv::cuda::GpuMat allocations
|
||||
// and kernel launches would target the wrong GPU, causing result corruption.
|
||||
// Skip cudaSetDevice if already on the correct device — under WDDM
|
||||
// with multiple GPUs each call costs 1-5ms of scheduler overhead.
|
||||
{
|
||||
int currentDev = -1;
|
||||
cudaGetDevice(¤tDev);
|
||||
if (currentDev != m_options.deviceIndex) {
|
||||
cudaSetDevice(m_options.deviceIndex);
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// DEBUG: First call diagnostics (per-instance, not process-wide)
|
||||
// ============================================================================
|
||||
|
||||
if (m_verbose && m_firstInferenceCall) {
|
||||
std::cout << "\n=== First runInference Call ===" << std::endl;
|
||||
std::cout << "Number of input tensors: " << inputs.size() << std::endl;
|
||||
for (size_t i = 0; i < inputs.size(); ++i) {
|
||||
std::cout << "Input " << i << " batch size: " << inputs[i].size() << std::endl;
|
||||
if (!inputs[i].empty()) {
|
||||
const auto& img = inputs[i][0];
|
||||
std::cout << " Image shape: " << img.cols << "x" << img.rows
|
||||
<< "x" << img.channels() << " (type: " << img.type() << ")" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
// Print optimization profile information
|
||||
std::cout << "\n=== Engine Profile Information ===" << std::endl;
|
||||
std::cout << "Number of optimization profiles: "
|
||||
<< m_engine->getNbOptimizationProfiles() << std::endl;
|
||||
|
||||
if (m_engine->getNbOptimizationProfiles() > 0) {
|
||||
for (int profile = 0; profile < m_engine->getNbOptimizationProfiles(); ++profile) {
|
||||
std::cout << "\n--- Profile " << profile << " ---" << std::endl;
|
||||
|
||||
for (size_t i = 0; i < m_IOTensorNames.size(); ++i) {
|
||||
const char* tensorName = m_IOTensorNames[i].c_str();
|
||||
|
||||
// Check if this is an input tensor
|
||||
auto ioMode = m_engine->getTensorIOMode(tensorName);
|
||||
if (ioMode != nvinfer1::TensorIOMode::kINPUT) {
|
||||
continue;
|
||||
}
|
||||
|
||||
auto minDims = m_engine->getProfileShape(tensorName, profile,
|
||||
nvinfer1::OptProfileSelector::kMIN);
|
||||
auto optDims = m_engine->getProfileShape(tensorName, profile,
|
||||
nvinfer1::OptProfileSelector::kOPT);
|
||||
auto maxDims = m_engine->getProfileShape(tensorName, profile,
|
||||
nvinfer1::OptProfileSelector::kMAX);
|
||||
|
||||
std::cout << "Tensor '" << tensorName << "' (INPUT):" << std::endl;
|
||||
std::cout << " Min: [" << minDims.d[0];
|
||||
for (int j = 1; j < minDims.nbDims; ++j) std::cout << "," << minDims.d[j];
|
||||
std::cout << "]" << std::endl;
|
||||
|
||||
std::cout << " Opt: [" << optDims.d[0];
|
||||
for (int j = 1; j < optDims.nbDims; ++j) std::cout << "," << optDims.d[j];
|
||||
std::cout << "]" << std::endl;
|
||||
|
||||
std::cout << " Max: [" << maxDims.d[0];
|
||||
for (int j = 1; j < maxDims.nbDims; ++j) std::cout << "," << maxDims.d[j];
|
||||
std::cout << "]" << std::endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!m_context->allInputDimensionsSpecified()) {
|
||||
std::cout << "ERROR: Input dimensions not specified in context!" << std::endl;
|
||||
return false;
|
||||
}
|
||||
|
||||
std::cout << "\nContext state: All dimensions specified ✓" << std::endl;
|
||||
m_firstInferenceCall = false;
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// INPUT VALIDATION
|
||||
// ============================================================================
|
||||
|
||||
if (inputs.empty() || inputs[0].empty()) {
|
||||
std::cout << "Error: Empty input" << std::endl;
|
||||
return false;
|
||||
}
|
||||
|
||||
const auto numInputs = m_inputDims.size();
|
||||
if (inputs.size() != numInputs) {
|
||||
std::cout << "Error: Wrong number of inputs. Expected: " << numInputs
|
||||
<< ", Got: " << inputs.size() << std::endl;
|
||||
return false;
|
||||
}
|
||||
|
||||
const auto batchSize = static_cast<int32_t>(inputs[0].size());
|
||||
|
||||
if (batchSize > m_options.maxBatchSize) {
|
||||
std::cout << "Error: Batch size " << batchSize << " exceeds maximum "
|
||||
<< m_options.maxBatchSize << std::endl;
|
||||
return false;
|
||||
}
|
||||
|
||||
if (batchSize < 1) {
|
||||
std::cout << "Error: Batch size must be at least 1" << std::endl;
|
||||
return false;
|
||||
}
|
||||
|
||||
// Validate batch size consistency across all inputs
|
||||
for (size_t i = 1; i < inputs.size(); ++i) {
|
||||
if (inputs[i].size() != static_cast<size_t>(batchSize)) {
|
||||
std::cout << "Error: Inconsistent batch sizes across inputs. Input 0: "
|
||||
<< batchSize << ", Input " << i << ": " << inputs[i].size() << std::endl;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// STREAM GUARD
|
||||
// ============================================================================
|
||||
// m_inferenceStream is now created eagerly in loadNetwork() so it is always
|
||||
// valid here. Guard against the (unlikely) edge case where runInference is
|
||||
// called before loadNetwork succeeds.
|
||||
if (!m_streamInitialized || !m_inferenceStream) {
|
||||
std::string errMsg = "Error: Inference stream not initialised. "
|
||||
"Call loadNetwork() / buildLoadNetwork() before runInference().";
|
||||
std::cout << errMsg << std::endl;
|
||||
logEngineEvent("[Engine] runInference: " + errMsg, true);
|
||||
return false;
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// SET INPUT SHAPES (batch size changed OR dynamic spatial dims need updating)
|
||||
// ============================================================================
|
||||
// Fast path: compute desired dims first, then compare against cached dims.
|
||||
// This avoids all TRT API calls when the shape hasn't actually changed —
|
||||
// critical for the recognizer which is called ~50-100x per image with
|
||||
// dynamic width but often the same or similar widths.
|
||||
// ============================================================================
|
||||
|
||||
{
|
||||
// Lazily initialise the dims cache on first call
|
||||
if (m_lastSetInputDims.empty()) {
|
||||
m_lastSetInputDims.resize(numInputs);
|
||||
for (size_t i = 0; i < numInputs; ++i) {
|
||||
m_lastSetInputDims[i].nbDims = 0; // force mismatch on first call
|
||||
}
|
||||
}
|
||||
|
||||
// Build desired dims for every input tensor (cheap — no TRT API calls)
|
||||
bool anyDimChanged = (m_lastBatchSize != batchSize);
|
||||
std::vector<nvinfer1::Dims> desiredDims(numInputs);
|
||||
for (size_t i = 0; i < numInputs; ++i) {
|
||||
nvinfer1::Dims& nd = desiredDims[i];
|
||||
nd.nbDims = 4;
|
||||
nd.d[0] = batchSize;
|
||||
nd.d[1] = m_inputDims[i].d[0]; // channels
|
||||
if (m_hasDynamicSpatialDims && !inputs[i].empty()) {
|
||||
const auto& firstImg = inputs[i][0];
|
||||
nd.d[2] = (m_inputDims[i].d[1] == -1) ? firstImg.rows : m_inputDims[i].d[1];
|
||||
nd.d[3] = (m_inputDims[i].d[2] == -1) ? firstImg.cols : m_inputDims[i].d[2];
|
||||
} else {
|
||||
nd.d[2] = m_inputDims[i].d[1];
|
||||
nd.d[3] = m_inputDims[i].d[2];
|
||||
}
|
||||
// Compare with cached
|
||||
if (!anyDimChanged) {
|
||||
const auto& cached = m_lastSetInputDims[i];
|
||||
if (cached.nbDims != nd.nbDims ||
|
||||
cached.d[0] != nd.d[0] || cached.d[1] != nd.d[1] ||
|
||||
cached.d[2] != nd.d[2] || cached.d[3] != nd.d[3]) {
|
||||
anyDimChanged = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (anyDimChanged) {
|
||||
// === First-time diagnostics (verbose, once) ===
|
||||
const bool firstTime = !m_batchShapeChangeLogged;
|
||||
|
||||
if (m_verbose && firstTime) {
|
||||
std::cout << "\nInfo: Batch size change: " << m_lastBatchSize
|
||||
<< " -> " << batchSize << std::endl;
|
||||
}
|
||||
|
||||
// Set optimization profile (only when truly needed)
|
||||
if (m_engine->getNbOptimizationProfiles() > 0) {
|
||||
int currentProfile = m_context->getOptimizationProfile();
|
||||
if (currentProfile != 0 || m_lastBatchSize < 0) {
|
||||
if (m_verbose && firstTime) {
|
||||
std::cout << " Setting optimization profile to 0..." << std::endl;
|
||||
}
|
||||
if (!m_context->setOptimizationProfileAsync(0, m_inferenceStream)) {
|
||||
std::cout << "Error: Failed to set optimization profile 0" << std::endl;
|
||||
return false;
|
||||
}
|
||||
cudaError_t syncErr = cudaStreamSynchronize(m_inferenceStream);
|
||||
if (syncErr != cudaSuccess) {
|
||||
std::cout << "Error: Failed to sync after profile change: "
|
||||
<< cudaGetErrorString(syncErr) << std::endl;
|
||||
return false;
|
||||
}
|
||||
if (m_verbose && firstTime) {
|
||||
std::cout << " Optimization profile set successfully" << std::endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Update shapes for input tensors that actually changed
|
||||
for (size_t i = 0; i < numInputs; ++i) {
|
||||
const char* tensorName = m_IOTensorNames[i].c_str();
|
||||
|
||||
// Skip non-input tensors
|
||||
auto ioMode = m_engine->getTensorIOMode(tensorName);
|
||||
if (ioMode != nvinfer1::TensorIOMode::kINPUT) continue;
|
||||
|
||||
const nvinfer1::Dims& newDims = desiredDims[i];
|
||||
const nvinfer1::Dims& cached = m_lastSetInputDims[i];
|
||||
|
||||
// Skip this tensor if its dims haven't changed
|
||||
if (cached.nbDims == newDims.nbDims &&
|
||||
cached.d[0] == newDims.d[0] && cached.d[1] == newDims.d[1] &&
|
||||
cached.d[2] == newDims.d[2] && cached.d[3] == newDims.d[3]) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// First-time verbose diagnostics
|
||||
if (m_verbose && firstTime) {
|
||||
std::cout << "\n Processing tensor " << i << ": '" << tensorName << "'" << std::endl;
|
||||
// Validate batch size range (first time only)
|
||||
if (m_engine->getNbOptimizationProfiles() > 0) {
|
||||
int profileIndex = m_context->getOptimizationProfile();
|
||||
nvinfer1::Dims minDims = m_engine->getProfileShape(
|
||||
tensorName, profileIndex, nvinfer1::OptProfileSelector::kMIN);
|
||||
nvinfer1::Dims maxDims = m_engine->getProfileShape(
|
||||
tensorName, profileIndex, nvinfer1::OptProfileSelector::kMAX);
|
||||
std::cout << " Profile batch range: [" << minDims.d[0]
|
||||
<< " to " << maxDims.d[0] << "]" << std::endl;
|
||||
if (batchSize < minDims.d[0] || batchSize > maxDims.d[0]) {
|
||||
std::cout << "Error: Batch size " << batchSize
|
||||
<< " outside profile range" << std::endl;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
auto currentShape = m_context->getTensorShape(tensorName);
|
||||
std::cout << " Current context shape: [";
|
||||
for (int j = 0; j < currentShape.nbDims; ++j) {
|
||||
if (j > 0) std::cout << ", ";
|
||||
std::cout << currentShape.d[j];
|
||||
}
|
||||
std::cout << "]" << std::endl;
|
||||
std::cout << " Setting new shape: [" << newDims.d[0] << ", "
|
||||
<< newDims.d[1] << ", " << newDims.d[2] << ", "
|
||||
<< newDims.d[3] << "]" << std::endl;
|
||||
}
|
||||
|
||||
if (!m_context->setInputShape(tensorName, newDims)) {
|
||||
std::cout << "Error: Failed to set input shape for '" << tensorName << "'" << std::endl;
|
||||
return false;
|
||||
}
|
||||
|
||||
// Verify shape (first time only — trust the API on hot path)
|
||||
if (firstTime) {
|
||||
auto verifyShape = m_context->getTensorShape(tensorName);
|
||||
if (verifyShape.d[0] != batchSize) {
|
||||
std::cout << "Error: Shape change didn't take effect. Expected batch "
|
||||
<< batchSize << ", got " << verifyShape.d[0] << std::endl;
|
||||
return false;
|
||||
}
|
||||
if (m_verbose) {
|
||||
std::cout << " Shape updated successfully" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
m_lastSetInputDims[i] = newDims;
|
||||
}
|
||||
|
||||
// Verify all input dimensions specified (first time only)
|
||||
if (firstTime) {
|
||||
if (!m_context->allInputDimensionsSpecified()) {
|
||||
std::cout << "Error: Not all input dimensions specified after shape change" << std::endl;
|
||||
for (size_t i = 0; i < m_IOTensorNames.size(); ++i) {
|
||||
auto shape = m_context->getTensorShape(m_IOTensorNames[i].c_str());
|
||||
std::cout << " " << m_IOTensorNames[i] << ": [";
|
||||
for (int j = 0; j < shape.nbDims; ++j) {
|
||||
if (j > 0) std::cout << ", ";
|
||||
std::cout << shape.d[j];
|
||||
}
|
||||
std::cout << "]" << std::endl;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
m_lastBatchSize = batchSize;
|
||||
m_batchShapeChangeLogged = true;
|
||||
if (m_verbose && firstTime) {
|
||||
std::cout << "\nInfo: Input shapes updated successfully for batch size "
|
||||
<< batchSize << " ✓\n" << std::endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// PREPROCESS AND COPY INPUTS TO GPU BUFFERS
|
||||
// ============================================================================
|
||||
|
||||
// Pass 1: Validate all input dimensions before any GPU work.
|
||||
// Dynamic dims (-1) are skipped in validation (they accept any size).
|
||||
for (size_t i = 0; i < numInputs; ++i) {
|
||||
const auto& batchInput = inputs[i];
|
||||
const auto& dims = m_inputDims[i];
|
||||
if (!batchInput.empty()) {
|
||||
const auto& firstImg = batchInput[0];
|
||||
bool mismatch = false;
|
||||
if (dims.d[0] > 0 && firstImg.channels() != dims.d[0]) mismatch = true;
|
||||
if (dims.d[1] > 0 && firstImg.rows != dims.d[1]) mismatch = true;
|
||||
if (dims.d[2] > 0 && firstImg.cols != dims.d[2]) mismatch = true;
|
||||
if (mismatch) {
|
||||
std::cout << "Error: Input " << i << " dimension mismatch!" << std::endl;
|
||||
std::cout << " Expected: " << dims.d[2] << "x" << dims.d[1]
|
||||
<< "x" << dims.d[0] << " (WxHxC, -1=dynamic)" << std::endl;
|
||||
std::cout << " Got: " << firstImg.cols << "x" << firstImg.rows
|
||||
<< "x" << firstImg.channels() << " (WxHxC)" << std::endl;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Pass 2: Preprocess + D2D copies — all on m_inferenceStream (no null stream).
|
||||
//
|
||||
// All OpenCV CUDA ops (convertTo, subtract, divide, split) in blobFromGpuMats
|
||||
// now run on m_inferenceStream via the cv::cuda::Stream wrapper. This means:
|
||||
// • No null-stream interaction — eliminates global sync barriers on WDDM
|
||||
// • No event bridge needed — same-stream ordering guarantees correctness
|
||||
// • CUDA graphs are safe — cv::cuda::split runs BEFORE graph capture
|
||||
//
|
||||
// GpuMat-lifetime: preprocessedBuffers keeps GpuMats alive past the final
|
||||
// cudaStreamSynchronize, so cudaFree() doesn't stall the pipeline.
|
||||
cv::cuda::Stream cvInferStream = cv::cuda::StreamAccessor::wrapStream(m_inferenceStream);
|
||||
std::vector<cv::cuda::GpuMat> preprocessedBuffers;
|
||||
preprocessedBuffers.reserve(numInputs);
|
||||
|
||||
for (size_t i = 0; i < numInputs; ++i) {
|
||||
const auto& batchInput = inputs[i];
|
||||
|
||||
// Preprocess on m_inferenceStream (not the null stream).
|
||||
preprocessedBuffers.push_back(
|
||||
blobFromGpuMats(batchInput, m_subVals, m_divVals, m_normalize, false, cvInferStream));
|
||||
|
||||
// D2D copy: same stream as preprocessing, so ordering is guaranteed.
|
||||
const auto& blobMat = preprocessedBuffers.back();
|
||||
const size_t copySize = static_cast<size_t>(blobMat.rows) * static_cast<size_t>(blobMat.cols) * blobMat.elemSize();
|
||||
cudaError_t copyErr = cudaMemcpyAsync(
|
||||
m_buffers[i],
|
||||
preprocessedBuffers.back().ptr<void>(),
|
||||
copySize,
|
||||
cudaMemcpyDeviceToDevice,
|
||||
m_inferenceStream);
|
||||
|
||||
if (copyErr != cudaSuccess) {
|
||||
std::cout << "Error: Failed to copy input " << i
|
||||
<< " to inference buffer: " << cudaGetErrorString(copyErr) << std::endl;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// PRE-ALLOCATE OUTPUT STRUCTURE
|
||||
// ============================================================================
|
||||
|
||||
const size_t numOutputs = m_outputLengths.size();
|
||||
|
||||
featureVectors.clear();
|
||||
featureVectors.resize(batchSize);
|
||||
for (int batch = 0; batch < batchSize; ++batch) {
|
||||
featureVectors[batch].resize(numOutputs);
|
||||
for (size_t outputIdx = 0; outputIdx < numOutputs; ++outputIdx)
|
||||
featureVectors[batch][outputIdx].resize(m_outputLengths[outputIdx]);
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// RUN INFERENCE + COPY OUTPUTS (CUDA Graph path or direct path)
|
||||
// ============================================================================
|
||||
|
||||
// CUDA Graph path
|
||||
// ---------------
|
||||
// On the first call for a given batchSize we capture enqueueV3 + D2H copies
|
||||
// into a reusable graph. Subsequent calls use cudaGraphLaunch, replacing
|
||||
// many individual kernel-submission API calls with a single launch.
|
||||
//
|
||||
// Prerequisites satisfied here:
|
||||
// • Preprocessing + D2D copies are queued on m_inferenceStream (same-stream
|
||||
// ordering guarantees they complete before captured kernels execute)
|
||||
// • m_pinnedOutputBuffers has stable addresses (allocated in loadNetwork)
|
||||
// • m_buffers (GPU outputs) have stable addresses (allocated in loadNetwork)
|
||||
//
|
||||
// Falls back to the direct path if pinned buffers are unavailable or if
|
||||
// graph capture/instantiation fails for any reason.
|
||||
|
||||
// CUDA graphs capture fixed kernel sequences; incompatible with dynamic spatial dims
|
||||
// (input H/W change per inference call → different TRT kernel plans each time).
|
||||
// Disabled for pool slots — concurrent graph captures on the same GPU corrupt the
|
||||
// CUDA context ("operation not permitted when stream is capturing").
|
||||
const bool canGraph = !m_disableGraphs && !m_pinnedOutputBuffers.empty() && !m_hasDynamicSpatialDims;
|
||||
bool graphUsed = false;
|
||||
|
||||
if (canGraph) {
|
||||
auto& graphExec = m_graphExecs[batchSize]; // inserts nullptr on first access
|
||||
if (!graphExec) {
|
||||
// First call for this batchSize -- capture a new graph.
|
||||
// Serialise captures across all Engine instances on this device to
|
||||
// prevent TRT's shared workspace from creating cross-stream
|
||||
// dependencies that violate CUDA graph capture rules.
|
||||
std::lock_guard<std::mutex> captureLock(graphCaptureMutex());
|
||||
|
||||
// Clear any sticky CUDA error from a prior failed capture so that
|
||||
// this attempt starts clean.
|
||||
cudaGetLastError();
|
||||
|
||||
cudaGraph_t graph = nullptr;
|
||||
bool captureOk = false;
|
||||
|
||||
if (cudaStreamBeginCapture(m_inferenceStream,
|
||||
cudaStreamCaptureModeRelaxed) == cudaSuccess) {
|
||||
// Record TRT kernels into the graph (not executed yet).
|
||||
TRT_ENQUEUE(m_context.get(), m_inferenceStream, m_buffers);
|
||||
|
||||
// Record D2H copies to stable pinned addresses.
|
||||
for (size_t outputIdx = 0; outputIdx < numOutputs; ++outputIdx) {
|
||||
cudaMemcpyAsync(
|
||||
m_pinnedOutputBuffers[outputIdx],
|
||||
static_cast<char*>(m_buffers[numInputs + outputIdx]),
|
||||
static_cast<size_t>(batchSize) * m_outputLengths[outputIdx] * sizeof(T),
|
||||
cudaMemcpyDeviceToHost,
|
||||
m_inferenceStream);
|
||||
}
|
||||
|
||||
captureOk = (cudaStreamEndCapture(m_inferenceStream, &graph) == cudaSuccess
|
||||
&& graph != nullptr);
|
||||
}
|
||||
|
||||
if (captureOk) {
|
||||
cudaGraphExec_t exec = nullptr;
|
||||
if (cudaGraphInstantiate(&exec, graph, nullptr, nullptr, 0) == cudaSuccess)
|
||||
graphExec = exec;
|
||||
cudaGraphDestroy(graph);
|
||||
}
|
||||
|
||||
if (!graphExec) {
|
||||
std::cout << "Warning: CUDA graph capture failed for batchSize="
|
||||
<< batchSize << " -- falling back to direct inference path." << std::endl;
|
||||
// Disable graph acceleration for this Engine instance.
|
||||
for (T* p : m_pinnedOutputBuffers) { if (p) cudaFreeHost(p); }
|
||||
m_pinnedOutputBuffers.clear();
|
||||
m_graphExecs.erase(batchSize);
|
||||
}
|
||||
}
|
||||
|
||||
if (graphExec) {
|
||||
// Launch the pre-captured graph (single API call replaces many).
|
||||
cudaGraphLaunch(graphExec, m_inferenceStream);
|
||||
cudaStreamSynchronize(m_inferenceStream);
|
||||
|
||||
// CPU memcpy: pinned buffers -> featureVectors (interleaved by batch).
|
||||
for (int batch = 0; batch < batchSize; ++batch) {
|
||||
for (size_t outputIdx = 0; outputIdx < numOutputs; ++outputIdx) {
|
||||
std::memcpy(
|
||||
featureVectors[batch][outputIdx].data(),
|
||||
m_pinnedOutputBuffers[outputIdx]
|
||||
+ static_cast<size_t>(batch) * m_outputLengths[outputIdx],
|
||||
m_outputLengths[outputIdx] * sizeof(T));
|
||||
}
|
||||
}
|
||||
graphUsed = true;
|
||||
}
|
||||
}
|
||||
|
||||
// Direct path (no graph)
|
||||
// ----------------------
|
||||
// Used when pinned buffers are unavailable or graph capture failed.
|
||||
if (!graphUsed) {
|
||||
bool success = TRT_ENQUEUE(m_context.get(), m_inferenceStream, m_buffers);
|
||||
if (!success) {
|
||||
std::string debugInfo = "[Engine] runInference FAIL: enqueue returned false, batch="
|
||||
+ std::to_string(batchSize)
|
||||
+ ", dimsSpecified=" + (m_context->allInputDimensionsSpecified() ? "YES" : "NO");
|
||||
for (size_t i = 0; i < m_IOTensorNames.size(); ++i) {
|
||||
auto shape = m_context->getTensorShape(m_IOTensorNames[i].c_str());
|
||||
debugInfo += ", tensor'" + m_IOTensorNames[i] + "'=[";
|
||||
for (int j = 0; j < shape.nbDims; ++j) {
|
||||
if (j > 0) debugInfo += ",";
|
||||
debugInfo += std::to_string(shape.d[j]);
|
||||
}
|
||||
debugInfo += "]";
|
||||
}
|
||||
std::cout << debugInfo << std::endl;
|
||||
logEngineEvent(debugInfo, true);
|
||||
return false;
|
||||
}
|
||||
|
||||
for (int batch = 0; batch < batchSize; ++batch) {
|
||||
for (size_t outputIdx = 0; outputIdx < numOutputs; ++outputIdx) {
|
||||
const size_t outputBinding = numInputs + outputIdx;
|
||||
const size_t offset =
|
||||
static_cast<size_t>(batch) * m_outputLengths[outputIdx] * sizeof(T);
|
||||
|
||||
cudaError_t copyErr = cudaMemcpyAsync(
|
||||
featureVectors[batch][outputIdx].data(),
|
||||
static_cast<char*>(m_buffers[outputBinding]) + offset,
|
||||
m_outputLengths[outputIdx] * sizeof(T),
|
||||
cudaMemcpyDeviceToHost,
|
||||
m_inferenceStream);
|
||||
|
||||
if (copyErr != cudaSuccess) {
|
||||
std::string errMsg = "[Engine] runInference FAIL: cudaMemcpyAsync output "
|
||||
+ std::to_string(outputIdx) + " batch " + std::to_string(batch)
|
||||
+ ": " + cudaGetErrorString(copyErr);
|
||||
std::cout << errMsg << std::endl;
|
||||
logEngineEvent(errMsg, true);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
cudaError_t syncErr = cudaStreamSynchronize(m_inferenceStream);
|
||||
if (syncErr != cudaSuccess) {
|
||||
std::string errMsg = "[Engine] runInference FAIL: cudaStreamSynchronize: "
|
||||
+ std::string(cudaGetErrorString(syncErr));
|
||||
std::cout << errMsg << std::endl;
|
||||
logEngineEvent(errMsg, true);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
250
engines/TensorRTAPI/include/engine/EngineUtilities.inl
Normal file
250
engines/TensorRTAPI/include/engine/EngineUtilities.inl
Normal file
@@ -0,0 +1,250 @@
|
||||
#pragma once
|
||||
#include <filesystem>
|
||||
#include <NvInfer.h> // NV_TENSORRT_MAJOR/MINOR/PATCH
|
||||
#include <NvInferVersion.h> // also defines TRT version macros
|
||||
#include <cudnn_version.h> // CUDNN_MAJOR/MINOR/PATCHLEVEL
|
||||
#include <cuda_runtime.h> // cudaRuntimeGetVersion
|
||||
|
||||
template <typename T>
|
||||
void Engine<T>::transformOutput(std::vector<std::vector<std::vector<T>>> &input, std::vector<std::vector<T>> &output) {
|
||||
if (input.size() == 1) {
|
||||
output = std::move(input[0]);
|
||||
}
|
||||
else {
|
||||
auto msg = "The feature vector has incorrect dimensions!";
|
||||
std::cout<<msg;
|
||||
}
|
||||
}
|
||||
template <typename T>
|
||||
void Engine<T>::transformOutput(std::vector<std::vector<std::vector<T>>> &input, std::vector<T> &output) {
|
||||
if (input.size() != 1 || input[0].size() != 1) {
|
||||
auto msg = "The feature vector has incorrect dimensions!";
|
||||
std::cout<<msg;
|
||||
}
|
||||
output = std::move(input[0][0]);
|
||||
}
|
||||
template <typename T>
|
||||
cv::cuda::GpuMat Engine<T>::resizeKeepAspectRatioPadRightBottom(const cv::cuda::GpuMat& input,
|
||||
size_t height, size_t width,
|
||||
const cv::Scalar& bgcolor) {
|
||||
// Ensure input is valid
|
||||
if (input.empty()) {
|
||||
return cv::cuda::GpuMat();
|
||||
}
|
||||
// Create a CUDA stream
|
||||
cv::cuda::Stream stream;
|
||||
// Calculate aspect ratio and unpadded dimensions
|
||||
float r = std::min(static_cast<float>(width) / input.cols, static_cast<float>(height) / input.rows);
|
||||
size_t unpad_w = static_cast<size_t>(r * input.cols);
|
||||
size_t unpad_h = static_cast<size_t>(r * input.rows);
|
||||
// Resize the input image
|
||||
cv::cuda::GpuMat re;
|
||||
re.create(unpad_h, unpad_w, input.type());
|
||||
cv::cuda::resize(input, re, re.size(), 0, 0, cv::INTER_LINEAR, stream);
|
||||
// Create the output image and fill with the background color
|
||||
cv::cuda::GpuMat out;
|
||||
out.create(height, width, input.type());
|
||||
out.setTo(bgcolor, stream);
|
||||
// Copy the resized content into the top-left corner of the output image
|
||||
re.copyTo(out(cv::Rect(0, 0, re.cols, re.rows)), stream);
|
||||
stream.waitForCompletion();
|
||||
return out;
|
||||
}
|
||||
|
||||
template <typename T> void Engine<T>::getDeviceNames(std::vector<std::string> &deviceNames) {
    // Appends one entry per CUDA device (in device-index order) containing the
    // driver-reported device name. Callers index this list by deviceIndex, so
    // the one-entry-per-device invariant must hold.
    //
    // BUG FIX: numGPUs was previously uninitialized and the return code of
    // cudaGetDeviceCount was ignored — on a machine without a usable CUDA
    // runtime the loop bound was garbage (UB).
    int numGPUs = 0;
    if (cudaGetDeviceCount(&numGPUs) != cudaSuccess) {
        return; // no usable CUDA runtime/driver — leave the list unchanged
    }
    for (int device = 0; device < numGPUs; device++) {
        cudaDeviceProp prop{}; // zero-init: name stays empty if the query fails
        cudaGetDeviceProperties(&prop, device);
        deviceNames.emplace_back(prop.name);
    }
}
|
||||
template <typename T> int Engine<T>::getBindingIndexByName(const std::string& name) {
    // Linear scan over the engine's IO tensors: returns the index of the
    // tensor whose name matches `name` exactly, or -1 when no tensor matches.
    const int tensorCount = m_engine->getNbIOTensors();
    for (int idx = 0; idx < tensorCount; ++idx) {
        if (name == m_engine->getIOTensorName(idx)) {
            return idx;
        }
    }
    return -1;
}
|
||||
|
||||
|
||||
//template <typename T> std::string Engine<T>::serializeEngineOptions(const ANSCENTER::Options &options, const std::string &onnxModelPath) {
|
||||
// const auto filenamePos = onnxModelPath.find_last_of('/') + 1;
|
||||
// std::string engineName = onnxModelPath.substr(filenamePos, onnxModelPath.find_last_of('.') - filenamePos) + ".engine";
|
||||
//
|
||||
// // Add the GPU device name to the file to ensure that the model is only used
|
||||
// // on devices with the exact same GPU
|
||||
// std::vector<std::string> deviceNames;
|
||||
// getDeviceNames(deviceNames);
|
||||
//
|
||||
// if (static_cast<size_t>(options.deviceIndex) >= deviceNames.size()) {
|
||||
// auto msg = "Error, provided device index is out of range!";
|
||||
// std::cout<<msg;
|
||||
// return "";
|
||||
// }
|
||||
//
|
||||
// auto deviceName = deviceNames[options.deviceIndex];
|
||||
// // Remove spaces from the device name
|
||||
// deviceName.erase(std::remove_if(deviceName.begin(), deviceName.end(), ::isspace), deviceName.end());
|
||||
// engineName += "." + deviceName;
|
||||
// // Serialize the specified options into the filename
|
||||
// if (options.precision == ANSCENTER::Precision::FP16) {
|
||||
// engineName += ".fp16";
|
||||
// } else if (options.precision == ANSCENTER::Precision::FP32) {
|
||||
// engineName += ".fp32";
|
||||
// } else {
|
||||
// engineName += ".int8";
|
||||
// }
|
||||
// if (options.maxBatchSize > 1) {
|
||||
// engineName += "." + std::to_string(options.maxBatchSize);
|
||||
// }
|
||||
// return engineName;
|
||||
//}
|
||||
|
||||
template <typename T>
|
||||
std::string Engine<T>::serializeEngineOptions(const ANSCENTER::Options& options,
|
||||
const std::string& onnxModelPath)
|
||||
{
|
||||
// -- Base name from ONNX file ---------------------------------------------
|
||||
const auto filenamePos = onnxModelPath.find_last_of('/') + 1;
|
||||
std::string engineName = onnxModelPath.substr(
|
||||
filenamePos,
|
||||
onnxModelPath.find_last_of('.') - filenamePos) + ".engine";
|
||||
|
||||
// -- GPU device name ------------------------------------------------------
|
||||
// Ensures the engine is only loaded on the exact GPU it was built for.
|
||||
std::vector<std::string> deviceNames;
|
||||
getDeviceNames(deviceNames);
|
||||
if (static_cast<size_t>(options.deviceIndex) >= deviceNames.size()) {
|
||||
std::cout << "Error, provided device index is out of range!";
|
||||
return "";
|
||||
}
|
||||
auto deviceName = deviceNames[options.deviceIndex];
|
||||
deviceName.erase(
|
||||
std::remove_if(deviceName.begin(), deviceName.end(), ::isspace),
|
||||
deviceName.end());
|
||||
engineName += "." + deviceName;
|
||||
|
||||
// -- Precision ------------------------------------------------------------
|
||||
if (options.precision == ANSCENTER::Precision::FP16) {
|
||||
engineName += ".fp16";
|
||||
}
|
||||
else if (options.precision == ANSCENTER::Precision::FP32) {
|
||||
engineName += ".fp32";
|
||||
}
|
||||
else {
|
||||
engineName += ".int8";
|
||||
}
|
||||
|
||||
// -- Batch size -----------------------------------------------------------
|
||||
if (options.maxBatchSize > 1) {
|
||||
engineName += ".b" + std::to_string(options.maxBatchSize);
|
||||
}
|
||||
|
||||
// -- Max spatial dims: intentionally NOT included in the filename ----------
|
||||
// buildWithRetry() may reduce max dims (e.g. 2560→1920) when GPU memory
|
||||
// is insufficient. If the filename included .s{H}x{W}, the next launch
|
||||
// would look for .s2560x2560, miss the cached .s1920x1920, and waste
|
||||
// minutes re-attempting the doomed 2560 build before falling back.
|
||||
// Without the suffix, the cache is found immediately on the next launch.
|
||||
// The actual profile max is queried at runtime via getProfileMaxHeight/Width.
|
||||
|
||||
// -- TensorRT version -----------------------------------------------------
|
||||
// Engine format changes between TensorRT minor versions -- must rebuild.
|
||||
// NV_TENSORRT_MAJOR, NV_TENSORRT_MINOR, NV_TENSORRT_PATCH are defined in
|
||||
// <NvInferVersion.h> which is included via NvInfer.h.
|
||||
engineName += ".trt"
|
||||
+ std::to_string(NV_TENSORRT_MAJOR) + "."
|
||||
+ std::to_string(NV_TENSORRT_MINOR) + "."
|
||||
+ std::to_string(NV_TENSORRT_PATCH);
|
||||
|
||||
// -- CUDA runtime version -------------------------------------------------
|
||||
// Engines built with different CUDA versions may use different PTX/cubin
|
||||
// formats and must be rebuilt.
|
||||
int cudaVersion = 0;
|
||||
cudaRuntimeGetVersion(&cudaVersion);
|
||||
const int cudaMajor = cudaVersion / 1000;
|
||||
const int cudaMinor = (cudaVersion % 1000) / 10;
|
||||
engineName += ".cuda"
|
||||
+ std::to_string(cudaMajor) + "."
|
||||
+ std::to_string(cudaMinor);
|
||||
|
||||
// -- cuDNN version --------------------------------------------------------
|
||||
// cuDNN version affects layer implementations inside the engine.
|
||||
// CUDNN_MAJOR, CUDNN_MINOR, CUDNN_PATCHLEVEL are defined in <cudnn_version.h>.
|
||||
engineName += ".cudnn"
|
||||
+ std::to_string(CUDNN_MAJOR) + "."
|
||||
+ std::to_string(CUDNN_MINOR);
|
||||
|
||||
return engineName;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
cv::cuda::GpuMat Engine<T>::blobFromGpuMats(const std::vector<cv::cuda::GpuMat> &batchInput, const std::array<float, 3> &subVals,
|
||||
const std::array<float, 3> &divVals, bool normalize, bool swapRB,
|
||||
cv::cuda::Stream &stream) {
|
||||
cv::cuda::GpuMat result;
|
||||
if (batchInput.empty()) return result;
|
||||
if (batchInput[0].channels() != 3) return result;
|
||||
|
||||
const int H = batchInput[0].rows;
|
||||
const int W = batchInput[0].cols;
|
||||
const int batch = static_cast<int>(batchInput.size());
|
||||
const size_t planeSize = static_cast<size_t>(H) * W; // pixels per channel
|
||||
|
||||
// Output blob: planar NCHW layout stored as a single-channel GpuMat.
|
||||
// Total elements = batch * 3 * H * W.
|
||||
cv::cuda::GpuMat blob(1, batch * 3 * static_cast<int>(planeSize), CV_32FC1);
|
||||
|
||||
for (int img = 0; img < batch; ++img) {
|
||||
// 1. Convert to float and normalise while still in HWC (interleaved) format.
|
||||
// Channel-wise subtract / divide operate correctly on interleaved data.
|
||||
cv::cuda::GpuMat floatImg;
|
||||
if (normalize) {
|
||||
batchInput[img].convertTo(floatImg, CV_32FC3, 1.f / 255.f, stream);
|
||||
} else {
|
||||
batchInput[img].convertTo(floatImg, CV_32FC3, 1.0, stream);
|
||||
}
|
||||
|
||||
cv::cuda::subtract(floatImg, cv::Scalar(subVals[0], subVals[1], subVals[2]), floatImg, cv::noArray(), -1, stream);
|
||||
cv::cuda::divide(floatImg, cv::Scalar(divVals[0], divVals[1], divVals[2]), floatImg, 1, -1, stream);
|
||||
|
||||
// 2. Split normalised HWC image into CHW planes directly into the blob.
|
||||
size_t offset = static_cast<size_t>(img) * 3 * planeSize;
|
||||
|
||||
if (swapRB) {
|
||||
// BGR input -> RGB planes: B goes to plane 2, G to plane 1, R to plane 0
|
||||
std::vector<cv::cuda::GpuMat> channels{
|
||||
cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset + 2 * planeSize), // B -> plane 2
|
||||
cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset + planeSize), // G -> plane 1
|
||||
cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset)}; // R -> plane 0
|
||||
cv::cuda::split(floatImg, channels, stream);
|
||||
} else {
|
||||
// BGR input -> BGR planes: keep channel order
|
||||
std::vector<cv::cuda::GpuMat> channels{
|
||||
cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset),
|
||||
cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset + planeSize),
|
||||
cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset + 2 * planeSize)};
|
||||
cv::cuda::split(floatImg, channels, stream);
|
||||
}
|
||||
}
|
||||
|
||||
return blob;
|
||||
}
|
||||
|
||||
template <typename T> void Engine<T>::clearGpuBuffers() {
    // Release every I/O device buffer (inputs AND outputs). An earlier
    // revision freed only the output buffers, leaking the input allocations
    // made in loadNetwork(); walking the whole vector fixes that.
    // Iterating an empty vector is a no-op, so no emptiness guard is needed.
    for (void *devPtr : m_buffers) {
        if (devPtr != nullptr) {
            Util::checkCudaErrorCode(cudaFree(devPtr));
        }
    }
    m_buffers.clear();
}
|
||||
9
engines/TensorRTAPI/include/engine/NvDynLoader.h
Normal file
9
engines/TensorRTAPI/include/engine/NvDynLoader.h
Normal file
@@ -0,0 +1,9 @@
|
||||
#pragma once
|
||||
// ============================================================================
|
||||
// Forwarding header — NvDynLoader moved to ANSLibsLoader
|
||||
//
|
||||
// This file is retained for backward compatibility. All consuming projects
|
||||
// should update their include paths to reference ANSLibsLoader/include/
|
||||
// directly. Once all projects are updated, this file can be removed.
|
||||
// ============================================================================
|
||||
#include "../../../ANSLibsLoader/include/NvDynLoader.h"
|
||||
50
engines/TensorRTAPI/include/engine/TRTCompat.h
Normal file
50
engines/TensorRTAPI/include/engine/TRTCompat.h
Normal file
@@ -0,0 +1,50 @@
|
||||
#pragma once
// ============================================================================
// TRTCompat.h -- TensorRT version compatibility macros
//
// Centralises all TRT-version-dependent API differences so that the rest of
// the codebase can be compiled against TRT 8.x or TRT 10.x without scattering
// #if blocks everywhere.
//
// Build 1: CUDA 11.8 + cuDNN 8 + TensorRT 8.6 + OpenCV 4.10 (SM 35-86)
// Build 2: CUDA 13.1 + cuDNN 9 + TensorRT 10 + OpenCV 4.13 (SM 75-121)
// ============================================================================

#include <NvInferVersion.h>

// ---------------------------------------------------------------------------
// Network creation
// ---------------------------------------------------------------------------
// TRT 10+: kEXPLICIT_BATCH was removed (explicit batch is the only mode), so
//          the flags argument to createNetworkV2() is simply 0.
// TRT 8.x: The explicit-batch flag must be passed explicitly.
// `builder` is an IBuilder*; the macro argument is parenthesised so any
// pointer expression may be passed.
#if NV_TENSORRT_MAJOR >= 10
#define TRT_CREATE_NETWORK(builder) \
    (builder)->createNetworkV2(0)
#else
#define TRT_CREATE_NETWORK(builder) \
    (builder)->createNetworkV2( \
        1U << static_cast<uint32_t>( \
            nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH))
#endif

// ---------------------------------------------------------------------------
// Inference execution
// ---------------------------------------------------------------------------
// TRT 10+: enqueueV3(stream) — uses tensor addresses pre-bound via
//          setTensorAddress(). NOTE: the `buffers` argument is NOT evaluated
//          on this path; callers must have bound tensor addresses beforehand.
// TRT 8.x: enqueueV2(bindings, stream, nullptr) — uses a void** array
//          indexed by binding position. `buffers` must be a container with
//          .data() yielding the binding pointers in binding-index order.
#if NV_TENSORRT_MAJOR >= 10
#define TRT_ENQUEUE(context, stream, buffers) \
    (context)->enqueueV3(stream)
#else
#define TRT_ENQUEUE(context, stream, buffers) \
    (context)->enqueueV2( \
        reinterpret_cast<void**>((buffers).data()), (stream), nullptr)
#endif

// ---------------------------------------------------------------------------
// Feature-detection helpers
// ---------------------------------------------------------------------------
// Usable in both #if directives and ordinary code (constant expressions).
#define TRT_HAS_ENQUEUE_V3 (NV_TENSORRT_MAJOR >= 10)
#define TRT_HAS_EXPLICIT_BATCH_FLAG (NV_TENSORRT_MAJOR < 10)
|
||||
177
engines/TensorRTAPI/include/engine/TRTEngineCache.h
Normal file
177
engines/TensorRTAPI/include/engine/TRTEngineCache.h
Normal file
@@ -0,0 +1,177 @@
|
||||
#pragma once
// TRTEngineCache.h — Process-wide cache for shared TensorRT ICudaEngine instances.
//
// When multiple AI tasks load the same model (same .engine file + GPU), this cache
// ensures only ONE copy of the model weights lives in VRAM. Each task creates its
// own IExecutionContext from the shared ICudaEngine (TRT-supported pattern).
//
// Usage in loadNetwork():
//   auto& cache = TRTEngineCache::instance();
//   auto hit = cache.tryGet(enginePath, gpuIdx);
//   if (hit.engine) {
//       m_engine = hit.engine; m_runtime = hit.runtime;   // cache hit
//   } else {
//       // ... deserialize as usual ...
//       m_engine = cache.putIfAbsent(enginePath, gpuIdx, runtime, engine);
//   }
//
// In ~Engine():
//   cache.release(enginePath, gpuIdx);

#include <atomic>      // std::atomic — was missing; previously compiled only via transitive includes
#include <functional>  // std::hash<int> for CacheKeyHash — was relied on transitively
#include <memory>
#include <mutex>
#include <string>
#include <unordered_map>
#include <iostream>
#include <NvInfer.h>

/// Process-wide flag: set to true during DLL_PROCESS_DETACH when ExitProcess
/// is in progress (lpReserved != NULL). Worker threads are already dead in
/// this state, so thread::join() would deadlock and CUDA/TRT calls are unsafe.
/// Checked by Engine::~Engine to skip cleanup that requires live threads or GPUs.
inline std::atomic<bool>& g_processExiting() {
    static std::atomic<bool> s_flag{false};
    return s_flag;
}

/// Singleton, mutex-protected map of (engine file path, GPU index) ->
/// refcounted {ICudaEngine, IRuntime} pair. All public methods are
/// thread-safe; diagnostics are printed to stdout on every state change.
class TRTEngineCache {
  public:
    /// Result of tryGet(): both pointers null on miss.
    struct CacheHit {
        std::shared_ptr<nvinfer1::ICudaEngine> engine;
        std::shared_ptr<nvinfer1::IRuntime> runtime;
    };

    /// Meyers-singleton accessor.
    static TRTEngineCache& instance() {
        static TRTEngineCache s_instance;
        return s_instance;
    }

    /// Global bypass — when true, tryGet() always returns miss, putIfAbsent()
    /// is a no-op, and buildLoadNetwork/loadNetwork force single-GPU path.
    /// Used by OptimizeModelStr to prevent inner engines (created by
    /// custom DLLs via ANSLIB.dll) from creating pools/caching.
    /// Stored as a member of the singleton to guarantee a single instance
    /// across all translation units (avoids MSVC inline static duplication).
    static std::atomic<bool>& globalBypass() {
        return instance().m_globalBypass;
    }

    std::atomic<bool> m_globalBypass{false};

    /// Try to get a cached engine. Returns {nullptr, nullptr} on miss.
    /// On hit, increments refcount.
    CacheHit tryGet(const std::string& engineFilePath, int gpuIndex) {
        if (globalBypass().load(std::memory_order_relaxed)) return {nullptr, nullptr};
        std::lock_guard<std::mutex> lock(m_mutex);
        auto it = m_cache.find({engineFilePath, gpuIndex});
        if (it != m_cache.end()) {
            it->second.refcount++;
            std::cout << "[TRTEngineCache] HIT: " << engineFilePath
                      << " GPU[" << gpuIndex << "] refs=" << it->second.refcount << std::endl;
            return {it->second.engine, it->second.runtime};
        }
        return {nullptr, nullptr};
    }

    /// Store a newly deserialized engine. If another thread already stored the
    /// same key (race), returns the existing one and the caller's copy is discarded.
    /// Increments refcount for the returned engine.
    std::shared_ptr<nvinfer1::ICudaEngine> putIfAbsent(
            const std::string& engineFilePath, int gpuIndex,
            std::shared_ptr<nvinfer1::IRuntime> runtime,
            std::shared_ptr<nvinfer1::ICudaEngine> engine) {
        if (globalBypass().load(std::memory_order_relaxed)) return engine; // don't cache
        std::lock_guard<std::mutex> lock(m_mutex);
        CacheKey key{engineFilePath, gpuIndex};
        auto it = m_cache.find(key);
        if (it != m_cache.end()) {
            // Another thread beat us — use theirs, discard ours
            it->second.refcount++;
            std::cout << "[TRTEngineCache] RACE: using existing for " << engineFilePath
                      << " GPU[" << gpuIndex << "] refs=" << it->second.refcount << std::endl;
            return it->second.engine;
        }
        // First to store — insert
        CachedEntry entry;
        entry.engine = std::move(engine);
        entry.runtime = std::move(runtime);
        entry.refcount = 1;
        auto inserted = m_cache.emplace(std::move(key), std::move(entry));
        std::cout << "[TRTEngineCache] STORED: " << engineFilePath
                  << " GPU[" << gpuIndex << "] refs=1" << std::endl;
        return inserted.first->second.engine;
    }

    /// Decrement refcount. When refcount reaches 0, the engine is evicted immediately
    /// to release VRAM and file handles (allows ModelOptimizer to rebuild .engine files
    /// while LabVIEW is running).
    void release(const std::string& engineFilePath, int gpuIndex) {
        std::lock_guard<std::mutex> lock(m_mutex);
        auto it = m_cache.find({engineFilePath, gpuIndex});
        if (it != m_cache.end() && it->second.refcount > 0) {
            it->second.refcount--;
            std::cout << "[TRTEngineCache] RELEASE: " << engineFilePath
                      << " GPU[" << gpuIndex << "] refs=" << it->second.refcount << std::endl;
            if (it->second.refcount <= 0) {
                std::cout << "[TRTEngineCache] EVICT (refcount=0): " << engineFilePath
                          << " GPU[" << gpuIndex << "]" << std::endl;
                m_cache.erase(it);
            }
        }
    }

    /// Remove all entries with refcount == 0 (call at shutdown or when VRAM tight).
    void evictUnused() {
        std::lock_guard<std::mutex> lock(m_mutex);
        for (auto it = m_cache.begin(); it != m_cache.end(); ) {
            if (it->second.refcount <= 0) {
                std::cout << "[TRTEngineCache] EVICT: " << it->first.path
                          << " GPU[" << it->first.gpuIndex << "]" << std::endl;
                it = m_cache.erase(it);
            } else {
                ++it;
            }
        }
    }

    /// Clear all cached engines immediately (call during DLL_PROCESS_DETACH
    /// BEFORE destroying engine handles, to avoid calling into unloaded TRT DLLs).
    void clearAll() {
        std::lock_guard<std::mutex> lock(m_mutex);
        std::cout << "[TRTEngineCache] CLEAR ALL (" << m_cache.size() << " entries)" << std::endl;
        m_cache.clear(); // shared_ptrs released — engines destroyed while TRT is still loaded
    }

    /// Number of cached engines (for diagnostics).
    size_t size() const {
        std::lock_guard<std::mutex> lock(m_mutex);
        return m_cache.size();
    }

  private:
    TRTEngineCache() = default;
    TRTEngineCache(const TRTEngineCache&) = delete;
    TRTEngineCache& operator=(const TRTEngineCache&) = delete;

    /// Cache key: engine file path + GPU index (same file on two GPUs is two entries).
    struct CacheKey {
        std::string path;
        int gpuIndex = 0;
        bool operator==(const CacheKey& o) const {
            return path == o.path && gpuIndex == o.gpuIndex;
        }
    };
    /// Hash combiner: string hash XOR shifted int hash (collisions are cheap —
    /// the map resolves them via operator==).
    struct CacheKeyHash {
        size_t operator()(const CacheKey& k) const {
            return std::hash<std::string>{}(k.path) ^
                   (std::hash<int>{}(k.gpuIndex) << 16);
        }
    };
    /// Refcounted entry; runtime must outlive the engine it deserialized.
    struct CachedEntry {
        std::shared_ptr<nvinfer1::ICudaEngine> engine;
        std::shared_ptr<nvinfer1::IRuntime> runtime;
        int refcount = 0;
    };

    std::unordered_map<CacheKey, CachedEntry, CacheKeyHash> m_cache;
    mutable std::mutex m_mutex;
};
|
||||
Reference in New Issue
Block a user