Initial setup for CLion
This commit is contained in:
719
engines/TensorRTAPI/include/engine/EngineRunInference.inl
Normal file
719
engines/TensorRTAPI/include/engine/EngineRunInference.inl
Normal file
@@ -0,0 +1,719 @@
|
||||
#pragma once

#include <chrono>
#include <cstring>
#include <filesystem>
#include <iomanip>
#include <iostream>
#include <mutex>
#include <vector>

#include "TRTCompat.h"
|
||||
|
||||
// Mutex serialising CUDA graph capture across Engine instances.
// TRT's enqueueV3 uses shared internal resources (workspace, memory pools)
// at the CUDA context level. When two Engine instances on the same GPU
// capture graphs concurrently, these cross-stream dependencies violate
// graph capture rules ("operation not permitted when stream is capturing").
// This mutex serialises graph captures across all Engine<T> instances —
// subsequent cudaGraphLaunch calls are still concurrent.
// NOTE(review): despite the original "per-device" wording, this is a single
// process-wide mutex, so captures on *different* devices are serialised as
// well. That is stricter than strictly necessary but safe.
static std::mutex& graphCaptureMutex() {
    // Meyers singleton: function-local static gives thread-safe,
    // on-first-use initialisation (guaranteed since C++11).
    static std::mutex m;
    return m;
}
|
||||
|
||||
template <typename T>
|
||||
void Engine<T>::warmUp(int iterations) {
|
||||
if (m_verbose) {
|
||||
std::cout << "\n========================================" << std::endl;
|
||||
std::cout << "Engine Warmup" << std::endl;
|
||||
std::cout << "========================================" << std::endl;
|
||||
}
|
||||
|
||||
// Determine batch sizes to warm up
|
||||
std::vector<int> batchSizes;
|
||||
|
||||
if (m_options.maxBatchSize > 1) {
|
||||
if (m_verbose) {
|
||||
std::cout << "Dynamic batch engine detected (max batch: " << m_options.maxBatchSize << ")" << std::endl;
|
||||
std::cout << "Warming up common batch sizes to pre-compile kernels..." << std::endl;
|
||||
}
|
||||
|
||||
// Warm up ALL batch sizes from 1 to maxBatchSize.
|
||||
// Each unseen batch size incurs a 100-300ms kernel compilation penalty
|
||||
// on first use. Warming all sizes eliminates that latency at inference
|
||||
// time and ensures every CUDA graph is pre-captured.
|
||||
for (int batch = 1; batch <= m_options.maxBatchSize; ++batch) {
|
||||
batchSizes.push_back(batch);
|
||||
}
|
||||
}
|
||||
else {
|
||||
if (m_verbose) std::cout << "Fixed batch engine detected (batch size: " << m_options.maxBatchSize << ")" << std::endl;
|
||||
batchSizes.push_back(m_options.maxBatchSize);
|
||||
}
|
||||
|
||||
if (m_verbose) {
|
||||
std::cout << "Batch sizes to warm up: ";
|
||||
for (size_t i = 0; i < batchSizes.size(); ++i) {
|
||||
std::cout << batchSizes[i];
|
||||
if (i < batchSizes.size() - 1) std::cout << ", ";
|
||||
}
|
||||
std::cout << std::endl;
|
||||
}
|
||||
|
||||
// Warm up each batch size.
|
||||
// The first call triggers kernel compilation; the second captures the CUDA
|
||||
// graph. Additional iterations only measure steady-state latency for the
|
||||
// optBatchSize (printed as a diagnostic).
|
||||
for (int batchSize : batchSizes) {
|
||||
const int iters = (batchSize == m_options.optBatchSize) ? iterations : 2;
|
||||
if (m_verbose) std::cout << "\nWarming up batch=" << batchSize << " (x" << iters << " iterations)..." << std::endl;
|
||||
|
||||
// Create dummy inputs for this batch size
|
||||
std::vector<std::vector<cv::cuda::GpuMat>> dummyInputs;
|
||||
|
||||
for (size_t i = 0; i < m_inputDims.size(); ++i) {
|
||||
const auto& dims = m_inputDims[i];
|
||||
std::vector<cv::cuda::GpuMat> batch;
|
||||
|
||||
// FIXED: Create proper dummy images on GPU
|
||||
// For dynamic spatial dims, use opt dimensions for warmup
|
||||
int warmH = (dims.d[1] > 0) ? dims.d[1] : m_options.optInputHeight;
|
||||
int warmW = (dims.d[2] > 0) ? dims.d[2] : m_options.optInputWidth;
|
||||
for (int b = 0; b < batchSize; ++b) {
|
||||
// Create on CPU first
|
||||
cv::Mat cpuImg(warmH, warmW, CV_32FC(dims.d[0]), cv::Scalar(0.5f, 0.5f, 0.5f));
|
||||
|
||||
// Upload to GPU
|
||||
cv::cuda::GpuMat gpuImg;
|
||||
gpuImg.upload(cpuImg);
|
||||
|
||||
batch.push_back(gpuImg);
|
||||
}
|
||||
|
||||
dummyInputs.push_back(batch);
|
||||
}
|
||||
|
||||
std::vector<std::vector<std::vector<T>>> dummyOutputs;
|
||||
|
||||
// Time the first iteration (kernel compilation happens here)
|
||||
auto start = std::chrono::high_resolution_clock::now();
|
||||
|
||||
bool firstSuccess = runInference(dummyInputs, dummyOutputs);
|
||||
|
||||
auto end = std::chrono::high_resolution_clock::now();
|
||||
auto firstTime = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
|
||||
|
||||
if (!firstSuccess) {
|
||||
if (m_verbose) std::cout << " ✗ ERROR: First iteration failed for batch=" << batchSize << std::endl;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (m_verbose) {
|
||||
std::cout << " First iteration: " << firstTime << " ms";
|
||||
if (firstTime > 100) {
|
||||
std::cout << " (kernel compilation detected)";
|
||||
}
|
||||
std::cout << std::endl;
|
||||
}
|
||||
|
||||
// Run remaining iterations to measure stable performance
|
||||
if (iters > 1) {
|
||||
auto iterStart = std::chrono::high_resolution_clock::now();
|
||||
|
||||
for (int i = 1; i < iters; ++i) {
|
||||
bool success = runInference(dummyInputs, dummyOutputs);
|
||||
if (!success) {
|
||||
if (m_verbose) std::cout << " ✗ ERROR: Iteration " << i << " failed" << std::endl;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
auto iterEnd = std::chrono::high_resolution_clock::now();
|
||||
auto totalTime = std::chrono::duration_cast<std::chrono::milliseconds>(iterEnd - iterStart).count();
|
||||
float avgTime = totalTime / static_cast<float>(iters - 1);
|
||||
|
||||
if (m_verbose) {
|
||||
std::cout << " Subsequent iterations (avg): " << std::fixed << std::setprecision(1)
|
||||
<< avgTime << " ms" << std::endl;
|
||||
|
||||
if (firstTime > 100 && avgTime < firstTime * 0.5f) {
|
||||
float speedup = firstTime / avgTime;
|
||||
std::cout << " ✓ Speedup after warmup: " << std::fixed << std::setprecision(1)
|
||||
<< speedup << "x faster" << std::endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (m_verbose) std::cout << " ✓ Batch=" << batchSize << " warmed up successfully" << std::endl;
|
||||
}
|
||||
|
||||
if (m_verbose) {
|
||||
std::cout << "\n========================================" << std::endl;
|
||||
std::cout << "Warmup Complete!" << std::endl;
|
||||
std::cout << "========================================" << std::endl;
|
||||
std::cout << "Kernels pre-compiled for all batch sizes." << std::endl;
|
||||
std::cout << "========================================\n" << std::endl;
|
||||
}
|
||||
}
|
||||
template <typename T>
|
||||
bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& inputs,std::vector<std::vector<std::vector<T>>>& featureVectors) {
|
||||
|
||||
// ============================================================================
|
||||
// MULTI-GPU POOL DISPATCH
|
||||
// ============================================================================
|
||||
// If this Engine was initialised with initializePool() / initializePoolFromEngine()
|
||||
// the m_slots vector is non-empty. In that case, delegate to the pool
|
||||
// dispatcher which acquires the first idle slot and runs inference there.
|
||||
// This branch is NEVER taken for single-GPU use (buildLoadNetwork / loadNetwork).
|
||||
if (!m_slots.empty()) {
|
||||
return runInferenceFromPool(inputs, featureVectors);
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// SINGLE-ENGINE SERIALISATION
|
||||
// ============================================================================
|
||||
// The single Engine instance has shared mutable state (m_buffers, m_lastBatchSize,
|
||||
// m_inferenceStream, TRT execution context). If two LabVIEW threads call
|
||||
// runInference concurrently with different batch sizes, one will overwrite
|
||||
// the input shapes and buffers while the other is mid-inference, causing a
|
||||
// fatal "illegal memory access" that permanently corrupts the CUDA context.
|
||||
//
|
||||
// Pool-mode slots have their own busy-flag dispatch so they do NOT need this.
|
||||
std::lock_guard<std::mutex> inferenceLock(m_inferenceMutex);
|
||||
|
||||
// ============================================================================
|
||||
// THREAD-SAFE GPU CONTEXT
|
||||
// ============================================================================
|
||||
// Ensure the calling thread's CUDA device matches this engine's GPU.
|
||||
// This is essential for multi-GPU round-robin: LabVIEW reuses threads
|
||||
// across tasks, so a thread that last ran inference on GPU 1 might now
|
||||
// be running a task on GPU 0. Without this, cv::cuda::GpuMat allocations
|
||||
// and kernel launches would target the wrong GPU, causing result corruption.
|
||||
// Skip cudaSetDevice if already on the correct device — under WDDM
|
||||
// with multiple GPUs each call costs 1-5ms of scheduler overhead.
|
||||
{
|
||||
int currentDev = -1;
|
||||
cudaGetDevice(¤tDev);
|
||||
if (currentDev != m_options.deviceIndex) {
|
||||
cudaSetDevice(m_options.deviceIndex);
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// DEBUG: First call diagnostics (per-instance, not process-wide)
|
||||
// ============================================================================
|
||||
|
||||
if (m_verbose && m_firstInferenceCall) {
|
||||
std::cout << "\n=== First runInference Call ===" << std::endl;
|
||||
std::cout << "Number of input tensors: " << inputs.size() << std::endl;
|
||||
for (size_t i = 0; i < inputs.size(); ++i) {
|
||||
std::cout << "Input " << i << " batch size: " << inputs[i].size() << std::endl;
|
||||
if (!inputs[i].empty()) {
|
||||
const auto& img = inputs[i][0];
|
||||
std::cout << " Image shape: " << img.cols << "x" << img.rows
|
||||
<< "x" << img.channels() << " (type: " << img.type() << ")" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
// Print optimization profile information
|
||||
std::cout << "\n=== Engine Profile Information ===" << std::endl;
|
||||
std::cout << "Number of optimization profiles: "
|
||||
<< m_engine->getNbOptimizationProfiles() << std::endl;
|
||||
|
||||
if (m_engine->getNbOptimizationProfiles() > 0) {
|
||||
for (int profile = 0; profile < m_engine->getNbOptimizationProfiles(); ++profile) {
|
||||
std::cout << "\n--- Profile " << profile << " ---" << std::endl;
|
||||
|
||||
for (size_t i = 0; i < m_IOTensorNames.size(); ++i) {
|
||||
const char* tensorName = m_IOTensorNames[i].c_str();
|
||||
|
||||
// Check if this is an input tensor
|
||||
auto ioMode = m_engine->getTensorIOMode(tensorName);
|
||||
if (ioMode != nvinfer1::TensorIOMode::kINPUT) {
|
||||
continue;
|
||||
}
|
||||
|
||||
auto minDims = m_engine->getProfileShape(tensorName, profile,
|
||||
nvinfer1::OptProfileSelector::kMIN);
|
||||
auto optDims = m_engine->getProfileShape(tensorName, profile,
|
||||
nvinfer1::OptProfileSelector::kOPT);
|
||||
auto maxDims = m_engine->getProfileShape(tensorName, profile,
|
||||
nvinfer1::OptProfileSelector::kMAX);
|
||||
|
||||
std::cout << "Tensor '" << tensorName << "' (INPUT):" << std::endl;
|
||||
std::cout << " Min: [" << minDims.d[0];
|
||||
for (int j = 1; j < minDims.nbDims; ++j) std::cout << "," << minDims.d[j];
|
||||
std::cout << "]" << std::endl;
|
||||
|
||||
std::cout << " Opt: [" << optDims.d[0];
|
||||
for (int j = 1; j < optDims.nbDims; ++j) std::cout << "," << optDims.d[j];
|
||||
std::cout << "]" << std::endl;
|
||||
|
||||
std::cout << " Max: [" << maxDims.d[0];
|
||||
for (int j = 1; j < maxDims.nbDims; ++j) std::cout << "," << maxDims.d[j];
|
||||
std::cout << "]" << std::endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!m_context->allInputDimensionsSpecified()) {
|
||||
std::cout << "ERROR: Input dimensions not specified in context!" << std::endl;
|
||||
return false;
|
||||
}
|
||||
|
||||
std::cout << "\nContext state: All dimensions specified ✓" << std::endl;
|
||||
m_firstInferenceCall = false;
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// INPUT VALIDATION
|
||||
// ============================================================================
|
||||
|
||||
if (inputs.empty() || inputs[0].empty()) {
|
||||
std::cout << "Error: Empty input" << std::endl;
|
||||
return false;
|
||||
}
|
||||
|
||||
const auto numInputs = m_inputDims.size();
|
||||
if (inputs.size() != numInputs) {
|
||||
std::cout << "Error: Wrong number of inputs. Expected: " << numInputs
|
||||
<< ", Got: " << inputs.size() << std::endl;
|
||||
return false;
|
||||
}
|
||||
|
||||
const auto batchSize = static_cast<int32_t>(inputs[0].size());
|
||||
|
||||
if (batchSize > m_options.maxBatchSize) {
|
||||
std::cout << "Error: Batch size " << batchSize << " exceeds maximum "
|
||||
<< m_options.maxBatchSize << std::endl;
|
||||
return false;
|
||||
}
|
||||
|
||||
if (batchSize < 1) {
|
||||
std::cout << "Error: Batch size must be at least 1" << std::endl;
|
||||
return false;
|
||||
}
|
||||
|
||||
// Validate batch size consistency across all inputs
|
||||
for (size_t i = 1; i < inputs.size(); ++i) {
|
||||
if (inputs[i].size() != static_cast<size_t>(batchSize)) {
|
||||
std::cout << "Error: Inconsistent batch sizes across inputs. Input 0: "
|
||||
<< batchSize << ", Input " << i << ": " << inputs[i].size() << std::endl;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// STREAM GUARD
|
||||
// ============================================================================
|
||||
// m_inferenceStream is now created eagerly in loadNetwork() so it is always
|
||||
// valid here. Guard against the (unlikely) edge case where runInference is
|
||||
// called before loadNetwork succeeds.
|
||||
if (!m_streamInitialized || !m_inferenceStream) {
|
||||
std::string errMsg = "Error: Inference stream not initialised. "
|
||||
"Call loadNetwork() / buildLoadNetwork() before runInference().";
|
||||
std::cout << errMsg << std::endl;
|
||||
logEngineEvent("[Engine] runInference: " + errMsg, true);
|
||||
return false;
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// SET INPUT SHAPES (batch size changed OR dynamic spatial dims need updating)
|
||||
// ============================================================================
|
||||
// Fast path: compute desired dims first, then compare against cached dims.
|
||||
// This avoids all TRT API calls when the shape hasn't actually changed —
|
||||
// critical for the recognizer which is called ~50-100x per image with
|
||||
// dynamic width but often the same or similar widths.
|
||||
// ============================================================================
|
||||
|
||||
{
|
||||
// Lazily initialise the dims cache on first call
|
||||
if (m_lastSetInputDims.empty()) {
|
||||
m_lastSetInputDims.resize(numInputs);
|
||||
for (size_t i = 0; i < numInputs; ++i) {
|
||||
m_lastSetInputDims[i].nbDims = 0; // force mismatch on first call
|
||||
}
|
||||
}
|
||||
|
||||
// Build desired dims for every input tensor (cheap — no TRT API calls)
|
||||
bool anyDimChanged = (m_lastBatchSize != batchSize);
|
||||
std::vector<nvinfer1::Dims> desiredDims(numInputs);
|
||||
for (size_t i = 0; i < numInputs; ++i) {
|
||||
nvinfer1::Dims& nd = desiredDims[i];
|
||||
nd.nbDims = 4;
|
||||
nd.d[0] = batchSize;
|
||||
nd.d[1] = m_inputDims[i].d[0]; // channels
|
||||
if (m_hasDynamicSpatialDims && !inputs[i].empty()) {
|
||||
const auto& firstImg = inputs[i][0];
|
||||
nd.d[2] = (m_inputDims[i].d[1] == -1) ? firstImg.rows : m_inputDims[i].d[1];
|
||||
nd.d[3] = (m_inputDims[i].d[2] == -1) ? firstImg.cols : m_inputDims[i].d[2];
|
||||
} else {
|
||||
nd.d[2] = m_inputDims[i].d[1];
|
||||
nd.d[3] = m_inputDims[i].d[2];
|
||||
}
|
||||
// Compare with cached
|
||||
if (!anyDimChanged) {
|
||||
const auto& cached = m_lastSetInputDims[i];
|
||||
if (cached.nbDims != nd.nbDims ||
|
||||
cached.d[0] != nd.d[0] || cached.d[1] != nd.d[1] ||
|
||||
cached.d[2] != nd.d[2] || cached.d[3] != nd.d[3]) {
|
||||
anyDimChanged = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (anyDimChanged) {
|
||||
// === First-time diagnostics (verbose, once) ===
|
||||
const bool firstTime = !m_batchShapeChangeLogged;
|
||||
|
||||
if (m_verbose && firstTime) {
|
||||
std::cout << "\nInfo: Batch size change: " << m_lastBatchSize
|
||||
<< " -> " << batchSize << std::endl;
|
||||
}
|
||||
|
||||
// Set optimization profile (only when truly needed)
|
||||
if (m_engine->getNbOptimizationProfiles() > 0) {
|
||||
int currentProfile = m_context->getOptimizationProfile();
|
||||
if (currentProfile != 0 || m_lastBatchSize < 0) {
|
||||
if (m_verbose && firstTime) {
|
||||
std::cout << " Setting optimization profile to 0..." << std::endl;
|
||||
}
|
||||
if (!m_context->setOptimizationProfileAsync(0, m_inferenceStream)) {
|
||||
std::cout << "Error: Failed to set optimization profile 0" << std::endl;
|
||||
return false;
|
||||
}
|
||||
cudaError_t syncErr = cudaStreamSynchronize(m_inferenceStream);
|
||||
if (syncErr != cudaSuccess) {
|
||||
std::cout << "Error: Failed to sync after profile change: "
|
||||
<< cudaGetErrorString(syncErr) << std::endl;
|
||||
return false;
|
||||
}
|
||||
if (m_verbose && firstTime) {
|
||||
std::cout << " Optimization profile set successfully" << std::endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Update shapes for input tensors that actually changed
|
||||
for (size_t i = 0; i < numInputs; ++i) {
|
||||
const char* tensorName = m_IOTensorNames[i].c_str();
|
||||
|
||||
// Skip non-input tensors
|
||||
auto ioMode = m_engine->getTensorIOMode(tensorName);
|
||||
if (ioMode != nvinfer1::TensorIOMode::kINPUT) continue;
|
||||
|
||||
const nvinfer1::Dims& newDims = desiredDims[i];
|
||||
const nvinfer1::Dims& cached = m_lastSetInputDims[i];
|
||||
|
||||
// Skip this tensor if its dims haven't changed
|
||||
if (cached.nbDims == newDims.nbDims &&
|
||||
cached.d[0] == newDims.d[0] && cached.d[1] == newDims.d[1] &&
|
||||
cached.d[2] == newDims.d[2] && cached.d[3] == newDims.d[3]) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// First-time verbose diagnostics
|
||||
if (m_verbose && firstTime) {
|
||||
std::cout << "\n Processing tensor " << i << ": '" << tensorName << "'" << std::endl;
|
||||
// Validate batch size range (first time only)
|
||||
if (m_engine->getNbOptimizationProfiles() > 0) {
|
||||
int profileIndex = m_context->getOptimizationProfile();
|
||||
nvinfer1::Dims minDims = m_engine->getProfileShape(
|
||||
tensorName, profileIndex, nvinfer1::OptProfileSelector::kMIN);
|
||||
nvinfer1::Dims maxDims = m_engine->getProfileShape(
|
||||
tensorName, profileIndex, nvinfer1::OptProfileSelector::kMAX);
|
||||
std::cout << " Profile batch range: [" << minDims.d[0]
|
||||
<< " to " << maxDims.d[0] << "]" << std::endl;
|
||||
if (batchSize < minDims.d[0] || batchSize > maxDims.d[0]) {
|
||||
std::cout << "Error: Batch size " << batchSize
|
||||
<< " outside profile range" << std::endl;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
auto currentShape = m_context->getTensorShape(tensorName);
|
||||
std::cout << " Current context shape: [";
|
||||
for (int j = 0; j < currentShape.nbDims; ++j) {
|
||||
if (j > 0) std::cout << ", ";
|
||||
std::cout << currentShape.d[j];
|
||||
}
|
||||
std::cout << "]" << std::endl;
|
||||
std::cout << " Setting new shape: [" << newDims.d[0] << ", "
|
||||
<< newDims.d[1] << ", " << newDims.d[2] << ", "
|
||||
<< newDims.d[3] << "]" << std::endl;
|
||||
}
|
||||
|
||||
if (!m_context->setInputShape(tensorName, newDims)) {
|
||||
std::cout << "Error: Failed to set input shape for '" << tensorName << "'" << std::endl;
|
||||
return false;
|
||||
}
|
||||
|
||||
// Verify shape (first time only — trust the API on hot path)
|
||||
if (firstTime) {
|
||||
auto verifyShape = m_context->getTensorShape(tensorName);
|
||||
if (verifyShape.d[0] != batchSize) {
|
||||
std::cout << "Error: Shape change didn't take effect. Expected batch "
|
||||
<< batchSize << ", got " << verifyShape.d[0] << std::endl;
|
||||
return false;
|
||||
}
|
||||
if (m_verbose) {
|
||||
std::cout << " Shape updated successfully" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
m_lastSetInputDims[i] = newDims;
|
||||
}
|
||||
|
||||
// Verify all input dimensions specified (first time only)
|
||||
if (firstTime) {
|
||||
if (!m_context->allInputDimensionsSpecified()) {
|
||||
std::cout << "Error: Not all input dimensions specified after shape change" << std::endl;
|
||||
for (size_t i = 0; i < m_IOTensorNames.size(); ++i) {
|
||||
auto shape = m_context->getTensorShape(m_IOTensorNames[i].c_str());
|
||||
std::cout << " " << m_IOTensorNames[i] << ": [";
|
||||
for (int j = 0; j < shape.nbDims; ++j) {
|
||||
if (j > 0) std::cout << ", ";
|
||||
std::cout << shape.d[j];
|
||||
}
|
||||
std::cout << "]" << std::endl;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
m_lastBatchSize = batchSize;
|
||||
m_batchShapeChangeLogged = true;
|
||||
if (m_verbose && firstTime) {
|
||||
std::cout << "\nInfo: Input shapes updated successfully for batch size "
|
||||
<< batchSize << " ✓\n" << std::endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// PREPROCESS AND COPY INPUTS TO GPU BUFFERS
|
||||
// ============================================================================
|
||||
|
||||
// Pass 1: Validate all input dimensions before any GPU work.
|
||||
// Dynamic dims (-1) are skipped in validation (they accept any size).
|
||||
for (size_t i = 0; i < numInputs; ++i) {
|
||||
const auto& batchInput = inputs[i];
|
||||
const auto& dims = m_inputDims[i];
|
||||
if (!batchInput.empty()) {
|
||||
const auto& firstImg = batchInput[0];
|
||||
bool mismatch = false;
|
||||
if (dims.d[0] > 0 && firstImg.channels() != dims.d[0]) mismatch = true;
|
||||
if (dims.d[1] > 0 && firstImg.rows != dims.d[1]) mismatch = true;
|
||||
if (dims.d[2] > 0 && firstImg.cols != dims.d[2]) mismatch = true;
|
||||
if (mismatch) {
|
||||
std::cout << "Error: Input " << i << " dimension mismatch!" << std::endl;
|
||||
std::cout << " Expected: " << dims.d[2] << "x" << dims.d[1]
|
||||
<< "x" << dims.d[0] << " (WxHxC, -1=dynamic)" << std::endl;
|
||||
std::cout << " Got: " << firstImg.cols << "x" << firstImg.rows
|
||||
<< "x" << firstImg.channels() << " (WxHxC)" << std::endl;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Pass 2: Preprocess + D2D copies — all on m_inferenceStream (no null stream).
|
||||
//
|
||||
// All OpenCV CUDA ops (convertTo, subtract, divide, split) in blobFromGpuMats
|
||||
// now run on m_inferenceStream via the cv::cuda::Stream wrapper. This means:
|
||||
// • No null-stream interaction — eliminates global sync barriers on WDDM
|
||||
// • No event bridge needed — same-stream ordering guarantees correctness
|
||||
// • CUDA graphs are safe — cv::cuda::split runs BEFORE graph capture
|
||||
//
|
||||
// GpuMat-lifetime: preprocessedBuffers keeps GpuMats alive past the final
|
||||
// cudaStreamSynchronize, so cudaFree() doesn't stall the pipeline.
|
||||
cv::cuda::Stream cvInferStream = cv::cuda::StreamAccessor::wrapStream(m_inferenceStream);
|
||||
std::vector<cv::cuda::GpuMat> preprocessedBuffers;
|
||||
preprocessedBuffers.reserve(numInputs);
|
||||
|
||||
for (size_t i = 0; i < numInputs; ++i) {
|
||||
const auto& batchInput = inputs[i];
|
||||
|
||||
// Preprocess on m_inferenceStream (not the null stream).
|
||||
preprocessedBuffers.push_back(
|
||||
blobFromGpuMats(batchInput, m_subVals, m_divVals, m_normalize, false, cvInferStream));
|
||||
|
||||
// D2D copy: same stream as preprocessing, so ordering is guaranteed.
|
||||
const auto& blobMat = preprocessedBuffers.back();
|
||||
const size_t copySize = static_cast<size_t>(blobMat.rows) * static_cast<size_t>(blobMat.cols) * blobMat.elemSize();
|
||||
cudaError_t copyErr = cudaMemcpyAsync(
|
||||
m_buffers[i],
|
||||
preprocessedBuffers.back().ptr<void>(),
|
||||
copySize,
|
||||
cudaMemcpyDeviceToDevice,
|
||||
m_inferenceStream);
|
||||
|
||||
if (copyErr != cudaSuccess) {
|
||||
std::cout << "Error: Failed to copy input " << i
|
||||
<< " to inference buffer: " << cudaGetErrorString(copyErr) << std::endl;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// PRE-ALLOCATE OUTPUT STRUCTURE
|
||||
// ============================================================================
|
||||
|
||||
const size_t numOutputs = m_outputLengths.size();
|
||||
|
||||
featureVectors.clear();
|
||||
featureVectors.resize(batchSize);
|
||||
for (int batch = 0; batch < batchSize; ++batch) {
|
||||
featureVectors[batch].resize(numOutputs);
|
||||
for (size_t outputIdx = 0; outputIdx < numOutputs; ++outputIdx)
|
||||
featureVectors[batch][outputIdx].resize(m_outputLengths[outputIdx]);
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// RUN INFERENCE + COPY OUTPUTS (CUDA Graph path or direct path)
|
||||
// ============================================================================
|
||||
|
||||
// CUDA Graph path
|
||||
// ---------------
|
||||
// On the first call for a given batchSize we capture enqueueV3 + D2H copies
|
||||
// into a reusable graph. Subsequent calls use cudaGraphLaunch, replacing
|
||||
// many individual kernel-submission API calls with a single launch.
|
||||
//
|
||||
// Prerequisites satisfied here:
|
||||
// • Preprocessing + D2D copies are queued on m_inferenceStream (same-stream
|
||||
// ordering guarantees they complete before captured kernels execute)
|
||||
// • m_pinnedOutputBuffers has stable addresses (allocated in loadNetwork)
|
||||
// • m_buffers (GPU outputs) have stable addresses (allocated in loadNetwork)
|
||||
//
|
||||
// Falls back to the direct path if pinned buffers are unavailable or if
|
||||
// graph capture/instantiation fails for any reason.
|
||||
|
||||
// CUDA graphs capture fixed kernel sequences; incompatible with dynamic spatial dims
|
||||
// (input H/W change per inference call → different TRT kernel plans each time).
|
||||
// Disabled for pool slots — concurrent graph captures on the same GPU corrupt the
|
||||
// CUDA context ("operation not permitted when stream is capturing").
|
||||
const bool canGraph = !m_disableGraphs && !m_pinnedOutputBuffers.empty() && !m_hasDynamicSpatialDims;
|
||||
bool graphUsed = false;
|
||||
|
||||
if (canGraph) {
|
||||
auto& graphExec = m_graphExecs[batchSize]; // inserts nullptr on first access
|
||||
if (!graphExec) {
|
||||
// First call for this batchSize -- capture a new graph.
|
||||
// Serialise captures across all Engine instances on this device to
|
||||
// prevent TRT's shared workspace from creating cross-stream
|
||||
// dependencies that violate CUDA graph capture rules.
|
||||
std::lock_guard<std::mutex> captureLock(graphCaptureMutex());
|
||||
|
||||
// Clear any sticky CUDA error from a prior failed capture so that
|
||||
// this attempt starts clean.
|
||||
cudaGetLastError();
|
||||
|
||||
cudaGraph_t graph = nullptr;
|
||||
bool captureOk = false;
|
||||
|
||||
if (cudaStreamBeginCapture(m_inferenceStream,
|
||||
cudaStreamCaptureModeRelaxed) == cudaSuccess) {
|
||||
// Record TRT kernels into the graph (not executed yet).
|
||||
TRT_ENQUEUE(m_context.get(), m_inferenceStream, m_buffers);
|
||||
|
||||
// Record D2H copies to stable pinned addresses.
|
||||
for (size_t outputIdx = 0; outputIdx < numOutputs; ++outputIdx) {
|
||||
cudaMemcpyAsync(
|
||||
m_pinnedOutputBuffers[outputIdx],
|
||||
static_cast<char*>(m_buffers[numInputs + outputIdx]),
|
||||
static_cast<size_t>(batchSize) * m_outputLengths[outputIdx] * sizeof(T),
|
||||
cudaMemcpyDeviceToHost,
|
||||
m_inferenceStream);
|
||||
}
|
||||
|
||||
captureOk = (cudaStreamEndCapture(m_inferenceStream, &graph) == cudaSuccess
|
||||
&& graph != nullptr);
|
||||
}
|
||||
|
||||
if (captureOk) {
|
||||
cudaGraphExec_t exec = nullptr;
|
||||
if (cudaGraphInstantiate(&exec, graph, nullptr, nullptr, 0) == cudaSuccess)
|
||||
graphExec = exec;
|
||||
cudaGraphDestroy(graph);
|
||||
}
|
||||
|
||||
if (!graphExec) {
|
||||
std::cout << "Warning: CUDA graph capture failed for batchSize="
|
||||
<< batchSize << " -- falling back to direct inference path." << std::endl;
|
||||
// Disable graph acceleration for this Engine instance.
|
||||
for (T* p : m_pinnedOutputBuffers) { if (p) cudaFreeHost(p); }
|
||||
m_pinnedOutputBuffers.clear();
|
||||
m_graphExecs.erase(batchSize);
|
||||
}
|
||||
}
|
||||
|
||||
if (graphExec) {
|
||||
// Launch the pre-captured graph (single API call replaces many).
|
||||
cudaGraphLaunch(graphExec, m_inferenceStream);
|
||||
cudaStreamSynchronize(m_inferenceStream);
|
||||
|
||||
// CPU memcpy: pinned buffers -> featureVectors (interleaved by batch).
|
||||
for (int batch = 0; batch < batchSize; ++batch) {
|
||||
for (size_t outputIdx = 0; outputIdx < numOutputs; ++outputIdx) {
|
||||
std::memcpy(
|
||||
featureVectors[batch][outputIdx].data(),
|
||||
m_pinnedOutputBuffers[outputIdx]
|
||||
+ static_cast<size_t>(batch) * m_outputLengths[outputIdx],
|
||||
m_outputLengths[outputIdx] * sizeof(T));
|
||||
}
|
||||
}
|
||||
graphUsed = true;
|
||||
}
|
||||
}
|
||||
|
||||
// Direct path (no graph)
|
||||
// ----------------------
|
||||
// Used when pinned buffers are unavailable or graph capture failed.
|
||||
if (!graphUsed) {
|
||||
bool success = TRT_ENQUEUE(m_context.get(), m_inferenceStream, m_buffers);
|
||||
if (!success) {
|
||||
std::string debugInfo = "[Engine] runInference FAIL: enqueue returned false, batch="
|
||||
+ std::to_string(batchSize)
|
||||
+ ", dimsSpecified=" + (m_context->allInputDimensionsSpecified() ? "YES" : "NO");
|
||||
for (size_t i = 0; i < m_IOTensorNames.size(); ++i) {
|
||||
auto shape = m_context->getTensorShape(m_IOTensorNames[i].c_str());
|
||||
debugInfo += ", tensor'" + m_IOTensorNames[i] + "'=[";
|
||||
for (int j = 0; j < shape.nbDims; ++j) {
|
||||
if (j > 0) debugInfo += ",";
|
||||
debugInfo += std::to_string(shape.d[j]);
|
||||
}
|
||||
debugInfo += "]";
|
||||
}
|
||||
std::cout << debugInfo << std::endl;
|
||||
logEngineEvent(debugInfo, true);
|
||||
return false;
|
||||
}
|
||||
|
||||
for (int batch = 0; batch < batchSize; ++batch) {
|
||||
for (size_t outputIdx = 0; outputIdx < numOutputs; ++outputIdx) {
|
||||
const size_t outputBinding = numInputs + outputIdx;
|
||||
const size_t offset =
|
||||
static_cast<size_t>(batch) * m_outputLengths[outputIdx] * sizeof(T);
|
||||
|
||||
cudaError_t copyErr = cudaMemcpyAsync(
|
||||
featureVectors[batch][outputIdx].data(),
|
||||
static_cast<char*>(m_buffers[outputBinding]) + offset,
|
||||
m_outputLengths[outputIdx] * sizeof(T),
|
||||
cudaMemcpyDeviceToHost,
|
||||
m_inferenceStream);
|
||||
|
||||
if (copyErr != cudaSuccess) {
|
||||
std::string errMsg = "[Engine] runInference FAIL: cudaMemcpyAsync output "
|
||||
+ std::to_string(outputIdx) + " batch " + std::to_string(batch)
|
||||
+ ": " + cudaGetErrorString(copyErr);
|
||||
std::cout << errMsg << std::endl;
|
||||
logEngineEvent(errMsg, true);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
cudaError_t syncErr = cudaStreamSynchronize(m_inferenceStream);
|
||||
if (syncErr != cudaSuccess) {
|
||||
std::string errMsg = "[Engine] runInference FAIL: cudaStreamSynchronize: "
|
||||
+ std::string(cudaGetErrorString(syncErr));
|
||||
std::cout << errMsg << std::endl;
|
||||
logEngineEvent(errMsg, true);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
Reference in New Issue
Block a user