// ============================================================================
// EngineRunInference.inl
// ANSCORE/engines/TensorRTAPI/include/engine/EngineRunInference.inl
// Inference-path implementation for Engine<T>: warmUp() and runInference().
// ============================================================================
#pragma once
#include <chrono>
#include <cstdint>
#include <cstring>
#include <filesystem>
#include <iomanip>
#include <iostream>
#include <mutex>
#include <string>
#include <vector>
#include "TRTCompat.h"
// Process-wide mutex for CUDA graph capture.
// TRT's enqueueV3 uses shared internal resources (workspace, memory pools)
// at the CUDA context level. When two Engine instances on the same GPU
// capture graphs concurrently, these cross-stream dependencies violate
// graph capture rules ("operation not permitted when stream is capturing").
// This mutex serialises graph captures across all Engine<T> instances —
// subsequent cudaGraphLaunch calls are still concurrent.
//
// Declared `inline` (not `static`): this .inl is a header included via
// #pragma once. A `static` function gives every translation unit its OWN
// mutex, so two Engine instances compiled into different TUs would not be
// serialised at all. An inline function has exactly one definition across
// the program, so all TUs share the single mutex.
inline std::mutex& graphCaptureMutex() {
    static std::mutex m;  // Meyers singleton: thread-safe init since C++11
    return m;
}
template <typename T>
void Engine<T>::warmUp(int iterations) {
    // Runs dummy inference passes so that kernel compilation and CUDA graph
    // capture happen here rather than on the first real inference call.
    // `iterations` controls how many passes optBatchSize receives; every other
    // batch size gets two (one to compile kernels, one to capture the graph).
    if (m_verbose) {
        std::cout << "\n========================================" << std::endl;
        std::cout << "Engine Warmup" << std::endl;
        std::cout << "========================================" << std::endl;
    }

    // Work out which batch sizes need warming.
    std::vector<int> sizesToWarm;
    if (m_options.maxBatchSize > 1) {
        if (m_verbose) {
            std::cout << "Dynamic batch engine detected (max batch: " << m_options.maxBatchSize << ")" << std::endl;
            std::cout << "Warming up common batch sizes to pre-compile kernels..." << std::endl;
        }
        // The first use of an unseen batch size pays a 100-300ms kernel
        // compilation penalty; warming the full range 1..maxBatchSize removes
        // that latency from inference time and pre-captures every CUDA graph.
        for (int b = 1; b <= m_options.maxBatchSize; ++b) {
            sizesToWarm.push_back(b);
        }
    } else {
        if (m_verbose) std::cout << "Fixed batch engine detected (batch size: " << m_options.maxBatchSize << ")" << std::endl;
        sizesToWarm.push_back(m_options.maxBatchSize);
    }

    if (m_verbose) {
        std::cout << "Batch sizes to warm up: ";
        for (size_t idx = 0; idx < sizesToWarm.size(); ++idx) {
            if (idx > 0) std::cout << ", ";
            std::cout << sizesToWarm[idx];
        }
        std::cout << std::endl;
    }

    // Warm each batch size in turn. Pass one compiles kernels, pass two
    // captures the CUDA graph; only optBatchSize runs extra passes so a
    // steady-state latency figure can be reported as a diagnostic.
    for (int curBatch : sizesToWarm) {
        const int passes = (curBatch == m_options.optBatchSize) ? iterations : 2;
        if (m_verbose) std::cout << "\nWarming up batch=" << curBatch << " (x" << passes << " iterations)..." << std::endl;

        // Synthesize mid-grey (0.5) float input images on the GPU, one batch
        // of GpuMats per input tensor.
        std::vector<std::vector<cv::cuda::GpuMat>> fakeInputs;
        for (size_t t = 0; t < m_inputDims.size(); ++t) {
            const auto& dims = m_inputDims[t];
            // Dynamic spatial dims (-1) fall back to the opt dimensions.
            const int h = (dims.d[1] > 0) ? dims.d[1] : m_options.optInputHeight;
            const int w = (dims.d[2] > 0) ? dims.d[2] : m_options.optInputWidth;
            // Build the host-side image once; each upload still produces a
            // distinct GPU allocation, matching real per-image inputs.
            cv::Mat hostImg(h, w, CV_32FC(dims.d[0]), cv::Scalar(0.5f, 0.5f, 0.5f));
            std::vector<cv::cuda::GpuMat> perTensorBatch;
            perTensorBatch.reserve(curBatch);
            for (int b = 0; b < curBatch; ++b) {
                cv::cuda::GpuMat devImg;
                devImg.upload(hostImg);
                perTensorBatch.push_back(devImg);
            }
            fakeInputs.push_back(perTensorBatch);
        }

        std::vector<std::vector<std::vector<T>>> discardedOutputs;

        // Time the first pass separately — kernel compilation lands here.
        const auto t0 = std::chrono::high_resolution_clock::now();
        const bool firstOk = runInference(fakeInputs, discardedOutputs);
        const auto t1 = std::chrono::high_resolution_clock::now();
        const auto firstMs = std::chrono::duration_cast<std::chrono::milliseconds>(t1 - t0).count();

        if (!firstOk) {
            if (m_verbose) std::cout << " ✗ ERROR: First iteration failed for batch=" << curBatch << std::endl;
            continue;
        }
        if (m_verbose) {
            std::cout << " First iteration: " << firstMs << " ms";
            if (firstMs > 100) {
                std::cout << " (kernel compilation detected)";
            }
            std::cout << std::endl;
        }

        // Remaining passes measure steady-state performance.
        if (passes > 1) {
            const auto restStart = std::chrono::high_resolution_clock::now();
            for (int pass = 1; pass < passes; ++pass) {
                if (!runInference(fakeInputs, discardedOutputs)) {
                    if (m_verbose) std::cout << " ✗ ERROR: Iteration " << pass << " failed" << std::endl;
                    break;
                }
            }
            const auto restEnd = std::chrono::high_resolution_clock::now();
            const auto restMs = std::chrono::duration_cast<std::chrono::milliseconds>(restEnd - restStart).count();
            const float meanMs = restMs / static_cast<float>(passes - 1);
            if (m_verbose) {
                std::cout << " Subsequent iterations (avg): " << std::fixed << std::setprecision(1)
                          << meanMs << " ms" << std::endl;
                if (firstMs > 100 && meanMs < firstMs * 0.5f) {
                    const float speedup = firstMs / meanMs;
                    std::cout << " ✓ Speedup after warmup: " << std::fixed << std::setprecision(1)
                              << speedup << "x faster" << std::endl;
                }
            }
        }
        if (m_verbose) std::cout << " ✓ Batch=" << curBatch << " warmed up successfully" << std::endl;
    }

    if (m_verbose) {
        std::cout << "\n========================================" << std::endl;
        std::cout << "Warmup Complete!" << std::endl;
        std::cout << "========================================" << std::endl;
        std::cout << "Kernels pre-compiled for all batch sizes." << std::endl;
        std::cout << "========================================\n" << std::endl;
    }
}
/// Runs a single synchronous inference pass.
///
/// @param inputs         One vector per input tensor; each inner vector holds
///                       one cv::cuda::GpuMat per batch element (assumed to
///                       already reside on this engine's GPU — TODO confirm
///                       callers guarantee this).
/// @param featureVectors Filled as [batch][outputIdx][element]; previous
///                       contents are discarded.
/// @return true on success; false on any validation, CUDA, or TRT failure
///         (a message is printed and, for runtime failures, logged).
///
/// Fixes vs previous revision:
///  • The graph cache was accessed through `auto& graphExec = m_graphExecs[batchSize]`
///    and that reference was read AFTER `m_graphExecs.erase(batchSize)` on the
///    capture-failure path — a dangling reference into a destroyed map node
///    (undefined behaviour). The exec handle is now held in a local variable
///    and inserted into the map only on successful instantiation.
///  • cudaGraphLaunch / cudaStreamSynchronize return codes on the graph path
///    were ignored; they are now checked and logged like the direct path.
template <typename T>
bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& inputs, std::vector<std::vector<std::vector<T>>>& featureVectors) {
    // ============================================================================
    // MULTI-GPU POOL DISPATCH
    // ============================================================================
    // If this Engine was initialised with initializePool() / initializePoolFromEngine()
    // the m_slots vector is non-empty. In that case, delegate to the pool
    // dispatcher which acquires the first idle slot and runs inference there.
    // This branch is NEVER taken for single-GPU use (buildLoadNetwork / loadNetwork).
    if (!m_slots.empty()) {
        return runInferenceFromPool(inputs, featureVectors);
    }
    // ============================================================================
    // SINGLE-ENGINE SERIALISATION
    // ============================================================================
    // The single Engine instance has shared mutable state (m_buffers, m_lastBatchSize,
    // m_inferenceStream, TRT execution context). If two LabVIEW threads call
    // runInference concurrently with different batch sizes, one will overwrite
    // the input shapes and buffers while the other is mid-inference, causing a
    // fatal "illegal memory access" that permanently corrupts the CUDA context.
    //
    // Pool-mode slots have their own busy-flag dispatch so they do NOT need this.
    std::lock_guard<std::mutex> inferenceLock(m_inferenceMutex);
    // ============================================================================
    // THREAD-SAFE GPU CONTEXT
    // ============================================================================
    // Ensure the calling thread's CUDA device matches this engine's GPU.
    // This is essential for multi-GPU round-robin: LabVIEW reuses threads
    // across tasks, so a thread that last ran inference on GPU 1 might now
    // be running a task on GPU 0. Without this, cv::cuda::GpuMat allocations
    // and kernel launches would target the wrong GPU, causing result corruption.
    // Skip cudaSetDevice if already on the correct device — under WDDM
    // with multiple GPUs each call costs 1-5ms of scheduler overhead.
    {
        int currentDev = -1;
        cudaGetDevice(&currentDev);
        if (currentDev != m_options.deviceIndex) {
            cudaSetDevice(m_options.deviceIndex);
        }
    }
    // ============================================================================
    // DEBUG: First call diagnostics (per-instance, not process-wide)
    // ============================================================================
    // NOTE(review): this block runs BEFORE the shape-setting section below, so
    // allInputDimensionsSpecified() here reflects whatever shapes loadNetwork
    // left in the context — presumably the opt profile; verify against loadNetwork.
    if (m_verbose && m_firstInferenceCall) {
        std::cout << "\n=== First runInference Call ===" << std::endl;
        std::cout << "Number of input tensors: " << inputs.size() << std::endl;
        for (size_t i = 0; i < inputs.size(); ++i) {
            std::cout << "Input " << i << " batch size: " << inputs[i].size() << std::endl;
            if (!inputs[i].empty()) {
                const auto& img = inputs[i][0];
                std::cout << " Image shape: " << img.cols << "x" << img.rows
                          << "x" << img.channels() << " (type: " << img.type() << ")" << std::endl;
            }
        }
        // Print optimization profile information
        std::cout << "\n=== Engine Profile Information ===" << std::endl;
        std::cout << "Number of optimization profiles: "
                  << m_engine->getNbOptimizationProfiles() << std::endl;
        if (m_engine->getNbOptimizationProfiles() > 0) {
            for (int profile = 0; profile < m_engine->getNbOptimizationProfiles(); ++profile) {
                std::cout << "\n--- Profile " << profile << " ---" << std::endl;
                for (size_t i = 0; i < m_IOTensorNames.size(); ++i) {
                    const char* tensorName = m_IOTensorNames[i].c_str();
                    // Check if this is an input tensor
                    auto ioMode = m_engine->getTensorIOMode(tensorName);
                    if (ioMode != nvinfer1::TensorIOMode::kINPUT) {
                        continue;
                    }
                    auto minDims = m_engine->getProfileShape(tensorName, profile,
                                                            nvinfer1::OptProfileSelector::kMIN);
                    auto optDims = m_engine->getProfileShape(tensorName, profile,
                                                            nvinfer1::OptProfileSelector::kOPT);
                    auto maxDims = m_engine->getProfileShape(tensorName, profile,
                                                            nvinfer1::OptProfileSelector::kMAX);
                    std::cout << "Tensor '" << tensorName << "' (INPUT):" << std::endl;
                    std::cout << " Min: [" << minDims.d[0];
                    for (int j = 1; j < minDims.nbDims; ++j) std::cout << "," << minDims.d[j];
                    std::cout << "]" << std::endl;
                    std::cout << " Opt: [" << optDims.d[0];
                    for (int j = 1; j < optDims.nbDims; ++j) std::cout << "," << optDims.d[j];
                    std::cout << "]" << std::endl;
                    std::cout << " Max: [" << maxDims.d[0];
                    for (int j = 1; j < maxDims.nbDims; ++j) std::cout << "," << maxDims.d[j];
                    std::cout << "]" << std::endl;
                }
            }
        }
        if (!m_context->allInputDimensionsSpecified()) {
            std::cout << "ERROR: Input dimensions not specified in context!" << std::endl;
            return false;
        }
        std::cout << "\nContext state: All dimensions specified ✓" << std::endl;
        m_firstInferenceCall = false;
    }
    // ============================================================================
    // INPUT VALIDATION
    // ============================================================================
    if (inputs.empty() || inputs[0].empty()) {
        std::cout << "Error: Empty input" << std::endl;
        return false;
    }
    const auto numInputs = m_inputDims.size();
    if (inputs.size() != numInputs) {
        std::cout << "Error: Wrong number of inputs. Expected: " << numInputs
                  << ", Got: " << inputs.size() << std::endl;
        return false;
    }
    const auto batchSize = static_cast<int32_t>(inputs[0].size());
    if (batchSize > m_options.maxBatchSize) {
        std::cout << "Error: Batch size " << batchSize << " exceeds maximum "
                  << m_options.maxBatchSize << std::endl;
        return false;
    }
    if (batchSize < 1) {
        std::cout << "Error: Batch size must be at least 1" << std::endl;
        return false;
    }
    // Validate batch size consistency across all inputs
    for (size_t i = 1; i < inputs.size(); ++i) {
        if (inputs[i].size() != static_cast<size_t>(batchSize)) {
            std::cout << "Error: Inconsistent batch sizes across inputs. Input 0: "
                      << batchSize << ", Input " << i << ": " << inputs[i].size() << std::endl;
            return false;
        }
    }
    // ============================================================================
    // STREAM GUARD
    // ============================================================================
    // m_inferenceStream is now created eagerly in loadNetwork() so it is always
    // valid here. Guard against the (unlikely) edge case where runInference is
    // called before loadNetwork succeeds.
    if (!m_streamInitialized || !m_inferenceStream) {
        std::string errMsg = "Error: Inference stream not initialised. "
                             "Call loadNetwork() / buildLoadNetwork() before runInference().";
        std::cout << errMsg << std::endl;
        logEngineEvent("[Engine] runInference: " + errMsg, true);
        return false;
    }
    // ============================================================================
    // SET INPUT SHAPES (batch size changed OR dynamic spatial dims need updating)
    // ============================================================================
    // Fast path: compute desired dims first, then compare against cached dims.
    // This avoids all TRT API calls when the shape hasn't actually changed —
    // critical for the recognizer which is called ~50-100x per image with
    // dynamic width but often the same or similar widths.
    // ============================================================================
    {
        // Lazily initialise the dims cache on first call
        if (m_lastSetInputDims.empty()) {
            m_lastSetInputDims.resize(numInputs);
            for (size_t i = 0; i < numInputs; ++i) {
                m_lastSetInputDims[i].nbDims = 0; // force mismatch on first call
            }
        }
        // Build desired dims for every input tensor (cheap — no TRT API calls)
        bool anyDimChanged = (m_lastBatchSize != batchSize);
        std::vector<nvinfer1::Dims> desiredDims(numInputs);
        for (size_t i = 0; i < numInputs; ++i) {
            nvinfer1::Dims& nd = desiredDims[i];
            nd.nbDims = 4;
            nd.d[0] = batchSize;
            nd.d[1] = m_inputDims[i].d[0]; // channels
            if (m_hasDynamicSpatialDims && !inputs[i].empty()) {
                // Dynamic H/W: take the actual size of the first image in the batch
                const auto& firstImg = inputs[i][0];
                nd.d[2] = (m_inputDims[i].d[1] == -1) ? firstImg.rows : m_inputDims[i].d[1];
                nd.d[3] = (m_inputDims[i].d[2] == -1) ? firstImg.cols : m_inputDims[i].d[2];
            } else {
                nd.d[2] = m_inputDims[i].d[1];
                nd.d[3] = m_inputDims[i].d[2];
            }
            // Compare with cached
            if (!anyDimChanged) {
                const auto& cached = m_lastSetInputDims[i];
                if (cached.nbDims != nd.nbDims ||
                    cached.d[0] != nd.d[0] || cached.d[1] != nd.d[1] ||
                    cached.d[2] != nd.d[2] || cached.d[3] != nd.d[3]) {
                    anyDimChanged = true;
                }
            }
        }
        if (anyDimChanged) {
            // === First-time diagnostics (verbose, once) ===
            const bool firstTime = !m_batchShapeChangeLogged;
            if (m_verbose && firstTime) {
                std::cout << "\nInfo: Batch size change: " << m_lastBatchSize
                          << " -> " << batchSize << std::endl;
            }
            // Set optimization profile (only when truly needed)
            if (m_engine->getNbOptimizationProfiles() > 0) {
                int currentProfile = m_context->getOptimizationProfile();
                if (currentProfile != 0 || m_lastBatchSize < 0) {
                    if (m_verbose && firstTime) {
                        std::cout << " Setting optimization profile to 0..." << std::endl;
                    }
                    if (!m_context->setOptimizationProfileAsync(0, m_inferenceStream)) {
                        std::cout << "Error: Failed to set optimization profile 0" << std::endl;
                        return false;
                    }
                    cudaError_t syncErr = cudaStreamSynchronize(m_inferenceStream);
                    if (syncErr != cudaSuccess) {
                        std::cout << "Error: Failed to sync after profile change: "
                                  << cudaGetErrorString(syncErr) << std::endl;
                        return false;
                    }
                    if (m_verbose && firstTime) {
                        std::cout << " Optimization profile set successfully" << std::endl;
                    }
                }
            }
            // Update shapes for input tensors that actually changed
            for (size_t i = 0; i < numInputs; ++i) {
                const char* tensorName = m_IOTensorNames[i].c_str();
                // Skip non-input tensors
                auto ioMode = m_engine->getTensorIOMode(tensorName);
                if (ioMode != nvinfer1::TensorIOMode::kINPUT) continue;
                const nvinfer1::Dims& newDims = desiredDims[i];
                const nvinfer1::Dims& cached = m_lastSetInputDims[i];
                // Skip this tensor if its dims haven't changed
                if (cached.nbDims == newDims.nbDims &&
                    cached.d[0] == newDims.d[0] && cached.d[1] == newDims.d[1] &&
                    cached.d[2] == newDims.d[2] && cached.d[3] == newDims.d[3]) {
                    continue;
                }
                // First-time verbose diagnostics
                if (m_verbose && firstTime) {
                    std::cout << "\n Processing tensor " << i << ": '" << tensorName << "'" << std::endl;
                    // Validate batch size range (first time only)
                    if (m_engine->getNbOptimizationProfiles() > 0) {
                        int profileIndex = m_context->getOptimizationProfile();
                        nvinfer1::Dims minDims = m_engine->getProfileShape(
                            tensorName, profileIndex, nvinfer1::OptProfileSelector::kMIN);
                        nvinfer1::Dims maxDims = m_engine->getProfileShape(
                            tensorName, profileIndex, nvinfer1::OptProfileSelector::kMAX);
                        std::cout << " Profile batch range: [" << minDims.d[0]
                                  << " to " << maxDims.d[0] << "]" << std::endl;
                        if (batchSize < minDims.d[0] || batchSize > maxDims.d[0]) {
                            std::cout << "Error: Batch size " << batchSize
                                      << " outside profile range" << std::endl;
                            return false;
                        }
                    }
                    auto currentShape = m_context->getTensorShape(tensorName);
                    std::cout << " Current context shape: [";
                    for (int j = 0; j < currentShape.nbDims; ++j) {
                        if (j > 0) std::cout << ", ";
                        std::cout << currentShape.d[j];
                    }
                    std::cout << "]" << std::endl;
                    std::cout << " Setting new shape: [" << newDims.d[0] << ", "
                              << newDims.d[1] << ", " << newDims.d[2] << ", "
                              << newDims.d[3] << "]" << std::endl;
                }
                if (!m_context->setInputShape(tensorName, newDims)) {
                    std::cout << "Error: Failed to set input shape for '" << tensorName << "'" << std::endl;
                    return false;
                }
                // Verify shape (first time only — trust the API on hot path)
                if (firstTime) {
                    auto verifyShape = m_context->getTensorShape(tensorName);
                    if (verifyShape.d[0] != batchSize) {
                        std::cout << "Error: Shape change didn't take effect. Expected batch "
                                  << batchSize << ", got " << verifyShape.d[0] << std::endl;
                        return false;
                    }
                    if (m_verbose) {
                        std::cout << " Shape updated successfully" << std::endl;
                    }
                }
                m_lastSetInputDims[i] = newDims;
            }
            // Verify all input dimensions specified (first time only)
            if (firstTime) {
                if (!m_context->allInputDimensionsSpecified()) {
                    std::cout << "Error: Not all input dimensions specified after shape change" << std::endl;
                    for (size_t i = 0; i < m_IOTensorNames.size(); ++i) {
                        auto shape = m_context->getTensorShape(m_IOTensorNames[i].c_str());
                        std::cout << " " << m_IOTensorNames[i] << ": [";
                        for (int j = 0; j < shape.nbDims; ++j) {
                            if (j > 0) std::cout << ", ";
                            std::cout << shape.d[j];
                        }
                        std::cout << "]" << std::endl;
                    }
                    return false;
                }
            }
            m_lastBatchSize = batchSize;
            m_batchShapeChangeLogged = true;
            if (m_verbose && firstTime) {
                std::cout << "\nInfo: Input shapes updated successfully for batch size "
                          << batchSize << "\n" << std::endl;
            }
        }
    }
    // ============================================================================
    // PREPROCESS AND COPY INPUTS TO GPU BUFFERS
    // ============================================================================
    // Pass 1: Validate all input dimensions before any GPU work.
    // Dynamic dims (-1) are skipped in validation (they accept any size).
    for (size_t i = 0; i < numInputs; ++i) {
        const auto& batchInput = inputs[i];
        const auto& dims = m_inputDims[i];
        if (!batchInput.empty()) {
            const auto& firstImg = batchInput[0];
            bool mismatch = false;
            if (dims.d[0] > 0 && firstImg.channels() != dims.d[0]) mismatch = true;
            if (dims.d[1] > 0 && firstImg.rows != dims.d[1]) mismatch = true;
            if (dims.d[2] > 0 && firstImg.cols != dims.d[2]) mismatch = true;
            if (mismatch) {
                std::cout << "Error: Input " << i << " dimension mismatch!" << std::endl;
                std::cout << " Expected: " << dims.d[2] << "x" << dims.d[1]
                          << "x" << dims.d[0] << " (WxHxC, -1=dynamic)" << std::endl;
                std::cout << " Got: " << firstImg.cols << "x" << firstImg.rows
                          << "x" << firstImg.channels() << " (WxHxC)" << std::endl;
                return false;
            }
        }
    }
    // Pass 2: Preprocess + D2D copies — all on m_inferenceStream (no null stream).
    //
    // All OpenCV CUDA ops (convertTo, subtract, divide, split) in blobFromGpuMats
    // now run on m_inferenceStream via the cv::cuda::Stream wrapper. This means:
    // • No null-stream interaction — eliminates global sync barriers on WDDM
    // • No event bridge needed — same-stream ordering guarantees correctness
    // • CUDA graphs are safe — cv::cuda::split runs BEFORE graph capture
    //
    // GpuMat-lifetime: preprocessedBuffers keeps GpuMats alive past the final
    // cudaStreamSynchronize, so cudaFree() doesn't stall the pipeline.
    cv::cuda::Stream cvInferStream = cv::cuda::StreamAccessor::wrapStream(m_inferenceStream);
    std::vector<cv::cuda::GpuMat> preprocessedBuffers;
    preprocessedBuffers.reserve(numInputs);
    for (size_t i = 0; i < numInputs; ++i) {
        const auto& batchInput = inputs[i];
        // Preprocess on m_inferenceStream (not the null stream).
        preprocessedBuffers.push_back(
            blobFromGpuMats(batchInput, m_subVals, m_divVals, m_normalize, false, cvInferStream));
        // D2D copy: same stream as preprocessing, so ordering is guaranteed.
        const auto& blobMat = preprocessedBuffers.back();
        const size_t copySize = static_cast<size_t>(blobMat.rows) * static_cast<size_t>(blobMat.cols) * blobMat.elemSize();
        cudaError_t copyErr = cudaMemcpyAsync(
            m_buffers[i],
            preprocessedBuffers.back().ptr<void>(),
            copySize,
            cudaMemcpyDeviceToDevice,
            m_inferenceStream);
        if (copyErr != cudaSuccess) {
            std::cout << "Error: Failed to copy input " << i
                      << " to inference buffer: " << cudaGetErrorString(copyErr) << std::endl;
            return false;
        }
    }
    // ============================================================================
    // PRE-ALLOCATE OUTPUT STRUCTURE
    // ============================================================================
    const size_t numOutputs = m_outputLengths.size();
    featureVectors.clear();
    featureVectors.resize(batchSize);
    for (int batch = 0; batch < batchSize; ++batch) {
        featureVectors[batch].resize(numOutputs);
        for (size_t outputIdx = 0; outputIdx < numOutputs; ++outputIdx)
            featureVectors[batch][outputIdx].resize(m_outputLengths[outputIdx]);
    }
    // ============================================================================
    // RUN INFERENCE + COPY OUTPUTS (CUDA Graph path or direct path)
    // ============================================================================
    // CUDA Graph path
    // ---------------
    // On the first call for a given batchSize we capture enqueueV3 + D2H copies
    // into a reusable graph. Subsequent calls use cudaGraphLaunch, replacing
    // many individual kernel-submission API calls with a single launch.
    //
    // Prerequisites satisfied here:
    // • Preprocessing + D2D copies are queued on m_inferenceStream (same-stream
    //   ordering guarantees they complete before captured kernels execute)
    // • m_pinnedOutputBuffers has stable addresses (allocated in loadNetwork)
    // • m_buffers (GPU outputs) have stable addresses (allocated in loadNetwork)
    //
    // Falls back to the direct path if pinned buffers are unavailable or if
    // graph capture/instantiation fails for any reason.
    // CUDA graphs capture fixed kernel sequences; incompatible with dynamic spatial dims
    // (input H/W change per inference call → different TRT kernel plans each time).
    // Disabled for pool slots — concurrent graph captures on the same GPU corrupt the
    // CUDA context ("operation not permitted when stream is capturing").
    const bool canGraph = !m_disableGraphs && !m_pinnedOutputBuffers.empty() && !m_hasDynamicSpatialDims;
    bool graphUsed = false;
    if (canGraph) {
        // Hold the exec handle in a LOCAL variable, never a reference into the
        // map. (A previous revision kept `auto& graphExec = m_graphExecs[batchSize]`
        // alive across m_graphExecs.erase(batchSize) and then read it — a
        // dangling reference / undefined behaviour.)
        cudaGraphExec_t graphExec = nullptr;
        auto cachedIt = m_graphExecs.find(batchSize);
        if (cachedIt != m_graphExecs.end()) {
            graphExec = cachedIt->second;
        }
        if (!graphExec) {
            // First call for this batchSize -- capture a new graph.
            // Serialise captures across all Engine instances on this device to
            // prevent TRT's shared workspace from creating cross-stream
            // dependencies that violate CUDA graph capture rules.
            std::lock_guard<std::mutex> captureLock(graphCaptureMutex());
            // Clear any sticky CUDA error from a prior failed capture so that
            // this attempt starts clean.
            cudaGetLastError();
            cudaGraph_t graph = nullptr;
            bool captureOk = false;
            if (cudaStreamBeginCapture(m_inferenceStream,
                                       cudaStreamCaptureModeRelaxed) == cudaSuccess) {
                // Record TRT kernels into the graph (not executed yet).
                TRT_ENQUEUE(m_context.get(), m_inferenceStream, m_buffers);
                // Record D2H copies to stable pinned addresses.
                for (size_t outputIdx = 0; outputIdx < numOutputs; ++outputIdx) {
                    cudaMemcpyAsync(
                        m_pinnedOutputBuffers[outputIdx],
                        static_cast<char*>(m_buffers[numInputs + outputIdx]),
                        static_cast<size_t>(batchSize) * m_outputLengths[outputIdx] * sizeof(T),
                        cudaMemcpyDeviceToHost,
                        m_inferenceStream);
                }
                captureOk = (cudaStreamEndCapture(m_inferenceStream, &graph) == cudaSuccess
                             && graph != nullptr);
            }
            if (captureOk) {
                cudaGraphExec_t exec = nullptr;
                if (cudaGraphInstantiate(&exec, graph, nullptr, nullptr, 0) == cudaSuccess) {
                    graphExec = exec;
                    // Insert into the cache ONLY on success — no speculative
                    // operator[] insertion, so no erase (and no dangling
                    // reference) on the failure path below.
                    m_graphExecs[batchSize] = exec;
                }
                cudaGraphDestroy(graph);
            }
            if (!graphExec) {
                std::cout << "Warning: CUDA graph capture failed for batchSize="
                          << batchSize << " -- falling back to direct inference path." << std::endl;
                // Disable graph acceleration for this Engine instance.
                for (T* p : m_pinnedOutputBuffers) { if (p) cudaFreeHost(p); }
                m_pinnedOutputBuffers.clear();
            }
        }
        if (graphExec) {
            // Launch the pre-captured graph (single API call replaces many).
            // Both the launch and the synchronize are checked — a silent
            // failure here would return stale pinned-buffer contents.
            cudaError_t launchErr = cudaGraphLaunch(graphExec, m_inferenceStream);
            if (launchErr != cudaSuccess) {
                std::string errMsg = "[Engine] runInference FAIL: cudaGraphLaunch: "
                    + std::string(cudaGetErrorString(launchErr));
                std::cout << errMsg << std::endl;
                logEngineEvent(errMsg, true);
                return false;
            }
            cudaError_t graphSyncErr = cudaStreamSynchronize(m_inferenceStream);
            if (graphSyncErr != cudaSuccess) {
                std::string errMsg = "[Engine] runInference FAIL: cudaStreamSynchronize (graph path): "
                    + std::string(cudaGetErrorString(graphSyncErr));
                std::cout << errMsg << std::endl;
                logEngineEvent(errMsg, true);
                return false;
            }
            // CPU memcpy: pinned buffers -> featureVectors (interleaved by batch).
            for (int batch = 0; batch < batchSize; ++batch) {
                for (size_t outputIdx = 0; outputIdx < numOutputs; ++outputIdx) {
                    std::memcpy(
                        featureVectors[batch][outputIdx].data(),
                        m_pinnedOutputBuffers[outputIdx]
                            + static_cast<size_t>(batch) * m_outputLengths[outputIdx],
                        m_outputLengths[outputIdx] * sizeof(T));
                }
            }
            graphUsed = true;
        }
    }
    // Direct path (no graph)
    // ----------------------
    // Used when pinned buffers are unavailable or graph capture failed.
    if (!graphUsed) {
        bool success = TRT_ENQUEUE(m_context.get(), m_inferenceStream, m_buffers);
        if (!success) {
            std::string debugInfo = "[Engine] runInference FAIL: enqueue returned false, batch="
                + std::to_string(batchSize)
                + ", dimsSpecified=" + (m_context->allInputDimensionsSpecified() ? "YES" : "NO");
            for (size_t i = 0; i < m_IOTensorNames.size(); ++i) {
                auto shape = m_context->getTensorShape(m_IOTensorNames[i].c_str());
                debugInfo += ", tensor'" + m_IOTensorNames[i] + "'=[";
                for (int j = 0; j < shape.nbDims; ++j) {
                    if (j > 0) debugInfo += ",";
                    debugInfo += std::to_string(shape.d[j]);
                }
                debugInfo += "]";
            }
            std::cout << debugInfo << std::endl;
            logEngineEvent(debugInfo, true);
            return false;
        }
        for (int batch = 0; batch < batchSize; ++batch) {
            for (size_t outputIdx = 0; outputIdx < numOutputs; ++outputIdx) {
                const size_t outputBinding = numInputs + outputIdx;
                const size_t offset =
                    static_cast<size_t>(batch) * m_outputLengths[outputIdx] * sizeof(T);
                cudaError_t copyErr = cudaMemcpyAsync(
                    featureVectors[batch][outputIdx].data(),
                    static_cast<char*>(m_buffers[outputBinding]) + offset,
                    m_outputLengths[outputIdx] * sizeof(T),
                    cudaMemcpyDeviceToHost,
                    m_inferenceStream);
                if (copyErr != cudaSuccess) {
                    std::string errMsg = "[Engine] runInference FAIL: cudaMemcpyAsync output "
                        + std::to_string(outputIdx) + " batch " + std::to_string(batch)
                        + ": " + cudaGetErrorString(copyErr);
                    std::cout << errMsg << std::endl;
                    logEngineEvent(errMsg, true);
                    return false;
                }
            }
        }
        cudaError_t syncErr = cudaStreamSynchronize(m_inferenceStream);
        if (syncErr != cudaSuccess) {
            std::string errMsg = "[Engine] runInference FAIL: cudaStreamSynchronize: "
                + std::string(cudaGetErrorString(syncErr));
            std::cout << errMsg << std::endl;
            logEngineEvent(errMsg, true);
            return false;
        }
    }
    return true;
}