Cleaned up verbose engine telemetry emitted to stdout/stderr and the Windows Event Viewer. Removes logEngineEvent/logEvent calls (and their diagnostic-only locals) across the TensorRT engine load, build, run, multi-GPU, and pool-manager paths, plus the now-unused logEvent helper in EnginePoolManager. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
981 lines
46 KiB
C++
981 lines
46 KiB
C++
#pragma once
|
|
#include <algorithm>
#include <atomic>
#include <chrono>
#include <cstring>
#include <filesystem>
#include <iomanip>
#include <iostream>
#include <map>
#include <mutex>
#include <semaphore>
#include <vector>

#include "TRTCompat.h"

#include "ANSLicense.h" // ANS_DBG macro for DebugView logging
|
|
|
|
// Per-device mutex for CUDA graph capture.
|
|
// TRT's enqueueV3 uses shared internal resources (workspace, memory pools)
|
|
// at the CUDA context level. When two Engine instances on the same GPU
|
|
// capture graphs concurrently, these cross-stream dependencies violate
|
|
// graph capture rules ("operation not permitted when stream is capturing").
|
|
// This mutex serialises graph captures across all Engine<T> instances on
|
|
// the same device — subsequent cudaGraphLaunch calls are still concurrent.
|
|
// Returns the graph-capture serialisation mutex for a CUDA device index.
//
// TRT's enqueueV3 shares internal resources (workspace, memory pools) at the
// CUDA-context level, so two Engine instances capturing CUDA graphs
// concurrently on the same device violate stream-capture rules
// ("operation not permitted when stream is capturing"). Holding this mutex
// for the duration of a capture serialises captures; subsequent
// cudaGraphLaunch calls remain fully concurrent.
//
// deviceIndex: CUDA device whose capture mutex is wanted. The default (-1)
// selects a shared bucket, preserving the historical behaviour where all
// no-argument callers serialise against one process-wide mutex. Callers on
// different GPUs may pass their device index to capture independently, as
// the header comment above always intended.
static std::mutex& graphCaptureMutex(int deviceIndex = -1) {
    // std::map is node-based: element addresses are stable across inserts,
    // which lets the non-movable std::mutex live in the container and lets
    // us hand out long-lived references safely.
    static std::mutex registryLock;
    static std::map<int, std::mutex> registry;
    std::lock_guard<std::mutex> guard(registryLock);
    return registry[deviceIndex]; // default-constructs on first request
}
|
|
|
|
// ============================================================================
|
|
// GPU INFERENCE THROTTLE
|
|
// ============================================================================
|
|
// Global counting semaphore that limits how many Engine instances can execute
|
|
// CUDA inference simultaneously. Without this, N separate Engine instances
|
|
// (one per camera) all submit GPU work at once, causing:
|
|
// 1. SM 100% saturation → each inference takes 5-10x longer
|
|
// 2. GPU thermal throttling at 85°C → further slowdown
|
|
// 3. cudaStreamSynchronize blocking indefinitely → system freeze
|
|
//
|
|
// Auto-computed from GPU VRAM:
|
|
// ≤ 4 GB → 2 concurrent 8 GB → 4 concurrent
|
|
// 6 GB → 3 concurrent 12+ GB → 6 concurrent
|
|
// Multi-GPU: sum across all GPUs
|
|
//
|
|
// Excess threads wait on CPU (nearly zero cost) while the bounded set
|
|
// runs efficiently on the GPU without thermal throttling.
|
|
// Returns the process-wide inference throttle (see header comment above).
// The slot count is computed once, on first use, from the VRAM of every
// visible GPU: ~1 slot per 2 GB, clamped to [2, 6] per GPU and [2, 64] total.
static std::counting_semaphore<64>& gpuInferenceSemaphore() {
    static int maxConcurrent = []() {
        int gpuCount = 0;
        cudaGetDeviceCount(&gpuCount);
        if (gpuCount <= 0) return 4; // fallback: no CUDA device visible

        // FIX: remember which device the calling thread is bound to. The
        // probe loop below calls cudaSetDevice(i) for every GPU and the
        // original code never restored it, silently leaving the first
        // thread to touch the throttle bound to the LAST GPU enumerated —
        // retargeting all of its subsequent CUDA work on multi-GPU systems.
        int callerDevice = -1;
        cudaGetDevice(&callerDevice);

        int totalSlots = 0;
        for (int i = 0; i < gpuCount; ++i) {
            size_t freeMem = 0, totalMem = 0;
            cudaSetDevice(i);
            cudaMemGetInfo(&freeMem, &totalMem);
            int gbTotal = static_cast<int>(totalMem / (1024ULL * 1024ULL * 1024ULL));

            // Scale concurrency with VRAM: ~1 slot per 2 GB, min 2, max 6 per GPU
            totalSlots += std::clamp(gbTotal / 2, 2, 6);
        }

        // Restore the caller's device binding after probing.
        if (callerDevice >= 0) {
            cudaSetDevice(callerDevice);
        }

        totalSlots = std::clamp(totalSlots, 2, 64);
        std::cout << "Info [GPU Throttle]: max concurrent inferences = "
                  << totalSlots << " (across " << gpuCount << " GPU(s))" << std::endl;
        return totalSlots;
    }();
    // Constructed once with the computed budget; callers block in acquire()
    // when all slots are taken (see GpuInferenceGuard below).
    static std::counting_semaphore<64> sem(maxConcurrent);
    return sem;
}
|
|
|
|
// RAII guard for the GPU inference semaphore
|
|
struct GpuInferenceGuard {
|
|
GpuInferenceGuard() { gpuInferenceSemaphore().acquire(); }
|
|
~GpuInferenceGuard() { gpuInferenceSemaphore().release(); }
|
|
GpuInferenceGuard(const GpuInferenceGuard&) = delete;
|
|
GpuInferenceGuard& operator=(const GpuInferenceGuard&) = delete;
|
|
};
|
|
|
|
// ============================================================================
|
|
// WDDM-SAFE STREAM SYNCHRONIZATION
|
|
// ============================================================================
|
|
// Under Windows WDDM, cudaStreamSynchronize calls cuStreamQuery in a tight
|
|
// loop with SwitchToThread, holding nvcuda64's internal SRW lock the entire
|
|
// time. When the GPU is busy with inference, this spin blocks ALL other CUDA
|
|
// operations — including HW video decode (nvcuvid), cuMemAlloc, cuArrayDestroy.
|
|
// If a camera Reconnect or decode buffer allocation needs an exclusive SRW lock
|
|
// while inference is spinning, the entire system deadlocks.
|
|
//
|
|
// This function replaces cudaStreamSynchronize with a polling loop that
|
|
// explicitly releases the SRW lock between queries by sleeping briefly.
|
|
// This allows other CUDA operations to interleave with the sync wait.
|
|
// Drop-in replacement for cudaStreamSynchronize that polls with real sleeps
// instead of spinning (see header comment above for the WDDM SRW-lock
// rationale). Returns whatever cudaStreamQuery reports once the stream is
// no longer busy — cudaSuccess on completion, or the stream's error code.
static inline cudaError_t cudaStreamSynchronize_Safe(cudaStream_t stream) {
    // Immediate probe: quick kernels complete with zero sleep overhead.
    cudaError_t status = cudaStreamQuery(stream);
    if (status != cudaErrorNotReady) {
        return status;
    }

    const auto waitBegin = std::chrono::steady_clock::now();

    // Phase 1 — a handful of Sleep(0) yields catches sub-millisecond
    // completions without surrendering a full scheduler quantum.
    for (int attempt = 0; attempt < 10; ++attempt) {
        Sleep(0);
        status = cudaStreamQuery(stream);
        if (status != cudaErrorNotReady) {
            return status;
        }
    }

    // Phase 2 — 1 ms sleeps. Adds negligible latency at 30 FPS, but fully
    // releases the driver's SRW lock between probes so decode/alloc work
    // on other threads can make progress.
    int napCount = 0;
    for (;;) {
        Sleep(1);
        ++napCount;
        status = cudaStreamQuery(stream);
        if (status == cudaErrorNotReady) {
            continue;
        }

        // Stream finished (or errored). Flag abnormally long waits —
        // anything past 500 ms suggests a GPU stall worth investigating.
        const double waitedMs = std::chrono::duration<double, std::milli>(
            std::chrono::steady_clock::now() - waitBegin).count();
        if (waitedMs > 500.0) {
            ANS_DBG("TRT_Engine", "SLOW SYNC: %.1fms (%d sleeps) stream=%p err=%d",
                    waitedMs, napCount, (void*)stream, (int)status);
        }
        return status;
    }
}
|
|
|
|
// Pre-compiles TensorRT kernels (and, per the comments in this file,
// pre-captures CUDA graphs) by running dummy inferences before real traffic
// arrives, so the first production frame does not pay the 100-300 ms
// first-use kernel-compilation penalty.
//
// iterations: number of warmup runs used for the optBatchSize; every other
// batch size gets exactly 2 runs (first compiles kernels, second is the one
// this code's comments say captures the CUDA graph). All console output is
// gated on m_verbose.
template <typename T>
void Engine<T>::warmUp(int iterations) {
    if (m_verbose) {
        std::cout << "\n========================================" << std::endl;
        std::cout << "Engine Warmup" << std::endl;
        std::cout << "========================================" << std::endl;
    }

    // Determine batch sizes to warm up
    std::vector<int> batchSizes;

    if (m_options.maxBatchSize > 1) {
        if (m_verbose) {
            std::cout << "Dynamic batch engine detected (max batch: " << m_options.maxBatchSize << ")" << std::endl;
            std::cout << "Warming up common batch sizes to pre-compile kernels..." << std::endl;
        }

        // Warm up ALL batch sizes from 1 to maxBatchSize.
        // Each unseen batch size incurs a 100-300ms kernel compilation penalty
        // on first use. Warming all sizes eliminates that latency at inference
        // time and ensures every CUDA graph is pre-captured.
        for (int batch = 1; batch <= m_options.maxBatchSize; ++batch) {
            batchSizes.push_back(batch);
        }
    }
    else {
        // Fixed-batch engine: only one shape exists, so only one warmup size.
        if (m_verbose) std::cout << "Fixed batch engine detected (batch size: " << m_options.maxBatchSize << ")" << std::endl;
        batchSizes.push_back(m_options.maxBatchSize);
    }

    if (m_verbose) {
        std::cout << "Batch sizes to warm up: ";
        for (size_t i = 0; i < batchSizes.size(); ++i) {
            std::cout << batchSizes[i];
            if (i < batchSizes.size() - 1) std::cout << ", ";
        }
        std::cout << std::endl;
    }

    // Warm up each batch size.
    // The first call triggers kernel compilation; the second captures the CUDA
    // graph. Additional iterations only measure steady-state latency for the
    // optBatchSize (printed as a diagnostic).
    for (int batchSize : batchSizes) {
        const int iters = (batchSize == m_options.optBatchSize) ? iterations : 2;
        if (m_verbose) std::cout << "\nWarming up batch=" << batchSize << " (x" << iters << " iterations)..." << std::endl;

        // Create dummy inputs for this batch size.
        // Layout mirrors runInference's contract: one vector of GpuMats per
        // input tensor, batchSize images per vector.
        std::vector<std::vector<cv::cuda::GpuMat>> dummyInputs;

        for (size_t i = 0; i < m_inputDims.size(); ++i) {
            const auto& dims = m_inputDims[i];
            std::vector<cv::cuda::GpuMat> batch;

            // FIXED: Create proper dummy images on GPU
            // For dynamic spatial dims, use opt dimensions for warmup
            // (dims.d[1]/d[2] hold -1 or 0 when the engine's H/W are dynamic;
            // the opt* values come from m_options).
            int warmH = (dims.d[1] > 0) ? dims.d[1] : m_options.optInputHeight;
            int warmW = (dims.d[2] > 0) ? dims.d[2] : m_options.optInputWidth;
            for (int b = 0; b < batchSize; ++b) {
                // Create on CPU first: mid-grey float image, channel count
                // taken from the tensor's first dim (dims.d[0]).
                cv::Mat cpuImg(warmH, warmW, CV_32FC(dims.d[0]), cv::Scalar(0.5f, 0.5f, 0.5f));

                // Upload to GPU
                cv::cuda::GpuMat gpuImg;
                gpuImg.upload(cpuImg);

                batch.push_back(gpuImg);
            }

            dummyInputs.push_back(batch);
        }

        // Outputs are discarded — warmup only cares about side effects
        // (kernel compilation, graph capture).
        std::vector<std::vector<std::vector<T>>> dummyOutputs;

        // Time the first iteration (kernel compilation happens here).
        // NOTE(review): wall-clock via high_resolution_clock; steady_clock
        // would be the conventional choice for interval measurement.
        auto start = std::chrono::high_resolution_clock::now();

        bool firstSuccess = runInference(dummyInputs, dummyOutputs);

        auto end = std::chrono::high_resolution_clock::now();
        auto firstTime = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();

        if (!firstSuccess) {
            // A failed first run skips this batch size entirely and moves on.
            if (m_verbose) std::cout << " ✗ ERROR: First iteration failed for batch=" << batchSize << std::endl;
            continue;
        }

        if (m_verbose) {
            std::cout << " First iteration: " << firstTime << " ms";
            if (firstTime > 100) {
                // >100 ms on a warm GPU almost certainly means kernels were
                // being compiled during this call.
                std::cout << " (kernel compilation detected)";
            }
            std::cout << std::endl;
        }

        // Run remaining iterations to measure stable performance
        if (iters > 1) {
            auto iterStart = std::chrono::high_resolution_clock::now();

            for (int i = 1; i < iters; ++i) {
                bool success = runInference(dummyInputs, dummyOutputs);
                if (!success) {
                    if (m_verbose) std::cout << " ✗ ERROR: Iteration " << i << " failed" << std::endl;
                    break;
                }
            }

            auto iterEnd = std::chrono::high_resolution_clock::now();
            auto totalTime = std::chrono::duration_cast<std::chrono::milliseconds>(iterEnd - iterStart).count();
            // NOTE(review): if the loop above broke early on a failed
            // iteration, this still divides by (iters - 1), under-reporting
            // the true per-iteration average — diagnostic-only, so benign,
            // but worth confirming that is intended.
            float avgTime = totalTime / static_cast<float>(iters - 1);

            if (m_verbose) {
                std::cout << " Subsequent iterations (avg): " << std::fixed << std::setprecision(1)
                    << avgTime << " ms" << std::endl;

                // Report the compile-vs-steady-state speedup when the first
                // run was slow (>100 ms) and later runs at least halved it.
                if (firstTime > 100 && avgTime < firstTime * 0.5f) {
                    float speedup = firstTime / avgTime;
                    std::cout << " ✓ Speedup after warmup: " << std::fixed << std::setprecision(1)
                        << speedup << "x faster" << std::endl;
                }
            }
        }

        if (m_verbose) std::cout << " ✓ Batch=" << batchSize << " warmed up successfully" << std::endl;
    }

    if (m_verbose) {
        std::cout << "\n========================================" << std::endl;
        std::cout << "Warmup Complete!" << std::endl;
        std::cout << "========================================" << std::endl;
        std::cout << "Kernels pre-compiled for all batch sizes." << std::endl;
        std::cout << "========================================\n" << std::endl;
    }
}
|
|
template <typename T>
|
|
bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& inputs,std::vector<std::vector<std::vector<T>>>& featureVectors) {
|
|
|
|
// ============================================================================
|
|
// MULTI-GPU POOL DISPATCH
|
|
// ============================================================================
|
|
// If this Engine was initialised with initializePool() / initializePoolFromEngine()
|
|
// the m_slots vector is non-empty. In that case, delegate to the pool
|
|
// dispatcher which acquires the first idle slot and runs inference there.
|
|
// This branch is NEVER taken for single-GPU use (buildLoadNetwork / loadNetwork).
|
|
if (!m_slots.empty()) {
|
|
return runInferenceFromPool(inputs, featureVectors);
|
|
}
|
|
|
|
// ============================================================================
|
|
// GPU INFERENCE THROTTLE
|
|
// ============================================================================
|
|
// Limit how many Engine instances can run CUDA inference simultaneously.
|
|
// Without this, 12 cameras each with their own Engine all submit GPU work
|
|
// at once → SM 100% → thermal throttle → cudaStreamSynchronize hangs.
|
|
// The semaphore lets excess threads wait on CPU (nearly zero cost) while
|
|
// a bounded number use the GPU efficiently.
|
|
GpuInferenceGuard gpuThrottle;
|
|
|
|
// ============================================================================
|
|
// SINGLE-ENGINE SERIALISATION
|
|
// ============================================================================
|
|
// The single Engine instance has shared mutable state (m_buffers, m_lastBatchSize,
|
|
// m_inferenceStream, TRT execution context). If two LabVIEW threads call
|
|
// runInference concurrently with different batch sizes, one will overwrite
|
|
// the input shapes and buffers while the other is mid-inference, causing a
|
|
// fatal "illegal memory access" that permanently corrupts the CUDA context.
|
|
//
|
|
// Pool-mode slots have their own busy-flag dispatch so they do NOT need this.
|
|
auto _mutexWaitStart = std::chrono::steady_clock::now();
|
|
std::lock_guard<std::mutex> inferenceLock(m_inferenceMutex);
|
|
auto _mutexAcquired = std::chrono::steady_clock::now();
|
|
double _mutexWaitMs = std::chrono::duration<double, std::milli>(_mutexAcquired - _mutexWaitStart).count();
|
|
if (_mutexWaitMs > 50.0) {
|
|
ANS_DBG("TRT_Engine", "MUTEX WAIT: %.1fms (queued behind another inference)", _mutexWaitMs);
|
|
}
|
|
|
|
// ============================================================================
|
|
// THREAD-SAFE GPU CONTEXT
|
|
// ============================================================================
|
|
// Ensure the calling thread's CUDA device matches this engine's GPU.
|
|
// This is essential for multi-GPU round-robin: LabVIEW reuses threads
|
|
// across tasks, so a thread that last ran inference on GPU 1 might now
|
|
// be running a task on GPU 0. Without this, cv::cuda::GpuMat allocations
|
|
// and kernel launches would target the wrong GPU, causing result corruption.
|
|
// Skip cudaSetDevice if already on the correct device — under WDDM
|
|
// with multiple GPUs each call costs 1-5ms of scheduler overhead.
|
|
{
|
|
int currentDev = -1;
|
|
cudaGetDevice(¤tDev);
|
|
if (currentDev != m_options.deviceIndex) {
|
|
cudaSetDevice(m_options.deviceIndex);
|
|
}
|
|
}
|
|
|
|
// ============================================================================
|
|
// DEBUG: First call diagnostics (per-instance, not process-wide)
|
|
// ============================================================================
|
|
|
|
if (m_verbose && m_firstInferenceCall) {
|
|
std::cout << "\n=== First runInference Call ===" << std::endl;
|
|
std::cout << "Number of input tensors: " << inputs.size() << std::endl;
|
|
for (size_t i = 0; i < inputs.size(); ++i) {
|
|
std::cout << "Input " << i << " batch size: " << inputs[i].size() << std::endl;
|
|
if (!inputs[i].empty()) {
|
|
const auto& img = inputs[i][0];
|
|
std::cout << " Image shape: " << img.cols << "x" << img.rows
|
|
<< "x" << img.channels() << " (type: " << img.type() << ")" << std::endl;
|
|
}
|
|
}
|
|
|
|
// Print optimization profile information
|
|
std::cout << "\n=== Engine Profile Information ===" << std::endl;
|
|
std::cout << "Number of optimization profiles: "
|
|
<< m_engine->getNbOptimizationProfiles() << std::endl;
|
|
|
|
if (m_engine->getNbOptimizationProfiles() > 0) {
|
|
for (int profile = 0; profile < m_engine->getNbOptimizationProfiles(); ++profile) {
|
|
std::cout << "\n--- Profile " << profile << " ---" << std::endl;
|
|
|
|
for (size_t i = 0; i < m_IOTensorNames.size(); ++i) {
|
|
const char* tensorName = m_IOTensorNames[i].c_str();
|
|
|
|
// Check if this is an input tensor
|
|
auto ioMode = m_engine->getTensorIOMode(tensorName);
|
|
if (ioMode != nvinfer1::TensorIOMode::kINPUT) {
|
|
continue;
|
|
}
|
|
|
|
auto minDims = m_engine->getProfileShape(tensorName, profile,
|
|
nvinfer1::OptProfileSelector::kMIN);
|
|
auto optDims = m_engine->getProfileShape(tensorName, profile,
|
|
nvinfer1::OptProfileSelector::kOPT);
|
|
auto maxDims = m_engine->getProfileShape(tensorName, profile,
|
|
nvinfer1::OptProfileSelector::kMAX);
|
|
|
|
std::cout << "Tensor '" << tensorName << "' (INPUT):" << std::endl;
|
|
std::cout << " Min: [" << minDims.d[0];
|
|
for (int j = 1; j < minDims.nbDims; ++j) std::cout << "," << minDims.d[j];
|
|
std::cout << "]" << std::endl;
|
|
|
|
std::cout << " Opt: [" << optDims.d[0];
|
|
for (int j = 1; j < optDims.nbDims; ++j) std::cout << "," << optDims.d[j];
|
|
std::cout << "]" << std::endl;
|
|
|
|
std::cout << " Max: [" << maxDims.d[0];
|
|
for (int j = 1; j < maxDims.nbDims; ++j) std::cout << "," << maxDims.d[j];
|
|
std::cout << "]" << std::endl;
|
|
}
|
|
}
|
|
}
|
|
|
|
if (!m_context->allInputDimensionsSpecified()) {
|
|
std::cout << "ERROR: Input dimensions not specified in context!" << std::endl;
|
|
return false;
|
|
}
|
|
|
|
std::cout << "\nContext state: All dimensions specified ✓" << std::endl;
|
|
m_firstInferenceCall = false;
|
|
}
|
|
|
|
// ============================================================================
|
|
// INPUT VALIDATION
|
|
// ============================================================================
|
|
|
|
if (inputs.empty() || inputs[0].empty()) {
|
|
std::cout << "Error: Empty input" << std::endl;
|
|
return false;
|
|
}
|
|
|
|
// ============================================================================
|
|
// SM=100% DETECTOR — tracks inference timing trends to catch the exact
|
|
// moment GPU becomes saturated. Logs every 50 inferences with rolling
|
|
// average, and immediately when degradation is detected.
|
|
// ============================================================================
|
|
// Global (process-wide) counters shared across all engine instances/threads
|
|
static std::atomic<int64_t> s_globalInfCount{0};
|
|
static std::atomic<int> s_globalActiveInf{0}; // currently in-flight inferences
|
|
static std::atomic<double> s_globalLastAvgMs{0.0}; // last known avg inference time
|
|
|
|
const int64_t myInfNum = s_globalInfCount.fetch_add(1) + 1;
|
|
s_globalActiveInf.fetch_add(1);
|
|
|
|
// Per-thread tracking
|
|
{
|
|
static thread_local int64_t s_infCount = 0;
|
|
static thread_local std::chrono::steady_clock::time_point s_lastLog;
|
|
static thread_local double s_rollingAvgMs = 0.0;
|
|
static thread_local double s_baselineMs = 0.0; // avg during first 100 inferences
|
|
static thread_local double s_maxMs = 0.0;
|
|
static thread_local bool s_degradationLogged = false;
|
|
s_infCount++;
|
|
|
|
if (s_infCount == 1) {
|
|
s_lastLog = std::chrono::steady_clock::now();
|
|
ANS_DBG("TRT_SM100", "FIRST inference — engine alive, globalInf=%lld", myInfNum);
|
|
}
|
|
|
|
// Log every 50 inferences (more frequent than 500 to catch transitions)
|
|
if (s_infCount % 50 == 0) {
|
|
auto now = std::chrono::steady_clock::now();
|
|
double elapsed = std::chrono::duration<double>(now - s_lastLog).count();
|
|
double fps = (elapsed > 0) ? (50.0 / elapsed) : 0;
|
|
s_lastLog = now;
|
|
|
|
size_t vramFree = 0, vramTotal = 0;
|
|
cudaMemGetInfo(&vramFree, &vramTotal);
|
|
size_t vramUsedMB = (vramTotal - vramFree) / (1024 * 1024);
|
|
size_t vramFreeMB = vramFree / (1024 * 1024);
|
|
|
|
ANS_DBG("TRT_SM100", "#%lld [global=%lld active=%d] %.1f inf/sec avgMs=%.1f maxMs=%.1f batch=%d graphs=%zu VRAM=%zuMB/%zuMB",
|
|
s_infCount, myInfNum, s_globalActiveInf.load(),
|
|
fps, s_rollingAvgMs, s_maxMs,
|
|
(int)inputs[0].size(), m_graphExecs.size(),
|
|
vramUsedMB, vramFreeMB);
|
|
|
|
// Capture baseline from first 100 inferences
|
|
if (s_infCount == 100) {
|
|
s_baselineMs = s_rollingAvgMs;
|
|
ANS_DBG("TRT_SM100", "BASELINE established: %.1fms/inference", s_baselineMs);
|
|
}
|
|
|
|
// Detect degradation: avg >3x baseline AND baseline is set
|
|
if (s_baselineMs > 0 && s_rollingAvgMs > s_baselineMs * 3.0 && !s_degradationLogged) {
|
|
s_degradationLogged = true;
|
|
ANS_DBG("TRT_SM100", "*** DEGRADATION DETECTED *** avg=%.1fms baseline=%.1fms (%.1fx) VRAM=%zuMB/%zuMB active=%d",
|
|
s_rollingAvgMs, s_baselineMs, s_rollingAvgMs / s_baselineMs,
|
|
vramUsedMB, vramFreeMB, s_globalActiveInf.load());
|
|
}
|
|
|
|
// Reset max for next window
|
|
s_maxMs = 0.0;
|
|
}
|
|
}
|
|
|
|
const auto numInputs = m_inputDims.size();
|
|
if (inputs.size() != numInputs) {
|
|
std::cout << "Error: Wrong number of inputs. Expected: " << numInputs
|
|
<< ", Got: " << inputs.size() << std::endl;
|
|
return false;
|
|
}
|
|
|
|
const auto batchSize = static_cast<int32_t>(inputs[0].size());
|
|
|
|
if (batchSize > m_options.maxBatchSize) {
|
|
std::cout << "Error: Batch size " << batchSize << " exceeds maximum "
|
|
<< m_options.maxBatchSize << std::endl;
|
|
return false;
|
|
}
|
|
|
|
if (batchSize < 1) {
|
|
std::cout << "Error: Batch size must be at least 1" << std::endl;
|
|
return false;
|
|
}
|
|
|
|
// Validate batch size consistency across all inputs
|
|
for (size_t i = 1; i < inputs.size(); ++i) {
|
|
if (inputs[i].size() != static_cast<size_t>(batchSize)) {
|
|
std::cout << "Error: Inconsistent batch sizes across inputs. Input 0: "
|
|
<< batchSize << ", Input " << i << ": " << inputs[i].size() << std::endl;
|
|
return false;
|
|
}
|
|
}
|
|
|
|
// ============================================================================
|
|
// STREAM GUARD
|
|
// ============================================================================
|
|
// m_inferenceStream is now created eagerly in loadNetwork() so it is always
|
|
// valid here. Guard against the (unlikely) edge case where runInference is
|
|
// called before loadNetwork succeeds.
|
|
if (!m_streamInitialized || !m_inferenceStream) {
|
|
return false;
|
|
}
|
|
|
|
// ============================================================================
|
|
// SET INPUT SHAPES (batch size changed OR dynamic spatial dims need updating)
|
|
// ============================================================================
|
|
// Fast path: compute desired dims first, then compare against cached dims.
|
|
// This avoids all TRT API calls when the shape hasn't actually changed —
|
|
// critical for the recognizer which is called ~50-100x per image with
|
|
// dynamic width but often the same or similar widths.
|
|
// ============================================================================
|
|
|
|
{
|
|
// Lazily initialise the dims cache on first call
|
|
if (m_lastSetInputDims.empty()) {
|
|
m_lastSetInputDims.resize(numInputs);
|
|
for (size_t i = 0; i < numInputs; ++i) {
|
|
m_lastSetInputDims[i].nbDims = 0; // force mismatch on first call
|
|
}
|
|
}
|
|
|
|
// Build desired dims for every input tensor (cheap — no TRT API calls)
|
|
bool anyDimChanged = (m_lastBatchSize != batchSize);
|
|
std::vector<nvinfer1::Dims> desiredDims(numInputs);
|
|
for (size_t i = 0; i < numInputs; ++i) {
|
|
nvinfer1::Dims& nd = desiredDims[i];
|
|
nd.nbDims = 4;
|
|
nd.d[0] = batchSize;
|
|
nd.d[1] = m_inputDims[i].d[0]; // channels
|
|
if (m_hasDynamicSpatialDims && !inputs[i].empty()) {
|
|
const auto& firstImg = inputs[i][0];
|
|
nd.d[2] = (m_inputDims[i].d[1] == -1) ? firstImg.rows : m_inputDims[i].d[1];
|
|
nd.d[3] = (m_inputDims[i].d[2] == -1) ? firstImg.cols : m_inputDims[i].d[2];
|
|
} else {
|
|
nd.d[2] = m_inputDims[i].d[1];
|
|
nd.d[3] = m_inputDims[i].d[2];
|
|
}
|
|
// Compare with cached
|
|
if (!anyDimChanged) {
|
|
const auto& cached = m_lastSetInputDims[i];
|
|
if (cached.nbDims != nd.nbDims ||
|
|
cached.d[0] != nd.d[0] || cached.d[1] != nd.d[1] ||
|
|
cached.d[2] != nd.d[2] || cached.d[3] != nd.d[3]) {
|
|
anyDimChanged = true;
|
|
}
|
|
}
|
|
}
|
|
|
|
if (anyDimChanged) {
|
|
ANS_DBG("TRT_Engine", "Shape change detected: batch %d -> %d (graphsCached=%zu)",
|
|
m_lastBatchSize, batchSize, m_graphExecs.size());
|
|
|
|
// === First-time diagnostics (verbose, once) ===
|
|
const bool firstTime = !m_batchShapeChangeLogged;
|
|
|
|
if (m_verbose && firstTime) {
|
|
std::cout << "\nInfo: Batch size change: " << m_lastBatchSize
|
|
<< " -> " << batchSize << std::endl;
|
|
}
|
|
|
|
// Set optimization profile (only when truly needed)
|
|
if (m_engine->getNbOptimizationProfiles() > 0) {
|
|
int currentProfile = m_context->getOptimizationProfile();
|
|
if (currentProfile != 0 || m_lastBatchSize < 0) {
|
|
if (m_verbose && firstTime) {
|
|
std::cout << " Setting optimization profile to 0..." << std::endl;
|
|
}
|
|
if (!m_context->setOptimizationProfileAsync(0, m_inferenceStream)) {
|
|
std::cout << "Error: Failed to set optimization profile 0" << std::endl;
|
|
return false;
|
|
}
|
|
cudaError_t syncErr = cudaStreamSynchronize_Safe(m_inferenceStream);
|
|
if (syncErr != cudaSuccess) {
|
|
std::cout << "Error: Failed to sync after profile change: "
|
|
<< cudaGetErrorString(syncErr) << std::endl;
|
|
return false;
|
|
}
|
|
if (m_verbose && firstTime) {
|
|
std::cout << " Optimization profile set successfully" << std::endl;
|
|
}
|
|
}
|
|
}
|
|
|
|
// Update shapes for input tensors that actually changed
|
|
for (size_t i = 0; i < numInputs; ++i) {
|
|
const char* tensorName = m_IOTensorNames[i].c_str();
|
|
|
|
// Skip non-input tensors
|
|
auto ioMode = m_engine->getTensorIOMode(tensorName);
|
|
if (ioMode != nvinfer1::TensorIOMode::kINPUT) continue;
|
|
|
|
const nvinfer1::Dims& newDims = desiredDims[i];
|
|
const nvinfer1::Dims& cached = m_lastSetInputDims[i];
|
|
|
|
// Skip this tensor if its dims haven't changed
|
|
if (cached.nbDims == newDims.nbDims &&
|
|
cached.d[0] == newDims.d[0] && cached.d[1] == newDims.d[1] &&
|
|
cached.d[2] == newDims.d[2] && cached.d[3] == newDims.d[3]) {
|
|
continue;
|
|
}
|
|
|
|
// First-time verbose diagnostics
|
|
if (m_verbose && firstTime) {
|
|
std::cout << "\n Processing tensor " << i << ": '" << tensorName << "'" << std::endl;
|
|
// Validate batch size range (first time only)
|
|
if (m_engine->getNbOptimizationProfiles() > 0) {
|
|
int profileIndex = m_context->getOptimizationProfile();
|
|
nvinfer1::Dims minDims = m_engine->getProfileShape(
|
|
tensorName, profileIndex, nvinfer1::OptProfileSelector::kMIN);
|
|
nvinfer1::Dims maxDims = m_engine->getProfileShape(
|
|
tensorName, profileIndex, nvinfer1::OptProfileSelector::kMAX);
|
|
std::cout << " Profile batch range: [" << minDims.d[0]
|
|
<< " to " << maxDims.d[0] << "]" << std::endl;
|
|
if (batchSize < minDims.d[0] || batchSize > maxDims.d[0]) {
|
|
std::cout << "Error: Batch size " << batchSize
|
|
<< " outside profile range" << std::endl;
|
|
return false;
|
|
}
|
|
}
|
|
auto currentShape = m_context->getTensorShape(tensorName);
|
|
std::cout << " Current context shape: [";
|
|
for (int j = 0; j < currentShape.nbDims; ++j) {
|
|
if (j > 0) std::cout << ", ";
|
|
std::cout << currentShape.d[j];
|
|
}
|
|
std::cout << "]" << std::endl;
|
|
std::cout << " Setting new shape: [" << newDims.d[0] << ", "
|
|
<< newDims.d[1] << ", " << newDims.d[2] << ", "
|
|
<< newDims.d[3] << "]" << std::endl;
|
|
}
|
|
|
|
ANS_DBG("TRT_Engine", "setInputShape('%s') [%d,%d,%d,%d]",
|
|
tensorName, newDims.d[0], newDims.d[1], newDims.d[2], newDims.d[3]);
|
|
if (!m_context->setInputShape(tensorName, newDims)) {
|
|
ANS_DBG("TRT_Engine", "ERROR: setInputShape FAILED for '%s'", tensorName);
|
|
std::cout << "Error: Failed to set input shape for '" << tensorName << "'" << std::endl;
|
|
return false;
|
|
}
|
|
|
|
// Verify shape (first time only — trust the API on hot path)
|
|
if (firstTime) {
|
|
auto verifyShape = m_context->getTensorShape(tensorName);
|
|
if (verifyShape.d[0] != batchSize) {
|
|
std::cout << "Error: Shape change didn't take effect. Expected batch "
|
|
<< batchSize << ", got " << verifyShape.d[0] << std::endl;
|
|
return false;
|
|
}
|
|
if (m_verbose) {
|
|
std::cout << " Shape updated successfully" << std::endl;
|
|
}
|
|
}
|
|
|
|
m_lastSetInputDims[i] = newDims;
|
|
}
|
|
|
|
// Verify all input dimensions specified (first time only)
|
|
if (firstTime) {
|
|
if (!m_context->allInputDimensionsSpecified()) {
|
|
std::cout << "Error: Not all input dimensions specified after shape change" << std::endl;
|
|
for (size_t i = 0; i < m_IOTensorNames.size(); ++i) {
|
|
auto shape = m_context->getTensorShape(m_IOTensorNames[i].c_str());
|
|
std::cout << " " << m_IOTensorNames[i] << ": [";
|
|
for (int j = 0; j < shape.nbDims; ++j) {
|
|
if (j > 0) std::cout << ", ";
|
|
std::cout << shape.d[j];
|
|
}
|
|
std::cout << "]" << std::endl;
|
|
}
|
|
return false;
|
|
}
|
|
}
|
|
|
|
m_lastBatchSize = batchSize;
|
|
m_batchShapeChangeLogged = true;
|
|
|
|
// CRITICAL: Invalidate all cached CUDA graphs after shape change.
|
|
// Graphs were captured with the OLD context state (old tensor shapes).
|
|
// Launching them after setInputShape() produces undefined GPU behavior
|
|
// (invalid kernel sequences, SM lockup at 100%, inference hang).
|
|
if (!m_graphExecs.empty()) {
|
|
size_t destroyed = m_graphExecs.size();
|
|
for (auto& [bs, ge] : m_graphExecs) {
|
|
if (ge) cudaGraphExecDestroy(ge);
|
|
}
|
|
m_graphExecs.clear();
|
|
ANS_DBG("TRT_Engine", "INVALIDATED %zu cached CUDA graphs after shape change (batch=%d)",
|
|
destroyed, batchSize);
|
|
if (m_verbose || firstTime) {
|
|
std::cout << "Info: Invalidated " << destroyed
|
|
<< " cached CUDA graphs after shape change" << std::endl;
|
|
}
|
|
}
|
|
|
|
if (m_verbose && firstTime) {
|
|
std::cout << "\nInfo: Input shapes updated successfully for batch size "
|
|
<< batchSize << " ✓\n" << std::endl;
|
|
}
|
|
}
|
|
}
|
|
|
|
// ============================================================================
|
|
// PREPROCESS AND COPY INPUTS TO GPU BUFFERS
|
|
// ============================================================================
|
|
|
|
// Pass 1: Validate all input dimensions before any GPU work.
|
|
// Dynamic dims (-1) are skipped in validation (they accept any size).
|
|
for (size_t i = 0; i < numInputs; ++i) {
|
|
const auto& batchInput = inputs[i];
|
|
const auto& dims = m_inputDims[i];
|
|
if (!batchInput.empty()) {
|
|
const auto& firstImg = batchInput[0];
|
|
bool mismatch = false;
|
|
if (dims.d[0] > 0 && firstImg.channels() != dims.d[0]) mismatch = true;
|
|
if (dims.d[1] > 0 && firstImg.rows != dims.d[1]) mismatch = true;
|
|
if (dims.d[2] > 0 && firstImg.cols != dims.d[2]) mismatch = true;
|
|
if (mismatch) {
|
|
std::cout << "Error: Input " << i << " dimension mismatch!" << std::endl;
|
|
std::cout << " Expected: " << dims.d[2] << "x" << dims.d[1]
|
|
<< "x" << dims.d[0] << " (WxHxC, -1=dynamic)" << std::endl;
|
|
std::cout << " Got: " << firstImg.cols << "x" << firstImg.rows
|
|
<< "x" << firstImg.channels() << " (WxHxC)" << std::endl;
|
|
return false;
|
|
}
|
|
}
|
|
}
|
|
|
|
// Pass 2: Preprocess + D2D copies — all on m_inferenceStream (no null stream).
|
|
//
|
|
// All OpenCV CUDA ops (convertTo, subtract, divide, split) in blobFromGpuMats
|
|
// now run on m_inferenceStream via the cv::cuda::Stream wrapper. This means:
|
|
// • No null-stream interaction — eliminates global sync barriers on WDDM
|
|
// • No event bridge needed — same-stream ordering guarantees correctness
|
|
// • CUDA graphs are safe — cv::cuda::split runs BEFORE graph capture
|
|
//
|
|
// GpuMat-lifetime: preprocessedBuffers keeps GpuMats alive past the final
|
|
// cudaStreamSynchronize, so cudaFree() doesn't stall the pipeline.
|
|
auto _prepStart = std::chrono::steady_clock::now();
|
|
cv::cuda::Stream cvInferStream = cv::cuda::StreamAccessor::wrapStream(m_inferenceStream);
|
|
std::vector<cv::cuda::GpuMat> preprocessedBuffers;
|
|
preprocessedBuffers.reserve(numInputs);
|
|
|
|
for (size_t i = 0; i < numInputs; ++i) {
|
|
const auto& batchInput = inputs[i];
|
|
|
|
// Preprocess on m_inferenceStream (not the null stream).
|
|
preprocessedBuffers.push_back(
|
|
blobFromGpuMats(batchInput, m_subVals, m_divVals, m_normalize, false, cvInferStream));
|
|
|
|
// D2D copy: same stream as preprocessing, so ordering is guaranteed.
|
|
const auto& blobMat = preprocessedBuffers.back();
|
|
const size_t copySize = static_cast<size_t>(blobMat.rows) * static_cast<size_t>(blobMat.cols) * blobMat.elemSize();
|
|
cudaError_t copyErr = cudaMemcpyAsync(
|
|
m_buffers[i],
|
|
preprocessedBuffers.back().ptr<void>(),
|
|
copySize,
|
|
cudaMemcpyDeviceToDevice,
|
|
m_inferenceStream);
|
|
|
|
if (copyErr != cudaSuccess) {
|
|
std::cout << "Error: Failed to copy input " << i
|
|
<< " to inference buffer: " << cudaGetErrorString(copyErr) << std::endl;
|
|
return false;
|
|
}
|
|
}
|
|
|
|
{
|
|
double _prepMs = std::chrono::duration<double, std::milli>(
|
|
std::chrono::steady_clock::now() - _prepStart).count();
|
|
if (_prepMs > 100.0) {
|
|
ANS_DBG("TRT_SM100", "SLOW PREPROCESS: %.1fms batch=%d (blobFromGpuMats+D2D)", _prepMs, batchSize);
|
|
}
|
|
}
|
|
|
|
// ============================================================================
|
|
// PRE-ALLOCATE OUTPUT STRUCTURE
|
|
// ============================================================================
|
|
|
|
const size_t numOutputs = m_outputLengths.size();
|
|
|
|
featureVectors.clear();
|
|
featureVectors.resize(batchSize);
|
|
for (int batch = 0; batch < batchSize; ++batch) {
|
|
featureVectors[batch].resize(numOutputs);
|
|
for (size_t outputIdx = 0; outputIdx < numOutputs; ++outputIdx)
|
|
featureVectors[batch][outputIdx].resize(m_outputLengths[outputIdx]);
|
|
}
|
|
|
|
// ============================================================================
|
|
// RUN INFERENCE + COPY OUTPUTS (CUDA Graph path or direct path)
|
|
// ============================================================================
|
|
|
|
// CUDA Graph path
|
|
// ---------------
|
|
// On the first call for a given batchSize we capture enqueueV3 + D2H copies
|
|
// into a reusable graph. Subsequent calls use cudaGraphLaunch, replacing
|
|
// many individual kernel-submission API calls with a single launch.
|
|
//
|
|
// Prerequisites satisfied here:
|
|
// • Preprocessing + D2D copies are queued on m_inferenceStream (same-stream
|
|
// ordering guarantees they complete before captured kernels execute)
|
|
// • m_pinnedOutputBuffers has stable addresses (allocated in loadNetwork)
|
|
// • m_buffers (GPU outputs) have stable addresses (allocated in loadNetwork)
|
|
//
|
|
// Falls back to the direct path if pinned buffers are unavailable or if
|
|
// graph capture/instantiation fails for any reason.
|
|
|
|
// CUDA graphs capture fixed kernel sequences; incompatible with dynamic spatial dims
|
|
// (input H/W change per inference call → different TRT kernel plans each time).
|
|
// Disabled for pool slots — concurrent graph captures on the same GPU corrupt the
|
|
// CUDA context ("operation not permitted when stream is capturing").
|
|
const bool canGraph = !m_disableGraphs && !m_pinnedOutputBuffers.empty() && !m_hasDynamicSpatialDims;
|
|
bool graphUsed = false;
|
|
|
|
if (canGraph) {
|
|
auto& graphExec = m_graphExecs[batchSize]; // inserts nullptr on first access
|
|
if (!graphExec) {
|
|
ANS_DBG("TRT_Engine", "CUDA graph CAPTURE starting for batch=%d (cached=%zu)",
|
|
batchSize, m_graphExecs.size());
|
|
// First call for this batchSize -- capture a new graph.
|
|
// Serialise captures across all Engine instances on this device to
|
|
// prevent TRT's shared workspace from creating cross-stream
|
|
// dependencies that violate CUDA graph capture rules.
|
|
std::lock_guard<std::mutex> captureLock(graphCaptureMutex());
|
|
|
|
// Clear any sticky CUDA error from a prior failed capture so that
|
|
// this attempt starts clean.
|
|
cudaGetLastError();
|
|
|
|
cudaGraph_t graph = nullptr;
|
|
bool captureOk = false;
|
|
|
|
if (cudaStreamBeginCapture(m_inferenceStream,
|
|
cudaStreamCaptureModeRelaxed) == cudaSuccess) {
|
|
// Record TRT kernels into the graph (not executed yet).
|
|
TRT_ENQUEUE(m_context.get(), m_inferenceStream, m_buffers);
|
|
|
|
// Record D2H copies to stable pinned addresses.
|
|
for (size_t outputIdx = 0; outputIdx < numOutputs; ++outputIdx) {
|
|
cudaMemcpyAsync(
|
|
m_pinnedOutputBuffers[outputIdx],
|
|
static_cast<char*>(m_buffers[numInputs + outputIdx]),
|
|
static_cast<size_t>(batchSize) * m_outputLengths[outputIdx] * sizeof(T),
|
|
cudaMemcpyDeviceToHost,
|
|
m_inferenceStream);
|
|
}
|
|
|
|
captureOk = (cudaStreamEndCapture(m_inferenceStream, &graph) == cudaSuccess
|
|
&& graph != nullptr);
|
|
}
|
|
|
|
if (captureOk) {
|
|
cudaGraphExec_t exec = nullptr;
|
|
if (cudaGraphInstantiate(&exec, graph, nullptr, nullptr, 0) == cudaSuccess)
|
|
graphExec = exec;
|
|
cudaGraphDestroy(graph);
|
|
ANS_DBG("TRT_Engine", "CUDA graph CAPTURED OK for batch=%d exec=%p",
|
|
batchSize, (void*)graphExec);
|
|
}
|
|
|
|
if (!graphExec) {
|
|
ANS_DBG("TRT_Engine", "CUDA graph capture FAILED for batch=%d — falling back to direct path",
|
|
batchSize);
|
|
std::cout << "Warning: CUDA graph capture failed for batchSize="
|
|
<< batchSize << " -- falling back to direct inference path." << std::endl;
|
|
// Disable graph acceleration for this Engine instance.
|
|
for (T* p : m_pinnedOutputBuffers) { if (p) cudaFreeHost(p); }
|
|
m_pinnedOutputBuffers.clear();
|
|
m_graphExecs.erase(batchSize);
|
|
}
|
|
}
|
|
|
|
if (graphExec) {
|
|
ANS_DBG("TRT_Engine", "CUDA graph LAUNCH batch=%d exec=%p", batchSize, (void*)graphExec);
|
|
// Launch the pre-captured graph (single API call replaces many).
|
|
auto _graphStart = std::chrono::steady_clock::now();
|
|
cudaGraphLaunch(graphExec, m_inferenceStream);
|
|
cudaStreamSynchronize_Safe(m_inferenceStream);
|
|
auto _graphEnd = std::chrono::steady_clock::now();
|
|
double _graphMs = std::chrono::duration<double, std::milli>(_graphEnd - _graphStart).count();
|
|
if (_graphMs > 500.0) {
|
|
ANS_DBG("TRT_SM100", "SLOW GRAPH: %.1fms batch=%d active=%d",
|
|
_graphMs, batchSize, s_globalActiveInf.load());
|
|
}
|
|
|
|
// CPU memcpy: pinned buffers -> featureVectors (interleaved by batch).
|
|
for (int batch = 0; batch < batchSize; ++batch) {
|
|
for (size_t outputIdx = 0; outputIdx < numOutputs; ++outputIdx) {
|
|
std::memcpy(
|
|
featureVectors[batch][outputIdx].data(),
|
|
m_pinnedOutputBuffers[outputIdx]
|
|
+ static_cast<size_t>(batch) * m_outputLengths[outputIdx],
|
|
m_outputLengths[outputIdx] * sizeof(T));
|
|
}
|
|
}
|
|
graphUsed = true;
|
|
}
|
|
}
|
|
|
|
// Direct path (no graph)
|
|
// ----------------------
|
|
// Used when pinned buffers are unavailable or graph capture failed.
|
|
if (!graphUsed) {
|
|
ANS_DBG("TRT_Engine", "Direct path (no graph) batch=%d", batchSize);
|
|
auto enqueueStart = std::chrono::steady_clock::now();
|
|
bool success = TRT_ENQUEUE(m_context.get(), m_inferenceStream, m_buffers);
|
|
auto enqueueEnd = std::chrono::steady_clock::now();
|
|
double enqueueMs = std::chrono::duration<double, std::milli>(enqueueEnd - enqueueStart).count();
|
|
if (enqueueMs > 500.0) {
|
|
ANS_DBG("TRT_Engine", "SLOW ENQUEUE: %.1fms batch=%d (enqueueV3 blocked!)", enqueueMs, batchSize);
|
|
}
|
|
if (!success) {
|
|
ANS_DBG("TRT_Engine", "ERROR: enqueueV3 FAILED batch=%d", batchSize);
|
|
return false;
|
|
}
|
|
|
|
for (int batch = 0; batch < batchSize; ++batch) {
|
|
for (size_t outputIdx = 0; outputIdx < numOutputs; ++outputIdx) {
|
|
const size_t outputBinding = numInputs + outputIdx;
|
|
const size_t offset =
|
|
static_cast<size_t>(batch) * m_outputLengths[outputIdx] * sizeof(T);
|
|
|
|
cudaError_t copyErr = cudaMemcpyAsync(
|
|
featureVectors[batch][outputIdx].data(),
|
|
static_cast<char*>(m_buffers[outputBinding]) + offset,
|
|
m_outputLengths[outputIdx] * sizeof(T),
|
|
cudaMemcpyDeviceToHost,
|
|
m_inferenceStream);
|
|
|
|
if (copyErr != cudaSuccess) {
|
|
return false;
|
|
}
|
|
}
|
|
}
|
|
|
|
auto syncStart = std::chrono::steady_clock::now();
|
|
cudaError_t syncErr = cudaStreamSynchronize_Safe(m_inferenceStream);
|
|
auto syncEnd = std::chrono::steady_clock::now();
|
|
double syncMs = std::chrono::duration<double, std::milli>(syncEnd - syncStart).count();
|
|
if (syncMs > 500.0) {
|
|
ANS_DBG("TRT_Engine", "SLOW INFERENCE SYNC: %.1fms batch=%d (direct path)", syncMs, batchSize);
|
|
}
|
|
if (syncErr != cudaSuccess) {
|
|
ANS_DBG("TRT_Engine", "ERROR: cudaStreamSync FAILED err=%d (%s)",
|
|
(int)syncErr, cudaGetErrorString(syncErr));
|
|
return false;
|
|
}
|
|
}
|
|
|
|
// ============================================================================
|
|
// Per-inference total timing breakdown (mutex wait + preprocess + GPU)
|
|
// ============================================================================
|
|
{
|
|
double totalMs = std::chrono::duration<double, std::milli>(
|
|
std::chrono::steady_clock::now() - _mutexWaitStart).count();
|
|
double gpuMs = totalMs - _mutexWaitMs; // Everything after mutex acquired
|
|
// Log every inference that takes >100ms total (including mutex wait)
|
|
if (totalMs > 100.0) {
|
|
ANS_DBG("TRT_Timing", "total=%.1fms (mutex=%.1fms gpu=%.1fms) batch=%d active=%d",
|
|
totalMs, _mutexWaitMs, gpuMs, batchSize, s_globalActiveInf.load());
|
|
}
|
|
}
|
|
|
|
// ============================================================================
|
|
// SM=100% DETECTOR — end-of-inference timing
|
|
// ============================================================================
|
|
{
|
|
static thread_local double s_ema = 0;
|
|
static thread_local std::chrono::steady_clock::time_point s_prevEnd;
|
|
static thread_local bool s_firstDone = false;
|
|
|
|
auto _now = std::chrono::steady_clock::now();
|
|
if (s_firstDone) {
|
|
double sinceLastMs = std::chrono::duration<double, std::milli>(_now - s_prevEnd).count();
|
|
// If time between consecutive inferences jumps dramatically,
|
|
// something blocked the thread (SM=100% or mutex contention)
|
|
if (s_ema > 0 && sinceLastMs > s_ema * 3.0 && sinceLastMs > 500.0) {
|
|
size_t vf = 0, vt = 0;
|
|
cudaMemGetInfo(&vf, &vt);
|
|
ANS_DBG("TRT_SM100", "GAP DETECTED: %.1fms between inferences (avg=%.1fms, %.1fx) active=%d VRAM=%zuMB free",
|
|
sinceLastMs, s_ema, sinceLastMs / s_ema,
|
|
s_globalActiveInf.load(), vf / (1024*1024));
|
|
}
|
|
s_ema = (s_ema == 0) ? sinceLastMs : (0.9 * s_ema + 0.1 * sinceLastMs);
|
|
}
|
|
s_prevEnd = _now;
|
|
s_firstDone = true;
|
|
|
|
s_globalActiveInf.fetch_sub(1);
|
|
}
|
|
|
|
return true;
|
|
}
|