// ============================================================================
// EngineBuildLoadNetwork.inl
// Path: ANSCORE/engines/TensorRTAPI/include/engine/EngineBuildLoadNetwork.inl
// Template implementation of Engine<T>::buildLoadNetwork / loadNetwork.
// ============================================================================
#pragma once
#include <filesystem>
#include <map>
#include <sstream>
#include "Utility.h"
#include "TRTCompat.h"
// ============================================================================
// Crash-safe wrappers for TensorRT operations that can crash the process.
//
// On Windows: uses SEH (__try/__except) to catch access violations, OOM, etc.
// SEH cannot coexist with C++ objects that have destructors in the same
// function scope, so these thin wrappers accept only raw pointers.
//
// On Linux: uses POSIX signals + sigsetjmp/siglongjmp to catch SIGSEGV,
// SIGBUS, SIGABRT, SIGFPE. Thread-local jump buffers ensure thread safety.
// Signal handlers are saved/restored around each dangerous call so that
// the application's own handlers are not permanently replaced.
//
// outExceptionCode: 0 = OK.
// Windows: the SEH exception code (e.g. 0xC0000005 = access violation).
// Linux: the signal number (e.g. 11 = SIGSEGV).
// ============================================================================
#ifdef _WIN32
# ifndef WIN32_LEAN_AND_MEAN
# define WIN32_LEAN_AND_MEAN
# endif
# ifndef NOMINMAX
# define NOMINMAX
# endif
# include <windows.h>
#else
# include <signal.h>
# include <setjmp.h>
// Thread-local storage for the POSIX crash-recovery mechanism.
// Each thread gets its own jump buffer and signal number so that
// concurrent engine builds on different threads don't interfere.
static thread_local sigjmp_buf s_crashJmpBuf;
static thread_local volatile sig_atomic_t s_crashSignal = 0;

// Signal handler installed only around dangerous TensorRT calls.
// It records which signal was received and jumps back to the
// sigsetjmp() checkpoint. Only synchronous, thread-directed signals
// (SIGSEGV, SIGBUS, SIGFPE) are guaranteed to land on the faulting
// thread; SIGABRT is process-wide but typically raised from the same
// thread that called abort().
static void engineCrashSignalHandler(int sig) {
    s_crashSignal = sig;
    siglongjmp(s_crashJmpBuf, 1);
}

// Guard that installs the crash signal handlers, saving the previous
// ones so they can be restored afterwards.
//
// FIX: previously nothing restored the handlers if control left the
// scope between install() and restore() (early return, or a C++
// exception thrown by the guarded TensorRT call) — the application's
// own handlers stayed permanently replaced. The destructor now acts
// as an RAII safety net. The explicit install()/restore() call sites
// elsewhere in this file continue to work unchanged.
struct CrashSignalGuard {
    struct sigaction oldSigsegv, oldSigbus, oldSigabrt, oldSigfpe;
    bool installed = false; // true while our handlers are active

    ~CrashSignalGuard() {
        // Safety net: undo install() if restore() was never reached.
        if (installed) restore();
    }

    // Install engineCrashSignalHandler for SIGSEGV/SIGBUS/SIGABRT/SIGFPE,
    // saving the previously registered actions, and reset s_crashSignal.
    void install() {
        struct sigaction sa;
        sa.sa_handler = engineCrashSignalHandler;
        sigemptyset(&sa.sa_mask);
        sa.sa_flags = 0; // no SA_RESTART — let interrupted calls fail
        sigaction(SIGSEGV, &sa, &oldSigsegv);
        sigaction(SIGBUS, &sa, &oldSigbus);
        sigaction(SIGABRT, &sa, &oldSigabrt);
        sigaction(SIGFPE, &sa, &oldSigfpe);
        s_crashSignal = 0;
        installed = true;
    }

    // Put the previously saved signal actions back.
    void restore() {
        sigaction(SIGSEGV, &oldSigsegv, nullptr);
        sigaction(SIGBUS, &oldSigbus, nullptr);
        sigaction(SIGABRT, &oldSigabrt, nullptr);
        sigaction(SIGFPE, &oldSigfpe, nullptr);
        installed = false;
    }
};
#endif // _WIN32
/// Crash-safe ONNX parser->parse() wrapper.
///
/// Runs parser->parse(data, dataSize) under a crash trap so a malformed
/// model cannot take down the whole process:
///  - Windows: SEH __try/__except. This function deliberately contains no
///    C++ objects with destructors, which SEH frames cannot unwind (see
///    file header comment).
///  - POSIX: CrashSignalGuard + sigsetjmp/siglongjmp.
///
/// @param parser           ONNX parser to invoke (must be non-null).
/// @param data             serialized ONNX model bytes.
/// @param dataSize         size of @p data in bytes.
/// @param outExceptionCode receives the exception/signal code on crash
///                         (0 = OK; SEH code on Windows, signal number
///                         on POSIX).
/// @return parser->parse() result, or false if a crash was trapped.
static bool parseOnnxModelSafe(
    nvonnxparser::IParser* parser,
    const void* data,
    size_t dataSize,
    unsigned long* outExceptionCode)
{
#ifdef _WIN32
    *outExceptionCode = 0;
    __try {
        return parser->parse(data, dataSize);
    }
    __except (EXCEPTION_EXECUTE_HANDLER) {
        *outExceptionCode = GetExceptionCode();
        return false;
    }
#else
    *outExceptionCode = 0;
    CrashSignalGuard guard;
    guard.install();
    // 'result' is only written before the normal sigsetjmp return or
    // after the longjmp return, never across the jump, so it does not
    // need to be volatile.
    bool result = false;
    if (sigsetjmp(s_crashJmpBuf, 1) == 0) {
        // Normal execution path
        result = parser->parse(data, dataSize);
    } else {
        // Returned here from signal handler — a crash was caught
        *outExceptionCode = static_cast<unsigned long>(s_crashSignal);
        result = false;
    }
    guard.restore();
    // NOTE(review): if parse() throws a C++ exception, restore() is
    // bypassed — assumes TRT's parser does not throw; confirm.
    return result;
#endif
}
/// Crash-safe builder->buildSerializedNetwork() wrapper.
/// Returns raw IHostMemory* (caller wraps in unique_ptr), or nullptr when
/// the build failed or crashed.
///
/// @param builder          TensorRT builder (must be non-null).
/// @param network          network definition to build from.
/// @param config           builder configuration.
/// @param outExceptionCode receives the exception/signal code on crash
///                         (0 = OK; SEH code on Windows, signal number
///                         on POSIX).
static nvinfer1::IHostMemory* buildSerializedNetworkSafe(
    nvinfer1::IBuilder* builder,
    nvinfer1::INetworkDefinition& network,
    nvinfer1::IBuilderConfig& config,
    unsigned long* outExceptionCode)
{
#ifdef _WIN32
    *outExceptionCode = 0;
    // SEH frame: keep this scope free of C++ objects with destructors.
    __try {
        return builder->buildSerializedNetwork(network, config);
    }
    __except (EXCEPTION_EXECUTE_HANDLER) {
        *outExceptionCode = GetExceptionCode();
        return nullptr;
    }
#else
    *outExceptionCode = 0;
    CrashSignalGuard guard;
    guard.install();
    nvinfer1::IHostMemory* plan = nullptr;
    if (sigsetjmp(s_crashJmpBuf, 1) == 0) {
        // Normal path — may run for a long time on large models.
        plan = builder->buildSerializedNetwork(network, config);
    } else {
        // A trapped crash long-jumped back here; report the signal.
        *outExceptionCode = static_cast<unsigned long>(s_crashSignal);
        plan = nullptr;
    }
    guard.restore();
    return plan;
#endif
}
/// Crash-safe runtime->deserializeCudaEngine() wrapper.
/// Returns raw ICudaEngine* (caller wraps in unique_ptr), or nullptr when
/// deserialization failed or crashed.
///
/// @param runtime          TensorRT runtime (must be non-null).
/// @param data             serialized engine plan bytes.
/// @param dataSize         size of @p data in bytes.
/// @param outExceptionCode receives the exception/signal code on crash
///                         (0 = OK; SEH code on Windows, signal number
///                         on POSIX).
static nvinfer1::ICudaEngine* deserializeCudaEngineSafe(
    nvinfer1::IRuntime* runtime,
    const void* data,
    size_t dataSize,
    unsigned long* outExceptionCode)
{
#ifdef _WIN32
    *outExceptionCode = 0;
    // SEH frame: keep this scope free of C++ objects with destructors.
    __try {
        return runtime->deserializeCudaEngine(data, dataSize);
    }
    __except (EXCEPTION_EXECUTE_HANDLER) {
        *outExceptionCode = GetExceptionCode();
        return nullptr;
    }
#else
    *outExceptionCode = 0;
    CrashSignalGuard guard;
    guard.install();
    nvinfer1::ICudaEngine* engine = nullptr;
    if (sigsetjmp(s_crashJmpBuf, 1) == 0) {
        // Normal path.
        engine = runtime->deserializeCudaEngine(data, dataSize);
    } else {
        // A trapped crash long-jumped back here; report the signal.
        *outExceptionCode = static_cast<unsigned long>(s_crashSignal);
        engine = nullptr;
    }
    guard.restore();
    return engine;
#endif
}
/// Crash-safe wrapper for an arbitrary bool-returning function pointer.
/// Used to SEH-protect build() calls that may crash on bad ONNX models.
///
/// @param fn  function to invoke; receives @p ctx as its only argument.
/// @param ctx opaque caller-supplied context forwarded to @p fn.
/// @param outExceptionCode receives the exception/signal code on crash
///        (0 = OK; SEH code on Windows, signal number on POSIX).
/// @return fn(ctx), or false if a crash was trapped.
typedef bool (*BoolFuncPtr)(void* ctx);
static bool callBoolFuncSafe(BoolFuncPtr fn, void* ctx, unsigned long* outExceptionCode) {
#ifdef _WIN32
    *outExceptionCode = 0;
    // SEH frame: keep this scope free of C++ objects with destructors.
    __try {
        return fn(ctx);
    }
    __except (EXCEPTION_EXECUTE_HANDLER) {
        *outExceptionCode = GetExceptionCode();
        return false;
    }
#else
    *outExceptionCode = 0;
    CrashSignalGuard guard;
    guard.install();
    bool result = false;
    if (sigsetjmp(s_crashJmpBuf, 1) == 0) {
        // Normal path.
        result = fn(ctx);
    } else {
        // A trapped crash long-jumped back here; report the signal.
        *outExceptionCode = static_cast<unsigned long>(s_crashSignal);
        result = false;
    }
    guard.restore();
    return result;
#endif
}
/// Render a crash code as a human-readable string for log output.
/// Windows: "SEH exception 0xc0000005"  POSIX: "signal 11 (SIGSEGV)"
static std::string formatCrashCode(unsigned long code) {
    std::ostringstream out;
#ifdef _WIN32
    out << "SEH exception 0x" << std::hex << code << std::dec;
#else
    // Map the well-known trapped signals to their names; anything else
    // (should not happen — only four handlers are installed) is tagged
    // "(unknown)".
    const char* name = "(unknown)";
    if (code == SIGSEGV)      name = "(SIGSEGV)";
    else if (code == SIGBUS)  name = "(SIGBUS)";
    else if (code == SIGABRT) name = "(SIGABRT)";
    else if (code == SIGFPE)  name = "(SIGFPE)";
    out << "signal " << code << ' ' << name;
#endif
    return out.str();
}
/// Build (if necessary) and load a TensorRT engine for @p onnxModelPath.
///
/// High-level flow:
///  1. Cap maxBatchSize by GPU VRAM tier BEFORE computing the cached-engine
///     filename, so the cache lookup hits the file build() actually saves.
///  2. If the engine file exists, try to load it. On load failure, keep the
///     file when the failure looks VRAM-related; otherwise delete it and
///     rebuild from ONNX.
///  3. If the ONNX file is missing, fall back to an on-disk engine of the
///     other precision (FP16 <-> FP32) when one exists.
///  4. Otherwise pre-analyze the ONNX (batch dimension, dynamic spatial
///     dims), build via buildWithRetry() with an automatic FP16 -> FP32
///     retry, then load the freshly built engine.
///
/// @param onnxModelPath full path to the ONNX model file.
/// @param subVals/divVals/normalize preprocessing parameters forwarded
///        verbatim to loadNetwork().
/// @return true when an engine was successfully loaded.
template <typename T>
bool Engine<T>::buildLoadNetwork(std::string onnxModelPath, const std::array<float, 3>& subVals, const std::array<float, 3>& divVals,
    bool normalize)
{
    // -- GPU-tier batch cap (early) -------------------------------------------
    // Apply the same VRAM-based batch cap that build() uses BEFORE computing
    // the engine filename. Without this, the cache lookup uses the uncapped
    // batch size (e.g. b32), misses the file that was saved with the capped
    // size (e.g. b16), and triggers a needless full rebuild every launch.
    // The cap inside build() still runs later as a safety net (it will be a
    // no-op because the batch is already capped here).
    {
        cudaDeviceProp prop;
        // FIX: check the CUDA call; on failure 'prop' is uninitialized and the
        // tier selection below would be based on garbage. Skip capping instead.
        if (cudaGetDeviceProperties(&prop, m_options.deviceIndex) == cudaSuccess) {
            const size_t totalMiB = prop.totalGlobalMem / (1024ULL * 1024);
            int gpuMaxBatch;
            if (totalMiB >= 15800)      gpuMaxBatch = 32; // ~16 GiB+
            else if (totalMiB >= 11800) gpuMaxBatch = 16; // ~12 GiB
            else if (totalMiB >= 7900)  gpuMaxBatch = 8;  // ~ 8 GiB (batch=16 OCR ~987 MiB exec ctx, too large for 4 tasks)
            else if (totalMiB >= 3900)  gpuMaxBatch = 4;  // ~ 4 GiB
            else if (totalMiB >= 1900)  gpuMaxBatch = 2;  // ~ 2 GiB
            else                        gpuMaxBatch = 1;  // < 2 GiB
            if (m_options.maxBatchSize > gpuMaxBatch) {
                if (m_verbose) {
                    std::cout << "Info: GPU-tier early batch cap: "
                        << m_options.maxBatchSize << " -> " << gpuMaxBatch
                        << " (GPU has " << totalMiB << " MiB)" << std::endl;
                }
                m_options.maxBatchSize = gpuMaxBatch;
                m_options.optBatchSize = std::min(m_options.optBatchSize,
                    m_options.maxBatchSize);
            }
        }
    }
    // serializeEngineOptions() returns the full path of the cached engine.
    std::string engineName = serializeEngineOptions(m_options, onnxModelPath);
    std::string engineDir = m_options.engineFileDir;
    if (FileExist(engineName)) {
        if (m_verbose) { std::cout << "Engine file found: " << engineName << std::endl; }
        bool loadOk = loadNetwork(engineName, subVals, divVals, normalize);
        if (loadOk) {
            return true;
        }
        // Engine file exists but loadNetwork failed. Common causes:
        //  - createExecutionContext returned null (VRAM exhausted)
        //  - Incompatible TRT version or corrupt file
        //  - Partially written by another thread
        if (m_skipOnnxRebuild) {
            // Elastic growth / non-critical path — don't delete and rebuild.
            // Just fail gracefully; the pool continues with existing slots.
            return false;
        }
        // Check if the failure was due to VRAM exhaustion vs. corrupt file.
        // If VRAM was the reason, PRESERVE the engine file — it's valid, just
        // can't fit right now. Deleting it forces a full ONNX→TRT rebuild
        // (minutes) when VRAM becomes available later, instead of a fast load.
        //
        // Uses the m_lastLoadFailedVRAM flag set by loadNetwork() instead of
        // re-querying cudaMemGetInfo. The old approach had a TOCTOU race:
        // VRAM could be freed between loadNetwork's check and this re-check,
        // causing a valid engine file to be falsely classified as INVALID
        // and deleted. Also check current VRAM as a safety net.
        {
            size_t freeCheck = 0, totalCheck = 0;
            cudaMemGetInfo(&freeCheck, &totalCheck);
            constexpr size_t kMinFreeBytes = 256ULL * 1024 * 1024;
            if (m_lastLoadFailedVRAM || freeCheck < kMinFreeBytes) {
                return false; // Don't delete the file, don't try ONNX rebuild
            }
        }
        // Enough VRAM AND loadNetwork didn't flag VRAM as cause → file is
        // likely corrupt/incompatible. Delete and rebuild from ONNX.
        try { std::filesystem::remove(engineName); } catch (...) {}
        // Fall through to ONNX build path below
    }
    {
        if (!FileExist(engineName)) {
            // Demand-driven growth: if no cached engine exists, bail out rather
            // than triggering a full ONNX→TRT build (30-60s, massive VRAM).
            if (m_skipOnnxBuild) {
                return false;
            }
        }
        if (!FileExist(onnxModelPath)) {
            // ONNX model does not exist, try to find alternative precision engine
            std::cout << "Searching for alternative precision engine..." << std::endl;
            size_t lastDot = engineName.find_last_of('.');
            // FIX: guard against a filename with no extension. Previously
            // substr(0, npos + 1) == substr(0, 0) produced an empty stem and
            // silently mangled the alternative engine name.
            std::string stem = (lastDot == std::string::npos)
                ? engineName + "."
                : engineName.substr(0, lastDot + 1);
            std::string alternativeEngineName;
            ANSCENTER::Precision originalPrecision = m_options.precision;
            if (m_options.precision == ANSCENTER::Precision::FP16) {
                alternativeEngineName = stem + "fp32";
                m_options.precision = ANSCENTER::Precision::FP32;
                std::cout << " Looking for FP32 engine: " << alternativeEngineName << std::endl;
            }
            else {
                alternativeEngineName = stem + "fp16";
                m_options.precision = ANSCENTER::Precision::FP16;
                std::cout << " Looking for FP16 engine: " << alternativeEngineName << std::endl;
            }
            if (FileExist(alternativeEngineName)) {
                std::cout << "Found alternative precision engine: " << alternativeEngineName << std::endl;
                return loadNetwork(alternativeEngineName, subVals, divVals, normalize);
            }
            else {
                // Restore original precision
                m_options.precision = originalPrecision;
                std::cout << "Error: Neither ONNX model nor engine files exist for: " << onnxModelPath << std::endl;
                std::cout << " Searched for: " << engineName << std::endl;
                std::cout << " Searched for: " << alternativeEngineName << std::endl;
                return false;
            }
        }
        else {
            // Before building, check if an alternative precision engine already exists
            // (e.g., FP16 requested but a FP32 engine was built by a previous fallback)
            if (m_options.precision == ANSCENTER::Precision::FP16) {
                ANSCENTER::Options fp32Opts = m_options;
                fp32Opts.precision = ANSCENTER::Precision::FP32;
                std::string fp32EngineName = serializeEngineOptions(fp32Opts, onnxModelPath);
                if (FileExist(fp32EngineName)) {
                    std::cout << "FP16 engine not found, but FP32 engine exists: " << fp32EngineName << std::endl;
                    std::cout << "Loading existing FP32 engine..." << std::endl;
                    m_options.precision = ANSCENTER::Precision::FP32;
                    return loadNetwork(fp32EngineName, subVals, divVals, normalize);
                }
            }
            // ONNX model exists, generate engine
            std::cout << "========================================" << std::endl;
            std::cout << "Engine not found, generating from ONNX model" << std::endl;
            std::cout << "========================================" << std::endl;
            std::cout << "ONNX model: " << onnxModelPath << std::endl;
            std::cout << "Target engine: " << engineName << std::endl;
            if (!FolderExist(engineDir)) {
                std::cout << "Creating engine directory: " << engineDir << std::endl;
                std::filesystem::create_directories(engineDir);
            }
            // Read ONNX to determine if it supports dynamic batch.
            int32_t onnxBatchSize = -1;
            bool hasDynamicSpatialDims_onnx = false;
            std::cout << "\nAnalyzing ONNX model structure..." << std::endl;
            auto tempBuilder = std::unique_ptr<nvinfer1::IBuilder>(nvinfer1::createInferBuilder(m_logger));
            auto tempNetwork = std::unique_ptr<nvinfer1::INetworkDefinition>(TRT_CREATE_NETWORK(tempBuilder));
            auto tempParser = std::unique_ptr<nvonnxparser::IParser>(nvonnxparser::createParser(*tempNetwork, m_logger));
            std::ifstream onnxFile(onnxModelPath, std::ios::binary | std::ios::ate);
            // FIX: verify the stream opened and the size is sane before
            // allocating. tellg() returns -1 on a failed open, which was
            // previously passed straight to std::vector<char>(-1).
            if (!onnxFile.is_open()) {
                std::cout << "Error: Failed to read ONNX file" << std::endl;
                return false;
            }
            std::streamsize onnxSize = onnxFile.tellg();
            if (onnxSize <= 0) {
                std::cout << "Error: Failed to read ONNX file" << std::endl;
                return false;
            }
            onnxFile.seekg(0, std::ios::beg);
            std::vector<char> onnxBuffer(static_cast<size_t>(onnxSize));
            if (!onnxFile.read(onnxBuffer.data(), onnxSize)) {
                std::cout << "Error: Failed to read ONNX file" << std::endl;
                return false;
            }
            unsigned long sehPreAnalysis = 0;
            bool preParsed = parseOnnxModelSafe(tempParser.get(),
                onnxBuffer.data(), onnxBuffer.size(), &sehPreAnalysis);
            if (sehPreAnalysis != 0) {
                // The pre-analysis parse crashed; skip the analysis and let
                // the real build path surface any genuine model problems.
                // (FIX: this branch previously contained a stray pasted VCS
                // timestamp that did not compile.)
                std::cout << "Warning: ONNX pre-analysis crashed (code " << sehPreAnalysis
                    << "). Skipping pre-analysis, proceeding with build..." << std::endl;
            }
            else if (preParsed) {
                auto numInputs = tempNetwork->getNbInputs();
                std::cout << "ONNX Model Analysis:" << std::endl;
                std::cout << " Number of inputs: " << numInputs << std::endl;
                for (int32_t i = 0; i < numInputs; ++i) {
                    auto input = tempNetwork->getInput(i);
                    auto inputDims = input->getDimensions();
                    std::cout << " Input " << i << " (" << input->getName() << "): [";
                    for (int j = 0; j < inputDims.nbDims; ++j) {
                        if (j > 0) std::cout << ", ";
                        // -1 marks a dynamic dimension in TRT/ONNX dims
                        if (inputDims.d[j] == -1) {
                            std::cout << "dynamic";
                        }
                        else {
                            std::cout << inputDims.d[j];
                        }
                    }
                    std::cout << "]" << std::endl;
                }
                // Check first input's batch dimension
                auto firstInput = tempNetwork->getInput(0);
                auto firstInputDims = firstInput->getDimensions();
                onnxBatchSize = firstInputDims.d[0];
                // Detect dynamic spatial dimensions (drives the auto-retry
                // mechanism in buildWithRetry()).
                if (firstInputDims.nbDims >= 4) {
                    if (firstInputDims.d[2] == -1 || firstInputDims.d[3] == -1) {
                        hasDynamicSpatialDims_onnx = true;
                    }
                }
                std::cout << "\nBatch dimension analysis:" << std::endl;
                std::cout << " ONNX model batch dimension: ";
                if (onnxBatchSize == -1) {
                    std::cout << "dynamic (-1)" << std::endl;
                }
                else {
                    std::cout << onnxBatchSize << std::endl;
                }
                std::cout << " Current maxBatchSize setting: " << m_options.maxBatchSize << std::endl;
                std::cout << " Current optBatchSize setting: " << m_options.optBatchSize << std::endl;
                if (onnxBatchSize == -1) {
                    // Dynamic batch size model - keep user settings
                    std::cout << "\n✓ ONNX model supports DYNAMIC batch size" << std::endl;
                    std::cout << " Engine will support batch sizes 1 to " << m_options.maxBatchSize << std::endl;
                    std::cout << " Optimal batch size: " << m_options.optBatchSize << std::endl;
                    std::cout << " Keeping user-defined batch size configuration" << std::endl;
                }
                else if (onnxBatchSize > 0) {
                    // Fixed batch size model - engine options must match ONNX
                    std::cout << "\n⚠ WARNING: ONNX model has FIXED batch size of " << onnxBatchSize << std::endl;
                    std::cout << " Your model was exported with dynamic=False" << std::endl;
                    std::cout << " Engine will only support batch size " << onnxBatchSize << std::endl;
                    std::cout << " To use dynamic batching, re-export ONNX with dynamic=True" << std::endl;
                    std::cout << "\n Adjusting engine options to match ONNX model..." << std::endl;
                    m_options.optBatchSize = onnxBatchSize;
                    m_options.maxBatchSize = onnxBatchSize;
                    std::cout << " Updated optBatchSize: " << m_options.optBatchSize << std::endl;
                    std::cout << " Updated maxBatchSize: " << m_options.maxBatchSize << std::endl;
                    // Batch options changed → the cached filename changes too.
                    engineName = serializeEngineOptions(m_options, onnxModelPath);
                }
                else {
                    // Unexpected value
                    std::cout << "\n⚠ WARNING: Unexpected batch dimension value: " << onnxBatchSize << std::endl;
                    std::cout << " This may indicate an issue with the ONNX file" << std::endl;
                    std::cout << " Proceeding with user-defined settings" << std::endl;
                }
            }
            else {
                std::cout << "Warning: Failed to parse ONNX for pre-analysis. Proceeding with build..." << std::endl;
            }
            std::cout << "\n========================================" << std::endl;
            std::cout << "Starting Engine Build Process" << std::endl;
            std::cout << "========================================" << std::endl;
            std::cout << "This may take 10-20 minutes depending on model complexity..." << std::endl;
            std::cout << "Configuration:" << std::endl;
            std::cout << " Precision: " << (m_options.precision == ANSCENTER::Precision::FP16 ? "FP16" :
                m_options.precision == ANSCENTER::Precision::INT8 ? "INT8" : "FP32") << std::endl;
            std::cout << " Optimization Level: 5 (Maximum)" << std::endl;
            std::cout << " Batch Size Range: 1 to " << m_options.maxBatchSize << std::endl;
            std::cout << "========================================" << std::endl;
            // Build with auto-retry for dynamic spatial dimension models.
            // buildWithRetry() handles the ONNX pre-analysis internally and
            // reduces max spatial dims on OOM, falling back to smaller
            // profiles until build succeeds or all candidates are exhausted.
            // Fixed-spatial models get a single build() attempt.
            bool buildSuccess = buildWithRetry(onnxModelPath, subVals,
                divVals, normalize);
            // -- FP16 -> FP32 automatic fallback ---------------------------------
            // Some GPU architectures fail FP16 builds due to:
            //  - platformHasFastFp16() returning false (older GPUs)
            //  - kOBEY_PRECISION_CONSTRAINTS failing for mixed-precision layers
            //  - Insufficient VRAM for FP16 tactic optimization
            // When FP16 build fails, automatically retry with FP32 precision.
            if (!buildSuccess && m_options.precision == ANSCENTER::Precision::FP16) {
                std::cout << "\n========================================" << std::endl;
                std::cout << "FP16 Build Failed - Retrying with FP32" << std::endl;
                std::cout << "========================================" << std::endl;
                std::cout << "FP16 engine build failed on this GPU." << std::endl;
                std::cout << "Automatically falling back to FP32 precision..." << std::endl;
                std::cout << "========================================" << std::endl;
                m_options.precision = ANSCENTER::Precision::FP32;
                // Re-compute engine name for FP32 to avoid caching conflicts
                engineName = serializeEngineOptions(m_options, onnxModelPath);
                buildSuccess = buildWithRetry(onnxModelPath, subVals,
                    divVals, normalize);
                if (buildSuccess) {
                    std::cout << "\n========================================" << std::endl;
                    std::cout << "FP32 Fallback Build Successful!" << std::endl;
                    std::cout << "========================================" << std::endl;
                    std::cout << "Note: Engine is running in FP32 mode on this GPU." << std::endl;
                    std::cout << "Performance may be lower than FP16 but accuracy is preserved." << std::endl;
                    std::cout << "========================================" << std::endl;
                }
            }
            if (!buildSuccess) {
                std::cout << "\n========================================" << std::endl;
                std::cout << "Engine Build Failed!" << std::endl;
                std::cout << "========================================" << std::endl;
                std::cout << "Error: Failed to build engine from ONNX model" << std::endl;
                std::cout << "Possible causes:" << std::endl;
                std::cout << " 1. Insufficient GPU memory" << std::endl;
                std::cout << " 2. Unsupported ONNX operations" << std::endl;
                std::cout << " 3. Invalid batch size configuration" << std::endl;
                std::cout << " 4. Corrupted ONNX file" << std::endl;
                if (hasDynamicSpatialDims_onnx) {
                    std::cout << " 5. All spatial dimension fallbacks exhausted" << std::endl;
                }
                std::cout << " Note: Both FP16 and FP32 builds were attempted." << std::endl;
                std::cout << "\nTroubleshooting:" << std::endl;
                std::cout << " - Check GPU memory availability" << std::endl;
                std::cout << " - Try reducing maxBatchSize" << std::endl;
                std::cout << " - Verify ONNX model integrity" << std::endl;
                std::cout << " - Check TensorRT logs above for details" << std::endl;
                return false;
            }
            // build() may have capped maxBatchSize based on GPU VRAM, which
            // changes the serialized engine filename (e.g. b32 -> b8). Re-compute
            // so we load the file that build() actually saved.
            std::string actualEngineName = serializeEngineOptions(m_options, onnxModelPath);
            // After building, the engine should be saved, so load it
            std::cout << "\n========================================" << std::endl;
            std::cout << "Engine Build Complete - Loading Engine" << std::endl;
            std::cout << "========================================" << std::endl;
            if (FileExist(actualEngineName)) {
                std::cout << "Engine file created successfully: " << actualEngineName << std::endl;
                std::cout << "Loading engine into memory..." << std::endl;
                bool loadSuccess = loadNetwork(actualEngineName, subVals, divVals, normalize);
                if (loadSuccess) {
                    std::cout << "\n========================================" << std::endl;
                    std::cout << "✓ Engine Ready for Inference!" << std::endl;
                    std::cout << "========================================" << std::endl;
                    std::cout << "Configuration Summary:" << std::endl;
                    std::cout << " Engine File: " << actualEngineName << std::endl;
                    std::cout << " Batch Size Support: 1 to " << m_options.maxBatchSize << std::endl;
                    std::cout << " Precision: " << (m_options.precision == ANSCENTER::Precision::FP16 ? "FP16" :
                        m_options.precision == ANSCENTER::Precision::INT8 ? "INT8" : "FP32") << std::endl;
                    std::cout << "========================================" << std::endl;
                }
                return loadSuccess;
            }
            else {
                std::cout << "\n========================================" << std::endl;
                std::cout << "Engine Build Error!" << std::endl;
                std::cout << "========================================" << std::endl;
                std::cout << "Error: Engine file not found after build: " << actualEngineName << std::endl;
                std::cout << "Expected location: " << std::filesystem::absolute(actualEngineName) << std::endl;
                std::cout << "\nPossible causes:" << std::endl;
                std::cout << " 1. Build succeeded but save failed (disk full?)" << std::endl;
                std::cout << " 2. Incorrect engine directory permissions" << std::endl;
                std::cout << " 3. Engine filename mismatch" << std::endl;
                std::cout << "\nPlease check:" << std::endl;
                std::cout << " - Available disk space in: " << engineDir << std::endl;
                std::cout << " - Write permissions for engine directory" << std::endl;
                std::cout << " - TensorRT build logs above for warnings" << std::endl;
                return false;
            }
        }
    }
}
template <typename T>
bool Engine<T>::loadNetwork(std::string trtModelPath, const std::array<float, 3>& subVals, const std::array<float, 3>& divVals, bool normalize)
{
// Install a custom OpenCV CUDA allocator that uses cudaMallocAsync/cudaFreeAsync
// instead of the default cudaMalloc/cudaFree. The stream-ordered allocator
// respects the cudaMemPool release threshold (set to 0), so freed memory is
// returned to the GPU immediately instead of being cached forever.
//
// The default cudaMalloc/cudaFree allocator caches all freed blocks permanently
// (no API to force release), causing VRAM to grow monotonically when GpuMat
// objects of varying sizes are allocated and freed repeatedly (different batch
// sizes, different image resolutions across cameras).
{
static std::once_flag s_allocatorFlag;
std::call_once(s_allocatorFlag, []() {
// Set release threshold to 0 on all GPUs
int deviceCount = 0;
cudaGetDeviceCount(&deviceCount);
for (int d = 0; d < deviceCount; ++d) {
cudaMemPool_t pool = nullptr;
if (cudaDeviceGetDefaultMemPool(&pool, d) == cudaSuccess && pool) {
uint64_t threshold = 0;
cudaMemPoolSetAttribute(pool, cudaMemPoolAttrReleaseThreshold, &threshold);
}
}
// Custom allocator: uses cudaMallocAsync on stream 0 (behaves like
// synchronous cudaMalloc but goes through the stream-ordered pool).
struct AsyncAllocator : cv::cuda::GpuMat::Allocator {
bool allocate(cv::cuda::GpuMat* mat, int rows, int cols, size_t elemSize) override {
// Same logic as OpenCV's default allocator, but using cudaMallocAsync
size_t step = elemSize * cols;
// Align step to 256 bytes (same as default allocator)
step = (step + 255) & ~size_t(255);
void* ptr = nullptr;
cudaError_t err = cudaMallocAsync(&ptr, step * rows, nullptr); // stream 0
if (err != cudaSuccess || !ptr) {
// Fallback to regular cudaMalloc if async not supported
err = cudaMalloc(&ptr, step * rows);
if (err != cudaSuccess) return false;
}
mat->data = static_cast<uchar*>(ptr);
mat->step = step;
mat->refcount = static_cast<int*>(cv::fastMalloc(sizeof(int)));
*mat->refcount = 1;
return true;
}
void free(cv::cuda::GpuMat* mat) override {
cudaFreeAsync(mat->data, nullptr); // stream 0 — goes through pool with threshold=0
cv::fastFree(mat->refcount);
mat->data = nullptr;
mat->datastart = nullptr;
mat->dataend = nullptr;
mat->refcount = nullptr;
}
};
static AsyncAllocator s_allocator;
cv::cuda::GpuMat::setDefaultAllocator(&s_allocator);
ANS_DBG("TRT_Load", "Custom CUDA async allocator installed — VRAM freed immediately on GpuMat release");
});
}
m_lastLoadFailedVRAM = false; // reset on each load attempt
m_subVals = subVals;
m_divVals = divVals;
m_normalize = normalize;
// ============================================================================
// TRT ENGINE CACHE CHECK — skip file I/O + deserialization if already cached
// (Bypassed when m_skipEngineCache is true, e.g., during model optimization)
// ============================================================================
if (!m_skipEngineCache) {
auto cacheHit = TRTEngineCache::instance().tryGet(trtModelPath, m_options.deviceIndex);
if (cacheHit.engine) {
// Cache hit — reuse shared ICudaEngine (no deserialization, no file I/O)
m_context.reset();
m_engine.reset();
m_runtime.reset();
m_engine = cacheHit.engine;
m_runtime = cacheHit.runtime;
m_usingCachedEngine = true;
m_cachedEnginePath = trtModelPath;
m_cachedGpuIndex = m_options.deviceIndex;
// Still need to set GPU device for context + buffer allocation
cudaSetDevice(m_options.deviceIndex);
// Jump past file read + deserialization to context creation (below)
goto trt_cache_create_context;
}
}
// ============================================================================
// READ ENGINE FILE (cache miss path)
// ============================================================================
if (!Util::doesFileExist(trtModelPath)) {
return false;
}
if (m_verbose) { std::cout << "Loading TensorRT engine file at path: " << trtModelPath << std::endl; }
{
std::ifstream file(trtModelPath, std::ios::binary | std::ios::ate);
if (!file.is_open()) {
return false;
}
std::streamsize size = file.tellg();
if (size <= 0) {
return false;
}
file.seekg(0, std::ios::beg);
std::vector<char> buffer(size);
if (!file.read(buffer.data(), size)) {
return false;
}
if (m_verbose) { std::cout << "Engine file size: " << size / (1024 * 1024) << " MiB" << std::endl; }
// ============================================================================
// CREATE RUNTIME
// ============================================================================
// TRT requires: destroy context before engine, engine before runtime.
// If loadNetwork() is called more than once on the same instance, the
// previous objects must be torn down in the correct order before we
// create new ones.
m_context.reset();
m_engine.reset();
m_runtime.reset();
m_runtime = std::shared_ptr<nvinfer1::IRuntime>{ nvinfer1::createInferRuntime(m_logger) };
if (!m_runtime) {
return false;
}
// ============================================================================
// GPU SELECTION AND CONFIGURATION
// ============================================================================
int numGPUs = 0;
cudaGetDeviceCount(&numGPUs);
if (m_verbose) std::cout << "Info: Number of GPU devices: " << numGPUs << std::endl;
if (numGPUs == 0) {
std::cout << "Error: No CUDA-capable GPUs detected" << std::endl;
return false;
}
if (m_options.deviceIndex < 0 || m_options.deviceIndex >= numGPUs) {
std::cout << "Error: Invalid GPU index " << m_options.deviceIndex
<< ". Available GPUs: " << numGPUs << std::endl;
return false;
}
if (m_verbose) std::cout << "Info: Using GPU device index: " << m_options.deviceIndex << std::endl;
// Use yield mode to avoid busy-wait spinning that falsely reports 100% GPU utilization.
// Must be called before cudaSetDevice creates the CUDA context.
cudaSetDeviceFlags(cudaDeviceScheduleYield);
cudaError_t ret = cudaSetDevice(m_options.deviceIndex);
if (ret != cudaSuccess) {
std::cout << "Error: Unable to set GPU device index to " << m_options.deviceIndex << std::endl;
std::cout << "CUDA Error: " << cudaGetErrorString(ret) << std::endl;
return false;
}
// Get GPU properties
cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, m_options.deviceIndex);
// Set GPU device limits.
// Blackwell GPUs (GB200/B200 = SM 10.x, RTX 5090/5080 = SM 12.x) have
// deeper kernel-launch pipelines and benefit from a larger pending-launch
// queue. Using 8192 on Blackwell avoids throttling with heavily pipelined
// workloads; 2048 is sufficient for all earlier architectures.
{
const size_t pendingLaunches = (prop.major >= 10) ? 8192 : 2048;
cudaDeviceSetLimit(cudaLimitDevRuntimePendingLaunchCount, pendingLaunches);
if (m_verbose) std::cout << "Info: cudaLimitDevRuntimePendingLaunchCount = " << pendingLaunches
<< " (SM " << prop.major << "." << prop.minor << ")" << std::endl;
}
cudaDeviceSetLimit(cudaLimitStackSize, 8192);
cudaDeviceSetLimit(cudaLimitDevRuntimeSyncDepth, 2);
// Lock GPU clocks if requested (prevents power throttling on laptop GPUs)
if (m_options.gpuClockLockMHz != 0 && !m_clocksLocked) {
lockGpuClocks(m_options.deviceIndex, m_options.gpuClockLockMHz);
}
// -- VRAM safety check before engine deserialization -----------------------
// Reject early if the GPU doesn't have enough free VRAM to load the engine.
// This prevents slow degradation (unified memory fallback) or crashes
// (cudaMalloc failure during inference) when too many tasks are loaded.
{
size_t freeVRAM = 0, totalVRAM = 0;
cudaError_t memErr = cudaMemGetInfo(&freeVRAM, &totalVRAM);
constexpr size_t kMinFreeBytes = 256ULL * 1024 * 1024; // 256 MiB minimum
if (memErr != cudaSuccess) {
// cudaMemGetInfo failed — CUDA context may not be initialized on this thread.
// Don't reject: let TRT try to deserialize (it may succeed).
} else if (freeVRAM < kMinFreeBytes) {
m_lastLoadFailedVRAM = true; // signal to buildLoadNetwork: engine file is NOT corrupt
return false;
}
if (m_verbose) {
std::cout << "Info: GPU " << m_options.deviceIndex << " VRAM: "
<< (freeVRAM / (1024 * 1024)) << " MiB free / "
<< (totalVRAM / (1024 * 1024)) << " MiB total" << std::endl;
}
}
// ============================================================================
// DESERIALIZE ENGINE
// ============================================================================
if (m_verbose) std::cout << "Info: Deserializing TensorRT engine..." << std::endl;
unsigned long sehCodeDeserialize = 0;
m_engine = std::shared_ptr<nvinfer1::ICudaEngine>(
deserializeCudaEngineSafe(m_runtime.get(), buffer.data(),
buffer.size(), &sehCodeDeserialize));
if (sehCodeDeserialize != 0) {
return false;
}
if (!m_engine) {
return false;
}
if (m_verbose) std::cout << "Info: Engine deserialized successfully" << std::endl;
// ============================================================================
// CRITICAL: VERIFY ENGINE BATCH CAPABILITIES IMMEDIATELY
// ============================================================================
int numOptProfiles = m_engine->getNbOptimizationProfiles();
if (m_verbose) {
std::cout << "\n========================================" << std::endl;
std::cout << "ENGINE BATCH CAPABILITY VERIFICATION" << std::endl;
std::cout << "========================================" << std::endl;
std::cout << "Number of optimization profiles: " << numOptProfiles << std::endl;
}
bool engineSupportsDynamicBatch = false;
int actualMinBatch = 1;
int actualMaxBatch = 1;
if (numOptProfiles > 0) {
// Find the first input tensor to check batch support
for (int i = 0; i < m_engine->getNbIOTensors(); ++i) {
const char* tensorName = m_engine->getIOTensorName(i);
if (m_engine->getTensorIOMode(tensorName) == nvinfer1::TensorIOMode::kINPUT) {
auto minDims = m_engine->getProfileShape(tensorName, 0,
nvinfer1::OptProfileSelector::kMIN);
auto optDims = m_engine->getProfileShape(tensorName, 0,
nvinfer1::OptProfileSelector::kOPT);
auto maxDims = m_engine->getProfileShape(tensorName, 0,
nvinfer1::OptProfileSelector::kMAX);
actualMinBatch = minDims.d[0];
actualMaxBatch = maxDims.d[0];
// Store actual profile max spatial dims for runtime queries
if (maxDims.nbDims >= 4) {
m_profileMaxHeight = maxDims.d[2];
m_profileMaxWidth = maxDims.d[3];
}
if (actualMinBatch != actualMaxBatch) {
engineSupportsDynamicBatch = true;
}
if (m_verbose) {
std::cout << "\nInput tensor '" << tensorName << "' profile 0:" << std::endl;
std::cout << " Min: [" << minDims.d[0];
for (int d = 1; d < minDims.nbDims; ++d) std::cout << "," << minDims.d[d];
std::cout << "]" << std::endl;
std::cout << " Opt: [" << optDims.d[0];
for (int d = 1; d < optDims.nbDims; ++d) std::cout << "," << optDims.d[d];
std::cout << "]" << std::endl;
std::cout << " Max: [" << maxDims.d[0];
for (int d = 1; d < maxDims.nbDims; ++d) std::cout << "," << maxDims.d[d];
std::cout << "]" << std::endl;
if (actualMinBatch != actualMaxBatch)
std::cout << "\n✓ Engine supports DYNAMIC batching: "
<< actualMinBatch << " to " << actualMaxBatch << std::endl;
else
std::cout << "\n⚠ Engine has FIXED batch size: " << actualMinBatch << std::endl;
}
break; // Only need to check first input
}
}
}
else {
if (m_verbose) std::cout << "⚠️ No optimization profiles found" << std::endl;
// Check if batch dimension is dynamic via -1
auto firstTensorName = m_engine->getIOTensorName(0);
auto shape = m_engine->getTensorShape(firstTensorName);
if (shape.d[0] == -1) {
engineSupportsDynamicBatch = true;
actualMaxBatch = m_options.maxBatchSize;
if (m_verbose) std::cout << "Engine uses implicit dynamic batch (batch dim = -1)" << std::endl;
}
}
// CRITICAL CHECK: Verify engine can support requested batch sizes
if (!engineSupportsDynamicBatch && m_options.maxBatchSize > actualMaxBatch) {
std::cout << "\n🚨🚨🚨 CRITICAL ERROR 🚨🚨🚨" << std::endl;
std::cout << "Requested max batch size: " << m_options.maxBatchSize << std::endl;
std::cout << "Engine max batch size: " << actualMaxBatch << std::endl;
std::cout << "\nThis engine CANNOT support batch sizes larger than "
<< actualMaxBatch << "!" << std::endl;
std::cout << "\nYou have two options:" << std::endl;
std::cout << "1. Rebuild the engine with dynamic batch support:" << std::endl;
std::cout << " trtexec --onnx=model.onnx \\" << std::endl;
std::cout << " --minShapes=images:1x3x640x640 \\" << std::endl;
std::cout << " --optShapes=images:4x3x640x640 \\" << std::endl;
std::cout << " --maxShapes=images:32x3x640x640 \\" << std::endl;
std::cout << " --saveEngine=model_dynamic.engine --fp16" << std::endl;
std::cout << "\n2. Reduce maxBatchSize in your config to " << actualMaxBatch << std::endl;
std::cout << "========================================\n" << std::endl;
// Optionally fail here:
// return false;
// Or adjust maxBatchSize to match engine capability
if (m_verbose) std::cout << "⚠️ Auto-adjusting maxBatchSize from " << m_options.maxBatchSize
<< " to " << actualMaxBatch << std::endl;
m_options.maxBatchSize = actualMaxBatch;
}
if (m_verbose) std::cout << "========================================\n" << std::endl;
// Store in cache for future tasks loading the same model
if (!m_skipEngineCache) {
m_engine = TRTEngineCache::instance().putIfAbsent(
trtModelPath, m_options.deviceIndex, m_runtime, m_engine);
m_usingCachedEngine = true;
m_cachedEnginePath = trtModelPath;
m_cachedGpuIndex = m_options.deviceIndex;
}
} // end of cache-miss scope (closes the brace opened after cache check)
// ============================================================================
// CREATE EXECUTION CONTEXT (both cache-hit and cache-miss paths converge here)
// ============================================================================
trt_cache_create_context:
// These variables may not exist if we came from cache-hit path (goto skipped them).
// Re-derive from the (now valid) m_engine so both paths work.
{
int numOptProfiles = m_engine->getNbOptimizationProfiles();
bool engineSupportsDynamicBatch = false;
int actualMinBatch = 1;
int actualMaxBatch = 1;
if (numOptProfiles > 0) {
for (int i = 0; i < m_engine->getNbIOTensors(); ++i) {
const char* tn = m_engine->getIOTensorName(i);
if (m_engine->getTensorIOMode(tn) == nvinfer1::TensorIOMode::kINPUT) {
auto minDims = m_engine->getProfileShape(tn, 0, nvinfer1::OptProfileSelector::kMIN);
auto maxDims = m_engine->getProfileShape(tn, 0, nvinfer1::OptProfileSelector::kMAX);
actualMinBatch = minDims.d[0];
actualMaxBatch = maxDims.d[0];
engineSupportsDynamicBatch = (actualMinBatch != actualMaxBatch);
break;
}
}
}
if (actualMaxBatch > 0 && m_options.maxBatchSize > actualMaxBatch) {
m_options.maxBatchSize = actualMaxBatch;
}
m_context = std::unique_ptr<nvinfer1::IExecutionContext>(m_engine->createExecutionContext());
if (!m_context) {
    // 2026-04-04 20:19:54 +11:00  (stray blame-view timestamp from the raw dump; commented out so the file stays compilable)
ANS_DBG("TRT_Load", "ERROR: createExecutionContext returned null");
    // 2026-03-28 16:54:11 +11:00  (stray blame-view timestamp from the raw dump; commented out so the file stays compilable)
return false;
}
    // 2026-04-04 20:19:54 +11:00  (stray blame-view timestamp from the raw dump; commented out so the file stays compilable)
ANS_DBG("TRT_Load", "Execution context created OK for %s", trtModelPath.c_str());
    // 2026-03-28 16:54:11 +11:00  (stray blame-view timestamp from the raw dump; commented out so the file stays compilable)
if (m_verbose) std::cout << "Info: Execution context created successfully" << std::endl;
// ============================================================================
// BUFFER ALLOCATION
// ============================================================================
if (m_verbose) {
std::cout << "========================================" << std::endl;
std::cout << "Initializing Buffers" << std::endl;
std::cout << "========================================" << std::endl;
}
clearGpuBuffers();
m_buffers.resize(m_engine->getNbIOTensors());
m_outputLengths.clear();
m_inputDims.clear();
m_outputDims.clear();
m_IOTensorNames.clear();
m_hasDynamicSpatialDims = false;
// Check available GPU memory
size_t free_mem_initial, total_mem;
cudaMemGetInfo(&free_mem_initial, &total_mem);
if (m_verbose) {
std::cout << "GPU Memory before allocation: Free " << free_mem_initial / (1024 * 1024)
<< " MiB / Total " << total_mem / (1024 * 1024) << " MiB" << std::endl;
}
size_t totalAllocated = 0;
if (m_verbose) {
std::cout << "Engine batch configuration:" << std::endl;
std::cout << " Dynamic batch: " << (engineSupportsDynamicBatch ? "YES" : "NO") << std::endl;
std::cout << " Actual batch range: " << actualMinBatch << " to " << actualMaxBatch << std::endl;
std::cout << " Configured max batch size: " << m_options.maxBatchSize << std::endl;
std::cout << " Optimization profiles: " << numOptProfiles << std::endl;
}
// Allocate buffers for all I/O tensors
for (int i = 0; i < m_engine->getNbIOTensors(); ++i) {
const auto tensorName = m_engine->getIOTensorName(i);
m_IOTensorNames.emplace_back(tensorName);
const auto tensorType = m_engine->getTensorIOMode(tensorName);
const auto tensorShape = m_engine->getTensorShape(tensorName);
const auto tensorDataType = m_engine->getTensorDataType(tensorName);
if (tensorType == nvinfer1::TensorIOMode::kINPUT) {
if (m_verbose) std::cout << "\nInfo: Processing input tensor: " << tensorName << std::endl;
// Validate input type
if (tensorDataType != nvinfer1::DataType::kFLOAT) {
std::cout << "Error: Only float inputs are supported" << std::endl;
return false;
}
// Store input dimensions correctly (C, H, W - excluding batch)
m_inputDims.emplace_back(tensorShape.d[1], tensorShape.d[2], tensorShape.d[3]);
// Detect dynamic spatial dimensions (e.g., detection models with variable H/W)
if (tensorShape.d[2] == -1 || tensorShape.d[3] == -1) {
m_hasDynamicSpatialDims = true;
}
if (m_verbose) std::cout << " Input shape from engine: [" << tensorShape.d[0] << ", " << tensorShape.d[1]
<< ", " << tensorShape.d[2] << ", " << tensorShape.d[3] << "]" << std::endl;
// Calculate buffer size using actual max batch size from engine
// Dynamic dimensions (-1) are substituted with the configured max values
int32_t batchSize = (tensorShape.d[0] == -1) ? actualMaxBatch : tensorShape.d[0];
int32_t channels = tensorShape.d[1];
int32_t height = (tensorShape.d[2] == -1) ? m_options.maxInputHeight : tensorShape.d[2];
int32_t width = (tensorShape.d[3] == -1) ? m_options.maxInputWidth : tensorShape.d[3];
int64_t inputLength = static_cast<int64_t>(batchSize) * channels * height * width;
size_t requestedMemory = inputLength * sizeof(float);
if (m_verbose) std::cout << " Allocating for max batch size " << batchSize << ": "
<< requestedMemory / (1024 * 1024) << " MiB" << std::endl;
// Allocate GPU memory
cudaError_t err = cudaMalloc(&m_buffers[i], requestedMemory);
if (err != cudaSuccess) {
return false;
}
// Initialize to zero
cudaMemset(m_buffers[i], 0, requestedMemory);
totalAllocated += requestedMemory;
}
else if (tensorType == nvinfer1::TensorIOMode::kOUTPUT) {
if (m_verbose) std::cout << "\nInfo: Processing output tensor: " << tensorName << std::endl;
// Validate output type matches template parameter
if (tensorDataType == nvinfer1::DataType::kFLOAT && !std::is_same<float, T>::value) {
std::cout << "Error: Model output type is float, but template parameter is not float" << std::endl;
return false;
}
else if (tensorDataType == nvinfer1::DataType::kHALF && !std::is_same<__half, T>::value) {
std::cout << "Error: Model output type is half, but template parameter is not __half" << std::endl;
return false;
}
else if (tensorDataType == nvinfer1::DataType::kINT32 && !std::is_same<int32_t, T>::value) {
std::cout << "Error: Model output type is int32, but template parameter is not int32_t" << std::endl;
return false;
}
// Calculate output buffer size per batch element
int64_t outputLengthPerBatch = 1;
m_outputDims.push_back(tensorShape);
if (m_verbose) std::cout << " Output shape from engine: [" << tensorShape.d[0];
for (int j = 1; j < tensorShape.nbDims; ++j) {
if (m_verbose) std::cout << ", " << tensorShape.d[j];
int64_t dimSize = tensorShape.d[j];
if (dimSize <= 0) {
// Dynamic output dimension: use max input dims as upper bound
if (tensorShape.nbDims == 4) {
// NCHW: d[2]=H, d[3]=W
dimSize = (j == 2) ? m_options.maxInputHeight : m_options.maxInputWidth;
} else {
// Generic: use max input width as fallback for dynamic dims
dimSize = m_options.maxInputWidth;
}
if (dimSize <= 0) dimSize = 1; // Safety: avoid zero/negative
}
outputLengthPerBatch *= dimSize;
}
if (m_verbose) std::cout << "]" << std::endl;
// Store output length per batch element (excluding batch dimension)
m_outputLengths.push_back(outputLengthPerBatch);
// Allocate for actual max batch size from engine
size_t requestedMemory = outputLengthPerBatch * actualMaxBatch * sizeof(T);
if (m_verbose) std::cout << " Allocating for max batch size " << actualMaxBatch << ": "
<< requestedMemory / (1024 * 1024) << " MiB" << std::endl;
// Check if enough memory available
size_t free_mem, total_mem_check;
cudaMemGetInfo(&free_mem, &total_mem_check);
if (requestedMemory > free_mem) {
std::cout << "Error: Not enough GPU memory" << std::endl;
std::cout << " Requested: " << requestedMemory / (1024 * 1024) << " MiB" << std::endl;
std::cout << " Available: " << free_mem / (1024 * 1024) << " MiB" << std::endl;
return false;
}
// Allocate GPU memory
cudaError_t err = cudaMalloc(&m_buffers[i], requestedMemory);
if (err != cudaSuccess) {
return false;
}
// Initialize to zero
cudaMemset(m_buffers[i], 0, requestedMemory);
totalAllocated += requestedMemory;
}
else {
std::cout << "Error: Tensor is neither input nor output!" << std::endl;
return false;
}
}
    // 2026-04-04 20:19:54 +11:00  (stray blame-view timestamp from the raw dump; commented out so the file stays compilable)
{
size_t vramFree = 0, vramTotal = 0;
cudaMemGetInfo(&vramFree, &vramTotal);
ANS_DBG("TRT_Load", "Buffers allocated: %zuMB, VRAM: %zuMB used / %zuMB free / %zuMB total",
totalAllocated / (1024*1024),
(vramTotal - vramFree) / (1024*1024),
vramFree / (1024*1024),
vramTotal / (1024*1024));
}
    // 2026-03-28 16:54:11 +11:00  (stray blame-view timestamp from the raw dump; commented out so the file stays compilable)
if (m_verbose) std::cout << "\nInfo: Total GPU memory allocated: " << totalAllocated / (1024 * 1024) << " MiB" << std::endl;
// -- Pinned output buffers (CUDA graph prerequisite) -----------------------
// Invalidate any graphs captured by a previous loadNetwork() call on this instance.
for (auto& [bs, ge] : m_graphExecs) { if (ge) cudaGraphExecDestroy(ge); }
m_graphExecs.clear();
// Free any previously allocated pinned buffers.
for (T* p : m_pinnedOutputBuffers) { if (p) cudaFreeHost(p); }
m_pinnedOutputBuffers.clear();
m_pinnedOutputBufElems.clear();
// Allocate one flat pinned buffer per output tensor, sized for
// actualMaxBatch x outputLength elements. Stable host addresses enable
// CUDA graph capture of D2H copies. If any allocation fails, disable
// graph acceleration gracefully and fall back to the original code path.
//
// Previously disabled for OpenCV 4.13+ because cv::cuda::split on the null
// stream threw cudaErrorStreamCaptureUnsupported (-217). Now safe because
// blobFromGpuMats runs on m_inferenceStream and finishes BEFORE graph capture.
m_pinnedOutputBuffers.resize(m_outputLengths.size(), nullptr);
m_pinnedOutputBufElems.resize(m_outputLengths.size(), 0);
bool pinnedOk = true;
for (size_t i = 0; i < m_outputLengths.size(); ++i) {
const size_t nElems = static_cast<size_t>(m_outputLengths[i])
* static_cast<size_t>(actualMaxBatch);
if (cudaMallocHost(reinterpret_cast<void**>(&m_pinnedOutputBuffers[i]),
nElems * sizeof(T)) != cudaSuccess) {
pinnedOk = false;
break;
}
m_pinnedOutputBufElems[i] = nElems;
}
if (!pinnedOk) {
std::cout << "Warning: cudaMallocHost failed -- CUDA graph acceleration disabled."
<< std::endl;
for (T* p : m_pinnedOutputBuffers) { if (p) cudaFreeHost(p); }
m_pinnedOutputBuffers.clear();
m_pinnedOutputBufElems.clear();
} else {
if (m_verbose) std::cout << "Info: Pinned output buffers allocated -- CUDA graph acceleration enabled."
<< std::endl;
}
// Check final memory state
size_t free_mem_final, total_mem_final;
cudaMemGetInfo(&free_mem_final, &total_mem_final);
if (m_verbose) std::cout << "GPU Memory after allocation: Free " << free_mem_final / (1024 * 1024)
<< " MiB / Total " << total_mem_final / (1024 * 1024) << " MiB" << std::endl;
// Ensure all pending GPU operations (cudaMalloc, memcpy, etc.) complete
// before we begin inference on this engine.
cudaDeviceSynchronize();
// ============================================================================
// CONTEXT OPTIMIZATION
// ============================================================================
if (m_verbose) {
std::cout << "========================================" << std::endl;
std::cout << "Context Optimization" << std::endl;
std::cout << "========================================" << std::endl;
}
// Create temporary stream for context setup
cudaStream_t setupStream;
cudaStreamCreate(&setupStream);
// Check and set optimization profile
if (m_verbose) std::cout << "Info: Engine has " << numOptProfiles << " optimization profile(s)" << std::endl;
if (numOptProfiles > 0) {
int selectedProfile = 0;
if (m_verbose) std::cout << "Info: Using optimization profile " << selectedProfile
<< " (actual range: batch " << actualMinBatch << " to " << actualMaxBatch << ")" << std::endl;
// Set optimization profile FIRST
bool profileSet = m_context->setOptimizationProfileAsync(selectedProfile, setupStream);
if (!profileSet) {
std::cout << "Error: Failed to set optimization profile" << std::endl;
cudaStreamDestroy(setupStream);
return false;
}
// Wait for profile to be set
cudaStreamSynchronize(setupStream);
if (m_verbose) std::cout << "Info: Optimization profile " << selectedProfile << " set successfully" << std::endl;
}
// Set input shapes and bind buffers
for (int i = 0; i < m_engine->getNbIOTensors(); ++i) {
const auto tensorName = m_engine->getIOTensorName(i);
const auto tensorMode = m_engine->getTensorIOMode(tensorName);
// Set tensor address for both input and output
if (!m_context->setTensorAddress(tensorName, m_buffers[i])) {
std::cout << "Error: Failed to set tensor address for " << tensorName << std::endl;
cudaStreamDestroy(setupStream);
return false;
}
if (tensorMode == nvinfer1::TensorIOMode::kINPUT) {
auto dims = m_engine->getTensorShape(tensorName);
if (m_verbose) {
std::cout << "Info: Input tensor '" << tensorName << "' engine shape: [";
for (int j = 0; j < dims.nbDims; ++j) {
if (j > 0) std::cout << ", ";
std::cout << dims.d[j];
}
std::cout << "]" << std::endl;
}
// For dynamic batch engines, set shape to minimum for initialization
if (dims.d[0] == -1 || numOptProfiles > 0) {
nvinfer1::Dims inputDims = dims;
inputDims.d[0] = actualMinBatch; // Use actual min from engine
// Set height if dynamic
if (inputDims.d[2] == -1) {
inputDims.d[2] = m_options.optInputHeight;
}
// Set width if dynamic
if (inputDims.d[3] == -1) {
inputDims.d[3] = m_options.optInputWidth;
}
if (!m_context->setInputShape(tensorName, inputDims)) {
std::cout << "Error: Failed to set input shape for " << tensorName << std::endl;
cudaStreamDestroy(setupStream);
return false;
}
if (m_verbose) {
std::cout << "Info: Set initial input shape to [" << inputDims.d[0] << ", "
<< inputDims.d[1] << ", " << inputDims.d[2] << ", "
<< inputDims.d[3] << "] (for warmup)" << std::endl;
std::cout << " Actual batch size will be set at inference time" << std::endl;
}
}
}
}
// Verify all dimensions are specified
if (!m_context->allInputDimensionsSpecified()) {
std::cout << "Error: Not all input dimensions specified after setup" << std::endl;
// Debug: Show which dimensions are missing
for (int i = 0; i < m_engine->getNbIOTensors(); ++i) {
const auto tensorName = m_engine->getIOTensorName(i);
if (m_engine->getTensorIOMode(tensorName) == nvinfer1::TensorIOMode::kINPUT) {
auto dims = m_context->getTensorShape(tensorName);
std::cout << " " << tensorName << " shape: [";
for (int j = 0; j < dims.nbDims; ++j) {
if (j > 0) std::cout << ", ";
std::cout << dims.d[j];
}
std::cout << "]" << std::endl;
}
}
cudaStreamDestroy(setupStream);
return false;
}
if (m_verbose) {
std::cout << "Info: All input dimensions specified correctly" << std::endl;
std::cout << "Info: All tensor addresses bound successfully" << std::endl;
}
// Disable profiling for production
m_context->setEnqueueEmitsProfile(false);
if (m_verbose) std::cout << "Info: Enqueue profile emissions disabled (production mode)" << std::endl;
// Clean up setup stream
cudaStreamSynchronize(setupStream);
cudaStreamDestroy(setupStream);
// ============================================================================
// CREATE PERSISTENT INFERENCE AND MEMORY STREAMS
// ============================================================================
// Creating streams here (once, at load time) rather than lazily in
// runInference() removes the hot-path "if (!m_streamInitialized)" branch
// and ensures warmUp() already runs on the real inference stream.
if (!m_streamInitialized) {
int leastPriority, greatestPriority;
cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority);
cudaError_t streamErr = cudaStreamCreateWithPriority(
&m_inferenceStream, cudaStreamNonBlocking, greatestPriority);
if (streamErr != cudaSuccess) {
std::cout << "Error: Failed to create inference stream: "
<< cudaGetErrorString(streamErr) << std::endl;
return false;
}
streamErr = cudaStreamCreate(&m_memoryStream);
if (streamErr != cudaSuccess) {
std::cout << "Error: Failed to create memory stream: "
<< cudaGetErrorString(streamErr) << std::endl;
return false;
}
m_streamInitialized = true;
if (m_verbose) {
std::cout << "Info: Inference stream created at load time with highest priority" << std::endl;
std::cout << "Info: Memory stream created" << std::endl;
}
}
// ============================================================================
// PRE-WARMUP DIAGNOSTICS
// ============================================================================
if (m_verbose) {
std::cout << "\n========================================" << std::endl;
std::cout << "Pre-Warmup Diagnostics" << std::endl;
std::cout << "========================================" << std::endl;
std::cout << "Engine has " << m_engine->getNbIOTensors() << " I/O tensors" << std::endl;
std::cout << "Engine has " << m_engine->getNbOptimizationProfiles() << " optimization profiles" << std::endl;
for (int i = 0; i < m_engine->getNbIOTensors(); ++i) {
const auto tensorName = m_engine->getIOTensorName(i);
const auto tensorMode = m_engine->getTensorIOMode(tensorName);
const auto tensorShape = m_context->getTensorShape(tensorName);
std::cout << "\nTensor " << i << ": " << tensorName << std::endl;
std::cout << " Mode: " << (tensorMode == nvinfer1::TensorIOMode::kINPUT ? "INPUT" : "OUTPUT") << std::endl;
std::cout << " Shape: [";
for (int j = 0; j < tensorShape.nbDims; ++j) {
if (j > 0) std::cout << ", ";
std::cout << tensorShape.d[j];
}
std::cout << "]" << std::endl;
std::cout << " Buffer address: " << m_buffers[i] << std::endl;
}
std::cout << "\nContext state check:" << std::endl;
std::cout << " All dimensions specified: " << (m_context->allInputDimensionsSpecified() ? "YES" : "NO") << std::endl;
std::cout << "========================================" << std::endl;
}
if (!m_context->allInputDimensionsSpecified()) {
std::cout << "ERROR: Cannot proceed with warmup - dimensions not specified!" << std::endl;
return false;
}
// ============================================================================
// ENGINE LOADED SUCCESSFULLY
// ============================================================================
if (m_verbose) {
std::cout << "\n========================================" << std::endl;
std::cout << "Engine loaded successfully!" << std::endl;
std::cout << "========================================" << std::endl;
}
// ============================================================================
// WARMUP
// ============================================================================
if (m_verbose) std::cout << "\nInfo: Starting warm-up inference..." << std::endl;
warmUp(m_verbose ? 10 : 1);
if (m_verbose) std::cout << "Info: Warm-up complete" << std::endl;
} // end of trt_cache_create_context scope
return true;
}
template <typename T>
bool Engine<T>::build(std::string onnxModelPath, const std::array<float, 3>& subVals, const std::array<float, 3>& divVals, bool normalize) {
const auto engineName = serializeEngineOptions(m_options, onnxModelPath);
if (FileExist(engineName)) {
std::cout << "Engine file already exists: " << engineName << std::endl;
return true;
}
if (!FileExist(onnxModelPath)) {
std::cout << "Error: ONNX model file does not exist: " << onnxModelPath << std::endl;
return false;
}
std::cout << "========================================" << std::endl;
std::cout << "Building TensorRT Engine" << std::endl;
std::cout << "========================================" << std::endl;
std::cout << "TensorRT Version: " << NV_TENSORRT_MAJOR << "."
<< NV_TENSORRT_MINOR << "." << NV_TENSORRT_PATCH << std::endl;
// TensorRT 10+ detection
#if NV_TENSORRT_MAJOR >= 10
std::cout << "\n⚠ TensorRT " << NV_TENSORRT_MAJOR << "." << NV_TENSORRT_MINOR
<< " detected - will apply dynamic batch optimization flags" << std::endl;
#endif
// Create our engine builder.
auto builder = std::unique_ptr<nvinfer1::IBuilder>(nvinfer1::createInferBuilder(m_logger));
if (!builder) {
std::cout << "Error: Failed to create builder" << std::endl;
return false;
}
auto network = std::unique_ptr<nvinfer1::INetworkDefinition>(TRT_CREATE_NETWORK(builder));
if (!network) {
std::cout << "Error: Failed to create network" << std::endl;
return false;
}
// Create a parser for reading the onnx file.
auto parser = std::unique_ptr<nvonnxparser::IParser>(nvonnxparser::createParser(*network, m_logger));
if (!parser) {
std::cout << "Error: Failed to create parser" << std::endl;
return false;
}
// Read the onnx file into memory
std::ifstream file(onnxModelPath, std::ios::binary | std::ios::ate);
std::streamsize size = file.tellg();
file.seekg(0, std::ios::beg);
std::vector<char> buffer(size);
if (!file.read(buffer.data(), size)) {
std::cout << "Error: Unable to read ONNX file" << std::endl;
return false;
}
std::cout << "ONNX model size: " << size / (1024 * 1024) << " MiB" << std::endl;
// Parse the buffer we read into memory (crash-safe).
std::cout << "Parsing ONNX model..." << std::endl;
unsigned long sehCodeParse = 0;
auto parsed = parseOnnxModelSafe(parser.get(), buffer.data(),
buffer.size(), &sehCodeParse);
if (sehCodeParse != 0) {
return false;
}
if (!parsed) {
std::cout << "Error: Failed to parse ONNX model" << std::endl;
for (int32_t i = 0; i < parser->getNbErrors(); ++i) {
std::cout << " " << parser->getError(i)->desc() << std::endl;
}
return false;
}
std::cout << "ONNX model parsed successfully" << std::endl;
// ============================================================================
// ENHANCED ONNX MODEL ANALYSIS
// ============================================================================
std::cout << "\n========================================" << std::endl;
std::cout << "ONNX Model Analysis" << std::endl;
std::cout << "========================================" << std::endl;
const auto numInputs = network->getNbInputs();
if (numInputs < 1) {
std::cout << "Error: Model needs at least 1 input!" << std::endl;
return false;
}
std::cout << "Number of inputs: " << numInputs << std::endl;
// Analyze all inputs
for (int32_t i = 0; i < numInputs; ++i) {
const auto input = network->getInput(i);
const auto inputDims = input->getDimensions();
std::cout << "\nInput [" << i << "] '" << input->getName() << "':" << std::endl;
std::cout << " Dimensions: [";
for (int j = 0; j < inputDims.nbDims; ++j) {
if (j > 0) std::cout << ", ";
if (inputDims.d[j] == -1) {
std::cout << "DYNAMIC";
}
else {
std::cout << inputDims.d[j];
}
}
std::cout << "]" << std::endl;
// Check batch dimension
if (inputDims.d[0] == -1) {
std::cout << " ✓ Batch dimension: DYNAMIC" << std::endl;
}
else {
std::cout << " ✗ Batch dimension: FIXED at " << inputDims.d[0] << std::endl;
}
// Check height dimension (if applicable)
if (inputDims.nbDims >= 3 && inputDims.d[2] == -1) {
std::cout << " ✓ Height dimension: DYNAMIC" << std::endl;
}
else if (inputDims.nbDims >= 3) {
std::cout << " • Height dimension: FIXED at " << inputDims.d[2] << std::endl;
}
// Check width dimension (if applicable)
if (inputDims.nbDims >= 4 && inputDims.d[3] == -1) {
std::cout << " ✓ Width dimension: DYNAMIC" << std::endl;
}
else if (inputDims.nbDims >= 4) {
std::cout << " • Width dimension: FIXED at " << inputDims.d[3] << std::endl;
}
}
// Ensure that all the inputs have the same batch size
const auto input0Batch = network->getInput(0)->getDimensions().d[0];
for (int32_t i = 1; i < numInputs; ++i) {
if (network->getInput(i)->getDimensions().d[0] != input0Batch) {
std::cout << "\nError: Model has multiple inputs with differing batch sizes!" << std::endl;
return false;
}
}
// Check to see if the model supports dynamic batch size or not
bool doesSupportDynamicBatch = false;
if (input0Batch == -1) {
doesSupportDynamicBatch = true;
std::cout << "\n✓ Model supports DYNAMIC batch size" << std::endl;
std::cout << " Batch size range: min=1, opt=" << m_options.optBatchSize
<< ", max=" << m_options.maxBatchSize << std::endl;
}
else {
std::cout << "\n✗ Model only supports FIXED batch size of " << input0Batch << std::endl;
std::cout << " WARNING: This will limit batch processing performance!" << std::endl;
std::cout << " Consider re-exporting ONNX with dynamic batch axis." << std::endl;
// Adjust batch size options to match model's fixed batch size
if (m_options.optBatchSize != input0Batch || m_options.maxBatchSize != input0Batch) {
std::cout << " Adjusting batch size options to match model's fixed batch size" << std::endl;
m_options.optBatchSize = input0Batch;
m_options.maxBatchSize = input0Batch;
}
}
// Check for dynamic width and height dimensions
const auto inputHeight = network->getInput(0)->getDimensions().d[2];
const auto inputWidth = network->getInput(0)->getDimensions().d[3];
bool doesSupportDynamicHeight = false;
bool doesSupportDynamicWidth = false;
// Check height dimension
if (inputHeight == -1) {
doesSupportDynamicHeight = true;
std::cout << "\n✓ Model supports DYNAMIC height" << std::endl;
if (m_options.optInputHeight == -1) {
std::cout << " No user-configured height found, using default: 640" << std::endl;
m_options.minInputHeight = 640;
m_options.optInputHeight = 640;
m_options.maxInputHeight = 640;
}
else {
std::cout << " Using user-configured height: " << m_options.optInputHeight << std::endl;
}
}
else {
std::cout << "\n• Model has FIXED height: " << inputHeight << std::endl;
m_options.minInputHeight = m_options.optInputHeight = m_options.maxInputHeight = inputHeight;
}
// Check width dimension
if (inputWidth == -1) {
doesSupportDynamicWidth = true;
std::cout << "✓ Model supports DYNAMIC width" << std::endl;
if (m_options.optInputWidth == -1) {
std::cout << " No user-configured width found, using default: 640" << std::endl;
m_options.minInputWidth = 640;
m_options.optInputWidth = 640;
m_options.maxInputWidth = 640;
}
else {
std::cout << " Using user-configured width: " << m_options.optInputWidth << std::endl;
}
}
else {
std::cout << "• Model has FIXED width: " << inputWidth << std::endl;
m_options.minInputWidth = m_options.optInputWidth = m_options.maxInputWidth = inputWidth;
}
std::cout << "\nFinal input dimensions configured:" << std::endl;
std::cout << " Height: " << m_options.optInputHeight << std::endl;
std::cout << " Width: " << m_options.optInputWidth << std::endl;
std::cout << "========================================" << std::endl;
auto config = std::unique_ptr<nvinfer1::IBuilderConfig>(builder->createBuilderConfig());
if (!config) {
std::cout << "Error: Failed to create builder config" << std::endl;
return false;
}
// ============================================================================
// PERFORMANCE OPTIMIZATIONS
// ============================================================================
std::cout << "\n========================================" << std::endl;
std::cout << "Configuring Performance Settings" << std::endl;
std::cout << "========================================" << std::endl;
// Get GPU properties for the target device (not always GPU 0)
cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, m_options.deviceIndex);
std::cout << "Building engine for GPU " << m_options.deviceIndex << ": " << prop.name << std::endl;
std::cout << "Compute Capability: " << prop.major << "." << prop.minor << std::endl;
std::cout << "Total GPU Memory: " << prop.totalGlobalMem / (1024 * 1024) << " MiB" << std::endl;
size_t free_mem, total_mem;
cudaMemGetInfo(&free_mem, &total_mem);
const size_t totalMiB = total_mem / (1024ULL * 1024);
// -- GPU-tier adaptive configuration --------------------------------------
// All performance parameters scale with GPU VRAM to avoid OOM on small
// GPUs while maximising throughput on larger ones.
//
// VRAM | Workspace | Opt Level | Max Batch | Tactic DRAM
// ------------|-----------|-----------|-----------|-------------------
// <= 1 GiB | 256 MiB | 3 | 1 | up to 2 GiB cap
// <= 2 GiB | 512 MiB | 3 | 2 | up to 2 GiB cap
// <= 4 GiB | 1 GiB | 3 | 4 | up to 2 GiB cap
// <= 6 GiB | 2 GiB | 3 | 8 | up to 2 GiB cap
// <= 8 GiB | 2 GiB | 3 | 16 | up to 2 GiB cap
// <=12 GiB | 2 GiB | 3 | 16 | up to 2 GiB cap
// <=16 GiB | 8 GiB | 5 | 32 | up to 4 GiB cap
// <=24 GiB | 8 GiB | 5 | 32 | up to 4 GiB cap
// > 24 GiB | 16 GiB | 5 | 32 | up to 8 GiB cap
// -- 1. Workspace size ----------------------------------------------------
size_t max_workspace;
const char* tierLabel;
if (totalMiB > 24576) { // > 24 GiB
max_workspace = 16ULL * 1024 * 1024 * 1024;
tierLabel = "high-end (>24 GiB)";
} else if (totalMiB > 12288) { // > 12 GiB
max_workspace = 8ULL * 1024 * 1024 * 1024;
tierLabel = "desktop (>12 GiB)";
} else if (totalMiB > 4096) { // > 4 GiB
max_workspace = 2ULL * 1024 * 1024 * 1024;
tierLabel = "laptop (4-12 GiB)";
} else if (totalMiB > 2048) { // > 2 GiB
max_workspace = 1ULL * 1024 * 1024 * 1024;
tierLabel = "low-end (2-4 GiB)";
} else if (totalMiB > 1024) { // > 1 GiB
max_workspace = 512ULL * 1024 * 1024;
tierLabel = "minimal (1-2 GiB)";
} else { // <= 1 GiB
max_workspace = 256ULL * 1024 * 1024;
tierLabel = "ultra-low (<=1 GiB)";
}
size_t workspace_size = std::min(max_workspace, static_cast<size_t>(free_mem * 0.4));
config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, workspace_size);
std::cout << "Workspace size set to: " << workspace_size / (1024 * 1024)
<< " MiB (" << tierLabel << " tier)" << std::endl;
// -- 2. Max batch size cap ------------------------------------------------
// The model config sets the *desired* maxBatchSize; the GPU VRAM
// determines the *actual* cap. This affects the optimisation profile
// range, warmup, and runtime chunk splitting.
// Thresholds use ~97% of marketing size to account for OS/driver reserved
// memory (e.g. an "8 GB" GPU reports 8187 MiB).
if (doesSupportDynamicBatch && m_options.maxBatchSize > 1) {
int gpuMaxBatch;
if (totalMiB >= 15800) gpuMaxBatch = 32; // ~16 GiB (e.g. 16384 -> reports ~15900+)
else if (totalMiB >= 11800) gpuMaxBatch = 16; // ~12 GiB (e.g. 12288 -> reports ~11800+)
else if (totalMiB >= 7900) gpuMaxBatch = 8; // ~ 8 GiB (e.g. 8192 -> reports ~8100+; batch=16 OCR ~987 MiB too large for 4 tasks)
else if (totalMiB >= 3900) gpuMaxBatch = 4; // ~ 4 GiB (e.g. 4096 -> reports ~3950+)
else if (totalMiB >= 1900) gpuMaxBatch = 2; // ~ 2 GiB (e.g. 2048 -> reports ~1950+)
else gpuMaxBatch = 1; // < 2 GiB
const int prevMax = m_options.maxBatchSize;
m_options.maxBatchSize = std::min(m_options.maxBatchSize, gpuMaxBatch);
m_options.optBatchSize = std::min(m_options.optBatchSize, m_options.maxBatchSize);
if (prevMax != m_options.maxBatchSize) {
std::cout << "Max batch size capped by GPU VRAM: " << prevMax
<< " -> " << m_options.maxBatchSize
<< " (GPU has " << totalMiB << " MiB)" << std::endl;
}
std::cout << "Batch config: opt=" << m_options.optBatchSize
<< ", max=" << m_options.maxBatchSize << std::endl;
}
// -- 3. Optimisation level ------------------------------------------------
// Level 5 (exhaustive kernel search) only on GPUs with ≥16 GiB where
// the tactic DRAM pool can hold the largest tactics. On smaller GPUs,
// level 3 gives ~95 % of the runtime performance with dramatically
// shorter build times.
// Level 3 = balanced (best tradeoff: fast build, near-optimal kernels)
// Level 5 = exhaustive (10x slower build for ~1-3% faster inference)
// Use level 3 for all GPUs — the marginal runtime gain from level 5
// is not worth the 10-30 minute build time on first run.
const int optLevel = 3;
config->setBuilderOptimizationLevel(optLevel);
std::cout << "Builder optimization level set to " << optLevel
<< " (balanced)" << std::endl;
// Enable TF32 for Ampere and newer GPUs
if (prop.major >= 8) {
config->setFlag(nvinfer1::BuilderFlag::kTF32);
std::cout << "TF32 enabled for Ampere/Ada/Blackwell architecture" << std::endl;
}
// Enable optimization flags
// kPREFER_PRECISION_CONSTRAINTS removed: deprecated in TRT 10.12, no-op in TRT 10.15.1.
config->setFlag(nvinfer1::BuilderFlag::kGPU_FALLBACK);
std::cout << "Optimization flags enabled" << std::endl;
// kDIRECT_IO removed: deprecated in TRT 10.7 as "Unneeded API".
// TRT 10.7+ enables this behaviour automatically; the flag is a no-op in TRT 10.15.1.
// Enable all available tactic sources
uint32_t tacticSources = 1U << static_cast<uint32_t>(nvinfer1::TacticSource::kCUBLAS) |
1U << static_cast<uint32_t>(nvinfer1::TacticSource::kCUBLAS_LT) |
1U << static_cast<uint32_t>(nvinfer1::TacticSource::kCUDNN);
if (prop.major >= 8) {
tacticSources |= 1U << static_cast<uint32_t>(nvinfer1::TacticSource::kEDGE_MASK_CONVOLUTIONS);
tacticSources |= 1U << static_cast<uint32_t>(nvinfer1::TacticSource::kJIT_CONVOLUTIONS);
std::cout << "Enhanced tactic sources enabled for Ampere+ architecture" << std::endl;
}
config->setTacticSources(tacticSources);
// kDETAILED profiling embeds per-layer metadata in the engine and adds measurable
// build/inference overhead. Use kNONE for production; switch to kDETAILED or
// kLAYER_NAMES_ONLY only when profiling with Nsight Systems / trt-exec --profilingVerbosity.
config->setProfilingVerbosity(nvinfer1::ProfilingVerbosity::kNONE);
// Set timing iterations
config->setAvgTimingIterations(4);
std::cout << "Timing iterations set to 4 for stable kernel selection" << std::endl;
// Set hardware compatibility
config->setHardwareCompatibilityLevel(nvinfer1::HardwareCompatibilityLevel::kNONE);
// -- TensorRT 10+ tactic DRAM pool ----------------------------------------
// Separate scratch pool for kernel-selection during build. Without this,
// tactic evaluation competes with the workspace allocation and tactics
// requesting >1 GiB get skipped, causing hours of wasted fallback searches.
//
// Strategy: give the tactic pool as much memory as possible while reserving
// enough for workspace + builder overhead. The cap scales with GPU VRAM:
// <=12 GiB -> up to 2 GiB (most tactics fit within 1.5 GiB)
// <=24 GiB -> up to 4 GiB (room for larger model tactics)
// > 24 GiB -> up to 8 GiB (future-proof for very large models)
#if NV_TENSORRT_MAJOR >= 10
{
// Scale the tactic cap by GPU VRAM -- larger GPUs can afford more
size_t tacticCap;
if (totalMiB > 24576) tacticCap = 8ULL * 1024 * 1024 * 1024; // > 24 GiB
else if (totalMiB > 12288) tacticCap = 4ULL * 1024 * 1024 * 1024; // > 12 GiB
else tacticCap = 2ULL * 1024 * 1024 * 1024; // <= 12 GiB
// Reserve workspace + 512 MiB safety margin for builder internals
const size_t reserveForBuild = workspace_size + (512ULL * 1024 * 1024);
const size_t availableForTactic =
(free_mem > reserveForBuild) ? (free_mem - reserveForBuild) : 0ULL;
size_t tacticMemory = std::min(tacticCap, availableForTactic);
// kTACTIC_DRAM requires a power-of-2 size; floor to nearest power of 2
if (tacticMemory > 0) {
size_t p = 1ULL;
while (p * 2 <= tacticMemory) p *= 2;
tacticMemory = p;
}
config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kTACTIC_DRAM, tacticMemory);
std::cout << "kTACTIC_DRAM pool: " << tacticMemory / (1024 * 1024) << " MiB (TRT 10+)" << std::endl;
}
#endif
// -- kSTRONGLY_TYPED (TRT 8.5 - 9.x only) --------------------------------
// This flag existed in TRT 8.5 through 9.x to opt into strict type
// enforcement. NVIDIA removed the enum in TRT 10.0 because strongly-typed
// networks became the default behaviour -- setting it on TRT 10+ produces a
// compile error ("undeclared identifier"). For TRT 10+ simply log a note.
#if NV_TENSORRT_MAJOR < 10
if (m_options.precision != ANSCENTER::Precision::INT8) {
config->setFlag(nvinfer1::BuilderFlag::kSTRONGLY_TYPED);
std::cout << "kSTRONGLY_TYPED enabled (TRT 8.5-9.x, FP32/FP16 mode)" << std::endl;
}
#else
// TRT 10+: strongly-typed networks are the default; no flag required.
std::cout << "Info: Strongly-typed mode is default in TRT 10+ (kSTRONGLY_TYPED removed)" << std::endl;
#endif
// -- kFASTER_DYNAMIC_SHAPES ------------------------------------------------
// This flag reduces context-reshape overhead when batch size changes between
// calls (10-100x faster switching, ~5% larger engine). It was added in a
// TRT 10 minor release but the exact version varies by NVIDIA build; the
// enum is absent from the installed headers so it is disabled here.
// To re-enable: uncomment the block below once you confirm your TRT version
// exposes nvinfer1::BuilderFlag::kFASTER_DYNAMIC_SHAPES.
//
// if (doesSupportDynamicBatch && m_options.maxBatchSize > 1) {
// config->setFlag(nvinfer1::BuilderFlag::kFASTER_DYNAMIC_SHAPES);
// std::cout << "kFASTER_DYNAMIC_SHAPES enabled" << std::endl;
// }
// -- kWEIGHT_STREAMING (TRT 10+) ------------------------------------------
// DISABLED: kWEIGHT_STREAMING requires INetworkDefinition::setStronglyTyped(true)
// to be called on the network before buildSerializedNetwork(), which is not done
// for ONNX-imported networks in this code path. BuilderFlag::kSTRONGLY_TYPED was
// removed from TRT 10+ (compile error), so there is no flag-level workaround.
// Re-enable only if the ONNX parser layer is updated to call setStronglyTyped(true).
// #if NV_TENSORRT_MAJOR >= 10
// config->setFlag(nvinfer1::BuilderFlag::kWEIGHT_STREAMING);
// std::cout << "kWEIGHT_STREAMING enabled (TRT 10+)" << std::endl;
// #endif
// ============================================================================
// TENSORRT 10+ DYNAMIC BATCH SUMMARY
// ============================================================================
#if NV_TENSORRT_MAJOR >= 10
std::cout << "\nTensorRT " << NV_TENSORRT_MAJOR << "." << NV_TENSORRT_MINOR << "." << NV_TENSORRT_PATCH
<< " | dynamic batch: " << (doesSupportDynamicBatch && m_options.maxBatchSize > 1 ? "YES" : "NO")
<< " | max batch: " << m_options.maxBatchSize
<< " | opt level: " << optLevel
<< " | GPU VRAM: " << totalMiB << " MiB" << std::endl;
#endif
// Load timing cache if available (use actual engine name -- batch may have been capped above)
const auto currentEngineName = serializeEngineOptions(m_options, onnxModelPath);
std::string timingCachePath = currentEngineName + ".timing.cache";
std::vector<char> timingCache;
std::ifstream timingCacheFile(timingCachePath, std::ios::binary);
if (timingCacheFile.good()) {
timingCacheFile.seekg(0, std::ios::end);
timingCache.resize(timingCacheFile.tellg());
timingCacheFile.seekg(0, std::ios::beg);
timingCacheFile.read(timingCache.data(), timingCache.size());
auto cache = config->createTimingCache(timingCache.data(), timingCache.size());
if (cache) {
config->setTimingCache(*cache, false);
std::cout << "Loaded timing cache from: " << timingCachePath << std::endl;
std::cout << " Cache size: " << timingCache.size() / 1024 << " KiB" << std::endl;
}
}
else {
std::cout << "No existing timing cache found (this is normal for first build)" << std::endl;
}
// ============================================================================
// OPTIMIZATION PROFILE CONFIGURATION
// ============================================================================
std::cout << "\n========================================" << std::endl;
std::cout << "Configuring Optimization Profiles" << std::endl;
std::cout << "========================================" << std::endl;
// Validate batch size options
if (doesSupportDynamicBatch) {
if (m_options.optBatchSize < 1) {
std::cout << "Warning: optBatchSize < 1, setting to 1" << std::endl;
m_options.optBatchSize = 1;
}
if (m_options.maxBatchSize < m_options.optBatchSize) {
std::cout << "Warning: maxBatchSize < optBatchSize, adjusting maxBatchSize" << std::endl;
m_options.maxBatchSize = m_options.optBatchSize;
}
std::cout << "Dynamic batch configuration validated:" << std::endl;
std::cout << " Min batch size: 1" << std::endl;
std::cout << " Opt batch size: " << m_options.optBatchSize << std::endl;
std::cout << " Max batch size: " << m_options.maxBatchSize << std::endl;
}
// Create optimization profile
nvinfer1::IOptimizationProfile* optProfile = builder->createOptimizationProfile();
if (!optProfile) {
std::cout << "Error: Failed to create optimization profile" << std::endl;
return false;
}
for (int32_t i = 0; i < numInputs; ++i) {
const auto input = network->getInput(i);
const auto inputName = input->getName();
const auto inputDims = input->getDimensions();
int32_t inputC = inputDims.d[1];
int32_t inputH = inputDims.d[2];
int32_t inputW = inputDims.d[3];
// Use configured values for height
int32_t minInputHeight = doesSupportDynamicHeight ? m_options.minInputHeight : inputH;
int32_t optInputHeight = doesSupportDynamicHeight ? m_options.optInputHeight : inputH;
int32_t maxInputHeight = doesSupportDynamicHeight ? m_options.maxInputHeight : inputH;
// Use configured values for width
int32_t minInputWidth = doesSupportDynamicWidth ? m_options.minInputWidth : inputW;
int32_t optInputWidth = doesSupportDynamicWidth ? m_options.optInputWidth : inputW;
int32_t maxInputWidth = doesSupportDynamicWidth ? m_options.maxInputWidth : inputW;
// Create dimension objects
int32_t minBatch = doesSupportDynamicBatch ? 1 : m_options.optBatchSize;
int32_t optBatch = doesSupportDynamicBatch ? m_options.optBatchSize : m_options.optBatchSize;
int32_t maxBatch = doesSupportDynamicBatch ? m_options.maxBatchSize : m_options.maxBatchSize;
nvinfer1::Dims4 minDims(minBatch, inputC, minInputHeight, minInputWidth);
nvinfer1::Dims4 optDims(optBatch, inputC, optInputHeight, optInputWidth);
nvinfer1::Dims4 maxDims(maxBatch, inputC, maxInputHeight, maxInputWidth);
std::cout << "\nSetting profile for input '" << inputName << "':" << std::endl;
std::cout << " MIN: [" << minDims.d[0] << "," << minDims.d[1] << ","
<< minDims.d[2] << "," << minDims.d[3] << "]" << std::endl;
std::cout << " OPT: [" << optDims.d[0] << "," << optDims.d[1] << ","
<< optDims.d[2] << "," << optDims.d[3] << "]" << std::endl;
std::cout << " MAX: [" << maxDims.d[0] << "," << maxDims.d[1] << ","
<< maxDims.d[2] << "," << maxDims.d[3] << "]" << std::endl;
// Set the dimensions with error checking
bool minSet = optProfile->setDimensions(inputName, nvinfer1::OptProfileSelector::kMIN, minDims);
bool optSet = optProfile->setDimensions(inputName, nvinfer1::OptProfileSelector::kOPT, optDims);
bool maxSet = optProfile->setDimensions(inputName, nvinfer1::OptProfileSelector::kMAX, maxDims);
if (!minSet || !optSet || !maxSet) {
std::cout << " ✗ ERROR: Failed to set profile dimensions!" << std::endl;
std::cout << " minSet: " << (minSet ? "OK" : "FAILED") << std::endl;
std::cout << " optSet: " << (optSet ? "OK" : "FAILED") << std::endl;
std::cout << " maxSet: " << (maxSet ? "OK" : "FAILED") << std::endl;
return false;
}
std::cout << " ✓ Profile dimensions set successfully" << std::endl;
}
// Validate the profile
std::cout << "\n========================================" << std::endl;
std::cout << "VALIDATING OPTIMIZATION PROFILE" << std::endl;
std::cout << "========================================" << std::endl;
bool profileValid = optProfile->isValid();
std::cout << "Profile validation result: " << (profileValid ? "✓ VALID" : "✗ INVALID") << std::endl;
if (!profileValid) {
std::cout << "ERROR: Profile is invalid! Cannot continue." << std::endl;
std::cout << "This usually means the min/opt/max dimensions are inconsistent." << std::endl;
return false;
}
// Verify what we actually set
for (int32_t i = 0; i < numInputs; ++i) {
const auto input = network->getInput(i);
const auto inputName = input->getName();
auto minDims = optProfile->getDimensions(inputName, nvinfer1::OptProfileSelector::kMIN);
auto optDims = optProfile->getDimensions(inputName, nvinfer1::OptProfileSelector::kOPT);
auto maxDims = optProfile->getDimensions(inputName, nvinfer1::OptProfileSelector::kMAX);
std::cout << "\nVerified profile for input '" << inputName << "':" << std::endl;
std::cout << " MIN: [" << minDims.d[0] << "," << minDims.d[1] << ","
<< minDims.d[2] << "," << minDims.d[3] << "]" << std::endl;
std::cout << " OPT: [" << optDims.d[0] << "," << optDims.d[1] << ","
<< optDims.d[2] << "," << optDims.d[3] << "]" << std::endl;
std::cout << " MAX: [" << maxDims.d[0] << "," << maxDims.d[1] << ","
<< maxDims.d[2] << "," << maxDims.d[3] << "]" << std::endl;
// Check batch dimension range
if (minDims.d[0] != maxDims.d[0]) {
std::cout << " ✓ Profile IS DYNAMIC (batch " << minDims.d[0]
<< " to " << maxDims.d[0] << ")" << std::endl;
}
else {
std::cout << " • Profile IS FIXED at batch " << minDims.d[0] << std::endl;
if (doesSupportDynamicBatch && m_options.maxBatchSize > 1) {
std::cout << "\n🚨 CRITICAL ERROR: ONNX supports dynamic batch but profile is fixed!" << std::endl;
return false;
}
}
}
std::cout << "========================================" << std::endl;
// Add the validated profile
config->addOptimizationProfile(optProfile);
int32_t numProfiles = config->getNbOptimizationProfiles();
std::cout << "\n✓ Optimization profile added successfully" << std::endl;
std::cout << " Total profiles in config: " << numProfiles << std::endl;
if (doesSupportDynamicBatch && m_options.maxBatchSize > 1) {
std::cout << " ✓ Profile covers DYNAMIC batch range: 1 to " << m_options.maxBatchSize << std::endl;
}
else {
std::cout << " • Profile has FIXED batch size: " << m_options.maxBatchSize << std::endl;
}
// ============================================================================
// PRECISION CONFIGURATION
// ============================================================================
std::cout << "\n========================================" << std::endl;
std::cout << "Configuring Precision" << std::endl;
std::cout << "========================================" << std::endl;
if (m_options.precision == ANSCENTER::Precision::FP16) {
if (!builder->platformHasFastFp16()) {
std::cout << "Error: GPU does not support FP16 precision" << std::endl;
return false;
}
config->setFlag(nvinfer1::BuilderFlag::kFP16);
std::cout << "FP16 precision enabled" << std::endl;
// Mixed precision safety: force numerically sensitive layers to FP32.
// Some models (e.g. PP-OCRv5 det) produce NaN when certain layers
// run in FP16 due to overflow in intermediate accumulators. Forcing
// these layers to FP32 has negligible performance impact while
// preventing NaN corruption.
//
// Targeted layer types:
// - kREDUCE : accumulation overflows FP16 max (65504)
// - kELEMENTWISE/Pow: large intermediate values
// - kNORMALIZATION : mean/variance reduction + 1/sqrt overflow
// - kSOFTMAX : exp() extremely sensitive to precision
// - kACTIVATION/Sigmoid: 1/(1+exp(-x)) overflows for large |x|
// - kUNARY/Exp,Log : exp overflows for x>~11, log underflows
//
// IMPORTANT: setPrecision() is only a HINT without kOBEY_PRECISION_CONSTRAINTS.
// We must set this flag so TRT strictly respects our per-layer FP32 overrides.
// (kPREFER_PRECISION_CONSTRAINTS is deprecated/no-op in TRT 10.12+;
// kOBEY means build FAILS if no FP32 kernel exists — better than silent NaN.)
int fp32Overrides = 0;
const int numLayers = network->getNbLayers();
// --- Diagnostic: enumerate all layer types in this network ---
std::map<std::string, int> layerTypeCounts;
auto layerTypeName = [](nvinfer1::LayerType t) -> std::string {
switch (t) {
case nvinfer1::LayerType::kCONVOLUTION: return "Convolution";
case nvinfer1::LayerType::kCAST: return "Cast";
case nvinfer1::LayerType::kACTIVATION: return "Activation";
case nvinfer1::LayerType::kPOOLING: return "Pooling";
case nvinfer1::LayerType::kLRN: return "LRN";
case nvinfer1::LayerType::kSCALE: return "Scale";
case nvinfer1::LayerType::kSOFTMAX: return "Softmax";
case nvinfer1::LayerType::kDECONVOLUTION: return "Deconvolution";
case nvinfer1::LayerType::kCONCATENATION: return "Concatenation";
case nvinfer1::LayerType::kELEMENTWISE: return "ElementWise";
case nvinfer1::LayerType::kPLUGIN: return "Plugin";
case nvinfer1::LayerType::kUNARY: return "Unary";
case nvinfer1::LayerType::kPADDING: return "Padding";
case nvinfer1::LayerType::kSHUFFLE: return "Shuffle";
case nvinfer1::LayerType::kREDUCE: return "Reduce";
case nvinfer1::LayerType::kTOPK: return "TopK";
case nvinfer1::LayerType::kGATHER: return "Gather";
case nvinfer1::LayerType::kMATRIX_MULTIPLY: return "MatrixMultiply";
case nvinfer1::LayerType::kCONSTANT: return "Constant";
case nvinfer1::LayerType::kIDENTITY: return "Identity";
case nvinfer1::LayerType::kSLICE: return "Slice";
case nvinfer1::LayerType::kSHAPE: return "Shape";
case nvinfer1::LayerType::kRESIZE: return "Resize";
case nvinfer1::LayerType::kSELECT: return "Select";
case nvinfer1::LayerType::kFILL: return "Fill";
case nvinfer1::LayerType::kQUANTIZE: return "Quantize";
case nvinfer1::LayerType::kDEQUANTIZE: return "Dequantize";
case nvinfer1::LayerType::kSCATTER: return "Scatter";
case nvinfer1::LayerType::kEINSUM: return "Einsum";
case nvinfer1::LayerType::kGRID_SAMPLE: return "GridSample";
case nvinfer1::LayerType::kNMS: return "NMS";
case nvinfer1::LayerType::kNORMALIZATION: return "Normalization";
case nvinfer1::LayerType::kSQUEEZE: return "Squeeze";
case nvinfer1::LayerType::kUNSQUEEZE: return "Unsqueeze";
default: return "Unknown(" + std::to_string(static_cast<int>(t)) + ")";
}
};
for (int i = 0; i < numLayers; ++i) {
auto* layer = network->getLayer(i);
const auto ltype = layer->getType();
bool needsFP32 = false;
switch (ltype) {
case nvinfer1::LayerType::kREDUCE:
needsFP32 = true;
break;
case nvinfer1::LayerType::kELEMENTWISE:
{
// Only force Pow to FP32; Add/Mul/etc. are fine in FP16
auto* ew = static_cast<nvinfer1::IElementWiseLayer*>(layer);
if (ew->getOperation() == nvinfer1::ElementWiseOperation::kPOW) {
needsFP32 = true;
}
break;
}
case nvinfer1::LayerType::kNORMALIZATION:
needsFP32 = true;
break;
case nvinfer1::LayerType::kSOFTMAX:
needsFP32 = true;
break;
case nvinfer1::LayerType::kACTIVATION:
{
// Sigmoid is 1/(1+exp(-x)) — exp overflows FP16 for large |x|
auto* act = static_cast<nvinfer1::IActivationLayer*>(layer);
if (act->getActivationType() == nvinfer1::ActivationType::kSIGMOID) {
needsFP32 = true;
}
break;
}
case nvinfer1::LayerType::kUNARY:
{
// Exp overflows FP16 for x > ~11; Log underflows for tiny values
auto* un = static_cast<nvinfer1::IUnaryLayer*>(layer);
const auto op = un->getOperation();
if (op == nvinfer1::UnaryOperation::kEXP ||
op == nvinfer1::UnaryOperation::kLOG) {
needsFP32 = true;
}
break;
}
default:
break;
}
// Track layer type for diagnostic summary
std::string name = layerTypeName(ltype);
if (needsFP32) name += " [FP32]";
layerTypeCounts[name]++;
if (needsFP32) {
layer->setPrecision(nvinfer1::DataType::kFLOAT);
for (int o = 0; o < layer->getNbOutputs(); ++o) {
layer->setOutputType(o, nvinfer1::DataType::kFLOAT);
}
++fp32Overrides;
}
}
// Print layer type summary
std::cout << " Network layer types (" << numLayers << " total):" << std::endl;
for (const auto& kv : layerTypeCounts) {
std::cout << " " << kv.first << ": " << kv.second << std::endl;
}
if (fp32Overrides > 0) {
// Enforce per-layer precision constraints — without this flag,
// setPrecision(kFLOAT) is merely a hint that TRT can ignore.
config->setFlag(nvinfer1::BuilderFlag::kOBEY_PRECISION_CONSTRAINTS);
std::cout << " Mixed precision: " << fp32Overrides
<< " / " << numLayers
<< " layers forced to FP32"
<< std::endl;
std::cout << " kOBEY_PRECISION_CONSTRAINTS enabled to enforce FP32 on marked layers"
<< std::endl;
}
}
else if (m_options.precision == ANSCENTER::Precision::INT8) {
if (numInputs > 1) {
std::cout << "Error: This implementation currently only supports INT8 for single input models" << std::endl;
return false;
}
if (!builder->platformHasFastInt8()) {
std::cout << "Error: GPU does not support INT8 precision" << std::endl;
return false;
}
if (m_options.calibrationDataDirectoryPath.empty()) {
std::cout << "Error: INT8 precision requires calibration data directory path" << std::endl;
return false;
}
config->setFlag(nvinfer1::BuilderFlag::kINT8);
std::cout << "INT8 precision enabled" << std::endl;
const auto input = network->getInput(0);
const auto inputName = input->getName();
const auto inputDims = input->getDimensions();
const auto calibrationFileName = currentEngineName + ".calibration";
m_calibrator = std::make_unique<Int8EntropyCalibrator2>(m_options.calibrationBatchSize, inputDims.d[3], inputDims.d[2],
m_options.calibrationDataDirectoryPath, calibrationFileName, inputName,
subVals, divVals, normalize);
config->setInt8Calibrator(m_calibrator.get());
}
else {
// FP32 mode - do NOT enable kFP16 flag; some models (e.g. PP-OCRv5 det)
// produce NaN when TRT silently promotes layers to FP16.
std::cout << "FP32 precision (strict, no FP16 fallback)" << std::endl;
}
// ============================================================================
// BUILD ENGINE
// ============================================================================
std::cout << "\n========================================" << std::endl;
std::cout << "Building Engine" << std::endl;
std::cout << "========================================" << std::endl;
cudaStream_t profileStream;
Util::checkCudaErrorCode(cudaStreamCreate(&profileStream));
config->setProfileStream(profileStream);
std::cout << "Building engine... This may take several minutes." << std::endl;
std::cout << "Progress will be shown as layers are optimized..." << std::endl;
if (doesSupportDynamicBatch && m_options.maxBatchSize > 1) {
std::cout << "✓ Building with DYNAMIC batch support (1-" << m_options.maxBatchSize << ")" << std::endl;
}
else {
std::cout << "• Building with FIXED batch size " << m_options.maxBatchSize << std::endl;
}
// Build the engine (crash-safe)
auto startTime = std::chrono::high_resolution_clock::now();
unsigned long sehCodeBuild = 0;
std::unique_ptr<nvinfer1::IHostMemory> plan{
buildSerializedNetworkSafe(builder.get(), *network, *config, &sehCodeBuild)
};
auto endTime = std::chrono::high_resolution_clock::now();
if (sehCodeBuild != 0) {
Util::checkCudaErrorCode(cudaStreamDestroy(profileStream));
return false;
}
if (!plan) {
std::cout << "\n========================================" << std::endl;
std::cout << "Build Failed!" << std::endl;
std::cout << "========================================" << std::endl;
std::cout << "Error: Failed to build engine." << std::endl;
Util::checkCudaErrorCode(cudaStreamDestroy(profileStream));
return false;
}
auto buildTime = std::chrono::duration_cast<std::chrono::seconds>(endTime - startTime).count();
std::cout << "\n========================================" << std::endl;
std::cout << "Build Successful!" << std::endl;
std::cout << "========================================" << std::endl;
std::cout << "Build time: " << buildTime << " seconds (" << buildTime / 60 << " minutes)" << std::endl;
// Write the engine to disk.
// Re-compute the filename because build() may have capped maxBatchSize
// (e.g. b32 -> b8), so the saved file must match the actual config.
const auto actualEngineName = serializeEngineOptions(m_options, onnxModelPath);
const auto enginePath = std::filesystem::path(m_options.engineFileDir) / actualEngineName;
std::ofstream outfile(enginePath, std::ofstream::binary);
if (!outfile) {
std::cout << "Error: Failed to open file for writing: " << enginePath << std::endl;
Util::checkCudaErrorCode(cudaStreamDestroy(profileStream));
return false;
}
outfile.write(reinterpret_cast<const char*>(plan->data()), plan->size());
outfile.close();
std::cout << "Engine saved to: " << enginePath.string() << std::endl;
std::cout << "Engine size: " << plan->size() / (1024 * 1024) << " MiB" << std::endl;
if (doesSupportDynamicBatch && m_options.maxBatchSize > 1) {
std::cout << "✓ Engine supports DYNAMIC batch sizes: 1 to " << m_options.maxBatchSize << std::endl;
}
else {
std::cout << "• Engine supports FIXED batch size: " << m_options.maxBatchSize << std::endl;
}
// Save timing cache
auto timingCacheFromConfig = config->getTimingCache();
if (timingCacheFromConfig) {
auto timingCacheData = timingCacheFromConfig->serialize();
if (timingCacheData) {
std::ofstream timingCacheOut(timingCachePath, std::ios::binary);
if (timingCacheOut) {
timingCacheOut.write(static_cast<const char*>(timingCacheData->data()), timingCacheData->size());
timingCacheOut.close();
std::cout << "Timing cache saved to: " << timingCachePath << std::endl;
std::cout << " Cache size: " << timingCacheData->size() / 1024 << " KiB" << std::endl;
}
}
}
Util::checkCudaErrorCode(cudaStreamDestroy(profileStream));
std::cout << "\n========================================" << std::endl;
std::cout << "Build Complete!" << std::endl;
std::cout << "========================================" << std::endl;
if (doesSupportDynamicBatch && m_options.maxBatchSize > 1) {
std::cout << "\n✓ Engine supports batch inference (1-" << m_options.maxBatchSize << " images)" << std::endl;
}
return true;
}
// ============================================================================
// buildSafe()
//
// SEH wrapper around build(). Cannot use __try in a function with C++
// destructors, so the actual build() call is forwarded through a plain-C
// function pointer via callBoolFuncSafe().
// ============================================================================
// POD context forwarded through the crash-safe dispatcher (SEH on Windows,
// signal/longjmp on Linux). Only trivially-copyable fields belong here:
// the Windows SEH wrapper cannot share a scope with C++ destructors.
//
// Lifetime: enginePtr, onnxPath, subVals and divVals are NON-owning raw
// pointers and must remain valid for the duration of the wrapped call.
//
// Fix: all members now carry default initializers so a context that is
// default- or partially-initialized never exposes indeterminate values
// (previously every field was left uninitialized — UB if a caller missed
// one). The struct remains an aggregate in C++14/17, so existing
// member-by-member and brace initialization both keep working.
struct BuildSafeCtx_Base {
    void*        enginePtr   = nullptr; // Engine<T>*, type-erased for the C-style trampoline
    const char*  onnxPath    = nullptr; // ONNX model path bytes
    size_t       onnxPathLen = 0;       // byte length of onnxPath
    const float* subVals     = nullptr; // 3 per-channel subtraction values (read [0..2])
    const float* divVals     = nullptr; // 3 per-channel division values (read [0..2])
    bool         normalize   = false;   // normalization flag forwarded to build()
    bool         result      = false;   // scratch result slot; not read by the trampoline
};
// Plain-C-compatible trampoline: unpacks the POD context prepared by
// Engine<T>::buildSafe() and invokes the real C++ build() on the target
// engine instance. All C++ objects (string/array copies) live here, on
// the far side of the SEH/signal boundary, where destructors are legal.
template <typename T>
static bool buildSafe_trampoline(void* ctx) {
    auto* base = static_cast<BuildSafeCtx_Base*>(ctx);
    auto* self = static_cast<Engine<T>*>(base->enginePtr);
    // Rebuild owned C++ values from the raw pointer/length fields.
    const std::string modelPath(base->onnxPath, base->onnxPathLen);
    const std::array<float, 3> sub{ base->subVals[0], base->subVals[1], base->subVals[2] };
    const std::array<float, 3> div{ base->divVals[0], base->divVals[1], base->divVals[2] };
    return self->build(modelPath, sub, div, base->normalize);
}
// Crash-safe front end for build(): marshals every argument into a flat,
// destructor-free context and dispatches through the plain-C wrapper
// (SEH on Windows, signal/longjmp on Linux). On a caught crash,
// *outSehCode receives the exception code / signal number; 0 means the
// call completed normally and the returned bool is build()'s result.
template <typename T>
bool Engine<T>::buildSafe(std::string onnxModelPath,
    const std::array<float, 3>& subVals,
    const std::array<float, 3>& divVals,
    bool normalize,
    unsigned long* outSehCode)
{
    // Aggregate-initialize the context in one shot; the string and arrays
    // referenced here outlive the wrapped call (they are parameters of
    // this function), so the raw pointers stay valid throughout.
    BuildSafeCtx_Base ctx{
        this,                   // enginePtr
        onnxModelPath.c_str(),  // onnxPath
        onnxModelPath.size(),   // onnxPathLen
        subVals.data(),         // subVals
        divVals.data(),         // divVals
        normalize,              // normalize
        false                   // result (unused)
    };
    return callBoolFuncSafe(&buildSafe_trampoline<T>, &ctx, outSehCode);
}
// ============================================================================
// buildWithRetry()
//
// Wraps build() with auto-retry for dynamic spatial dimension models.
// Pre-analyzes the ONNX model to detect dynamic H/W dims, then builds a
// fallback chain (max → 75% → 56% → ... → 640 → 320). Each candidate
// calls build(), which checks for a cached engine first (fast) then tries
// building if no cache exists. Fixed-spatial models skip retry.
// ============================================================================
template <typename T>
bool Engine<T>::buildWithRetry(std::string onnxModelPath,
                               const std::array<float, 3>& subVals,
                               const std::array<float, 3>& divVals,
                               bool normalize)
{
    // ------------------------------------------------------------------------
    // Pre-analysis: parse the ONNX model once (crash-safe) to find out whether
    // the first input has dynamic spatial dims (H/W reported as -1).
    // onnxFixedH/W hold the fixed dimension value, or 0 when it is dynamic.
    // ------------------------------------------------------------------------
    bool hasDynamicSpatial = false;
    int onnxFixedH = 0, onnxFixedW = 0; // 0 = dynamic (-1 in ONNX)
    if (m_options.maxInputHeight > 0 && m_options.maxInputWidth > 0) {
        auto tempBuilder = std::unique_ptr<nvinfer1::IBuilder>(
            nvinfer1::createInferBuilder(m_logger));
        auto tempNetwork = std::unique_ptr<nvinfer1::INetworkDefinition>(TRT_CREATE_NETWORK(tempBuilder));
        auto tempParser = std::unique_ptr<nvonnxparser::IParser>(
            nvonnxparser::createParser(*tempNetwork, m_logger));
        std::ifstream onnxFile(onnxModelPath, std::ios::binary | std::ios::ate);
        if (onnxFile.good()) {
            std::streamsize onnxSize = onnxFile.tellg();
            onnxFile.seekg(0, std::ios::beg);
            std::vector<char> onnxBuffer(onnxSize);
            if (onnxFile.read(onnxBuffer.data(), onnxSize)) {
                unsigned long sehRetryParse = 0;
                bool retryParsed = parseOnnxModelSafe(tempParser.get(),
                    onnxBuffer.data(), onnxBuffer.size(), &sehRetryParse);
                if (sehRetryParse != 0) {
                    // Parser crashed: hasDynamicSpatial stays false, so we fall
                    // through to the single build() attempt below.
                }
                else if (retryParsed && tempNetwork->getNbInputs() > 0) {
                    auto dims = tempNetwork->getInput(0)->getDimensions();
                    if (dims.nbDims >= 4) { // NCHW-style input expected
                        if (dims.d[2] == -1 || dims.d[3] == -1)
                            hasDynamicSpatial = true;
                        onnxFixedH = (dims.d[2] != -1) ? dims.d[2] : 0;
                        onnxFixedW = (dims.d[3] != -1) ? dims.d[3] : 0;
                    }
                }
            }
        }
    }

    // -- Fixed-spatial or no dynamic dims: single build attempt ----------------
    if (!hasDynamicSpatial) {
        unsigned long sehBuild = 0;
        bool ok = buildSafe(onnxModelPath, subVals, divVals, normalize, &sehBuild);
        if (sehBuild != 0) {
            return false; // build crashed (SEH / signal); do not retry
        }
        return ok;
    }

    // -- Dynamic spatial dims: build with fallback chain ----------------------
    const bool dynamicH = (onnxFixedH == 0);
    const bool dynamicW = (onnxFixedW == 0);
    const int origMaxH = m_options.maxInputHeight;
    const int origMaxW = m_options.maxInputWidth;
    const int origOptH = m_options.optInputHeight;
    const int origOptW = m_options.optInputWidth;
    const int origMinH = m_options.minInputHeight;
    const int origMinW = m_options.minInputWidth;
    int dynMaxH = dynamicH ? origMaxH : 0;
    int dynMaxW = dynamicW ? origMaxW : 0;
    // At least one of H/W is dynamic and origMaxH/W > 0 (checked above),
    // so maxDynDim >= 1 and the scale division below is safe.
    int maxDynDim = std::max(dynMaxH, dynMaxW);

    // Build fallback chain: max → 75% → 56% → ... → 640 → 320
    std::vector<int> candidates;
    for (int s = maxDynDim; s >= 320; s = (s * 3) / 4) {
        s = (s / 32) * 32; // snap down to a multiple of 32 (>= 320 preserved)
        if (candidates.empty() || candidates.back() != s)
            candidates.push_back(s);
    }
    // FIX: when maxDynDim < 320 the loop above never runs and the original
    // code called back() on an empty vector (undefined behavior). Guarantee
    // at least one candidate: the model's own max dim, snapped to /32.
    if (candidates.empty())
        candidates.push_back(std::max(32, (maxDynDim / 32) * 32));
    if (candidates.back() > 640) candidates.push_back(640);
    if (candidates.back() > 320) candidates.push_back(320);

    // Helper: configure m_options for a given candidate. Dynamic dims are
    // scaled proportionally (snapped to /32, min 32); fixed dims are pinned
    // to the value reported by the ONNX model.
    auto setCandidateOptions = [&](int candidate) {
        float scale = static_cast<float>(candidate) / maxDynDim;
        m_options.maxInputHeight = dynamicH
            ? std::max(32, (static_cast<int>(origMaxH * scale) / 32) * 32)
            : onnxFixedH;
        m_options.maxInputWidth = dynamicW
            ? std::max(32, (static_cast<int>(origMaxW * scale) / 32) * 32)
            : onnxFixedW;
        m_options.minInputHeight = dynamicH
            ? std::min(origMinH, m_options.maxInputHeight) : onnxFixedH;
        m_options.minInputWidth = dynamicW
            ? std::min(origMinW, m_options.maxInputWidth) : onnxFixedW;
        m_options.optInputHeight = dynamicH
            ? std::min(origOptH, m_options.maxInputHeight) : onnxFixedH;
        m_options.optInputWidth = dynamicW
            ? std::min(origOptW, m_options.maxInputWidth) : onnxFixedW;
    };

    // Helper: restore the caller's original options (used on failure paths
    // so error reporting reflects what was actually requested).
    auto restoreOriginalOptions = [&]() {
        m_options.maxInputHeight = origMaxH;
        m_options.maxInputWidth = origMaxW;
        m_options.optInputHeight = origOptH;
        m_options.optInputWidth = origOptW;
        m_options.minInputHeight = origMinH;
        m_options.minInputWidth = origMinW;
    };

    // Try each candidate (largest first). build() checks cache before
    // building, so previously cached smaller engines are found quickly.
    for (size_t attempt = 0; attempt < candidates.size(); ++attempt) {
        setCandidateOptions(candidates[attempt]);
        {
            unsigned long sehAttempt = 0;
            bool attemptOk = buildSafe(onnxModelPath, subVals, divVals, normalize, &sehAttempt);
            if (sehAttempt != 0) {
                // CUDA context may be corrupted — no point retrying.
                // FIX: restore original options here too, matching the
                // exhausted-candidates path below.
                restoreOriginalOptions();
                return false;
            }
            if (attemptOk) {
                // Success: m_options intentionally keeps the candidate values
                // the engine was actually built with.
                return true;
            }
        }
    }

    // All candidates exhausted — restore original options for error reporting.
    restoreOriginalOptions();
    return false;
}
// ============================================================================
// 6-param pool overloads
//
// These are non-virtual additions to Engine<T> that let callers opt into
// multi-GPU pool mode simply by supplying one extra argument:
//
// m_trtEngine->buildLoadNetwork(path, sub, div, norm); // single-GPU
// m_trtEngine->buildLoadNetwork(path, sub, div, norm, -1); // pool
//
// When pooling is disabled -- maxSlotsPerGpu == 0, a force-no-pool flag,
// or a single-GPU system with maxSlotsPerGpu == 1 -- the call delegates to
// the existing 4-param single-GPU implementation: zero behavioural
// difference. Otherwise it routes through loadSlots() which fills all GPUs.
// ============================================================================
template <typename T>
bool Engine<T>::buildLoadNetwork(
    std::string onnxModelPath,
    const std::array<float, 3>& subVals,
    const std::array<float, 3>& divVals,
    bool normalize,
    int maxSlotsPerGpu,
    double memSafetyFactor)
{
    // Decide whether the multi-GPU pool must be bypassed. Pooling is off when:
    //   * maxSlotsPerGpu == 0                 (optimizer bypass)
    //   * m_forceNoPool                       (per-instance override)
    //   * g_forceNoPool / globalBypass()      (process-wide overrides)
    //   * single-GPU box with maxSlotsPerGpu == 1 -- a 1-slot pool only adds
    //     contention overhead (2s timeout + reject) with no multi-GPU benefit;
    //     the CUDA stream already serializes work in single-GPU mode.
    extern std::atomic<bool> g_forceNoPool;
    int deviceCount = 0;
    cudaGetDeviceCount(&deviceCount);
    const bool singleGpuOneSlot = (deviceCount <= 1) && (maxSlotsPerGpu == 1);
    const bool poolDisabled =
        (maxSlotsPerGpu == 0)
        || m_forceNoPool
        || g_forceNoPool.load(std::memory_order_relaxed)
        || TRTEngineCache::globalBypass().load(std::memory_order_relaxed)
        || singleGpuOneSlot;
    if (poolDisabled) {
        std::cout << "Info: buildLoadNetwork -- single-GPU forced (maxSlots=" << maxSlotsPerGpu
                  << ", forceNoPool=" << m_forceNoPool
                  << ", g_forceNoPool=" << g_forceNoPool.load()
                  << ", gpuCount=" << deviceCount << ")" << std::endl;
        // Delegate to the 4-param single-GPU implementation unchanged.
        return buildLoadNetwork(onnxModelPath, subVals, divVals, normalize);
    }

    // Multi-GPU pool path. m_options carries the base configuration that was
    // set either at construction (Engine(options)) or by initializePool().
    std::cout << "Info: buildLoadNetwork -- activating multi-GPU pool"
              << " (maxSlotsPerGpu=" << maxSlotsPerGpu
              << ", memSafetyFactor=" << memSafetyFactor << ")" << std::endl;
    return loadSlots(m_options, onnxModelPath,
                     subVals, divVals, normalize,
                     /*fromOnnx=*/true,
                     maxSlotsPerGpu, memSafetyFactor);
}
template <typename T>
bool Engine<T>::loadNetwork(
    std::string trtModelPath,
    const std::array<float, 3>& subVals,
    const std::array<float, 3>& divVals,
    bool normalize,
    int maxSlotsPerGpu,
    double memSafetyFactor)
{
    // Pre-serialized-engine counterpart of the 6-param buildLoadNetwork():
    // same pool-gating rules, but loads a .trt plan instead of building ONNX.
    {
        extern std::atomic<bool> g_forceNoPool;
        int gpuCount = 0;
        cudaGetDeviceCount(&gpuCount);
        // A 1-slot pool on a single-GPU box adds contention overhead with no
        // multi-GPU benefit, so fall back to the plain single-GPU loader.
        bool singleGpuNoElastic = (gpuCount <= 1 && maxSlotsPerGpu == 1);
        bool noPool = (maxSlotsPerGpu == 0) || m_forceNoPool ||
            g_forceNoPool.load(std::memory_order_relaxed) ||
            TRTEngineCache::globalBypass().load(std::memory_order_relaxed) ||
            singleGpuNoElastic;
        if (noPool) {
            // Consistency fix: also report m_forceNoPool, matching the
            // diagnostic fields logged by buildLoadNetwork() on this path.
            std::cout << "Info: loadNetwork -- single-GPU forced (maxSlots=" << maxSlotsPerGpu
                      << ", forceNoPool=" << m_forceNoPool
                      << ", g_forceNoPool=" << g_forceNoPool.load()
                      << ", gpuCount=" << gpuCount << ")" << std::endl;
            return loadNetwork(trtModelPath, subVals, divVals, normalize);
        }
    }
    // Multi-GPU pool path: distribute engine slots across all visible GPUs.
    std::cout << "Info: loadNetwork -- activating multi-GPU pool"
              << " (maxSlotsPerGpu=" << maxSlotsPerGpu
              << ", memSafetyFactor=" << memSafetyFactor << ")" << std::endl;
    return loadSlots(m_options, trtModelPath,
                     subVals, divVals, normalize,
                     /*fromOnnx=*/false,
                     maxSlotsPerGpu, memSafetyFactor);
}