#pragma once #include #include #include #include "Utility.h" #include "TRTCompat.h" // ============================================================================ // Crash-safe wrappers for TensorRT operations that can crash the process. // // On Windows: uses SEH (__try/__except) to catch access violations, OOM, etc. // SEH cannot coexist with C++ objects that have destructors in the same // function scope, so these thin wrappers accept only raw pointers. // // On Linux: uses POSIX signals + sigsetjmp/siglongjmp to catch SIGSEGV, // SIGBUS, SIGABRT, SIGFPE. Thread-local jump buffers ensure thread safety. // Signal handlers are saved/restored around each dangerous call so that // the application's own handlers are not permanently replaced. // // outExceptionCode: 0 = OK. // Windows: the SEH exception code (e.g. 0xC0000005 = access violation). // Linux: the signal number (e.g. 11 = SIGSEGV). // ============================================================================ #ifdef _WIN32 # ifndef WIN32_LEAN_AND_MEAN # define WIN32_LEAN_AND_MEAN # endif # ifndef NOMINMAX # define NOMINMAX # endif # include #else # include # include // Thread-local storage for the POSIX crash-recovery mechanism. // Each thread gets its own jump buffer and signal number so that // concurrent engine builds on different threads don't interfere. static thread_local sigjmp_buf s_crashJmpBuf; static thread_local volatile sig_atomic_t s_crashSignal = 0; // Signal handler installed only around dangerous TensorRT calls. // It records which signal was received and jumps back to the // sigsetjmp() checkpoint. Only synchronous, thread-directed signals // (SIGSEGV, SIGBUS, SIGFPE) are guaranteed to land on the faulting // thread; SIGABRT is process-wide but typically raised from the same // thread that called abort(). static void engineCrashSignalHandler(int sig) { s_crashSignal = sig; siglongjmp(s_crashJmpBuf, 1); } // Helper: install crash signal handlers, saving the previous ones. 
struct CrashSignalGuard { struct sigaction oldSigsegv, oldSigbus, oldSigabrt, oldSigfpe; void install() { struct sigaction sa; sa.sa_handler = engineCrashSignalHandler; sigemptyset(&sa.sa_mask); sa.sa_flags = 0; // no SA_RESTART — let interrupted calls fail sigaction(SIGSEGV, &sa, &oldSigsegv); sigaction(SIGBUS, &sa, &oldSigbus); sigaction(SIGABRT, &sa, &oldSigabrt); sigaction(SIGFPE, &sa, &oldSigfpe); s_crashSignal = 0; } void restore() { sigaction(SIGSEGV, &oldSigsegv, nullptr); sigaction(SIGBUS, &oldSigbus, nullptr); sigaction(SIGABRT, &oldSigabrt, nullptr); sigaction(SIGFPE, &oldSigfpe, nullptr); } }; #endif // _WIN32 /// Crash-safe ONNX parser->parse() wrapper. /// @param outExceptionCode receives the exception/signal code on crash (0 = OK). static bool parseOnnxModelSafe( nvonnxparser::IParser* parser, const void* data, size_t dataSize, unsigned long* outExceptionCode) { #ifdef _WIN32 *outExceptionCode = 0; __try { return parser->parse(data, dataSize); } __except (EXCEPTION_EXECUTE_HANDLER) { *outExceptionCode = GetExceptionCode(); return false; } #else *outExceptionCode = 0; CrashSignalGuard guard; guard.install(); bool result = false; if (sigsetjmp(s_crashJmpBuf, 1) == 0) { // Normal execution path result = parser->parse(data, dataSize); } else { // Returned here from signal handler — a crash was caught *outExceptionCode = static_cast(s_crashSignal); result = false; } guard.restore(); return result; #endif } /// Crash-safe builder->buildSerializedNetwork() wrapper. /// Returns raw IHostMemory* (caller wraps in unique_ptr). 
static nvinfer1::IHostMemory* buildSerializedNetworkSafe(
    nvinfer1::IBuilder* builder,
    nvinfer1::INetworkDefinition& network,
    nvinfer1::IBuilderConfig& config,
    unsigned long* outExceptionCode)
{
#ifdef _WIN32
    *outExceptionCode = 0;
    __try {
        return builder->buildSerializedNetwork(network, config);
    } __except (EXCEPTION_EXECUTE_HANDLER) {
        *outExceptionCode = GetExceptionCode();
        return nullptr;
    }
#else
    *outExceptionCode = 0;
    CrashSignalGuard guard;
    guard.install();
    nvinfer1::IHostMemory* plan = nullptr;
    if (sigsetjmp(s_crashJmpBuf, 1) == 0) {
        plan = builder->buildSerializedNetwork(network, config);
    } else {
        // FIX: original was "static_cast(s_crashSignal)" (missing target
        // type — does not compile); cast to the out-param's type.
        *outExceptionCode = static_cast<unsigned long>(s_crashSignal);
        plan = nullptr;
    }
    guard.restore();
    return plan;
#endif
}

/// Crash-safe runtime->deserializeCudaEngine() wrapper.
/// Returns raw ICudaEngine* (caller wraps in unique_ptr).
/// @param runtime          the TRT runtime (must be non-null).
/// @param data             serialized engine bytes.
/// @param dataSize         size of @p data in bytes.
/// @param outExceptionCode receives the exception/signal code on crash (0 = OK).
static nvinfer1::ICudaEngine* deserializeCudaEngineSafe(
    nvinfer1::IRuntime* runtime,
    const void* data,
    size_t dataSize,
    unsigned long* outExceptionCode)
{
#ifdef _WIN32
    *outExceptionCode = 0;
    __try {
        return runtime->deserializeCudaEngine(data, dataSize);
    } __except (EXCEPTION_EXECUTE_HANDLER) {
        *outExceptionCode = GetExceptionCode();
        return nullptr;
    }
#else
    *outExceptionCode = 0;
    CrashSignalGuard guard;
    guard.install();
    nvinfer1::ICudaEngine* engine = nullptr;
    if (sigsetjmp(s_crashJmpBuf, 1) == 0) {
        engine = runtime->deserializeCudaEngine(data, dataSize);
    } else {
        // FIX: missing static_cast target type reconstructed (see above).
        *outExceptionCode = static_cast<unsigned long>(s_crashSignal);
        engine = nullptr;
    }
    guard.restore();
    return engine;
#endif
}

/// Crash-safe wrapper for an arbitrary bool-returning function pointer.
/// Used to SEH-protect build() calls that may crash on bad ONNX models.
typedef bool (*BoolFuncPtr)(void* ctx); static bool callBoolFuncSafe(BoolFuncPtr fn, void* ctx, unsigned long* outExceptionCode) { #ifdef _WIN32 *outExceptionCode = 0; __try { return fn(ctx); } __except (EXCEPTION_EXECUTE_HANDLER) { *outExceptionCode = GetExceptionCode(); return false; } #else *outExceptionCode = 0; CrashSignalGuard guard; guard.install(); bool result = false; if (sigsetjmp(s_crashJmpBuf, 1) == 0) { result = fn(ctx); } else { *outExceptionCode = static_cast(s_crashSignal); result = false; } guard.restore(); return result; #endif } /// Format a crash code for logging (platform-aware). /// Windows: "SEH exception 0xC0000005" Linux: "signal 11 (SIGSEGV)" static std::string formatCrashCode(unsigned long code) { std::ostringstream oss; #ifdef _WIN32 oss << "SEH exception 0x" << std::hex << code << std::dec; #else oss << "signal " << code; switch (code) { case SIGSEGV: oss << " (SIGSEGV)"; break; case SIGBUS: oss << " (SIGBUS)"; break; case SIGABRT: oss << " (SIGABRT)"; break; case SIGFPE: oss << " (SIGFPE)"; break; default: oss << " (unknown)"; break; } #endif return oss.str(); } template bool Engine::buildLoadNetwork(std::string onnxModelPath, const std::array& subVals, const std::array& divVals, bool normalize) { // -- GPU-tier batch cap (early) ------------------------------------------- // Apply the same VRAM-based batch cap that build() uses BEFORE computing // the engine filename. Without this, the cache lookup uses the uncapped // batch size (e.g. b32), misses the file that was saved with the capped // size (e.g. b16), and triggers a needless full rebuild every launch. // The cap inside build() still runs later as a safety net (it will be a // no-op because the batch is already capped here). 
{ cudaDeviceProp prop; cudaGetDeviceProperties(&prop, m_options.deviceIndex); const size_t totalMiB = prop.totalGlobalMem / (1024ULL * 1024); int gpuMaxBatch; if (totalMiB >= 15800) gpuMaxBatch = 32; // ~16 GiB+ else if (totalMiB >= 11800) gpuMaxBatch = 16; // ~12 GiB else if (totalMiB >= 7900) gpuMaxBatch = 8; // ~ 8 GiB (batch=16 OCR ~987 MiB exec ctx, too large for 4 tasks) else if (totalMiB >= 3900) gpuMaxBatch = 4; // ~ 4 GiB else if (totalMiB >= 1900) gpuMaxBatch = 2; // ~ 2 GiB else gpuMaxBatch = 1; // < 2 GiB if (m_options.maxBatchSize > gpuMaxBatch) { if (m_verbose) { std::cout << "Info: GPU-tier early batch cap: " << m_options.maxBatchSize << " -> " << gpuMaxBatch << " (GPU has " << totalMiB << " MiB)" << std::endl; } m_options.maxBatchSize = gpuMaxBatch; m_options.optBatchSize = std::min(m_options.optBatchSize, m_options.maxBatchSize); } } // It is full path std::string engineName = serializeEngineOptions(m_options, onnxModelPath); std::string engineDir = m_options.engineFileDir; if (FileExist(engineName)) { if (m_verbose) { std::cout << "Engine file found: " << engineName << std::endl; } logEngineEvent("[Engine] buildLoadNetwork: Loading cached engine: " + engineName); bool loadOk = loadNetwork(engineName, subVals, divVals, normalize); if (loadOk) { return true; } // Engine file exists but loadNetwork failed. // Common causes: // - createExecutionContext returned null (VRAM exhausted) // - Incompatible TRT version or corrupt file // - Partially written by another thread if (m_skipOnnxRebuild) { // Elastic growth / non-critical path — don't delete and rebuild. // Just fail gracefully; the pool continues with existing slots. size_t freeMem = 0, totalMem = 0; cudaMemGetInfo(&freeMem, &totalMem); logEngineEvent("[Engine] buildLoadNetwork: Load failed (skip rebuild, " + std::to_string(freeMem >> 20) + " MiB free): " + engineName, true); return false; } // Check if the failure was due to VRAM exhaustion vs. corrupt file. 
// If VRAM was the reason, PRESERVE the engine file — it's valid, just // can't fit right now. Deleting it forces a full ONNX→TRT rebuild // (minutes) when VRAM becomes available later, instead of a fast load. // // Uses the m_lastLoadFailedVRAM flag set by loadNetwork() instead of // re-querying cudaMemGetInfo. The old approach had a TOCTOU race: // VRAM could be freed between loadNetwork's check and this re-check, // causing a valid engine file to be falsely classified as INVALID // and deleted. Also check current VRAM as a safety net. { size_t freeCheck = 0, totalCheck = 0; cudaMemGetInfo(&freeCheck, &totalCheck); constexpr size_t kMinFreeBytes = 256ULL * 1024 * 1024; if (m_lastLoadFailedVRAM || freeCheck < kMinFreeBytes) { logEngineEvent("[Engine] buildLoadNetwork: Load failed due to LOW VRAM (" + std::to_string(freeCheck / (1024 * 1024)) + " MiB free / " + std::to_string(totalCheck / (1024 * 1024)) + " MiB total" + ", vramFlag=" + std::to_string(m_lastLoadFailedVRAM) + "). Preserving engine file (not corrupt): " + engineName, true); return false; // Don't delete the file, don't try ONNX rebuild } } // Enough VRAM AND loadNetwork didn't flag VRAM as cause → file is // likely corrupt/incompatible. Delete and rebuild from ONNX. logEngineEvent("[Engine] buildLoadNetwork: Cached engine INVALID, deleting and rebuilding: " + engineName, true); try { std::filesystem::remove(engineName); } catch (...) {} // Fall through to ONNX build path below } { if (!FileExist(engineName)) { // Demand-driven growth: if no cached engine exists, bail out rather // than triggering a full ONNX→TRT build (30-60s, massive VRAM). 
if (m_skipOnnxBuild) { logEngineEvent("[Engine] buildLoadNetwork: Engine file NOT found, skipping ONNX build (demand growth): " + engineName); return false; } logEngineEvent("[Engine] buildLoadNetwork: Engine file NOT found, will build from ONNX: " + engineName); } if (!FileExist(onnxModelPath)) { // ONNX model does not exist, try to find alternative precision engine logEngineEvent("[Engine] buildLoadNetwork: ONNX model also not found: " + onnxModelPath, true); std::cout << "Searching for alternative precision engine..." << std::endl; size_t lastDot = engineName.find_last_of('.'); std::string alternativeEngineName; ANSCENTER::Precision originalPrecision = m_options.precision; if (m_options.precision == ANSCENTER::Precision::FP16) { alternativeEngineName = engineName.substr(0, lastDot + 1) + "fp32"; m_options.precision = ANSCENTER::Precision::FP32; std::cout << " Looking for FP32 engine: " << alternativeEngineName << std::endl; } else { alternativeEngineName = engineName.substr(0, lastDot + 1) + "fp16"; m_options.precision = ANSCENTER::Precision::FP16; std::cout << " Looking for FP16 engine: " << alternativeEngineName << std::endl; } if (FileExist(alternativeEngineName)) { std::cout << "Found alternative precision engine: " << alternativeEngineName << std::endl; return loadNetwork(alternativeEngineName, subVals, divVals, normalize); } else { // Restore original precision m_options.precision = originalPrecision; std::cout << "Error: Neither ONNX model nor engine files exist for: " << onnxModelPath << std::endl; std::cout << " Searched for: " << engineName << std::endl; std::cout << " Searched for: " << alternativeEngineName << std::endl; return false; } } else { // Before building, check if an alternative precision engine already exists // (e.g., FP16 requested but a FP32 engine was built by a previous fallback) if (m_options.precision == ANSCENTER::Precision::FP16) { ANSCENTER::Options fp32Opts = m_options; fp32Opts.precision = ANSCENTER::Precision::FP32; 
std::string fp32EngineName = serializeEngineOptions(fp32Opts, onnxModelPath); if (FileExist(fp32EngineName)) { std::cout << "FP16 engine not found, but FP32 engine exists: " << fp32EngineName << std::endl; std::cout << "Loading existing FP32 engine..." << std::endl; m_options.precision = ANSCENTER::Precision::FP32; return loadNetwork(fp32EngineName, subVals, divVals, normalize); } } // ONNX model exists, generate engine std::cout << "========================================" << std::endl; std::cout << "Engine not found, generating from ONNX model" << std::endl; std::cout << "========================================" << std::endl; std::cout << "ONNX model: " << onnxModelPath << std::endl; std::cout << "Target engine: " << engineName << std::endl; if (!FolderExist(engineDir)) { std::cout << "Creating engine directory: " << engineDir << std::endl; std::filesystem::create_directories(engineDir); } // CRITICAL FIX: Read ONNX to determine if it supports dynamic batch int32_t onnxBatchSize = -1; bool hasDynamicSpatialDims_onnx = false; int onnxFixedH = 0, onnxFixedW = 0; // 0 = dynamic (-1 in ONNX) std::cout << "\nAnalyzing ONNX model structure..." << std::endl; auto tempBuilder = std::unique_ptr(nvinfer1::createInferBuilder(m_logger)); auto tempNetwork = std::unique_ptr(TRT_CREATE_NETWORK(tempBuilder)); auto tempParser = std::unique_ptr(nvonnxparser::createParser(*tempNetwork, m_logger)); std::ifstream onnxFile(onnxModelPath, std::ios::binary | std::ios::ate); std::streamsize onnxSize = onnxFile.tellg(); onnxFile.seekg(0, std::ios::beg); std::vector onnxBuffer(onnxSize); if (!onnxFile.read(onnxBuffer.data(), onnxSize)) { std::cout << "Error: Failed to read ONNX file" << std::endl; return false; } unsigned long sehPreAnalysis = 0; bool preParsed = parseOnnxModelSafe(tempParser.get(), onnxBuffer.data(), onnxBuffer.size(), &sehPreAnalysis); if (sehPreAnalysis != 0) { std::cout << "[Engine] WARNING: ONNX pre-analysis parse crashed (" << formatCrashCode(sehPreAnalysis) << "). 
Skipping pre-analysis, proceeding with build..." << std::endl; } else if (preParsed) { auto numInputs = tempNetwork->getNbInputs(); std::cout << "ONNX Model Analysis:" << std::endl; std::cout << " Number of inputs: " << numInputs << std::endl; for (int32_t i = 0; i < numInputs; ++i) { auto input = tempNetwork->getInput(i); auto inputDims = input->getDimensions(); std::cout << " Input " << i << " (" << input->getName() << "): ["; for (int j = 0; j < inputDims.nbDims; ++j) { if (j > 0) std::cout << ", "; // FIXED: Properly display dynamic dimensions if (inputDims.d[j] == -1) { std::cout << "dynamic"; } else { std::cout << inputDims.d[j]; } } std::cout << "]" << std::endl; } // Check first input's batch dimension auto firstInput = tempNetwork->getInput(0); auto firstInputDims = firstInput->getDimensions(); onnxBatchSize = firstInputDims.d[0]; // Detect dynamic spatial dimensions (for auto-retry mechanism) if (firstInputDims.nbDims >= 4) { if (firstInputDims.d[2] == -1 || firstInputDims.d[3] == -1) { hasDynamicSpatialDims_onnx = true; } onnxFixedH = (firstInputDims.d[2] != -1) ? firstInputDims.d[2] : 0; onnxFixedW = (firstInputDims.d[3] != -1) ? 
firstInputDims.d[3] : 0; } std::cout << "\nBatch dimension analysis:" << std::endl; std::cout << " ONNX model batch dimension: "; if (onnxBatchSize == -1) { std::cout << "dynamic (-1)" << std::endl; } else { std::cout << onnxBatchSize << std::endl; } std::cout << " Current maxBatchSize setting: " << m_options.maxBatchSize << std::endl; std::cout << " Current optBatchSize setting: " << m_options.optBatchSize << std::endl; // FIXED: Correct logic for dynamic vs fixed batch if (onnxBatchSize == -1) { // Dynamic batch size model - keep user settings std::cout << "\n✓ ONNX model supports DYNAMIC batch size" << std::endl; std::cout << " Engine will support batch sizes 1 to " << m_options.maxBatchSize << std::endl; std::cout << " Optimal batch size: " << m_options.optBatchSize << std::endl; std::cout << " Keeping user-defined batch size configuration" << std::endl; } else if (onnxBatchSize > 0) { // Fixed batch size model - must match ONNX std::cout << "\n⚠ WARNING: ONNX model has FIXED batch size of " << onnxBatchSize << std::endl; std::cout << " Your model was exported with dynamic=False" << std::endl; std::cout << " Engine will only support batch size " << onnxBatchSize << std::endl; std::cout << " To use dynamic batching, re-export ONNX with dynamic=True" << std::endl; std::cout << "\n Adjusting engine options to match ONNX model..." 
<< std::endl; m_options.optBatchSize = onnxBatchSize; m_options.maxBatchSize = onnxBatchSize; std::cout << " Updated optBatchSize: " << m_options.optBatchSize << std::endl; std::cout << " Updated maxBatchSize: " << m_options.maxBatchSize << std::endl; engineName = serializeEngineOptions(m_options, onnxModelPath); } else { // Unexpected value std::cout << "\n⚠ WARNING: Unexpected batch dimension value: " << onnxBatchSize << std::endl; std::cout << " This may indicate an issue with the ONNX file" << std::endl; std::cout << " Proceeding with user-defined settings" << std::endl; } } else { std::cout << "Warning: Failed to parse ONNX for pre-analysis. Proceeding with build..." << std::endl; } std::cout << "\n========================================" << std::endl; std::cout << "Starting Engine Build Process" << std::endl; std::cout << "========================================" << std::endl; std::cout << "This may take 10-20 minutes depending on model complexity..." << std::endl; std::cout << "Configuration:" << std::endl; std::cout << " Precision: " << (m_options.precision == ANSCENTER::Precision::FP16 ? "FP16" : m_options.precision == ANSCENTER::Precision::INT8 ? "INT8" : "FP32") << std::endl; std::cout << " Optimization Level: 5 (Maximum)" << std::endl; std::cout << " Batch Size Range: 1 to " << m_options.maxBatchSize << std::endl; std::cout << "========================================" << std::endl; // Build with auto-retry for dynamic spatial dimension models. // buildWithRetry() handles the ONNX pre-analysis internally and // reduces max spatial dims on OOM, falling back to smaller // profiles until build succeeds or all candidates are exhausted. // Fixed-spatial models get a single build() attempt. 
bool buildSuccess = buildWithRetry(onnxModelPath, subVals, divVals, normalize); // -- FP16 -> FP32 automatic fallback --------------------------------- // Some GPU architectures fail FP16 builds due to: // - platformHasFastFp16() returning false (older GPUs) // - kOBEY_PRECISION_CONSTRAINTS failing for mixed-precision layers // - Insufficient VRAM for FP16 tactic optimization // When FP16 build fails, automatically retry with FP32 precision. if (!buildSuccess && m_options.precision == ANSCENTER::Precision::FP16) { std::cout << "\n========================================" << std::endl; std::cout << "FP16 Build Failed - Retrying with FP32" << std::endl; std::cout << "========================================" << std::endl; std::cout << "FP16 engine build failed on this GPU." << std::endl; std::cout << "Automatically falling back to FP32 precision..." << std::endl; std::cout << "========================================" << std::endl; m_options.precision = ANSCENTER::Precision::FP32; // Re-compute engine name for FP32 to avoid caching conflicts engineName = serializeEngineOptions(m_options, onnxModelPath); buildSuccess = buildWithRetry(onnxModelPath, subVals, divVals, normalize); if (buildSuccess) { std::cout << "\n========================================" << std::endl; std::cout << "FP32 Fallback Build Successful!" << std::endl; std::cout << "========================================" << std::endl; std::cout << "Note: Engine is running in FP32 mode on this GPU." << std::endl; std::cout << "Performance may be lower than FP16 but accuracy is preserved." << std::endl; std::cout << "========================================" << std::endl; } } if (!buildSuccess) { std::cout << "\n========================================" << std::endl; std::cout << "Engine Build Failed!" 
<< std::endl; std::cout << "========================================" << std::endl; std::cout << "Error: Failed to build engine from ONNX model" << std::endl; std::cout << "Possible causes:" << std::endl; std::cout << " 1. Insufficient GPU memory" << std::endl; std::cout << " 2. Unsupported ONNX operations" << std::endl; std::cout << " 3. Invalid batch size configuration" << std::endl; std::cout << " 4. Corrupted ONNX file" << std::endl; if (hasDynamicSpatialDims_onnx) { std::cout << " 5. All spatial dimension fallbacks exhausted" << std::endl; } std::cout << " Note: Both FP16 and FP32 builds were attempted." << std::endl; std::cout << "\nTroubleshooting:" << std::endl; std::cout << " - Check GPU memory availability" << std::endl; std::cout << " - Try reducing maxBatchSize" << std::endl; std::cout << " - Verify ONNX model integrity" << std::endl; std::cout << " - Check TensorRT logs above for details" << std::endl; return false; } // build() may have capped maxBatchSize based on GPU VRAM, which // changes the serialized engine filename (e.g. b32 -> b8). Re-compute // so we load the file that build() actually saved. std::string actualEngineName = serializeEngineOptions(m_options, onnxModelPath); // After building, the engine should be saved, so load it std::cout << "\n========================================" << std::endl; std::cout << "Engine Build Complete - Loading Engine" << std::endl; std::cout << "========================================" << std::endl; if (FileExist(actualEngineName)) { std::cout << "Engine file created successfully: " << actualEngineName << std::endl; std::cout << "Loading engine into memory..." << std::endl; bool loadSuccess = loadNetwork(actualEngineName, subVals, divVals, normalize); if (loadSuccess) { std::cout << "\n========================================" << std::endl; std::cout << "✓ Engine Ready for Inference!" 
<< std::endl; std::cout << "========================================" << std::endl; std::cout << "Configuration Summary:" << std::endl; std::cout << " Engine File: " << actualEngineName << std::endl; std::cout << " Batch Size Support: 1 to " << m_options.maxBatchSize << std::endl; std::cout << " Precision: " << (m_options.precision == ANSCENTER::Precision::FP16 ? "FP16" : m_options.precision == ANSCENTER::Precision::INT8 ? "INT8" : "FP32") << std::endl; std::cout << "========================================" << std::endl; } return loadSuccess; } else { std::cout << "\n========================================" << std::endl; std::cout << "Engine Build Error!" << std::endl; std::cout << "========================================" << std::endl; std::cout << "Error: Engine file not found after build: " << actualEngineName << std::endl; std::cout << "Expected location: " << std::filesystem::absolute(actualEngineName) << std::endl; std::cout << "\nPossible causes:" << std::endl; std::cout << " 1. Build succeeded but save failed (disk full?)" << std::endl; std::cout << " 2. Incorrect engine directory permissions" << std::endl; std::cout << " 3. Engine filename mismatch" << std::endl; std::cout << "\nPlease check:" << std::endl; std::cout << " - Available disk space in: " << engineDir << std::endl; std::cout << " - Write permissions for engine directory" << std::endl; std::cout << " - TensorRT build logs above for warnings" << std::endl; return false; } } } } template bool Engine::loadNetwork(std::string trtModelPath, const std::array& subVals, const std::array& divVals, bool normalize) { // Install a custom OpenCV CUDA allocator that uses cudaMallocAsync/cudaFreeAsync // instead of the default cudaMalloc/cudaFree. The stream-ordered allocator // respects the cudaMemPool release threshold (set to 0), so freed memory is // returned to the GPU immediately instead of being cached forever. 
// // The default cudaMalloc/cudaFree allocator caches all freed blocks permanently // (no API to force release), causing VRAM to grow monotonically when GpuMat // objects of varying sizes are allocated and freed repeatedly (different batch // sizes, different image resolutions across cameras). { static std::once_flag s_allocatorFlag; std::call_once(s_allocatorFlag, []() { // Set release threshold to 0 on all GPUs int deviceCount = 0; cudaGetDeviceCount(&deviceCount); for (int d = 0; d < deviceCount; ++d) { cudaMemPool_t pool = nullptr; if (cudaDeviceGetDefaultMemPool(&pool, d) == cudaSuccess && pool) { uint64_t threshold = 0; cudaMemPoolSetAttribute(pool, cudaMemPoolAttrReleaseThreshold, &threshold); } } // Custom allocator: uses cudaMallocAsync on stream 0 (behaves like // synchronous cudaMalloc but goes through the stream-ordered pool). struct AsyncAllocator : cv::cuda::GpuMat::Allocator { bool allocate(cv::cuda::GpuMat* mat, int rows, int cols, size_t elemSize) override { // Same logic as OpenCV's default allocator, but using cudaMallocAsync size_t step = elemSize * cols; // Align step to 256 bytes (same as default allocator) step = (step + 255) & ~size_t(255); void* ptr = nullptr; cudaError_t err = cudaMallocAsync(&ptr, step * rows, nullptr); // stream 0 if (err != cudaSuccess || !ptr) { // Fallback to regular cudaMalloc if async not supported err = cudaMalloc(&ptr, step * rows); if (err != cudaSuccess) return false; } mat->data = static_cast(ptr); mat->step = step; mat->refcount = static_cast(cv::fastMalloc(sizeof(int))); *mat->refcount = 1; return true; } void free(cv::cuda::GpuMat* mat) override { cudaFreeAsync(mat->data, nullptr); // stream 0 — goes through pool with threshold=0 cv::fastFree(mat->refcount); mat->data = nullptr; mat->datastart = nullptr; mat->dataend = nullptr; mat->refcount = nullptr; } }; static AsyncAllocator s_allocator; cv::cuda::GpuMat::setDefaultAllocator(&s_allocator); ANS_DBG("TRT_Load", "Custom CUDA async allocator installed — 
VRAM freed immediately on GpuMat release"); }); } m_lastLoadFailedVRAM = false; // reset on each load attempt m_subVals = subVals; m_divVals = divVals; m_normalize = normalize; // ============================================================================ // TRT ENGINE CACHE CHECK — skip file I/O + deserialization if already cached // (Bypassed when m_skipEngineCache is true, e.g., during model optimization) // ============================================================================ if (!m_skipEngineCache) { auto cacheHit = TRTEngineCache::instance().tryGet(trtModelPath, m_options.deviceIndex); if (cacheHit.engine) { // Cache hit — reuse shared ICudaEngine (no deserialization, no file I/O) m_context.reset(); m_engine.reset(); m_runtime.reset(); m_engine = cacheHit.engine; m_runtime = cacheHit.runtime; m_usingCachedEngine = true; m_cachedEnginePath = trtModelPath; m_cachedGpuIndex = m_options.deviceIndex; // Still need to set GPU device for context + buffer allocation cudaSetDevice(m_options.deviceIndex); // Jump past file read + deserialization to context creation (below) goto trt_cache_create_context; } } // ============================================================================ // READ ENGINE FILE (cache miss path) // ============================================================================ if (!Util::doesFileExist(trtModelPath)) { logEngineEvent("[Engine] loadNetwork FAIL: Engine file not found: " + trtModelPath, true); return false; } if (m_verbose) { std::cout << "Loading TensorRT engine file at path: " << trtModelPath << std::endl; } { std::ifstream file(trtModelPath, std::ios::binary | std::ios::ate); if (!file.is_open()) { logEngineEvent("[Engine] loadNetwork FAIL: Cannot open engine file: " + trtModelPath, true); return false; } std::streamsize size = file.tellg(); if (size <= 0) { logEngineEvent("[Engine] loadNetwork FAIL: Engine file is empty (0 bytes): " + trtModelPath, true); return false; } file.seekg(0, std::ios::beg); std::vector 
buffer(size); if (!file.read(buffer.data(), size)) { logEngineEvent("[Engine] loadNetwork FAIL: Read error on engine file: " + trtModelPath, true); return false; } if (m_verbose) { std::cout << "Engine file size: " << size / (1024 * 1024) << " MiB" << std::endl; } // ============================================================================ // CREATE RUNTIME // ============================================================================ // TRT requires: destroy context before engine, engine before runtime. // If loadNetwork() is called more than once on the same instance, the // previous objects must be torn down in the correct order before we // create new ones. m_context.reset(); m_engine.reset(); m_runtime.reset(); m_runtime = std::shared_ptr{ nvinfer1::createInferRuntime(m_logger) }; if (!m_runtime) { logEngineEvent("[Engine] loadNetwork FAIL: createInferRuntime returned null for " + trtModelPath, true); return false; } // ============================================================================ // GPU SELECTION AND CONFIGURATION // ============================================================================ int numGPUs = 0; cudaGetDeviceCount(&numGPUs); if (m_verbose) std::cout << "Info: Number of GPU devices: " << numGPUs << std::endl; if (numGPUs == 0) { std::cout << "Error: No CUDA-capable GPUs detected" << std::endl; return false; } if (m_options.deviceIndex < 0 || m_options.deviceIndex >= numGPUs) { std::cout << "Error: Invalid GPU index " << m_options.deviceIndex << ". Available GPUs: " << numGPUs << std::endl; return false; } if (m_verbose) std::cout << "Info: Using GPU device index: " << m_options.deviceIndex << std::endl; // Use yield mode to avoid busy-wait spinning that falsely reports 100% GPU utilization. // Must be called before cudaSetDevice creates the CUDA context. 
cudaSetDeviceFlags(cudaDeviceScheduleYield); cudaError_t ret = cudaSetDevice(m_options.deviceIndex); if (ret != cudaSuccess) { std::cout << "Error: Unable to set GPU device index to " << m_options.deviceIndex << std::endl; std::cout << "CUDA Error: " << cudaGetErrorString(ret) << std::endl; return false; } // Get GPU properties cudaDeviceProp prop; cudaGetDeviceProperties(&prop, m_options.deviceIndex); // Set GPU device limits. // Blackwell GPUs (GB200/B200 = SM 10.x, RTX 5090/5080 = SM 12.x) have // deeper kernel-launch pipelines and benefit from a larger pending-launch // queue. Using 8192 on Blackwell avoids throttling with heavily pipelined // workloads; 2048 is sufficient for all earlier architectures. { const size_t pendingLaunches = (prop.major >= 10) ? 8192 : 2048; cudaDeviceSetLimit(cudaLimitDevRuntimePendingLaunchCount, pendingLaunches); if (m_verbose) std::cout << "Info: cudaLimitDevRuntimePendingLaunchCount = " << pendingLaunches << " (SM " << prop.major << "." << prop.minor << ")" << std::endl; } cudaDeviceSetLimit(cudaLimitStackSize, 8192); cudaDeviceSetLimit(cudaLimitDevRuntimeSyncDepth, 2); // Lock GPU clocks if requested (prevents power throttling on laptop GPUs) if (m_options.gpuClockLockMHz != 0 && !m_clocksLocked) { lockGpuClocks(m_options.deviceIndex, m_options.gpuClockLockMHz); } // -- VRAM safety check before engine deserialization ----------------------- // Reject early if the GPU doesn't have enough free VRAM to load the engine. // This prevents slow degradation (unified memory fallback) or crashes // (cudaMalloc failure during inference) when too many tasks are loaded. { size_t freeVRAM = 0, totalVRAM = 0; cudaError_t memErr = cudaMemGetInfo(&freeVRAM, &totalVRAM); constexpr size_t kMinFreeBytes = 256ULL * 1024 * 1024; // 256 MiB minimum if (memErr != cudaSuccess) { // cudaMemGetInfo failed — CUDA context may not be initialized on this thread. // Log but don't reject: let TRT try to deserialize (it may succeed). 
logEngineEvent("[Engine] loadNetwork WARNING: cudaMemGetInfo failed (" + std::string(cudaGetErrorString(memErr)) + ") on GPU[" + std::to_string(m_options.deviceIndex) + "] — skipping VRAM check for " + trtModelPath, true); } else if (freeVRAM < kMinFreeBytes) { logEngineEvent("[Engine] loadNetwork FAIL: GPU[" + std::to_string(m_options.deviceIndex) + "] only " + std::to_string(freeVRAM / (1024 * 1024)) + " MiB free / " + std::to_string(totalVRAM / (1024 * 1024)) + " MiB total (need " + std::to_string(kMinFreeBytes / (1024 * 1024)) + " MiB) for " + trtModelPath, true); m_lastLoadFailedVRAM = true; // signal to buildLoadNetwork: engine file is NOT corrupt return false; } if (m_verbose) { std::cout << "Info: GPU " << m_options.deviceIndex << " VRAM: " << (freeVRAM / (1024 * 1024)) << " MiB free / " << (totalVRAM / (1024 * 1024)) << " MiB total" << std::endl; } } // ============================================================================ // DESERIALIZE ENGINE // ============================================================================ if (m_verbose) std::cout << "Info: Deserializing TensorRT engine..." 
<< std::endl; unsigned long sehCodeDeserialize = 0; m_engine = std::shared_ptr( deserializeCudaEngineSafe(m_runtime.get(), buffer.data(), buffer.size(), &sehCodeDeserialize)); if (sehCodeDeserialize != 0) { logEngineEvent("[Engine] loadNetwork FAIL: deserializeCudaEngine CRASHED (SEH " + formatCrashCode(sehCodeDeserialize) + ") for " + trtModelPath, true); return false; } if (!m_engine) { logEngineEvent("[Engine] loadNetwork FAIL: deserializeCudaEngine returned null for " + trtModelPath + " (file size=" + std::to_string(size / (1024*1024)) + " MiB)", true); return false; } if (m_verbose) std::cout << "Info: Engine deserialized successfully" << std::endl; // ============================================================================ // CRITICAL: VERIFY ENGINE BATCH CAPABILITIES IMMEDIATELY // ============================================================================ int numOptProfiles = m_engine->getNbOptimizationProfiles(); if (m_verbose) { std::cout << "\n========================================" << std::endl; std::cout << "ENGINE BATCH CAPABILITY VERIFICATION" << std::endl; std::cout << "========================================" << std::endl; std::cout << "Number of optimization profiles: " << numOptProfiles << std::endl; } bool engineSupportsDynamicBatch = false; int actualMinBatch = 1; int actualMaxBatch = 1; if (numOptProfiles > 0) { // Find the first input tensor to check batch support for (int i = 0; i < m_engine->getNbIOTensors(); ++i) { const char* tensorName = m_engine->getIOTensorName(i); if (m_engine->getTensorIOMode(tensorName) == nvinfer1::TensorIOMode::kINPUT) { auto minDims = m_engine->getProfileShape(tensorName, 0, nvinfer1::OptProfileSelector::kMIN); auto optDims = m_engine->getProfileShape(tensorName, 0, nvinfer1::OptProfileSelector::kOPT); auto maxDims = m_engine->getProfileShape(tensorName, 0, nvinfer1::OptProfileSelector::kMAX); actualMinBatch = minDims.d[0]; actualMaxBatch = maxDims.d[0]; // Store actual profile max spatial dims for 
runtime queries if (maxDims.nbDims >= 4) { m_profileMaxHeight = maxDims.d[2]; m_profileMaxWidth = maxDims.d[3]; } if (actualMinBatch != actualMaxBatch) { engineSupportsDynamicBatch = true; } if (m_verbose) { std::cout << "\nInput tensor '" << tensorName << "' profile 0:" << std::endl; std::cout << " Min: [" << minDims.d[0]; for (int d = 1; d < minDims.nbDims; ++d) std::cout << "," << minDims.d[d]; std::cout << "]" << std::endl; std::cout << " Opt: [" << optDims.d[0]; for (int d = 1; d < optDims.nbDims; ++d) std::cout << "," << optDims.d[d]; std::cout << "]" << std::endl; std::cout << " Max: [" << maxDims.d[0]; for (int d = 1; d < maxDims.nbDims; ++d) std::cout << "," << maxDims.d[d]; std::cout << "]" << std::endl; if (actualMinBatch != actualMaxBatch) std::cout << "\n✓ Engine supports DYNAMIC batching: " << actualMinBatch << " to " << actualMaxBatch << std::endl; else std::cout << "\n⚠️ Engine has FIXED batch size: " << actualMinBatch << std::endl; } break; // Only need to check first input } } } else { if (m_verbose) std::cout << "⚠️ No optimization profiles found" << std::endl; // Check if batch dimension is dynamic via -1 auto firstTensorName = m_engine->getIOTensorName(0); auto shape = m_engine->getTensorShape(firstTensorName); if (shape.d[0] == -1) { engineSupportsDynamicBatch = true; actualMaxBatch = m_options.maxBatchSize; if (m_verbose) std::cout << "Engine uses implicit dynamic batch (batch dim = -1)" << std::endl; } } // CRITICAL CHECK: Verify engine can support requested batch sizes if (!engineSupportsDynamicBatch && m_options.maxBatchSize > actualMaxBatch) { std::cout << "\n🚨🚨🚨 CRITICAL ERROR 🚨🚨🚨" << std::endl; std::cout << "Requested max batch size: " << m_options.maxBatchSize << std::endl; std::cout << "Engine max batch size: " << actualMaxBatch << std::endl; std::cout << "\nThis engine CANNOT support batch sizes larger than " << actualMaxBatch << "!" << std::endl; std::cout << "\nYou have two options:" << std::endl; std::cout << "1. 
Rebuild the engine with dynamic batch support:" << std::endl; std::cout << " trtexec --onnx=model.onnx \\" << std::endl; std::cout << " --minShapes=images:1x3x640x640 \\" << std::endl; std::cout << " --optShapes=images:4x3x640x640 \\" << std::endl; std::cout << " --maxShapes=images:32x3x640x640 \\" << std::endl; std::cout << " --saveEngine=model_dynamic.engine --fp16" << std::endl; std::cout << "\n2. Reduce maxBatchSize in your config to " << actualMaxBatch << std::endl; std::cout << "========================================\n" << std::endl; // Optionally fail here: // return false; // Or adjust maxBatchSize to match engine capability if (m_verbose) std::cout << "⚠️ Auto-adjusting maxBatchSize from " << m_options.maxBatchSize << " to " << actualMaxBatch << std::endl; m_options.maxBatchSize = actualMaxBatch; } if (m_verbose) std::cout << "========================================\n" << std::endl; // Store in cache for future tasks loading the same model if (!m_skipEngineCache) { m_engine = TRTEngineCache::instance().putIfAbsent( trtModelPath, m_options.deviceIndex, m_runtime, m_engine); m_usingCachedEngine = true; m_cachedEnginePath = trtModelPath; m_cachedGpuIndex = m_options.deviceIndex; } } // end of cache-miss scope (closes the brace opened after cache check) // ============================================================================ // CREATE EXECUTION CONTEXT (both cache-hit and cache-miss paths converge here) // ============================================================================ trt_cache_create_context: // These variables may not exist if we came from cache-hit path (goto skipped them). // Re-derive from the (now valid) m_engine so both paths work. 
{ int numOptProfiles = m_engine->getNbOptimizationProfiles(); bool engineSupportsDynamicBatch = false; int actualMinBatch = 1; int actualMaxBatch = 1; if (numOptProfiles > 0) { for (int i = 0; i < m_engine->getNbIOTensors(); ++i) { const char* tn = m_engine->getIOTensorName(i); if (m_engine->getTensorIOMode(tn) == nvinfer1::TensorIOMode::kINPUT) { auto minDims = m_engine->getProfileShape(tn, 0, nvinfer1::OptProfileSelector::kMIN); auto maxDims = m_engine->getProfileShape(tn, 0, nvinfer1::OptProfileSelector::kMAX); actualMinBatch = minDims.d[0]; actualMaxBatch = maxDims.d[0]; engineSupportsDynamicBatch = (actualMinBatch != actualMaxBatch); break; } } } if (actualMaxBatch > 0 && m_options.maxBatchSize > actualMaxBatch) { m_options.maxBatchSize = actualMaxBatch; } m_context = std::unique_ptr(m_engine->createExecutionContext()); if (!m_context) { ANS_DBG("TRT_Load", "ERROR: createExecutionContext returned null"); logEngineEvent("[Engine] loadNetwork FAIL: createExecutionContext returned null for " + trtModelPath, true); return false; } ANS_DBG("TRT_Load", "Execution context created OK for %s", trtModelPath.c_str()); if (m_verbose) std::cout << "Info: Execution context created successfully" << std::endl; // ============================================================================ // BUFFER ALLOCATION // ============================================================================ if (m_verbose) { std::cout << "========================================" << std::endl; std::cout << "Initializing Buffers" << std::endl; std::cout << "========================================" << std::endl; } clearGpuBuffers(); m_buffers.resize(m_engine->getNbIOTensors()); m_outputLengths.clear(); m_inputDims.clear(); m_outputDims.clear(); m_IOTensorNames.clear(); m_hasDynamicSpatialDims = false; // Check available GPU memory size_t free_mem_initial, total_mem; cudaMemGetInfo(&free_mem_initial, &total_mem); if (m_verbose) { std::cout << "GPU Memory before allocation: Free " << 
free_mem_initial / (1024 * 1024) << " MiB / Total " << total_mem / (1024 * 1024) << " MiB" << std::endl; } size_t totalAllocated = 0; if (m_verbose) { std::cout << "Engine batch configuration:" << std::endl; std::cout << " Dynamic batch: " << (engineSupportsDynamicBatch ? "YES" : "NO") << std::endl; std::cout << " Actual batch range: " << actualMinBatch << " to " << actualMaxBatch << std::endl; std::cout << " Configured max batch size: " << m_options.maxBatchSize << std::endl; std::cout << " Optimization profiles: " << numOptProfiles << std::endl; } // Allocate buffers for all I/O tensors for (int i = 0; i < m_engine->getNbIOTensors(); ++i) { const auto tensorName = m_engine->getIOTensorName(i); m_IOTensorNames.emplace_back(tensorName); const auto tensorType = m_engine->getTensorIOMode(tensorName); const auto tensorShape = m_engine->getTensorShape(tensorName); const auto tensorDataType = m_engine->getTensorDataType(tensorName); if (tensorType == nvinfer1::TensorIOMode::kINPUT) { if (m_verbose) std::cout << "\nInfo: Processing input tensor: " << tensorName << std::endl; // Validate input type if (tensorDataType != nvinfer1::DataType::kFLOAT) { std::cout << "Error: Only float inputs are supported" << std::endl; return false; } // Store input dimensions correctly (C, H, W - excluding batch) m_inputDims.emplace_back(tensorShape.d[1], tensorShape.d[2], tensorShape.d[3]); // Detect dynamic spatial dimensions (e.g., detection models with variable H/W) if (tensorShape.d[2] == -1 || tensorShape.d[3] == -1) { m_hasDynamicSpatialDims = true; } if (m_verbose) std::cout << " Input shape from engine: [" << tensorShape.d[0] << ", " << tensorShape.d[1] << ", " << tensorShape.d[2] << ", " << tensorShape.d[3] << "]" << std::endl; // Calculate buffer size using actual max batch size from engine // Dynamic dimensions (-1) are substituted with the configured max values int32_t batchSize = (tensorShape.d[0] == -1) ? 
actualMaxBatch : tensorShape.d[0]; int32_t channels = tensorShape.d[1]; int32_t height = (tensorShape.d[2] == -1) ? m_options.maxInputHeight : tensorShape.d[2]; int32_t width = (tensorShape.d[3] == -1) ? m_options.maxInputWidth : tensorShape.d[3]; int64_t inputLength = static_cast(batchSize) * channels * height * width; size_t requestedMemory = inputLength * sizeof(float); if (m_verbose) std::cout << " Allocating for max batch size " << batchSize << ": " << requestedMemory / (1024 * 1024) << " MiB" << std::endl; // Allocate GPU memory cudaError_t err = cudaMalloc(&m_buffers[i], requestedMemory); if (err != cudaSuccess) { logEngineEvent("[Engine] loadNetwork FAIL: cudaMalloc input buffer (" + std::to_string(requestedMemory / (1024*1024)) + " MiB): " + cudaGetErrorString(err) + " for " + trtModelPath, true); return false; } // Initialize to zero cudaMemset(m_buffers[i], 0, requestedMemory); totalAllocated += requestedMemory; } else if (tensorType == nvinfer1::TensorIOMode::kOUTPUT) { if (m_verbose) std::cout << "\nInfo: Processing output tensor: " << tensorName << std::endl; // Validate output type matches template parameter if (tensorDataType == nvinfer1::DataType::kFLOAT && !std::is_same::value) { std::cout << "Error: Model output type is float, but template parameter is not float" << std::endl; return false; } else if (tensorDataType == nvinfer1::DataType::kHALF && !std::is_same<__half, T>::value) { std::cout << "Error: Model output type is half, but template parameter is not __half" << std::endl; return false; } else if (tensorDataType == nvinfer1::DataType::kINT32 && !std::is_same::value) { std::cout << "Error: Model output type is int32, but template parameter is not int32_t" << std::endl; return false; } // Calculate output buffer size per batch element int64_t outputLengthPerBatch = 1; m_outputDims.push_back(tensorShape); if (m_verbose) std::cout << " Output shape from engine: [" << tensorShape.d[0]; for (int j = 1; j < tensorShape.nbDims; ++j) { if 
(m_verbose) std::cout << ", " << tensorShape.d[j]; int64_t dimSize = tensorShape.d[j]; if (dimSize <= 0) { // Dynamic output dimension: use max input dims as upper bound if (tensorShape.nbDims == 4) { // NCHW: d[2]=H, d[3]=W dimSize = (j == 2) ? m_options.maxInputHeight : m_options.maxInputWidth; } else { // Generic: use max input width as fallback for dynamic dims dimSize = m_options.maxInputWidth; } if (dimSize <= 0) dimSize = 1; // Safety: avoid zero/negative } outputLengthPerBatch *= dimSize; } if (m_verbose) std::cout << "]" << std::endl; // Store output length per batch element (excluding batch dimension) m_outputLengths.push_back(outputLengthPerBatch); // Allocate for actual max batch size from engine size_t requestedMemory = outputLengthPerBatch * actualMaxBatch * sizeof(T); if (m_verbose) std::cout << " Allocating for max batch size " << actualMaxBatch << ": " << requestedMemory / (1024 * 1024) << " MiB" << std::endl; // Check if enough memory available size_t free_mem, total_mem_check; cudaMemGetInfo(&free_mem, &total_mem_check); if (requestedMemory > free_mem) { std::cout << "Error: Not enough GPU memory" << std::endl; std::cout << " Requested: " << requestedMemory / (1024 * 1024) << " MiB" << std::endl; std::cout << " Available: " << free_mem / (1024 * 1024) << " MiB" << std::endl; return false; } // Allocate GPU memory cudaError_t err = cudaMalloc(&m_buffers[i], requestedMemory); if (err != cudaSuccess) { logEngineEvent("[Engine] loadNetwork FAIL: cudaMalloc output buffer (" + std::to_string(requestedMemory / (1024*1024)) + " MiB): " + cudaGetErrorString(err) + " for " + trtModelPath, true); return false; } // Initialize to zero cudaMemset(m_buffers[i], 0, requestedMemory); totalAllocated += requestedMemory; } else { std::cout << "Error: Tensor is neither input nor output!" 
<< std::endl; return false; } } { size_t vramFree = 0, vramTotal = 0; cudaMemGetInfo(&vramFree, &vramTotal); ANS_DBG("TRT_Load", "Buffers allocated: %zuMB, VRAM: %zuMB used / %zuMB free / %zuMB total", totalAllocated / (1024*1024), (vramTotal - vramFree) / (1024*1024), vramFree / (1024*1024), vramTotal / (1024*1024)); } if (m_verbose) std::cout << "\nInfo: Total GPU memory allocated: " << totalAllocated / (1024 * 1024) << " MiB" << std::endl; // -- Pinned output buffers (CUDA graph prerequisite) ----------------------- // Invalidate any graphs captured by a previous loadNetwork() call on this instance. for (auto& [bs, ge] : m_graphExecs) { if (ge) cudaGraphExecDestroy(ge); } m_graphExecs.clear(); // Free any previously allocated pinned buffers. for (T* p : m_pinnedOutputBuffers) { if (p) cudaFreeHost(p); } m_pinnedOutputBuffers.clear(); m_pinnedOutputBufElems.clear(); // Allocate one flat pinned buffer per output tensor, sized for // actualMaxBatch x outputLength elements. Stable host addresses enable // CUDA graph capture of D2H copies. If any allocation fails, disable // graph acceleration gracefully and fall back to the original code path. // // Previously disabled for OpenCV 4.13+ because cv::cuda::split on the null // stream threw cudaErrorStreamCaptureUnsupported (-217). Now safe because // blobFromGpuMats runs on m_inferenceStream and finishes BEFORE graph capture. m_pinnedOutputBuffers.resize(m_outputLengths.size(), nullptr); m_pinnedOutputBufElems.resize(m_outputLengths.size(), 0); bool pinnedOk = true; for (size_t i = 0; i < m_outputLengths.size(); ++i) { const size_t nElems = static_cast(m_outputLengths[i]) * static_cast(actualMaxBatch); if (cudaMallocHost(reinterpret_cast(&m_pinnedOutputBuffers[i]), nElems * sizeof(T)) != cudaSuccess) { pinnedOk = false; break; } m_pinnedOutputBufElems[i] = nElems; } if (!pinnedOk) { std::cout << "Warning: cudaMallocHost failed -- CUDA graph acceleration disabled." 
<< std::endl; for (T* p : m_pinnedOutputBuffers) { if (p) cudaFreeHost(p); } m_pinnedOutputBuffers.clear(); m_pinnedOutputBufElems.clear(); } else { if (m_verbose) std::cout << "Info: Pinned output buffers allocated -- CUDA graph acceleration enabled." << std::endl; } // Check final memory state size_t free_mem_final, total_mem_final; cudaMemGetInfo(&free_mem_final, &total_mem_final); if (m_verbose) std::cout << "GPU Memory after allocation: Free " << free_mem_final / (1024 * 1024) << " MiB / Total " << total_mem_final / (1024 * 1024) << " MiB" << std::endl; // Ensure all pending GPU operations (cudaMalloc, memcpy, etc.) complete // before we begin inference on this engine. cudaDeviceSynchronize(); // ============================================================================ // CONTEXT OPTIMIZATION // ============================================================================ if (m_verbose) { std::cout << "========================================" << std::endl; std::cout << "Context Optimization" << std::endl; std::cout << "========================================" << std::endl; } // Create temporary stream for context setup cudaStream_t setupStream; cudaStreamCreate(&setupStream); // Check and set optimization profile if (m_verbose) std::cout << "Info: Engine has " << numOptProfiles << " optimization profile(s)" << std::endl; if (numOptProfiles > 0) { int selectedProfile = 0; if (m_verbose) std::cout << "Info: Using optimization profile " << selectedProfile << " (actual range: batch " << actualMinBatch << " to " << actualMaxBatch << ")" << std::endl; // Set optimization profile FIRST bool profileSet = m_context->setOptimizationProfileAsync(selectedProfile, setupStream); if (!profileSet) { std::cout << "Error: Failed to set optimization profile" << std::endl; cudaStreamDestroy(setupStream); return false; } // Wait for profile to be set cudaStreamSynchronize(setupStream); if (m_verbose) std::cout << "Info: Optimization profile " << selectedProfile << " set 
successfully" << std::endl; } // Set input shapes and bind buffers for (int i = 0; i < m_engine->getNbIOTensors(); ++i) { const auto tensorName = m_engine->getIOTensorName(i); const auto tensorMode = m_engine->getTensorIOMode(tensorName); // Set tensor address for both input and output if (!m_context->setTensorAddress(tensorName, m_buffers[i])) { std::cout << "Error: Failed to set tensor address for " << tensorName << std::endl; cudaStreamDestroy(setupStream); return false; } if (tensorMode == nvinfer1::TensorIOMode::kINPUT) { auto dims = m_engine->getTensorShape(tensorName); if (m_verbose) { std::cout << "Info: Input tensor '" << tensorName << "' engine shape: ["; for (int j = 0; j < dims.nbDims; ++j) { if (j > 0) std::cout << ", "; std::cout << dims.d[j]; } std::cout << "]" << std::endl; } // For dynamic batch engines, set shape to minimum for initialization if (dims.d[0] == -1 || numOptProfiles > 0) { nvinfer1::Dims inputDims = dims; inputDims.d[0] = actualMinBatch; // Use actual min from engine // Set height if dynamic if (inputDims.d[2] == -1) { inputDims.d[2] = m_options.optInputHeight; } // Set width if dynamic if (inputDims.d[3] == -1) { inputDims.d[3] = m_options.optInputWidth; } if (!m_context->setInputShape(tensorName, inputDims)) { std::cout << "Error: Failed to set input shape for " << tensorName << std::endl; cudaStreamDestroy(setupStream); return false; } if (m_verbose) { std::cout << "Info: Set initial input shape to [" << inputDims.d[0] << ", " << inputDims.d[1] << ", " << inputDims.d[2] << ", " << inputDims.d[3] << "] (for warmup)" << std::endl; std::cout << " Actual batch size will be set at inference time" << std::endl; } } } } // Verify all dimensions are specified if (!m_context->allInputDimensionsSpecified()) { std::cout << "Error: Not all input dimensions specified after setup" << std::endl; // Debug: Show which dimensions are missing for (int i = 0; i < m_engine->getNbIOTensors(); ++i) { const auto tensorName = m_engine->getIOTensorName(i); 
if (m_engine->getTensorIOMode(tensorName) == nvinfer1::TensorIOMode::kINPUT) { auto dims = m_context->getTensorShape(tensorName); std::cout << " " << tensorName << " shape: ["; for (int j = 0; j < dims.nbDims; ++j) { if (j > 0) std::cout << ", "; std::cout << dims.d[j]; } std::cout << "]" << std::endl; } } cudaStreamDestroy(setupStream); return false; } if (m_verbose) { std::cout << "Info: All input dimensions specified correctly" << std::endl; std::cout << "Info: All tensor addresses bound successfully" << std::endl; } // Disable profiling for production m_context->setEnqueueEmitsProfile(false); if (m_verbose) std::cout << "Info: Enqueue profile emissions disabled (production mode)" << std::endl; // Clean up setup stream cudaStreamSynchronize(setupStream); cudaStreamDestroy(setupStream); // ============================================================================ // CREATE PERSISTENT INFERENCE AND MEMORY STREAMS // ============================================================================ // Creating streams here (once, at load time) rather than lazily in // runInference() removes the hot-path "if (!m_streamInitialized)" branch // and ensures warmUp() already runs on the real inference stream. 
if (!m_streamInitialized) { int leastPriority, greatestPriority; cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority); cudaError_t streamErr = cudaStreamCreateWithPriority( &m_inferenceStream, cudaStreamNonBlocking, greatestPriority); if (streamErr != cudaSuccess) { std::cout << "Error: Failed to create inference stream: " << cudaGetErrorString(streamErr) << std::endl; return false; } streamErr = cudaStreamCreate(&m_memoryStream); if (streamErr != cudaSuccess) { std::cout << "Error: Failed to create memory stream: " << cudaGetErrorString(streamErr) << std::endl; return false; } m_streamInitialized = true; if (m_verbose) { std::cout << "Info: Inference stream created at load time with highest priority" << std::endl; std::cout << "Info: Memory stream created" << std::endl; } } // ============================================================================ // PRE-WARMUP DIAGNOSTICS // ============================================================================ if (m_verbose) { std::cout << "\n========================================" << std::endl; std::cout << "Pre-Warmup Diagnostics" << std::endl; std::cout << "========================================" << std::endl; std::cout << "Engine has " << m_engine->getNbIOTensors() << " I/O tensors" << std::endl; std::cout << "Engine has " << m_engine->getNbOptimizationProfiles() << " optimization profiles" << std::endl; for (int i = 0; i < m_engine->getNbIOTensors(); ++i) { const auto tensorName = m_engine->getIOTensorName(i); const auto tensorMode = m_engine->getTensorIOMode(tensorName); const auto tensorShape = m_context->getTensorShape(tensorName); std::cout << "\nTensor " << i << ": " << tensorName << std::endl; std::cout << " Mode: " << (tensorMode == nvinfer1::TensorIOMode::kINPUT ? 
"INPUT" : "OUTPUT") << std::endl; std::cout << " Shape: ["; for (int j = 0; j < tensorShape.nbDims; ++j) { if (j > 0) std::cout << ", "; std::cout << tensorShape.d[j]; } std::cout << "]" << std::endl; std::cout << " Buffer address: " << m_buffers[i] << std::endl; } std::cout << "\nContext state check:" << std::endl; std::cout << " All dimensions specified: " << (m_context->allInputDimensionsSpecified() ? "YES" : "NO") << std::endl; std::cout << "========================================" << std::endl; } if (!m_context->allInputDimensionsSpecified()) { std::cout << "ERROR: Cannot proceed with warmup - dimensions not specified!" << std::endl; return false; } // ============================================================================ // ENGINE LOADED SUCCESSFULLY // ============================================================================ if (m_verbose) { std::cout << "\n========================================" << std::endl; std::cout << "Engine loaded successfully!" << std::endl; std::cout << "========================================" << std::endl; } // ============================================================================ // WARMUP // ============================================================================ if (m_verbose) std::cout << "\nInfo: Starting warm-up inference..." << std::endl; warmUp(m_verbose ? 
10 : 1); if (m_verbose) std::cout << "Info: Warm-up complete" << std::endl; } // end of trt_cache_create_context scope return true; } template bool Engine::build(std::string onnxModelPath, const std::array& subVals, const std::array& divVals, bool normalize) { const auto engineName = serializeEngineOptions(m_options, onnxModelPath); if (FileExist(engineName)) { std::cout << "Engine file already exists: " << engineName << std::endl; return true; } if (!FileExist(onnxModelPath)) { std::cout << "Error: ONNX model file does not exist: " << onnxModelPath << std::endl; return false; } std::cout << "========================================" << std::endl; std::cout << "Building TensorRT Engine" << std::endl; std::cout << "========================================" << std::endl; std::cout << "TensorRT Version: " << NV_TENSORRT_MAJOR << "." << NV_TENSORRT_MINOR << "." << NV_TENSORRT_PATCH << std::endl; // TensorRT 10+ detection #if NV_TENSORRT_MAJOR >= 10 std::cout << "\n⚠️ TensorRT " << NV_TENSORRT_MAJOR << "." << NV_TENSORRT_MINOR << " detected - will apply dynamic batch optimization flags" << std::endl; #endif // Create our engine builder. auto builder = std::unique_ptr(nvinfer1::createInferBuilder(m_logger)); if (!builder) { std::cout << "Error: Failed to create builder" << std::endl; return false; } auto network = std::unique_ptr(TRT_CREATE_NETWORK(builder)); if (!network) { std::cout << "Error: Failed to create network" << std::endl; return false; } // Create a parser for reading the onnx file. 
auto parser = std::unique_ptr(nvonnxparser::createParser(*network, m_logger)); if (!parser) { std::cout << "Error: Failed to create parser" << std::endl; return false; } // Read the onnx file into memory std::ifstream file(onnxModelPath, std::ios::binary | std::ios::ate); std::streamsize size = file.tellg(); file.seekg(0, std::ios::beg); std::vector buffer(size); if (!file.read(buffer.data(), size)) { std::cout << "Error: Unable to read ONNX file" << std::endl; return false; } std::cout << "ONNX model size: " << size / (1024 * 1024) << " MiB" << std::endl; // Parse the buffer we read into memory (crash-safe). std::cout << "Parsing ONNX model..." << std::endl; unsigned long sehCodeParse = 0; auto parsed = parseOnnxModelSafe(parser.get(), buffer.data(), buffer.size(), &sehCodeParse); if (sehCodeParse != 0) { std::cout << "[Engine] FATAL: ONNX parser crashed (" << formatCrashCode(sehCodeParse) << ")" << std::endl; std::cout << "[Engine] This may indicate a corrupt ONNX file or driver issue." << std::endl; return false; } if (!parsed) { std::cout << "Error: Failed to parse ONNX model" << std::endl; for (int32_t i = 0; i < parser->getNbErrors(); ++i) { std::cout << " " << parser->getError(i)->desc() << std::endl; } return false; } std::cout << "ONNX model parsed successfully" << std::endl; // ============================================================================ // ENHANCED ONNX MODEL ANALYSIS // ============================================================================ std::cout << "\n========================================" << std::endl; std::cout << "ONNX Model Analysis" << std::endl; std::cout << "========================================" << std::endl; const auto numInputs = network->getNbInputs(); if (numInputs < 1) { std::cout << "Error: Model needs at least 1 input!" 
<< std::endl; return false; } std::cout << "Number of inputs: " << numInputs << std::endl; // Analyze all inputs for (int32_t i = 0; i < numInputs; ++i) { const auto input = network->getInput(i); const auto inputDims = input->getDimensions(); std::cout << "\nInput [" << i << "] '" << input->getName() << "':" << std::endl; std::cout << " Dimensions: ["; for (int j = 0; j < inputDims.nbDims; ++j) { if (j > 0) std::cout << ", "; if (inputDims.d[j] == -1) { std::cout << "DYNAMIC"; } else { std::cout << inputDims.d[j]; } } std::cout << "]" << std::endl; // Check batch dimension if (inputDims.d[0] == -1) { std::cout << " ✓ Batch dimension: DYNAMIC" << std::endl; } else { std::cout << " ✗ Batch dimension: FIXED at " << inputDims.d[0] << std::endl; } // Check height dimension (if applicable) if (inputDims.nbDims >= 3 && inputDims.d[2] == -1) { std::cout << " ✓ Height dimension: DYNAMIC" << std::endl; } else if (inputDims.nbDims >= 3) { std::cout << " • Height dimension: FIXED at " << inputDims.d[2] << std::endl; } // Check width dimension (if applicable) if (inputDims.nbDims >= 4 && inputDims.d[3] == -1) { std::cout << " ✓ Width dimension: DYNAMIC" << std::endl; } else if (inputDims.nbDims >= 4) { std::cout << " • Width dimension: FIXED at " << inputDims.d[3] << std::endl; } } // Ensure that all the inputs have the same batch size const auto input0Batch = network->getInput(0)->getDimensions().d[0]; for (int32_t i = 1; i < numInputs; ++i) { if (network->getInput(i)->getDimensions().d[0] != input0Batch) { std::cout << "\nError: Model has multiple inputs with differing batch sizes!" 
<< std::endl; return false; } } // Check to see if the model supports dynamic batch size or not bool doesSupportDynamicBatch = false; if (input0Batch == -1) { doesSupportDynamicBatch = true; std::cout << "\n✓ Model supports DYNAMIC batch size" << std::endl; std::cout << " Batch size range: min=1, opt=" << m_options.optBatchSize << ", max=" << m_options.maxBatchSize << std::endl; } else { std::cout << "\n✗ Model only supports FIXED batch size of " << input0Batch << std::endl; std::cout << " WARNING: This will limit batch processing performance!" << std::endl; std::cout << " Consider re-exporting ONNX with dynamic batch axis." << std::endl; // Adjust batch size options to match model's fixed batch size if (m_options.optBatchSize != input0Batch || m_options.maxBatchSize != input0Batch) { std::cout << " Adjusting batch size options to match model's fixed batch size" << std::endl; m_options.optBatchSize = input0Batch; m_options.maxBatchSize = input0Batch; } } // Check for dynamic width and height dimensions const auto inputHeight = network->getInput(0)->getDimensions().d[2]; const auto inputWidth = network->getInput(0)->getDimensions().d[3]; bool doesSupportDynamicHeight = false; bool doesSupportDynamicWidth = false; // Check height dimension if (inputHeight == -1) { doesSupportDynamicHeight = true; std::cout << "\n✓ Model supports DYNAMIC height" << std::endl; if (m_options.optInputHeight == -1) { std::cout << " No user-configured height found, using default: 640" << std::endl; m_options.minInputHeight = 640; m_options.optInputHeight = 640; m_options.maxInputHeight = 640; } else { std::cout << " Using user-configured height: " << m_options.optInputHeight << std::endl; } } else { std::cout << "\n• Model has FIXED height: " << inputHeight << std::endl; m_options.minInputHeight = m_options.optInputHeight = m_options.maxInputHeight = inputHeight; } // Check width dimension if (inputWidth == -1) { doesSupportDynamicWidth = true; std::cout << "✓ Model supports DYNAMIC width" 
<< std::endl; if (m_options.optInputWidth == -1) { std::cout << " No user-configured width found, using default: 640" << std::endl; m_options.minInputWidth = 640; m_options.optInputWidth = 640; m_options.maxInputWidth = 640; } else { std::cout << " Using user-configured width: " << m_options.optInputWidth << std::endl; } } else { std::cout << "• Model has FIXED width: " << inputWidth << std::endl; m_options.minInputWidth = m_options.optInputWidth = m_options.maxInputWidth = inputWidth; } std::cout << "\nFinal input dimensions configured:" << std::endl; std::cout << " Height: " << m_options.optInputHeight << std::endl; std::cout << " Width: " << m_options.optInputWidth << std::endl; std::cout << "========================================" << std::endl; auto config = std::unique_ptr(builder->createBuilderConfig()); if (!config) { std::cout << "Error: Failed to create builder config" << std::endl; return false; } // ============================================================================ // PERFORMANCE OPTIMIZATIONS // ============================================================================ std::cout << "\n========================================" << std::endl; std::cout << "Configuring Performance Settings" << std::endl; std::cout << "========================================" << std::endl; // Get GPU properties for the target device (not always GPU 0) cudaDeviceProp prop; cudaGetDeviceProperties(&prop, m_options.deviceIndex); std::cout << "Building engine for GPU " << m_options.deviceIndex << ": " << prop.name << std::endl; std::cout << "Compute Capability: " << prop.major << "." 
<< prop.minor << std::endl; std::cout << "Total GPU Memory: " << prop.totalGlobalMem / (1024 * 1024) << " MiB" << std::endl; size_t free_mem, total_mem; cudaMemGetInfo(&free_mem, &total_mem); const size_t totalMiB = total_mem / (1024ULL * 1024); // -- GPU-tier adaptive configuration -------------------------------------- // All performance parameters scale with GPU VRAM to avoid OOM on small // GPUs while maximising throughput on larger ones. // // VRAM | Workspace | Opt Level | Max Batch | Tactic DRAM // ------------|-----------|-----------|-----------|------------------- // <= 1 GiB | 256 MiB | 3 | 1 | up to 2 GiB cap // <= 2 GiB | 512 MiB | 3 | 2 | up to 2 GiB cap // <= 4 GiB | 1 GiB | 3 | 4 | up to 2 GiB cap // <= 6 GiB | 2 GiB | 3 | 8 | up to 2 GiB cap // <= 8 GiB | 2 GiB | 3 | 16 | up to 2 GiB cap // <=12 GiB | 2 GiB | 3 | 16 | up to 2 GiB cap // <=16 GiB | 8 GiB | 5 | 32 | up to 4 GiB cap // <=24 GiB | 8 GiB | 5 | 32 | up to 4 GiB cap // > 24 GiB | 16 GiB | 5 | 32 | up to 8 GiB cap // -- 1. 
Workspace size ---------------------------------------------------- size_t max_workspace; const char* tierLabel; if (totalMiB > 24576) { // > 24 GiB max_workspace = 16ULL * 1024 * 1024 * 1024; tierLabel = "high-end (>24 GiB)"; } else if (totalMiB > 12288) { // > 12 GiB max_workspace = 8ULL * 1024 * 1024 * 1024; tierLabel = "desktop (>12 GiB)"; } else if (totalMiB > 4096) { // > 4 GiB max_workspace = 2ULL * 1024 * 1024 * 1024; tierLabel = "laptop (4-12 GiB)"; } else if (totalMiB > 2048) { // > 2 GiB max_workspace = 1ULL * 1024 * 1024 * 1024; tierLabel = "low-end (2-4 GiB)"; } else if (totalMiB > 1024) { // > 1 GiB max_workspace = 512ULL * 1024 * 1024; tierLabel = "minimal (1-2 GiB)"; } else { // <= 1 GiB max_workspace = 256ULL * 1024 * 1024; tierLabel = "ultra-low (<=1 GiB)"; } size_t workspace_size = std::min(max_workspace, static_cast(free_mem * 0.4)); config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, workspace_size); std::cout << "Workspace size set to: " << workspace_size / (1024 * 1024) << " MiB (" << tierLabel << " tier)" << std::endl; // -- 2. Max batch size cap ------------------------------------------------ // The model config sets the *desired* maxBatchSize; the GPU VRAM // determines the *actual* cap. This affects the optimisation profile // range, warmup, and runtime chunk splitting. // Thresholds use ~97% of marketing size to account for OS/driver reserved // memory (e.g. an "8 GB" GPU reports 8187 MiB). if (doesSupportDynamicBatch && m_options.maxBatchSize > 1) { int gpuMaxBatch; if (totalMiB >= 15800) gpuMaxBatch = 32; // ~16 GiB (e.g. 16384 -> reports ~15900+) else if (totalMiB >= 11800) gpuMaxBatch = 16; // ~12 GiB (e.g. 12288 -> reports ~11800+) else if (totalMiB >= 7900) gpuMaxBatch = 8; // ~ 8 GiB (e.g. 8192 -> reports ~8100+; batch=16 OCR ~987 MiB too large for 4 tasks) else if (totalMiB >= 3900) gpuMaxBatch = 4; // ~ 4 GiB (e.g. 4096 -> reports ~3950+) else if (totalMiB >= 1900) gpuMaxBatch = 2; // ~ 2 GiB (e.g. 
2048 -> reports ~1950+) else gpuMaxBatch = 1; // < 2 GiB const int prevMax = m_options.maxBatchSize; m_options.maxBatchSize = std::min(m_options.maxBatchSize, gpuMaxBatch); m_options.optBatchSize = std::min(m_options.optBatchSize, m_options.maxBatchSize); if (prevMax != m_options.maxBatchSize) { std::cout << "Max batch size capped by GPU VRAM: " << prevMax << " -> " << m_options.maxBatchSize << " (GPU has " << totalMiB << " MiB)" << std::endl; } std::cout << "Batch config: opt=" << m_options.optBatchSize << ", max=" << m_options.maxBatchSize << std::endl; } // -- 3. Optimisation level ------------------------------------------------ // Level 5 (exhaustive kernel search) only on GPUs with ≥16 GiB where // the tactic DRAM pool can hold the largest tactics. On smaller GPUs, // level 3 gives ~95 % of the runtime performance with dramatically // shorter build times. // Level 3 = balanced (best tradeoff: fast build, near-optimal kernels) // Level 5 = exhaustive (10x slower build for ~1-3% faster inference) // Use level 3 for all GPUs — the marginal runtime gain from level 5 // is not worth the 10-30 minute build time on first run. const int optLevel = 3; config->setBuilderOptimizationLevel(optLevel); std::cout << "Builder optimization level set to " << optLevel << " (balanced)" << std::endl; // Enable TF32 for Ampere and newer GPUs if (prop.major >= 8) { config->setFlag(nvinfer1::BuilderFlag::kTF32); std::cout << "TF32 enabled for Ampere/Ada/Blackwell architecture" << std::endl; } // Enable optimization flags // kPREFER_PRECISION_CONSTRAINTS removed: deprecated in TRT 10.12, no-op in TRT 10.15.1. config->setFlag(nvinfer1::BuilderFlag::kGPU_FALLBACK); std::cout << "Optimization flags enabled" << std::endl; // kDIRECT_IO removed: deprecated in TRT 10.7 as "Unneeded API". // TRT 10.7+ enables this behaviour automatically; the flag is a no-op in TRT 10.15.1. 
// Enable all available tactic sources uint32_t tacticSources = 1U << static_cast(nvinfer1::TacticSource::kCUBLAS) | 1U << static_cast(nvinfer1::TacticSource::kCUBLAS_LT) | 1U << static_cast(nvinfer1::TacticSource::kCUDNN); if (prop.major >= 8) { tacticSources |= 1U << static_cast(nvinfer1::TacticSource::kEDGE_MASK_CONVOLUTIONS); tacticSources |= 1U << static_cast(nvinfer1::TacticSource::kJIT_CONVOLUTIONS); std::cout << "Enhanced tactic sources enabled for Ampere+ architecture" << std::endl; } config->setTacticSources(tacticSources); // kDETAILED profiling embeds per-layer metadata in the engine and adds measurable // build/inference overhead. Use kNONE for production; switch to kDETAILED or // kLAYER_NAMES_ONLY only when profiling with Nsight Systems / trt-exec --profilingVerbosity. config->setProfilingVerbosity(nvinfer1::ProfilingVerbosity::kNONE); // Set timing iterations config->setAvgTimingIterations(4); std::cout << "Timing iterations set to 4 for stable kernel selection" << std::endl; // Set hardware compatibility config->setHardwareCompatibilityLevel(nvinfer1::HardwareCompatibilityLevel::kNONE); // -- TensorRT 10+ tactic DRAM pool ---------------------------------------- // Separate scratch pool for kernel-selection during build. Without this, // tactic evaluation competes with the workspace allocation and tactics // requesting >1 GiB get skipped, causing hours of wasted fallback searches. // // Strategy: give the tactic pool as much memory as possible while reserving // enough for workspace + builder overhead. 
The cap scales with GPU VRAM: // <=12 GiB -> up to 2 GiB (most tactics fit within 1.5 GiB) // <=24 GiB -> up to 4 GiB (room for larger model tactics) // > 24 GiB -> up to 8 GiB (future-proof for very large models) #if NV_TENSORRT_MAJOR >= 10 { // Scale the tactic cap by GPU VRAM -- larger GPUs can afford more size_t tacticCap; if (totalMiB > 24576) tacticCap = 8ULL * 1024 * 1024 * 1024; // > 24 GiB else if (totalMiB > 12288) tacticCap = 4ULL * 1024 * 1024 * 1024; // > 12 GiB else tacticCap = 2ULL * 1024 * 1024 * 1024; // <= 12 GiB // Reserve workspace + 512 MiB safety margin for builder internals const size_t reserveForBuild = workspace_size + (512ULL * 1024 * 1024); const size_t availableForTactic = (free_mem > reserveForBuild) ? (free_mem - reserveForBuild) : 0ULL; size_t tacticMemory = std::min(tacticCap, availableForTactic); // kTACTIC_DRAM requires a power-of-2 size; floor to nearest power of 2 if (tacticMemory > 0) { size_t p = 1ULL; while (p * 2 <= tacticMemory) p *= 2; tacticMemory = p; } config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kTACTIC_DRAM, tacticMemory); std::cout << "kTACTIC_DRAM pool: " << tacticMemory / (1024 * 1024) << " MiB (TRT 10+)" << std::endl; } #endif // -- kSTRONGLY_TYPED (TRT 8.5 - 9.x only) -------------------------------- // This flag existed in TRT 8.5 through 9.x to opt into strict type // enforcement. NVIDIA removed the enum in TRT 10.0 because strongly-typed // networks became the default behaviour -- setting it on TRT 10+ produces a // compile error ("undeclared identifier"). For TRT 10+ simply log a note. #if NV_TENSORRT_MAJOR < 10 if (m_options.precision != ANSCENTER::Precision::INT8) { config->setFlag(nvinfer1::BuilderFlag::kSTRONGLY_TYPED); std::cout << "kSTRONGLY_TYPED enabled (TRT 8.5-9.x, FP32/FP16 mode)" << std::endl; } #else // TRT 10+: strongly-typed networks are the default; no flag required. 
std::cout << "Info: Strongly-typed mode is default in TRT 10+ (kSTRONGLY_TYPED removed)" << std::endl; #endif // -- kFASTER_DYNAMIC_SHAPES ------------------------------------------------ // This flag reduces context-reshape overhead when batch size changes between // calls (10-100x faster switching, ~5% larger engine). It was added in a // TRT 10 minor release but the exact version varies by NVIDIA build; the // enum is absent from the installed headers so it is disabled here. // To re-enable: uncomment the block below once you confirm your TRT version // exposes nvinfer1::BuilderFlag::kFASTER_DYNAMIC_SHAPES. // // if (doesSupportDynamicBatch && m_options.maxBatchSize > 1) { // config->setFlag(nvinfer1::BuilderFlag::kFASTER_DYNAMIC_SHAPES); // std::cout << "kFASTER_DYNAMIC_SHAPES enabled" << std::endl; // } // -- kWEIGHT_STREAMING (TRT 10+) ------------------------------------------ // DISABLED: kWEIGHT_STREAMING requires INetworkDefinition::setStronglyTyped(true) // to be called on the network before buildSerializedNetwork(), which is not done // for ONNX-imported networks in this code path. BuilderFlag::kSTRONGLY_TYPED was // removed from TRT 10+ (compile error), so there is no flag-level workaround. // Re-enable only if the ONNX parser layer is updated to call setStronglyTyped(true). // #if NV_TENSORRT_MAJOR >= 10 // config->setFlag(nvinfer1::BuilderFlag::kWEIGHT_STREAMING); // std::cout << "kWEIGHT_STREAMING enabled (TRT 10+)" << std::endl; // #endif // ============================================================================ // TENSORRT 10+ DYNAMIC BATCH SUMMARY // ============================================================================ #if NV_TENSORRT_MAJOR >= 10 std::cout << "\nTensorRT " << NV_TENSORRT_MAJOR << "." << NV_TENSORRT_MINOR << "." << NV_TENSORRT_PATCH << " | dynamic batch: " << (doesSupportDynamicBatch && m_options.maxBatchSize > 1 ? 
"YES" : "NO") << " | max batch: " << m_options.maxBatchSize << " | opt level: " << optLevel << " | GPU VRAM: " << totalMiB << " MiB" << std::endl; #endif // Load timing cache if available (use actual engine name -- batch may have been capped above) const auto currentEngineName = serializeEngineOptions(m_options, onnxModelPath); std::string timingCachePath = currentEngineName + ".timing.cache"; std::vector timingCache; std::ifstream timingCacheFile(timingCachePath, std::ios::binary); if (timingCacheFile.good()) { timingCacheFile.seekg(0, std::ios::end); timingCache.resize(timingCacheFile.tellg()); timingCacheFile.seekg(0, std::ios::beg); timingCacheFile.read(timingCache.data(), timingCache.size()); auto cache = config->createTimingCache(timingCache.data(), timingCache.size()); if (cache) { config->setTimingCache(*cache, false); std::cout << "Loaded timing cache from: " << timingCachePath << std::endl; std::cout << " Cache size: " << timingCache.size() / 1024 << " KiB" << std::endl; } } else { std::cout << "No existing timing cache found (this is normal for first build)" << std::endl; } // ============================================================================ // OPTIMIZATION PROFILE CONFIGURATION // ============================================================================ std::cout << "\n========================================" << std::endl; std::cout << "Configuring Optimization Profiles" << std::endl; std::cout << "========================================" << std::endl; // Validate batch size options if (doesSupportDynamicBatch) { if (m_options.optBatchSize < 1) { std::cout << "Warning: optBatchSize < 1, setting to 1" << std::endl; m_options.optBatchSize = 1; } if (m_options.maxBatchSize < m_options.optBatchSize) { std::cout << "Warning: maxBatchSize < optBatchSize, adjusting maxBatchSize" << std::endl; m_options.maxBatchSize = m_options.optBatchSize; } std::cout << "Dynamic batch configuration validated:" << std::endl; std::cout << " Min batch size: 1" 
<< std::endl; std::cout << " Opt batch size: " << m_options.optBatchSize << std::endl; std::cout << " Max batch size: " << m_options.maxBatchSize << std::endl; } // Create optimization profile nvinfer1::IOptimizationProfile* optProfile = builder->createOptimizationProfile(); if (!optProfile) { std::cout << "Error: Failed to create optimization profile" << std::endl; return false; } for (int32_t i = 0; i < numInputs; ++i) { const auto input = network->getInput(i); const auto inputName = input->getName(); const auto inputDims = input->getDimensions(); int32_t inputC = inputDims.d[1]; int32_t inputH = inputDims.d[2]; int32_t inputW = inputDims.d[3]; // Use configured values for height int32_t minInputHeight = doesSupportDynamicHeight ? m_options.minInputHeight : inputH; int32_t optInputHeight = doesSupportDynamicHeight ? m_options.optInputHeight : inputH; int32_t maxInputHeight = doesSupportDynamicHeight ? m_options.maxInputHeight : inputH; // Use configured values for width int32_t minInputWidth = doesSupportDynamicWidth ? m_options.minInputWidth : inputW; int32_t optInputWidth = doesSupportDynamicWidth ? m_options.optInputWidth : inputW; int32_t maxInputWidth = doesSupportDynamicWidth ? m_options.maxInputWidth : inputW; // Create dimension objects int32_t minBatch = doesSupportDynamicBatch ? 1 : m_options.optBatchSize; int32_t optBatch = doesSupportDynamicBatch ? m_options.optBatchSize : m_options.optBatchSize; int32_t maxBatch = doesSupportDynamicBatch ? 
m_options.maxBatchSize : m_options.maxBatchSize; nvinfer1::Dims4 minDims(minBatch, inputC, minInputHeight, minInputWidth); nvinfer1::Dims4 optDims(optBatch, inputC, optInputHeight, optInputWidth); nvinfer1::Dims4 maxDims(maxBatch, inputC, maxInputHeight, maxInputWidth); std::cout << "\nSetting profile for input '" << inputName << "':" << std::endl; std::cout << " MIN: [" << minDims.d[0] << "," << minDims.d[1] << "," << minDims.d[2] << "," << minDims.d[3] << "]" << std::endl; std::cout << " OPT: [" << optDims.d[0] << "," << optDims.d[1] << "," << optDims.d[2] << "," << optDims.d[3] << "]" << std::endl; std::cout << " MAX: [" << maxDims.d[0] << "," << maxDims.d[1] << "," << maxDims.d[2] << "," << maxDims.d[3] << "]" << std::endl; // Set the dimensions with error checking bool minSet = optProfile->setDimensions(inputName, nvinfer1::OptProfileSelector::kMIN, minDims); bool optSet = optProfile->setDimensions(inputName, nvinfer1::OptProfileSelector::kOPT, optDims); bool maxSet = optProfile->setDimensions(inputName, nvinfer1::OptProfileSelector::kMAX, maxDims); if (!minSet || !optSet || !maxSet) { std::cout << " ✗ ERROR: Failed to set profile dimensions!" << std::endl; std::cout << " minSet: " << (minSet ? "OK" : "FAILED") << std::endl; std::cout << " optSet: " << (optSet ? "OK" : "FAILED") << std::endl; std::cout << " maxSet: " << (maxSet ? "OK" : "FAILED") << std::endl; return false; } std::cout << " ✓ Profile dimensions set successfully" << std::endl; } // Validate the profile std::cout << "\n========================================" << std::endl; std::cout << "VALIDATING OPTIMIZATION PROFILE" << std::endl; std::cout << "========================================" << std::endl; bool profileValid = optProfile->isValid(); std::cout << "Profile validation result: " << (profileValid ? "✓ VALID" : "✗ INVALID") << std::endl; if (!profileValid) { std::cout << "ERROR: Profile is invalid! Cannot continue." 
<< std::endl; std::cout << "This usually means the min/opt/max dimensions are inconsistent." << std::endl; return false; } // Verify what we actually set for (int32_t i = 0; i < numInputs; ++i) { const auto input = network->getInput(i); const auto inputName = input->getName(); auto minDims = optProfile->getDimensions(inputName, nvinfer1::OptProfileSelector::kMIN); auto optDims = optProfile->getDimensions(inputName, nvinfer1::OptProfileSelector::kOPT); auto maxDims = optProfile->getDimensions(inputName, nvinfer1::OptProfileSelector::kMAX); std::cout << "\nVerified profile for input '" << inputName << "':" << std::endl; std::cout << " MIN: [" << minDims.d[0] << "," << minDims.d[1] << "," << minDims.d[2] << "," << minDims.d[3] << "]" << std::endl; std::cout << " OPT: [" << optDims.d[0] << "," << optDims.d[1] << "," << optDims.d[2] << "," << optDims.d[3] << "]" << std::endl; std::cout << " MAX: [" << maxDims.d[0] << "," << maxDims.d[1] << "," << maxDims.d[2] << "," << maxDims.d[3] << "]" << std::endl; // Check batch dimension range if (minDims.d[0] != maxDims.d[0]) { std::cout << " ✓ Profile IS DYNAMIC (batch " << minDims.d[0] << " to " << maxDims.d[0] << ")" << std::endl; } else { std::cout << " • Profile IS FIXED at batch " << minDims.d[0] << std::endl; if (doesSupportDynamicBatch && m_options.maxBatchSize > 1) { std::cout << "\n🚨 CRITICAL ERROR: ONNX supports dynamic batch but profile is fixed!" 
<< std::endl; return false; } } } std::cout << "========================================" << std::endl; // Add the validated profile config->addOptimizationProfile(optProfile); int32_t numProfiles = config->getNbOptimizationProfiles(); std::cout << "\n✓ Optimization profile added successfully" << std::endl; std::cout << " Total profiles in config: " << numProfiles << std::endl; if (doesSupportDynamicBatch && m_options.maxBatchSize > 1) { std::cout << " ✓ Profile covers DYNAMIC batch range: 1 to " << m_options.maxBatchSize << std::endl; } else { std::cout << " • Profile has FIXED batch size: " << m_options.maxBatchSize << std::endl; } // ============================================================================ // PRECISION CONFIGURATION // ============================================================================ std::cout << "\n========================================" << std::endl; std::cout << "Configuring Precision" << std::endl; std::cout << "========================================" << std::endl; if (m_options.precision == ANSCENTER::Precision::FP16) { if (!builder->platformHasFastFp16()) { std::cout << "Error: GPU does not support FP16 precision" << std::endl; return false; } config->setFlag(nvinfer1::BuilderFlag::kFP16); std::cout << "FP16 precision enabled" << std::endl; // Mixed precision safety: force numerically sensitive layers to FP32. // Some models (e.g. PP-OCRv5 det) produce NaN when certain layers // run in FP16 due to overflow in intermediate accumulators. Forcing // these layers to FP32 has negligible performance impact while // preventing NaN corruption. 
// // Targeted layer types: // - kREDUCE : accumulation overflows FP16 max (65504) // - kELEMENTWISE/Pow: large intermediate values // - kNORMALIZATION : mean/variance reduction + 1/sqrt overflow // - kSOFTMAX : exp() extremely sensitive to precision // - kACTIVATION/Sigmoid: 1/(1+exp(-x)) overflows for large |x| // - kUNARY/Exp,Log : exp overflows for x>~11, log underflows // // IMPORTANT: setPrecision() is only a HINT without kOBEY_PRECISION_CONSTRAINTS. // We must set this flag so TRT strictly respects our per-layer FP32 overrides. // (kPREFER_PRECISION_CONSTRAINTS is deprecated/no-op in TRT 10.12+; // kOBEY means build FAILS if no FP32 kernel exists — better than silent NaN.) int fp32Overrides = 0; const int numLayers = network->getNbLayers(); // --- Diagnostic: enumerate all layer types in this network --- std::map layerTypeCounts; auto layerTypeName = [](nvinfer1::LayerType t) -> std::string { switch (t) { case nvinfer1::LayerType::kCONVOLUTION: return "Convolution"; case nvinfer1::LayerType::kCAST: return "Cast"; case nvinfer1::LayerType::kACTIVATION: return "Activation"; case nvinfer1::LayerType::kPOOLING: return "Pooling"; case nvinfer1::LayerType::kLRN: return "LRN"; case nvinfer1::LayerType::kSCALE: return "Scale"; case nvinfer1::LayerType::kSOFTMAX: return "Softmax"; case nvinfer1::LayerType::kDECONVOLUTION: return "Deconvolution"; case nvinfer1::LayerType::kCONCATENATION: return "Concatenation"; case nvinfer1::LayerType::kELEMENTWISE: return "ElementWise"; case nvinfer1::LayerType::kPLUGIN: return "Plugin"; case nvinfer1::LayerType::kUNARY: return "Unary"; case nvinfer1::LayerType::kPADDING: return "Padding"; case nvinfer1::LayerType::kSHUFFLE: return "Shuffle"; case nvinfer1::LayerType::kREDUCE: return "Reduce"; case nvinfer1::LayerType::kTOPK: return "TopK"; case nvinfer1::LayerType::kGATHER: return "Gather"; case nvinfer1::LayerType::kMATRIX_MULTIPLY: return "MatrixMultiply"; case nvinfer1::LayerType::kCONSTANT: return "Constant"; case 
nvinfer1::LayerType::kIDENTITY: return "Identity"; case nvinfer1::LayerType::kSLICE: return "Slice"; case nvinfer1::LayerType::kSHAPE: return "Shape"; case nvinfer1::LayerType::kRESIZE: return "Resize"; case nvinfer1::LayerType::kSELECT: return "Select"; case nvinfer1::LayerType::kFILL: return "Fill"; case nvinfer1::LayerType::kQUANTIZE: return "Quantize"; case nvinfer1::LayerType::kDEQUANTIZE: return "Dequantize"; case nvinfer1::LayerType::kSCATTER: return "Scatter"; case nvinfer1::LayerType::kEINSUM: return "Einsum"; case nvinfer1::LayerType::kGRID_SAMPLE: return "GridSample"; case nvinfer1::LayerType::kNMS: return "NMS"; case nvinfer1::LayerType::kNORMALIZATION: return "Normalization"; case nvinfer1::LayerType::kSQUEEZE: return "Squeeze"; case nvinfer1::LayerType::kUNSQUEEZE: return "Unsqueeze"; default: return "Unknown(" + std::to_string(static_cast(t)) + ")"; } }; for (int i = 0; i < numLayers; ++i) { auto* layer = network->getLayer(i); const auto ltype = layer->getType(); bool needsFP32 = false; switch (ltype) { case nvinfer1::LayerType::kREDUCE: needsFP32 = true; break; case nvinfer1::LayerType::kELEMENTWISE: { // Only force Pow to FP32; Add/Mul/etc. 
are fine in FP16 auto* ew = static_cast(layer); if (ew->getOperation() == nvinfer1::ElementWiseOperation::kPOW) { needsFP32 = true; } break; } case nvinfer1::LayerType::kNORMALIZATION: needsFP32 = true; break; case nvinfer1::LayerType::kSOFTMAX: needsFP32 = true; break; case nvinfer1::LayerType::kACTIVATION: { // Sigmoid is 1/(1+exp(-x)) — exp overflows FP16 for large |x| auto* act = static_cast(layer); if (act->getActivationType() == nvinfer1::ActivationType::kSIGMOID) { needsFP32 = true; } break; } case nvinfer1::LayerType::kUNARY: { // Exp overflows FP16 for x > ~11; Log underflows for tiny values auto* un = static_cast(layer); const auto op = un->getOperation(); if (op == nvinfer1::UnaryOperation::kEXP || op == nvinfer1::UnaryOperation::kLOG) { needsFP32 = true; } break; } default: break; } // Track layer type for diagnostic summary std::string name = layerTypeName(ltype); if (needsFP32) name += " [FP32]"; layerTypeCounts[name]++; if (needsFP32) { layer->setPrecision(nvinfer1::DataType::kFLOAT); for (int o = 0; o < layer->getNbOutputs(); ++o) { layer->setOutputType(o, nvinfer1::DataType::kFLOAT); } ++fp32Overrides; } } // Print layer type summary std::cout << " Network layer types (" << numLayers << " total):" << std::endl; for (const auto& kv : layerTypeCounts) { std::cout << " " << kv.first << ": " << kv.second << std::endl; } if (fp32Overrides > 0) { // Enforce per-layer precision constraints — without this flag, // setPrecision(kFLOAT) is merely a hint that TRT can ignore. 
config->setFlag(nvinfer1::BuilderFlag::kOBEY_PRECISION_CONSTRAINTS); std::cout << " Mixed precision: " << fp32Overrides << " / " << numLayers << " layers forced to FP32" << std::endl; std::cout << " kOBEY_PRECISION_CONSTRAINTS enabled to enforce FP32 on marked layers" << std::endl; } } else if (m_options.precision == ANSCENTER::Precision::INT8) { if (numInputs > 1) { std::cout << "Error: This implementation currently only supports INT8 for single input models" << std::endl; return false; } if (!builder->platformHasFastInt8()) { std::cout << "Error: GPU does not support INT8 precision" << std::endl; return false; } if (m_options.calibrationDataDirectoryPath.empty()) { std::cout << "Error: INT8 precision requires calibration data directory path" << std::endl; return false; } config->setFlag(nvinfer1::BuilderFlag::kINT8); std::cout << "INT8 precision enabled" << std::endl; const auto input = network->getInput(0); const auto inputName = input->getName(); const auto inputDims = input->getDimensions(); const auto calibrationFileName = currentEngineName + ".calibration"; m_calibrator = std::make_unique(m_options.calibrationBatchSize, inputDims.d[3], inputDims.d[2], m_options.calibrationDataDirectoryPath, calibrationFileName, inputName, subVals, divVals, normalize); config->setInt8Calibrator(m_calibrator.get()); } else { // FP32 mode - do NOT enable kFP16 flag; some models (e.g. PP-OCRv5 det) // produce NaN when TRT silently promotes layers to FP16. 
std::cout << "FP32 precision (strict, no FP16 fallback)" << std::endl; } // ============================================================================ // BUILD ENGINE // ============================================================================ std::cout << "\n========================================" << std::endl; std::cout << "Building Engine" << std::endl; std::cout << "========================================" << std::endl; cudaStream_t profileStream; Util::checkCudaErrorCode(cudaStreamCreate(&profileStream)); config->setProfileStream(profileStream); std::cout << "Building engine... This may take several minutes." << std::endl; std::cout << "Progress will be shown as layers are optimized..." << std::endl; if (doesSupportDynamicBatch && m_options.maxBatchSize > 1) { std::cout << "✓ Building with DYNAMIC batch support (1-" << m_options.maxBatchSize << ")" << std::endl; } else { std::cout << "• Building with FIXED batch size " << m_options.maxBatchSize << std::endl; } // Build the engine (crash-safe) auto startTime = std::chrono::high_resolution_clock::now(); unsigned long sehCodeBuild = 0; std::unique_ptr plan{ buildSerializedNetworkSafe(builder.get(), *network, *config, &sehCodeBuild) }; auto endTime = std::chrono::high_resolution_clock::now(); if (sehCodeBuild != 0) { std::cout << "\n========================================" << std::endl; std::cout << "Build CRASHED!" << std::endl; std::cout << "========================================" << std::endl; std::cout << "[Engine] FATAL: buildSerializedNetwork crashed (" << formatCrashCode(sehCodeBuild) << ")" << std::endl; std::cout << "[Engine] This typically indicates insufficient GPU memory or a driver crash." << std::endl; Util::checkCudaErrorCode(cudaStreamDestroy(profileStream)); return false; } if (!plan) { std::cout << "\n========================================" << std::endl; std::cout << "Build Failed!" 
<< std::endl; std::cout << "========================================" << std::endl; std::cout << "Error: Failed to build engine." << std::endl; Util::checkCudaErrorCode(cudaStreamDestroy(profileStream)); return false; } auto buildTime = std::chrono::duration_cast(endTime - startTime).count(); std::cout << "\n========================================" << std::endl; std::cout << "Build Successful!" << std::endl; std::cout << "========================================" << std::endl; std::cout << "Build time: " << buildTime << " seconds (" << buildTime / 60 << " minutes)" << std::endl; // Write the engine to disk. // Re-compute the filename because build() may have capped maxBatchSize // (e.g. b32 -> b8), so the saved file must match the actual config. const auto actualEngineName = serializeEngineOptions(m_options, onnxModelPath); const auto enginePath = std::filesystem::path(m_options.engineFileDir) / actualEngineName; std::ofstream outfile(enginePath, std::ofstream::binary); if (!outfile) { std::cout << "Error: Failed to open file for writing: " << enginePath << std::endl; Util::checkCudaErrorCode(cudaStreamDestroy(profileStream)); return false; } outfile.write(reinterpret_cast(plan->data()), plan->size()); outfile.close(); std::cout << "Engine saved to: " << enginePath.string() << std::endl; std::cout << "Engine size: " << plan->size() / (1024 * 1024) << " MiB" << std::endl; if (doesSupportDynamicBatch && m_options.maxBatchSize > 1) { std::cout << "✓ Engine supports DYNAMIC batch sizes: 1 to " << m_options.maxBatchSize << std::endl; } else { std::cout << "• Engine supports FIXED batch size: " << m_options.maxBatchSize << std::endl; } // Save timing cache auto timingCacheFromConfig = config->getTimingCache(); if (timingCacheFromConfig) { auto timingCacheData = timingCacheFromConfig->serialize(); if (timingCacheData) { std::ofstream timingCacheOut(timingCachePath, std::ios::binary); if (timingCacheOut) { timingCacheOut.write(static_cast(timingCacheData->data()), 
timingCacheData->size()); timingCacheOut.close(); std::cout << "Timing cache saved to: " << timingCachePath << std::endl; std::cout << " Cache size: " << timingCacheData->size() / 1024 << " KiB" << std::endl; } } } Util::checkCudaErrorCode(cudaStreamDestroy(profileStream)); std::cout << "\n========================================" << std::endl; std::cout << "Build Complete!" << std::endl; std::cout << "========================================" << std::endl; if (doesSupportDynamicBatch && m_options.maxBatchSize > 1) { std::cout << "\n✓ Engine supports batch inference (1-" << m_options.maxBatchSize << " images)" << std::endl; } return true; } // ============================================================================ // buildSafe() // // SEH wrapper around build(). Cannot use __try in a function with C++ // destructors, so the actual build() call is forwarded through a plain-C // function pointer via callBoolFuncSafe(). // ============================================================================ struct BuildSafeCtx_Base { void* enginePtr; // Engine* const char* onnxPath; size_t onnxPathLen; const float* subVals; const float* divVals; bool normalize; bool result; }; template static bool buildSafe_trampoline(void* ctx) { auto* c = static_cast(ctx); auto* engine = static_cast*>(c->enginePtr); std::string path(c->onnxPath, c->onnxPathLen); std::array sub = { c->subVals[0], c->subVals[1], c->subVals[2] }; std::array div = { c->divVals[0], c->divVals[1], c->divVals[2] }; return engine->build(path, sub, div, c->normalize); } template bool Engine::buildSafe(std::string onnxModelPath, const std::array& subVals, const std::array& divVals, bool normalize, unsigned long* outSehCode) { BuildSafeCtx_Base ctx; ctx.enginePtr = this; ctx.onnxPath = onnxModelPath.c_str(); ctx.onnxPathLen = onnxModelPath.size(); ctx.subVals = subVals.data(); ctx.divVals = divVals.data(); ctx.normalize = normalize; ctx.result = false; bool ok = callBoolFuncSafe(&buildSafe_trampoline, &ctx, 
outSehCode); return ok; } // ============================================================================ // buildWithRetry() // // Wraps build() with auto-retry for dynamic spatial dimension models. // Pre-analyzes the ONNX model to detect dynamic H/W dims, then builds a // fallback chain (max → 75% → 56% → ... → 640 → 320). Each candidate // calls build(), which checks for a cached engine first (fast) then tries // building if no cache exists. Fixed-spatial models skip retry. // ============================================================================ template bool Engine::buildWithRetry(std::string onnxModelPath, const std::array& subVals, const std::array& divVals, bool normalize) { // -- Quick pre-analysis: detect dynamic spatial dims in ONNX --------------- bool hasDynamicSpatial = false; int onnxFixedH = 0, onnxFixedW = 0; // 0 = dynamic (-1 in ONNX) if (m_options.maxInputHeight > 0 && m_options.maxInputWidth > 0) { auto tempBuilder = std::unique_ptr( nvinfer1::createInferBuilder(m_logger)); auto tempNetwork = std::unique_ptr(TRT_CREATE_NETWORK(tempBuilder)); auto tempParser = std::unique_ptr( nvonnxparser::createParser(*tempNetwork, m_logger)); std::ifstream onnxFile(onnxModelPath, std::ios::binary | std::ios::ate); if (onnxFile.good()) { std::streamsize onnxSize = onnxFile.tellg(); onnxFile.seekg(0, std::ios::beg); std::vector onnxBuffer(onnxSize); if (onnxFile.read(onnxBuffer.data(), onnxSize)) { unsigned long sehRetryParse = 0; bool retryParsed = parseOnnxModelSafe(tempParser.get(), onnxBuffer.data(), onnxBuffer.size(), &sehRetryParse); if (sehRetryParse != 0) { std::cout << "[Engine] WARNING: ONNX pre-analysis parse crashed in " << "buildWithRetry (" << formatCrashCode(sehRetryParse) << "). Skipping spatial analysis." 
<< std::endl; // hasDynamicSpatial stays false → single build() attempt } else if (retryParsed && tempNetwork->getNbInputs() > 0) { auto dims = tempNetwork->getInput(0)->getDimensions(); if (dims.nbDims >= 4) { if (dims.d[2] == -1 || dims.d[3] == -1) hasDynamicSpatial = true; onnxFixedH = (dims.d[2] != -1) ? dims.d[2] : 0; onnxFixedW = (dims.d[3] != -1) ? dims.d[3] : 0; } } } } } // -- Fixed-spatial or no dynamic dims: single build attempt ---------------- if (!hasDynamicSpatial) { unsigned long sehBuild = 0; bool ok = buildSafe(onnxModelPath, subVals, divVals, normalize, &sehBuild); if (sehBuild != 0) { std::cout << "[Engine] FATAL: build() crashed in buildWithRetry (" << formatCrashCode(sehBuild) << ")" << std::endl; return false; } return ok; } // -- Dynamic spatial dims: build with fallback chain ---------------------- const bool dynamicH = (onnxFixedH == 0); const bool dynamicW = (onnxFixedW == 0); const int origMaxH = m_options.maxInputHeight; const int origMaxW = m_options.maxInputWidth; const int origOptH = m_options.optInputHeight; const int origOptW = m_options.optInputWidth; const int origMinH = m_options.minInputHeight; const int origMinW = m_options.minInputWidth; int dynMaxH = dynamicH ? origMaxH : 0; int dynMaxW = dynamicW ? origMaxW : 0; int maxDynDim = std::max(dynMaxH, dynMaxW); // Build fallback chain: max → 75% → 56% → ... → 640 → 320 std::vector candidates; for (int s = maxDynDim; s >= 320; s = (s * 3) / 4) { s = (s / 32) * 32; if (candidates.empty() || candidates.back() != s) candidates.push_back(s); } if (candidates.back() > 640) candidates.push_back(640); if (candidates.back() > 320) candidates.push_back(320); // Helper: configure m_options for a given candidate auto setCandidateOptions = [&](int candidate) { float scale = static_cast(candidate) / maxDynDim; m_options.maxInputHeight = dynamicH ? std::max(32, (static_cast(origMaxH * scale) / 32) * 32) : onnxFixedH; m_options.maxInputWidth = dynamicW ? 
std::max(32, (static_cast(origMaxW * scale) / 32) * 32) : onnxFixedW; m_options.minInputHeight = dynamicH ? std::min(origMinH, m_options.maxInputHeight) : onnxFixedH; m_options.minInputWidth = dynamicW ? std::min(origMinW, m_options.maxInputWidth) : onnxFixedW; m_options.optInputHeight = dynamicH ? std::min(origOptH, m_options.maxInputHeight) : onnxFixedH; m_options.optInputWidth = dynamicW ? std::min(origOptW, m_options.maxInputWidth) : onnxFixedW; }; // Try each candidate (largest first). build() checks cache before // building, so previously cached smaller engines are found quickly. for (size_t attempt = 0; attempt < candidates.size(); ++attempt) { setCandidateOptions(candidates[attempt]); std::cout << "[Engine] buildWithRetry attempt " << (attempt + 1) << "/" << candidates.size() << " (max " << m_options.maxInputHeight << "x" << m_options.maxInputWidth << ")" << std::endl; { unsigned long sehAttempt = 0; bool attemptOk = buildSafe(onnxModelPath, subVals, divVals, normalize, &sehAttempt); if (sehAttempt != 0) { std::cout << "[Engine] Build crashed (" << formatCrashCode(sehAttempt) << ") at max " << m_options.maxInputHeight << "x" << m_options.maxInputWidth << std::endl; // CUDA context may be corrupted — no point retrying return false; } if (attemptOk) { if (attempt > 0) { std::cout << "[Engine] Built with reduced max " << m_options.maxInputHeight << "x" << m_options.maxInputWidth << " (requested " << origMaxH << "x" << origMaxW << " exceeded GPU capacity)" << std::endl; } return true; } } if (attempt + 1 < candidates.size()) { std::cout << "[Engine] Build failed at max " << m_options.maxInputHeight << "x" << m_options.maxInputWidth << ", trying smaller..." 
<< std::endl; } } // All candidates exhausted — restore original options for error reporting m_options.maxInputHeight = origMaxH; m_options.maxInputWidth = origMaxW; m_options.optInputHeight = origOptH; m_options.optInputWidth = origOptW; m_options.minInputHeight = origMinH; m_options.minInputWidth = origMinW; std::cout << "[Engine] buildWithRetry: all spatial dimension fallbacks " << "exhausted (tried " << candidates.size() << " candidates from " << candidates.front() << " down to " << candidates.back() << ")" << std::endl; return false; } // ============================================================================ // 6-param pool overloads // // These are non-virtual additions to Engine that let callers opt into // multi-GPU pool mode simply by supplying one extra argument: // // m_trtEngine->buildLoadNetwork(path, sub, div, norm); // single-GPU // m_trtEngine->buildLoadNetwork(path, sub, div, norm, -1); // pool // // When maxSlotsPerGpu == 1 the call delegates to the existing 4-param // single-GPU implementation -- zero behavioural difference. // Any other value routes through loadSlots() which fills all GPUs. // ============================================================================ template bool Engine::buildLoadNetwork( std::string onnxModelPath, const std::array& subVals, const std::array& divVals, bool normalize, int maxSlotsPerGpu, double memSafetyFactor) { // Force single-GPU when: maxSlotsPerGpu==0 (optimizer bypass), // per-instance forceNoPool, global bypass (OptimizeModelStr), // exported g_forceNoPool, OR single-GPU system with maxSlotsPerGpu==1. // // On a single-GPU system, the pool with 1 slot adds contention overhead // (2s timeout + reject) without any multi-GPU benefit. The CUDA stream // handles serialization naturally in single-GPU mode. 
{ extern std::atomic g_forceNoPool; int gpuCount = 0; cudaGetDeviceCount(&gpuCount); bool singleGpuNoElastic = (gpuCount <= 1 && maxSlotsPerGpu == 1); bool noPool = (maxSlotsPerGpu == 0) || m_forceNoPool || g_forceNoPool.load(std::memory_order_relaxed) || TRTEngineCache::globalBypass().load(std::memory_order_relaxed) || singleGpuNoElastic; if (noPool) { std::cout << "Info: buildLoadNetwork -- single-GPU forced (maxSlots=" << maxSlotsPerGpu << ", forceNoPool=" << m_forceNoPool << ", g_forceNoPool=" << g_forceNoPool.load() << ", gpuCount=" << gpuCount << ")" << std::endl; return buildLoadNetwork(onnxModelPath, subVals, divVals, normalize); } } // Multi-GPU pool path. m_options carries the base configuration that was // set either at construction (Engine(options)) or by initializePool(). std::cout << "Info: buildLoadNetwork -- activating multi-GPU pool" << " (maxSlotsPerGpu=" << maxSlotsPerGpu << ", memSafetyFactor=" << memSafetyFactor << ")" << std::endl; return loadSlots(m_options, onnxModelPath, subVals, divVals, normalize, /*fromOnnx=*/true, maxSlotsPerGpu, memSafetyFactor); } template bool Engine::loadNetwork( std::string trtModelPath, const std::array& subVals, const std::array& divVals, bool normalize, int maxSlotsPerGpu, double memSafetyFactor) { { extern std::atomic g_forceNoPool; int gpuCount = 0; cudaGetDeviceCount(&gpuCount); bool singleGpuNoElastic = (gpuCount <= 1 && maxSlotsPerGpu == 1); bool noPool = (maxSlotsPerGpu == 0) || m_forceNoPool || g_forceNoPool.load(std::memory_order_relaxed) || TRTEngineCache::globalBypass().load(std::memory_order_relaxed) || singleGpuNoElastic; if (noPool) { std::cout << "Info: loadNetwork -- single-GPU forced (maxSlots=" << maxSlotsPerGpu << ", g_forceNoPool=" << g_forceNoPool.load() << ", gpuCount=" << gpuCount << ")" << std::endl; return loadNetwork(trtModelPath, subVals, divVals, normalize); } } // Multi-GPU pool path. 
std::cout << "Info: loadNetwork -- activating multi-GPU pool" << " (maxSlotsPerGpu=" << maxSlotsPerGpu << ", memSafetyFactor=" << memSafetyFactor << ")" << std::endl; return loadSlots(m_options, trtModelPath, subVals, divVals, normalize, /*fromOnnx=*/false, maxSlotsPerGpu, memSafetyFactor); }