// ============================================================================
// EngineBuildLoadNetwork.inl
// Path: ANSCORE/engines/TensorRTAPI/include/engine/EngineBuildLoadNetwork.inl
// Template implementation of Engine<T>::buildLoadNetwork / loadNetwork.
// ============================================================================
#pragma once
#include <filesystem>
#include <map>
#include <sstream>
#include "Utility.h"
#include "TRTCompat.h"
// ============================================================================
// Crash-safe wrappers for TensorRT operations that can crash the process.
//
// On Windows: uses SEH (__try/__except) to catch access violations, OOM, etc.
// SEH cannot coexist with C++ objects that have destructors in the same
// function scope, so these thin wrappers accept only raw pointers.
//
// On Linux: uses POSIX signals + sigsetjmp/siglongjmp to catch SIGSEGV,
// SIGBUS, SIGABRT, SIGFPE. Thread-local jump buffers ensure thread safety.
// Signal handlers are saved/restored around each dangerous call so that
// the application's own handlers are not permanently replaced.
//
// outExceptionCode: 0 = OK.
// Windows: the SEH exception code (e.g. 0xC0000005 = access violation).
// Linux: the signal number (e.g. 11 = SIGSEGV).
// ============================================================================
#ifdef _WIN32
# ifndef WIN32_LEAN_AND_MEAN
# define WIN32_LEAN_AND_MEAN
# endif
# ifndef NOMINMAX
# define NOMINMAX
# endif
# include <windows.h>
#else
# include <signal.h>
# include <setjmp.h>
// Thread-local storage for the POSIX crash-recovery mechanism.
// Each thread gets its own jump buffer and signal number so that
// concurrent engine builds on different threads don't interfere.
static thread_local sigjmp_buf s_crashJmpBuf;
static thread_local volatile sig_atomic_t s_crashSignal = 0;

// Signal handler installed only around dangerous TensorRT calls.
// It records which signal was received and jumps back to the
// sigsetjmp() checkpoint. Only synchronous, thread-directed signals
// (SIGSEGV, SIGBUS, SIGFPE) are guaranteed to land on the faulting
// thread; SIGABRT is process-wide but typically raised from the same
// thread that called abort().
static void engineCrashSignalHandler(int sig) {
    s_crashSignal = sig;
    siglongjmp(s_crashJmpBuf, 1);
}

// Guard that installs the crash signal handlers, saving the previous
// ones so they can be restored afterwards.
//
// FIX: previously nothing restored the handlers if control left the
// scope between install() and restore() (early return, or a C++
// exception thrown by the guarded TensorRT call) — the application's
// own handlers stayed permanently replaced. The destructor now acts
// as an RAII safety net. The explicit install()/restore() call sites
// elsewhere in this file continue to work unchanged.
struct CrashSignalGuard {
    struct sigaction oldSigsegv, oldSigbus, oldSigabrt, oldSigfpe;
    bool installed = false; // true while our handlers are active

    ~CrashSignalGuard() {
        // Safety net: undo install() if restore() was never reached.
        if (installed) restore();
    }

    // Install engineCrashSignalHandler for SIGSEGV/SIGBUS/SIGABRT/SIGFPE,
    // saving the previously registered actions, and reset s_crashSignal.
    void install() {
        struct sigaction sa;
        sa.sa_handler = engineCrashSignalHandler;
        sigemptyset(&sa.sa_mask);
        sa.sa_flags = 0; // no SA_RESTART — let interrupted calls fail
        sigaction(SIGSEGV, &sa, &oldSigsegv);
        sigaction(SIGBUS, &sa, &oldSigbus);
        sigaction(SIGABRT, &sa, &oldSigabrt);
        sigaction(SIGFPE, &sa, &oldSigfpe);
        s_crashSignal = 0;
        installed = true;
    }

    // Put the previously saved signal actions back.
    void restore() {
        sigaction(SIGSEGV, &oldSigsegv, nullptr);
        sigaction(SIGBUS, &oldSigbus, nullptr);
        sigaction(SIGABRT, &oldSigabrt, nullptr);
        sigaction(SIGFPE, &oldSigfpe, nullptr);
        installed = false;
    }
};
#endif // _WIN32
/// Crash-safe ONNX parser->parse() wrapper.
///
/// Runs parser->parse(data, dataSize) under a crash trap so a malformed
/// model cannot take down the whole process:
///  - Windows: SEH __try/__except. This function deliberately contains no
///    C++ objects with destructors, which SEH frames cannot unwind (see
///    file header comment).
///  - POSIX: CrashSignalGuard + sigsetjmp/siglongjmp.
///
/// @param parser           ONNX parser to invoke (must be non-null).
/// @param data             serialized ONNX model bytes.
/// @param dataSize         size of @p data in bytes.
/// @param outExceptionCode receives the exception/signal code on crash
///                         (0 = OK; SEH code on Windows, signal number
///                         on POSIX).
/// @return parser->parse() result, or false if a crash was trapped.
static bool parseOnnxModelSafe(
    nvonnxparser::IParser* parser,
    const void* data,
    size_t dataSize,
    unsigned long* outExceptionCode)
{
#ifdef _WIN32
    *outExceptionCode = 0;
    __try {
        return parser->parse(data, dataSize);
    }
    __except (EXCEPTION_EXECUTE_HANDLER) {
        *outExceptionCode = GetExceptionCode();
        return false;
    }
#else
    *outExceptionCode = 0;
    CrashSignalGuard guard;
    guard.install();
    // 'result' is only written before the normal sigsetjmp return or
    // after the longjmp return, never across the jump, so it does not
    // need to be volatile.
    bool result = false;
    if (sigsetjmp(s_crashJmpBuf, 1) == 0) {
        // Normal execution path
        result = parser->parse(data, dataSize);
    } else {
        // Returned here from signal handler — a crash was caught
        *outExceptionCode = static_cast<unsigned long>(s_crashSignal);
        result = false;
    }
    guard.restore();
    // NOTE(review): if parse() throws a C++ exception, restore() is
    // bypassed — assumes TRT's parser does not throw; confirm.
    return result;
#endif
}
/// Crash-safe builder->buildSerializedNetwork() wrapper.
/// Returns raw IHostMemory* (caller wraps in unique_ptr), or nullptr when
/// the build failed or crashed.
///
/// @param builder          TensorRT builder (must be non-null).
/// @param network          network definition to build from.
/// @param config           builder configuration.
/// @param outExceptionCode receives the exception/signal code on crash
///                         (0 = OK; SEH code on Windows, signal number
///                         on POSIX).
static nvinfer1::IHostMemory* buildSerializedNetworkSafe(
    nvinfer1::IBuilder* builder,
    nvinfer1::INetworkDefinition& network,
    nvinfer1::IBuilderConfig& config,
    unsigned long* outExceptionCode)
{
#ifdef _WIN32
    *outExceptionCode = 0;
    // SEH frame: keep this scope free of C++ objects with destructors.
    __try {
        return builder->buildSerializedNetwork(network, config);
    }
    __except (EXCEPTION_EXECUTE_HANDLER) {
        *outExceptionCode = GetExceptionCode();
        return nullptr;
    }
#else
    *outExceptionCode = 0;
    CrashSignalGuard guard;
    guard.install();
    nvinfer1::IHostMemory* plan = nullptr;
    if (sigsetjmp(s_crashJmpBuf, 1) == 0) {
        // Normal path — may run for a long time on large models.
        plan = builder->buildSerializedNetwork(network, config);
    } else {
        // A trapped crash long-jumped back here; report the signal.
        *outExceptionCode = static_cast<unsigned long>(s_crashSignal);
        plan = nullptr;
    }
    guard.restore();
    return plan;
#endif
}
/// Crash-safe runtime->deserializeCudaEngine() wrapper.
/// Returns raw ICudaEngine* (caller wraps in unique_ptr), or nullptr when
/// deserialization failed or crashed.
///
/// @param runtime          TensorRT runtime (must be non-null).
/// @param data             serialized engine plan bytes.
/// @param dataSize         size of @p data in bytes.
/// @param outExceptionCode receives the exception/signal code on crash
///                         (0 = OK; SEH code on Windows, signal number
///                         on POSIX).
static nvinfer1::ICudaEngine* deserializeCudaEngineSafe(
    nvinfer1::IRuntime* runtime,
    const void* data,
    size_t dataSize,
    unsigned long* outExceptionCode)
{
#ifdef _WIN32
    *outExceptionCode = 0;
    // SEH frame: keep this scope free of C++ objects with destructors.
    __try {
        return runtime->deserializeCudaEngine(data, dataSize);
    }
    __except (EXCEPTION_EXECUTE_HANDLER) {
        *outExceptionCode = GetExceptionCode();
        return nullptr;
    }
#else
    *outExceptionCode = 0;
    CrashSignalGuard guard;
    guard.install();
    nvinfer1::ICudaEngine* engine = nullptr;
    if (sigsetjmp(s_crashJmpBuf, 1) == 0) {
        // Normal path.
        engine = runtime->deserializeCudaEngine(data, dataSize);
    } else {
        // A trapped crash long-jumped back here; report the signal.
        *outExceptionCode = static_cast<unsigned long>(s_crashSignal);
        engine = nullptr;
    }
    guard.restore();
    return engine;
#endif
}
/// Crash-safe wrapper for an arbitrary bool-returning function pointer.
/// Used to SEH-protect build() calls that may crash on bad ONNX models.
///
/// @param fn  function to invoke; receives @p ctx as its only argument.
/// @param ctx opaque caller-supplied context forwarded to @p fn.
/// @param outExceptionCode receives the exception/signal code on crash
///        (0 = OK; SEH code on Windows, signal number on POSIX).
/// @return fn(ctx), or false if a crash was trapped.
typedef bool (*BoolFuncPtr)(void* ctx);
static bool callBoolFuncSafe(BoolFuncPtr fn, void* ctx, unsigned long* outExceptionCode) {
#ifdef _WIN32
    *outExceptionCode = 0;
    // SEH frame: keep this scope free of C++ objects with destructors.
    __try {
        return fn(ctx);
    }
    __except (EXCEPTION_EXECUTE_HANDLER) {
        *outExceptionCode = GetExceptionCode();
        return false;
    }
#else
    *outExceptionCode = 0;
    CrashSignalGuard guard;
    guard.install();
    bool result = false;
    if (sigsetjmp(s_crashJmpBuf, 1) == 0) {
        // Normal path.
        result = fn(ctx);
    } else {
        // A trapped crash long-jumped back here; report the signal.
        *outExceptionCode = static_cast<unsigned long>(s_crashSignal);
        result = false;
    }
    guard.restore();
    return result;
#endif
}
/// Render a crash code as a human-readable string for log output.
/// Windows: "SEH exception 0xc0000005"  POSIX: "signal 11 (SIGSEGV)"
static std::string formatCrashCode(unsigned long code) {
    std::ostringstream out;
#ifdef _WIN32
    out << "SEH exception 0x" << std::hex << code << std::dec;
#else
    // Map the well-known trapped signals to their names; anything else
    // (should not happen — only four handlers are installed) is tagged
    // "(unknown)".
    const char* name = "(unknown)";
    if (code == SIGSEGV)      name = "(SIGSEGV)";
    else if (code == SIGBUS)  name = "(SIGBUS)";
    else if (code == SIGABRT) name = "(SIGABRT)";
    else if (code == SIGFPE)  name = "(SIGFPE)";
    out << "signal " << code << ' ' << name;
#endif
    return out.str();
}
/// Build (if necessary) and load a TensorRT engine for @p onnxModelPath.
///
/// High-level flow:
///  1. Cap maxBatchSize by GPU VRAM tier BEFORE computing the cached-engine
///     filename, so the cache lookup hits the file build() actually saves.
///  2. If the engine file exists, try to load it. On load failure, keep the
///     file when the failure looks VRAM-related; otherwise delete it and
///     rebuild from ONNX.
///  3. If the ONNX file is missing, fall back to an on-disk engine of the
///     other precision (FP16 <-> FP32) when one exists.
///  4. Otherwise pre-analyze the ONNX (batch dimension, dynamic spatial
///     dims), build via buildWithRetry() with an automatic FP16 -> FP32
///     retry, then load the freshly built engine.
///
/// @param onnxModelPath full path to the ONNX model file.
/// @param subVals/divVals/normalize preprocessing parameters forwarded
///        verbatim to loadNetwork().
/// @return true when an engine was successfully loaded.
template <typename T>
bool Engine<T>::buildLoadNetwork(std::string onnxModelPath, const std::array<float, 3>& subVals, const std::array<float, 3>& divVals,
    bool normalize)
{
    // -- GPU-tier batch cap (early) -------------------------------------------
    // Apply the same VRAM-based batch cap that build() uses BEFORE computing
    // the engine filename. Without this, the cache lookup uses the uncapped
    // batch size (e.g. b32), misses the file that was saved with the capped
    // size (e.g. b16), and triggers a needless full rebuild every launch.
    // The cap inside build() still runs later as a safety net (it will be a
    // no-op because the batch is already capped here).
    {
        cudaDeviceProp prop;
        // FIX: check the CUDA call; on failure 'prop' is uninitialized and the
        // tier selection below would be based on garbage. Skip capping instead.
        if (cudaGetDeviceProperties(&prop, m_options.deviceIndex) == cudaSuccess) {
            const size_t totalMiB = prop.totalGlobalMem / (1024ULL * 1024);
            int gpuMaxBatch;
            if (totalMiB >= 15800)      gpuMaxBatch = 32; // ~16 GiB+
            else if (totalMiB >= 11800) gpuMaxBatch = 16; // ~12 GiB
            else if (totalMiB >= 7900)  gpuMaxBatch = 8;  // ~ 8 GiB (batch=16 OCR ~987 MiB exec ctx, too large for 4 tasks)
            else if (totalMiB >= 3900)  gpuMaxBatch = 4;  // ~ 4 GiB
            else if (totalMiB >= 1900)  gpuMaxBatch = 2;  // ~ 2 GiB
            else                        gpuMaxBatch = 1;  // < 2 GiB
            if (m_options.maxBatchSize > gpuMaxBatch) {
                if (m_verbose) {
                    std::cout << "Info: GPU-tier early batch cap: "
                        << m_options.maxBatchSize << " -> " << gpuMaxBatch
                        << " (GPU has " << totalMiB << " MiB)" << std::endl;
                }
                m_options.maxBatchSize = gpuMaxBatch;
                m_options.optBatchSize = std::min(m_options.optBatchSize,
                    m_options.maxBatchSize);
            }
        }
    }
    // serializeEngineOptions() returns the full path of the cached engine.
    std::string engineName = serializeEngineOptions(m_options, onnxModelPath);
    std::string engineDir = m_options.engineFileDir;
    if (FileExist(engineName)) {
        if (m_verbose) { std::cout << "Engine file found: " << engineName << std::endl; }
        bool loadOk = loadNetwork(engineName, subVals, divVals, normalize);
        if (loadOk) {
            return true;
        }
        // Engine file exists but loadNetwork failed. Common causes:
        //  - createExecutionContext returned null (VRAM exhausted)
        //  - Incompatible TRT version or corrupt file
        //  - Partially written by another thread
        if (m_skipOnnxRebuild) {
            // Elastic growth / non-critical path — don't delete and rebuild.
            // Just fail gracefully; the pool continues with existing slots.
            return false;
        }
        // Check if the failure was due to VRAM exhaustion vs. corrupt file.
        // If VRAM was the reason, PRESERVE the engine file — it's valid, just
        // can't fit right now. Deleting it forces a full ONNX→TRT rebuild
        // (minutes) when VRAM becomes available later, instead of a fast load.
        //
        // Uses the m_lastLoadFailedVRAM flag set by loadNetwork() instead of
        // re-querying cudaMemGetInfo. The old approach had a TOCTOU race:
        // VRAM could be freed between loadNetwork's check and this re-check,
        // causing a valid engine file to be falsely classified as INVALID
        // and deleted. Also check current VRAM as a safety net.
        {
            size_t freeCheck = 0, totalCheck = 0;
            cudaMemGetInfo(&freeCheck, &totalCheck);
            constexpr size_t kMinFreeBytes = 256ULL * 1024 * 1024;
            if (m_lastLoadFailedVRAM || freeCheck < kMinFreeBytes) {
                return false; // Don't delete the file, don't try ONNX rebuild
            }
        }
        // Enough VRAM AND loadNetwork didn't flag VRAM as cause → file is
        // likely corrupt/incompatible. Delete and rebuild from ONNX.
        try { std::filesystem::remove(engineName); } catch (...) {}
        // Fall through to ONNX build path below
    }
    {
        if (!FileExist(engineName)) {
            // Demand-driven growth: if no cached engine exists, bail out rather
            // than triggering a full ONNX→TRT build (30-60s, massive VRAM).
            if (m_skipOnnxBuild) {
                return false;
            }
        }
        if (!FileExist(onnxModelPath)) {
            // ONNX model does not exist, try to find alternative precision engine
            std::cout << "Searching for alternative precision engine..." << std::endl;
            size_t lastDot = engineName.find_last_of('.');
            // FIX: guard against a filename with no extension. Previously
            // substr(0, npos + 1) == substr(0, 0) produced an empty stem and
            // silently mangled the alternative engine name.
            std::string stem = (lastDot == std::string::npos)
                ? engineName + "."
                : engineName.substr(0, lastDot + 1);
            std::string alternativeEngineName;
            ANSCENTER::Precision originalPrecision = m_options.precision;
            if (m_options.precision == ANSCENTER::Precision::FP16) {
                alternativeEngineName = stem + "fp32";
                m_options.precision = ANSCENTER::Precision::FP32;
                std::cout << " Looking for FP32 engine: " << alternativeEngineName << std::endl;
            }
            else {
                alternativeEngineName = stem + "fp16";
                m_options.precision = ANSCENTER::Precision::FP16;
                std::cout << " Looking for FP16 engine: " << alternativeEngineName << std::endl;
            }
            if (FileExist(alternativeEngineName)) {
                std::cout << "Found alternative precision engine: " << alternativeEngineName << std::endl;
                return loadNetwork(alternativeEngineName, subVals, divVals, normalize);
            }
            else {
                // Restore original precision
                m_options.precision = originalPrecision;
                std::cout << "Error: Neither ONNX model nor engine files exist for: " << onnxModelPath << std::endl;
                std::cout << " Searched for: " << engineName << std::endl;
                std::cout << " Searched for: " << alternativeEngineName << std::endl;
                return false;
            }
        }
        else {
            // Before building, check if an alternative precision engine already exists
            // (e.g., FP16 requested but a FP32 engine was built by a previous fallback)
            if (m_options.precision == ANSCENTER::Precision::FP16) {
                ANSCENTER::Options fp32Opts = m_options;
                fp32Opts.precision = ANSCENTER::Precision::FP32;
                std::string fp32EngineName = serializeEngineOptions(fp32Opts, onnxModelPath);
                if (FileExist(fp32EngineName)) {
                    std::cout << "FP16 engine not found, but FP32 engine exists: " << fp32EngineName << std::endl;
                    std::cout << "Loading existing FP32 engine..." << std::endl;
                    m_options.precision = ANSCENTER::Precision::FP32;
                    return loadNetwork(fp32EngineName, subVals, divVals, normalize);
                }
            }
            // ONNX model exists, generate engine
            std::cout << "========================================" << std::endl;
            std::cout << "Engine not found, generating from ONNX model" << std::endl;
            std::cout << "========================================" << std::endl;
            std::cout << "ONNX model: " << onnxModelPath << std::endl;
            std::cout << "Target engine: " << engineName << std::endl;
            if (!FolderExist(engineDir)) {
                std::cout << "Creating engine directory: " << engineDir << std::endl;
                std::filesystem::create_directories(engineDir);
            }
            // Read ONNX to determine if it supports dynamic batch.
            int32_t onnxBatchSize = -1;
            bool hasDynamicSpatialDims_onnx = false;
            std::cout << "\nAnalyzing ONNX model structure..." << std::endl;
            auto tempBuilder = std::unique_ptr<nvinfer1::IBuilder>(nvinfer1::createInferBuilder(m_logger));
            auto tempNetwork = std::unique_ptr<nvinfer1::INetworkDefinition>(TRT_CREATE_NETWORK(tempBuilder));
            auto tempParser = std::unique_ptr<nvonnxparser::IParser>(nvonnxparser::createParser(*tempNetwork, m_logger));
            std::ifstream onnxFile(onnxModelPath, std::ios::binary | std::ios::ate);
            // FIX: verify the stream opened and the size is sane before
            // allocating. tellg() returns -1 on a failed open, which was
            // previously passed straight to std::vector<char>(-1).
            if (!onnxFile.is_open()) {
                std::cout << "Error: Failed to read ONNX file" << std::endl;
                return false;
            }
            std::streamsize onnxSize = onnxFile.tellg();
            if (onnxSize <= 0) {
                std::cout << "Error: Failed to read ONNX file" << std::endl;
                return false;
            }
            onnxFile.seekg(0, std::ios::beg);
            std::vector<char> onnxBuffer(static_cast<size_t>(onnxSize));
            if (!onnxFile.read(onnxBuffer.data(), onnxSize)) {
                std::cout << "Error: Failed to read ONNX file" << std::endl;
                return false;
            }
            unsigned long sehPreAnalysis = 0;
            bool preParsed = parseOnnxModelSafe(tempParser.get(),
                onnxBuffer.data(), onnxBuffer.size(), &sehPreAnalysis);
            if (sehPreAnalysis != 0) {
                // The pre-analysis parse crashed; skip the analysis and let
                // the real build path surface any genuine model problems.
                // (FIX: this branch previously contained a stray pasted VCS
                // timestamp that did not compile.)
                std::cout << "Warning: ONNX pre-analysis crashed (code " << sehPreAnalysis
                    << "). Skipping pre-analysis, proceeding with build..." << std::endl;
            }
            else if (preParsed) {
                auto numInputs = tempNetwork->getNbInputs();
                std::cout << "ONNX Model Analysis:" << std::endl;
                std::cout << " Number of inputs: " << numInputs << std::endl;
                for (int32_t i = 0; i < numInputs; ++i) {
                    auto input = tempNetwork->getInput(i);
                    auto inputDims = input->getDimensions();
                    std::cout << " Input " << i << " (" << input->getName() << "): [";
                    for (int j = 0; j < inputDims.nbDims; ++j) {
                        if (j > 0) std::cout << ", ";
                        // -1 marks a dynamic dimension in TRT/ONNX dims
                        if (inputDims.d[j] == -1) {
                            std::cout << "dynamic";
                        }
                        else {
                            std::cout << inputDims.d[j];
                        }
                    }
                    std::cout << "]" << std::endl;
                }
                // Check first input's batch dimension
                auto firstInput = tempNetwork->getInput(0);
                auto firstInputDims = firstInput->getDimensions();
                onnxBatchSize = firstInputDims.d[0];
                // Detect dynamic spatial dimensions (drives the auto-retry
                // mechanism in buildWithRetry()).
                if (firstInputDims.nbDims >= 4) {
                    if (firstInputDims.d[2] == -1 || firstInputDims.d[3] == -1) {
                        hasDynamicSpatialDims_onnx = true;
                    }
                }
                std::cout << "\nBatch dimension analysis:" << std::endl;
                std::cout << " ONNX model batch dimension: ";
                if (onnxBatchSize == -1) {
                    std::cout << "dynamic (-1)" << std::endl;
                }
                else {
                    std::cout << onnxBatchSize << std::endl;
                }
                std::cout << " Current maxBatchSize setting: " << m_options.maxBatchSize << std::endl;
                std::cout << " Current optBatchSize setting: " << m_options.optBatchSize << std::endl;
                if (onnxBatchSize == -1) {
                    // Dynamic batch size model - keep user settings
                    std::cout << "\n✓ ONNX model supports DYNAMIC batch size" << std::endl;
                    std::cout << " Engine will support batch sizes 1 to " << m_options.maxBatchSize << std::endl;
                    std::cout << " Optimal batch size: " << m_options.optBatchSize << std::endl;
                    std::cout << " Keeping user-defined batch size configuration" << std::endl;
                }
                else if (onnxBatchSize > 0) {
                    // Fixed batch size model - engine options must match ONNX
                    std::cout << "\n⚠ WARNING: ONNX model has FIXED batch size of " << onnxBatchSize << std::endl;
                    std::cout << " Your model was exported with dynamic=False" << std::endl;
                    std::cout << " Engine will only support batch size " << onnxBatchSize << std::endl;
                    std::cout << " To use dynamic batching, re-export ONNX with dynamic=True" << std::endl;
                    std::cout << "\n Adjusting engine options to match ONNX model..." << std::endl;
                    m_options.optBatchSize = onnxBatchSize;
                    m_options.maxBatchSize = onnxBatchSize;
                    std::cout << " Updated optBatchSize: " << m_options.optBatchSize << std::endl;
                    std::cout << " Updated maxBatchSize: " << m_options.maxBatchSize << std::endl;
                    // Batch options changed → the cached filename changes too.
                    engineName = serializeEngineOptions(m_options, onnxModelPath);
                }
                else {
                    // Unexpected value
                    std::cout << "\n⚠ WARNING: Unexpected batch dimension value: " << onnxBatchSize << std::endl;
                    std::cout << " This may indicate an issue with the ONNX file" << std::endl;
                    std::cout << " Proceeding with user-defined settings" << std::endl;
                }
            }
            else {
                std::cout << "Warning: Failed to parse ONNX for pre-analysis. Proceeding with build..." << std::endl;
            }
            std::cout << "\n========================================" << std::endl;
            std::cout << "Starting Engine Build Process" << std::endl;
            std::cout << "========================================" << std::endl;
            std::cout << "This may take 10-20 minutes depending on model complexity..." << std::endl;
            std::cout << "Configuration:" << std::endl;
            std::cout << " Precision: " << (m_options.precision == ANSCENTER::Precision::FP16 ? "FP16" :
                m_options.precision == ANSCENTER::Precision::INT8 ? "INT8" : "FP32") << std::endl;
            std::cout << " Optimization Level: 5 (Maximum)" << std::endl;
            std::cout << " Batch Size Range: 1 to " << m_options.maxBatchSize << std::endl;
            std::cout << "========================================" << std::endl;
            // Build with auto-retry for dynamic spatial dimension models.
            // buildWithRetry() handles the ONNX pre-analysis internally and
            // reduces max spatial dims on OOM, falling back to smaller
            // profiles until build succeeds or all candidates are exhausted.
            // Fixed-spatial models get a single build() attempt.
            bool buildSuccess = buildWithRetry(onnxModelPath, subVals,
                divVals, normalize);
            // -- FP16 -> FP32 automatic fallback ---------------------------------
            // Some GPU architectures fail FP16 builds due to:
            //  - platformHasFastFp16() returning false (older GPUs)
            //  - kOBEY_PRECISION_CONSTRAINTS failing for mixed-precision layers
            //  - Insufficient VRAM for FP16 tactic optimization
            // When FP16 build fails, automatically retry with FP32 precision.
            if (!buildSuccess && m_options.precision == ANSCENTER::Precision::FP16) {
                std::cout << "\n========================================" << std::endl;
                std::cout << "FP16 Build Failed - Retrying with FP32" << std::endl;
                std::cout << "========================================" << std::endl;
                std::cout << "FP16 engine build failed on this GPU." << std::endl;
                std::cout << "Automatically falling back to FP32 precision..." << std::endl;
                std::cout << "========================================" << std::endl;
                m_options.precision = ANSCENTER::Precision::FP32;
                // Re-compute engine name for FP32 to avoid caching conflicts
                engineName = serializeEngineOptions(m_options, onnxModelPath);
                buildSuccess = buildWithRetry(onnxModelPath, subVals,
                    divVals, normalize);
                if (buildSuccess) {
                    std::cout << "\n========================================" << std::endl;
                    std::cout << "FP32 Fallback Build Successful!" << std::endl;
                    std::cout << "========================================" << std::endl;
                    std::cout << "Note: Engine is running in FP32 mode on this GPU." << std::endl;
                    std::cout << "Performance may be lower than FP16 but accuracy is preserved." << std::endl;
                    std::cout << "========================================" << std::endl;
                }
            }
            if (!buildSuccess) {
                std::cout << "\n========================================" << std::endl;
                std::cout << "Engine Build Failed!" << std::endl;
                std::cout << "========================================" << std::endl;
                std::cout << "Error: Failed to build engine from ONNX model" << std::endl;
                std::cout << "Possible causes:" << std::endl;
                std::cout << " 1. Insufficient GPU memory" << std::endl;
                std::cout << " 2. Unsupported ONNX operations" << std::endl;
                std::cout << " 3. Invalid batch size configuration" << std::endl;
                std::cout << " 4. Corrupted ONNX file" << std::endl;
                if (hasDynamicSpatialDims_onnx) {
                    std::cout << " 5. All spatial dimension fallbacks exhausted" << std::endl;
                }
                std::cout << " Note: Both FP16 and FP32 builds were attempted." << std::endl;
                std::cout << "\nTroubleshooting:" << std::endl;
                std::cout << " - Check GPU memory availability" << std::endl;
                std::cout << " - Try reducing maxBatchSize" << std::endl;
                std::cout << " - Verify ONNX model integrity" << std::endl;
                std::cout << " - Check TensorRT logs above for details" << std::endl;
                return false;
            }
            // build() may have capped maxBatchSize based on GPU VRAM, which
            // changes the serialized engine filename (e.g. b32 -> b8). Re-compute
            // so we load the file that build() actually saved.
            std::string actualEngineName = serializeEngineOptions(m_options, onnxModelPath);
            // After building, the engine should be saved, so load it
            std::cout << "\n========================================" << std::endl;
            std::cout << "Engine Build Complete - Loading Engine" << std::endl;
            std::cout << "========================================" << std::endl;
            if (FileExist(actualEngineName)) {
                std::cout << "Engine file created successfully: " << actualEngineName << std::endl;
                std::cout << "Loading engine into memory..." << std::endl;
                bool loadSuccess = loadNetwork(actualEngineName, subVals, divVals, normalize);
                if (loadSuccess) {
                    std::cout << "\n========================================" << std::endl;
                    std::cout << "✓ Engine Ready for Inference!" << std::endl;
                    std::cout << "========================================" << std::endl;
                    std::cout << "Configuration Summary:" << std::endl;
                    std::cout << " Engine File: " << actualEngineName << std::endl;
                    std::cout << " Batch Size Support: 1 to " << m_options.maxBatchSize << std::endl;
                    std::cout << " Precision: " << (m_options.precision == ANSCENTER::Precision::FP16 ? "FP16" :
                        m_options.precision == ANSCENTER::Precision::INT8 ? "INT8" : "FP32") << std::endl;
                    std::cout << "========================================" << std::endl;
                }
                return loadSuccess;
            }
            else {
                std::cout << "\n========================================" << std::endl;
                std::cout << "Engine Build Error!" << std::endl;
                std::cout << "========================================" << std::endl;
                std::cout << "Error: Engine file not found after build: " << actualEngineName << std::endl;
                std::cout << "Expected location: " << std::filesystem::absolute(actualEngineName) << std::endl;
                std::cout << "\nPossible causes:" << std::endl;
                std::cout << " 1. Build succeeded but save failed (disk full?)" << std::endl;
                std::cout << " 2. Incorrect engine directory permissions" << std::endl;
                std::cout << " 3. Engine filename mismatch" << std::endl;
                std::cout << "\nPlease check:" << std::endl;
                std::cout << " - Available disk space in: " << engineDir << std::endl;
                std::cout << " - Write permissions for engine directory" << std::endl;
                std::cout << " - TensorRT build logs above for warnings" << std::endl;
                return false;
            }
        }
    }
}
template <typename T>
bool Engine<T>::loadNetwork(std::string trtModelPath, const std::array<float, 3>& subVals, const std::array<float, 3>& divVals, bool normalize)
{
// Install a custom OpenCV CUDA allocator that uses cudaMallocAsync/cudaFreeAsync
// instead of the default cudaMalloc/cudaFree. The stream-ordered allocator
// respects the cudaMemPool release threshold (set to 0), so freed memory is
// returned to the GPU immediately instead of being cached forever.
//
// The default cudaMalloc/cudaFree allocator caches all freed blocks permanently
// (no API to force release), causing VRAM to grow monotonically when GpuMat
// objects of varying sizes are allocated and freed repeatedly (different batch
// sizes, different image resolutions across cameras).
{
static std::once_flag s_allocatorFlag;
std::call_once(s_allocatorFlag, []() {
// Set release threshold to 0 on all GPUs
int deviceCount = 0;
cudaGetDeviceCount(&deviceCount);
for (int d = 0; d < deviceCount; ++d) {
cudaMemPool_t pool = nullptr;
if (cudaDeviceGetDefaultMemPool(&pool, d) == cudaSuccess && pool) {
uint64_t threshold = 0;
cudaMemPoolSetAttribute(pool, cudaMemPoolAttrReleaseThreshold, &threshold);
}
}
// Custom allocator: uses cudaMallocAsync on stream 0 (behaves like
// synchronous cudaMalloc but goes through the stream-ordered pool).
struct AsyncAllocator : cv::cuda::GpuMat::Allocator {
bool allocate(cv::cuda::GpuMat* mat, int rows, int cols, size_t elemSize) override {
// Same logic as OpenCV's default allocator, but using cudaMallocAsync
size_t step = elemSize * cols;
// Align step to 256 bytes (same as default allocator)
step = (step + 255) & ~size_t(255);
void* ptr = nullptr;
cudaError_t err = cudaMallocAsync(&ptr, step * rows, nullptr); // stream 0
if (err != cudaSuccess || !ptr) {
// Fallback to regular cudaMalloc if async not supported
err = cudaMalloc(&ptr, step * rows);
if (err != cudaSuccess) return false;
}
mat->data = static_cast<uchar*>(ptr);
mat->step = step;
mat->refcount = static_cast<int*>(cv::fastMalloc(sizeof(int)));
*mat->refcount = 1;
return true;
}
void free(cv::cuda::GpuMat* mat) override {
cudaFreeAsync(mat->data, nullptr); // stream 0 — goes through pool with threshold=0
cv::fastFree(mat->refcount);
mat->data = nullptr;
mat->datastart = nullptr;
mat->dataend = nullptr;
mat->refcount = nullptr;
}
};
static AsyncAllocator s_allocator;
cv::cuda::GpuMat::setDefaultAllocator(&s_allocator);
ANS_DBG("TRT_Load", "Custom CUDA async allocator installed — VRAM freed immediately on GpuMat release");
});
}
m_lastLoadFailedVRAM = false; // reset on each load attempt
m_subVals = subVals;
m_divVals = divVals;
m_normalize = normalize;
// ============================================================================
// TRT ENGINE CACHE CHECK — skip file I/O + deserialization if already cached
// (Bypassed when m_skipEngineCache is true, e.g., during model optimization)
// ============================================================================
if (!m_skipEngineCache) {
auto cacheHit = TRTEngineCache::instance().tryGet(trtModelPath, m_options.deviceIndex);
if (cacheHit.engine) {
// Cache hit — reuse shared ICudaEngine (no deserialization, no file I/O)
m_context.reset();
m_engine.reset();
m_runtime.reset();
m_engine = cacheHit.engine;
m_runtime = cacheHit.runtime;
m_usingCachedEngine = true;
m_cachedEnginePath = trtModelPath;
m_cachedGpuIndex = m_options.deviceIndex;
// Still need to set GPU device for context + buffer allocation
cudaSetDevice(m_options.deviceIndex);
// Jump past file read + deserialization to context creation (below)
goto trt_cache_create_context;
}
}
// ============================================================================
// READ ENGINE FILE (cache miss path)
// ============================================================================
if (!Util::doesFileExist(trtModelPath)) {
return false;
}
if (m_verbose) { std::cout << "Loading TensorRT engine file at path: " << trtModelPath << std::endl; }
{
std::ifstream file(trtModelPath, std::ios::binary | std::ios::ate);
if (!file.is_open()) {
return false;
}
std::streamsize size = file.tellg();
if (size <= 0) {
return false;
}
file.seekg(0, std::ios::beg);
std::vector<char> buffer(size);
if (!file.read(buffer.data(), size)) {
return false;
}
if (m_verbose) { std::cout << "Engine file size: " << size / (1024 * 1024) << " MiB" << std::endl; }
// ============================================================================
// CREATE RUNTIME
// ============================================================================
// TRT requires: destroy context before engine, engine before runtime.
// If loadNetwork() is called more than once on the same instance, the
// previous objects must be torn down in the correct order before we
// create new ones.
m_context.reset();
m_engine.reset();
m_runtime.reset();
m_runtime = std::shared_ptr<nvinfer1::IRuntime>{ nvinfer1::createInferRuntime(m_logger) };
if (!m_runtime) {
return false;
}
// ============================================================================
// GPU SELECTION AND CONFIGURATION
// ============================================================================
int numGPUs = 0;
cudaGetDeviceCount(&numGPUs);
if (m_verbose) std::cout << "Info: Number of GPU devices: " << numGPUs << std::endl;
if (numGPUs == 0) {
std::cout << "Error: No CUDA-capable GPUs detected" << std::endl;
return false;
}
if (m_options.deviceIndex < 0 || m_options.deviceIndex >= numGPUs) {
std::cout << "Error: Invalid GPU index " << m_options.deviceIndex
<< ". Available GPUs: " << numGPUs << std::endl;
return false;
}
if (m_verbose) std::cout << "Info: Using GPU device index: " << m_options.deviceIndex << std::endl;
// Use yield mode to avoid busy-wait spinning that falsely reports 100% GPU utilization.
// Must be called before cudaSetDevice creates the CUDA context.
cudaSetDeviceFlags(cudaDeviceScheduleYield);
cudaError_t ret = cudaSetDevice(m_options.deviceIndex);
if (ret != cudaSuccess) {
std::cout << "Error: Unable to set GPU device index to " << m_options.deviceIndex << std::endl;
std::cout << "CUDA Error: " << cudaGetErrorString(ret) << std::endl;
return false;
}
// Get GPU properties
cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, m_options.deviceIndex);
// Set GPU device limits.
// Blackwell GPUs (GB200/B200 = SM 10.x, RTX 5090/5080 = SM 12.x) have
// deeper kernel-launch pipelines and benefit from a larger pending-launch
// queue. Using 8192 on Blackwell avoids throttling with heavily pipelined
// workloads; 2048 is sufficient for all earlier architectures.
{
const size_t pendingLaunches = (prop.major >= 10) ? 8192 : 2048;
cudaDeviceSetLimit(cudaLimitDevRuntimePendingLaunchCount, pendingLaunches);
if (m_verbose) std::cout << "Info: cudaLimitDevRuntimePendingLaunchCount = " << pendingLaunches
<< " (SM " << prop.major << "." << prop.minor << ")" << std::endl;
}
cudaDeviceSetLimit(cudaLimitStackSize, 8192);
cudaDeviceSetLimit(cudaLimitDevRuntimeSyncDepth, 2);
// Lock GPU clocks if requested (prevents power throttling on laptop GPUs)
if (m_options.gpuClockLockMHz != 0 && !m_clocksLocked) {
lockGpuClocks(m_options.deviceIndex, m_options.gpuClockLockMHz);
}
// -- VRAM safety check before engine deserialization -----------------------
// Reject early if the GPU doesn't have enough free VRAM to load the engine.
// This prevents slow degradation (unified memory fallback) or crashes
// (cudaMalloc failure during inference) when too many tasks are loaded.
{
size_t freeVRAM = 0, totalVRAM = 0;
cudaError_t memErr = cudaMemGetInfo(&freeVRAM, &totalVRAM);
constexpr size_t kMinFreeBytes = 256ULL * 1024 * 1024; // 256 MiB minimum
if (memErr != cudaSuccess) {
// cudaMemGetInfo failed — CUDA context may not be initialized on this thread.
// Don't reject: let TRT try to deserialize (it may succeed).
} else if (freeVRAM < kMinFreeBytes) {
m_lastLoadFailedVRAM = true; // signal to buildLoadNetwork: engine file is NOT corrupt
return false;
}
if (m_verbose) {
std::cout << "Info: GPU " << m_options.deviceIndex << " VRAM: "
<< (freeVRAM / (1024 * 1024)) << " MiB free / "
<< (totalVRAM / (1024 * 1024)) << " MiB total" << std::endl;
}
}
// ============================================================================
// DESERIALIZE ENGINE
// ============================================================================
if (m_verbose) std::cout << "Info: Deserializing TensorRT engine..." << std::endl;
unsigned long sehCodeDeserialize = 0;
m_engine = std::shared_ptr<nvinfer1::ICudaEngine>(
deserializeCudaEngineSafe(m_runtime.get(), buffer.data(),
buffer.size(), &sehCodeDeserialize));
if (sehCodeDeserialize != 0) {
return false;
}
if (!m_engine) {
return false;
}
if (m_verbose) std::cout << "Info: Engine deserialized successfully" << std::endl;
// ============================================================================
// CRITICAL: VERIFY ENGINE BATCH CAPABILITIES IMMEDIATELY
// ============================================================================
int numOptProfiles = m_engine->getNbOptimizationProfiles();
if (m_verbose) {
std::cout << "\n========================================" << std::endl;
std::cout << "ENGINE BATCH CAPABILITY VERIFICATION" << std::endl;
std::cout << "========================================" << std::endl;
std::cout << "Number of optimization profiles: " << numOptProfiles << std::endl;
}
bool engineSupportsDynamicBatch = false;
int actualMinBatch = 1;
int actualMaxBatch = 1;
if (numOptProfiles > 0) {
// Find the first input tensor to check batch support
for (int i = 0; i < m_engine->getNbIOTensors(); ++i) {
const char* tensorName = m_engine->getIOTensorName(i);
if (m_engine->getTensorIOMode(tensorName) == nvinfer1::TensorIOMode::kINPUT) {
auto minDims = m_engine->getProfileShape(tensorName, 0,
nvinfer1::OptProfileSelector::kMIN);
auto optDims = m_engine->getProfileShape(tensorName, 0,
nvinfer1::OptProfileSelector::kOPT);
auto maxDims = m_engine->getProfileShape(tensorName, 0,
nvinfer1::OptProfileSelector::kMAX);
actualMinBatch = minDims.d[0];
actualMaxBatch = maxDims.d[0];
// Store actual profile max spatial dims for runtime queries
if (maxDims.nbDims >= 4) {
m_profileMaxHeight = maxDims.d[2];
m_profileMaxWidth = maxDims.d[3];
}
if (actualMinBatch != actualMaxBatch) {
engineSupportsDynamicBatch = true;
}
if (m_verbose) {
std::cout << "\nInput tensor '" << tensorName << "' profile 0:" << std::endl;
std::cout << " Min: [" << minDims.d[0];
for (int d = 1; d < minDims.nbDims; ++d) std::cout << "," << minDims.d[d];
std::cout << "]" << std::endl;
std::cout << " Opt: [" << optDims.d[0];
for (int d = 1; d < optDims.nbDims; ++d) std::cout << "," << optDims.d[d];
std::cout << "]" << std::endl;
std::cout << " Max: [" << maxDims.d[0];
for (int d = 1; d < maxDims.nbDims; ++d) std::cout << "," << maxDims.d[d];
std::cout << "]" << std::endl;
if (actualMinBatch != actualMaxBatch)
std::cout << "\n✓ Engine supports DYNAMIC batching: "
<< actualMinBatch << " to " << actualMaxBatch << std::endl;
else
std::cout << "\n⚠ Engine has FIXED batch size: " << actualMinBatch << std::endl;
}
break; // Only need to check first input
}
}
}
else {
if (m_verbose) std::cout << "⚠️ No optimization profiles found" << std::endl;
// Check if batch dimension is dynamic via -1
auto firstTensorName = m_engine->getIOTensorName(0);
auto shape = m_engine->getTensorShape(firstTensorName);
if (shape.d[0] == -1) {
engineSupportsDynamicBatch = true;
actualMaxBatch = m_options.maxBatchSize;
if (m_verbose) std::cout << "Engine uses implicit dynamic batch (batch dim = -1)" << std::endl;
}
}
// CRITICAL CHECK: Verify engine can support requested batch sizes
if (!engineSupportsDynamicBatch && m_options.maxBatchSize > actualMaxBatch) {
std::cout << "\n🚨🚨🚨 CRITICAL ERROR 🚨🚨🚨" << std::endl;
std::cout << "Requested max batch size: " << m_options.maxBatchSize << std::endl;
std::cout << "Engine max batch size: " << actualMaxBatch << std::endl;
std::cout << "\nThis engine CANNOT support batch sizes larger than "
<< actualMaxBatch << "!" << std::endl;
std::cout << "\nYou have two options:" << std::endl;
std::cout << "1. Rebuild the engine with dynamic batch support:" << std::endl;
std::cout << " trtexec --onnx=model.onnx \\" << std::endl;
std::cout << " --minShapes=images:1x3x640x640 \\" << std::endl;
std::cout << " --optShapes=images:4x3x640x640 \\" << std::endl;
std::cout << " --maxShapes=images:32x3x640x640 \\" << std::endl;
std::cout << " --saveEngine=model_dynamic.engine --fp16" << std::endl;
std::cout << "\n2. Reduce maxBatchSize in your config to " << actualMaxBatch << std::endl;
std::cout << "========================================\n" << std::endl;
// Optionally fail here:
// return false;
// Or adjust maxBatchSize to match engine capability
if (m_verbose) std::cout << "⚠️ Auto-adjusting maxBatchSize from " << m_options.maxBatchSize
<< " to " << actualMaxBatch << std::endl;
m_options.maxBatchSize = actualMaxBatch;
}
if (m_verbose) std::cout << "========================================\n" << std::endl;
// Store in cache for future tasks loading the same model
if (!m_skipEngineCache) {
m_engine = TRTEngineCache::instance().putIfAbsent(
trtModelPath, m_options.deviceIndex, m_runtime, m_engine);
m_usingCachedEngine = true;
m_cachedEnginePath = trtModelPath;
m_cachedGpuIndex = m_options.deviceIndex;
}
} // end of cache-miss scope (closes the brace opened after cache check)
// ============================================================================
// CREATE EXECUTION CONTEXT (both cache-hit and cache-miss paths converge here)
// ============================================================================
trt_cache_create_context:
// These variables may not exist if we came from cache-hit path (goto skipped them).
// Re-derive from the (now valid) m_engine so both paths work.
{
int numOptProfiles = m_engine->getNbOptimizationProfiles();
bool engineSupportsDynamicBatch = false;
int actualMinBatch = 1;
int actualMaxBatch = 1;
if (numOptProfiles > 0) {
for (int i = 0; i < m_engine->getNbIOTensors(); ++i) {
const char* tn = m_engine->getIOTensorName(i);
if (m_engine->getTensorIOMode(tn) == nvinfer1::TensorIOMode::kINPUT) {
auto minDims = m_engine->getProfileShape(tn, 0, nvinfer1::OptProfileSelector::kMIN);
auto maxDims = m_engine->getProfileShape(tn, 0, nvinfer1::OptProfileSelector::kMAX);
actualMinBatch = minDims.d[0];
actualMaxBatch = maxDims.d[0];
engineSupportsDynamicBatch = (actualMinBatch != actualMaxBatch);
break;
}
}
}
if (actualMaxBatch > 0 && m_options.maxBatchSize > actualMaxBatch) {
m_options.maxBatchSize = actualMaxBatch;
}
m_context = std::unique_ptr<nvinfer1::IExecutionContext>(m_engine->createExecutionContext());
if (!m_context) {
    // 2026-04-04 20:19:54 +11:00  (stray blame-view timestamp from the raw dump; commented out so the file stays compilable)
ANS_DBG("TRT_Load", "ERROR: createExecutionContext returned null");
    // 2026-03-28 16:54:11 +11:00  (stray blame-view timestamp from the raw dump; commented out so the file stays compilable)
return false;
}
    // 2026-04-04 20:19:54 +11:00  (stray blame-view timestamp from the raw dump; commented out so the file stays compilable)
ANS_DBG("TRT_Load", "Execution context created OK for %s", trtModelPath.c_str());
    // 2026-03-28 16:54:11 +11:00  (stray blame-view timestamp from the raw dump; commented out so the file stays compilable)
if (m_verbose) std::cout << "Info: Execution context created successfully" << std::endl;
// ============================================================================
// BUFFER ALLOCATION
// ============================================================================
if (m_verbose) {
std::cout << "========================================" << std::endl;
std::cout << "Initializing Buffers" << std::endl;
std::cout << "========================================" << std::endl;
}
clearGpuBuffers();
m_buffers.resize(m_engine->getNbIOTensors());
m_outputLengths.clear();
m_inputDims.clear();
m_outputDims.clear();
m_IOTensorNames.clear();
m_hasDynamicSpatialDims = false;
// Check available GPU memory
size_t free_mem_initial, total_mem;
cudaMemGetInfo(&free_mem_initial, &total_mem);
if (m_verbose) {
std::cout << "GPU Memory before allocation: Free " << free_mem_initial / (1024 * 1024)
<< " MiB / Total " << total_mem / (1024 * 1024) << " MiB" << std::endl;
}
size_t totalAllocated = 0;
if (m_verbose) {
std::cout << "Engine batch configuration:" << std::endl;
std::cout << " Dynamic batch: " << (engineSupportsDynamicBatch ? "YES" : "NO") << std::endl;
std::cout << " Actual batch range: " << actualMinBatch << " to " << actualMaxBatch << std::endl;
std::cout << " Configured max batch size: " << m_options.maxBatchSize << std::endl;
std::cout << " Optimization profiles: " << numOptProfiles << std::endl;
}
// Allocate buffers for all I/O tensors
for (int i = 0; i < m_engine->getNbIOTensors(); ++i) {
const auto tensorName = m_engine->getIOTensorName(i);
m_IOTensorNames.emplace_back(tensorName);
const auto tensorType = m_engine->getTensorIOMode(tensorName);
const auto tensorShape = m_engine->getTensorShape(tensorName);
const auto tensorDataType = m_engine->getTensorDataType(tensorName);
if (tensorType == nvinfer1::TensorIOMode::kINPUT) {
if (m_verbose) std::cout << "\nInfo: Processing input tensor: " << tensorName << std::endl;
// Validate input type
if (tensorDataType != nvinfer1::DataType::kFLOAT) {
std::cout << "Error: Only float inputs are supported" << std::endl;
return false;
}
// Store input dimensions correctly (C, H, W - excluding batch)
m_inputDims.emplace_back(tensorShape.d[1], tensorShape.d[2], tensorShape.d[3]);
// Detect dynamic spatial dimensions (e.g., detection models with variable H/W)
if (tensorShape.d[2] == -1 || tensorShape.d[3] == -1) {
m_hasDynamicSpatialDims = true;
}
if (m_verbose) std::cout << " Input shape from engine: [" << tensorShape.d[0] << ", " << tensorShape.d[1]
<< ", " << tensorShape.d[2] << ", " << tensorShape.d[3] << "]" << std::endl;
// Calculate buffer size using actual max batch size from engine
// Dynamic dimensions (-1) are substituted with the configured max values
int32_t batchSize = (tensorShape.d[0] == -1) ? actualMaxBatch : tensorShape.d[0];
int32_t channels = tensorShape.d[1];
int32_t height = (tensorShape.d[2] == -1) ? m_options.maxInputHeight : tensorShape.d[2];
int32_t width = (tensorShape.d[3] == -1) ? m_options.maxInputWidth : tensorShape.d[3];
int64_t inputLength = static_cast<int64_t>(batchSize) * channels * height * width;
size_t requestedMemory = inputLength * sizeof(float);
if (m_verbose) std::cout << " Allocating for max batch size " << batchSize << ": "
<< requestedMemory / (1024 * 1024) << " MiB" << std::endl;
// Allocate GPU memory
cudaError_t err = cudaMalloc(&m_buffers[i], requestedMemory);
if (err != cudaSuccess) {
return false;
}
// Initialize to zero
cudaMemset(m_buffers[i], 0, requestedMemory);
totalAllocated += requestedMemory;
}
else if (tensorType == nvinfer1::TensorIOMode::kOUTPUT) {
if (m_verbose) std::cout << "\nInfo: Processing output tensor: " << tensorName << std::endl;
// Validate output type matches template parameter
if (tensorDataType == nvinfer1::DataType::kFLOAT && !std::is_same<float, T>::value) {
std::cout << "Error: Model output type is float, but template parameter is not float" << std::endl;
return false;
}
else if (tensorDataType == nvinfer1::DataType::kHALF && !std::is_same<__half, T>::value) {
std::cout << "Error: Model output type is half, but template parameter is not __half" << std::endl;
return false;
}
else if (tensorDataType == nvinfer1::DataType::kINT32 && !std::is_same<int32_t, T>::value) {
std::cout << "Error: Model output type is int32, but template parameter is not int32_t" << std::endl;
return false;
}
// Calculate output buffer size per batch element
int64_t outputLengthPerBatch = 1;
m_outputDims.push_back(tensorShape);
if (m_verbose) std::cout << " Output shape from engine: [" << tensorShape.d[0];
for (int j = 1; j < tensorShape.nbDims; ++j) {
if (m_verbose) std::cout << ", " << tensorShape.d[j];
int64_t dimSize = tensorShape.d[j];
if (dimSize <= 0) {
// Dynamic output dimension: use max input dims as upper bound
if (tensorShape.nbDims == 4) {
// NCHW: d[2]=H, d[3]=W
dimSize = (j == 2) ? m_options.maxInputHeight : m_options.maxInputWidth;
} else {
// Generic: use max input width as fallback for dynamic dims
dimSize = m_options.maxInputWidth;
}
if (dimSize <= 0) dimSize = 1; // Safety: avoid zero/negative
}
outputLengthPerBatch *= dimSize;
}
if (m_verbose) std::cout << "]" << std::endl;
// Store output length per batch element (excluding batch dimension)
m_outputLengths.push_back(outputLengthPerBatch);
// Allocate for actual max batch size from engine
size_t requestedMemory = outputLengthPerBatch * actualMaxBatch * sizeof(T);
if (m_verbose) std::cout << " Allocating for max batch size " << actualMaxBatch << ": "
<< requestedMemory / (1024 * 1024) << " MiB" << std::endl;
// Check if enough memory available
size_t free_mem, total_mem_check;
cudaMemGetInfo(&free_mem, &total_mem_check);
if (requestedMemory > free_mem) {
std::cout << "Error: Not enough GPU memory" << std::endl;
std::cout << " Requested: " << requestedMemory / (1024 * 1024) << " MiB" << std::endl;
std::cout << " Available: " << free_mem / (1024 * 1024) << " MiB" << std::endl;
return false;
}
// Allocate GPU memory
cudaError_t err = cudaMalloc(&m_buffers[i], requestedMemory);
if (err != cudaSuccess) {
return false;
}
// Initialize to zero
cudaMemset(m_buffers[i], 0, requestedMemory);
totalAllocated += requestedMemory;
}
else {
std::cout << "Error: Tensor is neither input nor output!" << std::endl;
return false;
}
}
    // 2026-04-04 20:19:54 +11:00  (stray blame-view timestamp from the raw dump; commented out so the file stays compilable)
{
size_t vramFree = 0, vramTotal = 0;
cudaMemGetInfo(&vramFree, &vramTotal);
ANS_DBG("TRT_Load", "Buffers allocated: %zuMB, VRAM: %zuMB used / %zuMB free / %zuMB total",
totalAllocated / (1024*1024),
(vramTotal - vramFree) / (1024*1024),
vramFree / (1024*1024),
vramTotal / (1024*1024));
}
    // 2026-03-28 16:54:11 +11:00  (stray blame-view timestamp from the raw dump; commented out so the file stays compilable)
if (m_verbose) std::cout << "\nInfo: Total GPU memory allocated: " << totalAllocated / (1024 * 1024) << " MiB" << std::endl;
// -- Pinned output buffers (CUDA graph prerequisite) -----------------------
// Invalidate any graphs captured by a previous loadNetwork() call on this instance.
for (auto& [bs, ge] : m_graphExecs) { if (ge) cudaGraphExecDestroy(ge); }
m_graphExecs.clear();
// Free any previously allocated pinned buffers.
for (T* p : m_pinnedOutputBuffers) { if (p) cudaFreeHost(p); }
m_pinnedOutputBuffers.clear();
m_pinnedOutputBufElems.clear();
// Allocate one flat pinned buffer per output tensor, sized for
// actualMaxBatch x outputLength elements. Stable host addresses enable
// CUDA graph capture of D2H copies. If any allocation fails, disable
// graph acceleration gracefully and fall back to the original code path.
//
// Previously disabled for OpenCV 4.13+ because cv::cuda::split on the null
// stream threw cudaErrorStreamCaptureUnsupported (-217). Now safe because
// blobFromGpuMats runs on m_inferenceStream and finishes BEFORE graph capture.
m_pinnedOutputBuffers.resize(m_outputLengths.size(), nullptr);
m_pinnedOutputBufElems.resize(m_outputLengths.size(), 0);
bool pinnedOk = true;
for (size_t i = 0; i < m_outputLengths.size(); ++i) {
const size_t nElems = static_cast<size_t>(m_outputLengths[i])
* static_cast<size_t>(actualMaxBatch);
if (cudaMallocHost(reinterpret_cast<void**>(&m_pinnedOutputBuffers[i]),
nElems * sizeof(T)) != cudaSuccess) {
pinnedOk = false;
break;
}
m_pinnedOutputBufElems[i] = nElems;
}
if (!pinnedOk) {
std::cout << "Warning: cudaMallocHost failed -- CUDA graph acceleration disabled."
<< std::endl;
for (T* p : m_pinnedOutputBuffers) { if (p) cudaFreeHost(p); }
m_pinnedOutputBuffers.clear();
m_pinnedOutputBufElems.clear();
} else {
if (m_verbose) std::cout << "Info: Pinned output buffers allocated -- CUDA graph acceleration enabled."
<< std::endl;
}
// Check final memory state
size_t free_mem_final, total_mem_final;
cudaMemGetInfo(&free_mem_final, &total_mem_final);
if (m_verbose) std::cout << "GPU Memory after allocation: Free " << free_mem_final / (1024 * 1024)
<< " MiB / Total " << total_mem_final / (1024 * 1024) << " MiB" << std::endl;
// Ensure all pending GPU operations (cudaMalloc, memcpy, etc.) complete
// before we begin inference on this engine.
cudaDeviceSynchronize();
// ============================================================================
// CONTEXT OPTIMIZATION
// ============================================================================
if (m_verbose) {
std::cout << "========================================" << std::endl;
std::cout << "Context Optimization" << std::endl;
std::cout << "========================================" << std::endl;
}
// Create temporary stream for context setup
cudaStream_t setupStream;
cudaStreamCreate(&setupStream);
// Check and set optimization profile
if (m_verbose) std::cout << "Info: Engine has " << numOptProfiles << " optimization profile(s)" << std::endl;
if (numOptProfiles > 0) {
int selectedProfile = 0;
if (m_verbose) std::cout << "Info: Using optimization profile " << selectedProfile
<< " (actual range: batch " << actualMinBatch << " to " << actualMaxBatch << ")" << std::endl;
// Set optimization profile FIRST
bool profileSet = m_context->setOptimizationProfileAsync(selectedProfile, setupStream);
if (!profileSet) {
std::cout << "Error: Failed to set optimization profile" << std::endl;
cudaStreamDestroy(setupStream);
return false;
}
// Wait for profile to be set
cudaStreamSynchronize(setupStream);
if (m_verbose) std::cout << "Info: Optimization profile " << selectedProfile << " set successfully" << std::endl;
}
// Set input shapes and bind buffers
for (int i = 0; i < m_engine->getNbIOTensors(); ++i) {
const auto tensorName = m_engine->getIOTensorName(i);
const auto tensorMode = m_engine->getTensorIOMode(tensorName);
// Set tensor address for both input and output
if (!m_context->setTensorAddress(tensorName, m_buffers[i])) {
std::cout << "Error: Failed to set tensor address for " << tensorName << std::endl;
cudaStreamDestroy(setupStream);
return false;
}
if (tensorMode == nvinfer1::TensorIOMode::kINPUT) {
auto dims = m_engine->getTensorShape(tensorName);
if (m_verbose) {
std::cout << "Info: Input tensor '" << tensorName << "' engine shape: [";
for (int j = 0; j < dims.nbDims; ++j) {
if (j > 0) std::cout << ", ";
std::cout << dims.d[j];
}
std::cout << "]" << std::endl;
}
// For dynamic batch engines, set shape to minimum for initialization
if (dims.d[0] == -1 || numOptProfiles > 0) {
nvinfer1::Dims inputDims = dims;
inputDims.d[0] = actualMinBatch; // Use actual min from engine
// Set height if dynamic
if (inputDims.d[2] == -1) {
inputDims.d[2] = m_options.optInputHeight;
}
// Set width if dynamic
if (inputDims.d[3] == -1) {
inputDims.d[3] = m_options.optInputWidth;
}
if (!m_context->setInputShape(tensorName, inputDims)) {
std::cout << "Error: Failed to set input shape for " << tensorName << std::endl;
cudaStreamDestroy(setupStream);
return false;
}
if (m_verbose) {
std::cout << "Info: Set initial input shape to [" << inputDims.d[0] << ", "
<< inputDims.d[1] << ", " << inputDims.d[2] << ", "
<< inputDims.d[3] << "] (for warmup)" << std::endl;
std::cout << " Actual batch size will be set at inference time" << std::endl;
}
}
}
}
// Verify all dimensions are specified
if (!m_context->allInputDimensionsSpecified()) {
std::cout << "Error: Not all input dimensions specified after setup" << std::endl;
// Debug: Show which dimensions are missing
for (int i = 0; i < m_engine->getNbIOTensors(); ++i) {
const auto tensorName = m_engine->getIOTensorName(i);
if (m_engine->getTensorIOMode(tensorName) == nvinfer1::TensorIOMode::kINPUT) {
auto dims = m_context->getTensorShape(tensorName);
std::cout << " " << tensorName << " shape: [";
for (int j = 0; j < dims.nbDims; ++j) {
if (j > 0) std::cout << ", ";
std::cout << dims.d[j];
}
std::cout << "]" << std::endl;
}
}
cudaStreamDestroy(setupStream);
return false;
}
if (m_verbose) {
std::cout << "Info: All input dimensions specified correctly" << std::endl;
std::cout << "Info: All tensor addresses bound successfully" << std::endl;
}
// Disable profiling for production
m_context->setEnqueueEmitsProfile(false);
if (m_verbose) std::cout << "Info: Enqueue profile emissions disabled (production mode)" << std::endl;
// Clean up setup stream
cudaStreamSynchronize(setupStream);
cudaStreamDestroy(setupStream);
// ============================================================================
// CREATE PERSISTENT INFERENCE AND MEMORY STREAMS
// ============================================================================
// Creating streams here (once, at load time) rather than lazily in
// runInference() removes the hot-path "if (!m_streamInitialized)" branch
// and ensures warmUp() already runs on the real inference stream.
if (!m_streamInitialized) {
int leastPriority, greatestPriority;
cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority);
cudaError_t streamErr = cudaStreamCreateWithPriority(
&m_inferenceStream, cudaStreamNonBlocking, greatestPriority);
if (streamErr != cudaSuccess) {
std::cout << "Error: Failed to create inference stream: "
<< cudaGetErrorString(streamErr) << std::endl;
return false;
}
streamErr = cudaStreamCreate(&m_memoryStream);
if (streamErr != cudaSuccess) {
std::cout << "Error: Failed to create memory stream: "
<< cudaGetErrorString(streamErr) << std::endl;
return false;
}
m_streamInitialized = true;
if (m_verbose) {
std::cout << "Info: Inference stream created at load time with highest priority" << std::endl;
std::cout << "Info: Memory stream created" << std::endl;
}
}
// ============================================================================
// PRE-WARMUP DIAGNOSTICS
// ============================================================================
if (m_verbose) {
std::cout << "\n========================================" << std::endl;
std::cout << "Pre-Warmup Diagnostics" << std::endl;
std::cout << "========================================" << std::endl;
std::cout << "Engine has " << m_engine->getNbIOTensors() << " I/O tensors" << std::endl;
std::cout << "Engine has " << m_engine->getNbOptimizationProfiles() << " optimization profiles" << std::endl;
for (int i = 0; i < m_engine->getNbIOTensors(); ++i) {
const auto tensorName = m_engine->getIOTensorName(i);
const auto tensorMode = m_engine->getTensorIOMode(tensorName);
const auto tensorShape = m_context->getTensorShape(tensorName);
std::cout << "\nTensor " << i << ": " << tensorName << std::endl;
std::cout << " Mode: " << (tensorMode == nvinfer1::TensorIOMode::kINPUT ? "INPUT" : "OUTPUT") << std::endl;
std::cout << " Shape: [";
for (int j = 0; j < tensorShape.nbDims; ++j) {
if (j > 0) std::cout << ", ";
std::cout << tensorShape.d[j];
}
std::cout << "]" << std::endl;
std::cout << " Buffer address: " << m_buffers[i] << std::endl;
}
std::cout << "\nContext state check:" << std::endl;
std::cout << " All dimensions specified: " << (m_context->allInputDimensionsSpecified() ? "YES" : "NO") << std::endl;
std::cout << "========================================" << std::endl;
}
if (!m_context->allInputDimensionsSpecified()) {
std::cout << "ERROR: Cannot proceed with warmup - dimensions not specified!" << std::endl;
return false;
}
// ============================================================================
// ENGINE LOADED SUCCESSFULLY
// ============================================================================
if (m_verbose) {
std::cout << "\n========================================" << std::endl;
std::cout << "Engine loaded successfully!" << std::endl;
std::cout << "========================================" << std::endl;
}
// ============================================================================
// WARMUP
// ============================================================================
if (m_verbose) std::cout << "\nInfo: Starting warm-up inference..." << std::endl;
warmUp(m_verbose ? 10 : 1);
if (m_verbose) std::cout << "Info: Warm-up complete" << std::endl;
} // end of trt_cache_create_context scope
return true;
}
template <typename T>
bool Engine<T>::build(std::string onnxModelPath, const std::array<float, 3>& subVals, const std::array<float, 3>& divVals, bool normalize) {
const auto engineName = serializeEngineOptions(m_options, onnxModelPath);
if (FileExist(engineName)) {
std::cout << "Engine file already exists: " << engineName << std::endl;
return true;
}
if (!FileExist(onnxModelPath)) {
std::cout << "Error: ONNX model file does not exist: " << onnxModelPath << std::endl;
return false;
}
std::cout << "========================================" << std::endl;
std::cout << "Building TensorRT Engine" << std::endl;
std::cout << "========================================" << std::endl;
std::cout << "TensorRT Version: " << NV_TENSORRT_MAJOR << "."
<< NV_TENSORRT_MINOR << "." << NV_TENSORRT_PATCH << std::endl;
// TensorRT 10+ detection
#if NV_TENSORRT_MAJOR >= 10
std::cout << "\n⚠ TensorRT " << NV_TENSORRT_MAJOR << "." << NV_TENSORRT_MINOR
<< " detected - will apply dynamic batch optimization flags" << std::endl;
#endif
// Create our engine builder.
auto builder = std::unique_ptr<nvinfer1::IBuilder>(nvinfer1::createInferBuilder(m_logger));
if (!builder) {
std::cout << "Error: Failed to create builder" << std::endl;
return false;
}
auto network = std::unique_ptr<nvinfer1::INetworkDefinition>(TRT_CREATE_NETWORK(builder));
if (!network) {
std::cout << "Error: Failed to create network" << std::endl;
return false;
}
// Create a parser for reading the onnx file.
auto parser = std::unique_ptr<nvonnxparser::IParser>(nvonnxparser::createParser(*network, m_logger));
if (!parser) {
std::cout << "Error: Failed to create parser" << std::endl;
return false;
}
// Read the onnx file into memory
std::ifstream file(onnxModelPath, std::ios::binary | std::ios::ate);
std::streamsize size = file.tellg();
file.seekg(0, std::ios::beg);
std::vector<char> buffer(size);
if (!file.read(buffer.data(), size)) {
std::cout << "Error: Unable to read ONNX file" << std::endl;
return false;
}
std::cout << "ONNX model size: " << size / (1024 * 1024) << " MiB" << std::endl;
// Parse the buffer we read into memory (crash-safe).
std::cout << "Parsing ONNX model..." << std::endl;
unsigned long sehCodeParse = 0;
auto parsed = parseOnnxModelSafe(parser.get(), buffer.data(),
buffer.size(), &sehCodeParse);
if (sehCodeParse != 0) {
return false;
}
if (!parsed) {
std::cout << "Error: Failed to parse ONNX model" << std::endl;
for (int32_t i = 0; i < parser->getNbErrors(); ++i) {
std::cout << " " << parser->getError(i)->desc() << std::endl;
}
return false;
}
std::cout << "ONNX model parsed successfully" << std::endl;
// ============================================================================
// ENHANCED ONNX MODEL ANALYSIS
// ============================================================================
std::cout << "\n========================================" << std::endl;
std::cout << "ONNX Model Analysis" << std::endl;
std::cout << "========================================" << std::endl;
const auto numInputs = network->getNbInputs();
if (numInputs < 1) {
std::cout << "Error: Model needs at least 1 input!" << std::endl;
return false;
}
std::cout << "Number of inputs: " << numInputs << std::endl;
// Analyze all inputs
for (int32_t i = 0; i < numInputs; ++i) {
const auto input = network->getInput(i);
const auto inputDims = input->getDimensions();
std::cout << "\nInput [" << i << "] '" << input->getName() << "':" << std::endl;
std::cout << " Dimensions: [";
for (int j = 0; j < inputDims.nbDims; ++j) {
if (j > 0) std::cout << ", ";
if (inputDims.d[j] == -1) {
std::cout << "DYNAMIC";
}
else {
std::cout << inputDims.d[j];
}
}
std::cout << "]" << std::endl;
// Check batch dimension
if (inputDims.d[0] == -1) {
std::cout << " ✓ Batch dimension: DYNAMIC" << std::endl;
}
else {
std::cout << " ✗ Batch dimension: FIXED at " << inputDims.d[0] << std::endl;
}
// Check height dimension (if applicable)
if (inputDims.nbDims >= 3 && inputDims.d[2] == -1) {
std::cout << " ✓ Height dimension: DYNAMIC" << std::endl;
}
else if (inputDims.nbDims >= 3) {
std::cout << " • Height dimension: FIXED at " << inputDims.d[2] << std::endl;
}
// Check width dimension (if applicable)
if (inputDims.nbDims >= 4 && inputDims.d[3] == -1) {
std::cout << " ✓ Width dimension: DYNAMIC" << std::endl;
}
else if (inputDims.nbDims >= 4) {
std::cout << " • Width dimension: FIXED at " << inputDims.d[3] << std::endl;
}
}
// Ensure that all the inputs have the same batch size
const auto input0Batch = network->getInput(0)->getDimensions().d[0];
for (int32_t i = 1; i < numInputs; ++i) {
if (network->getInput(i)->getDimensions().d[0] != input0Batch) {
std::cout << "\nError: Model has multiple inputs with differing batch sizes!" << std::endl;
return false;
}
}
// Check to see if the model supports dynamic batch size or not
bool doesSupportDynamicBatch = false;
if (input0Batch == -1) {
doesSupportDynamicBatch = true;
std::cout << "\n✓ Model supports DYNAMIC batch size" << std::endl;
std::cout << " Batch size range: min=1, opt=" << m_options.optBatchSize
<< ", max=" << m_options.maxBatchSize << std::endl;
}
else {
std::cout << "\n✗ Model only supports FIXED batch size of " << input0Batch << std::endl;
std::cout << " WARNING: This will limit batch processing performance!" << std::endl;
std::cout << " Consider re-exporting ONNX with dynamic batch axis." << std::endl;
// Adjust batch size options to match model's fixed batch size
if (m_options.optBatchSize != input0Batch || m_options.maxBatchSize != input0Batch) {
std::cout << " Adjusting batch size options to match model's fixed batch size" << std::endl;
m_options.optBatchSize = input0Batch;
m_options.maxBatchSize = input0Batch;
}
}
// Check for dynamic width and height dimensions
const auto inputHeight = network->getInput(0)->getDimensions().d[2];
const auto inputWidth = network->getInput(0)->getDimensions().d[3];
bool doesSupportDynamicHeight = false;
bool doesSupportDynamicWidth = false;
// Check height dimension
if (inputHeight == -1) {
doesSupportDynamicHeight = true;
std::cout << "\n✓ Model supports DYNAMIC height" << std::endl;
if (m_options.optInputHeight == -1) {
std::cout << " No user-configured height found, using default: 640" << std::endl;
m_options.minInputHeight = 640;
m_options.optInputHeight = 640;
m_options.maxInputHeight = 640;
}
else {
std::cout << " Using user-configured height: " << m_options.optInputHeight << std::endl;
}
}
else {
std::cout << "\n• Model has FIXED height: " << inputHeight << std::endl;
m_options.minInputHeight = m_options.optInputHeight = m_options.maxInputHeight = inputHeight;
}
// Check width dimension
if (inputWidth == -1) {
doesSupportDynamicWidth = true;
std::cout << "✓ Model supports DYNAMIC width" << std::endl;
if (m_options.optInputWidth == -1) {
std::cout << " No user-configured width found, using default: 640" << std::endl;
m_options.minInputWidth = 640;
m_options.optInputWidth = 640;
m_options.maxInputWidth = 640;
}
else {
std::cout << " Using user-configured width: " << m_options.optInputWidth << std::endl;
}
}
else {
std::cout << "• Model has FIXED width: " << inputWidth << std::endl;
m_options.minInputWidth = m_options.optInputWidth = m_options.maxInputWidth = inputWidth;
}
std::cout << "\nFinal input dimensions configured:" << std::endl;
std::cout << " Height: " << m_options.optInputHeight << std::endl;
std::cout << " Width: " << m_options.optInputWidth << std::endl;
std::cout << "========================================" << std::endl;
auto config = std::unique_ptr<nvinfer1::IBuilderConfig>(builder->createBuilderConfig());
if (!config) {
std::cout << "Error: Failed to create builder config" << std::endl;
return false;
}
// ============================================================================
// PERFORMANCE OPTIMIZATIONS
// ============================================================================
std::cout << "\n========================================" << std::endl;
std::cout << "Configuring Performance Settings" << std::endl;
std::cout << "========================================" << std::endl;
// Get GPU properties for the target device (not always GPU 0)
cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, m_options.deviceIndex);
std::cout << "Building engine for GPU " << m_options.deviceIndex << ": " << prop.name << std::endl;
std::cout << "Compute Capability: " << prop.major << "." << prop.minor << std::endl;
std::cout << "Total GPU Memory: " << prop.totalGlobalMem / (1024 * 1024) << " MiB" << std::endl;
size_t free_mem, total_mem;
cudaMemGetInfo(&free_mem, &total_mem);
const size_t totalMiB = total_mem / (1024ULL * 1024);
// -- GPU-tier adaptive configuration --------------------------------------
// All performance parameters scale with GPU VRAM to avoid OOM on small
// GPUs while maximising throughput on larger ones.
//
// VRAM | Workspace | Opt Level | Max Batch | Tactic DRAM
// ------------|-----------|-----------|-----------|-------------------
// <= 1 GiB | 256 MiB | 3 | 1 | up to 2 GiB cap
// <= 2 GiB | 512 MiB | 3 | 2 | up to 2 GiB cap
// <= 4 GiB | 1 GiB | 3 | 4 | up to 2 GiB cap
// <= 6 GiB | 2 GiB | 3 | 8 | up to 2 GiB cap
// <= 8 GiB | 2 GiB | 3 | 16 | up to 2 GiB cap
// <=12 GiB | 2 GiB | 3 | 16 | up to 2 GiB cap
// <=16 GiB | 8 GiB | 5 | 32 | up to 4 GiB cap
// <=24 GiB | 8 GiB | 5 | 32 | up to 4 GiB cap
// > 24 GiB | 16 GiB | 5 | 32 | up to 8 GiB cap
// -- 1. Workspace size ----------------------------------------------------
size_t max_workspace;
const char* tierLabel;
if (totalMiB > 24576) { // > 24 GiB
max_workspace = 16ULL * 1024 * 1024 * 1024;
tierLabel = "high-end (>24 GiB)";
} else if (totalMiB > 12288) { // > 12 GiB
max_workspace = 8ULL * 1024 * 1024 * 1024;
tierLabel = "desktop (>12 GiB)";
} else if (totalMiB > 4096) { // > 4 GiB
max_workspace = 2ULL * 1024 * 1024 * 1024;
tierLabel = "laptop (4-12 GiB)";
} else if (totalMiB > 2048) { // > 2 GiB
max_workspace = 1ULL * 1024 * 1024 * 1024;
tierLabel = "low-end (2-4 GiB)";
} else if (totalMiB > 1024) { // > 1 GiB
max_workspace = 512ULL * 1024 * 1024;
tierLabel = "minimal (1-2 GiB)";
} else { // <= 1 GiB
max_workspace = 256ULL * 1024 * 1024;
tierLabel = "ultra-low (<=1 GiB)";
}
size_t workspace_size = std::min(max_workspace, static_cast<size_t>(free_mem * 0.4));
config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, workspace_size);
std::cout << "Workspace size set to: " << workspace_size / (1024 * 1024)
<< " MiB (" << tierLabel << " tier)" << std::endl;
// -- 2. Max batch size cap ------------------------------------------------
// The model config sets the *desired* maxBatchSize; the GPU VRAM
// determines the *actual* cap. This affects the optimisation profile
// range, warmup, and runtime chunk splitting.
// Thresholds use ~97% of marketing size to account for OS/driver reserved
// memory (e.g. an "8 GB" GPU reports 8187 MiB).
if (doesSupportDynamicBatch && m_options.maxBatchSize > 1) {
int gpuMaxBatch;
if (totalMiB >= 15800) gpuMaxBatch = 32; // ~16 GiB (e.g. 16384 -> reports ~15900+)
else if (totalMiB >= 11800) gpuMaxBatch = 16; // ~12 GiB (e.g. 12288 -> reports ~11800+)
else if (totalMiB >= 7900) gpuMaxBatch = 8; // ~ 8 GiB (e.g. 8192 -> reports ~8100+; batch=16 OCR ~987 MiB too large for 4 tasks)
else if (totalMiB >= 3900) gpuMaxBatch = 4; // ~ 4 GiB (e.g. 4096 -> reports ~3950+)
else if (totalMiB >= 1900) gpuMaxBatch = 2; // ~ 2 GiB (e.g. 2048 -> reports ~1950+)
else gpuMaxBatch = 1; // < 2 GiB
const int prevMax = m_options.maxBatchSize;
m_options.maxBatchSize = std::min(m_options.maxBatchSize, gpuMaxBatch);
m_options.optBatchSize = std::min(m_options.optBatchSize, m_options.maxBatchSize);
if (prevMax != m_options.maxBatchSize) {
std::cout << "Max batch size capped by GPU VRAM: " << prevMax
<< " -> " << m_options.maxBatchSize
<< " (GPU has " << totalMiB << " MiB)" << std::endl;
}
std::cout << "Batch config: opt=" << m_options.optBatchSize
<< ", max=" << m_options.maxBatchSize << std::endl;
}
// -- 3. Optimisation level ------------------------------------------------
// Level 5 (exhaustive kernel search) only on GPUs with ≥16 GiB where
// the tactic DRAM pool can hold the largest tactics. On smaller GPUs,
// level 3 gives ~95 % of the runtime performance with dramatically
// shorter build times.
// Level 3 = balanced (best tradeoff: fast build, near-optimal kernels)
// Level 5 = exhaustive (10x slower build for ~1-3% faster inference)
// Use level 3 for all GPUs — the marginal runtime gain from level 5
// is not worth the 10-30 minute build time on first run.
const int optLevel = 3;
config->setBuilderOptimizationLevel(optLevel);
std::cout << "Builder optimization level set to " << optLevel
<< " (balanced)" << std::endl;
// Enable TF32 for Ampere and newer GPUs
if (prop.major >= 8) {
config->setFlag(nvinfer1::BuilderFlag::kTF32);
std::cout << "TF32 enabled for Ampere/Ada/Blackwell architecture" << std::endl;
}
// Enable optimization flags
// kPREFER_PRECISION_CONSTRAINTS removed: deprecated in TRT 10.12, no-op in TRT 10.15.1.
config->setFlag(nvinfer1::BuilderFlag::kGPU_FALLBACK);
std::cout << "Optimization flags enabled" << std::endl;
// kDIRECT_IO removed: deprecated in TRT 10.7 as "Unneeded API".
// TRT 10.7+ enables this behaviour automatically; the flag is a no-op in TRT 10.15.1.
// Enable all available tactic sources
uint32_t tacticSources = 1U << static_cast<uint32_t>(nvinfer1::TacticSource::kCUBLAS) |
1U << static_cast<uint32_t>(nvinfer1::TacticSource::kCUBLAS_LT) |
1U << static_cast<uint32_t>(nvinfer1::TacticSource::kCUDNN);
if (prop.major >= 8) {
tacticSources |= 1U << static_cast<uint32_t>(nvinfer1::TacticSource::kEDGE_MASK_CONVOLUTIONS);
tacticSources |= 1U << static_cast<uint32_t>(nvinfer1::TacticSource::kJIT_CONVOLUTIONS);
std::cout << "Enhanced tactic sources enabled for Ampere+ architecture" << std::endl;
}
config->setTacticSources(tacticSources);
// kDETAILED profiling embeds per-layer metadata in the engine and adds measurable
// build/inference overhead. Use kNONE for production; switch to kDETAILED or
// kLAYER_NAMES_ONLY only when profiling with Nsight Systems / trt-exec --profilingVerbosity.
config->setProfilingVerbosity(nvinfer1::ProfilingVerbosity::kNONE);
// Set timing iterations
config->setAvgTimingIterations(4);
std::cout << "Timing iterations set to 4 for stable kernel selection" << std::endl;
// Set hardware compatibility
config->setHardwareCompatibilityLevel(nvinfer1::HardwareCompatibilityLevel::kNONE);
// -- TensorRT 10+ tactic DRAM pool ----------------------------------------
// Separate scratch pool for kernel-selection during build. Without this,
// tactic evaluation competes with the workspace allocation and tactics
// requesting >1 GiB get skipped, causing hours of wasted fallback searches.
//
// Strategy: give the tactic pool as much memory as possible while reserving
// enough for workspace + builder overhead. The cap scales with GPU VRAM:
// <=12 GiB -> up to 2 GiB (most tactics fit within 1.5 GiB)
// <=24 GiB -> up to 4 GiB (room for larger model tactics)
// > 24 GiB -> up to 8 GiB (future-proof for very large models)
#if NV_TENSORRT_MAJOR >= 10
{
// Scale the tactic cap by GPU VRAM -- larger GPUs can afford more
size_t tacticCap;
if (totalMiB > 24576) tacticCap = 8ULL * 1024 * 1024 * 1024; // > 24 GiB
else if (totalMiB > 12288) tacticCap = 4ULL * 1024 * 1024 * 1024; // > 12 GiB
else tacticCap = 2ULL * 1024 * 1024 * 1024; // <= 12 GiB
// Reserve workspace + 512 MiB safety margin for builder internals
const size_t reserveForBuild = workspace_size + (512ULL * 1024 * 1024);
const size_t availableForTactic =
(free_mem > reserveForBuild) ? (free_mem - reserveForBuild) : 0ULL;
size_t tacticMemory = std::min(tacticCap, availableForTactic);
// kTACTIC_DRAM requires a power-of-2 size; floor to nearest power of 2
if (tacticMemory > 0) {
size_t p = 1ULL;
while (p * 2 <= tacticMemory) p *= 2;
tacticMemory = p;
}
config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kTACTIC_DRAM, tacticMemory);
std::cout << "kTACTIC_DRAM pool: " << tacticMemory / (1024 * 1024) << " MiB (TRT 10+)" << std::endl;
}
#endif
// -- kSTRONGLY_TYPED (TRT 8.5 - 9.x only) --------------------------------
// This flag existed in TRT 8.5 through 9.x to opt into strict type
// enforcement. NVIDIA removed the enum in TRT 10.0 because strongly-typed
// networks became the default behaviour -- setting it on TRT 10+ produces a
// compile error ("undeclared identifier"). For TRT 10+ simply log a note.
#if NV_TENSORRT_MAJOR < 10
if (m_options.precision != ANSCENTER::Precision::INT8) {
config->setFlag(nvinfer1::BuilderFlag::kSTRONGLY_TYPED);
std::cout << "kSTRONGLY_TYPED enabled (TRT 8.5-9.x, FP32/FP16 mode)" << std::endl;
}
#else
// TRT 10+: strongly-typed networks are the default; no flag required.
std::cout << "Info: Strongly-typed mode is default in TRT 10+ (kSTRONGLY_TYPED removed)" << std::endl;
#endif
// -- kFASTER_DYNAMIC_SHAPES ------------------------------------------------
// This flag reduces context-reshape overhead when batch size changes between
// calls (10-100x faster switching, ~5% larger engine). It was added in a
// TRT 10 minor release but the exact version varies by NVIDIA build; the
// enum is absent from the installed headers so it is disabled here.
// To re-enable: uncomment the block below once you confirm your TRT version
// exposes nvinfer1::BuilderFlag::kFASTER_DYNAMIC_SHAPES.
//
// if (doesSupportDynamicBatch && m_options.maxBatchSize > 1) {
// config->setFlag(nvinfer1::BuilderFlag::kFASTER_DYNAMIC_SHAPES);
// std::cout << "kFASTER_DYNAMIC_SHAPES enabled" << std::endl;
// }
// -- kWEIGHT_STREAMING (TRT 10+) ------------------------------------------
// DISABLED: kWEIGHT_STREAMING requires INetworkDefinition::setStronglyTyped(true)
// to be called on the network before buildSerializedNetwork(), which is not done
// for ONNX-imported networks in this code path. BuilderFlag::kSTRONGLY_TYPED was
// removed from TRT 10+ (compile error), so there is no flag-level workaround.
// Re-enable only if the ONNX parser layer is updated to call setStronglyTyped(true).
// #if NV_TENSORRT_MAJOR >= 10
// config->setFlag(nvinfer1::BuilderFlag::kWEIGHT_STREAMING);
// std::cout << "kWEIGHT_STREAMING enabled (TRT 10+)" << std::endl;
// #endif
// ============================================================================
// TENSORRT 10+ DYNAMIC BATCH SUMMARY
// ============================================================================
#if NV_TENSORRT_MAJOR >= 10
std::cout << "\nTensorRT " << NV_TENSORRT_MAJOR << "." << NV_TENSORRT_MINOR << "." << NV_TENSORRT_PATCH
<< " | dynamic batch: " << (doesSupportDynamicBatch && m_options.maxBatchSize > 1 ? "YES" : "NO")
<< " | max batch: " << m_options.maxBatchSize
<< " | opt level: " << optLevel
<< " | GPU VRAM: " << totalMiB << " MiB" << std::endl;
#endif
// Load timing cache if available (use actual engine name -- batch may have been capped above)
const auto currentEngineName = serializeEngineOptions(m_options, onnxModelPath);
std::string timingCachePath = currentEngineName + ".timing.cache";
std::vector<char> timingCache;
std::ifstream timingCacheFile(timingCachePath, std::ios::binary);
if (timingCacheFile.good()) {
timingCacheFile.seekg(0, std::ios::end);
timingCache.resize(timingCacheFile.tellg());
timingCacheFile.seekg(0, std::ios::beg);
timingCacheFile.read(timingCache.data(), timingCache.size());
auto cache = config->createTimingCache(timingCache.data(), timingCache.size());
if (cache) {
config->setTimingCache(*cache, false);
std::cout << "Loaded timing cache from: " << timingCachePath << std::endl;
std::cout << " Cache size: " << timingCache.size() / 1024 << " KiB" << std::endl;
}
}
else {
std::cout << "No existing timing cache found (this is normal for first build)" << std::endl;
}
// ============================================================================
// OPTIMIZATION PROFILE CONFIGURATION
// ============================================================================
std::cout << "\n========================================" << std::endl;
std::cout << "Configuring Optimization Profiles" << std::endl;
std::cout << "========================================" << std::endl;
// Validate batch size options
if (doesSupportDynamicBatch) {
if (m_options.optBatchSize < 1) {
std::cout << "Warning: optBatchSize < 1, setting to 1" << std::endl;
m_options.optBatchSize = 1;
}
if (m_options.maxBatchSize < m_options.optBatchSize) {
std::cout << "Warning: maxBatchSize < optBatchSize, adjusting maxBatchSize" << std::endl;
m_options.maxBatchSize = m_options.optBatchSize;
}
std::cout << "Dynamic batch configuration validated:" << std::endl;
std::cout << " Min batch size: 1" << std::endl;
std::cout << " Opt batch size: " << m_options.optBatchSize << std::endl;
std::cout << " Max batch size: " << m_options.maxBatchSize << std::endl;
}
// Create optimization profile
nvinfer1::IOptimizationProfile* optProfile = builder->createOptimizationProfile();
if (!optProfile) {
std::cout << "Error: Failed to create optimization profile" << std::endl;
return false;
}
for (int32_t i = 0; i < numInputs; ++i) {
const auto input = network->getInput(i);
const auto inputName = input->getName();
const auto inputDims = input->getDimensions();
int32_t inputC = inputDims.d[1];
int32_t inputH = inputDims.d[2];
int32_t inputW = inputDims.d[3];
// Use configured values for height
int32_t minInputHeight = doesSupportDynamicHeight ? m_options.minInputHeight : inputH;
int32_t optInputHeight = doesSupportDynamicHeight ? m_options.optInputHeight : inputH;
int32_t maxInputHeight = doesSupportDynamicHeight ? m_options.maxInputHeight : inputH;
// Use configured values for width
int32_t minInputWidth = doesSupportDynamicWidth ? m_options.minInputWidth : inputW;
int32_t optInputWidth = doesSupportDynamicWidth ? m_options.optInputWidth : inputW;
int32_t maxInputWidth = doesSupportDynamicWidth ? m_options.maxInputWidth : inputW;
// Create dimension objects
int32_t minBatch = doesSupportDynamicBatch ? 1 : m_options.optBatchSize;
int32_t optBatch = doesSupportDynamicBatch ? m_options.optBatchSize : m_options.optBatchSize;
int32_t maxBatch = doesSupportDynamicBatch ? m_options.maxBatchSize : m_options.maxBatchSize;
nvinfer1::Dims4 minDims(minBatch, inputC, minInputHeight, minInputWidth);
nvinfer1::Dims4 optDims(optBatch, inputC, optInputHeight, optInputWidth);
nvinfer1::Dims4 maxDims(maxBatch, inputC, maxInputHeight, maxInputWidth);
std::cout << "\nSetting profile for input '" << inputName << "':" << std::endl;
std::cout << " MIN: [" << minDims.d[0] << "," << minDims.d[1] << ","
<< minDims.d[2] << "," << minDims.d[3] << "]" << std::endl;
std::cout << " OPT: [" << optDims.d[0] << "," << optDims.d[1] << ","
<< optDims.d[2] << "," << optDims.d[3] << "]" << std::endl;
std::cout << " MAX: [" << maxDims.d[0] << "," << maxDims.d[1] << ","
<< maxDims.d[2] << "," << maxDims.d[3] << "]" << std::endl;
// Set the dimensions with error checking
bool minSet = optProfile->setDimensions(inputName, nvinfer1::OptProfileSelector::kMIN, minDims);
bool optSet = optProfile->setDimensions(inputName, nvinfer1::OptProfileSelector::kOPT, optDims);
bool maxSet = optProfile->setDimensions(inputName, nvinfer1::OptProfileSelector::kMAX, maxDims);
if (!minSet || !optSet || !maxSet) {
std::cout << " ✗ ERROR: Failed to set profile dimensions!" << std::endl;
std::cout << " minSet: " << (minSet ? "OK" : "FAILED") << std::endl;
std::cout << " optSet: " << (optSet ? "OK" : "FAILED") << std::endl;
std::cout << " maxSet: " << (maxSet ? "OK" : "FAILED") << std::endl;
return false;
}
std::cout << " ✓ Profile dimensions set successfully" << std::endl;
}
// Validate the profile
std::cout << "\n========================================" << std::endl;
std::cout << "VALIDATING OPTIMIZATION PROFILE" << std::endl;
std::cout << "========================================" << std::endl;
bool profileValid = optProfile->isValid();
std::cout << "Profile validation result: " << (profileValid ? "✓ VALID" : "✗ INVALID") << std::endl;
if (!profileValid) {
std::cout << "ERROR: Profile is invalid! Cannot continue." << std::endl;
std::cout << "This usually means the min/opt/max dimensions are inconsistent." << std::endl;
return false;
}
// Verify what we actually set
for (int32_t i = 0; i < numInputs; ++i) {
const auto input = network->getInput(i);
const auto inputName = input->getName();
auto minDims = optProfile->getDimensions(inputName, nvinfer1::OptProfileSelector::kMIN);
auto optDims = optProfile->getDimensions(inputName, nvinfer1::OptProfileSelector::kOPT);
auto maxDims = optProfile->getDimensions(inputName, nvinfer1::OptProfileSelector::kMAX);
std::cout << "\nVerified profile for input '" << inputName << "':" << std::endl;
std::cout << " MIN: [" << minDims.d[0] << "," << minDims.d[1] << ","
<< minDims.d[2] << "," << minDims.d[3] << "]" << std::endl;
std::cout << " OPT: [" << optDims.d[0] << "," << optDims.d[1] << ","
<< optDims.d[2] << "," << optDims.d[3] << "]" << std::endl;
std::cout << " MAX: [" << maxDims.d[0] << "," << maxDims.d[1] << ","
<< maxDims.d[2] << "," << maxDims.d[3] << "]" << std::endl;
// Check batch dimension range
if (minDims.d[0] != maxDims.d[0]) {
std::cout << " ✓ Profile IS DYNAMIC (batch " << minDims.d[0]
<< " to " << maxDims.d[0] << ")" << std::endl;
}
else {
std::cout << " • Profile IS FIXED at batch " << minDims.d[0] << std::endl;
if (doesSupportDynamicBatch && m_options.maxBatchSize > 1) {
std::cout << "\n🚨 CRITICAL ERROR: ONNX supports dynamic batch but profile is fixed!" << std::endl;
return false;
}
}
}
std::cout << "========================================" << std::endl;
// Add the validated profile
config->addOptimizationProfile(optProfile);
int32_t numProfiles = config->getNbOptimizationProfiles();
std::cout << "\n✓ Optimization profile added successfully" << std::endl;
std::cout << " Total profiles in config: " << numProfiles << std::endl;
if (doesSupportDynamicBatch && m_options.maxBatchSize > 1) {
std::cout << " ✓ Profile covers DYNAMIC batch range: 1 to " << m_options.maxBatchSize << std::endl;
}
else {
std::cout << " • Profile has FIXED batch size: " << m_options.maxBatchSize << std::endl;
}
// ============================================================================
// PRECISION CONFIGURATION
// ============================================================================
std::cout << "\n========================================" << std::endl;
std::cout << "Configuring Precision" << std::endl;
std::cout << "========================================" << std::endl;
if (m_options.precision == ANSCENTER::Precision::FP16) {
if (!builder->platformHasFastFp16()) {
std::cout << "Error: GPU does not support FP16 precision" << std::endl;
return false;
}
config->setFlag(nvinfer1::BuilderFlag::kFP16);
std::cout << "FP16 precision enabled" << std::endl;
// Mixed precision safety: force numerically sensitive layers to FP32.
// Some models (e.g. PP-OCRv5 det) produce NaN when certain layers
// run in FP16 due to overflow in intermediate accumulators. Forcing
// these layers to FP32 has negligible performance impact while
// preventing NaN corruption.
//
// Targeted layer types:
// - kREDUCE : accumulation overflows FP16 max (65504)
// - kELEMENTWISE/Pow: large intermediate values
// - kNORMALIZATION : mean/variance reduction + 1/sqrt overflow
// - kSOFTMAX : exp() extremely sensitive to precision
// - kACTIVATION/Sigmoid: 1/(1+exp(-x)) overflows for large |x|
// - kUNARY/Exp,Log : exp overflows for x>~11, log underflows
//
// IMPORTANT: setPrecision() is only a HINT without kOBEY_PRECISION_CONSTRAINTS.
// We must set this flag so TRT strictly respects our per-layer FP32 overrides.
// (kPREFER_PRECISION_CONSTRAINTS is deprecated/no-op in TRT 10.12+;
// kOBEY means build FAILS if no FP32 kernel exists — better than silent NaN.)
int fp32Overrides = 0;
const int numLayers = network->getNbLayers();
// --- Diagnostic: enumerate all layer types in this network ---
std::map<std::string, int> layerTypeCounts;
auto layerTypeName = [](nvinfer1::LayerType t) -> std::string {
switch (t) {
case nvinfer1::LayerType::kCONVOLUTION: return "Convolution";
case nvinfer1::LayerType::kCAST: return "Cast";
case nvinfer1::LayerType::kACTIVATION: return "Activation";
case nvinfer1::LayerType::kPOOLING: return "Pooling";
case nvinfer1::LayerType::kLRN: return "LRN";
case nvinfer1::LayerType::kSCALE: return "Scale";
case nvinfer1::LayerType::kSOFTMAX: return "Softmax";
case nvinfer1::LayerType::kDECONVOLUTION: return "Deconvolution";
case nvinfer1::LayerType::kCONCATENATION: return "Concatenation";
case nvinfer1::LayerType::kELEMENTWISE: return "ElementWise";
case nvinfer1::LayerType::kPLUGIN: return "Plugin";
case nvinfer1::LayerType::kUNARY: return "Unary";
case nvinfer1::LayerType::kPADDING: return "Padding";
case nvinfer1::LayerType::kSHUFFLE: return "Shuffle";
case nvinfer1::LayerType::kREDUCE: return "Reduce";
case nvinfer1::LayerType::kTOPK: return "TopK";
case nvinfer1::LayerType::kGATHER: return "Gather";
case nvinfer1::LayerType::kMATRIX_MULTIPLY: return "MatrixMultiply";
case nvinfer1::LayerType::kCONSTANT: return "Constant";
case nvinfer1::LayerType::kIDENTITY: return "Identity";
case nvinfer1::LayerType::kSLICE: return "Slice";
case nvinfer1::LayerType::kSHAPE: return "Shape";
case nvinfer1::LayerType::kRESIZE: return "Resize";
case nvinfer1::LayerType::kSELECT: return "Select";
case nvinfer1::LayerType::kFILL: return "Fill";
case nvinfer1::LayerType::kQUANTIZE: return "Quantize";
case nvinfer1::LayerType::kDEQUANTIZE: return "Dequantize";
case nvinfer1::LayerType::kSCATTER: return "Scatter";
case nvinfer1::LayerType::kEINSUM: return "Einsum";
case nvinfer1::LayerType::kGRID_SAMPLE: return "GridSample";
case nvinfer1::LayerType::kNMS: return "NMS";
case nvinfer1::LayerType::kNORMALIZATION: return "Normalization";
case nvinfer1::LayerType::kSQUEEZE: return "Squeeze";
case nvinfer1::LayerType::kUNSQUEEZE: return "Unsqueeze";
default: return "Unknown(" + std::to_string(static_cast<int>(t)) + ")";
}
};
for (int i = 0; i < numLayers; ++i) {
auto* layer = network->getLayer(i);
const auto ltype = layer->getType();
bool needsFP32 = false;
switch (ltype) {
case nvinfer1::LayerType::kREDUCE:
needsFP32 = true;
break;
case nvinfer1::LayerType::kELEMENTWISE:
{
// Only force Pow to FP32; Add/Mul/etc. are fine in FP16
auto* ew = static_cast<nvinfer1::IElementWiseLayer*>(layer);
if (ew->getOperation() == nvinfer1::ElementWiseOperation::kPOW) {
needsFP32 = true;
}
break;
}
case nvinfer1::LayerType::kNORMALIZATION:
needsFP32 = true;
break;
case nvinfer1::LayerType::kSOFTMAX:
needsFP32 = true;
break;
case nvinfer1::LayerType::kACTIVATION:
{
// Sigmoid is 1/(1+exp(-x)) — exp overflows FP16 for large |x|
auto* act = static_cast<nvinfer1::IActivationLayer*>(layer);
if (act->getActivationType() == nvinfer1::ActivationType::kSIGMOID) {
needsFP32 = true;
}
break;
}
case nvinfer1::LayerType::kUNARY:
{
// Exp overflows FP16 for x > ~11; Log underflows for tiny values
auto* un = static_cast<nvinfer1::IUnaryLayer*>(layer);
const auto op = un->getOperation();
if (op == nvinfer1::UnaryOperation::kEXP ||
op == nvinfer1::UnaryOperation::kLOG) {
needsFP32 = true;
}
break;
}
default:
break;
}
// Track layer type for diagnostic summary
std::string name = layerTypeName(ltype);
if (needsFP32) name += " [FP32]";
layerTypeCounts[name]++;
if (needsFP32) {
layer->setPrecision(nvinfer1::DataType::kFLOAT);
for (int o = 0; o < layer->getNbOutputs(); ++o) {
layer->setOutputType(o, nvinfer1::DataType::kFLOAT);
}
++fp32Overrides;
}
}
// Print layer type summary
std::cout << " Network layer types (" << numLayers << " total):" << std::endl;
for (const auto& kv : layerTypeCounts) {
std::cout << " " << kv.first << ": " << kv.second << std::endl;
}
if (fp32Overrides > 0) {
// Enforce per-layer precision constraints — without this flag,
// setPrecision(kFLOAT) is merely a hint that TRT can ignore.
config->setFlag(nvinfer1::BuilderFlag::kOBEY_PRECISION_CONSTRAINTS);
std::cout << " Mixed precision: " << fp32Overrides
<< " / " << numLayers
<< " layers forced to FP32"
<< std::endl;
std::cout << " kOBEY_PRECISION_CONSTRAINTS enabled to enforce FP32 on marked layers"
<< std::endl;
}
}
else if (m_options.precision == ANSCENTER::Precision::INT8) {
if (numInputs > 1) {
std::cout << "Error: This implementation currently only supports INT8 for single input models" << std::endl;
return false;
}
if (!builder->platformHasFastInt8()) {
std::cout << "Error: GPU does not support INT8 precision" << std::endl;
return false;
}
if (m_options.calibrationDataDirectoryPath.empty()) {
std::cout << "Error: INT8 precision requires calibration data directory path" << std::endl;
return false;
}
config->setFlag(nvinfer1::BuilderFlag::kINT8);
std::cout << "INT8 precision enabled" << std::endl;
const auto input = network->getInput(0);
const auto inputName = input->getName();
const auto inputDims = input->getDimensions();
const auto calibrationFileName = currentEngineName + ".calibration";
m_calibrator = std::make_unique<Int8EntropyCalibrator2>(m_options.calibrationBatchSize, inputDims.d[3], inputDims.d[2],
m_options.calibrationDataDirectoryPath, calibrationFileName, inputName,
subVals, divVals, normalize);
config->setInt8Calibrator(m_calibrator.get());
}
else {
// FP32 mode - do NOT enable kFP16 flag; some models (e.g. PP-OCRv5 det)
// produce NaN when TRT silently promotes layers to FP16.
std::cout << "FP32 precision (strict, no FP16 fallback)" << std::endl;
}
// ============================================================================
// BUILD ENGINE
// ============================================================================
std::cout << "\n========================================" << std::endl;
std::cout << "Building Engine" << std::endl;
std::cout << "========================================" << std::endl;
cudaStream_t profileStream;
Util::checkCudaErrorCode(cudaStreamCreate(&profileStream));
config->setProfileStream(profileStream);
std::cout << "Building engine... This may take several minutes." << std::endl;
std::cout << "Progress will be shown as layers are optimized..." << std::endl;
if (doesSupportDynamicBatch && m_options.maxBatchSize > 1) {
std::cout << "✓ Building with DYNAMIC batch support (1-" << m_options.maxBatchSize << ")" << std::endl;
}
else {
std::cout << "• Building with FIXED batch size " << m_options.maxBatchSize << std::endl;
}
// Build the engine (crash-safe)
auto startTime = std::chrono::high_resolution_clock::now();
unsigned long sehCodeBuild = 0;
std::unique_ptr<nvinfer1::IHostMemory> plan{
buildSerializedNetworkSafe(builder.get(), *network, *config, &sehCodeBuild)
};
auto endTime = std::chrono::high_resolution_clock::now();
if (sehCodeBuild != 0) {
Util::checkCudaErrorCode(cudaStreamDestroy(profileStream));
return false;
}
if (!plan) {
std::cout << "\n========================================" << std::endl;
std::cout << "Build Failed!" << std::endl;
std::cout << "========================================" << std::endl;
std::cout << "Error: Failed to build engine." << std::endl;
Util::checkCudaErrorCode(cudaStreamDestroy(profileStream));
return false;
}
auto buildTime = std::chrono::duration_cast<std::chrono::seconds>(endTime - startTime).count();
std::cout << "\n========================================" << std::endl;
std::cout << "Build Successful!" << std::endl;
std::cout << "========================================" << std::endl;
std::cout << "Build time: " << buildTime << " seconds (" << buildTime / 60 << " minutes)" << std::endl;
// Write the engine to disk.
// Re-compute the filename because build() may have capped maxBatchSize
// (e.g. b32 -> b8), so the saved file must match the actual config.
const auto actualEngineName = serializeEngineOptions(m_options, onnxModelPath);
const auto enginePath = std::filesystem::path(m_options.engineFileDir) / actualEngineName;
std::ofstream outfile(enginePath, std::ofstream::binary);
if (!outfile) {
std::cout << "Error: Failed to open file for writing: " << enginePath << std::endl;
Util::checkCudaErrorCode(cudaStreamDestroy(profileStream));
return false;
}
outfile.write(reinterpret_cast<const char*>(plan->data()), plan->size());
outfile.close();
std::cout << "Engine saved to: " << enginePath.string() << std::endl;
std::cout << "Engine size: " << plan->size() / (1024 * 1024) << " MiB" << std::endl;
if (doesSupportDynamicBatch && m_options.maxBatchSize > 1) {
std::cout << "✓ Engine supports DYNAMIC batch sizes: 1 to " << m_options.maxBatchSize << std::endl;
}
else {
std::cout << "• Engine supports FIXED batch size: " << m_options.maxBatchSize << std::endl;
}
// Save timing cache
auto timingCacheFromConfig = config->getTimingCache();
if (timingCacheFromConfig) {
auto timingCacheData = timingCacheFromConfig->serialize();
if (timingCacheData) {
std::ofstream timingCacheOut(timingCachePath, std::ios::binary);
if (timingCacheOut) {
timingCacheOut.write(static_cast<const char*>(timingCacheData->data()), timingCacheData->size());
timingCacheOut.close();
std::cout << "Timing cache saved to: " << timingCachePath << std::endl;
std::cout << " Cache size: " << timingCacheData->size() / 1024 << " KiB" << std::endl;
}
}
}
Util::checkCudaErrorCode(cudaStreamDestroy(profileStream));
std::cout << "\n========================================" << std::endl;
std::cout << "Build Complete!" << std::endl;
std::cout << "========================================" << std::endl;
if (doesSupportDynamicBatch && m_options.maxBatchSize > 1) {
std::cout << "\n✓ Engine supports batch inference (1-" << m_options.maxBatchSize << " images)" << std::endl;
}
return true;
}
// ============================================================================
// buildSafe()
//
// SEH wrapper around build(). Cannot use __try in a function with C++
// destructors, so the actual build() call is forwarded through a plain-C
// function pointer via callBoolFuncSafe().
// ============================================================================
// POD context forwarded through the crash-safe dispatcher (SEH on Windows,
// signal/longjmp on Linux). Only trivially-copyable fields belong here:
// the Windows SEH wrapper cannot share a scope with C++ destructors.
//
// Lifetime: enginePtr, onnxPath, subVals and divVals are NON-owning raw
// pointers and must remain valid for the duration of the wrapped call.
//
// Fix: all members now carry default initializers so a context that is
// default- or partially-initialized never exposes indeterminate values
// (previously every field was left uninitialized — UB if a caller missed
// one). The struct remains an aggregate in C++14/17, so existing
// member-by-member and brace initialization both keep working.
struct BuildSafeCtx_Base {
    void*        enginePtr   = nullptr; // Engine<T>*, type-erased for the C-style trampoline
    const char*  onnxPath    = nullptr; // ONNX model path bytes
    size_t       onnxPathLen = 0;       // byte length of onnxPath
    const float* subVals     = nullptr; // 3 per-channel subtraction values (read [0..2])
    const float* divVals     = nullptr; // 3 per-channel division values (read [0..2])
    bool         normalize   = false;   // normalization flag forwarded to build()
    bool         result      = false;   // scratch result slot; not read by the trampoline
};
// Plain-C-compatible trampoline: unpacks the POD context prepared by
// Engine<T>::buildSafe() and invokes the real C++ build() on the target
// engine instance. All C++ objects (string/array copies) live here, on
// the far side of the SEH/signal boundary, where destructors are legal.
template <typename T>
static bool buildSafe_trampoline(void* ctx) {
    auto* base = static_cast<BuildSafeCtx_Base*>(ctx);
    auto* self = static_cast<Engine<T>*>(base->enginePtr);
    // Rebuild owned C++ values from the raw pointer/length fields.
    const std::string modelPath(base->onnxPath, base->onnxPathLen);
    const std::array<float, 3> sub{ base->subVals[0], base->subVals[1], base->subVals[2] };
    const std::array<float, 3> div{ base->divVals[0], base->divVals[1], base->divVals[2] };
    return self->build(modelPath, sub, div, base->normalize);
}
// Crash-safe front end for build(): marshals every argument into a flat,
// destructor-free context and dispatches through the plain-C wrapper
// (SEH on Windows, signal/longjmp on Linux). On a caught crash,
// *outSehCode receives the exception code / signal number; 0 means the
// call completed normally and the returned bool is build()'s result.
template <typename T>
bool Engine<T>::buildSafe(std::string onnxModelPath,
    const std::array<float, 3>& subVals,
    const std::array<float, 3>& divVals,
    bool normalize,
    unsigned long* outSehCode)
{
    // Aggregate-initialize the context in one shot; the string and arrays
    // referenced here outlive the wrapped call (they are parameters of
    // this function), so the raw pointers stay valid throughout.
    BuildSafeCtx_Base ctx{
        this,                   // enginePtr
        onnxModelPath.c_str(),  // onnxPath
        onnxModelPath.size(),   // onnxPathLen
        subVals.data(),         // subVals
        divVals.data(),         // divVals
        normalize,              // normalize
        false                   // result (unused)
    };
    return callBoolFuncSafe(&buildSafe_trampoline<T>, &ctx, outSehCode);
}
// ============================================================================
// buildWithRetry()
//
// Wraps build() with auto-retry for dynamic spatial dimension models.
// Pre-analyzes the ONNX model to detect dynamic H/W dims, then builds a
// fallback chain (max → 75% → 56% → ... → 640 → 320). Each candidate
// calls build(), which checks for a cached engine first (fast) then tries
// building if no cache exists. Fixed-spatial models skip retry.
// ============================================================================
template <typename T>
bool Engine<T>::buildWithRetry(std::string onnxModelPath,
                               const std::array<float, 3>& subVals,
                               const std::array<float, 3>& divVals,
                               bool normalize)
{
    // ------------------------------------------------------------------------
    // Pre-analysis: parse the ONNX model once (crash-safe) to find out whether
    // the first input has dynamic spatial dims (H/W reported as -1).
    // onnxFixedH/W hold the fixed dimension value, or 0 when it is dynamic.
    // ------------------------------------------------------------------------
    bool hasDynamicSpatial = false;
    int onnxFixedH = 0, onnxFixedW = 0; // 0 = dynamic (-1 in ONNX)
    if (m_options.maxInputHeight > 0 && m_options.maxInputWidth > 0) {
        auto tempBuilder = std::unique_ptr<nvinfer1::IBuilder>(
            nvinfer1::createInferBuilder(m_logger));
        auto tempNetwork = std::unique_ptr<nvinfer1::INetworkDefinition>(TRT_CREATE_NETWORK(tempBuilder));
        auto tempParser = std::unique_ptr<nvonnxparser::IParser>(
            nvonnxparser::createParser(*tempNetwork, m_logger));
        std::ifstream onnxFile(onnxModelPath, std::ios::binary | std::ios::ate);
        if (onnxFile.good()) {
            std::streamsize onnxSize = onnxFile.tellg();
            onnxFile.seekg(0, std::ios::beg);
            std::vector<char> onnxBuffer(onnxSize);
            if (onnxFile.read(onnxBuffer.data(), onnxSize)) {
                unsigned long sehRetryParse = 0;
                bool retryParsed = parseOnnxModelSafe(tempParser.get(),
                    onnxBuffer.data(), onnxBuffer.size(), &sehRetryParse);
                if (sehRetryParse != 0) {
                    // Parser crashed: hasDynamicSpatial stays false, so we fall
                    // through to the single build() attempt below.
                }
                else if (retryParsed && tempNetwork->getNbInputs() > 0) {
                    auto dims = tempNetwork->getInput(0)->getDimensions();
                    if (dims.nbDims >= 4) { // NCHW-style input expected
                        if (dims.d[2] == -1 || dims.d[3] == -1)
                            hasDynamicSpatial = true;
                        onnxFixedH = (dims.d[2] != -1) ? dims.d[2] : 0;
                        onnxFixedW = (dims.d[3] != -1) ? dims.d[3] : 0;
                    }
                }
            }
        }
    }

    // -- Fixed-spatial or no dynamic dims: single build attempt ----------------
    if (!hasDynamicSpatial) {
        unsigned long sehBuild = 0;
        bool ok = buildSafe(onnxModelPath, subVals, divVals, normalize, &sehBuild);
        if (sehBuild != 0) {
            return false; // build crashed (SEH / signal); do not retry
        }
        return ok;
    }

    // -- Dynamic spatial dims: build with fallback chain ----------------------
    const bool dynamicH = (onnxFixedH == 0);
    const bool dynamicW = (onnxFixedW == 0);
    const int origMaxH = m_options.maxInputHeight;
    const int origMaxW = m_options.maxInputWidth;
    const int origOptH = m_options.optInputHeight;
    const int origOptW = m_options.optInputWidth;
    const int origMinH = m_options.minInputHeight;
    const int origMinW = m_options.minInputWidth;
    int dynMaxH = dynamicH ? origMaxH : 0;
    int dynMaxW = dynamicW ? origMaxW : 0;
    // At least one of H/W is dynamic and origMaxH/W > 0 (checked above),
    // so maxDynDim >= 1 and the scale division below is safe.
    int maxDynDim = std::max(dynMaxH, dynMaxW);

    // Build fallback chain: max → 75% → 56% → ... → 640 → 320
    std::vector<int> candidates;
    for (int s = maxDynDim; s >= 320; s = (s * 3) / 4) {
        s = (s / 32) * 32; // snap down to a multiple of 32 (>= 320 preserved)
        if (candidates.empty() || candidates.back() != s)
            candidates.push_back(s);
    }
    // FIX: when maxDynDim < 320 the loop above never runs and the original
    // code called back() on an empty vector (undefined behavior). Guarantee
    // at least one candidate: the model's own max dim, snapped to /32.
    if (candidates.empty())
        candidates.push_back(std::max(32, (maxDynDim / 32) * 32));
    if (candidates.back() > 640) candidates.push_back(640);
    if (candidates.back() > 320) candidates.push_back(320);

    // Helper: configure m_options for a given candidate. Dynamic dims are
    // scaled proportionally (snapped to /32, min 32); fixed dims are pinned
    // to the value reported by the ONNX model.
    auto setCandidateOptions = [&](int candidate) {
        float scale = static_cast<float>(candidate) / maxDynDim;
        m_options.maxInputHeight = dynamicH
            ? std::max(32, (static_cast<int>(origMaxH * scale) / 32) * 32)
            : onnxFixedH;
        m_options.maxInputWidth = dynamicW
            ? std::max(32, (static_cast<int>(origMaxW * scale) / 32) * 32)
            : onnxFixedW;
        m_options.minInputHeight = dynamicH
            ? std::min(origMinH, m_options.maxInputHeight) : onnxFixedH;
        m_options.minInputWidth = dynamicW
            ? std::min(origMinW, m_options.maxInputWidth) : onnxFixedW;
        m_options.optInputHeight = dynamicH
            ? std::min(origOptH, m_options.maxInputHeight) : onnxFixedH;
        m_options.optInputWidth = dynamicW
            ? std::min(origOptW, m_options.maxInputWidth) : onnxFixedW;
    };

    // Helper: restore the caller's original options (used on failure paths
    // so error reporting reflects what was actually requested).
    auto restoreOriginalOptions = [&]() {
        m_options.maxInputHeight = origMaxH;
        m_options.maxInputWidth = origMaxW;
        m_options.optInputHeight = origOptH;
        m_options.optInputWidth = origOptW;
        m_options.minInputHeight = origMinH;
        m_options.minInputWidth = origMinW;
    };

    // Try each candidate (largest first). build() checks cache before
    // building, so previously cached smaller engines are found quickly.
    for (size_t attempt = 0; attempt < candidates.size(); ++attempt) {
        setCandidateOptions(candidates[attempt]);
        {
            unsigned long sehAttempt = 0;
            bool attemptOk = buildSafe(onnxModelPath, subVals, divVals, normalize, &sehAttempt);
            if (sehAttempt != 0) {
                // CUDA context may be corrupted — no point retrying.
                // FIX: restore original options here too, matching the
                // exhausted-candidates path below.
                restoreOriginalOptions();
                return false;
            }
            if (attemptOk) {
                // Success: m_options intentionally keeps the candidate values
                // the engine was actually built with.
                return true;
            }
        }
    }

    // All candidates exhausted — restore original options for error reporting.
    restoreOriginalOptions();
    return false;
}
// ============================================================================
// 6-param pool overloads
//
// These are non-virtual additions to Engine<T> that let callers opt into
// multi-GPU pool mode simply by supplying one extra argument:
//
// m_trtEngine->buildLoadNetwork(path, sub, div, norm); // single-GPU
// m_trtEngine->buildLoadNetwork(path, sub, div, norm, -1); // pool
//
// When pooling is disabled -- maxSlotsPerGpu == 0, a force-no-pool flag,
// or a single-GPU system with maxSlotsPerGpu == 1 -- the call delegates to
// the existing 4-param single-GPU implementation: zero behavioural
// difference. Otherwise it routes through loadSlots() which fills all GPUs.
// ============================================================================
template <typename T>
bool Engine<T>::buildLoadNetwork(
    std::string onnxModelPath,
    const std::array<float, 3>& subVals,
    const std::array<float, 3>& divVals,
    bool normalize,
    int maxSlotsPerGpu,
    double memSafetyFactor)
{
    // Decide whether the multi-GPU pool must be bypassed. Pooling is off when:
    //   * maxSlotsPerGpu == 0                 (optimizer bypass)
    //   * m_forceNoPool                       (per-instance override)
    //   * g_forceNoPool / globalBypass()      (process-wide overrides)
    //   * single-GPU box with maxSlotsPerGpu == 1 -- a 1-slot pool only adds
    //     contention overhead (2s timeout + reject) with no multi-GPU benefit;
    //     the CUDA stream already serializes work in single-GPU mode.
    extern std::atomic<bool> g_forceNoPool;
    int deviceCount = 0;
    cudaGetDeviceCount(&deviceCount);
    const bool singleGpuOneSlot = (deviceCount <= 1) && (maxSlotsPerGpu == 1);
    const bool poolDisabled =
        (maxSlotsPerGpu == 0)
        || m_forceNoPool
        || g_forceNoPool.load(std::memory_order_relaxed)
        || TRTEngineCache::globalBypass().load(std::memory_order_relaxed)
        || singleGpuOneSlot;
    if (poolDisabled) {
        std::cout << "Info: buildLoadNetwork -- single-GPU forced (maxSlots=" << maxSlotsPerGpu
                  << ", forceNoPool=" << m_forceNoPool
                  << ", g_forceNoPool=" << g_forceNoPool.load()
                  << ", gpuCount=" << deviceCount << ")" << std::endl;
        // Delegate to the 4-param single-GPU implementation unchanged.
        return buildLoadNetwork(onnxModelPath, subVals, divVals, normalize);
    }

    // Multi-GPU pool path. m_options carries the base configuration that was
    // set either at construction (Engine(options)) or by initializePool().
    std::cout << "Info: buildLoadNetwork -- activating multi-GPU pool"
              << " (maxSlotsPerGpu=" << maxSlotsPerGpu
              << ", memSafetyFactor=" << memSafetyFactor << ")" << std::endl;
    return loadSlots(m_options, onnxModelPath,
                     subVals, divVals, normalize,
                     /*fromOnnx=*/true,
                     maxSlotsPerGpu, memSafetyFactor);
}
template <typename T>
bool Engine<T>::loadNetwork(
    std::string trtModelPath,
    const std::array<float, 3>& subVals,
    const std::array<float, 3>& divVals,
    bool normalize,
    int maxSlotsPerGpu,
    double memSafetyFactor)
{
    // Pre-serialized-engine counterpart of the 6-param buildLoadNetwork():
    // same pool-gating rules, but loads a .trt plan instead of building ONNX.
    {
        extern std::atomic<bool> g_forceNoPool;
        int gpuCount = 0;
        cudaGetDeviceCount(&gpuCount);
        // A 1-slot pool on a single-GPU box adds contention overhead with no
        // multi-GPU benefit, so fall back to the plain single-GPU loader.
        bool singleGpuNoElastic = (gpuCount <= 1 && maxSlotsPerGpu == 1);
        bool noPool = (maxSlotsPerGpu == 0) || m_forceNoPool ||
            g_forceNoPool.load(std::memory_order_relaxed) ||
            TRTEngineCache::globalBypass().load(std::memory_order_relaxed) ||
            singleGpuNoElastic;
        if (noPool) {
            // Consistency fix: also report m_forceNoPool, matching the
            // diagnostic fields logged by buildLoadNetwork() on this path.
            std::cout << "Info: loadNetwork -- single-GPU forced (maxSlots=" << maxSlotsPerGpu
                      << ", forceNoPool=" << m_forceNoPool
                      << ", g_forceNoPool=" << g_forceNoPool.load()
                      << ", gpuCount=" << gpuCount << ")" << std::endl;
            return loadNetwork(trtModelPath, subVals, divVals, normalize);
        }
    }
    // Multi-GPU pool path: distribute engine slots across all visible GPUs.
    std::cout << "Info: loadNetwork -- activating multi-GPU pool"
              << " (maxSlotsPerGpu=" << maxSlotsPerGpu
              << ", memSafetyFactor=" << memSafetyFactor << ")" << std::endl;
    return loadSlots(m_options, trtModelPath,
                     subVals, divVals, normalize,
                     /*fromOnnx=*/false,
                     maxSlotsPerGpu, memSafetyFactor);
}