2637 lines
123 KiB
Plaintext
2637 lines
123 KiB
Plaintext
|
|
#pragma once
|
|||
|
|
#include <filesystem>
|
|||
|
|
#include <map>
|
|||
|
|
#include <sstream>
|
|||
|
|
#include "Utility.h"
|
|||
|
|
#include "TRTCompat.h"
|
|||
|
|
|
|||
|
|
// ============================================================================
|
|||
|
|
// Crash-safe wrappers for TensorRT operations that can crash the process.
|
|||
|
|
//
|
|||
|
|
// On Windows: uses SEH (__try/__except) to catch access violations, OOM, etc.
|
|||
|
|
// SEH cannot coexist with C++ objects that have destructors in the same
|
|||
|
|
// function scope, so these thin wrappers accept only raw pointers.
|
|||
|
|
//
|
|||
|
|
// On Linux: uses POSIX signals + sigsetjmp/siglongjmp to catch SIGSEGV,
|
|||
|
|
// SIGBUS, SIGABRT, SIGFPE. Thread-local jump buffers ensure thread safety.
|
|||
|
|
// Signal handlers are saved/restored around each dangerous call so that
|
|||
|
|
// the application's own handlers are not permanently replaced.
|
|||
|
|
//
|
|||
|
|
// outExceptionCode: 0 = OK.
|
|||
|
|
// Windows: the SEH exception code (e.g. 0xC0000005 = access violation).
|
|||
|
|
// Linux: the signal number (e.g. 11 = SIGSEGV).
|
|||
|
|
// ============================================================================
|
|||
|
|
|
|||
|
|
#ifdef _WIN32
|
|||
|
|
# ifndef WIN32_LEAN_AND_MEAN
|
|||
|
|
# define WIN32_LEAN_AND_MEAN
|
|||
|
|
# endif
|
|||
|
|
# ifndef NOMINMAX
|
|||
|
|
# define NOMINMAX
|
|||
|
|
# endif
|
|||
|
|
# include <windows.h>
|
|||
|
|
#else
|
|||
|
|
# include <signal.h>
|
|||
|
|
# include <setjmp.h>
|
|||
|
|
|
|||
|
|
// Thread-local storage for the POSIX crash-recovery mechanism.
// Each thread gets its own jump buffer and signal number so that
// concurrent engine builds on different threads don't interfere.
// s_crashSignal is volatile sig_atomic_t: the only object type the C/C++
// standards guarantee can be safely written from an async signal handler.
static thread_local sigjmp_buf s_crashJmpBuf;
static thread_local volatile sig_atomic_t s_crashSignal = 0;

// Signal handler installed only around dangerous TensorRT calls.
// It records which signal was received and jumps back to the
// sigsetjmp() checkpoint. Only synchronous, thread-directed signals
// (SIGSEGV, SIGBUS, SIGFPE) are guaranteed to land on the faulting
// thread; SIGABRT is process-wide but typically raised from the same
// thread that called abort().
//
// Order matters: the signal number must be stored BEFORE siglongjmp(),
// because control never returns here once the jump is taken. The second
// argument (1) becomes the non-zero return value of the matching
// sigsetjmp(), which is how callers distinguish the crash path.
static void engineCrashSignalHandler(int sig) {
    s_crashSignal = sig;
    siglongjmp(s_crashJmpBuf, 1);
}
|
|||
|
|
|
|||
|
|
// Helper: install crash signal handlers, saving the previous ones.
|
|||
|
|
struct CrashSignalGuard {
|
|||
|
|
struct sigaction oldSigsegv, oldSigbus, oldSigabrt, oldSigfpe;
|
|||
|
|
|
|||
|
|
void install() {
|
|||
|
|
struct sigaction sa;
|
|||
|
|
sa.sa_handler = engineCrashSignalHandler;
|
|||
|
|
sigemptyset(&sa.sa_mask);
|
|||
|
|
sa.sa_flags = 0; // no SA_RESTART — let interrupted calls fail
|
|||
|
|
sigaction(SIGSEGV, &sa, &oldSigsegv);
|
|||
|
|
sigaction(SIGBUS, &sa, &oldSigbus);
|
|||
|
|
sigaction(SIGABRT, &sa, &oldSigabrt);
|
|||
|
|
sigaction(SIGFPE, &sa, &oldSigfpe);
|
|||
|
|
s_crashSignal = 0;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
void restore() {
|
|||
|
|
sigaction(SIGSEGV, &oldSigsegv, nullptr);
|
|||
|
|
sigaction(SIGBUS, &oldSigbus, nullptr);
|
|||
|
|
sigaction(SIGABRT, &oldSigabrt, nullptr);
|
|||
|
|
sigaction(SIGFPE, &oldSigfpe, nullptr);
|
|||
|
|
}
|
|||
|
|
};
|
|||
|
|
#endif // _WIN32
|
|||
|
|
|
|||
|
|
/// Crash-safe ONNX parser->parse() wrapper.
|
|||
|
|
/// @param outExceptionCode receives the exception/signal code on crash (0 = OK).
|
|||
|
|
static bool parseOnnxModelSafe(
|
|||
|
|
nvonnxparser::IParser* parser,
|
|||
|
|
const void* data,
|
|||
|
|
size_t dataSize,
|
|||
|
|
unsigned long* outExceptionCode)
|
|||
|
|
{
|
|||
|
|
#ifdef _WIN32
|
|||
|
|
*outExceptionCode = 0;
|
|||
|
|
__try {
|
|||
|
|
return parser->parse(data, dataSize);
|
|||
|
|
}
|
|||
|
|
__except (EXCEPTION_EXECUTE_HANDLER) {
|
|||
|
|
*outExceptionCode = GetExceptionCode();
|
|||
|
|
return false;
|
|||
|
|
}
|
|||
|
|
#else
|
|||
|
|
*outExceptionCode = 0;
|
|||
|
|
CrashSignalGuard guard;
|
|||
|
|
guard.install();
|
|||
|
|
bool result = false;
|
|||
|
|
if (sigsetjmp(s_crashJmpBuf, 1) == 0) {
|
|||
|
|
// Normal execution path
|
|||
|
|
result = parser->parse(data, dataSize);
|
|||
|
|
} else {
|
|||
|
|
// Returned here from signal handler — a crash was caught
|
|||
|
|
*outExceptionCode = static_cast<unsigned long>(s_crashSignal);
|
|||
|
|
result = false;
|
|||
|
|
}
|
|||
|
|
guard.restore();
|
|||
|
|
return result;
|
|||
|
|
#endif
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/// Crash-safe builder->buildSerializedNetwork() wrapper.
|
|||
|
|
/// Returns raw IHostMemory* (caller wraps in unique_ptr).
|
|||
|
|
static nvinfer1::IHostMemory* buildSerializedNetworkSafe(
|
|||
|
|
nvinfer1::IBuilder* builder,
|
|||
|
|
nvinfer1::INetworkDefinition& network,
|
|||
|
|
nvinfer1::IBuilderConfig& config,
|
|||
|
|
unsigned long* outExceptionCode)
|
|||
|
|
{
|
|||
|
|
#ifdef _WIN32
|
|||
|
|
*outExceptionCode = 0;
|
|||
|
|
__try {
|
|||
|
|
return builder->buildSerializedNetwork(network, config);
|
|||
|
|
}
|
|||
|
|
__except (EXCEPTION_EXECUTE_HANDLER) {
|
|||
|
|
*outExceptionCode = GetExceptionCode();
|
|||
|
|
return nullptr;
|
|||
|
|
}
|
|||
|
|
#else
|
|||
|
|
*outExceptionCode = 0;
|
|||
|
|
CrashSignalGuard guard;
|
|||
|
|
guard.install();
|
|||
|
|
nvinfer1::IHostMemory* plan = nullptr;
|
|||
|
|
if (sigsetjmp(s_crashJmpBuf, 1) == 0) {
|
|||
|
|
plan = builder->buildSerializedNetwork(network, config);
|
|||
|
|
} else {
|
|||
|
|
*outExceptionCode = static_cast<unsigned long>(s_crashSignal);
|
|||
|
|
plan = nullptr;
|
|||
|
|
}
|
|||
|
|
guard.restore();
|
|||
|
|
return plan;
|
|||
|
|
#endif
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/// Crash-safe runtime->deserializeCudaEngine() wrapper.
|
|||
|
|
/// Returns raw ICudaEngine* (caller wraps in unique_ptr).
|
|||
|
|
static nvinfer1::ICudaEngine* deserializeCudaEngineSafe(
|
|||
|
|
nvinfer1::IRuntime* runtime,
|
|||
|
|
const void* data,
|
|||
|
|
size_t dataSize,
|
|||
|
|
unsigned long* outExceptionCode)
|
|||
|
|
{
|
|||
|
|
#ifdef _WIN32
|
|||
|
|
*outExceptionCode = 0;
|
|||
|
|
__try {
|
|||
|
|
return runtime->deserializeCudaEngine(data, dataSize);
|
|||
|
|
}
|
|||
|
|
__except (EXCEPTION_EXECUTE_HANDLER) {
|
|||
|
|
*outExceptionCode = GetExceptionCode();
|
|||
|
|
return nullptr;
|
|||
|
|
}
|
|||
|
|
#else
|
|||
|
|
*outExceptionCode = 0;
|
|||
|
|
CrashSignalGuard guard;
|
|||
|
|
guard.install();
|
|||
|
|
nvinfer1::ICudaEngine* engine = nullptr;
|
|||
|
|
if (sigsetjmp(s_crashJmpBuf, 1) == 0) {
|
|||
|
|
engine = runtime->deserializeCudaEngine(data, dataSize);
|
|||
|
|
} else {
|
|||
|
|
*outExceptionCode = static_cast<unsigned long>(s_crashSignal);
|
|||
|
|
engine = nullptr;
|
|||
|
|
}
|
|||
|
|
guard.restore();
|
|||
|
|
return engine;
|
|||
|
|
#endif
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/// Crash-safe wrapper for an arbitrary bool-returning function pointer.
|
|||
|
|
/// Used to SEH-protect build() calls that may crash on bad ONNX models.
|
|||
|
|
typedef bool (*BoolFuncPtr)(void* ctx);
|
|||
|
|
static bool callBoolFuncSafe(BoolFuncPtr fn, void* ctx, unsigned long* outExceptionCode) {
|
|||
|
|
#ifdef _WIN32
|
|||
|
|
*outExceptionCode = 0;
|
|||
|
|
__try {
|
|||
|
|
return fn(ctx);
|
|||
|
|
}
|
|||
|
|
__except (EXCEPTION_EXECUTE_HANDLER) {
|
|||
|
|
*outExceptionCode = GetExceptionCode();
|
|||
|
|
return false;
|
|||
|
|
}
|
|||
|
|
#else
|
|||
|
|
*outExceptionCode = 0;
|
|||
|
|
CrashSignalGuard guard;
|
|||
|
|
guard.install();
|
|||
|
|
bool result = false;
|
|||
|
|
if (sigsetjmp(s_crashJmpBuf, 1) == 0) {
|
|||
|
|
result = fn(ctx);
|
|||
|
|
} else {
|
|||
|
|
*outExceptionCode = static_cast<unsigned long>(s_crashSignal);
|
|||
|
|
result = false;
|
|||
|
|
}
|
|||
|
|
guard.restore();
|
|||
|
|
return result;
|
|||
|
|
#endif
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/// Format a crash code for logging (platform-aware).
/// Windows: "SEH exception 0xC0000005"   Linux: "signal 11 (SIGSEGV)"
static std::string formatCrashCode(unsigned long code) {
    std::ostringstream out;
#ifdef _WIN32
    out << "SEH exception 0x" << std::hex << code << std::dec;
#else
    // Resolve the signal name first, then emit everything in one pass.
    const char* sigName = " (unknown)";
    switch (code) {
    case SIGSEGV: sigName = " (SIGSEGV)"; break;
    case SIGBUS:  sigName = " (SIGBUS)";  break;
    case SIGABRT: sigName = " (SIGABRT)"; break;
    case SIGFPE:  sigName = " (SIGFPE)";  break;
    default:                              break;
    }
    out << "signal " << code << sigName;
#endif
    return out.str();
}
|
|||
|
|
|
|||
|
|
template <typename T>
|
|||
|
|
bool Engine<T>::buildLoadNetwork(std::string onnxModelPath, const std::array<float, 3>& subVals, const std::array<float, 3>& divVals,
|
|||
|
|
bool normalize)
|
|||
|
|
{
|
|||
|
|
// -- GPU-tier batch cap (early) -------------------------------------------
|
|||
|
|
// Apply the same VRAM-based batch cap that build() uses BEFORE computing
|
|||
|
|
// the engine filename. Without this, the cache lookup uses the uncapped
|
|||
|
|
// batch size (e.g. b32), misses the file that was saved with the capped
|
|||
|
|
// size (e.g. b16), and triggers a needless full rebuild every launch.
|
|||
|
|
// The cap inside build() still runs later as a safety net (it will be a
|
|||
|
|
// no-op because the batch is already capped here).
|
|||
|
|
{
|
|||
|
|
cudaDeviceProp prop;
|
|||
|
|
cudaGetDeviceProperties(&prop, m_options.deviceIndex);
|
|||
|
|
const size_t totalMiB = prop.totalGlobalMem / (1024ULL * 1024);
|
|||
|
|
|
|||
|
|
int gpuMaxBatch;
|
|||
|
|
if (totalMiB >= 15800) gpuMaxBatch = 32; // ~16 GiB+
|
|||
|
|
else if (totalMiB >= 11800) gpuMaxBatch = 16; // ~12 GiB
|
|||
|
|
else if (totalMiB >= 7900) gpuMaxBatch = 8; // ~ 8 GiB (batch=16 OCR ~987 MiB exec ctx, too large for 4 tasks)
|
|||
|
|
else if (totalMiB >= 3900) gpuMaxBatch = 4; // ~ 4 GiB
|
|||
|
|
else if (totalMiB >= 1900) gpuMaxBatch = 2; // ~ 2 GiB
|
|||
|
|
else gpuMaxBatch = 1; // < 2 GiB
|
|||
|
|
|
|||
|
|
if (m_options.maxBatchSize > gpuMaxBatch) {
|
|||
|
|
if (m_verbose) {
|
|||
|
|
std::cout << "Info: GPU-tier early batch cap: "
|
|||
|
|
<< m_options.maxBatchSize << " -> " << gpuMaxBatch
|
|||
|
|
<< " (GPU has " << totalMiB << " MiB)" << std::endl;
|
|||
|
|
}
|
|||
|
|
m_options.maxBatchSize = gpuMaxBatch;
|
|||
|
|
m_options.optBatchSize = std::min(m_options.optBatchSize,
|
|||
|
|
m_options.maxBatchSize);
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// It is full path
|
|||
|
|
std::string engineName = serializeEngineOptions(m_options, onnxModelPath);
|
|||
|
|
std::string engineDir = m_options.engineFileDir;
|
|||
|
|
|
|||
|
|
if (FileExist(engineName)) {
|
|||
|
|
if (m_verbose) { std::cout << "Engine file found: " << engineName << std::endl; }
|
|||
|
|
logEngineEvent("[Engine] buildLoadNetwork: Loading cached engine: " + engineName);
|
|||
|
|
bool loadOk = loadNetwork(engineName, subVals, divVals, normalize);
|
|||
|
|
if (loadOk) {
|
|||
|
|
return true;
|
|||
|
|
}
|
|||
|
|
// Engine file exists but loadNetwork failed.
|
|||
|
|
// Common causes:
|
|||
|
|
// - createExecutionContext returned null (VRAM exhausted)
|
|||
|
|
// - Incompatible TRT version or corrupt file
|
|||
|
|
// - Partially written by another thread
|
|||
|
|
if (m_skipOnnxRebuild) {
|
|||
|
|
// Elastic growth / non-critical path — don't delete and rebuild.
|
|||
|
|
// Just fail gracefully; the pool continues with existing slots.
|
|||
|
|
size_t freeMem = 0, totalMem = 0;
|
|||
|
|
cudaMemGetInfo(&freeMem, &totalMem);
|
|||
|
|
logEngineEvent("[Engine] buildLoadNetwork: Load failed (skip rebuild, "
|
|||
|
|
+ std::to_string(freeMem >> 20) + " MiB free): " + engineName, true);
|
|||
|
|
return false;
|
|||
|
|
}
|
|||
|
|
// Check if the failure was due to VRAM exhaustion vs. corrupt file.
|
|||
|
|
// If VRAM was the reason, PRESERVE the engine file — it's valid, just
|
|||
|
|
// can't fit right now. Deleting it forces a full ONNX→TRT rebuild
|
|||
|
|
// (minutes) when VRAM becomes available later, instead of a fast load.
|
|||
|
|
//
|
|||
|
|
// Uses the m_lastLoadFailedVRAM flag set by loadNetwork() instead of
|
|||
|
|
// re-querying cudaMemGetInfo. The old approach had a TOCTOU race:
|
|||
|
|
// VRAM could be freed between loadNetwork's check and this re-check,
|
|||
|
|
// causing a valid engine file to be falsely classified as INVALID
|
|||
|
|
// and deleted. Also check current VRAM as a safety net.
|
|||
|
|
{
|
|||
|
|
size_t freeCheck = 0, totalCheck = 0;
|
|||
|
|
cudaMemGetInfo(&freeCheck, &totalCheck);
|
|||
|
|
constexpr size_t kMinFreeBytes = 256ULL * 1024 * 1024;
|
|||
|
|
if (m_lastLoadFailedVRAM || freeCheck < kMinFreeBytes) {
|
|||
|
|
logEngineEvent("[Engine] buildLoadNetwork: Load failed due to LOW VRAM ("
|
|||
|
|
+ std::to_string(freeCheck / (1024 * 1024)) + " MiB free / "
|
|||
|
|
+ std::to_string(totalCheck / (1024 * 1024)) + " MiB total"
|
|||
|
|
+ ", vramFlag=" + std::to_string(m_lastLoadFailedVRAM)
|
|||
|
|
+ "). Preserving engine file (not corrupt): " + engineName, true);
|
|||
|
|
return false; // Don't delete the file, don't try ONNX rebuild
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
// Enough VRAM AND loadNetwork didn't flag VRAM as cause → file is
|
|||
|
|
// likely corrupt/incompatible. Delete and rebuild from ONNX.
|
|||
|
|
logEngineEvent("[Engine] buildLoadNetwork: Cached engine INVALID, deleting and rebuilding: " + engineName, true);
|
|||
|
|
try { std::filesystem::remove(engineName); } catch (...) {}
|
|||
|
|
// Fall through to ONNX build path below
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
{
|
|||
|
|
if (!FileExist(engineName)) {
|
|||
|
|
// Demand-driven growth: if no cached engine exists, bail out rather
|
|||
|
|
// than triggering a full ONNX→TRT build (30-60s, massive VRAM).
|
|||
|
|
if (m_skipOnnxBuild) {
|
|||
|
|
logEngineEvent("[Engine] buildLoadNetwork: Engine file NOT found, skipping ONNX build (demand growth): " + engineName);
|
|||
|
|
return false;
|
|||
|
|
}
|
|||
|
|
logEngineEvent("[Engine] buildLoadNetwork: Engine file NOT found, will build from ONNX: " + engineName);
|
|||
|
|
}
|
|||
|
|
if (!FileExist(onnxModelPath)) {
|
|||
|
|
// ONNX model does not exist, try to find alternative precision engine
|
|||
|
|
logEngineEvent("[Engine] buildLoadNetwork: ONNX model also not found: " + onnxModelPath, true);
|
|||
|
|
std::cout << "Searching for alternative precision engine..." << std::endl;
|
|||
|
|
|
|||
|
|
size_t lastDot = engineName.find_last_of('.');
|
|||
|
|
std::string alternativeEngineName;
|
|||
|
|
ANSCENTER::Precision originalPrecision = m_options.precision;
|
|||
|
|
|
|||
|
|
if (m_options.precision == ANSCENTER::Precision::FP16) {
|
|||
|
|
alternativeEngineName = engineName.substr(0, lastDot + 1) + "fp32";
|
|||
|
|
m_options.precision = ANSCENTER::Precision::FP32;
|
|||
|
|
std::cout << " Looking for FP32 engine: " << alternativeEngineName << std::endl;
|
|||
|
|
}
|
|||
|
|
else {
|
|||
|
|
alternativeEngineName = engineName.substr(0, lastDot + 1) + "fp16";
|
|||
|
|
m_options.precision = ANSCENTER::Precision::FP16;
|
|||
|
|
std::cout << " Looking for FP16 engine: " << alternativeEngineName << std::endl;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
if (FileExist(alternativeEngineName)) {
|
|||
|
|
std::cout << "Found alternative precision engine: " << alternativeEngineName << std::endl;
|
|||
|
|
return loadNetwork(alternativeEngineName, subVals, divVals, normalize);
|
|||
|
|
}
|
|||
|
|
else {
|
|||
|
|
// Restore original precision
|
|||
|
|
m_options.precision = originalPrecision;
|
|||
|
|
std::cout << "Error: Neither ONNX model nor engine files exist for: " << onnxModelPath << std::endl;
|
|||
|
|
std::cout << " Searched for: " << engineName << std::endl;
|
|||
|
|
std::cout << " Searched for: " << alternativeEngineName << std::endl;
|
|||
|
|
return false;
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
else {
|
|||
|
|
// Before building, check if an alternative precision engine already exists
|
|||
|
|
// (e.g., FP16 requested but a FP32 engine was built by a previous fallback)
|
|||
|
|
if (m_options.precision == ANSCENTER::Precision::FP16) {
|
|||
|
|
ANSCENTER::Options fp32Opts = m_options;
|
|||
|
|
fp32Opts.precision = ANSCENTER::Precision::FP32;
|
|||
|
|
std::string fp32EngineName = serializeEngineOptions(fp32Opts, onnxModelPath);
|
|||
|
|
if (FileExist(fp32EngineName)) {
|
|||
|
|
std::cout << "FP16 engine not found, but FP32 engine exists: " << fp32EngineName << std::endl;
|
|||
|
|
std::cout << "Loading existing FP32 engine..." << std::endl;
|
|||
|
|
m_options.precision = ANSCENTER::Precision::FP32;
|
|||
|
|
return loadNetwork(fp32EngineName, subVals, divVals, normalize);
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// ONNX model exists, generate engine
|
|||
|
|
std::cout << "========================================" << std::endl;
|
|||
|
|
std::cout << "Engine not found, generating from ONNX model" << std::endl;
|
|||
|
|
std::cout << "========================================" << std::endl;
|
|||
|
|
std::cout << "ONNX model: " << onnxModelPath << std::endl;
|
|||
|
|
std::cout << "Target engine: " << engineName << std::endl;
|
|||
|
|
|
|||
|
|
if (!FolderExist(engineDir)) {
|
|||
|
|
std::cout << "Creating engine directory: " << engineDir << std::endl;
|
|||
|
|
std::filesystem::create_directories(engineDir);
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
|
|||
|
|
// CRITICAL FIX: Read ONNX to determine if it supports dynamic batch
|
|||
|
|
int32_t onnxBatchSize = -1;
|
|||
|
|
bool hasDynamicSpatialDims_onnx = false;
|
|||
|
|
int onnxFixedH = 0, onnxFixedW = 0; // 0 = dynamic (-1 in ONNX)
|
|||
|
|
std::cout << "\nAnalyzing ONNX model structure..." << std::endl;
|
|||
|
|
|
|||
|
|
auto tempBuilder = std::unique_ptr<nvinfer1::IBuilder>(nvinfer1::createInferBuilder(m_logger));
|
|||
|
|
auto tempNetwork = std::unique_ptr<nvinfer1::INetworkDefinition>(TRT_CREATE_NETWORK(tempBuilder));
|
|||
|
|
auto tempParser = std::unique_ptr<nvonnxparser::IParser>(nvonnxparser::createParser(*tempNetwork, m_logger));
|
|||
|
|
|
|||
|
|
std::ifstream onnxFile(onnxModelPath, std::ios::binary | std::ios::ate);
|
|||
|
|
std::streamsize onnxSize = onnxFile.tellg();
|
|||
|
|
onnxFile.seekg(0, std::ios::beg);
|
|||
|
|
std::vector<char> onnxBuffer(onnxSize);
|
|||
|
|
if (!onnxFile.read(onnxBuffer.data(), onnxSize)) {
|
|||
|
|
std::cout << "Error: Failed to read ONNX file" << std::endl;
|
|||
|
|
return false;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
unsigned long sehPreAnalysis = 0;
|
|||
|
|
bool preParsed = parseOnnxModelSafe(tempParser.get(),
|
|||
|
|
onnxBuffer.data(), onnxBuffer.size(), &sehPreAnalysis);
|
|||
|
|
if (sehPreAnalysis != 0) {
|
|||
|
|
std::cout << "[Engine] WARNING: ONNX pre-analysis parse crashed ("
|
|||
|
|
<< formatCrashCode(sehPreAnalysis)
|
|||
|
|
<< "). Skipping pre-analysis, proceeding with build..." << std::endl;
|
|||
|
|
}
|
|||
|
|
else if (preParsed) {
|
|||
|
|
auto numInputs = tempNetwork->getNbInputs();
|
|||
|
|
std::cout << "ONNX Model Analysis:" << std::endl;
|
|||
|
|
std::cout << " Number of inputs: " << numInputs << std::endl;
|
|||
|
|
|
|||
|
|
for (int32_t i = 0; i < numInputs; ++i) {
|
|||
|
|
auto input = tempNetwork->getInput(i);
|
|||
|
|
auto inputDims = input->getDimensions();
|
|||
|
|
std::cout << " Input " << i << " (" << input->getName() << "): [";
|
|||
|
|
for (int j = 0; j < inputDims.nbDims; ++j) {
|
|||
|
|
if (j > 0) std::cout << ", ";
|
|||
|
|
|
|||
|
|
// FIXED: Properly display dynamic dimensions
|
|||
|
|
if (inputDims.d[j] == -1) {
|
|||
|
|
std::cout << "dynamic";
|
|||
|
|
}
|
|||
|
|
else {
|
|||
|
|
std::cout << inputDims.d[j];
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
std::cout << "]" << std::endl;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Check first input's batch dimension
|
|||
|
|
auto firstInput = tempNetwork->getInput(0);
|
|||
|
|
auto firstInputDims = firstInput->getDimensions();
|
|||
|
|
onnxBatchSize = firstInputDims.d[0];
|
|||
|
|
|
|||
|
|
// Detect dynamic spatial dimensions (for auto-retry mechanism)
|
|||
|
|
if (firstInputDims.nbDims >= 4) {
|
|||
|
|
if (firstInputDims.d[2] == -1 || firstInputDims.d[3] == -1) {
|
|||
|
|
hasDynamicSpatialDims_onnx = true;
|
|||
|
|
}
|
|||
|
|
onnxFixedH = (firstInputDims.d[2] != -1) ? firstInputDims.d[2] : 0;
|
|||
|
|
onnxFixedW = (firstInputDims.d[3] != -1) ? firstInputDims.d[3] : 0;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
std::cout << "\nBatch dimension analysis:" << std::endl;
|
|||
|
|
std::cout << " ONNX model batch dimension: ";
|
|||
|
|
if (onnxBatchSize == -1) {
|
|||
|
|
std::cout << "dynamic (-1)" << std::endl;
|
|||
|
|
}
|
|||
|
|
else {
|
|||
|
|
std::cout << onnxBatchSize << std::endl;
|
|||
|
|
}
|
|||
|
|
std::cout << " Current maxBatchSize setting: " << m_options.maxBatchSize << std::endl;
|
|||
|
|
std::cout << " Current optBatchSize setting: " << m_options.optBatchSize << std::endl;
|
|||
|
|
|
|||
|
|
// FIXED: Correct logic for dynamic vs fixed batch
|
|||
|
|
if (onnxBatchSize == -1) {
|
|||
|
|
// Dynamic batch size model - keep user settings
|
|||
|
|
std::cout << "\n✓ ONNX model supports DYNAMIC batch size" << std::endl;
|
|||
|
|
std::cout << " Engine will support batch sizes 1 to " << m_options.maxBatchSize << std::endl;
|
|||
|
|
std::cout << " Optimal batch size: " << m_options.optBatchSize << std::endl;
|
|||
|
|
std::cout << " Keeping user-defined batch size configuration" << std::endl;
|
|||
|
|
}
|
|||
|
|
else if (onnxBatchSize > 0) {
|
|||
|
|
// Fixed batch size model - must match ONNX
|
|||
|
|
std::cout << "\n⚠ WARNING: ONNX model has FIXED batch size of " << onnxBatchSize << std::endl;
|
|||
|
|
std::cout << " Your model was exported with dynamic=False" << std::endl;
|
|||
|
|
std::cout << " Engine will only support batch size " << onnxBatchSize << std::endl;
|
|||
|
|
std::cout << " To use dynamic batching, re-export ONNX with dynamic=True" << std::endl;
|
|||
|
|
std::cout << "\n Adjusting engine options to match ONNX model..." << std::endl;
|
|||
|
|
|
|||
|
|
m_options.optBatchSize = onnxBatchSize;
|
|||
|
|
m_options.maxBatchSize = onnxBatchSize;
|
|||
|
|
|
|||
|
|
std::cout << " Updated optBatchSize: " << m_options.optBatchSize << std::endl;
|
|||
|
|
std::cout << " Updated maxBatchSize: " << m_options.maxBatchSize << std::endl;
|
|||
|
|
engineName = serializeEngineOptions(m_options, onnxModelPath);
|
|||
|
|
}
|
|||
|
|
else {
|
|||
|
|
// Unexpected value
|
|||
|
|
std::cout << "\n⚠ WARNING: Unexpected batch dimension value: " << onnxBatchSize << std::endl;
|
|||
|
|
std::cout << " This may indicate an issue with the ONNX file" << std::endl;
|
|||
|
|
std::cout << " Proceeding with user-defined settings" << std::endl;
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
else {
|
|||
|
|
std::cout << "Warning: Failed to parse ONNX for pre-analysis. Proceeding with build..." << std::endl;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
std::cout << "\n========================================" << std::endl;
|
|||
|
|
std::cout << "Starting Engine Build Process" << std::endl;
|
|||
|
|
std::cout << "========================================" << std::endl;
|
|||
|
|
std::cout << "This may take 10-20 minutes depending on model complexity..." << std::endl;
|
|||
|
|
std::cout << "Configuration:" << std::endl;
|
|||
|
|
std::cout << " Precision: " << (m_options.precision == ANSCENTER::Precision::FP16 ? "FP16" :
|
|||
|
|
m_options.precision == ANSCENTER::Precision::INT8 ? "INT8" : "FP32") << std::endl;
|
|||
|
|
std::cout << " Optimization Level: 5 (Maximum)" << std::endl;
|
|||
|
|
std::cout << " Batch Size Range: 1 to " << m_options.maxBatchSize << std::endl;
|
|||
|
|
std::cout << "========================================" << std::endl;
|
|||
|
|
|
|||
|
|
// Build with auto-retry for dynamic spatial dimension models.
|
|||
|
|
// buildWithRetry() handles the ONNX pre-analysis internally and
|
|||
|
|
// reduces max spatial dims on OOM, falling back to smaller
|
|||
|
|
// profiles until build succeeds or all candidates are exhausted.
|
|||
|
|
// Fixed-spatial models get a single build() attempt.
|
|||
|
|
bool buildSuccess = buildWithRetry(onnxModelPath, subVals,
|
|||
|
|
divVals, normalize);
|
|||
|
|
|
|||
|
|
// -- FP16 -> FP32 automatic fallback ---------------------------------
|
|||
|
|
// Some GPU architectures fail FP16 builds due to:
|
|||
|
|
// - platformHasFastFp16() returning false (older GPUs)
|
|||
|
|
// - kOBEY_PRECISION_CONSTRAINTS failing for mixed-precision layers
|
|||
|
|
// - Insufficient VRAM for FP16 tactic optimization
|
|||
|
|
// When FP16 build fails, automatically retry with FP32 precision.
|
|||
|
|
if (!buildSuccess && m_options.precision == ANSCENTER::Precision::FP16) {
|
|||
|
|
std::cout << "\n========================================" << std::endl;
|
|||
|
|
std::cout << "FP16 Build Failed - Retrying with FP32" << std::endl;
|
|||
|
|
std::cout << "========================================" << std::endl;
|
|||
|
|
std::cout << "FP16 engine build failed on this GPU." << std::endl;
|
|||
|
|
std::cout << "Automatically falling back to FP32 precision..." << std::endl;
|
|||
|
|
std::cout << "========================================" << std::endl;
|
|||
|
|
|
|||
|
|
m_options.precision = ANSCENTER::Precision::FP32;
|
|||
|
|
|
|||
|
|
// Re-compute engine name for FP32 to avoid caching conflicts
|
|||
|
|
engineName = serializeEngineOptions(m_options, onnxModelPath);
|
|||
|
|
|
|||
|
|
buildSuccess = buildWithRetry(onnxModelPath, subVals,
|
|||
|
|
divVals, normalize);
|
|||
|
|
|
|||
|
|
if (buildSuccess) {
|
|||
|
|
std::cout << "\n========================================" << std::endl;
|
|||
|
|
std::cout << "FP32 Fallback Build Successful!" << std::endl;
|
|||
|
|
std::cout << "========================================" << std::endl;
|
|||
|
|
std::cout << "Note: Engine is running in FP32 mode on this GPU." << std::endl;
|
|||
|
|
std::cout << "Performance may be lower than FP16 but accuracy is preserved." << std::endl;
|
|||
|
|
std::cout << "========================================" << std::endl;
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
if (!buildSuccess) {
|
|||
|
|
std::cout << "\n========================================" << std::endl;
|
|||
|
|
std::cout << "Engine Build Failed!" << std::endl;
|
|||
|
|
std::cout << "========================================" << std::endl;
|
|||
|
|
std::cout << "Error: Failed to build engine from ONNX model" << std::endl;
|
|||
|
|
std::cout << "Possible causes:" << std::endl;
|
|||
|
|
std::cout << " 1. Insufficient GPU memory" << std::endl;
|
|||
|
|
std::cout << " 2. Unsupported ONNX operations" << std::endl;
|
|||
|
|
std::cout << " 3. Invalid batch size configuration" << std::endl;
|
|||
|
|
std::cout << " 4. Corrupted ONNX file" << std::endl;
|
|||
|
|
if (hasDynamicSpatialDims_onnx) {
|
|||
|
|
std::cout << " 5. All spatial dimension fallbacks exhausted" << std::endl;
|
|||
|
|
}
|
|||
|
|
std::cout << " Note: Both FP16 and FP32 builds were attempted." << std::endl;
|
|||
|
|
std::cout << "\nTroubleshooting:" << std::endl;
|
|||
|
|
std::cout << " - Check GPU memory availability" << std::endl;
|
|||
|
|
std::cout << " - Try reducing maxBatchSize" << std::endl;
|
|||
|
|
std::cout << " - Verify ONNX model integrity" << std::endl;
|
|||
|
|
std::cout << " - Check TensorRT logs above for details" << std::endl;
|
|||
|
|
return false;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// build() may have capped maxBatchSize based on GPU VRAM, which
|
|||
|
|
// changes the serialized engine filename (e.g. b32 -> b8). Re-compute
|
|||
|
|
// so we load the file that build() actually saved.
|
|||
|
|
std::string actualEngineName = serializeEngineOptions(m_options, onnxModelPath);
|
|||
|
|
|
|||
|
|
// After building, the engine should be saved, so load it
|
|||
|
|
std::cout << "\n========================================" << std::endl;
|
|||
|
|
std::cout << "Engine Build Complete - Loading Engine" << std::endl;
|
|||
|
|
std::cout << "========================================" << std::endl;
|
|||
|
|
|
|||
|
|
if (FileExist(actualEngineName)) {
|
|||
|
|
std::cout << "Engine file created successfully: " << actualEngineName << std::endl;
|
|||
|
|
std::cout << "Loading engine into memory..." << std::endl;
|
|||
|
|
|
|||
|
|
bool loadSuccess = loadNetwork(actualEngineName, subVals, divVals, normalize);
|
|||
|
|
|
|||
|
|
if (loadSuccess) {
|
|||
|
|
std::cout << "\n========================================" << std::endl;
|
|||
|
|
std::cout << "✓ Engine Ready for Inference!" << std::endl;
|
|||
|
|
std::cout << "========================================" << std::endl;
|
|||
|
|
std::cout << "Configuration Summary:" << std::endl;
|
|||
|
|
std::cout << " Engine File: " << actualEngineName << std::endl;
|
|||
|
|
std::cout << " Batch Size Support: 1 to " << m_options.maxBatchSize << std::endl;
|
|||
|
|
std::cout << " Precision: " << (m_options.precision == ANSCENTER::Precision::FP16 ? "FP16" :
|
|||
|
|
m_options.precision == ANSCENTER::Precision::INT8 ? "INT8" : "FP32") << std::endl;
|
|||
|
|
std::cout << "========================================" << std::endl;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
return loadSuccess;
|
|||
|
|
}
|
|||
|
|
else {
|
|||
|
|
std::cout << "\n========================================" << std::endl;
|
|||
|
|
std::cout << "Engine Build Error!" << std::endl;
|
|||
|
|
std::cout << "========================================" << std::endl;
|
|||
|
|
std::cout << "Error: Engine file not found after build: " << actualEngineName << std::endl;
|
|||
|
|
std::cout << "Expected location: " << std::filesystem::absolute(actualEngineName) << std::endl;
|
|||
|
|
std::cout << "\nPossible causes:" << std::endl;
|
|||
|
|
std::cout << " 1. Build succeeded but save failed (disk full?)" << std::endl;
|
|||
|
|
std::cout << " 2. Incorrect engine directory permissions" << std::endl;
|
|||
|
|
std::cout << " 3. Engine filename mismatch" << std::endl;
|
|||
|
|
std::cout << "\nPlease check:" << std::endl;
|
|||
|
|
std::cout << " - Available disk space in: " << engineDir << std::endl;
|
|||
|
|
std::cout << " - Write permissions for engine directory" << std::endl;
|
|||
|
|
std::cout << " - TensorRT build logs above for warnings" << std::endl;
|
|||
|
|
return false;
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
template <typename T>
|
|||
|
|
bool Engine<T>::loadNetwork(std::string trtModelPath, const std::array<float, 3>& subVals, const std::array<float, 3>& divVals, bool normalize)
|
|||
|
|
{
|
|||
|
|
m_lastLoadFailedVRAM = false; // reset on each load attempt
|
|||
|
|
m_subVals = subVals;
|
|||
|
|
m_divVals = divVals;
|
|||
|
|
m_normalize = normalize;
|
|||
|
|
|
|||
|
|
// ============================================================================
|
|||
|
|
// TRT ENGINE CACHE CHECK — skip file I/O + deserialization if already cached
|
|||
|
|
// (Bypassed when m_skipEngineCache is true, e.g., during model optimization)
|
|||
|
|
// ============================================================================
|
|||
|
|
if (!m_skipEngineCache) {
|
|||
|
|
auto cacheHit = TRTEngineCache::instance().tryGet(trtModelPath, m_options.deviceIndex);
|
|||
|
|
if (cacheHit.engine) {
|
|||
|
|
// Cache hit — reuse shared ICudaEngine (no deserialization, no file I/O)
|
|||
|
|
m_context.reset();
|
|||
|
|
m_engine.reset();
|
|||
|
|
m_runtime.reset();
|
|||
|
|
|
|||
|
|
m_engine = cacheHit.engine;
|
|||
|
|
m_runtime = cacheHit.runtime;
|
|||
|
|
m_usingCachedEngine = true;
|
|||
|
|
m_cachedEnginePath = trtModelPath;
|
|||
|
|
m_cachedGpuIndex = m_options.deviceIndex;
|
|||
|
|
|
|||
|
|
// Still need to set GPU device for context + buffer allocation
|
|||
|
|
cudaSetDevice(m_options.deviceIndex);
|
|||
|
|
|
|||
|
|
// Jump past file read + deserialization to context creation (below)
|
|||
|
|
goto trt_cache_create_context;
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// ============================================================================
|
|||
|
|
// READ ENGINE FILE (cache miss path)
|
|||
|
|
// ============================================================================
|
|||
|
|
|
|||
|
|
if (!Util::doesFileExist(trtModelPath)) {
|
|||
|
|
logEngineEvent("[Engine] loadNetwork FAIL: Engine file not found: " + trtModelPath, true);
|
|||
|
|
return false;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
if (m_verbose) { std::cout << "Loading TensorRT engine file at path: " << trtModelPath << std::endl; }
|
|||
|
|
|
|||
|
|
{
|
|||
|
|
std::ifstream file(trtModelPath, std::ios::binary | std::ios::ate);
|
|||
|
|
if (!file.is_open()) {
|
|||
|
|
logEngineEvent("[Engine] loadNetwork FAIL: Cannot open engine file: " + trtModelPath, true);
|
|||
|
|
return false;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
std::streamsize size = file.tellg();
|
|||
|
|
if (size <= 0) {
|
|||
|
|
logEngineEvent("[Engine] loadNetwork FAIL: Engine file is empty (0 bytes): " + trtModelPath, true);
|
|||
|
|
return false;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
file.seekg(0, std::ios::beg);
|
|||
|
|
|
|||
|
|
std::vector<char> buffer(size);
|
|||
|
|
if (!file.read(buffer.data(), size)) {
|
|||
|
|
logEngineEvent("[Engine] loadNetwork FAIL: Read error on engine file: " + trtModelPath, true);
|
|||
|
|
return false;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
if (m_verbose) { std::cout << "Engine file size: " << size / (1024 * 1024) << " MiB" << std::endl; }
|
|||
|
|
|
|||
|
|
// ============================================================================
|
|||
|
|
// CREATE RUNTIME
|
|||
|
|
// ============================================================================
|
|||
|
|
|
|||
|
|
// TRT requires: destroy context before engine, engine before runtime.
|
|||
|
|
// If loadNetwork() is called more than once on the same instance, the
|
|||
|
|
// previous objects must be torn down in the correct order before we
|
|||
|
|
// create new ones.
|
|||
|
|
m_context.reset();
|
|||
|
|
m_engine.reset();
|
|||
|
|
m_runtime.reset();
|
|||
|
|
|
|||
|
|
m_runtime = std::shared_ptr<nvinfer1::IRuntime>{ nvinfer1::createInferRuntime(m_logger) };
|
|||
|
|
if (!m_runtime) {
|
|||
|
|
logEngineEvent("[Engine] loadNetwork FAIL: createInferRuntime returned null for " + trtModelPath, true);
|
|||
|
|
return false;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// ============================================================================
|
|||
|
|
// GPU SELECTION AND CONFIGURATION
|
|||
|
|
// ============================================================================
|
|||
|
|
|
|||
|
|
int numGPUs = 0;
|
|||
|
|
cudaGetDeviceCount(&numGPUs);
|
|||
|
|
if (m_verbose) std::cout << "Info: Number of GPU devices: " << numGPUs << std::endl;
|
|||
|
|
|
|||
|
|
if (numGPUs == 0) {
|
|||
|
|
std::cout << "Error: No CUDA-capable GPUs detected" << std::endl;
|
|||
|
|
return false;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
if (m_options.deviceIndex < 0 || m_options.deviceIndex >= numGPUs) {
|
|||
|
|
std::cout << "Error: Invalid GPU index " << m_options.deviceIndex
|
|||
|
|
<< ". Available GPUs: " << numGPUs << std::endl;
|
|||
|
|
return false;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
if (m_verbose) std::cout << "Info: Using GPU device index: " << m_options.deviceIndex << std::endl;
|
|||
|
|
|
|||
|
|
// Use yield mode to avoid busy-wait spinning that falsely reports 100% GPU utilization.
|
|||
|
|
// Must be called before cudaSetDevice creates the CUDA context.
|
|||
|
|
cudaSetDeviceFlags(cudaDeviceScheduleYield);
|
|||
|
|
|
|||
|
|
cudaError_t ret = cudaSetDevice(m_options.deviceIndex);
|
|||
|
|
if (ret != cudaSuccess) {
|
|||
|
|
std::cout << "Error: Unable to set GPU device index to " << m_options.deviceIndex << std::endl;
|
|||
|
|
std::cout << "CUDA Error: " << cudaGetErrorString(ret) << std::endl;
|
|||
|
|
return false;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Get GPU properties
|
|||
|
|
cudaDeviceProp prop;
|
|||
|
|
cudaGetDeviceProperties(&prop, m_options.deviceIndex);
|
|||
|
|
|
|||
|
|
// Set GPU device limits.
|
|||
|
|
// Blackwell GPUs (GB200/B200 = SM 10.x, RTX 5090/5080 = SM 12.x) have
|
|||
|
|
// deeper kernel-launch pipelines and benefit from a larger pending-launch
|
|||
|
|
// queue. Using 8192 on Blackwell avoids throttling with heavily pipelined
|
|||
|
|
// workloads; 2048 is sufficient for all earlier architectures.
|
|||
|
|
{
|
|||
|
|
const size_t pendingLaunches = (prop.major >= 10) ? 8192 : 2048;
|
|||
|
|
cudaDeviceSetLimit(cudaLimitDevRuntimePendingLaunchCount, pendingLaunches);
|
|||
|
|
if (m_verbose) std::cout << "Info: cudaLimitDevRuntimePendingLaunchCount = " << pendingLaunches
|
|||
|
|
<< " (SM " << prop.major << "." << prop.minor << ")" << std::endl;
|
|||
|
|
}
|
|||
|
|
cudaDeviceSetLimit(cudaLimitStackSize, 8192);
|
|||
|
|
cudaDeviceSetLimit(cudaLimitDevRuntimeSyncDepth, 2);
|
|||
|
|
|
|||
|
|
// Lock GPU clocks if requested (prevents power throttling on laptop GPUs)
|
|||
|
|
if (m_options.gpuClockLockMHz != 0 && !m_clocksLocked) {
|
|||
|
|
lockGpuClocks(m_options.deviceIndex, m_options.gpuClockLockMHz);
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// -- VRAM safety check before engine deserialization -----------------------
|
|||
|
|
// Reject early if the GPU doesn't have enough free VRAM to load the engine.
|
|||
|
|
// This prevents slow degradation (unified memory fallback) or crashes
|
|||
|
|
// (cudaMalloc failure during inference) when too many tasks are loaded.
|
|||
|
|
{
|
|||
|
|
size_t freeVRAM = 0, totalVRAM = 0;
|
|||
|
|
cudaError_t memErr = cudaMemGetInfo(&freeVRAM, &totalVRAM);
|
|||
|
|
constexpr size_t kMinFreeBytes = 256ULL * 1024 * 1024; // 256 MiB minimum
|
|||
|
|
if (memErr != cudaSuccess) {
|
|||
|
|
// cudaMemGetInfo failed — CUDA context may not be initialized on this thread.
|
|||
|
|
// Log but don't reject: let TRT try to deserialize (it may succeed).
|
|||
|
|
logEngineEvent("[Engine] loadNetwork WARNING: cudaMemGetInfo failed ("
|
|||
|
|
+ std::string(cudaGetErrorString(memErr)) + ") on GPU["
|
|||
|
|
+ std::to_string(m_options.deviceIndex) + "] — skipping VRAM check for "
|
|||
|
|
+ trtModelPath, true);
|
|||
|
|
} else if (freeVRAM < kMinFreeBytes) {
|
|||
|
|
logEngineEvent("[Engine] loadNetwork FAIL: GPU[" + std::to_string(m_options.deviceIndex)
|
|||
|
|
+ "] only " + std::to_string(freeVRAM / (1024 * 1024))
|
|||
|
|
+ " MiB free / " + std::to_string(totalVRAM / (1024 * 1024))
|
|||
|
|
+ " MiB total (need " + std::to_string(kMinFreeBytes / (1024 * 1024))
|
|||
|
|
+ " MiB) for " + trtModelPath, true);
|
|||
|
|
m_lastLoadFailedVRAM = true; // signal to buildLoadNetwork: engine file is NOT corrupt
|
|||
|
|
return false;
|
|||
|
|
}
|
|||
|
|
if (m_verbose) {
|
|||
|
|
std::cout << "Info: GPU " << m_options.deviceIndex << " VRAM: "
|
|||
|
|
<< (freeVRAM / (1024 * 1024)) << " MiB free / "
|
|||
|
|
<< (totalVRAM / (1024 * 1024)) << " MiB total" << std::endl;
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// ============================================================================
|
|||
|
|
// DESERIALIZE ENGINE
|
|||
|
|
// ============================================================================
|
|||
|
|
|
|||
|
|
if (m_verbose) std::cout << "Info: Deserializing TensorRT engine..." << std::endl;
|
|||
|
|
unsigned long sehCodeDeserialize = 0;
|
|||
|
|
m_engine = std::shared_ptr<nvinfer1::ICudaEngine>(
|
|||
|
|
deserializeCudaEngineSafe(m_runtime.get(), buffer.data(),
|
|||
|
|
buffer.size(), &sehCodeDeserialize));
|
|||
|
|
if (sehCodeDeserialize != 0) {
|
|||
|
|
logEngineEvent("[Engine] loadNetwork FAIL: deserializeCudaEngine CRASHED (SEH "
|
|||
|
|
+ formatCrashCode(sehCodeDeserialize) + ") for " + trtModelPath, true);
|
|||
|
|
return false;
|
|||
|
|
}
|
|||
|
|
if (!m_engine) {
|
|||
|
|
logEngineEvent("[Engine] loadNetwork FAIL: deserializeCudaEngine returned null for "
|
|||
|
|
+ trtModelPath + " (file size=" + std::to_string(size / (1024*1024)) + " MiB)", true);
|
|||
|
|
return false;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
if (m_verbose) std::cout << "Info: Engine deserialized successfully" << std::endl;
|
|||
|
|
|
|||
|
|
// ============================================================================
|
|||
|
|
// CRITICAL: VERIFY ENGINE BATCH CAPABILITIES IMMEDIATELY
|
|||
|
|
// ============================================================================
|
|||
|
|
|
|||
|
|
int numOptProfiles = m_engine->getNbOptimizationProfiles();
|
|||
|
|
if (m_verbose) {
|
|||
|
|
std::cout << "\n========================================" << std::endl;
|
|||
|
|
std::cout << "ENGINE BATCH CAPABILITY VERIFICATION" << std::endl;
|
|||
|
|
std::cout << "========================================" << std::endl;
|
|||
|
|
std::cout << "Number of optimization profiles: " << numOptProfiles << std::endl;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
bool engineSupportsDynamicBatch = false;
|
|||
|
|
int actualMinBatch = 1;
|
|||
|
|
int actualMaxBatch = 1;
|
|||
|
|
|
|||
|
|
if (numOptProfiles > 0) {
|
|||
|
|
// Find the first input tensor to check batch support
|
|||
|
|
for (int i = 0; i < m_engine->getNbIOTensors(); ++i) {
|
|||
|
|
const char* tensorName = m_engine->getIOTensorName(i);
|
|||
|
|
if (m_engine->getTensorIOMode(tensorName) == nvinfer1::TensorIOMode::kINPUT) {
|
|||
|
|
|
|||
|
|
auto minDims = m_engine->getProfileShape(tensorName, 0,
|
|||
|
|
nvinfer1::OptProfileSelector::kMIN);
|
|||
|
|
auto optDims = m_engine->getProfileShape(tensorName, 0,
|
|||
|
|
nvinfer1::OptProfileSelector::kOPT);
|
|||
|
|
auto maxDims = m_engine->getProfileShape(tensorName, 0,
|
|||
|
|
nvinfer1::OptProfileSelector::kMAX);
|
|||
|
|
|
|||
|
|
actualMinBatch = minDims.d[0];
|
|||
|
|
actualMaxBatch = maxDims.d[0];
|
|||
|
|
|
|||
|
|
// Store actual profile max spatial dims for runtime queries
|
|||
|
|
if (maxDims.nbDims >= 4) {
|
|||
|
|
m_profileMaxHeight = maxDims.d[2];
|
|||
|
|
m_profileMaxWidth = maxDims.d[3];
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
if (actualMinBatch != actualMaxBatch) {
|
|||
|
|
engineSupportsDynamicBatch = true;
|
|||
|
|
}
|
|||
|
|
if (m_verbose) {
|
|||
|
|
std::cout << "\nInput tensor '" << tensorName << "' profile 0:" << std::endl;
|
|||
|
|
std::cout << " Min: [" << minDims.d[0];
|
|||
|
|
for (int d = 1; d < minDims.nbDims; ++d) std::cout << "," << minDims.d[d];
|
|||
|
|
std::cout << "]" << std::endl;
|
|||
|
|
std::cout << " Opt: [" << optDims.d[0];
|
|||
|
|
for (int d = 1; d < optDims.nbDims; ++d) std::cout << "," << optDims.d[d];
|
|||
|
|
std::cout << "]" << std::endl;
|
|||
|
|
std::cout << " Max: [" << maxDims.d[0];
|
|||
|
|
for (int d = 1; d < maxDims.nbDims; ++d) std::cout << "," << maxDims.d[d];
|
|||
|
|
std::cout << "]" << std::endl;
|
|||
|
|
if (actualMinBatch != actualMaxBatch)
|
|||
|
|
std::cout << "\n✓ Engine supports DYNAMIC batching: "
|
|||
|
|
<< actualMinBatch << " to " << actualMaxBatch << std::endl;
|
|||
|
|
else
|
|||
|
|
std::cout << "\n⚠️ Engine has FIXED batch size: " << actualMinBatch << std::endl;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
break; // Only need to check first input
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
else {
|
|||
|
|
if (m_verbose) std::cout << "⚠️ No optimization profiles found" << std::endl;
|
|||
|
|
// Check if batch dimension is dynamic via -1
|
|||
|
|
auto firstTensorName = m_engine->getIOTensorName(0);
|
|||
|
|
auto shape = m_engine->getTensorShape(firstTensorName);
|
|||
|
|
if (shape.d[0] == -1) {
|
|||
|
|
engineSupportsDynamicBatch = true;
|
|||
|
|
actualMaxBatch = m_options.maxBatchSize;
|
|||
|
|
if (m_verbose) std::cout << "Engine uses implicit dynamic batch (batch dim = -1)" << std::endl;
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// CRITICAL CHECK: Verify engine can support requested batch sizes
|
|||
|
|
if (!engineSupportsDynamicBatch && m_options.maxBatchSize > actualMaxBatch) {
|
|||
|
|
std::cout << "\n🚨🚨🚨 CRITICAL ERROR 🚨🚨🚨" << std::endl;
|
|||
|
|
std::cout << "Requested max batch size: " << m_options.maxBatchSize << std::endl;
|
|||
|
|
std::cout << "Engine max batch size: " << actualMaxBatch << std::endl;
|
|||
|
|
std::cout << "\nThis engine CANNOT support batch sizes larger than "
|
|||
|
|
<< actualMaxBatch << "!" << std::endl;
|
|||
|
|
std::cout << "\nYou have two options:" << std::endl;
|
|||
|
|
std::cout << "1. Rebuild the engine with dynamic batch support:" << std::endl;
|
|||
|
|
std::cout << " trtexec --onnx=model.onnx \\" << std::endl;
|
|||
|
|
std::cout << " --minShapes=images:1x3x640x640 \\" << std::endl;
|
|||
|
|
std::cout << " --optShapes=images:4x3x640x640 \\" << std::endl;
|
|||
|
|
std::cout << " --maxShapes=images:32x3x640x640 \\" << std::endl;
|
|||
|
|
std::cout << " --saveEngine=model_dynamic.engine --fp16" << std::endl;
|
|||
|
|
std::cout << "\n2. Reduce maxBatchSize in your config to " << actualMaxBatch << std::endl;
|
|||
|
|
std::cout << "========================================\n" << std::endl;
|
|||
|
|
|
|||
|
|
// Optionally fail here:
|
|||
|
|
// return false;
|
|||
|
|
|
|||
|
|
// Or adjust maxBatchSize to match engine capability
|
|||
|
|
if (m_verbose) std::cout << "⚠️ Auto-adjusting maxBatchSize from " << m_options.maxBatchSize
|
|||
|
|
<< " to " << actualMaxBatch << std::endl;
|
|||
|
|
m_options.maxBatchSize = actualMaxBatch;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
if (m_verbose) std::cout << "========================================\n" << std::endl;
|
|||
|
|
|
|||
|
|
// Store in cache for future tasks loading the same model
|
|||
|
|
if (!m_skipEngineCache) {
|
|||
|
|
m_engine = TRTEngineCache::instance().putIfAbsent(
|
|||
|
|
trtModelPath, m_options.deviceIndex, m_runtime, m_engine);
|
|||
|
|
m_usingCachedEngine = true;
|
|||
|
|
m_cachedEnginePath = trtModelPath;
|
|||
|
|
m_cachedGpuIndex = m_options.deviceIndex;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
} // end of cache-miss scope (closes the brace opened after cache check)
|
|||
|
|
|
|||
|
|
// ============================================================================
|
|||
|
|
// CREATE EXECUTION CONTEXT (both cache-hit and cache-miss paths converge here)
|
|||
|
|
// ============================================================================
|
|||
|
|
trt_cache_create_context:
|
|||
|
|
// These variables may not exist if we came from cache-hit path (goto skipped them).
|
|||
|
|
// Re-derive from the (now valid) m_engine so both paths work.
|
|||
|
|
{
|
|||
|
|
int numOptProfiles = m_engine->getNbOptimizationProfiles();
|
|||
|
|
bool engineSupportsDynamicBatch = false;
|
|||
|
|
int actualMinBatch = 1;
|
|||
|
|
int actualMaxBatch = 1;
|
|||
|
|
if (numOptProfiles > 0) {
|
|||
|
|
for (int i = 0; i < m_engine->getNbIOTensors(); ++i) {
|
|||
|
|
const char* tn = m_engine->getIOTensorName(i);
|
|||
|
|
if (m_engine->getTensorIOMode(tn) == nvinfer1::TensorIOMode::kINPUT) {
|
|||
|
|
auto minDims = m_engine->getProfileShape(tn, 0, nvinfer1::OptProfileSelector::kMIN);
|
|||
|
|
auto maxDims = m_engine->getProfileShape(tn, 0, nvinfer1::OptProfileSelector::kMAX);
|
|||
|
|
actualMinBatch = minDims.d[0];
|
|||
|
|
actualMaxBatch = maxDims.d[0];
|
|||
|
|
engineSupportsDynamicBatch = (actualMinBatch != actualMaxBatch);
|
|||
|
|
break;
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
if (actualMaxBatch > 0 && m_options.maxBatchSize > actualMaxBatch) {
|
|||
|
|
m_options.maxBatchSize = actualMaxBatch;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
m_context = std::unique_ptr<nvinfer1::IExecutionContext>(m_engine->createExecutionContext());
|
|||
|
|
if (!m_context) {
|
|||
|
|
logEngineEvent("[Engine] loadNetwork FAIL: createExecutionContext returned null for "
|
|||
|
|
+ trtModelPath, true);
|
|||
|
|
return false;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
if (m_verbose) std::cout << "Info: Execution context created successfully" << std::endl;
|
|||
|
|
|
|||
|
|
// ============================================================================
|
|||
|
|
// BUFFER ALLOCATION
|
|||
|
|
// ============================================================================
|
|||
|
|
|
|||
|
|
if (m_verbose) {
|
|||
|
|
std::cout << "========================================" << std::endl;
|
|||
|
|
std::cout << "Initializing Buffers" << std::endl;
|
|||
|
|
std::cout << "========================================" << std::endl;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
clearGpuBuffers();
|
|||
|
|
m_buffers.resize(m_engine->getNbIOTensors());
|
|||
|
|
m_outputLengths.clear();
|
|||
|
|
m_inputDims.clear();
|
|||
|
|
m_outputDims.clear();
|
|||
|
|
m_IOTensorNames.clear();
|
|||
|
|
m_hasDynamicSpatialDims = false;
|
|||
|
|
|
|||
|
|
// Check available GPU memory
|
|||
|
|
size_t free_mem_initial, total_mem;
|
|||
|
|
cudaMemGetInfo(&free_mem_initial, &total_mem);
|
|||
|
|
if (m_verbose) {
|
|||
|
|
std::cout << "GPU Memory before allocation: Free " << free_mem_initial / (1024 * 1024)
|
|||
|
|
<< " MiB / Total " << total_mem / (1024 * 1024) << " MiB" << std::endl;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
size_t totalAllocated = 0;
|
|||
|
|
|
|||
|
|
if (m_verbose) {
|
|||
|
|
std::cout << "Engine batch configuration:" << std::endl;
|
|||
|
|
std::cout << " Dynamic batch: " << (engineSupportsDynamicBatch ? "YES" : "NO") << std::endl;
|
|||
|
|
std::cout << " Actual batch range: " << actualMinBatch << " to " << actualMaxBatch << std::endl;
|
|||
|
|
std::cout << " Configured max batch size: " << m_options.maxBatchSize << std::endl;
|
|||
|
|
std::cout << " Optimization profiles: " << numOptProfiles << std::endl;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Allocate buffers for all I/O tensors
|
|||
|
|
for (int i = 0; i < m_engine->getNbIOTensors(); ++i) {
|
|||
|
|
const auto tensorName = m_engine->getIOTensorName(i);
|
|||
|
|
m_IOTensorNames.emplace_back(tensorName);
|
|||
|
|
const auto tensorType = m_engine->getTensorIOMode(tensorName);
|
|||
|
|
const auto tensorShape = m_engine->getTensorShape(tensorName);
|
|||
|
|
const auto tensorDataType = m_engine->getTensorDataType(tensorName);
|
|||
|
|
|
|||
|
|
if (tensorType == nvinfer1::TensorIOMode::kINPUT) {
|
|||
|
|
if (m_verbose) std::cout << "\nInfo: Processing input tensor: " << tensorName << std::endl;
|
|||
|
|
|
|||
|
|
// Validate input type
|
|||
|
|
if (tensorDataType != nvinfer1::DataType::kFLOAT) {
|
|||
|
|
std::cout << "Error: Only float inputs are supported" << std::endl;
|
|||
|
|
return false;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Store input dimensions correctly (C, H, W - excluding batch)
|
|||
|
|
m_inputDims.emplace_back(tensorShape.d[1], tensorShape.d[2], tensorShape.d[3]);
|
|||
|
|
|
|||
|
|
// Detect dynamic spatial dimensions (e.g., detection models with variable H/W)
|
|||
|
|
if (tensorShape.d[2] == -1 || tensorShape.d[3] == -1) {
|
|||
|
|
m_hasDynamicSpatialDims = true;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
if (m_verbose) std::cout << " Input shape from engine: [" << tensorShape.d[0] << ", " << tensorShape.d[1]
|
|||
|
|
<< ", " << tensorShape.d[2] << ", " << tensorShape.d[3] << "]" << std::endl;
|
|||
|
|
|
|||
|
|
// Calculate buffer size using actual max batch size from engine
|
|||
|
|
// Dynamic dimensions (-1) are substituted with the configured max values
|
|||
|
|
int32_t batchSize = (tensorShape.d[0] == -1) ? actualMaxBatch : tensorShape.d[0];
|
|||
|
|
int32_t channels = tensorShape.d[1];
|
|||
|
|
int32_t height = (tensorShape.d[2] == -1) ? m_options.maxInputHeight : tensorShape.d[2];
|
|||
|
|
int32_t width = (tensorShape.d[3] == -1) ? m_options.maxInputWidth : tensorShape.d[3];
|
|||
|
|
|
|||
|
|
int64_t inputLength = static_cast<int64_t>(batchSize) * channels * height * width;
|
|||
|
|
|
|||
|
|
size_t requestedMemory = inputLength * sizeof(float);
|
|||
|
|
if (m_verbose) std::cout << " Allocating for max batch size " << batchSize << ": "
|
|||
|
|
<< requestedMemory / (1024 * 1024) << " MiB" << std::endl;
|
|||
|
|
|
|||
|
|
// Allocate GPU memory
|
|||
|
|
cudaError_t err = cudaMalloc(&m_buffers[i], requestedMemory);
|
|||
|
|
if (err != cudaSuccess) {
|
|||
|
|
logEngineEvent("[Engine] loadNetwork FAIL: cudaMalloc input buffer ("
|
|||
|
|
+ std::to_string(requestedMemory / (1024*1024)) + " MiB): "
|
|||
|
|
+ cudaGetErrorString(err) + " for " + trtModelPath, true);
|
|||
|
|
return false;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Initialize to zero
|
|||
|
|
cudaMemset(m_buffers[i], 0, requestedMemory);
|
|||
|
|
|
|||
|
|
totalAllocated += requestedMemory;
|
|||
|
|
|
|||
|
|
}
|
|||
|
|
else if (tensorType == nvinfer1::TensorIOMode::kOUTPUT) {
|
|||
|
|
if (m_verbose) std::cout << "\nInfo: Processing output tensor: " << tensorName << std::endl;
|
|||
|
|
|
|||
|
|
// Validate output type matches template parameter
|
|||
|
|
if (tensorDataType == nvinfer1::DataType::kFLOAT && !std::is_same<float, T>::value) {
|
|||
|
|
std::cout << "Error: Model output type is float, but template parameter is not float" << std::endl;
|
|||
|
|
return false;
|
|||
|
|
}
|
|||
|
|
else if (tensorDataType == nvinfer1::DataType::kHALF && !std::is_same<__half, T>::value) {
|
|||
|
|
std::cout << "Error: Model output type is half, but template parameter is not __half" << std::endl;
|
|||
|
|
return false;
|
|||
|
|
}
|
|||
|
|
else if (tensorDataType == nvinfer1::DataType::kINT32 && !std::is_same<int32_t, T>::value) {
|
|||
|
|
std::cout << "Error: Model output type is int32, but template parameter is not int32_t" << std::endl;
|
|||
|
|
return false;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Calculate output buffer size per batch element
|
|||
|
|
int64_t outputLengthPerBatch = 1;
|
|||
|
|
m_outputDims.push_back(tensorShape);
|
|||
|
|
|
|||
|
|
if (m_verbose) std::cout << " Output shape from engine: [" << tensorShape.d[0];
|
|||
|
|
for (int j = 1; j < tensorShape.nbDims; ++j) {
|
|||
|
|
if (m_verbose) std::cout << ", " << tensorShape.d[j];
|
|||
|
|
int64_t dimSize = tensorShape.d[j];
|
|||
|
|
if (dimSize <= 0) {
|
|||
|
|
// Dynamic output dimension: use max input dims as upper bound
|
|||
|
|
if (tensorShape.nbDims == 4) {
|
|||
|
|
// NCHW: d[2]=H, d[3]=W
|
|||
|
|
dimSize = (j == 2) ? m_options.maxInputHeight : m_options.maxInputWidth;
|
|||
|
|
} else {
|
|||
|
|
// Generic: use max input width as fallback for dynamic dims
|
|||
|
|
dimSize = m_options.maxInputWidth;
|
|||
|
|
}
|
|||
|
|
if (dimSize <= 0) dimSize = 1; // Safety: avoid zero/negative
|
|||
|
|
}
|
|||
|
|
outputLengthPerBatch *= dimSize;
|
|||
|
|
}
|
|||
|
|
if (m_verbose) std::cout << "]" << std::endl;
|
|||
|
|
|
|||
|
|
// Store output length per batch element (excluding batch dimension)
|
|||
|
|
m_outputLengths.push_back(outputLengthPerBatch);
|
|||
|
|
|
|||
|
|
// Allocate for actual max batch size from engine
|
|||
|
|
size_t requestedMemory = outputLengthPerBatch * actualMaxBatch * sizeof(T);
|
|||
|
|
if (m_verbose) std::cout << " Allocating for max batch size " << actualMaxBatch << ": "
|
|||
|
|
<< requestedMemory / (1024 * 1024) << " MiB" << std::endl;
|
|||
|
|
|
|||
|
|
// Check if enough memory available
|
|||
|
|
size_t free_mem, total_mem_check;
|
|||
|
|
cudaMemGetInfo(&free_mem, &total_mem_check);
|
|||
|
|
if (requestedMemory > free_mem) {
|
|||
|
|
std::cout << "Error: Not enough GPU memory" << std::endl;
|
|||
|
|
std::cout << " Requested: " << requestedMemory / (1024 * 1024) << " MiB" << std::endl;
|
|||
|
|
std::cout << " Available: " << free_mem / (1024 * 1024) << " MiB" << std::endl;
|
|||
|
|
return false;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Allocate GPU memory
|
|||
|
|
cudaError_t err = cudaMalloc(&m_buffers[i], requestedMemory);
|
|||
|
|
if (err != cudaSuccess) {
|
|||
|
|
logEngineEvent("[Engine] loadNetwork FAIL: cudaMalloc output buffer ("
|
|||
|
|
+ std::to_string(requestedMemory / (1024*1024)) + " MiB): "
|
|||
|
|
+ cudaGetErrorString(err) + " for " + trtModelPath, true);
|
|||
|
|
return false;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Initialize to zero
|
|||
|
|
cudaMemset(m_buffers[i], 0, requestedMemory);
|
|||
|
|
|
|||
|
|
totalAllocated += requestedMemory;
|
|||
|
|
}
|
|||
|
|
else {
|
|||
|
|
std::cout << "Error: Tensor is neither input nor output!" << std::endl;
|
|||
|
|
return false;
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
if (m_verbose) std::cout << "\nInfo: Total GPU memory allocated: " << totalAllocated / (1024 * 1024) << " MiB" << std::endl;
|
|||
|
|
|
|||
|
|
// -- Pinned output buffers (CUDA graph prerequisite) -----------------------
|
|||
|
|
// Invalidate any graphs captured by a previous loadNetwork() call on this instance.
|
|||
|
|
for (auto& [bs, ge] : m_graphExecs) { if (ge) cudaGraphExecDestroy(ge); }
|
|||
|
|
m_graphExecs.clear();
|
|||
|
|
// Free any previously allocated pinned buffers.
|
|||
|
|
for (T* p : m_pinnedOutputBuffers) { if (p) cudaFreeHost(p); }
|
|||
|
|
m_pinnedOutputBuffers.clear();
|
|||
|
|
m_pinnedOutputBufElems.clear();
|
|||
|
|
|
|||
|
|
// Allocate one flat pinned buffer per output tensor, sized for
|
|||
|
|
// actualMaxBatch x outputLength elements. Stable host addresses enable
|
|||
|
|
// CUDA graph capture of D2H copies. If any allocation fails, disable
|
|||
|
|
// graph acceleration gracefully and fall back to the original code path.
|
|||
|
|
//
|
|||
|
|
// Previously disabled for OpenCV 4.13+ because cv::cuda::split on the null
|
|||
|
|
// stream threw cudaErrorStreamCaptureUnsupported (-217). Now safe because
|
|||
|
|
// blobFromGpuMats runs on m_inferenceStream and finishes BEFORE graph capture.
|
|||
|
|
m_pinnedOutputBuffers.resize(m_outputLengths.size(), nullptr);
|
|||
|
|
m_pinnedOutputBufElems.resize(m_outputLengths.size(), 0);
|
|||
|
|
bool pinnedOk = true;
|
|||
|
|
for (size_t i = 0; i < m_outputLengths.size(); ++i) {
|
|||
|
|
const size_t nElems = static_cast<size_t>(m_outputLengths[i])
|
|||
|
|
* static_cast<size_t>(actualMaxBatch);
|
|||
|
|
if (cudaMallocHost(reinterpret_cast<void**>(&m_pinnedOutputBuffers[i]),
|
|||
|
|
nElems * sizeof(T)) != cudaSuccess) {
|
|||
|
|
pinnedOk = false;
|
|||
|
|
break;
|
|||
|
|
}
|
|||
|
|
m_pinnedOutputBufElems[i] = nElems;
|
|||
|
|
}
|
|||
|
|
if (!pinnedOk) {
|
|||
|
|
std::cout << "Warning: cudaMallocHost failed -- CUDA graph acceleration disabled."
|
|||
|
|
<< std::endl;
|
|||
|
|
for (T* p : m_pinnedOutputBuffers) { if (p) cudaFreeHost(p); }
|
|||
|
|
m_pinnedOutputBuffers.clear();
|
|||
|
|
m_pinnedOutputBufElems.clear();
|
|||
|
|
} else {
|
|||
|
|
if (m_verbose) std::cout << "Info: Pinned output buffers allocated -- CUDA graph acceleration enabled."
|
|||
|
|
<< std::endl;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Check final memory state
|
|||
|
|
size_t free_mem_final, total_mem_final;
|
|||
|
|
cudaMemGetInfo(&free_mem_final, &total_mem_final);
|
|||
|
|
if (m_verbose) std::cout << "GPU Memory after allocation: Free " << free_mem_final / (1024 * 1024)
|
|||
|
|
<< " MiB / Total " << total_mem_final / (1024 * 1024) << " MiB" << std::endl;
|
|||
|
|
|
|||
|
|
// Ensure all pending GPU operations (cudaMalloc, memcpy, etc.) complete
|
|||
|
|
// before we begin inference on this engine.
|
|||
|
|
cudaDeviceSynchronize();
|
|||
|
|
|
|||
|
|
// ============================================================================
|
|||
|
|
// CONTEXT OPTIMIZATION
|
|||
|
|
// ============================================================================
|
|||
|
|
|
|||
|
|
if (m_verbose) {
|
|||
|
|
std::cout << "========================================" << std::endl;
|
|||
|
|
std::cout << "Context Optimization" << std::endl;
|
|||
|
|
std::cout << "========================================" << std::endl;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Create temporary stream for context setup
|
|||
|
|
cudaStream_t setupStream;
|
|||
|
|
cudaStreamCreate(&setupStream);
|
|||
|
|
|
|||
|
|
// Check and set optimization profile
|
|||
|
|
if (m_verbose) std::cout << "Info: Engine has " << numOptProfiles << " optimization profile(s)" << std::endl;
|
|||
|
|
|
|||
|
|
if (numOptProfiles > 0) {
|
|||
|
|
int selectedProfile = 0;
|
|||
|
|
if (m_verbose) std::cout << "Info: Using optimization profile " << selectedProfile
|
|||
|
|
<< " (actual range: batch " << actualMinBatch << " to " << actualMaxBatch << ")" << std::endl;
|
|||
|
|
|
|||
|
|
// Set optimization profile FIRST
|
|||
|
|
bool profileSet = m_context->setOptimizationProfileAsync(selectedProfile, setupStream);
|
|||
|
|
if (!profileSet) {
|
|||
|
|
std::cout << "Error: Failed to set optimization profile" << std::endl;
|
|||
|
|
cudaStreamDestroy(setupStream);
|
|||
|
|
return false;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Wait for profile to be set
|
|||
|
|
cudaStreamSynchronize(setupStream);
|
|||
|
|
if (m_verbose) std::cout << "Info: Optimization profile " << selectedProfile << " set successfully" << std::endl;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Set input shapes and bind buffers
|
|||
|
|
for (int i = 0; i < m_engine->getNbIOTensors(); ++i) {
|
|||
|
|
const auto tensorName = m_engine->getIOTensorName(i);
|
|||
|
|
const auto tensorMode = m_engine->getTensorIOMode(tensorName);
|
|||
|
|
|
|||
|
|
// Set tensor address for both input and output
|
|||
|
|
if (!m_context->setTensorAddress(tensorName, m_buffers[i])) {
|
|||
|
|
std::cout << "Error: Failed to set tensor address for " << tensorName << std::endl;
|
|||
|
|
cudaStreamDestroy(setupStream);
|
|||
|
|
return false;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
if (tensorMode == nvinfer1::TensorIOMode::kINPUT) {
|
|||
|
|
auto dims = m_engine->getTensorShape(tensorName);
|
|||
|
|
|
|||
|
|
if (m_verbose) {
|
|||
|
|
std::cout << "Info: Input tensor '" << tensorName << "' engine shape: [";
|
|||
|
|
for (int j = 0; j < dims.nbDims; ++j) {
|
|||
|
|
if (j > 0) std::cout << ", ";
|
|||
|
|
std::cout << dims.d[j];
|
|||
|
|
}
|
|||
|
|
std::cout << "]" << std::endl;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// For dynamic batch engines, set shape to minimum for initialization
|
|||
|
|
if (dims.d[0] == -1 || numOptProfiles > 0) {
|
|||
|
|
nvinfer1::Dims inputDims = dims;
|
|||
|
|
inputDims.d[0] = actualMinBatch; // Use actual min from engine
|
|||
|
|
|
|||
|
|
// Set height if dynamic
|
|||
|
|
if (inputDims.d[2] == -1) {
|
|||
|
|
inputDims.d[2] = m_options.optInputHeight;
|
|||
|
|
}
|
|||
|
|
// Set width if dynamic
|
|||
|
|
if (inputDims.d[3] == -1) {
|
|||
|
|
inputDims.d[3] = m_options.optInputWidth;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
if (!m_context->setInputShape(tensorName, inputDims)) {
|
|||
|
|
std::cout << "Error: Failed to set input shape for " << tensorName << std::endl;
|
|||
|
|
cudaStreamDestroy(setupStream);
|
|||
|
|
return false;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
if (m_verbose) {
|
|||
|
|
std::cout << "Info: Set initial input shape to [" << inputDims.d[0] << ", "
|
|||
|
|
<< inputDims.d[1] << ", " << inputDims.d[2] << ", "
|
|||
|
|
<< inputDims.d[3] << "] (for warmup)" << std::endl;
|
|||
|
|
std::cout << " Actual batch size will be set at inference time" << std::endl;
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Verify all dimensions are specified
|
|||
|
|
if (!m_context->allInputDimensionsSpecified()) {
|
|||
|
|
std::cout << "Error: Not all input dimensions specified after setup" << std::endl;
|
|||
|
|
|
|||
|
|
// Debug: Show which dimensions are missing
|
|||
|
|
for (int i = 0; i < m_engine->getNbIOTensors(); ++i) {
|
|||
|
|
const auto tensorName = m_engine->getIOTensorName(i);
|
|||
|
|
if (m_engine->getTensorIOMode(tensorName) == nvinfer1::TensorIOMode::kINPUT) {
|
|||
|
|
auto dims = m_context->getTensorShape(tensorName);
|
|||
|
|
std::cout << " " << tensorName << " shape: [";
|
|||
|
|
for (int j = 0; j < dims.nbDims; ++j) {
|
|||
|
|
if (j > 0) std::cout << ", ";
|
|||
|
|
std::cout << dims.d[j];
|
|||
|
|
}
|
|||
|
|
std::cout << "]" << std::endl;
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
cudaStreamDestroy(setupStream);
|
|||
|
|
return false;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
if (m_verbose) {
|
|||
|
|
std::cout << "Info: All input dimensions specified correctly" << std::endl;
|
|||
|
|
std::cout << "Info: All tensor addresses bound successfully" << std::endl;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Disable profiling for production
|
|||
|
|
m_context->setEnqueueEmitsProfile(false);
|
|||
|
|
if (m_verbose) std::cout << "Info: Enqueue profile emissions disabled (production mode)" << std::endl;
|
|||
|
|
|
|||
|
|
// Clean up setup stream
|
|||
|
|
cudaStreamSynchronize(setupStream);
|
|||
|
|
cudaStreamDestroy(setupStream);
|
|||
|
|
|
|||
|
|
// ============================================================================
|
|||
|
|
// CREATE PERSISTENT INFERENCE AND MEMORY STREAMS
|
|||
|
|
// ============================================================================
|
|||
|
|
// Creating streams here (once, at load time) rather than lazily in
|
|||
|
|
// runInference() removes the hot-path "if (!m_streamInitialized)" branch
|
|||
|
|
// and ensures warmUp() already runs on the real inference stream.
|
|||
|
|
if (!m_streamInitialized) {
|
|||
|
|
int leastPriority, greatestPriority;
|
|||
|
|
cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority);
|
|||
|
|
|
|||
|
|
cudaError_t streamErr = cudaStreamCreateWithPriority(
|
|||
|
|
&m_inferenceStream, cudaStreamNonBlocking, greatestPriority);
|
|||
|
|
if (streamErr != cudaSuccess) {
|
|||
|
|
std::cout << "Error: Failed to create inference stream: "
|
|||
|
|
<< cudaGetErrorString(streamErr) << std::endl;
|
|||
|
|
return false;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
streamErr = cudaStreamCreate(&m_memoryStream);
|
|||
|
|
if (streamErr != cudaSuccess) {
|
|||
|
|
std::cout << "Error: Failed to create memory stream: "
|
|||
|
|
<< cudaGetErrorString(streamErr) << std::endl;
|
|||
|
|
return false;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
m_streamInitialized = true;
|
|||
|
|
if (m_verbose) {
|
|||
|
|
std::cout << "Info: Inference stream created at load time with highest priority" << std::endl;
|
|||
|
|
std::cout << "Info: Memory stream created" << std::endl;
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// ============================================================================
|
|||
|
|
// PRE-WARMUP DIAGNOSTICS
|
|||
|
|
// ============================================================================
|
|||
|
|
|
|||
|
|
if (m_verbose) {
|
|||
|
|
std::cout << "\n========================================" << std::endl;
|
|||
|
|
std::cout << "Pre-Warmup Diagnostics" << std::endl;
|
|||
|
|
std::cout << "========================================" << std::endl;
|
|||
|
|
std::cout << "Engine has " << m_engine->getNbIOTensors() << " I/O tensors" << std::endl;
|
|||
|
|
std::cout << "Engine has " << m_engine->getNbOptimizationProfiles() << " optimization profiles" << std::endl;
|
|||
|
|
for (int i = 0; i < m_engine->getNbIOTensors(); ++i) {
|
|||
|
|
const auto tensorName = m_engine->getIOTensorName(i);
|
|||
|
|
const auto tensorMode = m_engine->getTensorIOMode(tensorName);
|
|||
|
|
const auto tensorShape = m_context->getTensorShape(tensorName);
|
|||
|
|
std::cout << "\nTensor " << i << ": " << tensorName << std::endl;
|
|||
|
|
std::cout << " Mode: " << (tensorMode == nvinfer1::TensorIOMode::kINPUT ? "INPUT" : "OUTPUT") << std::endl;
|
|||
|
|
std::cout << " Shape: [";
|
|||
|
|
for (int j = 0; j < tensorShape.nbDims; ++j) {
|
|||
|
|
if (j > 0) std::cout << ", ";
|
|||
|
|
std::cout << tensorShape.d[j];
|
|||
|
|
}
|
|||
|
|
std::cout << "]" << std::endl;
|
|||
|
|
std::cout << " Buffer address: " << m_buffers[i] << std::endl;
|
|||
|
|
}
|
|||
|
|
std::cout << "\nContext state check:" << std::endl;
|
|||
|
|
std::cout << " All dimensions specified: " << (m_context->allInputDimensionsSpecified() ? "YES" : "NO") << std::endl;
|
|||
|
|
std::cout << "========================================" << std::endl;
|
|||
|
|
}
|
|||
|
|
if (!m_context->allInputDimensionsSpecified()) {
|
|||
|
|
std::cout << "ERROR: Cannot proceed with warmup - dimensions not specified!" << std::endl;
|
|||
|
|
return false;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// ============================================================================
|
|||
|
|
// ENGINE LOADED SUCCESSFULLY
|
|||
|
|
// ============================================================================
|
|||
|
|
|
|||
|
|
if (m_verbose) {
|
|||
|
|
std::cout << "\n========================================" << std::endl;
|
|||
|
|
std::cout << "Engine loaded successfully!" << std::endl;
|
|||
|
|
std::cout << "========================================" << std::endl;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// ============================================================================
|
|||
|
|
// WARMUP
|
|||
|
|
// ============================================================================
|
|||
|
|
|
|||
|
|
if (m_verbose) std::cout << "\nInfo: Starting warm-up inference..." << std::endl;
|
|||
|
|
warmUp(m_verbose ? 10 : 1);
|
|||
|
|
if (m_verbose) std::cout << "Info: Warm-up complete" << std::endl;
|
|||
|
|
|
|||
|
|
} // end of trt_cache_create_context scope
|
|||
|
|
return true;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
|
|||
|
|
|
|||
|
|
template <typename T>
|
|||
|
|
bool Engine<T>::build(std::string onnxModelPath, const std::array<float, 3>& subVals, const std::array<float, 3>& divVals, bool normalize) {
|
|||
|
|
const auto engineName = serializeEngineOptions(m_options, onnxModelPath);
|
|||
|
|
if (FileExist(engineName)) {
|
|||
|
|
std::cout << "Engine file already exists: " << engineName << std::endl;
|
|||
|
|
return true;
|
|||
|
|
}
|
|||
|
|
if (!FileExist(onnxModelPath)) {
|
|||
|
|
std::cout << "Error: ONNX model file does not exist: " << onnxModelPath << std::endl;
|
|||
|
|
return false;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
std::cout << "========================================" << std::endl;
|
|||
|
|
std::cout << "Building TensorRT Engine" << std::endl;
|
|||
|
|
std::cout << "========================================" << std::endl;
|
|||
|
|
std::cout << "TensorRT Version: " << NV_TENSORRT_MAJOR << "."
|
|||
|
|
<< NV_TENSORRT_MINOR << "." << NV_TENSORRT_PATCH << std::endl;
|
|||
|
|
|
|||
|
|
// TensorRT 10+ detection
|
|||
|
|
#if NV_TENSORRT_MAJOR >= 10
|
|||
|
|
std::cout << "\n⚠️ TensorRT " << NV_TENSORRT_MAJOR << "." << NV_TENSORRT_MINOR
|
|||
|
|
<< " detected - will apply dynamic batch optimization flags" << std::endl;
|
|||
|
|
#endif
|
|||
|
|
|
|||
|
|
// Create our engine builder.
|
|||
|
|
auto builder = std::unique_ptr<nvinfer1::IBuilder>(nvinfer1::createInferBuilder(m_logger));
|
|||
|
|
if (!builder) {
|
|||
|
|
std::cout << "Error: Failed to create builder" << std::endl;
|
|||
|
|
return false;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
auto network = std::unique_ptr<nvinfer1::INetworkDefinition>(TRT_CREATE_NETWORK(builder));
|
|||
|
|
if (!network) {
|
|||
|
|
std::cout << "Error: Failed to create network" << std::endl;
|
|||
|
|
return false;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Create a parser for reading the onnx file.
|
|||
|
|
auto parser = std::unique_ptr<nvonnxparser::IParser>(nvonnxparser::createParser(*network, m_logger));
|
|||
|
|
if (!parser) {
|
|||
|
|
std::cout << "Error: Failed to create parser" << std::endl;
|
|||
|
|
return false;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Read the onnx file into memory
|
|||
|
|
std::ifstream file(onnxModelPath, std::ios::binary | std::ios::ate);
|
|||
|
|
std::streamsize size = file.tellg();
|
|||
|
|
file.seekg(0, std::ios::beg);
|
|||
|
|
|
|||
|
|
std::vector<char> buffer(size);
|
|||
|
|
if (!file.read(buffer.data(), size)) {
|
|||
|
|
std::cout << "Error: Unable to read ONNX file" << std::endl;
|
|||
|
|
return false;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
std::cout << "ONNX model size: " << size / (1024 * 1024) << " MiB" << std::endl;
|
|||
|
|
|
|||
|
|
// Parse the buffer we read into memory (crash-safe).
|
|||
|
|
std::cout << "Parsing ONNX model..." << std::endl;
|
|||
|
|
unsigned long sehCodeParse = 0;
|
|||
|
|
auto parsed = parseOnnxModelSafe(parser.get(), buffer.data(),
|
|||
|
|
buffer.size(), &sehCodeParse);
|
|||
|
|
if (sehCodeParse != 0) {
|
|||
|
|
std::cout << "[Engine] FATAL: ONNX parser crashed ("
|
|||
|
|
<< formatCrashCode(sehCodeParse) << ")" << std::endl;
|
|||
|
|
std::cout << "[Engine] This may indicate a corrupt ONNX file or driver issue." << std::endl;
|
|||
|
|
return false;
|
|||
|
|
}
|
|||
|
|
if (!parsed) {
|
|||
|
|
std::cout << "Error: Failed to parse ONNX model" << std::endl;
|
|||
|
|
for (int32_t i = 0; i < parser->getNbErrors(); ++i) {
|
|||
|
|
std::cout << " " << parser->getError(i)->desc() << std::endl;
|
|||
|
|
}
|
|||
|
|
return false;
|
|||
|
|
}
|
|||
|
|
std::cout << "ONNX model parsed successfully" << std::endl;
|
|||
|
|
|
|||
|
|
// ============================================================================
|
|||
|
|
// ENHANCED ONNX MODEL ANALYSIS
|
|||
|
|
// ============================================================================
|
|||
|
|
|
|||
|
|
std::cout << "\n========================================" << std::endl;
|
|||
|
|
std::cout << "ONNX Model Analysis" << std::endl;
|
|||
|
|
std::cout << "========================================" << std::endl;
|
|||
|
|
|
|||
|
|
const auto numInputs = network->getNbInputs();
|
|||
|
|
if (numInputs < 1) {
|
|||
|
|
std::cout << "Error: Model needs at least 1 input!" << std::endl;
|
|||
|
|
return false;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
std::cout << "Number of inputs: " << numInputs << std::endl;
|
|||
|
|
|
|||
|
|
// Analyze all inputs
|
|||
|
|
for (int32_t i = 0; i < numInputs; ++i) {
|
|||
|
|
const auto input = network->getInput(i);
|
|||
|
|
const auto inputDims = input->getDimensions();
|
|||
|
|
|
|||
|
|
std::cout << "\nInput [" << i << "] '" << input->getName() << "':" << std::endl;
|
|||
|
|
std::cout << " Dimensions: [";
|
|||
|
|
for (int j = 0; j < inputDims.nbDims; ++j) {
|
|||
|
|
if (j > 0) std::cout << ", ";
|
|||
|
|
if (inputDims.d[j] == -1) {
|
|||
|
|
std::cout << "DYNAMIC";
|
|||
|
|
}
|
|||
|
|
else {
|
|||
|
|
std::cout << inputDims.d[j];
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
std::cout << "]" << std::endl;
|
|||
|
|
|
|||
|
|
// Check batch dimension
|
|||
|
|
if (inputDims.d[0] == -1) {
|
|||
|
|
std::cout << " ✓ Batch dimension: DYNAMIC" << std::endl;
|
|||
|
|
}
|
|||
|
|
else {
|
|||
|
|
std::cout << " ✗ Batch dimension: FIXED at " << inputDims.d[0] << std::endl;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Check height dimension (if applicable)
|
|||
|
|
if (inputDims.nbDims >= 3 && inputDims.d[2] == -1) {
|
|||
|
|
std::cout << " ✓ Height dimension: DYNAMIC" << std::endl;
|
|||
|
|
}
|
|||
|
|
else if (inputDims.nbDims >= 3) {
|
|||
|
|
std::cout << " • Height dimension: FIXED at " << inputDims.d[2] << std::endl;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Check width dimension (if applicable)
|
|||
|
|
if (inputDims.nbDims >= 4 && inputDims.d[3] == -1) {
|
|||
|
|
std::cout << " ✓ Width dimension: DYNAMIC" << std::endl;
|
|||
|
|
}
|
|||
|
|
else if (inputDims.nbDims >= 4) {
|
|||
|
|
std::cout << " • Width dimension: FIXED at " << inputDims.d[3] << std::endl;
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Ensure that all the inputs have the same batch size
|
|||
|
|
const auto input0Batch = network->getInput(0)->getDimensions().d[0];
|
|||
|
|
for (int32_t i = 1; i < numInputs; ++i) {
|
|||
|
|
if (network->getInput(i)->getDimensions().d[0] != input0Batch) {
|
|||
|
|
std::cout << "\nError: Model has multiple inputs with differing batch sizes!" << std::endl;
|
|||
|
|
return false;
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Check to see if the model supports dynamic batch size or not
|
|||
|
|
bool doesSupportDynamicBatch = false;
|
|||
|
|
if (input0Batch == -1) {
|
|||
|
|
doesSupportDynamicBatch = true;
|
|||
|
|
std::cout << "\n✓ Model supports DYNAMIC batch size" << std::endl;
|
|||
|
|
std::cout << " Batch size range: min=1, opt=" << m_options.optBatchSize
|
|||
|
|
<< ", max=" << m_options.maxBatchSize << std::endl;
|
|||
|
|
}
|
|||
|
|
else {
|
|||
|
|
std::cout << "\n✗ Model only supports FIXED batch size of " << input0Batch << std::endl;
|
|||
|
|
std::cout << " WARNING: This will limit batch processing performance!" << std::endl;
|
|||
|
|
std::cout << " Consider re-exporting ONNX with dynamic batch axis." << std::endl;
|
|||
|
|
|
|||
|
|
// Adjust batch size options to match model's fixed batch size
|
|||
|
|
if (m_options.optBatchSize != input0Batch || m_options.maxBatchSize != input0Batch) {
|
|||
|
|
std::cout << " Adjusting batch size options to match model's fixed batch size" << std::endl;
|
|||
|
|
m_options.optBatchSize = input0Batch;
|
|||
|
|
m_options.maxBatchSize = input0Batch;
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Check for dynamic width and height dimensions
|
|||
|
|
const auto inputHeight = network->getInput(0)->getDimensions().d[2];
|
|||
|
|
const auto inputWidth = network->getInput(0)->getDimensions().d[3];
|
|||
|
|
|
|||
|
|
bool doesSupportDynamicHeight = false;
|
|||
|
|
bool doesSupportDynamicWidth = false;
|
|||
|
|
|
|||
|
|
// Check height dimension
|
|||
|
|
if (inputHeight == -1) {
|
|||
|
|
doesSupportDynamicHeight = true;
|
|||
|
|
std::cout << "\n✓ Model supports DYNAMIC height" << std::endl;
|
|||
|
|
|
|||
|
|
if (m_options.optInputHeight == -1) {
|
|||
|
|
std::cout << " No user-configured height found, using default: 640" << std::endl;
|
|||
|
|
m_options.minInputHeight = 640;
|
|||
|
|
m_options.optInputHeight = 640;
|
|||
|
|
m_options.maxInputHeight = 640;
|
|||
|
|
}
|
|||
|
|
else {
|
|||
|
|
std::cout << " Using user-configured height: " << m_options.optInputHeight << std::endl;
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
else {
|
|||
|
|
std::cout << "\n• Model has FIXED height: " << inputHeight << std::endl;
|
|||
|
|
m_options.minInputHeight = m_options.optInputHeight = m_options.maxInputHeight = inputHeight;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Check width dimension
|
|||
|
|
if (inputWidth == -1) {
|
|||
|
|
doesSupportDynamicWidth = true;
|
|||
|
|
std::cout << "✓ Model supports DYNAMIC width" << std::endl;
|
|||
|
|
|
|||
|
|
if (m_options.optInputWidth == -1) {
|
|||
|
|
std::cout << " No user-configured width found, using default: 640" << std::endl;
|
|||
|
|
m_options.minInputWidth = 640;
|
|||
|
|
m_options.optInputWidth = 640;
|
|||
|
|
m_options.maxInputWidth = 640;
|
|||
|
|
}
|
|||
|
|
else {
|
|||
|
|
std::cout << " Using user-configured width: " << m_options.optInputWidth << std::endl;
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
else {
|
|||
|
|
std::cout << "• Model has FIXED width: " << inputWidth << std::endl;
|
|||
|
|
m_options.minInputWidth = m_options.optInputWidth = m_options.maxInputWidth = inputWidth;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
std::cout << "\nFinal input dimensions configured:" << std::endl;
|
|||
|
|
std::cout << " Height: " << m_options.optInputHeight << std::endl;
|
|||
|
|
std::cout << " Width: " << m_options.optInputWidth << std::endl;
|
|||
|
|
std::cout << "========================================" << std::endl;
|
|||
|
|
|
|||
|
|
auto config = std::unique_ptr<nvinfer1::IBuilderConfig>(builder->createBuilderConfig());
|
|||
|
|
if (!config) {
|
|||
|
|
std::cout << "Error: Failed to create builder config" << std::endl;
|
|||
|
|
return false;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// ============================================================================
|
|||
|
|
// PERFORMANCE OPTIMIZATIONS
|
|||
|
|
// ============================================================================
|
|||
|
|
|
|||
|
|
std::cout << "\n========================================" << std::endl;
|
|||
|
|
std::cout << "Configuring Performance Settings" << std::endl;
|
|||
|
|
std::cout << "========================================" << std::endl;
|
|||
|
|
|
|||
|
|
// Get GPU properties for the target device (not always GPU 0)
|
|||
|
|
cudaDeviceProp prop;
|
|||
|
|
cudaGetDeviceProperties(&prop, m_options.deviceIndex);
|
|||
|
|
std::cout << "Building engine for GPU " << m_options.deviceIndex << ": " << prop.name << std::endl;
|
|||
|
|
std::cout << "Compute Capability: " << prop.major << "." << prop.minor << std::endl;
|
|||
|
|
std::cout << "Total GPU Memory: " << prop.totalGlobalMem / (1024 * 1024) << " MiB" << std::endl;
|
|||
|
|
|
|||
|
|
size_t free_mem, total_mem;
|
|||
|
|
cudaMemGetInfo(&free_mem, &total_mem);
|
|||
|
|
const size_t totalMiB = total_mem / (1024ULL * 1024);
|
|||
|
|
|
|||
|
|
// -- GPU-tier adaptive configuration --------------------------------------
|
|||
|
|
// All performance parameters scale with GPU VRAM to avoid OOM on small
|
|||
|
|
// GPUs while maximising throughput on larger ones.
|
|||
|
|
//
|
|||
|
|
// VRAM | Workspace | Opt Level | Max Batch | Tactic DRAM
|
|||
|
|
// ------------|-----------|-----------|-----------|-------------------
|
|||
|
|
// <= 1 GiB | 256 MiB | 3 | 1 | up to 2 GiB cap
|
|||
|
|
// <= 2 GiB | 512 MiB | 3 | 2 | up to 2 GiB cap
|
|||
|
|
// <= 4 GiB | 1 GiB | 3 | 4 | up to 2 GiB cap
|
|||
|
|
// <= 6 GiB | 2 GiB | 3 | 8 | up to 2 GiB cap
|
|||
|
|
// <= 8 GiB | 2 GiB | 3 | 16 | up to 2 GiB cap
|
|||
|
|
// <=12 GiB | 2 GiB | 3 | 16 | up to 2 GiB cap
|
|||
|
|
// <=16 GiB | 8 GiB | 5 | 32 | up to 4 GiB cap
|
|||
|
|
// <=24 GiB | 8 GiB | 5 | 32 | up to 4 GiB cap
|
|||
|
|
// > 24 GiB | 16 GiB | 5 | 32 | up to 8 GiB cap
|
|||
|
|
|
|||
|
|
// -- 1. Workspace size ----------------------------------------------------
|
|||
|
|
size_t max_workspace;
|
|||
|
|
const char* tierLabel;
|
|||
|
|
if (totalMiB > 24576) { // > 24 GiB
|
|||
|
|
max_workspace = 16ULL * 1024 * 1024 * 1024;
|
|||
|
|
tierLabel = "high-end (>24 GiB)";
|
|||
|
|
} else if (totalMiB > 12288) { // > 12 GiB
|
|||
|
|
max_workspace = 8ULL * 1024 * 1024 * 1024;
|
|||
|
|
tierLabel = "desktop (>12 GiB)";
|
|||
|
|
} else if (totalMiB > 4096) { // > 4 GiB
|
|||
|
|
max_workspace = 2ULL * 1024 * 1024 * 1024;
|
|||
|
|
tierLabel = "laptop (4-12 GiB)";
|
|||
|
|
} else if (totalMiB > 2048) { // > 2 GiB
|
|||
|
|
max_workspace = 1ULL * 1024 * 1024 * 1024;
|
|||
|
|
tierLabel = "low-end (2-4 GiB)";
|
|||
|
|
} else if (totalMiB > 1024) { // > 1 GiB
|
|||
|
|
max_workspace = 512ULL * 1024 * 1024;
|
|||
|
|
tierLabel = "minimal (1-2 GiB)";
|
|||
|
|
} else { // <= 1 GiB
|
|||
|
|
max_workspace = 256ULL * 1024 * 1024;
|
|||
|
|
tierLabel = "ultra-low (<=1 GiB)";
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
size_t workspace_size = std::min(max_workspace, static_cast<size_t>(free_mem * 0.4));
|
|||
|
|
config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, workspace_size);
|
|||
|
|
std::cout << "Workspace size set to: " << workspace_size / (1024 * 1024)
|
|||
|
|
<< " MiB (" << tierLabel << " tier)" << std::endl;
|
|||
|
|
|
|||
|
|
// -- 2. Max batch size cap ------------------------------------------------
|
|||
|
|
// The model config sets the *desired* maxBatchSize; the GPU VRAM
|
|||
|
|
// determines the *actual* cap. This affects the optimisation profile
|
|||
|
|
// range, warmup, and runtime chunk splitting.
|
|||
|
|
// Thresholds use ~97% of marketing size to account for OS/driver reserved
|
|||
|
|
// memory (e.g. an "8 GB" GPU reports 8187 MiB).
|
|||
|
|
if (doesSupportDynamicBatch && m_options.maxBatchSize > 1) {
|
|||
|
|
int gpuMaxBatch;
|
|||
|
|
if (totalMiB >= 15800) gpuMaxBatch = 32; // ~16 GiB (e.g. 16384 -> reports ~15900+)
|
|||
|
|
else if (totalMiB >= 11800) gpuMaxBatch = 16; // ~12 GiB (e.g. 12288 -> reports ~11800+)
|
|||
|
|
else if (totalMiB >= 7900) gpuMaxBatch = 8; // ~ 8 GiB (e.g. 8192 -> reports ~8100+; batch=16 OCR ~987 MiB too large for 4 tasks)
|
|||
|
|
else if (totalMiB >= 3900) gpuMaxBatch = 4; // ~ 4 GiB (e.g. 4096 -> reports ~3950+)
|
|||
|
|
else if (totalMiB >= 1900) gpuMaxBatch = 2; // ~ 2 GiB (e.g. 2048 -> reports ~1950+)
|
|||
|
|
else gpuMaxBatch = 1; // < 2 GiB
|
|||
|
|
|
|||
|
|
const int prevMax = m_options.maxBatchSize;
|
|||
|
|
m_options.maxBatchSize = std::min(m_options.maxBatchSize, gpuMaxBatch);
|
|||
|
|
m_options.optBatchSize = std::min(m_options.optBatchSize, m_options.maxBatchSize);
|
|||
|
|
|
|||
|
|
if (prevMax != m_options.maxBatchSize) {
|
|||
|
|
std::cout << "Max batch size capped by GPU VRAM: " << prevMax
|
|||
|
|
<< " -> " << m_options.maxBatchSize
|
|||
|
|
<< " (GPU has " << totalMiB << " MiB)" << std::endl;
|
|||
|
|
}
|
|||
|
|
std::cout << "Batch config: opt=" << m_options.optBatchSize
|
|||
|
|
<< ", max=" << m_options.maxBatchSize << std::endl;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// -- 3. Optimisation level ------------------------------------------------
|
|||
|
|
// Level 5 (exhaustive kernel search) only on GPUs with ≥16 GiB where
|
|||
|
|
// the tactic DRAM pool can hold the largest tactics. On smaller GPUs,
|
|||
|
|
// level 3 gives ~95 % of the runtime performance with dramatically
|
|||
|
|
// shorter build times.
|
|||
|
|
// Level 3 = balanced (best tradeoff: fast build, near-optimal kernels)
|
|||
|
|
// Level 5 = exhaustive (10x slower build for ~1-3% faster inference)
|
|||
|
|
// Use level 3 for all GPUs — the marginal runtime gain from level 5
|
|||
|
|
// is not worth the 10-30 minute build time on first run.
|
|||
|
|
const int optLevel = 3;
|
|||
|
|
config->setBuilderOptimizationLevel(optLevel);
|
|||
|
|
std::cout << "Builder optimization level set to " << optLevel
|
|||
|
|
<< " (balanced)" << std::endl;
|
|||
|
|
|
|||
|
|
// Enable TF32 for Ampere and newer GPUs
|
|||
|
|
if (prop.major >= 8) {
|
|||
|
|
config->setFlag(nvinfer1::BuilderFlag::kTF32);
|
|||
|
|
std::cout << "TF32 enabled for Ampere/Ada/Blackwell architecture" << std::endl;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Enable optimization flags
|
|||
|
|
// kPREFER_PRECISION_CONSTRAINTS removed: deprecated in TRT 10.12, no-op in TRT 10.15.1.
|
|||
|
|
config->setFlag(nvinfer1::BuilderFlag::kGPU_FALLBACK);
|
|||
|
|
std::cout << "Optimization flags enabled" << std::endl;
|
|||
|
|
|
|||
|
|
// kDIRECT_IO removed: deprecated in TRT 10.7 as "Unneeded API".
|
|||
|
|
// TRT 10.7+ enables this behaviour automatically; the flag is a no-op in TRT 10.15.1.
|
|||
|
|
|
|||
|
|
// Enable all available tactic sources
|
|||
|
|
uint32_t tacticSources = 1U << static_cast<uint32_t>(nvinfer1::TacticSource::kCUBLAS) |
|
|||
|
|
1U << static_cast<uint32_t>(nvinfer1::TacticSource::kCUBLAS_LT) |
|
|||
|
|
1U << static_cast<uint32_t>(nvinfer1::TacticSource::kCUDNN);
|
|||
|
|
|
|||
|
|
if (prop.major >= 8) {
|
|||
|
|
tacticSources |= 1U << static_cast<uint32_t>(nvinfer1::TacticSource::kEDGE_MASK_CONVOLUTIONS);
|
|||
|
|
tacticSources |= 1U << static_cast<uint32_t>(nvinfer1::TacticSource::kJIT_CONVOLUTIONS);
|
|||
|
|
std::cout << "Enhanced tactic sources enabled for Ampere+ architecture" << std::endl;
|
|||
|
|
}
|
|||
|
|
config->setTacticSources(tacticSources);
|
|||
|
|
|
|||
|
|
// kDETAILED profiling embeds per-layer metadata in the engine and adds measurable
|
|||
|
|
// build/inference overhead. Use kNONE for production; switch to kDETAILED or
|
|||
|
|
// kLAYER_NAMES_ONLY only when profiling with Nsight Systems / trt-exec --profilingVerbosity.
|
|||
|
|
config->setProfilingVerbosity(nvinfer1::ProfilingVerbosity::kNONE);
|
|||
|
|
|
|||
|
|
// Set timing iterations
|
|||
|
|
config->setAvgTimingIterations(4);
|
|||
|
|
std::cout << "Timing iterations set to 4 for stable kernel selection" << std::endl;
|
|||
|
|
|
|||
|
|
// Set hardware compatibility
|
|||
|
|
config->setHardwareCompatibilityLevel(nvinfer1::HardwareCompatibilityLevel::kNONE);
|
|||
|
|
|
|||
|
|
// -- TensorRT 10+ tactic DRAM pool ----------------------------------------
|
|||
|
|
// Separate scratch pool for kernel-selection during build. Without this,
|
|||
|
|
// tactic evaluation competes with the workspace allocation and tactics
|
|||
|
|
// requesting >1 GiB get skipped, causing hours of wasted fallback searches.
|
|||
|
|
//
|
|||
|
|
// Strategy: give the tactic pool as much memory as possible while reserving
|
|||
|
|
// enough for workspace + builder overhead. The cap scales with GPU VRAM:
|
|||
|
|
// <=12 GiB -> up to 2 GiB (most tactics fit within 1.5 GiB)
|
|||
|
|
// <=24 GiB -> up to 4 GiB (room for larger model tactics)
|
|||
|
|
// > 24 GiB -> up to 8 GiB (future-proof for very large models)
|
|||
|
|
#if NV_TENSORRT_MAJOR >= 10
|
|||
|
|
{
|
|||
|
|
// Scale the tactic cap by GPU VRAM -- larger GPUs can afford more
|
|||
|
|
size_t tacticCap;
|
|||
|
|
if (totalMiB > 24576) tacticCap = 8ULL * 1024 * 1024 * 1024; // > 24 GiB
|
|||
|
|
else if (totalMiB > 12288) tacticCap = 4ULL * 1024 * 1024 * 1024; // > 12 GiB
|
|||
|
|
else tacticCap = 2ULL * 1024 * 1024 * 1024; // <= 12 GiB
|
|||
|
|
|
|||
|
|
// Reserve workspace + 512 MiB safety margin for builder internals
|
|||
|
|
const size_t reserveForBuild = workspace_size + (512ULL * 1024 * 1024);
|
|||
|
|
const size_t availableForTactic =
|
|||
|
|
(free_mem > reserveForBuild) ? (free_mem - reserveForBuild) : 0ULL;
|
|||
|
|
|
|||
|
|
size_t tacticMemory = std::min(tacticCap, availableForTactic);
|
|||
|
|
|
|||
|
|
// kTACTIC_DRAM requires a power-of-2 size; floor to nearest power of 2
|
|||
|
|
if (tacticMemory > 0) {
|
|||
|
|
size_t p = 1ULL;
|
|||
|
|
while (p * 2 <= tacticMemory) p *= 2;
|
|||
|
|
tacticMemory = p;
|
|||
|
|
}
|
|||
|
|
config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kTACTIC_DRAM, tacticMemory);
|
|||
|
|
std::cout << "kTACTIC_DRAM pool: " << tacticMemory / (1024 * 1024) << " MiB (TRT 10+)" << std::endl;
|
|||
|
|
}
|
|||
|
|
#endif
|
|||
|
|
|
|||
|
|
// -- kSTRONGLY_TYPED (TRT 8.5 - 9.x only) --------------------------------
|
|||
|
|
// This flag existed in TRT 8.5 through 9.x to opt into strict type
|
|||
|
|
// enforcement. NVIDIA removed the enum in TRT 10.0 because strongly-typed
|
|||
|
|
// networks became the default behaviour -- setting it on TRT 10+ produces a
|
|||
|
|
// compile error ("undeclared identifier"). For TRT 10+ simply log a note.
|
|||
|
|
#if NV_TENSORRT_MAJOR < 10
|
|||
|
|
if (m_options.precision != ANSCENTER::Precision::INT8) {
|
|||
|
|
config->setFlag(nvinfer1::BuilderFlag::kSTRONGLY_TYPED);
|
|||
|
|
std::cout << "kSTRONGLY_TYPED enabled (TRT 8.5-9.x, FP32/FP16 mode)" << std::endl;
|
|||
|
|
}
|
|||
|
|
#else
|
|||
|
|
// TRT 10+: strongly-typed networks are the default; no flag required.
|
|||
|
|
std::cout << "Info: Strongly-typed mode is default in TRT 10+ (kSTRONGLY_TYPED removed)" << std::endl;
|
|||
|
|
#endif
|
|||
|
|
|
|||
|
|
// -- kFASTER_DYNAMIC_SHAPES ------------------------------------------------
|
|||
|
|
// This flag reduces context-reshape overhead when batch size changes between
|
|||
|
|
// calls (10-100x faster switching, ~5% larger engine). It was added in a
|
|||
|
|
// TRT 10 minor release but the exact version varies by NVIDIA build; the
|
|||
|
|
// enum is absent from the installed headers so it is disabled here.
|
|||
|
|
// To re-enable: uncomment the block below once you confirm your TRT version
|
|||
|
|
// exposes nvinfer1::BuilderFlag::kFASTER_DYNAMIC_SHAPES.
|
|||
|
|
//
|
|||
|
|
// if (doesSupportDynamicBatch && m_options.maxBatchSize > 1) {
|
|||
|
|
// config->setFlag(nvinfer1::BuilderFlag::kFASTER_DYNAMIC_SHAPES);
|
|||
|
|
// std::cout << "kFASTER_DYNAMIC_SHAPES enabled" << std::endl;
|
|||
|
|
// }
|
|||
|
|
|
|||
|
|
// -- kWEIGHT_STREAMING (TRT 10+) ------------------------------------------
|
|||
|
|
// DISABLED: kWEIGHT_STREAMING requires INetworkDefinition::setStronglyTyped(true)
|
|||
|
|
// to be called on the network before buildSerializedNetwork(), which is not done
|
|||
|
|
// for ONNX-imported networks in this code path. BuilderFlag::kSTRONGLY_TYPED was
|
|||
|
|
// removed from TRT 10+ (compile error), so there is no flag-level workaround.
|
|||
|
|
// Re-enable only if the ONNX parser layer is updated to call setStronglyTyped(true).
|
|||
|
|
// #if NV_TENSORRT_MAJOR >= 10
|
|||
|
|
// config->setFlag(nvinfer1::BuilderFlag::kWEIGHT_STREAMING);
|
|||
|
|
// std::cout << "kWEIGHT_STREAMING enabled (TRT 10+)" << std::endl;
|
|||
|
|
// #endif
|
|||
|
|
|
|||
|
|
// ============================================================================
|
|||
|
|
// TENSORRT 10+ DYNAMIC BATCH SUMMARY
|
|||
|
|
// ============================================================================
|
|||
|
|
|
|||
|
|
#if NV_TENSORRT_MAJOR >= 10
|
|||
|
|
std::cout << "\nTensorRT " << NV_TENSORRT_MAJOR << "." << NV_TENSORRT_MINOR << "." << NV_TENSORRT_PATCH
|
|||
|
|
<< " | dynamic batch: " << (doesSupportDynamicBatch && m_options.maxBatchSize > 1 ? "YES" : "NO")
|
|||
|
|
<< " | max batch: " << m_options.maxBatchSize
|
|||
|
|
<< " | opt level: " << optLevel
|
|||
|
|
<< " | GPU VRAM: " << totalMiB << " MiB" << std::endl;
|
|||
|
|
#endif
|
|||
|
|
|
|||
|
|
// Load timing cache if available (use actual engine name -- batch may have been capped above)
|
|||
|
|
const auto currentEngineName = serializeEngineOptions(m_options, onnxModelPath);
|
|||
|
|
std::string timingCachePath = currentEngineName + ".timing.cache";
|
|||
|
|
std::vector<char> timingCache;
|
|||
|
|
std::ifstream timingCacheFile(timingCachePath, std::ios::binary);
|
|||
|
|
if (timingCacheFile.good()) {
|
|||
|
|
timingCacheFile.seekg(0, std::ios::end);
|
|||
|
|
timingCache.resize(timingCacheFile.tellg());
|
|||
|
|
timingCacheFile.seekg(0, std::ios::beg);
|
|||
|
|
timingCacheFile.read(timingCache.data(), timingCache.size());
|
|||
|
|
|
|||
|
|
auto cache = config->createTimingCache(timingCache.data(), timingCache.size());
|
|||
|
|
if (cache) {
|
|||
|
|
config->setTimingCache(*cache, false);
|
|||
|
|
std::cout << "Loaded timing cache from: " << timingCachePath << std::endl;
|
|||
|
|
std::cout << " Cache size: " << timingCache.size() / 1024 << " KiB" << std::endl;
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
else {
|
|||
|
|
std::cout << "No existing timing cache found (this is normal for first build)" << std::endl;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// ============================================================================
|
|||
|
|
// OPTIMIZATION PROFILE CONFIGURATION
|
|||
|
|
// ============================================================================
|
|||
|
|
|
|||
|
|
std::cout << "\n========================================" << std::endl;
|
|||
|
|
std::cout << "Configuring Optimization Profiles" << std::endl;
|
|||
|
|
std::cout << "========================================" << std::endl;
|
|||
|
|
|
|||
|
|
// Validate batch size options
|
|||
|
|
if (doesSupportDynamicBatch) {
|
|||
|
|
if (m_options.optBatchSize < 1) {
|
|||
|
|
std::cout << "Warning: optBatchSize < 1, setting to 1" << std::endl;
|
|||
|
|
m_options.optBatchSize = 1;
|
|||
|
|
}
|
|||
|
|
if (m_options.maxBatchSize < m_options.optBatchSize) {
|
|||
|
|
std::cout << "Warning: maxBatchSize < optBatchSize, adjusting maxBatchSize" << std::endl;
|
|||
|
|
m_options.maxBatchSize = m_options.optBatchSize;
|
|||
|
|
}
|
|||
|
|
std::cout << "Dynamic batch configuration validated:" << std::endl;
|
|||
|
|
std::cout << " Min batch size: 1" << std::endl;
|
|||
|
|
std::cout << " Opt batch size: " << m_options.optBatchSize << std::endl;
|
|||
|
|
std::cout << " Max batch size: " << m_options.maxBatchSize << std::endl;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Create optimization profile
|
|||
|
|
nvinfer1::IOptimizationProfile* optProfile = builder->createOptimizationProfile();
|
|||
|
|
if (!optProfile) {
|
|||
|
|
std::cout << "Error: Failed to create optimization profile" << std::endl;
|
|||
|
|
return false;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
for (int32_t i = 0; i < numInputs; ++i) {
|
|||
|
|
const auto input = network->getInput(i);
|
|||
|
|
const auto inputName = input->getName();
|
|||
|
|
const auto inputDims = input->getDimensions();
|
|||
|
|
int32_t inputC = inputDims.d[1];
|
|||
|
|
int32_t inputH = inputDims.d[2];
|
|||
|
|
int32_t inputW = inputDims.d[3];
|
|||
|
|
|
|||
|
|
// Use configured values for height
|
|||
|
|
int32_t minInputHeight = doesSupportDynamicHeight ? m_options.minInputHeight : inputH;
|
|||
|
|
int32_t optInputHeight = doesSupportDynamicHeight ? m_options.optInputHeight : inputH;
|
|||
|
|
int32_t maxInputHeight = doesSupportDynamicHeight ? m_options.maxInputHeight : inputH;
|
|||
|
|
|
|||
|
|
// Use configured values for width
|
|||
|
|
int32_t minInputWidth = doesSupportDynamicWidth ? m_options.minInputWidth : inputW;
|
|||
|
|
int32_t optInputWidth = doesSupportDynamicWidth ? m_options.optInputWidth : inputW;
|
|||
|
|
int32_t maxInputWidth = doesSupportDynamicWidth ? m_options.maxInputWidth : inputW;
|
|||
|
|
|
|||
|
|
// Create dimension objects
|
|||
|
|
int32_t minBatch = doesSupportDynamicBatch ? 1 : m_options.optBatchSize;
|
|||
|
|
int32_t optBatch = doesSupportDynamicBatch ? m_options.optBatchSize : m_options.optBatchSize;
|
|||
|
|
int32_t maxBatch = doesSupportDynamicBatch ? m_options.maxBatchSize : m_options.maxBatchSize;
|
|||
|
|
|
|||
|
|
nvinfer1::Dims4 minDims(minBatch, inputC, minInputHeight, minInputWidth);
|
|||
|
|
nvinfer1::Dims4 optDims(optBatch, inputC, optInputHeight, optInputWidth);
|
|||
|
|
nvinfer1::Dims4 maxDims(maxBatch, inputC, maxInputHeight, maxInputWidth);
|
|||
|
|
|
|||
|
|
std::cout << "\nSetting profile for input '" << inputName << "':" << std::endl;
|
|||
|
|
std::cout << " MIN: [" << minDims.d[0] << "," << minDims.d[1] << ","
|
|||
|
|
<< minDims.d[2] << "," << minDims.d[3] << "]" << std::endl;
|
|||
|
|
std::cout << " OPT: [" << optDims.d[0] << "," << optDims.d[1] << ","
|
|||
|
|
<< optDims.d[2] << "," << optDims.d[3] << "]" << std::endl;
|
|||
|
|
std::cout << " MAX: [" << maxDims.d[0] << "," << maxDims.d[1] << ","
|
|||
|
|
<< maxDims.d[2] << "," << maxDims.d[3] << "]" << std::endl;
|
|||
|
|
|
|||
|
|
// Set the dimensions with error checking
|
|||
|
|
bool minSet = optProfile->setDimensions(inputName, nvinfer1::OptProfileSelector::kMIN, minDims);
|
|||
|
|
bool optSet = optProfile->setDimensions(inputName, nvinfer1::OptProfileSelector::kOPT, optDims);
|
|||
|
|
bool maxSet = optProfile->setDimensions(inputName, nvinfer1::OptProfileSelector::kMAX, maxDims);
|
|||
|
|
|
|||
|
|
if (!minSet || !optSet || !maxSet) {
|
|||
|
|
std::cout << " ✗ ERROR: Failed to set profile dimensions!" << std::endl;
|
|||
|
|
std::cout << " minSet: " << (minSet ? "OK" : "FAILED") << std::endl;
|
|||
|
|
std::cout << " optSet: " << (optSet ? "OK" : "FAILED") << std::endl;
|
|||
|
|
std::cout << " maxSet: " << (maxSet ? "OK" : "FAILED") << std::endl;
|
|||
|
|
return false;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
std::cout << " ✓ Profile dimensions set successfully" << std::endl;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Validate the profile
|
|||
|
|
std::cout << "\n========================================" << std::endl;
|
|||
|
|
std::cout << "VALIDATING OPTIMIZATION PROFILE" << std::endl;
|
|||
|
|
std::cout << "========================================" << std::endl;
|
|||
|
|
|
|||
|
|
bool profileValid = optProfile->isValid();
|
|||
|
|
std::cout << "Profile validation result: " << (profileValid ? "✓ VALID" : "✗ INVALID") << std::endl;
|
|||
|
|
|
|||
|
|
if (!profileValid) {
|
|||
|
|
std::cout << "ERROR: Profile is invalid! Cannot continue." << std::endl;
|
|||
|
|
std::cout << "This usually means the min/opt/max dimensions are inconsistent." << std::endl;
|
|||
|
|
return false;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Verify what we actually set
|
|||
|
|
for (int32_t i = 0; i < numInputs; ++i) {
|
|||
|
|
const auto input = network->getInput(i);
|
|||
|
|
const auto inputName = input->getName();
|
|||
|
|
|
|||
|
|
auto minDims = optProfile->getDimensions(inputName, nvinfer1::OptProfileSelector::kMIN);
|
|||
|
|
auto optDims = optProfile->getDimensions(inputName, nvinfer1::OptProfileSelector::kOPT);
|
|||
|
|
auto maxDims = optProfile->getDimensions(inputName, nvinfer1::OptProfileSelector::kMAX);
|
|||
|
|
|
|||
|
|
std::cout << "\nVerified profile for input '" << inputName << "':" << std::endl;
|
|||
|
|
std::cout << " MIN: [" << minDims.d[0] << "," << minDims.d[1] << ","
|
|||
|
|
<< minDims.d[2] << "," << minDims.d[3] << "]" << std::endl;
|
|||
|
|
std::cout << " OPT: [" << optDims.d[0] << "," << optDims.d[1] << ","
|
|||
|
|
<< optDims.d[2] << "," << optDims.d[3] << "]" << std::endl;
|
|||
|
|
std::cout << " MAX: [" << maxDims.d[0] << "," << maxDims.d[1] << ","
|
|||
|
|
<< maxDims.d[2] << "," << maxDims.d[3] << "]" << std::endl;
|
|||
|
|
|
|||
|
|
// Check batch dimension range
|
|||
|
|
if (minDims.d[0] != maxDims.d[0]) {
|
|||
|
|
std::cout << " ✓ Profile IS DYNAMIC (batch " << minDims.d[0]
|
|||
|
|
<< " to " << maxDims.d[0] << ")" << std::endl;
|
|||
|
|
}
|
|||
|
|
else {
|
|||
|
|
std::cout << " • Profile IS FIXED at batch " << minDims.d[0] << std::endl;
|
|||
|
|
|
|||
|
|
if (doesSupportDynamicBatch && m_options.maxBatchSize > 1) {
|
|||
|
|
std::cout << "\n🚨 CRITICAL ERROR: ONNX supports dynamic batch but profile is fixed!" << std::endl;
|
|||
|
|
return false;
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
std::cout << "========================================" << std::endl;
|
|||
|
|
|
|||
|
|
// Add the validated profile
|
|||
|
|
config->addOptimizationProfile(optProfile);
|
|||
|
|
|
|||
|
|
int32_t numProfiles = config->getNbOptimizationProfiles();
|
|||
|
|
std::cout << "\n✓ Optimization profile added successfully" << std::endl;
|
|||
|
|
std::cout << " Total profiles in config: " << numProfiles << std::endl;
|
|||
|
|
|
|||
|
|
if (doesSupportDynamicBatch && m_options.maxBatchSize > 1) {
|
|||
|
|
std::cout << " ✓ Profile covers DYNAMIC batch range: 1 to " << m_options.maxBatchSize << std::endl;
|
|||
|
|
}
|
|||
|
|
else {
|
|||
|
|
std::cout << " • Profile has FIXED batch size: " << m_options.maxBatchSize << std::endl;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// ============================================================================
|
|||
|
|
// PRECISION CONFIGURATION
|
|||
|
|
// ============================================================================
|
|||
|
|
|
|||
|
|
std::cout << "\n========================================" << std::endl;
|
|||
|
|
std::cout << "Configuring Precision" << std::endl;
|
|||
|
|
std::cout << "========================================" << std::endl;
|
|||
|
|
|
|||
|
|
if (m_options.precision == ANSCENTER::Precision::FP16) {
|
|||
|
|
if (!builder->platformHasFastFp16()) {
|
|||
|
|
std::cout << "Error: GPU does not support FP16 precision" << std::endl;
|
|||
|
|
return false;
|
|||
|
|
}
|
|||
|
|
config->setFlag(nvinfer1::BuilderFlag::kFP16);
|
|||
|
|
std::cout << "FP16 precision enabled" << std::endl;
|
|||
|
|
|
|||
|
|
// Mixed precision safety: force numerically sensitive layers to FP32.
|
|||
|
|
// Some models (e.g. PP-OCRv5 det) produce NaN when certain layers
|
|||
|
|
// run in FP16 due to overflow in intermediate accumulators. Forcing
|
|||
|
|
// these layers to FP32 has negligible performance impact while
|
|||
|
|
// preventing NaN corruption.
|
|||
|
|
//
|
|||
|
|
// Targeted layer types:
|
|||
|
|
// - kREDUCE : accumulation overflows FP16 max (65504)
|
|||
|
|
// - kELEMENTWISE/Pow: large intermediate values
|
|||
|
|
// - kNORMALIZATION : mean/variance reduction + 1/sqrt overflow
|
|||
|
|
// - kSOFTMAX : exp() extremely sensitive to precision
|
|||
|
|
// - kACTIVATION/Sigmoid: 1/(1+exp(-x)) overflows for large |x|
|
|||
|
|
// - kUNARY/Exp,Log : exp overflows for x>~11, log underflows
|
|||
|
|
//
|
|||
|
|
// IMPORTANT: setPrecision() is only a HINT without kOBEY_PRECISION_CONSTRAINTS.
|
|||
|
|
// We must set this flag so TRT strictly respects our per-layer FP32 overrides.
|
|||
|
|
// (kPREFER_PRECISION_CONSTRAINTS is deprecated/no-op in TRT 10.12+;
|
|||
|
|
// kOBEY means build FAILS if no FP32 kernel exists — better than silent NaN.)
|
|||
|
|
int fp32Overrides = 0;
|
|||
|
|
const int numLayers = network->getNbLayers();
|
|||
|
|
|
|||
|
|
// --- Diagnostic: enumerate all layer types in this network ---
|
|||
|
|
std::map<std::string, int> layerTypeCounts;
|
|||
|
|
auto layerTypeName = [](nvinfer1::LayerType t) -> std::string {
|
|||
|
|
switch (t) {
|
|||
|
|
case nvinfer1::LayerType::kCONVOLUTION: return "Convolution";
|
|||
|
|
case nvinfer1::LayerType::kCAST: return "Cast";
|
|||
|
|
case nvinfer1::LayerType::kACTIVATION: return "Activation";
|
|||
|
|
case nvinfer1::LayerType::kPOOLING: return "Pooling";
|
|||
|
|
case nvinfer1::LayerType::kLRN: return "LRN";
|
|||
|
|
case nvinfer1::LayerType::kSCALE: return "Scale";
|
|||
|
|
case nvinfer1::LayerType::kSOFTMAX: return "Softmax";
|
|||
|
|
case nvinfer1::LayerType::kDECONVOLUTION: return "Deconvolution";
|
|||
|
|
case nvinfer1::LayerType::kCONCATENATION: return "Concatenation";
|
|||
|
|
case nvinfer1::LayerType::kELEMENTWISE: return "ElementWise";
|
|||
|
|
case nvinfer1::LayerType::kPLUGIN: return "Plugin";
|
|||
|
|
case nvinfer1::LayerType::kUNARY: return "Unary";
|
|||
|
|
case nvinfer1::LayerType::kPADDING: return "Padding";
|
|||
|
|
case nvinfer1::LayerType::kSHUFFLE: return "Shuffle";
|
|||
|
|
case nvinfer1::LayerType::kREDUCE: return "Reduce";
|
|||
|
|
case nvinfer1::LayerType::kTOPK: return "TopK";
|
|||
|
|
case nvinfer1::LayerType::kGATHER: return "Gather";
|
|||
|
|
case nvinfer1::LayerType::kMATRIX_MULTIPLY: return "MatrixMultiply";
|
|||
|
|
case nvinfer1::LayerType::kCONSTANT: return "Constant";
|
|||
|
|
case nvinfer1::LayerType::kIDENTITY: return "Identity";
|
|||
|
|
case nvinfer1::LayerType::kSLICE: return "Slice";
|
|||
|
|
case nvinfer1::LayerType::kSHAPE: return "Shape";
|
|||
|
|
case nvinfer1::LayerType::kRESIZE: return "Resize";
|
|||
|
|
case nvinfer1::LayerType::kSELECT: return "Select";
|
|||
|
|
case nvinfer1::LayerType::kFILL: return "Fill";
|
|||
|
|
case nvinfer1::LayerType::kQUANTIZE: return "Quantize";
|
|||
|
|
case nvinfer1::LayerType::kDEQUANTIZE: return "Dequantize";
|
|||
|
|
case nvinfer1::LayerType::kSCATTER: return "Scatter";
|
|||
|
|
case nvinfer1::LayerType::kEINSUM: return "Einsum";
|
|||
|
|
case nvinfer1::LayerType::kGRID_SAMPLE: return "GridSample";
|
|||
|
|
case nvinfer1::LayerType::kNMS: return "NMS";
|
|||
|
|
case nvinfer1::LayerType::kNORMALIZATION: return "Normalization";
|
|||
|
|
case nvinfer1::LayerType::kSQUEEZE: return "Squeeze";
|
|||
|
|
case nvinfer1::LayerType::kUNSQUEEZE: return "Unsqueeze";
|
|||
|
|
default: return "Unknown(" + std::to_string(static_cast<int>(t)) + ")";
|
|||
|
|
}
|
|||
|
|
};
|
|||
|
|
|
|||
|
|
for (int i = 0; i < numLayers; ++i) {
|
|||
|
|
auto* layer = network->getLayer(i);
|
|||
|
|
const auto ltype = layer->getType();
|
|||
|
|
|
|||
|
|
bool needsFP32 = false;
|
|||
|
|
|
|||
|
|
switch (ltype) {
|
|||
|
|
case nvinfer1::LayerType::kREDUCE:
|
|||
|
|
needsFP32 = true;
|
|||
|
|
break;
|
|||
|
|
case nvinfer1::LayerType::kELEMENTWISE:
|
|||
|
|
{
|
|||
|
|
// Only force Pow to FP32; Add/Mul/etc. are fine in FP16
|
|||
|
|
auto* ew = static_cast<nvinfer1::IElementWiseLayer*>(layer);
|
|||
|
|
if (ew->getOperation() == nvinfer1::ElementWiseOperation::kPOW) {
|
|||
|
|
needsFP32 = true;
|
|||
|
|
}
|
|||
|
|
break;
|
|||
|
|
}
|
|||
|
|
case nvinfer1::LayerType::kNORMALIZATION:
|
|||
|
|
needsFP32 = true;
|
|||
|
|
break;
|
|||
|
|
case nvinfer1::LayerType::kSOFTMAX:
|
|||
|
|
needsFP32 = true;
|
|||
|
|
break;
|
|||
|
|
case nvinfer1::LayerType::kACTIVATION:
|
|||
|
|
{
|
|||
|
|
// Sigmoid is 1/(1+exp(-x)) — exp overflows FP16 for large |x|
|
|||
|
|
auto* act = static_cast<nvinfer1::IActivationLayer*>(layer);
|
|||
|
|
if (act->getActivationType() == nvinfer1::ActivationType::kSIGMOID) {
|
|||
|
|
needsFP32 = true;
|
|||
|
|
}
|
|||
|
|
break;
|
|||
|
|
}
|
|||
|
|
case nvinfer1::LayerType::kUNARY:
|
|||
|
|
{
|
|||
|
|
// Exp overflows FP16 for x > ~11; Log underflows for tiny values
|
|||
|
|
auto* un = static_cast<nvinfer1::IUnaryLayer*>(layer);
|
|||
|
|
const auto op = un->getOperation();
|
|||
|
|
if (op == nvinfer1::UnaryOperation::kEXP ||
|
|||
|
|
op == nvinfer1::UnaryOperation::kLOG) {
|
|||
|
|
needsFP32 = true;
|
|||
|
|
}
|
|||
|
|
break;
|
|||
|
|
}
|
|||
|
|
default:
|
|||
|
|
break;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Track layer type for diagnostic summary
|
|||
|
|
std::string name = layerTypeName(ltype);
|
|||
|
|
if (needsFP32) name += " [FP32]";
|
|||
|
|
layerTypeCounts[name]++;
|
|||
|
|
|
|||
|
|
if (needsFP32) {
|
|||
|
|
layer->setPrecision(nvinfer1::DataType::kFLOAT);
|
|||
|
|
for (int o = 0; o < layer->getNbOutputs(); ++o) {
|
|||
|
|
layer->setOutputType(o, nvinfer1::DataType::kFLOAT);
|
|||
|
|
}
|
|||
|
|
++fp32Overrides;
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Print layer type summary
|
|||
|
|
std::cout << " Network layer types (" << numLayers << " total):" << std::endl;
|
|||
|
|
for (const auto& kv : layerTypeCounts) {
|
|||
|
|
std::cout << " " << kv.first << ": " << kv.second << std::endl;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
if (fp32Overrides > 0) {
|
|||
|
|
// Enforce per-layer precision constraints — without this flag,
|
|||
|
|
// setPrecision(kFLOAT) is merely a hint that TRT can ignore.
|
|||
|
|
config->setFlag(nvinfer1::BuilderFlag::kOBEY_PRECISION_CONSTRAINTS);
|
|||
|
|
std::cout << " Mixed precision: " << fp32Overrides
|
|||
|
|
<< " / " << numLayers
|
|||
|
|
<< " layers forced to FP32"
|
|||
|
|
<< std::endl;
|
|||
|
|
std::cout << " kOBEY_PRECISION_CONSTRAINTS enabled to enforce FP32 on marked layers"
|
|||
|
|
<< std::endl;
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
else if (m_options.precision == ANSCENTER::Precision::INT8) {
|
|||
|
|
if (numInputs > 1) {
|
|||
|
|
std::cout << "Error: This implementation currently only supports INT8 for single input models" << std::endl;
|
|||
|
|
return false;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
if (!builder->platformHasFastInt8()) {
|
|||
|
|
std::cout << "Error: GPU does not support INT8 precision" << std::endl;
|
|||
|
|
return false;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
if (m_options.calibrationDataDirectoryPath.empty()) {
|
|||
|
|
std::cout << "Error: INT8 precision requires calibration data directory path" << std::endl;
|
|||
|
|
return false;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
config->setFlag(nvinfer1::BuilderFlag::kINT8);
|
|||
|
|
std::cout << "INT8 precision enabled" << std::endl;
|
|||
|
|
|
|||
|
|
const auto input = network->getInput(0);
|
|||
|
|
const auto inputName = input->getName();
|
|||
|
|
const auto inputDims = input->getDimensions();
|
|||
|
|
const auto calibrationFileName = currentEngineName + ".calibration";
|
|||
|
|
|
|||
|
|
m_calibrator = std::make_unique<Int8EntropyCalibrator2>(m_options.calibrationBatchSize, inputDims.d[3], inputDims.d[2],
|
|||
|
|
m_options.calibrationDataDirectoryPath, calibrationFileName, inputName,
|
|||
|
|
subVals, divVals, normalize);
|
|||
|
|
config->setInt8Calibrator(m_calibrator.get());
|
|||
|
|
}
|
|||
|
|
else {
|
|||
|
|
// FP32 mode - do NOT enable kFP16 flag; some models (e.g. PP-OCRv5 det)
|
|||
|
|
// produce NaN when TRT silently promotes layers to FP16.
|
|||
|
|
std::cout << "FP32 precision (strict, no FP16 fallback)" << std::endl;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// ============================================================================
|
|||
|
|
// BUILD ENGINE
|
|||
|
|
// ============================================================================
|
|||
|
|
|
|||
|
|
std::cout << "\n========================================" << std::endl;
|
|||
|
|
std::cout << "Building Engine" << std::endl;
|
|||
|
|
std::cout << "========================================" << std::endl;
|
|||
|
|
|
|||
|
|
cudaStream_t profileStream;
|
|||
|
|
Util::checkCudaErrorCode(cudaStreamCreate(&profileStream));
|
|||
|
|
config->setProfileStream(profileStream);
|
|||
|
|
|
|||
|
|
std::cout << "Building engine... This may take several minutes." << std::endl;
|
|||
|
|
std::cout << "Progress will be shown as layers are optimized..." << std::endl;
|
|||
|
|
|
|||
|
|
if (doesSupportDynamicBatch && m_options.maxBatchSize > 1) {
|
|||
|
|
std::cout << "✓ Building with DYNAMIC batch support (1-" << m_options.maxBatchSize << ")" << std::endl;
|
|||
|
|
}
|
|||
|
|
else {
|
|||
|
|
std::cout << "• Building with FIXED batch size " << m_options.maxBatchSize << std::endl;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Build the engine (crash-safe)
|
|||
|
|
auto startTime = std::chrono::high_resolution_clock::now();
|
|||
|
|
unsigned long sehCodeBuild = 0;
|
|||
|
|
std::unique_ptr<nvinfer1::IHostMemory> plan{
|
|||
|
|
buildSerializedNetworkSafe(builder.get(), *network, *config, &sehCodeBuild)
|
|||
|
|
};
|
|||
|
|
auto endTime = std::chrono::high_resolution_clock::now();
|
|||
|
|
|
|||
|
|
if (sehCodeBuild != 0) {
|
|||
|
|
std::cout << "\n========================================" << std::endl;
|
|||
|
|
std::cout << "Build CRASHED!" << std::endl;
|
|||
|
|
std::cout << "========================================" << std::endl;
|
|||
|
|
std::cout << "[Engine] FATAL: buildSerializedNetwork crashed ("
|
|||
|
|
<< formatCrashCode(sehCodeBuild) << ")" << std::endl;
|
|||
|
|
std::cout << "[Engine] This typically indicates insufficient GPU memory or a driver crash." << std::endl;
|
|||
|
|
Util::checkCudaErrorCode(cudaStreamDestroy(profileStream));
|
|||
|
|
return false;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
if (!plan) {
|
|||
|
|
std::cout << "\n========================================" << std::endl;
|
|||
|
|
std::cout << "Build Failed!" << std::endl;
|
|||
|
|
std::cout << "========================================" << std::endl;
|
|||
|
|
std::cout << "Error: Failed to build engine." << std::endl;
|
|||
|
|
Util::checkCudaErrorCode(cudaStreamDestroy(profileStream));
|
|||
|
|
return false;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
auto buildTime = std::chrono::duration_cast<std::chrono::seconds>(endTime - startTime).count();
|
|||
|
|
std::cout << "\n========================================" << std::endl;
|
|||
|
|
std::cout << "Build Successful!" << std::endl;
|
|||
|
|
std::cout << "========================================" << std::endl;
|
|||
|
|
std::cout << "Build time: " << buildTime << " seconds (" << buildTime / 60 << " minutes)" << std::endl;
|
|||
|
|
|
|||
|
|
// Write the engine to disk.
|
|||
|
|
// Re-compute the filename because build() may have capped maxBatchSize
|
|||
|
|
// (e.g. b32 -> b8), so the saved file must match the actual config.
|
|||
|
|
const auto actualEngineName = serializeEngineOptions(m_options, onnxModelPath);
|
|||
|
|
const auto enginePath = std::filesystem::path(m_options.engineFileDir) / actualEngineName;
|
|||
|
|
std::ofstream outfile(enginePath, std::ofstream::binary);
|
|||
|
|
if (!outfile) {
|
|||
|
|
std::cout << "Error: Failed to open file for writing: " << enginePath << std::endl;
|
|||
|
|
Util::checkCudaErrorCode(cudaStreamDestroy(profileStream));
|
|||
|
|
return false;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
outfile.write(reinterpret_cast<const char*>(plan->data()), plan->size());
|
|||
|
|
outfile.close();
|
|||
|
|
|
|||
|
|
std::cout << "Engine saved to: " << enginePath.string() << std::endl;
|
|||
|
|
std::cout << "Engine size: " << plan->size() / (1024 * 1024) << " MiB" << std::endl;
|
|||
|
|
|
|||
|
|
if (doesSupportDynamicBatch && m_options.maxBatchSize > 1) {
|
|||
|
|
std::cout << "✓ Engine supports DYNAMIC batch sizes: 1 to " << m_options.maxBatchSize << std::endl;
|
|||
|
|
}
|
|||
|
|
else {
|
|||
|
|
std::cout << "• Engine supports FIXED batch size: " << m_options.maxBatchSize << std::endl;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Save timing cache
|
|||
|
|
auto timingCacheFromConfig = config->getTimingCache();
|
|||
|
|
if (timingCacheFromConfig) {
|
|||
|
|
auto timingCacheData = timingCacheFromConfig->serialize();
|
|||
|
|
if (timingCacheData) {
|
|||
|
|
std::ofstream timingCacheOut(timingCachePath, std::ios::binary);
|
|||
|
|
if (timingCacheOut) {
|
|||
|
|
timingCacheOut.write(static_cast<const char*>(timingCacheData->data()), timingCacheData->size());
|
|||
|
|
timingCacheOut.close();
|
|||
|
|
std::cout << "Timing cache saved to: " << timingCachePath << std::endl;
|
|||
|
|
std::cout << " Cache size: " << timingCacheData->size() / 1024 << " KiB" << std::endl;
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
Util::checkCudaErrorCode(cudaStreamDestroy(profileStream));
|
|||
|
|
|
|||
|
|
std::cout << "\n========================================" << std::endl;
|
|||
|
|
std::cout << "Build Complete!" << std::endl;
|
|||
|
|
std::cout << "========================================" << std::endl;
|
|||
|
|
|
|||
|
|
if (doesSupportDynamicBatch && m_options.maxBatchSize > 1) {
|
|||
|
|
std::cout << "\n✓ Engine supports batch inference (1-" << m_options.maxBatchSize << " images)" << std::endl;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
return true;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// ============================================================================
// buildSafe()
//
// SEH wrapper around build(). Cannot use __try in a function with C++
// destructors, so the actual build() call is forwarded through a plain-C
// function pointer via callBoolFuncSafe().
// ============================================================================
|
|||
|
|
// Type-erased argument pack for the crash-safe build trampoline.
// callBoolFuncSafe() can only forward a plain `void*` across the SEH /
// sigsetjmp boundary, so the C++ arguments to Engine<T>::build() are
// flattened into raw pointers here and reconstructed inside
// buildSafe_trampoline<T>(). All pointers are non-owning and must stay
// valid for the duration of the protected call.
struct BuildSafeCtx_Base {
    void* enginePtr;        // Engine<T>* (T is recovered by the templated trampoline)
    const char* onnxPath;   // model path bytes; paired with onnxPathLen (not assumed NUL-terminated)
    size_t onnxPathLen;
    const float* subVals;   // points at 3 floats (per-channel subtract values)
    const float* divVals;   // points at 3 floats (per-channel divide values)
    bool normalize;
    bool result;            // NOTE(review): initialized to false by buildSafe() but never
                            // read back — the outcome flows through callBoolFuncSafe()'s
                            // return value instead; presumably vestigial — confirm
};
|
|||
|
|
|
|||
|
|
template <typename T>
|
|||
|
|
static bool buildSafe_trampoline(void* ctx) {
|
|||
|
|
auto* c = static_cast<BuildSafeCtx_Base*>(ctx);
|
|||
|
|
auto* engine = static_cast<Engine<T>*>(c->enginePtr);
|
|||
|
|
std::string path(c->onnxPath, c->onnxPathLen);
|
|||
|
|
std::array<float, 3> sub = { c->subVals[0], c->subVals[1], c->subVals[2] };
|
|||
|
|
std::array<float, 3> div = { c->divVals[0], c->divVals[1], c->divVals[2] };
|
|||
|
|
return engine->build(path, sub, div, c->normalize);
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
template <typename T>
|
|||
|
|
bool Engine<T>::buildSafe(std::string onnxModelPath,
|
|||
|
|
const std::array<float, 3>& subVals,
|
|||
|
|
const std::array<float, 3>& divVals,
|
|||
|
|
bool normalize,
|
|||
|
|
unsigned long* outSehCode)
|
|||
|
|
{
|
|||
|
|
BuildSafeCtx_Base ctx;
|
|||
|
|
ctx.enginePtr = this;
|
|||
|
|
ctx.onnxPath = onnxModelPath.c_str();
|
|||
|
|
ctx.onnxPathLen = onnxModelPath.size();
|
|||
|
|
ctx.subVals = subVals.data();
|
|||
|
|
ctx.divVals = divVals.data();
|
|||
|
|
ctx.normalize = normalize;
|
|||
|
|
ctx.result = false;
|
|||
|
|
|
|||
|
|
bool ok = callBoolFuncSafe(&buildSafe_trampoline<T>, &ctx, outSehCode);
|
|||
|
|
return ok;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// ============================================================================
// buildWithRetry()
//
// Wraps build() with auto-retry for dynamic spatial dimension models.
// Pre-analyzes the ONNX model to detect dynamic H/W dims, then builds a
// fallback chain (max → 75% → 56% → ... → 640 → 320). Each candidate
// calls build(), which checks for a cached engine first (fast) then tries
// building if no cache exists. Fixed-spatial models skip retry.
// ============================================================================
|
|||
|
|
template <typename T>
|
|||
|
|
bool Engine<T>::buildWithRetry(std::string onnxModelPath,
|
|||
|
|
const std::array<float, 3>& subVals,
|
|||
|
|
const std::array<float, 3>& divVals,
|
|||
|
|
bool normalize)
|
|||
|
|
{
|
|||
|
|
// -- Quick pre-analysis: detect dynamic spatial dims in ONNX ---------------
|
|||
|
|
bool hasDynamicSpatial = false;
|
|||
|
|
int onnxFixedH = 0, onnxFixedW = 0; // 0 = dynamic (-1 in ONNX)
|
|||
|
|
|
|||
|
|
if (m_options.maxInputHeight > 0 && m_options.maxInputWidth > 0) {
|
|||
|
|
auto tempBuilder = std::unique_ptr<nvinfer1::IBuilder>(
|
|||
|
|
nvinfer1::createInferBuilder(m_logger));
|
|||
|
|
auto tempNetwork = std::unique_ptr<nvinfer1::INetworkDefinition>(TRT_CREATE_NETWORK(tempBuilder));
|
|||
|
|
auto tempParser = std::unique_ptr<nvonnxparser::IParser>(
|
|||
|
|
nvonnxparser::createParser(*tempNetwork, m_logger));
|
|||
|
|
|
|||
|
|
std::ifstream onnxFile(onnxModelPath, std::ios::binary | std::ios::ate);
|
|||
|
|
if (onnxFile.good()) {
|
|||
|
|
std::streamsize onnxSize = onnxFile.tellg();
|
|||
|
|
onnxFile.seekg(0, std::ios::beg);
|
|||
|
|
std::vector<char> onnxBuffer(onnxSize);
|
|||
|
|
if (onnxFile.read(onnxBuffer.data(), onnxSize)) {
|
|||
|
|
unsigned long sehRetryParse = 0;
|
|||
|
|
bool retryParsed = parseOnnxModelSafe(tempParser.get(),
|
|||
|
|
onnxBuffer.data(), onnxBuffer.size(), &sehRetryParse);
|
|||
|
|
if (sehRetryParse != 0) {
|
|||
|
|
std::cout << "[Engine] WARNING: ONNX pre-analysis parse crashed in "
|
|||
|
|
<< "buildWithRetry (" << formatCrashCode(sehRetryParse)
|
|||
|
|
<< "). Skipping spatial analysis." << std::endl;
|
|||
|
|
// hasDynamicSpatial stays false → single build() attempt
|
|||
|
|
}
|
|||
|
|
else if (retryParsed && tempNetwork->getNbInputs() > 0) {
|
|||
|
|
auto dims = tempNetwork->getInput(0)->getDimensions();
|
|||
|
|
if (dims.nbDims >= 4) {
|
|||
|
|
if (dims.d[2] == -1 || dims.d[3] == -1)
|
|||
|
|
hasDynamicSpatial = true;
|
|||
|
|
onnxFixedH = (dims.d[2] != -1) ? dims.d[2] : 0;
|
|||
|
|
onnxFixedW = (dims.d[3] != -1) ? dims.d[3] : 0;
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// -- Fixed-spatial or no dynamic dims: single build attempt ----------------
|
|||
|
|
if (!hasDynamicSpatial) {
|
|||
|
|
unsigned long sehBuild = 0;
|
|||
|
|
bool ok = buildSafe(onnxModelPath, subVals, divVals, normalize, &sehBuild);
|
|||
|
|
if (sehBuild != 0) {
|
|||
|
|
std::cout << "[Engine] FATAL: build() crashed in buildWithRetry ("
|
|||
|
|
<< formatCrashCode(sehBuild) << ")" << std::endl;
|
|||
|
|
return false;
|
|||
|
|
}
|
|||
|
|
return ok;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// -- Dynamic spatial dims: build with fallback chain ----------------------
|
|||
|
|
const bool dynamicH = (onnxFixedH == 0);
|
|||
|
|
const bool dynamicW = (onnxFixedW == 0);
|
|||
|
|
|
|||
|
|
const int origMaxH = m_options.maxInputHeight;
|
|||
|
|
const int origMaxW = m_options.maxInputWidth;
|
|||
|
|
const int origOptH = m_options.optInputHeight;
|
|||
|
|
const int origOptW = m_options.optInputWidth;
|
|||
|
|
const int origMinH = m_options.minInputHeight;
|
|||
|
|
const int origMinW = m_options.minInputWidth;
|
|||
|
|
|
|||
|
|
int dynMaxH = dynamicH ? origMaxH : 0;
|
|||
|
|
int dynMaxW = dynamicW ? origMaxW : 0;
|
|||
|
|
int maxDynDim = std::max(dynMaxH, dynMaxW);
|
|||
|
|
|
|||
|
|
// Build fallback chain: max → 75% → 56% → ... → 640 → 320
|
|||
|
|
std::vector<int> candidates;
|
|||
|
|
for (int s = maxDynDim; s >= 320; s = (s * 3) / 4) {
|
|||
|
|
s = (s / 32) * 32;
|
|||
|
|
if (candidates.empty() || candidates.back() != s)
|
|||
|
|
candidates.push_back(s);
|
|||
|
|
}
|
|||
|
|
if (candidates.back() > 640) candidates.push_back(640);
|
|||
|
|
if (candidates.back() > 320) candidates.push_back(320);
|
|||
|
|
|
|||
|
|
// Helper: configure m_options for a given candidate
|
|||
|
|
auto setCandidateOptions = [&](int candidate) {
|
|||
|
|
float scale = static_cast<float>(candidate) / maxDynDim;
|
|||
|
|
m_options.maxInputHeight = dynamicH
|
|||
|
|
? std::max(32, (static_cast<int>(origMaxH * scale) / 32) * 32)
|
|||
|
|
: onnxFixedH;
|
|||
|
|
m_options.maxInputWidth = dynamicW
|
|||
|
|
? std::max(32, (static_cast<int>(origMaxW * scale) / 32) * 32)
|
|||
|
|
: onnxFixedW;
|
|||
|
|
m_options.minInputHeight = dynamicH
|
|||
|
|
? std::min(origMinH, m_options.maxInputHeight) : onnxFixedH;
|
|||
|
|
m_options.minInputWidth = dynamicW
|
|||
|
|
? std::min(origMinW, m_options.maxInputWidth) : onnxFixedW;
|
|||
|
|
m_options.optInputHeight = dynamicH
|
|||
|
|
? std::min(origOptH, m_options.maxInputHeight) : onnxFixedH;
|
|||
|
|
m_options.optInputWidth = dynamicW
|
|||
|
|
? std::min(origOptW, m_options.maxInputWidth) : onnxFixedW;
|
|||
|
|
};
|
|||
|
|
|
|||
|
|
// Try each candidate (largest first). build() checks cache before
|
|||
|
|
// building, so previously cached smaller engines are found quickly.
|
|||
|
|
for (size_t attempt = 0; attempt < candidates.size(); ++attempt) {
|
|||
|
|
setCandidateOptions(candidates[attempt]);
|
|||
|
|
|
|||
|
|
std::cout << "[Engine] buildWithRetry attempt " << (attempt + 1)
|
|||
|
|
<< "/" << candidates.size() << " (max "
|
|||
|
|
<< m_options.maxInputHeight << "x"
|
|||
|
|
<< m_options.maxInputWidth << ")" << std::endl;
|
|||
|
|
|
|||
|
|
{
|
|||
|
|
unsigned long sehAttempt = 0;
|
|||
|
|
bool attemptOk = buildSafe(onnxModelPath, subVals, divVals, normalize, &sehAttempt);
|
|||
|
|
if (sehAttempt != 0) {
|
|||
|
|
std::cout << "[Engine] Build crashed ("
|
|||
|
|
<< formatCrashCode(sehAttempt) << ") at max "
|
|||
|
|
<< m_options.maxInputHeight << "x"
|
|||
|
|
<< m_options.maxInputWidth << std::endl;
|
|||
|
|
// CUDA context may be corrupted — no point retrying
|
|||
|
|
return false;
|
|||
|
|
}
|
|||
|
|
if (attemptOk) {
|
|||
|
|
if (attempt > 0) {
|
|||
|
|
std::cout << "[Engine] Built with reduced max "
|
|||
|
|
<< m_options.maxInputHeight << "x"
|
|||
|
|
<< m_options.maxInputWidth
|
|||
|
|
<< " (requested " << origMaxH << "x" << origMaxW
|
|||
|
|
<< " exceeded GPU capacity)" << std::endl;
|
|||
|
|
}
|
|||
|
|
return true;
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
if (attempt + 1 < candidates.size()) {
|
|||
|
|
std::cout << "[Engine] Build failed at max "
|
|||
|
|
<< m_options.maxInputHeight << "x"
|
|||
|
|
<< m_options.maxInputWidth
|
|||
|
|
<< ", trying smaller..." << std::endl;
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// All candidates exhausted — restore original options for error reporting
|
|||
|
|
m_options.maxInputHeight = origMaxH;
|
|||
|
|
m_options.maxInputWidth = origMaxW;
|
|||
|
|
m_options.optInputHeight = origOptH;
|
|||
|
|
m_options.optInputWidth = origOptW;
|
|||
|
|
m_options.minInputHeight = origMinH;
|
|||
|
|
m_options.minInputWidth = origMinW;
|
|||
|
|
|
|||
|
|
std::cout << "[Engine] buildWithRetry: all spatial dimension fallbacks "
|
|||
|
|
<< "exhausted (tried " << candidates.size() << " candidates from "
|
|||
|
|
<< candidates.front() << " down to " << candidates.back() << ")"
|
|||
|
|
<< std::endl;
|
|||
|
|
return false;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// ============================================================================
// 6-param pool overloads
//
// These are non-virtual additions to Engine<T> that let callers opt into
// multi-GPU pool mode simply by supplying one extra argument:
//
//   m_trtEngine->buildLoadNetwork(path, sub, div, norm);     // single-GPU
//   m_trtEngine->buildLoadNetwork(path, sub, div, norm, -1); // pool
//
// When maxSlotsPerGpu == 1 the call delegates to the existing 4-param
// single-GPU implementation -- zero behavioural difference.
// Any other value routes through loadSlots() which fills all GPUs.
// ============================================================================
|
|||
|
|
|
|||
|
|
template <typename T>
|
|||
|
|
bool Engine<T>::buildLoadNetwork(
|
|||
|
|
std::string onnxModelPath,
|
|||
|
|
const std::array<float, 3>& subVals,
|
|||
|
|
const std::array<float, 3>& divVals,
|
|||
|
|
bool normalize,
|
|||
|
|
int maxSlotsPerGpu,
|
|||
|
|
double memSafetyFactor)
|
|||
|
|
{
|
|||
|
|
// Force single-GPU when: maxSlotsPerGpu==0 (optimizer bypass),
|
|||
|
|
// per-instance forceNoPool, global bypass (OptimizeModelStr),
|
|||
|
|
// exported g_forceNoPool, OR single-GPU system with maxSlotsPerGpu==1.
|
|||
|
|
//
|
|||
|
|
// On a single-GPU system, the pool with 1 slot adds contention overhead
|
|||
|
|
// (2s timeout + reject) without any multi-GPU benefit. The CUDA stream
|
|||
|
|
// handles serialization naturally in single-GPU mode.
|
|||
|
|
{
|
|||
|
|
extern std::atomic<bool> g_forceNoPool;
|
|||
|
|
int gpuCount = 0;
|
|||
|
|
cudaGetDeviceCount(&gpuCount);
|
|||
|
|
bool singleGpuNoElastic = (gpuCount <= 1 && maxSlotsPerGpu == 1);
|
|||
|
|
bool noPool = (maxSlotsPerGpu == 0) || m_forceNoPool ||
|
|||
|
|
g_forceNoPool.load(std::memory_order_relaxed) ||
|
|||
|
|
TRTEngineCache::globalBypass().load(std::memory_order_relaxed) ||
|
|||
|
|
singleGpuNoElastic;
|
|||
|
|
if (noPool) {
|
|||
|
|
std::cout << "Info: buildLoadNetwork -- single-GPU forced (maxSlots=" << maxSlotsPerGpu
|
|||
|
|
<< ", forceNoPool=" << m_forceNoPool
|
|||
|
|
<< ", g_forceNoPool=" << g_forceNoPool.load()
|
|||
|
|
<< ", gpuCount=" << gpuCount << ")" << std::endl;
|
|||
|
|
return buildLoadNetwork(onnxModelPath, subVals, divVals, normalize);
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Multi-GPU pool path. m_options carries the base configuration that was
|
|||
|
|
// set either at construction (Engine(options)) or by initializePool().
|
|||
|
|
std::cout << "Info: buildLoadNetwork -- activating multi-GPU pool"
|
|||
|
|
<< " (maxSlotsPerGpu=" << maxSlotsPerGpu
|
|||
|
|
<< ", memSafetyFactor=" << memSafetyFactor << ")" << std::endl;
|
|||
|
|
|
|||
|
|
return loadSlots(m_options, onnxModelPath,
|
|||
|
|
subVals, divVals, normalize,
|
|||
|
|
/*fromOnnx=*/true,
|
|||
|
|
maxSlotsPerGpu, memSafetyFactor);
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
template <typename T>
|
|||
|
|
bool Engine<T>::loadNetwork(
|
|||
|
|
std::string trtModelPath,
|
|||
|
|
const std::array<float, 3>& subVals,
|
|||
|
|
const std::array<float, 3>& divVals,
|
|||
|
|
bool normalize,
|
|||
|
|
int maxSlotsPerGpu,
|
|||
|
|
double memSafetyFactor)
|
|||
|
|
{
|
|||
|
|
{
|
|||
|
|
extern std::atomic<bool> g_forceNoPool;
|
|||
|
|
int gpuCount = 0;
|
|||
|
|
cudaGetDeviceCount(&gpuCount);
|
|||
|
|
bool singleGpuNoElastic = (gpuCount <= 1 && maxSlotsPerGpu == 1);
|
|||
|
|
bool noPool = (maxSlotsPerGpu == 0) || m_forceNoPool ||
|
|||
|
|
g_forceNoPool.load(std::memory_order_relaxed) ||
|
|||
|
|
TRTEngineCache::globalBypass().load(std::memory_order_relaxed) ||
|
|||
|
|
singleGpuNoElastic;
|
|||
|
|
if (noPool) {
|
|||
|
|
std::cout << "Info: loadNetwork -- single-GPU forced (maxSlots=" << maxSlotsPerGpu
|
|||
|
|
<< ", g_forceNoPool=" << g_forceNoPool.load()
|
|||
|
|
<< ", gpuCount=" << gpuCount << ")" << std::endl;
|
|||
|
|
return loadNetwork(trtModelPath, subVals, divVals, normalize);
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Multi-GPU pool path.
|
|||
|
|
std::cout << "Info: loadNetwork -- activating multi-GPU pool"
|
|||
|
|
<< " (maxSlotsPerGpu=" << maxSlotsPerGpu
|
|||
|
|
<< ", memSafetyFactor=" << memSafetyFactor << ")" << std::endl;
|
|||
|
|
|
|||
|
|
return loadSlots(m_options, trtModelPath,
|
|||
|
|
subVals, divVals, normalize,
|
|||
|
|
/*fromOnnx=*/false,
|
|||
|
|
maxSlotsPerGpu, memSafetyFactor);
|
|||
|
|
}
|