// ============================================================================
// EngineRunInference.inl
// ANSCORE/engines/TensorRTAPI/include/engine/EngineRunInference.inl
// Inference-path implementation for Engine<T>: warmUp() and runInference().
// ============================================================================
#pragma once
#include <chrono>
#include <cstdint>
#include <cstring>
#include <filesystem>
#include <iomanip>
#include <iostream>
#include <mutex>
#include <string>
#include <vector>
#include "TRTCompat.h"
// Process-wide mutex for CUDA graph capture.
// TRT's enqueueV3 uses shared internal resources (workspace, memory pools)
// at the CUDA context level. When two Engine instances on the same GPU
// capture graphs concurrently, these cross-stream dependencies violate
// graph capture rules ("operation not permitted when stream is capturing").
// This mutex serialises graph captures across all Engine<T> instances —
// subsequent cudaGraphLaunch calls are still concurrent.
//
// Declared `inline` (not `static`): this .inl is a header included via
// #pragma once. A `static` function gives every translation unit its OWN
// mutex, so two Engine instances compiled into different TUs would not be
// serialised at all. An inline function has exactly one definition across
// the program, so all TUs share the single mutex.
inline std::mutex& graphCaptureMutex() {
    static std::mutex m;  // Meyers singleton: thread-safe init since C++11
    return m;
}
template <typename T>
void Engine<T>::warmUp(int iterations) {
    // Runs dummy inference passes so that kernel compilation and CUDA graph
    // capture happen here rather than on the first real inference call.
    // `iterations` controls how many passes optBatchSize receives; every other
    // batch size gets two (one to compile kernels, one to capture the graph).
    if (m_verbose) {
        std::cout << "\n========================================" << std::endl;
        std::cout << "Engine Warmup" << std::endl;
        std::cout << "========================================" << std::endl;
    }

    // Work out which batch sizes need warming.
    std::vector<int> sizesToWarm;
    if (m_options.maxBatchSize > 1) {
        if (m_verbose) {
            std::cout << "Dynamic batch engine detected (max batch: " << m_options.maxBatchSize << ")" << std::endl;
            std::cout << "Warming up common batch sizes to pre-compile kernels..." << std::endl;
        }
        // The first use of an unseen batch size pays a 100-300ms kernel
        // compilation penalty; warming the full range 1..maxBatchSize removes
        // that latency from inference time and pre-captures every CUDA graph.
        for (int b = 1; b <= m_options.maxBatchSize; ++b) {
            sizesToWarm.push_back(b);
        }
    } else {
        if (m_verbose) std::cout << "Fixed batch engine detected (batch size: " << m_options.maxBatchSize << ")" << std::endl;
        sizesToWarm.push_back(m_options.maxBatchSize);
    }

    if (m_verbose) {
        std::cout << "Batch sizes to warm up: ";
        for (size_t idx = 0; idx < sizesToWarm.size(); ++idx) {
            if (idx > 0) std::cout << ", ";
            std::cout << sizesToWarm[idx];
        }
        std::cout << std::endl;
    }

    // Warm each batch size in turn. Pass one compiles kernels, pass two
    // captures the CUDA graph; only optBatchSize runs extra passes so a
    // steady-state latency figure can be reported as a diagnostic.
    for (int curBatch : sizesToWarm) {
        const int passes = (curBatch == m_options.optBatchSize) ? iterations : 2;
        if (m_verbose) std::cout << "\nWarming up batch=" << curBatch << " (x" << passes << " iterations)..." << std::endl;

        // Synthesize mid-grey (0.5) float input images on the GPU, one batch
        // of GpuMats per input tensor.
        std::vector<std::vector<cv::cuda::GpuMat>> fakeInputs;
        for (size_t t = 0; t < m_inputDims.size(); ++t) {
            const auto& dims = m_inputDims[t];
            // Dynamic spatial dims (-1) fall back to the opt dimensions.
            const int h = (dims.d[1] > 0) ? dims.d[1] : m_options.optInputHeight;
            const int w = (dims.d[2] > 0) ? dims.d[2] : m_options.optInputWidth;
            // Build the host-side image once; each upload still produces a
            // distinct GPU allocation, matching real per-image inputs.
            cv::Mat hostImg(h, w, CV_32FC(dims.d[0]), cv::Scalar(0.5f, 0.5f, 0.5f));
            std::vector<cv::cuda::GpuMat> perTensorBatch;
            perTensorBatch.reserve(curBatch);
            for (int b = 0; b < curBatch; ++b) {
                cv::cuda::GpuMat devImg;
                devImg.upload(hostImg);
                perTensorBatch.push_back(devImg);
            }
            fakeInputs.push_back(perTensorBatch);
        }

        std::vector<std::vector<std::vector<T>>> discardedOutputs;

        // Time the first pass separately — kernel compilation lands here.
        const auto t0 = std::chrono::high_resolution_clock::now();
        const bool firstOk = runInference(fakeInputs, discardedOutputs);
        const auto t1 = std::chrono::high_resolution_clock::now();
        const auto firstMs = std::chrono::duration_cast<std::chrono::milliseconds>(t1 - t0).count();

        if (!firstOk) {
            if (m_verbose) std::cout << " ✗ ERROR: First iteration failed for batch=" << curBatch << std::endl;
            continue;
        }
        if (m_verbose) {
            std::cout << " First iteration: " << firstMs << " ms";
            if (firstMs > 100) {
                std::cout << " (kernel compilation detected)";
            }
            std::cout << std::endl;
        }

        // Remaining passes measure steady-state performance.
        if (passes > 1) {
            const auto restStart = std::chrono::high_resolution_clock::now();
            for (int pass = 1; pass < passes; ++pass) {
                if (!runInference(fakeInputs, discardedOutputs)) {
                    if (m_verbose) std::cout << " ✗ ERROR: Iteration " << pass << " failed" << std::endl;
                    break;
                }
            }
            const auto restEnd = std::chrono::high_resolution_clock::now();
            const auto restMs = std::chrono::duration_cast<std::chrono::milliseconds>(restEnd - restStart).count();
            const float meanMs = restMs / static_cast<float>(passes - 1);
            if (m_verbose) {
                std::cout << " Subsequent iterations (avg): " << std::fixed << std::setprecision(1)
                          << meanMs << " ms" << std::endl;
                if (firstMs > 100 && meanMs < firstMs * 0.5f) {
                    const float speedup = firstMs / meanMs;
                    std::cout << " ✓ Speedup after warmup: " << std::fixed << std::setprecision(1)
                              << speedup << "x faster" << std::endl;
                }
            }
        }
        if (m_verbose) std::cout << " ✓ Batch=" << curBatch << " warmed up successfully" << std::endl;
    }

    if (m_verbose) {
        std::cout << "\n========================================" << std::endl;
        std::cout << "Warmup Complete!" << std::endl;
        std::cout << "========================================" << std::endl;
        std::cout << "Kernels pre-compiled for all batch sizes." << std::endl;
        std::cout << "========================================\n" << std::endl;
    }
}
/// Runs a single synchronous inference pass.
///
/// @param inputs         One vector per input tensor; each inner vector holds
///                       one cv::cuda::GpuMat per batch element (assumed to
///                       already reside on this engine's GPU — TODO confirm
///                       callers guarantee this).
/// @param featureVectors Filled as [batch][outputIdx][element]; previous
///                       contents are discarded.
/// @return true on success; false on any validation, CUDA, or TRT failure
///         (a message is printed and, for runtime failures, logged).
///
/// Fixes vs previous revision:
///  • The graph cache was accessed through `auto& graphExec = m_graphExecs[batchSize]`
///    and that reference was read AFTER `m_graphExecs.erase(batchSize)` on the
///    capture-failure path — a dangling reference into a destroyed map node
///    (undefined behaviour). The exec handle is now held in a local variable
///    and inserted into the map only on successful instantiation.
///  • cudaGraphLaunch / cudaStreamSynchronize return codes on the graph path
///    were ignored; they are now checked and logged like the direct path.
template <typename T>
bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& inputs, std::vector<std::vector<std::vector<T>>>& featureVectors) {
    // ============================================================================
    // MULTI-GPU POOL DISPATCH
    // ============================================================================
    // If this Engine was initialised with initializePool() / initializePoolFromEngine()
    // the m_slots vector is non-empty. In that case, delegate to the pool
    // dispatcher which acquires the first idle slot and runs inference there.
    // This branch is NEVER taken for single-GPU use (buildLoadNetwork / loadNetwork).
    if (!m_slots.empty()) {
        return runInferenceFromPool(inputs, featureVectors);
    }
    // ============================================================================
    // SINGLE-ENGINE SERIALISATION
    // ============================================================================
    // The single Engine instance has shared mutable state (m_buffers, m_lastBatchSize,
    // m_inferenceStream, TRT execution context). If two LabVIEW threads call
    // runInference concurrently with different batch sizes, one will overwrite
    // the input shapes and buffers while the other is mid-inference, causing a
    // fatal "illegal memory access" that permanently corrupts the CUDA context.
    //
    // Pool-mode slots have their own busy-flag dispatch so they do NOT need this.
    std::lock_guard<std::mutex> inferenceLock(m_inferenceMutex);
    // ============================================================================
    // THREAD-SAFE GPU CONTEXT
    // ============================================================================
    // Ensure the calling thread's CUDA device matches this engine's GPU.
    // This is essential for multi-GPU round-robin: LabVIEW reuses threads
    // across tasks, so a thread that last ran inference on GPU 1 might now
    // be running a task on GPU 0. Without this, cv::cuda::GpuMat allocations
    // and kernel launches would target the wrong GPU, causing result corruption.
    // Skip cudaSetDevice if already on the correct device — under WDDM
    // with multiple GPUs each call costs 1-5ms of scheduler overhead.
    {
        int currentDev = -1;
        cudaGetDevice(&currentDev);
        if (currentDev != m_options.deviceIndex) {
            cudaSetDevice(m_options.deviceIndex);
        }
    }
    // ============================================================================
    // DEBUG: First call diagnostics (per-instance, not process-wide)
    // ============================================================================
    // NOTE(review): this block runs BEFORE the shape-setting section below, so
    // allInputDimensionsSpecified() here reflects whatever shapes loadNetwork
    // left in the context — presumably the opt profile; verify against loadNetwork.
    if (m_verbose && m_firstInferenceCall) {
        std::cout << "\n=== First runInference Call ===" << std::endl;
        std::cout << "Number of input tensors: " << inputs.size() << std::endl;
        for (size_t i = 0; i < inputs.size(); ++i) {
            std::cout << "Input " << i << " batch size: " << inputs[i].size() << std::endl;
            if (!inputs[i].empty()) {
                const auto& img = inputs[i][0];
                std::cout << " Image shape: " << img.cols << "x" << img.rows
                          << "x" << img.channels() << " (type: " << img.type() << ")" << std::endl;
            }
        }
        // Print optimization profile information
        std::cout << "\n=== Engine Profile Information ===" << std::endl;
        std::cout << "Number of optimization profiles: "
                  << m_engine->getNbOptimizationProfiles() << std::endl;
        if (m_engine->getNbOptimizationProfiles() > 0) {
            for (int profile = 0; profile < m_engine->getNbOptimizationProfiles(); ++profile) {
                std::cout << "\n--- Profile " << profile << " ---" << std::endl;
                for (size_t i = 0; i < m_IOTensorNames.size(); ++i) {
                    const char* tensorName = m_IOTensorNames[i].c_str();
                    // Check if this is an input tensor
                    auto ioMode = m_engine->getTensorIOMode(tensorName);
                    if (ioMode != nvinfer1::TensorIOMode::kINPUT) {
                        continue;
                    }
                    auto minDims = m_engine->getProfileShape(tensorName, profile,
                                                            nvinfer1::OptProfileSelector::kMIN);
                    auto optDims = m_engine->getProfileShape(tensorName, profile,
                                                            nvinfer1::OptProfileSelector::kOPT);
                    auto maxDims = m_engine->getProfileShape(tensorName, profile,
                                                            nvinfer1::OptProfileSelector::kMAX);
                    std::cout << "Tensor '" << tensorName << "' (INPUT):" << std::endl;
                    std::cout << " Min: [" << minDims.d[0];
                    for (int j = 1; j < minDims.nbDims; ++j) std::cout << "," << minDims.d[j];
                    std::cout << "]" << std::endl;
                    std::cout << " Opt: [" << optDims.d[0];
                    for (int j = 1; j < optDims.nbDims; ++j) std::cout << "," << optDims.d[j];
                    std::cout << "]" << std::endl;
                    std::cout << " Max: [" << maxDims.d[0];
                    for (int j = 1; j < maxDims.nbDims; ++j) std::cout << "," << maxDims.d[j];
                    std::cout << "]" << std::endl;
                }
            }
        }
        if (!m_context->allInputDimensionsSpecified()) {
            std::cout << "ERROR: Input dimensions not specified in context!" << std::endl;
            return false;
        }
        std::cout << "\nContext state: All dimensions specified ✓" << std::endl;
        m_firstInferenceCall = false;
    }
    // ============================================================================
    // INPUT VALIDATION
    // ============================================================================
    if (inputs.empty() || inputs[0].empty()) {
        std::cout << "Error: Empty input" << std::endl;
        return false;
    }
    const auto numInputs = m_inputDims.size();
    if (inputs.size() != numInputs) {
        std::cout << "Error: Wrong number of inputs. Expected: " << numInputs
                  << ", Got: " << inputs.size() << std::endl;
        return false;
    }
    const auto batchSize = static_cast<int32_t>(inputs[0].size());
    if (batchSize > m_options.maxBatchSize) {
        std::cout << "Error: Batch size " << batchSize << " exceeds maximum "
                  << m_options.maxBatchSize << std::endl;
        return false;
    }
    if (batchSize < 1) {
        std::cout << "Error: Batch size must be at least 1" << std::endl;
        return false;
    }
    // Validate batch size consistency across all inputs
    for (size_t i = 1; i < inputs.size(); ++i) {
        if (inputs[i].size() != static_cast<size_t>(batchSize)) {
            std::cout << "Error: Inconsistent batch sizes across inputs. Input 0: "
                      << batchSize << ", Input " << i << ": " << inputs[i].size() << std::endl;
            return false;
        }
    }
    // ============================================================================
    // STREAM GUARD
    // ============================================================================
    // m_inferenceStream is now created eagerly in loadNetwork() so it is always
    // valid here. Guard against the (unlikely) edge case where runInference is
    // called before loadNetwork succeeds.
    if (!m_streamInitialized || !m_inferenceStream) {
        std::string errMsg = "Error: Inference stream not initialised. "
                             "Call loadNetwork() / buildLoadNetwork() before runInference().";
        std::cout << errMsg << std::endl;
        logEngineEvent("[Engine] runInference: " + errMsg, true);
        return false;
    }
    // ============================================================================
    // SET INPUT SHAPES (batch size changed OR dynamic spatial dims need updating)
    // ============================================================================
    // Fast path: compute desired dims first, then compare against cached dims.
    // This avoids all TRT API calls when the shape hasn't actually changed —
    // critical for the recognizer which is called ~50-100x per image with
    // dynamic width but often the same or similar widths.
    // ============================================================================
    {
        // Lazily initialise the dims cache on first call
        if (m_lastSetInputDims.empty()) {
            m_lastSetInputDims.resize(numInputs);
            for (size_t i = 0; i < numInputs; ++i) {
                m_lastSetInputDims[i].nbDims = 0; // force mismatch on first call
            }
        }
        // Build desired dims for every input tensor (cheap — no TRT API calls)
        bool anyDimChanged = (m_lastBatchSize != batchSize);
        std::vector<nvinfer1::Dims> desiredDims(numInputs);
        for (size_t i = 0; i < numInputs; ++i) {
            nvinfer1::Dims& nd = desiredDims[i];
            nd.nbDims = 4;
            nd.d[0] = batchSize;
            nd.d[1] = m_inputDims[i].d[0]; // channels
            if (m_hasDynamicSpatialDims && !inputs[i].empty()) {
                // Dynamic H/W: take the actual size of the first image in the batch
                const auto& firstImg = inputs[i][0];
                nd.d[2] = (m_inputDims[i].d[1] == -1) ? firstImg.rows : m_inputDims[i].d[1];
                nd.d[3] = (m_inputDims[i].d[2] == -1) ? firstImg.cols : m_inputDims[i].d[2];
            } else {
                nd.d[2] = m_inputDims[i].d[1];
                nd.d[3] = m_inputDims[i].d[2];
            }
            // Compare with cached
            if (!anyDimChanged) {
                const auto& cached = m_lastSetInputDims[i];
                if (cached.nbDims != nd.nbDims ||
                    cached.d[0] != nd.d[0] || cached.d[1] != nd.d[1] ||
                    cached.d[2] != nd.d[2] || cached.d[3] != nd.d[3]) {
                    anyDimChanged = true;
                }
            }
        }
        if (anyDimChanged) {
            // === First-time diagnostics (verbose, once) ===
            const bool firstTime = !m_batchShapeChangeLogged;
            if (m_verbose && firstTime) {
                std::cout << "\nInfo: Batch size change: " << m_lastBatchSize
                          << " -> " << batchSize << std::endl;
            }
            // Set optimization profile (only when truly needed)
            if (m_engine->getNbOptimizationProfiles() > 0) {
                int currentProfile = m_context->getOptimizationProfile();
                if (currentProfile != 0 || m_lastBatchSize < 0) {
                    if (m_verbose && firstTime) {
                        std::cout << " Setting optimization profile to 0..." << std::endl;
                    }
                    if (!m_context->setOptimizationProfileAsync(0, m_inferenceStream)) {
                        std::cout << "Error: Failed to set optimization profile 0" << std::endl;
                        return false;
                    }
                    cudaError_t syncErr = cudaStreamSynchronize(m_inferenceStream);
                    if (syncErr != cudaSuccess) {
                        std::cout << "Error: Failed to sync after profile change: "
                                  << cudaGetErrorString(syncErr) << std::endl;
                        return false;
                    }
                    if (m_verbose && firstTime) {
                        std::cout << " Optimization profile set successfully" << std::endl;
                    }
                }
            }
            // Update shapes for input tensors that actually changed
            for (size_t i = 0; i < numInputs; ++i) {
                const char* tensorName = m_IOTensorNames[i].c_str();
                // Skip non-input tensors
                auto ioMode = m_engine->getTensorIOMode(tensorName);
                if (ioMode != nvinfer1::TensorIOMode::kINPUT) continue;
                const nvinfer1::Dims& newDims = desiredDims[i];
                const nvinfer1::Dims& cached = m_lastSetInputDims[i];
                // Skip this tensor if its dims haven't changed
                if (cached.nbDims == newDims.nbDims &&
                    cached.d[0] == newDims.d[0] && cached.d[1] == newDims.d[1] &&
                    cached.d[2] == newDims.d[2] && cached.d[3] == newDims.d[3]) {
                    continue;
                }
                // First-time verbose diagnostics
                if (m_verbose && firstTime) {
                    std::cout << "\n Processing tensor " << i << ": '" << tensorName << "'" << std::endl;
                    // Validate batch size range (first time only)
                    if (m_engine->getNbOptimizationProfiles() > 0) {
                        int profileIndex = m_context->getOptimizationProfile();
                        nvinfer1::Dims minDims = m_engine->getProfileShape(
                            tensorName, profileIndex, nvinfer1::OptProfileSelector::kMIN);
                        nvinfer1::Dims maxDims = m_engine->getProfileShape(
                            tensorName, profileIndex, nvinfer1::OptProfileSelector::kMAX);
                        std::cout << " Profile batch range: [" << minDims.d[0]
                                  << " to " << maxDims.d[0] << "]" << std::endl;
                        if (batchSize < minDims.d[0] || batchSize > maxDims.d[0]) {
                            std::cout << "Error: Batch size " << batchSize
                                      << " outside profile range" << std::endl;
                            return false;
                        }
                    }
                    auto currentShape = m_context->getTensorShape(tensorName);
                    std::cout << " Current context shape: [";
                    for (int j = 0; j < currentShape.nbDims; ++j) {
                        if (j > 0) std::cout << ", ";
                        std::cout << currentShape.d[j];
                    }
                    std::cout << "]" << std::endl;
                    std::cout << " Setting new shape: [" << newDims.d[0] << ", "
                              << newDims.d[1] << ", " << newDims.d[2] << ", "
                              << newDims.d[3] << "]" << std::endl;
                }
                if (!m_context->setInputShape(tensorName, newDims)) {
                    std::cout << "Error: Failed to set input shape for '" << tensorName << "'" << std::endl;
                    return false;
                }
                // Verify shape (first time only — trust the API on hot path)
                if (firstTime) {
                    auto verifyShape = m_context->getTensorShape(tensorName);
                    if (verifyShape.d[0] != batchSize) {
                        std::cout << "Error: Shape change didn't take effect. Expected batch "
                                  << batchSize << ", got " << verifyShape.d[0] << std::endl;
                        return false;
                    }
                    if (m_verbose) {
                        std::cout << " Shape updated successfully" << std::endl;
                    }
                }
                m_lastSetInputDims[i] = newDims;
            }
            // Verify all input dimensions specified (first time only)
            if (firstTime) {
                if (!m_context->allInputDimensionsSpecified()) {
                    std::cout << "Error: Not all input dimensions specified after shape change" << std::endl;
                    for (size_t i = 0; i < m_IOTensorNames.size(); ++i) {
                        auto shape = m_context->getTensorShape(m_IOTensorNames[i].c_str());
                        std::cout << " " << m_IOTensorNames[i] << ": [";
                        for (int j = 0; j < shape.nbDims; ++j) {
                            if (j > 0) std::cout << ", ";
                            std::cout << shape.d[j];
                        }
                        std::cout << "]" << std::endl;
                    }
                    return false;
                }
            }
            m_lastBatchSize = batchSize;
            m_batchShapeChangeLogged = true;
            if (m_verbose && firstTime) {
                std::cout << "\nInfo: Input shapes updated successfully for batch size "
                          << batchSize << "\n" << std::endl;
            }
        }
    }
    // ============================================================================
    // PREPROCESS AND COPY INPUTS TO GPU BUFFERS
    // ============================================================================
    // Pass 1: Validate all input dimensions before any GPU work.
    // Dynamic dims (-1) are skipped in validation (they accept any size).
    for (size_t i = 0; i < numInputs; ++i) {
        const auto& batchInput = inputs[i];
        const auto& dims = m_inputDims[i];
        if (!batchInput.empty()) {
            const auto& firstImg = batchInput[0];
            bool mismatch = false;
            if (dims.d[0] > 0 && firstImg.channels() != dims.d[0]) mismatch = true;
            if (dims.d[1] > 0 && firstImg.rows != dims.d[1]) mismatch = true;
            if (dims.d[2] > 0 && firstImg.cols != dims.d[2]) mismatch = true;
            if (mismatch) {
                std::cout << "Error: Input " << i << " dimension mismatch!" << std::endl;
                std::cout << " Expected: " << dims.d[2] << "x" << dims.d[1]
                          << "x" << dims.d[0] << " (WxHxC, -1=dynamic)" << std::endl;
                std::cout << " Got: " << firstImg.cols << "x" << firstImg.rows
                          << "x" << firstImg.channels() << " (WxHxC)" << std::endl;
                return false;
            }
        }
    }
    // Pass 2: Preprocess + D2D copies — all on m_inferenceStream (no null stream).
    //
    // All OpenCV CUDA ops (convertTo, subtract, divide, split) in blobFromGpuMats
    // now run on m_inferenceStream via the cv::cuda::Stream wrapper. This means:
    // • No null-stream interaction — eliminates global sync barriers on WDDM
    // • No event bridge needed — same-stream ordering guarantees correctness
    // • CUDA graphs are safe — cv::cuda::split runs BEFORE graph capture
    //
    // GpuMat-lifetime: preprocessedBuffers keeps GpuMats alive past the final
    // cudaStreamSynchronize, so cudaFree() doesn't stall the pipeline.
    cv::cuda::Stream cvInferStream = cv::cuda::StreamAccessor::wrapStream(m_inferenceStream);
    std::vector<cv::cuda::GpuMat> preprocessedBuffers;
    preprocessedBuffers.reserve(numInputs);
    for (size_t i = 0; i < numInputs; ++i) {
        const auto& batchInput = inputs[i];
        // Preprocess on m_inferenceStream (not the null stream).
        preprocessedBuffers.push_back(
            blobFromGpuMats(batchInput, m_subVals, m_divVals, m_normalize, false, cvInferStream));
        // D2D copy: same stream as preprocessing, so ordering is guaranteed.
        const auto& blobMat = preprocessedBuffers.back();
        const size_t copySize = static_cast<size_t>(blobMat.rows) * static_cast<size_t>(blobMat.cols) * blobMat.elemSize();
        cudaError_t copyErr = cudaMemcpyAsync(
            m_buffers[i],
            preprocessedBuffers.back().ptr<void>(),
            copySize,
            cudaMemcpyDeviceToDevice,
            m_inferenceStream);
        if (copyErr != cudaSuccess) {
            std::cout << "Error: Failed to copy input " << i
                      << " to inference buffer: " << cudaGetErrorString(copyErr) << std::endl;
            return false;
        }
    }
    // ============================================================================
    // PRE-ALLOCATE OUTPUT STRUCTURE
    // ============================================================================
    const size_t numOutputs = m_outputLengths.size();
    featureVectors.clear();
    featureVectors.resize(batchSize);
    for (int batch = 0; batch < batchSize; ++batch) {
        featureVectors[batch].resize(numOutputs);
        for (size_t outputIdx = 0; outputIdx < numOutputs; ++outputIdx)
            featureVectors[batch][outputIdx].resize(m_outputLengths[outputIdx]);
    }
    // ============================================================================
    // RUN INFERENCE + COPY OUTPUTS (CUDA Graph path or direct path)
    // ============================================================================
    // CUDA Graph path
    // ---------------
    // On the first call for a given batchSize we capture enqueueV3 + D2H copies
    // into a reusable graph. Subsequent calls use cudaGraphLaunch, replacing
    // many individual kernel-submission API calls with a single launch.
    //
    // Prerequisites satisfied here:
    // • Preprocessing + D2D copies are queued on m_inferenceStream (same-stream
    //   ordering guarantees they complete before captured kernels execute)
    // • m_pinnedOutputBuffers has stable addresses (allocated in loadNetwork)
    // • m_buffers (GPU outputs) have stable addresses (allocated in loadNetwork)
    //
    // Falls back to the direct path if pinned buffers are unavailable or if
    // graph capture/instantiation fails for any reason.
    // CUDA graphs capture fixed kernel sequences; incompatible with dynamic spatial dims
    // (input H/W change per inference call → different TRT kernel plans each time).
    // Disabled for pool slots — concurrent graph captures on the same GPU corrupt the
    // CUDA context ("operation not permitted when stream is capturing").
    const bool canGraph = !m_disableGraphs && !m_pinnedOutputBuffers.empty() && !m_hasDynamicSpatialDims;
    bool graphUsed = false;
    if (canGraph) {
        // Hold the exec handle in a LOCAL variable, never a reference into the
        // map. (A previous revision kept `auto& graphExec = m_graphExecs[batchSize]`
        // alive across m_graphExecs.erase(batchSize) and then read it — a
        // dangling reference / undefined behaviour.)
        cudaGraphExec_t graphExec = nullptr;
        auto cachedIt = m_graphExecs.find(batchSize);
        if (cachedIt != m_graphExecs.end()) {
            graphExec = cachedIt->second;
        }
        if (!graphExec) {
            // First call for this batchSize -- capture a new graph.
            // Serialise captures across all Engine instances on this device to
            // prevent TRT's shared workspace from creating cross-stream
            // dependencies that violate CUDA graph capture rules.
            std::lock_guard<std::mutex> captureLock(graphCaptureMutex());
            // Clear any sticky CUDA error from a prior failed capture so that
            // this attempt starts clean.
            cudaGetLastError();
            cudaGraph_t graph = nullptr;
            bool captureOk = false;
            if (cudaStreamBeginCapture(m_inferenceStream,
                                       cudaStreamCaptureModeRelaxed) == cudaSuccess) {
                // Record TRT kernels into the graph (not executed yet).
                TRT_ENQUEUE(m_context.get(), m_inferenceStream, m_buffers);
                // Record D2H copies to stable pinned addresses.
                for (size_t outputIdx = 0; outputIdx < numOutputs; ++outputIdx) {
                    cudaMemcpyAsync(
                        m_pinnedOutputBuffers[outputIdx],
                        static_cast<char*>(m_buffers[numInputs + outputIdx]),
                        static_cast<size_t>(batchSize) * m_outputLengths[outputIdx] * sizeof(T),
                        cudaMemcpyDeviceToHost,
                        m_inferenceStream);
                }
                captureOk = (cudaStreamEndCapture(m_inferenceStream, &graph) == cudaSuccess
                             && graph != nullptr);
            }
            if (captureOk) {
                cudaGraphExec_t exec = nullptr;
                if (cudaGraphInstantiate(&exec, graph, nullptr, nullptr, 0) == cudaSuccess) {
                    graphExec = exec;
                    // Insert into the cache ONLY on success — no speculative
                    // operator[] insertion, so no erase (and no dangling
                    // reference) on the failure path below.
                    m_graphExecs[batchSize] = exec;
                }
                cudaGraphDestroy(graph);
            }
            if (!graphExec) {
                std::cout << "Warning: CUDA graph capture failed for batchSize="
                          << batchSize << " -- falling back to direct inference path." << std::endl;
                // Disable graph acceleration for this Engine instance.
                for (T* p : m_pinnedOutputBuffers) { if (p) cudaFreeHost(p); }
                m_pinnedOutputBuffers.clear();
            }
        }
        if (graphExec) {
            // Launch the pre-captured graph (single API call replaces many).
            // Both the launch and the synchronize are checked — a silent
            // failure here would return stale pinned-buffer contents.
            cudaError_t launchErr = cudaGraphLaunch(graphExec, m_inferenceStream);
            if (launchErr != cudaSuccess) {
                std::string errMsg = "[Engine] runInference FAIL: cudaGraphLaunch: "
                    + std::string(cudaGetErrorString(launchErr));
                std::cout << errMsg << std::endl;
                logEngineEvent(errMsg, true);
                return false;
            }
            cudaError_t graphSyncErr = cudaStreamSynchronize(m_inferenceStream);
            if (graphSyncErr != cudaSuccess) {
                std::string errMsg = "[Engine] runInference FAIL: cudaStreamSynchronize (graph path): "
                    + std::string(cudaGetErrorString(graphSyncErr));
                std::cout << errMsg << std::endl;
                logEngineEvent(errMsg, true);
                return false;
            }
            // CPU memcpy: pinned buffers -> featureVectors (interleaved by batch).
            for (int batch = 0; batch < batchSize; ++batch) {
                for (size_t outputIdx = 0; outputIdx < numOutputs; ++outputIdx) {
                    std::memcpy(
                        featureVectors[batch][outputIdx].data(),
                        m_pinnedOutputBuffers[outputIdx]
                            + static_cast<size_t>(batch) * m_outputLengths[outputIdx],
                        m_outputLengths[outputIdx] * sizeof(T));
                }
            }
            graphUsed = true;
        }
    }
    // Direct path (no graph)
    // ----------------------
    // Used when pinned buffers are unavailable or graph capture failed.
    if (!graphUsed) {
        bool success = TRT_ENQUEUE(m_context.get(), m_inferenceStream, m_buffers);
        if (!success) {
            std::string debugInfo = "[Engine] runInference FAIL: enqueue returned false, batch="
                + std::to_string(batchSize)
                + ", dimsSpecified=" + (m_context->allInputDimensionsSpecified() ? "YES" : "NO");
            for (size_t i = 0; i < m_IOTensorNames.size(); ++i) {
                auto shape = m_context->getTensorShape(m_IOTensorNames[i].c_str());
                debugInfo += ", tensor'" + m_IOTensorNames[i] + "'=[";
                for (int j = 0; j < shape.nbDims; ++j) {
                    if (j > 0) debugInfo += ",";
                    debugInfo += std::to_string(shape.d[j]);
                }
                debugInfo += "]";
            }
            std::cout << debugInfo << std::endl;
            logEngineEvent(debugInfo, true);
            return false;
        }
        for (int batch = 0; batch < batchSize; ++batch) {
            for (size_t outputIdx = 0; outputIdx < numOutputs; ++outputIdx) {
                const size_t outputBinding = numInputs + outputIdx;
                const size_t offset =
                    static_cast<size_t>(batch) * m_outputLengths[outputIdx] * sizeof(T);
                cudaError_t copyErr = cudaMemcpyAsync(
                    featureVectors[batch][outputIdx].data(),
                    static_cast<char*>(m_buffers[outputBinding]) + offset,
                    m_outputLengths[outputIdx] * sizeof(T),
                    cudaMemcpyDeviceToHost,
                    m_inferenceStream);
                if (copyErr != cudaSuccess) {
                    std::string errMsg = "[Engine] runInference FAIL: cudaMemcpyAsync output "
                        + std::to_string(outputIdx) + " batch " + std::to_string(batch)
                        + ": " + cudaGetErrorString(copyErr);
                    std::cout << errMsg << std::endl;
                    logEngineEvent(errMsg, true);
                    return false;
                }
            }
        }
        cudaError_t syncErr = cudaStreamSynchronize(m_inferenceStream);
        if (syncErr != cudaSuccess) {
            std::string errMsg = "[Engine] runInference FAIL: cudaStreamSynchronize: "
                + std::string(cudaGetErrorString(syncErr));
            std::cout << errMsg << std::endl;
            logEngineEvent(errMsg, true);
            return false;
        }
    }
    return true;
}