Initial setup for CLion
This commit is contained in:
719
engines/TensorRTAPI/include/engine/EngineRunInference.inl
Normal file
719
engines/TensorRTAPI/include/engine/EngineRunInference.inl
Normal file
@@ -0,0 +1,719 @@
|
||||
#pragma once

#include <chrono>
#include <cstring>
#include <filesystem>
#include <iomanip>
#include <iostream>
#include <mutex>
#include <vector>

#include "TRTCompat.h"
|
||||
|
||||
// Mutex serialising CUDA graph capture across Engine instances.
// TRT's enqueueV3 uses shared internal resources (workspace, memory pools)
// at the CUDA context level. When two Engine instances on the same GPU
// capture graphs concurrently, these cross-stream dependencies violate
// graph capture rules ("operation not permitted when stream is capturing").
// This mutex serialises graph captures across all Engine<T> instances —
// subsequent cudaGraphLaunch calls are still concurrent.
// NOTE(review): despite the original "per-device" wording, this is a single
// process-wide mutex, so captures on *different* devices are serialised as
// well. That is stricter than strictly necessary but safe.
static std::mutex& graphCaptureMutex() {
    // Meyers singleton: function-local static gives thread-safe,
    // on-first-use initialisation (guaranteed since C++11).
    static std::mutex m;
    return m;
}
|
||||
|
||||
template <typename T>
|
||||
void Engine<T>::warmUp(int iterations) {
|
||||
if (m_verbose) {
|
||||
std::cout << "\n========================================" << std::endl;
|
||||
std::cout << "Engine Warmup" << std::endl;
|
||||
std::cout << "========================================" << std::endl;
|
||||
}
|
||||
|
||||
// Determine batch sizes to warm up
|
||||
std::vector<int> batchSizes;
|
||||
|
||||
if (m_options.maxBatchSize > 1) {
|
||||
if (m_verbose) {
|
||||
std::cout << "Dynamic batch engine detected (max batch: " << m_options.maxBatchSize << ")" << std::endl;
|
||||
std::cout << "Warming up common batch sizes to pre-compile kernels..." << std::endl;
|
||||
}
|
||||
|
||||
// Warm up ALL batch sizes from 1 to maxBatchSize.
|
||||
// Each unseen batch size incurs a 100-300ms kernel compilation penalty
|
||||
// on first use. Warming all sizes eliminates that latency at inference
|
||||
// time and ensures every CUDA graph is pre-captured.
|
||||
for (int batch = 1; batch <= m_options.maxBatchSize; ++batch) {
|
||||
batchSizes.push_back(batch);
|
||||
}
|
||||
}
|
||||
else {
|
||||
if (m_verbose) std::cout << "Fixed batch engine detected (batch size: " << m_options.maxBatchSize << ")" << std::endl;
|
||||
batchSizes.push_back(m_options.maxBatchSize);
|
||||
}
|
||||
|
||||
if (m_verbose) {
|
||||
std::cout << "Batch sizes to warm up: ";
|
||||
for (size_t i = 0; i < batchSizes.size(); ++i) {
|
||||
std::cout << batchSizes[i];
|
||||
if (i < batchSizes.size() - 1) std::cout << ", ";
|
||||
}
|
||||
std::cout << std::endl;
|
||||
}
|
||||
|
||||
// Warm up each batch size.
|
||||
// The first call triggers kernel compilation; the second captures the CUDA
|
||||
// graph. Additional iterations only measure steady-state latency for the
|
||||
// optBatchSize (printed as a diagnostic).
|
||||
for (int batchSize : batchSizes) {
|
||||
const int iters = (batchSize == m_options.optBatchSize) ? iterations : 2;
|
||||
if (m_verbose) std::cout << "\nWarming up batch=" << batchSize << " (x" << iters << " iterations)..." << std::endl;
|
||||
|
||||
// Create dummy inputs for this batch size
|
||||
std::vector<std::vector<cv::cuda::GpuMat>> dummyInputs;
|
||||
|
||||
for (size_t i = 0; i < m_inputDims.size(); ++i) {
|
||||
const auto& dims = m_inputDims[i];
|
||||
std::vector<cv::cuda::GpuMat> batch;
|
||||
|
||||
// FIXED: Create proper dummy images on GPU
|
||||
// For dynamic spatial dims, use opt dimensions for warmup
|
||||
int warmH = (dims.d[1] > 0) ? dims.d[1] : m_options.optInputHeight;
|
||||
int warmW = (dims.d[2] > 0) ? dims.d[2] : m_options.optInputWidth;
|
||||
for (int b = 0; b < batchSize; ++b) {
|
||||
// Create on CPU first
|
||||
cv::Mat cpuImg(warmH, warmW, CV_32FC(dims.d[0]), cv::Scalar(0.5f, 0.5f, 0.5f));
|
||||
|
||||
// Upload to GPU
|
||||
cv::cuda::GpuMat gpuImg;
|
||||
gpuImg.upload(cpuImg);
|
||||
|
||||
batch.push_back(gpuImg);
|
||||
}
|
||||
|
||||
dummyInputs.push_back(batch);
|
||||
}
|
||||
|
||||
std::vector<std::vector<std::vector<T>>> dummyOutputs;
|
||||
|
||||
// Time the first iteration (kernel compilation happens here)
|
||||
auto start = std::chrono::high_resolution_clock::now();
|
||||
|
||||
bool firstSuccess = runInference(dummyInputs, dummyOutputs);
|
||||
|
||||
auto end = std::chrono::high_resolution_clock::now();
|
||||
auto firstTime = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
|
||||
|
||||
if (!firstSuccess) {
|
||||
if (m_verbose) std::cout << " ✗ ERROR: First iteration failed for batch=" << batchSize << std::endl;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (m_verbose) {
|
||||
std::cout << " First iteration: " << firstTime << " ms";
|
||||
if (firstTime > 100) {
|
||||
std::cout << " (kernel compilation detected)";
|
||||
}
|
||||
std::cout << std::endl;
|
||||
}
|
||||
|
||||
// Run remaining iterations to measure stable performance
|
||||
if (iters > 1) {
|
||||
auto iterStart = std::chrono::high_resolution_clock::now();
|
||||
|
||||
for (int i = 1; i < iters; ++i) {
|
||||
bool success = runInference(dummyInputs, dummyOutputs);
|
||||
if (!success) {
|
||||
if (m_verbose) std::cout << " ✗ ERROR: Iteration " << i << " failed" << std::endl;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
auto iterEnd = std::chrono::high_resolution_clock::now();
|
||||
auto totalTime = std::chrono::duration_cast<std::chrono::milliseconds>(iterEnd - iterStart).count();
|
||||
float avgTime = totalTime / static_cast<float>(iters - 1);
|
||||
|
||||
if (m_verbose) {
|
||||
std::cout << " Subsequent iterations (avg): " << std::fixed << std::setprecision(1)
|
||||
<< avgTime << " ms" << std::endl;
|
||||
|
||||
if (firstTime > 100 && avgTime < firstTime * 0.5f) {
|
||||
float speedup = firstTime / avgTime;
|
||||
std::cout << " ✓ Speedup after warmup: " << std::fixed << std::setprecision(1)
|
||||
<< speedup << "x faster" << std::endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (m_verbose) std::cout << " ✓ Batch=" << batchSize << " warmed up successfully" << std::endl;
|
||||
}
|
||||
|
||||
if (m_verbose) {
|
||||
std::cout << "\n========================================" << std::endl;
|
||||
std::cout << "Warmup Complete!" << std::endl;
|
||||
std::cout << "========================================" << std::endl;
|
||||
std::cout << "Kernels pre-compiled for all batch sizes." << std::endl;
|
||||
std::cout << "========================================\n" << std::endl;
|
||||
}
|
||||
}
|
||||
template <typename T>
|
||||
bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& inputs,std::vector<std::vector<std::vector<T>>>& featureVectors) {
|
||||
|
||||
// ============================================================================
|
||||
// MULTI-GPU POOL DISPATCH
|
||||
// ============================================================================
|
||||
// If this Engine was initialised with initializePool() / initializePoolFromEngine()
|
||||
// the m_slots vector is non-empty. In that case, delegate to the pool
|
||||
// dispatcher which acquires the first idle slot and runs inference there.
|
||||
// This branch is NEVER taken for single-GPU use (buildLoadNetwork / loadNetwork).
|
||||
if (!m_slots.empty()) {
|
||||
return runInferenceFromPool(inputs, featureVectors);
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// SINGLE-ENGINE SERIALISATION
|
||||
// ============================================================================
|
||||
// The single Engine instance has shared mutable state (m_buffers, m_lastBatchSize,
|
||||
// m_inferenceStream, TRT execution context). If two LabVIEW threads call
|
||||
// runInference concurrently with different batch sizes, one will overwrite
|
||||
// the input shapes and buffers while the other is mid-inference, causing a
|
||||
// fatal "illegal memory access" that permanently corrupts the CUDA context.
|
||||
//
|
||||
// Pool-mode slots have their own busy-flag dispatch so they do NOT need this.
|
||||
std::lock_guard<std::mutex> inferenceLock(m_inferenceMutex);
|
||||
|
||||
// ============================================================================
|
||||
// THREAD-SAFE GPU CONTEXT
|
||||
// ============================================================================
|
||||
// Ensure the calling thread's CUDA device matches this engine's GPU.
|
||||
// This is essential for multi-GPU round-robin: LabVIEW reuses threads
|
||||
// across tasks, so a thread that last ran inference on GPU 1 might now
|
||||
// be running a task on GPU 0. Without this, cv::cuda::GpuMat allocations
|
||||
// and kernel launches would target the wrong GPU, causing result corruption.
|
||||
// Skip cudaSetDevice if already on the correct device — under WDDM
|
||||
// with multiple GPUs each call costs 1-5ms of scheduler overhead.
|
||||
{
|
||||
int currentDev = -1;
|
||||
cudaGetDevice(¤tDev);
|
||||
if (currentDev != m_options.deviceIndex) {
|
||||
cudaSetDevice(m_options.deviceIndex);
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// DEBUG: First call diagnostics (per-instance, not process-wide)
|
||||
// ============================================================================
|
||||
|
||||
if (m_verbose && m_firstInferenceCall) {
|
||||
std::cout << "\n=== First runInference Call ===" << std::endl;
|
||||
std::cout << "Number of input tensors: " << inputs.size() << std::endl;
|
||||
for (size_t i = 0; i < inputs.size(); ++i) {
|
||||
std::cout << "Input " << i << " batch size: " << inputs[i].size() << std::endl;
|
||||
if (!inputs[i].empty()) {
|
||||
const auto& img = inputs[i][0];
|
||||
std::cout << " Image shape: " << img.cols << "x" << img.rows
|
||||
<< "x" << img.channels() << " (type: " << img.type() << ")" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
// Print optimization profile information
|
||||
std::cout << "\n=== Engine Profile Information ===" << std::endl;
|
||||
std::cout << "Number of optimization profiles: "
|
||||
<< m_engine->getNbOptimizationProfiles() << std::endl;
|
||||
|
||||
if (m_engine->getNbOptimizationProfiles() > 0) {
|
||||
for (int profile = 0; profile < m_engine->getNbOptimizationProfiles(); ++profile) {
|
||||
std::cout << "\n--- Profile " << profile << " ---" << std::endl;
|
||||
|
||||
for (size_t i = 0; i < m_IOTensorNames.size(); ++i) {
|
||||
const char* tensorName = m_IOTensorNames[i].c_str();
|
||||
|
||||
// Check if this is an input tensor
|
||||
auto ioMode = m_engine->getTensorIOMode(tensorName);
|
||||
if (ioMode != nvinfer1::TensorIOMode::kINPUT) {
|
||||
continue;
|
||||
}
|
||||
|
||||
auto minDims = m_engine->getProfileShape(tensorName, profile,
|
||||
nvinfer1::OptProfileSelector::kMIN);
|
||||
auto optDims = m_engine->getProfileShape(tensorName, profile,
|
||||
nvinfer1::OptProfileSelector::kOPT);
|
||||
auto maxDims = m_engine->getProfileShape(tensorName, profile,
|
||||
nvinfer1::OptProfileSelector::kMAX);
|
||||
|
||||
std::cout << "Tensor '" << tensorName << "' (INPUT):" << std::endl;
|
||||
std::cout << " Min: [" << minDims.d[0];
|
||||
for (int j = 1; j < minDims.nbDims; ++j) std::cout << "," << minDims.d[j];
|
||||
std::cout << "]" << std::endl;
|
||||
|
||||
std::cout << " Opt: [" << optDims.d[0];
|
||||
for (int j = 1; j < optDims.nbDims; ++j) std::cout << "," << optDims.d[j];
|
||||
std::cout << "]" << std::endl;
|
||||
|
||||
std::cout << " Max: [" << maxDims.d[0];
|
||||
for (int j = 1; j < maxDims.nbDims; ++j) std::cout << "," << maxDims.d[j];
|
||||
std::cout << "]" << std::endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!m_context->allInputDimensionsSpecified()) {
|
||||
std::cout << "ERROR: Input dimensions not specified in context!" << std::endl;
|
||||
return false;
|
||||
}
|
||||
|
||||
std::cout << "\nContext state: All dimensions specified ✓" << std::endl;
|
||||
m_firstInferenceCall = false;
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// INPUT VALIDATION
|
||||
// ============================================================================
|
||||
|
||||
if (inputs.empty() || inputs[0].empty()) {
|
||||
std::cout << "Error: Empty input" << std::endl;
|
||||
return false;
|
||||
}
|
||||
|
||||
const auto numInputs = m_inputDims.size();
|
||||
if (inputs.size() != numInputs) {
|
||||
std::cout << "Error: Wrong number of inputs. Expected: " << numInputs
|
||||
<< ", Got: " << inputs.size() << std::endl;
|
||||
return false;
|
||||
}
|
||||
|
||||
const auto batchSize = static_cast<int32_t>(inputs[0].size());
|
||||
|
||||
if (batchSize > m_options.maxBatchSize) {
|
||||
std::cout << "Error: Batch size " << batchSize << " exceeds maximum "
|
||||
<< m_options.maxBatchSize << std::endl;
|
||||
return false;
|
||||
}
|
||||
|
||||
if (batchSize < 1) {
|
||||
std::cout << "Error: Batch size must be at least 1" << std::endl;
|
||||
return false;
|
||||
}
|
||||
|
||||
// Validate batch size consistency across all inputs
|
||||
for (size_t i = 1; i < inputs.size(); ++i) {
|
||||
if (inputs[i].size() != static_cast<size_t>(batchSize)) {
|
||||
std::cout << "Error: Inconsistent batch sizes across inputs. Input 0: "
|
||||
<< batchSize << ", Input " << i << ": " << inputs[i].size() << std::endl;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// STREAM GUARD
|
||||
// ============================================================================
|
||||
// m_inferenceStream is now created eagerly in loadNetwork() so it is always
|
||||
// valid here. Guard against the (unlikely) edge case where runInference is
|
||||
// called before loadNetwork succeeds.
|
||||
if (!m_streamInitialized || !m_inferenceStream) {
|
||||
std::string errMsg = "Error: Inference stream not initialised. "
|
||||
"Call loadNetwork() / buildLoadNetwork() before runInference().";
|
||||
std::cout << errMsg << std::endl;
|
||||
logEngineEvent("[Engine] runInference: " + errMsg, true);
|
||||
return false;
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// SET INPUT SHAPES (batch size changed OR dynamic spatial dims need updating)
|
||||
// ============================================================================
|
||||
// Fast path: compute desired dims first, then compare against cached dims.
|
||||
// This avoids all TRT API calls when the shape hasn't actually changed —
|
||||
// critical for the recognizer which is called ~50-100x per image with
|
||||
// dynamic width but often the same or similar widths.
|
||||
// ============================================================================
|
||||
|
||||
{
|
||||
// Lazily initialise the dims cache on first call
|
||||
if (m_lastSetInputDims.empty()) {
|
||||
m_lastSetInputDims.resize(numInputs);
|
||||
for (size_t i = 0; i < numInputs; ++i) {
|
||||
m_lastSetInputDims[i].nbDims = 0; // force mismatch on first call
|
||||
}
|
||||
}
|
||||
|
||||
// Build desired dims for every input tensor (cheap — no TRT API calls)
|
||||
bool anyDimChanged = (m_lastBatchSize != batchSize);
|
||||
std::vector<nvinfer1::Dims> desiredDims(numInputs);
|
||||
for (size_t i = 0; i < numInputs; ++i) {
|
||||
nvinfer1::Dims& nd = desiredDims[i];
|
||||
nd.nbDims = 4;
|
||||
nd.d[0] = batchSize;
|
||||
nd.d[1] = m_inputDims[i].d[0]; // channels
|
||||
if (m_hasDynamicSpatialDims && !inputs[i].empty()) {
|
||||
const auto& firstImg = inputs[i][0];
|
||||
nd.d[2] = (m_inputDims[i].d[1] == -1) ? firstImg.rows : m_inputDims[i].d[1];
|
||||
nd.d[3] = (m_inputDims[i].d[2] == -1) ? firstImg.cols : m_inputDims[i].d[2];
|
||||
} else {
|
||||
nd.d[2] = m_inputDims[i].d[1];
|
||||
nd.d[3] = m_inputDims[i].d[2];
|
||||
}
|
||||
// Compare with cached
|
||||
if (!anyDimChanged) {
|
||||
const auto& cached = m_lastSetInputDims[i];
|
||||
if (cached.nbDims != nd.nbDims ||
|
||||
cached.d[0] != nd.d[0] || cached.d[1] != nd.d[1] ||
|
||||
cached.d[2] != nd.d[2] || cached.d[3] != nd.d[3]) {
|
||||
anyDimChanged = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (anyDimChanged) {
|
||||
// === First-time diagnostics (verbose, once) ===
|
||||
const bool firstTime = !m_batchShapeChangeLogged;
|
||||
|
||||
if (m_verbose && firstTime) {
|
||||
std::cout << "\nInfo: Batch size change: " << m_lastBatchSize
|
||||
<< " -> " << batchSize << std::endl;
|
||||
}
|
||||
|
||||
// Set optimization profile (only when truly needed)
|
||||
if (m_engine->getNbOptimizationProfiles() > 0) {
|
||||
int currentProfile = m_context->getOptimizationProfile();
|
||||
if (currentProfile != 0 || m_lastBatchSize < 0) {
|
||||
if (m_verbose && firstTime) {
|
||||
std::cout << " Setting optimization profile to 0..." << std::endl;
|
||||
}
|
||||
if (!m_context->setOptimizationProfileAsync(0, m_inferenceStream)) {
|
||||
std::cout << "Error: Failed to set optimization profile 0" << std::endl;
|
||||
return false;
|
||||
}
|
||||
cudaError_t syncErr = cudaStreamSynchronize(m_inferenceStream);
|
||||
if (syncErr != cudaSuccess) {
|
||||
std::cout << "Error: Failed to sync after profile change: "
|
||||
<< cudaGetErrorString(syncErr) << std::endl;
|
||||
return false;
|
||||
}
|
||||
if (m_verbose && firstTime) {
|
||||
std::cout << " Optimization profile set successfully" << std::endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Update shapes for input tensors that actually changed
|
||||
for (size_t i = 0; i < numInputs; ++i) {
|
||||
const char* tensorName = m_IOTensorNames[i].c_str();
|
||||
|
||||
// Skip non-input tensors
|
||||
auto ioMode = m_engine->getTensorIOMode(tensorName);
|
||||
if (ioMode != nvinfer1::TensorIOMode::kINPUT) continue;
|
||||
|
||||
const nvinfer1::Dims& newDims = desiredDims[i];
|
||||
const nvinfer1::Dims& cached = m_lastSetInputDims[i];
|
||||
|
||||
// Skip this tensor if its dims haven't changed
|
||||
if (cached.nbDims == newDims.nbDims &&
|
||||
cached.d[0] == newDims.d[0] && cached.d[1] == newDims.d[1] &&
|
||||
cached.d[2] == newDims.d[2] && cached.d[3] == newDims.d[3]) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// First-time verbose diagnostics
|
||||
if (m_verbose && firstTime) {
|
||||
std::cout << "\n Processing tensor " << i << ": '" << tensorName << "'" << std::endl;
|
||||
// Validate batch size range (first time only)
|
||||
if (m_engine->getNbOptimizationProfiles() > 0) {
|
||||
int profileIndex = m_context->getOptimizationProfile();
|
||||
nvinfer1::Dims minDims = m_engine->getProfileShape(
|
||||
tensorName, profileIndex, nvinfer1::OptProfileSelector::kMIN);
|
||||
nvinfer1::Dims maxDims = m_engine->getProfileShape(
|
||||
tensorName, profileIndex, nvinfer1::OptProfileSelector::kMAX);
|
||||
std::cout << " Profile batch range: [" << minDims.d[0]
|
||||
<< " to " << maxDims.d[0] << "]" << std::endl;
|
||||
if (batchSize < minDims.d[0] || batchSize > maxDims.d[0]) {
|
||||
std::cout << "Error: Batch size " << batchSize
|
||||
<< " outside profile range" << std::endl;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
auto currentShape = m_context->getTensorShape(tensorName);
|
||||
std::cout << " Current context shape: [";
|
||||
for (int j = 0; j < currentShape.nbDims; ++j) {
|
||||
if (j > 0) std::cout << ", ";
|
||||
std::cout << currentShape.d[j];
|
||||
}
|
||||
std::cout << "]" << std::endl;
|
||||
std::cout << " Setting new shape: [" << newDims.d[0] << ", "
|
||||
<< newDims.d[1] << ", " << newDims.d[2] << ", "
|
||||
<< newDims.d[3] << "]" << std::endl;
|
||||
}
|
||||
|
||||
if (!m_context->setInputShape(tensorName, newDims)) {
|
||||
std::cout << "Error: Failed to set input shape for '" << tensorName << "'" << std::endl;
|
||||
return false;
|
||||
}
|
||||
|
||||
// Verify shape (first time only — trust the API on hot path)
|
||||
if (firstTime) {
|
||||
auto verifyShape = m_context->getTensorShape(tensorName);
|
||||
if (verifyShape.d[0] != batchSize) {
|
||||
std::cout << "Error: Shape change didn't take effect. Expected batch "
|
||||
<< batchSize << ", got " << verifyShape.d[0] << std::endl;
|
||||
return false;
|
||||
}
|
||||
if (m_verbose) {
|
||||
std::cout << " Shape updated successfully" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
m_lastSetInputDims[i] = newDims;
|
||||
}
|
||||
|
||||
// Verify all input dimensions specified (first time only)
|
||||
if (firstTime) {
|
||||
if (!m_context->allInputDimensionsSpecified()) {
|
||||
std::cout << "Error: Not all input dimensions specified after shape change" << std::endl;
|
||||
for (size_t i = 0; i < m_IOTensorNames.size(); ++i) {
|
||||
auto shape = m_context->getTensorShape(m_IOTensorNames[i].c_str());
|
||||
std::cout << " " << m_IOTensorNames[i] << ": [";
|
||||
for (int j = 0; j < shape.nbDims; ++j) {
|
||||
if (j > 0) std::cout << ", ";
|
||||
std::cout << shape.d[j];
|
||||
}
|
||||
std::cout << "]" << std::endl;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
m_lastBatchSize = batchSize;
|
||||
m_batchShapeChangeLogged = true;
|
||||
if (m_verbose && firstTime) {
|
||||
std::cout << "\nInfo: Input shapes updated successfully for batch size "
|
||||
<< batchSize << " ✓\n" << std::endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// PREPROCESS AND COPY INPUTS TO GPU BUFFERS
|
||||
// ============================================================================
|
||||
|
||||
// Pass 1: Validate all input dimensions before any GPU work.
|
||||
// Dynamic dims (-1) are skipped in validation (they accept any size).
|
||||
for (size_t i = 0; i < numInputs; ++i) {
|
||||
const auto& batchInput = inputs[i];
|
||||
const auto& dims = m_inputDims[i];
|
||||
if (!batchInput.empty()) {
|
||||
const auto& firstImg = batchInput[0];
|
||||
bool mismatch = false;
|
||||
if (dims.d[0] > 0 && firstImg.channels() != dims.d[0]) mismatch = true;
|
||||
if (dims.d[1] > 0 && firstImg.rows != dims.d[1]) mismatch = true;
|
||||
if (dims.d[2] > 0 && firstImg.cols != dims.d[2]) mismatch = true;
|
||||
if (mismatch) {
|
||||
std::cout << "Error: Input " << i << " dimension mismatch!" << std::endl;
|
||||
std::cout << " Expected: " << dims.d[2] << "x" << dims.d[1]
|
||||
<< "x" << dims.d[0] << " (WxHxC, -1=dynamic)" << std::endl;
|
||||
std::cout << " Got: " << firstImg.cols << "x" << firstImg.rows
|
||||
<< "x" << firstImg.channels() << " (WxHxC)" << std::endl;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Pass 2: Preprocess + D2D copies — all on m_inferenceStream (no null stream).
|
||||
//
|
||||
// All OpenCV CUDA ops (convertTo, subtract, divide, split) in blobFromGpuMats
|
||||
// now run on m_inferenceStream via the cv::cuda::Stream wrapper. This means:
|
||||
// • No null-stream interaction — eliminates global sync barriers on WDDM
|
||||
// • No event bridge needed — same-stream ordering guarantees correctness
|
||||
// • CUDA graphs are safe — cv::cuda::split runs BEFORE graph capture
|
||||
//
|
||||
// GpuMat-lifetime: preprocessedBuffers keeps GpuMats alive past the final
|
||||
// cudaStreamSynchronize, so cudaFree() doesn't stall the pipeline.
|
||||
cv::cuda::Stream cvInferStream = cv::cuda::StreamAccessor::wrapStream(m_inferenceStream);
|
||||
std::vector<cv::cuda::GpuMat> preprocessedBuffers;
|
||||
preprocessedBuffers.reserve(numInputs);
|
||||
|
||||
for (size_t i = 0; i < numInputs; ++i) {
|
||||
const auto& batchInput = inputs[i];
|
||||
|
||||
// Preprocess on m_inferenceStream (not the null stream).
|
||||
preprocessedBuffers.push_back(
|
||||
blobFromGpuMats(batchInput, m_subVals, m_divVals, m_normalize, false, cvInferStream));
|
||||
|
||||
// D2D copy: same stream as preprocessing, so ordering is guaranteed.
|
||||
const auto& blobMat = preprocessedBuffers.back();
|
||||
const size_t copySize = static_cast<size_t>(blobMat.rows) * static_cast<size_t>(blobMat.cols) * blobMat.elemSize();
|
||||
cudaError_t copyErr = cudaMemcpyAsync(
|
||||
m_buffers[i],
|
||||
preprocessedBuffers.back().ptr<void>(),
|
||||
copySize,
|
||||
cudaMemcpyDeviceToDevice,
|
||||
m_inferenceStream);
|
||||
|
||||
if (copyErr != cudaSuccess) {
|
||||
std::cout << "Error: Failed to copy input " << i
|
||||
<< " to inference buffer: " << cudaGetErrorString(copyErr) << std::endl;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// PRE-ALLOCATE OUTPUT STRUCTURE
|
||||
// ============================================================================
|
||||
|
||||
const size_t numOutputs = m_outputLengths.size();
|
||||
|
||||
featureVectors.clear();
|
||||
featureVectors.resize(batchSize);
|
||||
for (int batch = 0; batch < batchSize; ++batch) {
|
||||
featureVectors[batch].resize(numOutputs);
|
||||
for (size_t outputIdx = 0; outputIdx < numOutputs; ++outputIdx)
|
||||
featureVectors[batch][outputIdx].resize(m_outputLengths[outputIdx]);
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// RUN INFERENCE + COPY OUTPUTS (CUDA Graph path or direct path)
|
||||
// ============================================================================
|
||||
|
||||
// CUDA Graph path
|
||||
// ---------------
|
||||
// On the first call for a given batchSize we capture enqueueV3 + D2H copies
|
||||
// into a reusable graph. Subsequent calls use cudaGraphLaunch, replacing
|
||||
// many individual kernel-submission API calls with a single launch.
|
||||
//
|
||||
// Prerequisites satisfied here:
|
||||
// • Preprocessing + D2D copies are queued on m_inferenceStream (same-stream
|
||||
// ordering guarantees they complete before captured kernels execute)
|
||||
// • m_pinnedOutputBuffers has stable addresses (allocated in loadNetwork)
|
||||
// • m_buffers (GPU outputs) have stable addresses (allocated in loadNetwork)
|
||||
//
|
||||
// Falls back to the direct path if pinned buffers are unavailable or if
|
||||
// graph capture/instantiation fails for any reason.
|
||||
|
||||
// CUDA graphs capture fixed kernel sequences; incompatible with dynamic spatial dims
|
||||
// (input H/W change per inference call → different TRT kernel plans each time).
|
||||
// Disabled for pool slots — concurrent graph captures on the same GPU corrupt the
|
||||
// CUDA context ("operation not permitted when stream is capturing").
|
||||
const bool canGraph = !m_disableGraphs && !m_pinnedOutputBuffers.empty() && !m_hasDynamicSpatialDims;
|
||||
bool graphUsed = false;
|
||||
|
||||
if (canGraph) {
|
||||
auto& graphExec = m_graphExecs[batchSize]; // inserts nullptr on first access
|
||||
if (!graphExec) {
|
||||
// First call for this batchSize -- capture a new graph.
|
||||
// Serialise captures across all Engine instances on this device to
|
||||
// prevent TRT's shared workspace from creating cross-stream
|
||||
// dependencies that violate CUDA graph capture rules.
|
||||
std::lock_guard<std::mutex> captureLock(graphCaptureMutex());
|
||||
|
||||
// Clear any sticky CUDA error from a prior failed capture so that
|
||||
// this attempt starts clean.
|
||||
cudaGetLastError();
|
||||
|
||||
cudaGraph_t graph = nullptr;
|
||||
bool captureOk = false;
|
||||
|
||||
if (cudaStreamBeginCapture(m_inferenceStream,
|
||||
cudaStreamCaptureModeRelaxed) == cudaSuccess) {
|
||||
// Record TRT kernels into the graph (not executed yet).
|
||||
TRT_ENQUEUE(m_context.get(), m_inferenceStream, m_buffers);
|
||||
|
||||
// Record D2H copies to stable pinned addresses.
|
||||
for (size_t outputIdx = 0; outputIdx < numOutputs; ++outputIdx) {
|
||||
cudaMemcpyAsync(
|
||||
m_pinnedOutputBuffers[outputIdx],
|
||||
static_cast<char*>(m_buffers[numInputs + outputIdx]),
|
||||
static_cast<size_t>(batchSize) * m_outputLengths[outputIdx] * sizeof(T),
|
||||
cudaMemcpyDeviceToHost,
|
||||
m_inferenceStream);
|
||||
}
|
||||
|
||||
captureOk = (cudaStreamEndCapture(m_inferenceStream, &graph) == cudaSuccess
|
||||
&& graph != nullptr);
|
||||
}
|
||||
|
||||
if (captureOk) {
|
||||
cudaGraphExec_t exec = nullptr;
|
||||
if (cudaGraphInstantiate(&exec, graph, nullptr, nullptr, 0) == cudaSuccess)
|
||||
graphExec = exec;
|
||||
cudaGraphDestroy(graph);
|
||||
}
|
||||
|
||||
if (!graphExec) {
|
||||
std::cout << "Warning: CUDA graph capture failed for batchSize="
|
||||
<< batchSize << " -- falling back to direct inference path." << std::endl;
|
||||
// Disable graph acceleration for this Engine instance.
|
||||
for (T* p : m_pinnedOutputBuffers) { if (p) cudaFreeHost(p); }
|
||||
m_pinnedOutputBuffers.clear();
|
||||
m_graphExecs.erase(batchSize);
|
||||
}
|
||||
}
|
||||
|
||||
if (graphExec) {
|
||||
// Launch the pre-captured graph (single API call replaces many).
|
||||
cudaGraphLaunch(graphExec, m_inferenceStream);
|
||||
cudaStreamSynchronize(m_inferenceStream);
|
||||
|
||||
// CPU memcpy: pinned buffers -> featureVectors (interleaved by batch).
|
||||
for (int batch = 0; batch < batchSize; ++batch) {
|
||||
for (size_t outputIdx = 0; outputIdx < numOutputs; ++outputIdx) {
|
||||
std::memcpy(
|
||||
featureVectors[batch][outputIdx].data(),
|
||||
m_pinnedOutputBuffers[outputIdx]
|
||||
+ static_cast<size_t>(batch) * m_outputLengths[outputIdx],
|
||||
m_outputLengths[outputIdx] * sizeof(T));
|
||||
}
|
||||
}
|
||||
graphUsed = true;
|
||||
}
|
||||
}
|
||||
|
||||
// Direct path (no graph)
|
||||
// ----------------------
|
||||
// Used when pinned buffers are unavailable or graph capture failed.
|
||||
if (!graphUsed) {
|
||||
bool success = TRT_ENQUEUE(m_context.get(), m_inferenceStream, m_buffers);
|
||||
if (!success) {
|
||||
std::string debugInfo = "[Engine] runInference FAIL: enqueue returned false, batch="
|
||||
+ std::to_string(batchSize)
|
||||
+ ", dimsSpecified=" + (m_context->allInputDimensionsSpecified() ? "YES" : "NO");
|
||||
for (size_t i = 0; i < m_IOTensorNames.size(); ++i) {
|
||||
auto shape = m_context->getTensorShape(m_IOTensorNames[i].c_str());
|
||||
debugInfo += ", tensor'" + m_IOTensorNames[i] + "'=[";
|
||||
for (int j = 0; j < shape.nbDims; ++j) {
|
||||
if (j > 0) debugInfo += ",";
|
||||
debugInfo += std::to_string(shape.d[j]);
|
||||
}
|
||||
debugInfo += "]";
|
||||
}
|
||||
std::cout << debugInfo << std::endl;
|
||||
logEngineEvent(debugInfo, true);
|
||||
return false;
|
||||
}
|
||||
|
||||
for (int batch = 0; batch < batchSize; ++batch) {
|
||||
for (size_t outputIdx = 0; outputIdx < numOutputs; ++outputIdx) {
|
||||
const size_t outputBinding = numInputs + outputIdx;
|
||||
const size_t offset =
|
||||
static_cast<size_t>(batch) * m_outputLengths[outputIdx] * sizeof(T);
|
||||
|
||||
cudaError_t copyErr = cudaMemcpyAsync(
|
||||
featureVectors[batch][outputIdx].data(),
|
||||
static_cast<char*>(m_buffers[outputBinding]) + offset,
|
||||
m_outputLengths[outputIdx] * sizeof(T),
|
||||
cudaMemcpyDeviceToHost,
|
||||
m_inferenceStream);
|
||||
|
||||
if (copyErr != cudaSuccess) {
|
||||
std::string errMsg = "[Engine] runInference FAIL: cudaMemcpyAsync output "
|
||||
+ std::to_string(outputIdx) + " batch " + std::to_string(batch)
|
||||
+ ": " + cudaGetErrorString(copyErr);
|
||||
std::cout << errMsg << std::endl;
|
||||
logEngineEvent(errMsg, true);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
cudaError_t syncErr = cudaStreamSynchronize(m_inferenceStream);
|
||||
if (syncErr != cudaSuccess) {
|
||||
std::string errMsg = "[Engine] runInference FAIL: cudaStreamSynchronize: "
|
||||
+ std::string(cudaGetErrorString(syncErr));
|
||||
std::cout << errMsg << std::endl;
|
||||
logEngineEvent(errMsg, true);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
Reference in New Issue
Block a user