#pragma once #include #include #include #include #include "TRTCompat.h" #include "ANSLicense.h" // ANS_DBG macro for DebugView logging // Per-device mutex for CUDA graph capture. // TRT's enqueueV3 uses shared internal resources (workspace, memory pools) // at the CUDA context level. When two Engine instances on the same GPU // capture graphs concurrently, these cross-stream dependencies violate // graph capture rules ("operation not permitted when stream is capturing"). // This mutex serialises graph captures across all Engine instances on // the same device — subsequent cudaGraphLaunch calls are still concurrent. static std::mutex& graphCaptureMutex() { static std::mutex m; return m; } // ============================================================================ // GPU INFERENCE THROTTLE // ============================================================================ // Global counting semaphore that limits how many Engine instances can execute // CUDA inference simultaneously. Without this, N separate Engine instances // (one per camera) all submit GPU work at once, causing: // 1. SM 100% saturation → each inference takes 5-10x longer // 2. GPU thermal throttling at 85°C → further slowdown // 3. cudaStreamSynchronize blocking indefinitely → system freeze // // Auto-computed from GPU VRAM: // ≤ 4 GB → 2 concurrent 8 GB → 4 concurrent // 6 GB → 3 concurrent 12+ GB → 6 concurrent // Multi-GPU: sum across all GPUs // // Excess threads wait on CPU (nearly zero cost) while the bounded set // runs efficiently on the GPU without thermal throttling. 
static std::counting_semaphore<64>& gpuInferenceSemaphore() { static int maxConcurrent = []() { int totalSlots = 0; int gpuCount = 0; cudaGetDeviceCount(&gpuCount); if (gpuCount <= 0) return 4; // fallback for (int i = 0; i < gpuCount; ++i) { size_t freeMem = 0, totalMem = 0; cudaSetDevice(i); cudaMemGetInfo(&freeMem, &totalMem); int gbTotal = static_cast(totalMem / (1024ULL * 1024ULL * 1024ULL)); // Scale concurrency with VRAM: ~1 slot per 2 GB, min 2, max 6 per GPU int slotsThisGpu = std::clamp(gbTotal / 2, 2, 6); totalSlots += slotsThisGpu; } totalSlots = std::clamp(totalSlots, 2, 64); std::cout << "Info [GPU Throttle]: max concurrent inferences = " << totalSlots << " (across " << gpuCount << " GPU(s))" << std::endl; return totalSlots; }(); static std::counting_semaphore<64> sem(maxConcurrent); return sem; } // RAII guard for the GPU inference semaphore struct GpuInferenceGuard { GpuInferenceGuard() { gpuInferenceSemaphore().acquire(); } ~GpuInferenceGuard() { gpuInferenceSemaphore().release(); } GpuInferenceGuard(const GpuInferenceGuard&) = delete; GpuInferenceGuard& operator=(const GpuInferenceGuard&) = delete; }; // ============================================================================ // WDDM-SAFE STREAM SYNCHRONIZATION // ============================================================================ // Under Windows WDDM, cudaStreamSynchronize calls cuStreamQuery in a tight // loop with SwitchToThread, holding nvcuda64's internal SRW lock the entire // time. When the GPU is busy with inference, this spin blocks ALL other CUDA // operations — including HW video decode (nvcuvid), cuMemAlloc, cuArrayDestroy. // If a camera Reconnect or decode buffer allocation needs an exclusive SRW lock // while inference is spinning, the entire system deadlocks. // // This function replaces cudaStreamSynchronize with a polling loop that // explicitly releases the SRW lock between queries by sleeping briefly. 
// This allows other CUDA operations to interleave with the sync wait. static inline cudaError_t cudaStreamSynchronize_Safe(cudaStream_t stream) { // Fast path: check if already done (no sleep overhead for quick kernels) cudaError_t err = cudaStreamQuery(stream); if (err != cudaErrorNotReady) return err; auto syncStart = std::chrono::steady_clock::now(); // Short Sleep(0) fast path (~10 iterations) catches sub-ms kernel completions. for (int i = 0; i < 10; ++i) { Sleep(0); err = cudaStreamQuery(stream); if (err != cudaErrorNotReady) return err; } // 1ms sleeps — adds negligible latency at 30 FPS but prevents SRW lock starvation. int sleepCount = 0; while (true) { Sleep(1); sleepCount++; err = cudaStreamQuery(stream); if (err != cudaErrorNotReady) { // Log if sync took too long (>500ms indicates GPU stall) auto elapsed = std::chrono::duration( std::chrono::steady_clock::now() - syncStart).count(); if (elapsed > 500.0) { ANS_DBG("TRT_Engine", "SLOW SYNC: %.1fms (%d sleeps) stream=%p err=%d", elapsed, sleepCount, (void*)stream, (int)err); } return err; } } } template void Engine::warmUp(int iterations) { if (m_verbose) { std::cout << "\n========================================" << std::endl; std::cout << "Engine Warmup" << std::endl; std::cout << "========================================" << std::endl; } // Determine batch sizes to warm up std::vector batchSizes; if (m_options.maxBatchSize > 1) { if (m_verbose) { std::cout << "Dynamic batch engine detected (max batch: " << m_options.maxBatchSize << ")" << std::endl; std::cout << "Warming up common batch sizes to pre-compile kernels..." << std::endl; } // Warm up ALL batch sizes from 1 to maxBatchSize. // Each unseen batch size incurs a 100-300ms kernel compilation penalty // on first use. Warming all sizes eliminates that latency at inference // time and ensures every CUDA graph is pre-captured. 
for (int batch = 1; batch <= m_options.maxBatchSize; ++batch) {
            batchSizes.push_back(batch);
        }
    } else {
        if (m_verbose)
            std::cout << "Fixed batch engine detected (batch size: " << m_options.maxBatchSize << ")" << std::endl;
        batchSizes.push_back(m_options.maxBatchSize);
    }

    if (m_verbose) {
        std::cout << "Batch sizes to warm up: ";
        for (size_t i = 0; i < batchSizes.size(); ++i) {
            std::cout << batchSizes[i];
            if (i < batchSizes.size() - 1) std::cout << ", ";
        }
        std::cout << std::endl;
    }

    // Warm up each batch size.
    // The first call triggers kernel compilation; the second captures the CUDA
    // graph. Additional iterations only measure steady-state latency for the
    // optBatchSize (printed as a diagnostic).
    for (int batchSize : batchSizes) {
        const int iters = (batchSize == m_options.optBatchSize) ? iterations : 2;
        if (m_verbose)
            std::cout << "\nWarming up batch=" << batchSize << " (x" << iters << " iterations)..." << std::endl;

        // Create dummy inputs for this batch size.
        // FIX: restored the stripped template arguments on all three vector
        // declarations below (they were "std::vector>", "std::vector" and
        // "std::vector>>" after extraction).
        std::vector<std::vector<cv::cuda::GpuMat>> dummyInputs;
        for (size_t i = 0; i < m_inputDims.size(); ++i) {
            const auto& dims = m_inputDims[i];
            std::vector<cv::cuda::GpuMat> batch;
            // FIXED: Create proper dummy images on GPU
            // For dynamic spatial dims, use opt dimensions for warmup
            int warmH = (dims.d[1] > 0) ? dims.d[1] : m_options.optInputHeight;
            int warmW = (dims.d[2] > 0) ? dims.d[2] : m_options.optInputWidth;
            for (int b = 0; b < batchSize; ++b) {
                // Create on CPU first
                cv::Mat cpuImg(warmH, warmW, CV_32FC(dims.d[0]), cv::Scalar(0.5f, 0.5f, 0.5f));
                // Upload to GPU
                cv::cuda::GpuMat gpuImg;
                gpuImg.upload(cpuImg);
                batch.push_back(gpuImg);
            }
            dummyInputs.push_back(batch);
        }
        std::vector<std::vector<std::vector<T>>> dummyOutputs;

        // Time the first iteration (kernel compilation happens here)
        // FIX: restored <std::chrono::milliseconds> on duration_cast (result
        // is printed with an " ms" suffix below).
        auto start = std::chrono::high_resolution_clock::now();
        bool firstSuccess = runInference(dummyInputs, dummyOutputs);
        auto end = std::chrono::high_resolution_clock::now();
        auto firstTime = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();

        if (!firstSuccess) {
            if (m_verbose)
                std::cout << " ✗ ERROR: First iteration failed for batch=" << batchSize << std::endl;
            continue;
        }

        if (m_verbose) {
            std::cout << " First iteration: " << firstTime << " ms";
            if (firstTime > 100) {
                std::cout << " (kernel compilation detected)";
            }
            std::cout << std::endl;
        }

        // Run remaining iterations to measure stable performance
        if (iters > 1) {
            auto iterStart = std::chrono::high_resolution_clock::now();
            for (int i = 1; i < iters; ++i) {
                bool success = runInference(dummyInputs, dummyOutputs);
                if (!success) {
                    if (m_verbose)
                        std::cout << " ✗ ERROR: Iteration " << i << " failed" << std::endl;
                    break;
                }
            }
            auto iterEnd = std::chrono::high_resolution_clock::now();
            auto totalTime = std::chrono::duration_cast<std::chrono::milliseconds>(iterEnd - iterStart).count();
            // FIX: restored stripped <float>. NOTE(review): if an iteration
            // fails and breaks out early, the average still divides by
            // iters-1 and under-reports — acceptable for a diagnostic print.
            float avgTime = totalTime / static_cast<float>(iters - 1);
            if (m_verbose) {
                std::cout << " Subsequent iterations (avg): " << std::fixed << std::setprecision(1)
                          << avgTime << " ms" << std::endl;
                if (firstTime > 100 && avgTime < firstTime * 0.5f) {
                    float speedup = firstTime / avgTime;
                    std::cout << " ✓ Speedup after warmup: " << std::fixed << std::setprecision(1)
                              << speedup << "x faster" << std::endl;
                }
            }
        }
        if (m_verbose)
            std::cout << " ✓ Batch=" << batchSize << " warmed up successfully" << std::endl;
    }

    if (m_verbose) {
        std::cout << "\n========================================" << std::endl;
        std::cout
<< "Warmup Complete!" << std::endl; std::cout << "========================================" << std::endl; std::cout << "Kernels pre-compiled for all batch sizes." << std::endl; std::cout << "========================================\n" << std::endl; } } template bool Engine::runInference(const std::vector>& inputs,std::vector>>& featureVectors) { // ============================================================================ // MULTI-GPU POOL DISPATCH // ============================================================================ // If this Engine was initialised with initializePool() / initializePoolFromEngine() // the m_slots vector is non-empty. In that case, delegate to the pool // dispatcher which acquires the first idle slot and runs inference there. // This branch is NEVER taken for single-GPU use (buildLoadNetwork / loadNetwork). if (!m_slots.empty()) { return runInferenceFromPool(inputs, featureVectors); } // ============================================================================ // GPU INFERENCE THROTTLE // ============================================================================ // Limit how many Engine instances can run CUDA inference simultaneously. // Without this, 12 cameras each with their own Engine all submit GPU work // at once → SM 100% → thermal throttle → cudaStreamSynchronize hangs. // The semaphore lets excess threads wait on CPU (nearly zero cost) while // a bounded number use the GPU efficiently. GpuInferenceGuard gpuThrottle; // ============================================================================ // SINGLE-ENGINE SERIALISATION // ============================================================================ // The single Engine instance has shared mutable state (m_buffers, m_lastBatchSize, // m_inferenceStream, TRT execution context). 
If two LabVIEW threads call // runInference concurrently with different batch sizes, one will overwrite // the input shapes and buffers while the other is mid-inference, causing a // fatal "illegal memory access" that permanently corrupts the CUDA context. // // Pool-mode slots have their own busy-flag dispatch so they do NOT need this. std::lock_guard inferenceLock(m_inferenceMutex); // ============================================================================ // THREAD-SAFE GPU CONTEXT // ============================================================================ // Ensure the calling thread's CUDA device matches this engine's GPU. // This is essential for multi-GPU round-robin: LabVIEW reuses threads // across tasks, so a thread that last ran inference on GPU 1 might now // be running a task on GPU 0. Without this, cv::cuda::GpuMat allocations // and kernel launches would target the wrong GPU, causing result corruption. // Skip cudaSetDevice if already on the correct device — under WDDM // with multiple GPUs each call costs 1-5ms of scheduler overhead. 
{ int currentDev = -1; cudaGetDevice(¤tDev); if (currentDev != m_options.deviceIndex) { cudaSetDevice(m_options.deviceIndex); } } // ============================================================================ // DEBUG: First call diagnostics (per-instance, not process-wide) // ============================================================================ if (m_verbose && m_firstInferenceCall) { std::cout << "\n=== First runInference Call ===" << std::endl; std::cout << "Number of input tensors: " << inputs.size() << std::endl; for (size_t i = 0; i < inputs.size(); ++i) { std::cout << "Input " << i << " batch size: " << inputs[i].size() << std::endl; if (!inputs[i].empty()) { const auto& img = inputs[i][0]; std::cout << " Image shape: " << img.cols << "x" << img.rows << "x" << img.channels() << " (type: " << img.type() << ")" << std::endl; } } // Print optimization profile information std::cout << "\n=== Engine Profile Information ===" << std::endl; std::cout << "Number of optimization profiles: " << m_engine->getNbOptimizationProfiles() << std::endl; if (m_engine->getNbOptimizationProfiles() > 0) { for (int profile = 0; profile < m_engine->getNbOptimizationProfiles(); ++profile) { std::cout << "\n--- Profile " << profile << " ---" << std::endl; for (size_t i = 0; i < m_IOTensorNames.size(); ++i) { const char* tensorName = m_IOTensorNames[i].c_str(); // Check if this is an input tensor auto ioMode = m_engine->getTensorIOMode(tensorName); if (ioMode != nvinfer1::TensorIOMode::kINPUT) { continue; } auto minDims = m_engine->getProfileShape(tensorName, profile, nvinfer1::OptProfileSelector::kMIN); auto optDims = m_engine->getProfileShape(tensorName, profile, nvinfer1::OptProfileSelector::kOPT); auto maxDims = m_engine->getProfileShape(tensorName, profile, nvinfer1::OptProfileSelector::kMAX); std::cout << "Tensor '" << tensorName << "' (INPUT):" << std::endl; std::cout << " Min: [" << minDims.d[0]; for (int j = 1; j < minDims.nbDims; ++j) std::cout << "," << minDims.d[j]; 
std::cout << "]" << std::endl; std::cout << " Opt: [" << optDims.d[0]; for (int j = 1; j < optDims.nbDims; ++j) std::cout << "," << optDims.d[j]; std::cout << "]" << std::endl; std::cout << " Max: [" << maxDims.d[0]; for (int j = 1; j < maxDims.nbDims; ++j) std::cout << "," << maxDims.d[j]; std::cout << "]" << std::endl; } } } if (!m_context->allInputDimensionsSpecified()) { std::cout << "ERROR: Input dimensions not specified in context!" << std::endl; return false; } std::cout << "\nContext state: All dimensions specified ✓" << std::endl; m_firstInferenceCall = false; } // ============================================================================ // INPUT VALIDATION // ============================================================================ if (inputs.empty() || inputs[0].empty()) { std::cout << "Error: Empty input" << std::endl; return false; } // ============================================================================ // SM=100% DETECTOR — tracks inference timing trends to catch the exact // moment GPU becomes saturated. Logs every 50 inferences with rolling // average, and immediately when degradation is detected. 
// ============================================================================ // Global (process-wide) counters shared across all engine instances/threads static std::atomic s_globalInfCount{0}; static std::atomic s_globalActiveInf{0}; // currently in-flight inferences static std::atomic s_globalLastAvgMs{0.0}; // last known avg inference time const int64_t myInfNum = s_globalInfCount.fetch_add(1) + 1; s_globalActiveInf.fetch_add(1); // Per-thread tracking { static thread_local int64_t s_infCount = 0; static thread_local std::chrono::steady_clock::time_point s_lastLog; static thread_local double s_rollingAvgMs = 0.0; static thread_local double s_baselineMs = 0.0; // avg during first 100 inferences static thread_local double s_maxMs = 0.0; static thread_local bool s_degradationLogged = false; s_infCount++; if (s_infCount == 1) { s_lastLog = std::chrono::steady_clock::now(); ANS_DBG("TRT_SM100", "FIRST inference — engine alive, globalInf=%lld", myInfNum); } // Log every 50 inferences (more frequent than 500 to catch transitions) if (s_infCount % 50 == 0) { auto now = std::chrono::steady_clock::now(); double elapsed = std::chrono::duration(now - s_lastLog).count(); double fps = (elapsed > 0) ? 
(50.0 / elapsed) : 0; s_lastLog = now; size_t vramFree = 0, vramTotal = 0; cudaMemGetInfo(&vramFree, &vramTotal); size_t vramUsedMB = (vramTotal - vramFree) / (1024 * 1024); size_t vramFreeMB = vramFree / (1024 * 1024); ANS_DBG("TRT_SM100", "#%lld [global=%lld active=%d] %.1f inf/sec avgMs=%.1f maxMs=%.1f batch=%d graphs=%zu VRAM=%zuMB/%zuMB", s_infCount, myInfNum, s_globalActiveInf.load(), fps, s_rollingAvgMs, s_maxMs, (int)inputs[0].size(), m_graphExecs.size(), vramUsedMB, vramFreeMB); // Capture baseline from first 100 inferences if (s_infCount == 100) { s_baselineMs = s_rollingAvgMs; ANS_DBG("TRT_SM100", "BASELINE established: %.1fms/inference", s_baselineMs); } // Detect degradation: avg >3x baseline AND baseline is set if (s_baselineMs > 0 && s_rollingAvgMs > s_baselineMs * 3.0 && !s_degradationLogged) { s_degradationLogged = true; ANS_DBG("TRT_SM100", "*** DEGRADATION DETECTED *** avg=%.1fms baseline=%.1fms (%.1fx) VRAM=%zuMB/%zuMB active=%d", s_rollingAvgMs, s_baselineMs, s_rollingAvgMs / s_baselineMs, vramUsedMB, vramFreeMB, s_globalActiveInf.load()); } // Reset max for next window s_maxMs = 0.0; } } const auto numInputs = m_inputDims.size(); if (inputs.size() != numInputs) { std::cout << "Error: Wrong number of inputs. Expected: " << numInputs << ", Got: " << inputs.size() << std::endl; return false; } const auto batchSize = static_cast(inputs[0].size()); if (batchSize > m_options.maxBatchSize) { std::cout << "Error: Batch size " << batchSize << " exceeds maximum " << m_options.maxBatchSize << std::endl; return false; } if (batchSize < 1) { std::cout << "Error: Batch size must be at least 1" << std::endl; return false; } // Validate batch size consistency across all inputs for (size_t i = 1; i < inputs.size(); ++i) { if (inputs[i].size() != static_cast(batchSize)) { std::cout << "Error: Inconsistent batch sizes across inputs. 
Input 0: " << batchSize << ", Input " << i << ": " << inputs[i].size() << std::endl; return false; } } // ============================================================================ // STREAM GUARD // ============================================================================ // m_inferenceStream is now created eagerly in loadNetwork() so it is always // valid here. Guard against the (unlikely) edge case where runInference is // called before loadNetwork succeeds. if (!m_streamInitialized || !m_inferenceStream) { std::string errMsg = "Error: Inference stream not initialised. " "Call loadNetwork() / buildLoadNetwork() before runInference()."; std::cout << errMsg << std::endl; logEngineEvent("[Engine] runInference: " + errMsg, true); return false; } // ============================================================================ // SET INPUT SHAPES (batch size changed OR dynamic spatial dims need updating) // ============================================================================ // Fast path: compute desired dims first, then compare against cached dims. // This avoids all TRT API calls when the shape hasn't actually changed — // critical for the recognizer which is called ~50-100x per image with // dynamic width but often the same or similar widths. 
// ============================================================================ { // Lazily initialise the dims cache on first call if (m_lastSetInputDims.empty()) { m_lastSetInputDims.resize(numInputs); for (size_t i = 0; i < numInputs; ++i) { m_lastSetInputDims[i].nbDims = 0; // force mismatch on first call } } // Build desired dims for every input tensor (cheap — no TRT API calls) bool anyDimChanged = (m_lastBatchSize != batchSize); std::vector desiredDims(numInputs); for (size_t i = 0; i < numInputs; ++i) { nvinfer1::Dims& nd = desiredDims[i]; nd.nbDims = 4; nd.d[0] = batchSize; nd.d[1] = m_inputDims[i].d[0]; // channels if (m_hasDynamicSpatialDims && !inputs[i].empty()) { const auto& firstImg = inputs[i][0]; nd.d[2] = (m_inputDims[i].d[1] == -1) ? firstImg.rows : m_inputDims[i].d[1]; nd.d[3] = (m_inputDims[i].d[2] == -1) ? firstImg.cols : m_inputDims[i].d[2]; } else { nd.d[2] = m_inputDims[i].d[1]; nd.d[3] = m_inputDims[i].d[2]; } // Compare with cached if (!anyDimChanged) { const auto& cached = m_lastSetInputDims[i]; if (cached.nbDims != nd.nbDims || cached.d[0] != nd.d[0] || cached.d[1] != nd.d[1] || cached.d[2] != nd.d[2] || cached.d[3] != nd.d[3]) { anyDimChanged = true; } } } if (anyDimChanged) { ANS_DBG("TRT_Engine", "Shape change detected: batch %d -> %d (graphsCached=%zu)", m_lastBatchSize, batchSize, m_graphExecs.size()); // === First-time diagnostics (verbose, once) === const bool firstTime = !m_batchShapeChangeLogged; if (m_verbose && firstTime) { std::cout << "\nInfo: Batch size change: " << m_lastBatchSize << " -> " << batchSize << std::endl; } // Set optimization profile (only when truly needed) if (m_engine->getNbOptimizationProfiles() > 0) { int currentProfile = m_context->getOptimizationProfile(); if (currentProfile != 0 || m_lastBatchSize < 0) { if (m_verbose && firstTime) { std::cout << " Setting optimization profile to 0..." 
<< std::endl; } if (!m_context->setOptimizationProfileAsync(0, m_inferenceStream)) { std::cout << "Error: Failed to set optimization profile 0" << std::endl; return false; } cudaError_t syncErr = cudaStreamSynchronize_Safe(m_inferenceStream); if (syncErr != cudaSuccess) { std::cout << "Error: Failed to sync after profile change: " << cudaGetErrorString(syncErr) << std::endl; return false; } if (m_verbose && firstTime) { std::cout << " Optimization profile set successfully" << std::endl; } } } // Update shapes for input tensors that actually changed for (size_t i = 0; i < numInputs; ++i) { const char* tensorName = m_IOTensorNames[i].c_str(); // Skip non-input tensors auto ioMode = m_engine->getTensorIOMode(tensorName); if (ioMode != nvinfer1::TensorIOMode::kINPUT) continue; const nvinfer1::Dims& newDims = desiredDims[i]; const nvinfer1::Dims& cached = m_lastSetInputDims[i]; // Skip this tensor if its dims haven't changed if (cached.nbDims == newDims.nbDims && cached.d[0] == newDims.d[0] && cached.d[1] == newDims.d[1] && cached.d[2] == newDims.d[2] && cached.d[3] == newDims.d[3]) { continue; } // First-time verbose diagnostics if (m_verbose && firstTime) { std::cout << "\n Processing tensor " << i << ": '" << tensorName << "'" << std::endl; // Validate batch size range (first time only) if (m_engine->getNbOptimizationProfiles() > 0) { int profileIndex = m_context->getOptimizationProfile(); nvinfer1::Dims minDims = m_engine->getProfileShape( tensorName, profileIndex, nvinfer1::OptProfileSelector::kMIN); nvinfer1::Dims maxDims = m_engine->getProfileShape( tensorName, profileIndex, nvinfer1::OptProfileSelector::kMAX); std::cout << " Profile batch range: [" << minDims.d[0] << " to " << maxDims.d[0] << "]" << std::endl; if (batchSize < minDims.d[0] || batchSize > maxDims.d[0]) { std::cout << "Error: Batch size " << batchSize << " outside profile range" << std::endl; return false; } } auto currentShape = m_context->getTensorShape(tensorName); std::cout << " Current context 
shape: ["; for (int j = 0; j < currentShape.nbDims; ++j) { if (j > 0) std::cout << ", "; std::cout << currentShape.d[j]; } std::cout << "]" << std::endl; std::cout << " Setting new shape: [" << newDims.d[0] << ", " << newDims.d[1] << ", " << newDims.d[2] << ", " << newDims.d[3] << "]" << std::endl; } ANS_DBG("TRT_Engine", "setInputShape('%s') [%d,%d,%d,%d]", tensorName, newDims.d[0], newDims.d[1], newDims.d[2], newDims.d[3]); if (!m_context->setInputShape(tensorName, newDims)) { ANS_DBG("TRT_Engine", "ERROR: setInputShape FAILED for '%s'", tensorName); std::cout << "Error: Failed to set input shape for '" << tensorName << "'" << std::endl; return false; } // Verify shape (first time only — trust the API on hot path) if (firstTime) { auto verifyShape = m_context->getTensorShape(tensorName); if (verifyShape.d[0] != batchSize) { std::cout << "Error: Shape change didn't take effect. Expected batch " << batchSize << ", got " << verifyShape.d[0] << std::endl; return false; } if (m_verbose) { std::cout << " Shape updated successfully" << std::endl; } } m_lastSetInputDims[i] = newDims; } // Verify all input dimensions specified (first time only) if (firstTime) { if (!m_context->allInputDimensionsSpecified()) { std::cout << "Error: Not all input dimensions specified after shape change" << std::endl; for (size_t i = 0; i < m_IOTensorNames.size(); ++i) { auto shape = m_context->getTensorShape(m_IOTensorNames[i].c_str()); std::cout << " " << m_IOTensorNames[i] << ": ["; for (int j = 0; j < shape.nbDims; ++j) { if (j > 0) std::cout << ", "; std::cout << shape.d[j]; } std::cout << "]" << std::endl; } return false; } } m_lastBatchSize = batchSize; m_batchShapeChangeLogged = true; // CRITICAL: Invalidate all cached CUDA graphs after shape change. // Graphs were captured with the OLD context state (old tensor shapes). // Launching them after setInputShape() produces undefined GPU behavior // (invalid kernel sequences, SM lockup at 100%, inference hang). 
if (!m_graphExecs.empty()) { size_t destroyed = m_graphExecs.size(); for (auto& [bs, ge] : m_graphExecs) { if (ge) cudaGraphExecDestroy(ge); } m_graphExecs.clear(); ANS_DBG("TRT_Engine", "INVALIDATED %zu cached CUDA graphs after shape change (batch=%d)", destroyed, batchSize); if (m_verbose || firstTime) { std::cout << "Info: Invalidated " << destroyed << " cached CUDA graphs after shape change" << std::endl; } } if (m_verbose && firstTime) { std::cout << "\nInfo: Input shapes updated successfully for batch size " << batchSize << " ✓\n" << std::endl; } } } // ============================================================================ // PREPROCESS AND COPY INPUTS TO GPU BUFFERS // ============================================================================ // Pass 1: Validate all input dimensions before any GPU work. // Dynamic dims (-1) are skipped in validation (they accept any size). for (size_t i = 0; i < numInputs; ++i) { const auto& batchInput = inputs[i]; const auto& dims = m_inputDims[i]; if (!batchInput.empty()) { const auto& firstImg = batchInput[0]; bool mismatch = false; if (dims.d[0] > 0 && firstImg.channels() != dims.d[0]) mismatch = true; if (dims.d[1] > 0 && firstImg.rows != dims.d[1]) mismatch = true; if (dims.d[2] > 0 && firstImg.cols != dims.d[2]) mismatch = true; if (mismatch) { std::cout << "Error: Input " << i << " dimension mismatch!" << std::endl; std::cout << " Expected: " << dims.d[2] << "x" << dims.d[1] << "x" << dims.d[0] << " (WxHxC, -1=dynamic)" << std::endl; std::cout << " Got: " << firstImg.cols << "x" << firstImg.rows << "x" << firstImg.channels() << " (WxHxC)" << std::endl; return false; } } } // Pass 2: Preprocess + D2D copies — all on m_inferenceStream (no null stream). // // All OpenCV CUDA ops (convertTo, subtract, divide, split) in blobFromGpuMats // now run on m_inferenceStream via the cv::cuda::Stream wrapper. 
This means: // • No null-stream interaction — eliminates global sync barriers on WDDM // • No event bridge needed — same-stream ordering guarantees correctness // • CUDA graphs are safe — cv::cuda::split runs BEFORE graph capture // // GpuMat-lifetime: preprocessedBuffers keeps GpuMats alive past the final // cudaStreamSynchronize, so cudaFree() doesn't stall the pipeline. auto _prepStart = std::chrono::steady_clock::now(); cv::cuda::Stream cvInferStream = cv::cuda::StreamAccessor::wrapStream(m_inferenceStream); std::vector preprocessedBuffers; preprocessedBuffers.reserve(numInputs); for (size_t i = 0; i < numInputs; ++i) { const auto& batchInput = inputs[i]; // Preprocess on m_inferenceStream (not the null stream). preprocessedBuffers.push_back( blobFromGpuMats(batchInput, m_subVals, m_divVals, m_normalize, false, cvInferStream)); // D2D copy: same stream as preprocessing, so ordering is guaranteed. const auto& blobMat = preprocessedBuffers.back(); const size_t copySize = static_cast(blobMat.rows) * static_cast(blobMat.cols) * blobMat.elemSize(); cudaError_t copyErr = cudaMemcpyAsync( m_buffers[i], preprocessedBuffers.back().ptr(), copySize, cudaMemcpyDeviceToDevice, m_inferenceStream); if (copyErr != cudaSuccess) { std::cout << "Error: Failed to copy input " << i << " to inference buffer: " << cudaGetErrorString(copyErr) << std::endl; return false; } } { double _prepMs = std::chrono::duration( std::chrono::steady_clock::now() - _prepStart).count(); if (_prepMs > 100.0) { ANS_DBG("TRT_SM100", "SLOW PREPROCESS: %.1fms batch=%d (blobFromGpuMats+D2D)", _prepMs, batchSize); } } // ============================================================================ // PRE-ALLOCATE OUTPUT STRUCTURE // ============================================================================ const size_t numOutputs = m_outputLengths.size(); featureVectors.clear(); featureVectors.resize(batchSize); for (int batch = 0; batch < batchSize; ++batch) { featureVectors[batch].resize(numOutputs); for 
(size_t outputIdx = 0; outputIdx < numOutputs; ++outputIdx) featureVectors[batch][outputIdx].resize(m_outputLengths[outputIdx]); } // ============================================================================ // RUN INFERENCE + COPY OUTPUTS (CUDA Graph path or direct path) // ============================================================================ // CUDA Graph path // --------------- // On the first call for a given batchSize we capture enqueueV3 + D2H copies // into a reusable graph. Subsequent calls use cudaGraphLaunch, replacing // many individual kernel-submission API calls with a single launch. // // Prerequisites satisfied here: // • Preprocessing + D2D copies are queued on m_inferenceStream (same-stream // ordering guarantees they complete before captured kernels execute) // • m_pinnedOutputBuffers has stable addresses (allocated in loadNetwork) // • m_buffers (GPU outputs) have stable addresses (allocated in loadNetwork) // // Falls back to the direct path if pinned buffers are unavailable or if // graph capture/instantiation fails for any reason. // CUDA graphs capture fixed kernel sequences; incompatible with dynamic spatial dims // (input H/W change per inference call → different TRT kernel plans each time). // Disabled for pool slots — concurrent graph captures on the same GPU corrupt the // CUDA context ("operation not permitted when stream is capturing"). const bool canGraph = !m_disableGraphs && !m_pinnedOutputBuffers.empty() && !m_hasDynamicSpatialDims; bool graphUsed = false; if (canGraph) { auto& graphExec = m_graphExecs[batchSize]; // inserts nullptr on first access if (!graphExec) { ANS_DBG("TRT_Engine", "CUDA graph CAPTURE starting for batch=%d (cached=%zu)", batchSize, m_graphExecs.size()); // First call for this batchSize -- capture a new graph. // Serialise captures across all Engine instances on this device to // prevent TRT's shared workspace from creating cross-stream // dependencies that violate CUDA graph capture rules. 
std::lock_guard captureLock(graphCaptureMutex()); // Clear any sticky CUDA error from a prior failed capture so that // this attempt starts clean. cudaGetLastError(); cudaGraph_t graph = nullptr; bool captureOk = false; if (cudaStreamBeginCapture(m_inferenceStream, cudaStreamCaptureModeRelaxed) == cudaSuccess) { // Record TRT kernels into the graph (not executed yet). TRT_ENQUEUE(m_context.get(), m_inferenceStream, m_buffers); // Record D2H copies to stable pinned addresses. for (size_t outputIdx = 0; outputIdx < numOutputs; ++outputIdx) { cudaMemcpyAsync( m_pinnedOutputBuffers[outputIdx], static_cast(m_buffers[numInputs + outputIdx]), static_cast(batchSize) * m_outputLengths[outputIdx] * sizeof(T), cudaMemcpyDeviceToHost, m_inferenceStream); } captureOk = (cudaStreamEndCapture(m_inferenceStream, &graph) == cudaSuccess && graph != nullptr); } if (captureOk) { cudaGraphExec_t exec = nullptr; if (cudaGraphInstantiate(&exec, graph, nullptr, nullptr, 0) == cudaSuccess) graphExec = exec; cudaGraphDestroy(graph); ANS_DBG("TRT_Engine", "CUDA graph CAPTURED OK for batch=%d exec=%p", batchSize, (void*)graphExec); } if (!graphExec) { ANS_DBG("TRT_Engine", "CUDA graph capture FAILED for batch=%d — falling back to direct path", batchSize); std::cout << "Warning: CUDA graph capture failed for batchSize=" << batchSize << " -- falling back to direct inference path." << std::endl; // Disable graph acceleration for this Engine instance. for (T* p : m_pinnedOutputBuffers) { if (p) cudaFreeHost(p); } m_pinnedOutputBuffers.clear(); m_graphExecs.erase(batchSize); } } if (graphExec) { ANS_DBG("TRT_Engine", "CUDA graph LAUNCH batch=%d exec=%p", batchSize, (void*)graphExec); // Launch the pre-captured graph (single API call replaces many). 
auto _graphStart = std::chrono::steady_clock::now(); cudaGraphLaunch(graphExec, m_inferenceStream); cudaStreamSynchronize_Safe(m_inferenceStream); auto _graphEnd = std::chrono::steady_clock::now(); double _graphMs = std::chrono::duration(_graphEnd - _graphStart).count(); if (_graphMs > 500.0) { ANS_DBG("TRT_SM100", "SLOW GRAPH: %.1fms batch=%d active=%d", _graphMs, batchSize, s_globalActiveInf.load()); } // CPU memcpy: pinned buffers -> featureVectors (interleaved by batch). for (int batch = 0; batch < batchSize; ++batch) { for (size_t outputIdx = 0; outputIdx < numOutputs; ++outputIdx) { std::memcpy( featureVectors[batch][outputIdx].data(), m_pinnedOutputBuffers[outputIdx] + static_cast(batch) * m_outputLengths[outputIdx], m_outputLengths[outputIdx] * sizeof(T)); } } graphUsed = true; } } // Direct path (no graph) // ---------------------- // Used when pinned buffers are unavailable or graph capture failed. if (!graphUsed) { ANS_DBG("TRT_Engine", "Direct path (no graph) batch=%d", batchSize); auto enqueueStart = std::chrono::steady_clock::now(); bool success = TRT_ENQUEUE(m_context.get(), m_inferenceStream, m_buffers); auto enqueueEnd = std::chrono::steady_clock::now(); double enqueueMs = std::chrono::duration(enqueueEnd - enqueueStart).count(); if (enqueueMs > 500.0) { ANS_DBG("TRT_Engine", "SLOW ENQUEUE: %.1fms batch=%d (enqueueV3 blocked!)", enqueueMs, batchSize); } if (!success) { ANS_DBG("TRT_Engine", "ERROR: enqueueV3 FAILED batch=%d", batchSize); std::string debugInfo = "[Engine] runInference FAIL: enqueue returned false, batch=" + std::to_string(batchSize) + ", dimsSpecified=" + (m_context->allInputDimensionsSpecified() ? 
"YES" : "NO"); for (size_t i = 0; i < m_IOTensorNames.size(); ++i) { auto shape = m_context->getTensorShape(m_IOTensorNames[i].c_str()); debugInfo += ", tensor'" + m_IOTensorNames[i] + "'=["; for (int j = 0; j < shape.nbDims; ++j) { if (j > 0) debugInfo += ","; debugInfo += std::to_string(shape.d[j]); } debugInfo += "]"; } std::cout << debugInfo << std::endl; logEngineEvent(debugInfo, true); return false; } for (int batch = 0; batch < batchSize; ++batch) { for (size_t outputIdx = 0; outputIdx < numOutputs; ++outputIdx) { const size_t outputBinding = numInputs + outputIdx; const size_t offset = static_cast(batch) * m_outputLengths[outputIdx] * sizeof(T); cudaError_t copyErr = cudaMemcpyAsync( featureVectors[batch][outputIdx].data(), static_cast(m_buffers[outputBinding]) + offset, m_outputLengths[outputIdx] * sizeof(T), cudaMemcpyDeviceToHost, m_inferenceStream); if (copyErr != cudaSuccess) { std::string errMsg = "[Engine] runInference FAIL: cudaMemcpyAsync output " + std::to_string(outputIdx) + " batch " + std::to_string(batch) + ": " + cudaGetErrorString(copyErr); std::cout << errMsg << std::endl; logEngineEvent(errMsg, true); return false; } } } auto syncStart = std::chrono::steady_clock::now(); cudaError_t syncErr = cudaStreamSynchronize_Safe(m_inferenceStream); auto syncEnd = std::chrono::steady_clock::now(); double syncMs = std::chrono::duration(syncEnd - syncStart).count(); if (syncMs > 500.0) { ANS_DBG("TRT_Engine", "SLOW INFERENCE SYNC: %.1fms batch=%d (direct path)", syncMs, batchSize); } if (syncErr != cudaSuccess) { ANS_DBG("TRT_Engine", "ERROR: cudaStreamSync FAILED err=%d (%s)", (int)syncErr, cudaGetErrorString(syncErr)); std::string errMsg = "[Engine] runInference FAIL: cudaStreamSynchronize: " + std::string(cudaGetErrorString(syncErr)); std::cout << errMsg << std::endl; logEngineEvent(errMsg, true); return false; } } // ============================================================================ // SM=100% DETECTOR — end-of-inference timing // 
============================================================================ { static thread_local double s_ema = 0; static thread_local std::chrono::steady_clock::time_point s_prevEnd; static thread_local bool s_firstDone = false; auto _now = std::chrono::steady_clock::now(); if (s_firstDone) { double sinceLastMs = std::chrono::duration(_now - s_prevEnd).count(); // If time between consecutive inferences jumps dramatically, // something blocked the thread (SM=100% or mutex contention) if (s_ema > 0 && sinceLastMs > s_ema * 3.0 && sinceLastMs > 500.0) { size_t vf = 0, vt = 0; cudaMemGetInfo(&vf, &vt); ANS_DBG("TRT_SM100", "GAP DETECTED: %.1fms between inferences (avg=%.1fms, %.1fx) active=%d VRAM=%zuMB free", sinceLastMs, s_ema, sinceLastMs / s_ema, s_globalActiveInf.load(), vf / (1024*1024)); } s_ema = (s_ema == 0) ? sinceLastMs : (0.9 * s_ema + 0.1 * sinceLastMs); } s_prevEnd = _now; s_firstDone = true; s_globalActiveInf.fetch_sub(1); } return true; }