diff --git a/engines/TensorRTAPI/include/engine/EngineBuildLoadNetwork.inl b/engines/TensorRTAPI/include/engine/EngineBuildLoadNetwork.inl index f594c63..b25d5e8 100644 --- a/engines/TensorRTAPI/include/engine/EngineBuildLoadNetwork.inl +++ b/engines/TensorRTAPI/include/engine/EngineBuildLoadNetwork.inl @@ -267,7 +267,6 @@ bool Engine::buildLoadNetwork(std::string onnxModelPath, const std::array::buildLoadNetwork(std::string onnxModelPath, const std::array> 20) + " MiB free): " + engineName, true); return false; } // Check if the failure was due to VRAM exhaustion vs. corrupt file. @@ -301,17 +296,11 @@ bool Engine::buildLoadNetwork(std::string onnxModelPath, const std::array::buildLoadNetwork(std::string onnxModelPath, const std::array::buildLoadNetwork(std::string onnxModelPath, const std::arraygetNbInputs(); @@ -718,7 +702,6 @@ bool Engine::loadNetwork(std::string trtModelPath, const std::array // ============================================================================ if (!Util::doesFileExist(trtModelPath)) { - logEngineEvent("[Engine] loadNetwork FAIL: Engine file not found: " + trtModelPath, true); return false; } @@ -727,13 +710,11 @@ bool Engine::loadNetwork(std::string trtModelPath, const std::array { std::ifstream file(trtModelPath, std::ios::binary | std::ios::ate); if (!file.is_open()) { - logEngineEvent("[Engine] loadNetwork FAIL: Cannot open engine file: " + trtModelPath, true); return false; } std::streamsize size = file.tellg(); if (size <= 0) { - logEngineEvent("[Engine] loadNetwork FAIL: Engine file is empty (0 bytes): " + trtModelPath, true); return false; } @@ -741,7 +722,6 @@ bool Engine::loadNetwork(std::string trtModelPath, const std::array std::vector buffer(size); if (!file.read(buffer.data(), size)) { - logEngineEvent("[Engine] loadNetwork FAIL: Read error on engine file: " + trtModelPath, true); return false; } @@ -761,7 +741,6 @@ bool Engine::loadNetwork(std::string trtModelPath, const std::array m_runtime = std::shared_ptr{ nvinfer1::createInferRuntime(m_logger) }; if (!m_runtime) { - logEngineEvent("[Engine] loadNetwork FAIL: createInferRuntime returned null for " + trtModelPath, true); return false; } @@ -830,17 +809,8 @@ bool Engine::loadNetwork(std::string trtModelPath, const std::array constexpr size_t kMinFreeBytes = 256ULL * 1024 * 1024; // 256 MiB minimum if (memErr != cudaSuccess) { // cudaMemGetInfo failed — CUDA context may not be initialized on this thread. - // Log but don't reject: let TRT try to deserialize (it may succeed). - logEngineEvent("[Engine] loadNetwork WARNING: cudaMemGetInfo failed (" - + std::string(cudaGetErrorString(memErr)) + ") on GPU[" - + std::to_string(m_options.deviceIndex) + "] — skipping VRAM check for " - + trtModelPath, true); + // Don't reject: let TRT try to deserialize (it may succeed). } else if (freeVRAM < kMinFreeBytes) { - logEngineEvent("[Engine] loadNetwork FAIL: GPU[" + std::to_string(m_options.deviceIndex) - + "] only " + std::to_string(freeVRAM / (1024 * 1024)) - + " MiB free / " + std::to_string(totalVRAM / (1024 * 1024)) - + " MiB total (need " + std::to_string(kMinFreeBytes / (1024 * 1024)) - + " MiB) for " + trtModelPath, true); m_lastLoadFailedVRAM = true; // signal to buildLoadNetwork: engine file is NOT corrupt return false; } @@ -861,13 +831,9 @@ bool Engine::loadNetwork(std::string trtModelPath, const std::array deserializeCudaEngineSafe(m_runtime.get(), buffer.data(), buffer.size(), &sehCodeDeserialize)); if (sehCodeDeserialize != 0) { - logEngineEvent("[Engine] loadNetwork FAIL: deserializeCudaEngine CRASHED (SEH " - + formatCrashCode(sehCodeDeserialize) + ") for " + trtModelPath, true); return false; } if (!m_engine) { - logEngineEvent("[Engine] loadNetwork FAIL: deserializeCudaEngine returned null for " - + trtModelPath + " (file size=" + std::to_string(size / (1024*1024)) + " MiB)", true); return false; } @@ -1018,8 +984,6 @@ trt_cache_create_context: m_context = std::unique_ptr(m_engine->createExecutionContext()); if (!m_context) { ANS_DBG("TRT_Load", "ERROR: createExecutionContext returned null"); - logEngineEvent("[Engine] loadNetwork FAIL: createExecutionContext returned null for " - + trtModelPath, true); return false; } @@ -1106,9 +1070,6 @@ trt_cache_create_context: // Allocate GPU memory cudaError_t err = cudaMalloc(&m_buffers[i], requestedMemory); if (err != cudaSuccess) { - logEngineEvent("[Engine] loadNetwork FAIL: cudaMalloc input buffer (" - + std::to_string(requestedMemory / (1024*1024)) + " MiB): " - + cudaGetErrorString(err) + " for " + trtModelPath, true); return false; } @@ -1179,9 +1140,6 @@ trt_cache_create_context: // Allocate GPU memory cudaError_t err = cudaMalloc(&m_buffers[i], requestedMemory); if (err != cudaSuccess) { - logEngineEvent("[Engine] loadNetwork FAIL: cudaMalloc output buffer (" - + std::to_string(requestedMemory / (1024*1024)) + " MiB): " - + cudaGetErrorString(err) + " for " + trtModelPath, true); return false; } @@ -1534,9 +1492,6 @@ bool Engine::build(std::string onnxModelPath, const std::array& sub auto parsed = parseOnnxModelSafe(parser.get(), buffer.data(), buffer.size(), &sehCodeParse); if (sehCodeParse != 0) { - std::cout << "[Engine] FATAL: ONNX parser crashed (" - << formatCrashCode(sehCodeParse) << ")" << std::endl; - std::cout << "[Engine] This may indicate a corrupt ONNX file or driver issue." << std::endl; return false; } if (!parsed) { @@ -2317,12 +2272,6 @@ bool Engine::build(std::string onnxModelPath, const std::array& sub auto endTime = std::chrono::high_resolution_clock::now(); if (sehCodeBuild != 0) { - std::cout << "\n========================================" << std::endl; - std::cout << "Build CRASHED!" << std::endl; - std::cout << "========================================" << std::endl; - std::cout << "[Engine] FATAL: buildSerializedNetwork crashed (" - << formatCrashCode(sehCodeBuild) << ")" << std::endl; - std::cout << "[Engine] This typically indicates insufficient GPU memory or a driver crash." << std::endl; Util::checkCudaErrorCode(cudaStreamDestroy(profileStream)); return false; } @@ -2478,9 +2427,6 @@ bool Engine::buildWithRetry(std::string onnxModelPath, bool retryParsed = parseOnnxModelSafe(tempParser.get(), onnxBuffer.data(), onnxBuffer.size(), &sehRetryParse); if (sehRetryParse != 0) { - std::cout << "[Engine] WARNING: ONNX pre-analysis parse crashed in " - << "buildWithRetry (" << formatCrashCode(sehRetryParse) - << "). Skipping spatial analysis." << std::endl; // hasDynamicSpatial stays false → single build() attempt } else if (retryParsed && tempNetwork->getNbInputs() > 0) { @@ -2501,8 +2447,6 @@ bool Engine::buildWithRetry(std::string onnxModelPath, unsigned long sehBuild = 0; bool ok = buildSafe(onnxModelPath, subVals, divVals, normalize, &sehBuild); if (sehBuild != 0) { - std::cout << "[Engine] FATAL: build() crashed in buildWithRetry (" - << formatCrashCode(sehBuild) << ")" << std::endl; return false; } return ok; @@ -2557,40 +2501,17 @@ bool Engine::buildWithRetry(std::string onnxModelPath, for (size_t attempt = 0; attempt < candidates.size(); ++attempt) { setCandidateOptions(candidates[attempt]); - std::cout << "[Engine] buildWithRetry attempt " << (attempt + 1) - << "/" << candidates.size() << " (max " - << m_options.maxInputHeight << "x" - << m_options.maxInputWidth << ")" << std::endl; - { unsigned long sehAttempt = 0; bool attemptOk = buildSafe(onnxModelPath, subVals, divVals, normalize, &sehAttempt); if (sehAttempt != 0) { - std::cout << "[Engine] Build crashed (" - << formatCrashCode(sehAttempt) << ") at max " - << m_options.maxInputHeight << "x" - << m_options.maxInputWidth << std::endl; // CUDA context may be corrupted — no point retrying return false; } if (attemptOk) { - if (attempt > 0) { - std::cout << "[Engine] Built with reduced max " - << m_options.maxInputHeight << "x" - << m_options.maxInputWidth - << " (requested " << origMaxH << "x" << origMaxW - << " exceeded GPU capacity)" << std::endl; - } return true; } } - - if (attempt + 1 < candidates.size()) { - std::cout << "[Engine] Build failed at max " - << m_options.maxInputHeight << "x" - << m_options.maxInputWidth - << ", trying smaller..." << std::endl; - } } // All candidates exhausted — restore original options for error reporting @@ -2601,10 +2522,6 @@ bool Engine::buildWithRetry(std::string onnxModelPath, m_options.minInputHeight = origMinH; m_options.minInputWidth = origMinW; - std::cout << "[Engine] buildWithRetry: all spatial dimension fallbacks " - << "exhausted (tried " << candidates.size() << " candidates from " - << candidates.front() << " down to " << candidates.back() << ")" - << std::endl; return false; } diff --git a/engines/TensorRTAPI/include/engine/EngineMultiGpu.inl b/engines/TensorRTAPI/include/engine/EngineMultiGpu.inl index 99b607d..37915d2 100644 --- a/engines/TensorRTAPI/include/engine/EngineMultiGpu.inl +++ b/engines/TensorRTAPI/include/engine/EngineMultiGpu.inl @@ -223,9 +223,6 @@ bool Engine::loadSlots( : probeEngine->loadNetwork (modelPath, subVals, divVals, normalize); if (!probeOk) { - logEngineEvent("[Engine] loadSlots FAIL: Probe engine failed on GPU[" - + std::to_string(probeGpuIdx) + "] for " + modelPath - + " (freeVRAM before=" + std::to_string(freeBefore / 1048576) + " MiB)", true); return false; } @@ -686,13 +683,6 @@ bool Engine::runInferenceFromPool( } if (!slot) { ANS_DBG("TRT_Pool", "ERROR: No slot available — all %zu slots busy, timeout", m_slots.size()); - std::string errMsg = "[Engine] runInferenceFromPool FAIL: Capacity reached -- all " - + std::to_string(m_activeCount.load()) + "/" + std::to_string(m_totalCapacity) - + " slot(s) busy" - + (m_elasticMode ? " and all GPUs full" : "") - + ". Request rejected (2s timeout)."; - std::cout << errMsg << std::endl; - logEngineEvent(errMsg, true); return false; } diff --git a/engines/TensorRTAPI/include/engine/EnginePoolManager.h b/engines/TensorRTAPI/include/engine/EnginePoolManager.h index 4f8b9b8..c26e1fc 100644 --- a/engines/TensorRTAPI/include/engine/EnginePoolManager.h +++ b/engines/TensorRTAPI/include/engine/EnginePoolManager.h @@ -99,8 +99,6 @@ public: // Note: maxSlotsPerGpu==1 is now the normal "1 slot per GPU" multi-GPU // round-robin mode, so it goes through the pool path below. if (maxSlotsPerGpu == 0) { - logEvent("[EnginePoolManager] BYPASS (maxSlots=0): " + key.modelPath - + " — creating non-shared engine"); auto engine = std::make_shared>(options); bool ok = engine->buildLoadNetwork(modelPath, subVals, divVals, normalize); return ok ? engine : nullptr; @@ -114,8 +112,6 @@ public: it->second.evictTime = TimePoint{}; // cancel pending eviction int refs = it->second.refcount; auto engine = it->second.engine; - logEvent("[EnginePoolManager] HIT: " + key.modelPath - + " refs=" + std::to_string(refs)); // Demand-driven growth: only in elastic mode (maxSlotsPerGpu <= 0 // or > 1). With maxSlotsPerGpu==1 (round-robin default), the pool @@ -134,19 +130,9 @@ public: constexpr size_t kMinVramForGrowth = 6ULL * 1024 * 1024 * 1024; // 6 GB if (totalVram >= kMinVramForGrowth) { lock.unlock(); // release PoolManager lock before growing - std::thread([engine, alive, refs, modelPath = key.modelPath]() { - int created = engine->growPool(1); - if (created > 0) { - logEngineEvent("[EnginePoolManager] DEMAND GROWTH: " + modelPath - + " grew from " + std::to_string(alive) - + " to " + std::to_string(engine->getTotalCapacity()) - + " slots (refs=" + std::to_string(refs) + ")"); - } + std::thread([engine]() { + engine->growPool(1); }).detach(); - } else { - logEvent("[EnginePoolManager] SKIP GROWTH: " + key.modelPath - + " (GPU VRAM " + std::to_string(totalVram >> 20) - + " MiB < 6 GB threshold, refs=" + std::to_string(refs) + ")"); } } } @@ -155,31 +141,12 @@ public: } // Cache miss — create new Engine pool - logEvent("[EnginePoolManager] MISS: Creating pool for " + key.modelPath + "..."); - - // Log VRAM before attempting to create probe - { - size_t freeMem = 0, totalMem = 0; - cudaSetDevice(options.deviceIndex); - cudaMemGetInfo(&freeMem, &totalMem); - logEvent("[EnginePoolManager] GPU[" + std::to_string(options.deviceIndex) - + "] VRAM: " + std::to_string(freeMem >> 20) + " MiB free / " - + std::to_string(totalMem >> 20) + " MiB total (before probe)"); - } - auto engine = std::make_shared>(options); bool ok = engine->buildLoadNetwork(modelPath, subVals, divVals, normalize, maxSlotsPerGpu); if (!ok) { // Step 1: Force-evict all pools with refcount=0 to reclaim VRAM int evicted = forceEvictPending(); if (evicted > 0) { - size_t freeMem2 = 0, totalMem2 = 0; - cudaSetDevice(options.deviceIndex); - cudaMemGetInfo(&freeMem2, &totalMem2); - logEvent("[EnginePoolManager] RETRY EVICT: Force-evicted " + std::to_string(evicted) - + " pending pool(s), now " + std::to_string(freeMem2 >> 20) - + " MiB free. Retrying " + key.modelPath + "..."); - engine = std::make_shared>(options); ok = engine->buildLoadNetwork(modelPath, subVals, divVals, normalize, maxSlotsPerGpu); } @@ -189,13 +156,6 @@ public: // consumes ~300-500 MB vs ~50-100 MB for a simple loadNetwork. // Lightweight mode: tasks queue for a single shared slot — slower but works. if (!ok) { - size_t freeMem3 = 0, totalMem3 = 0; - cudaSetDevice(options.deviceIndex); - cudaMemGetInfo(&freeMem3, &totalMem3); - logEvent("[EnginePoolManager] RETRY LIGHTWEIGHT: Elastic probe failed, " - + std::to_string(freeMem3 >> 20) + " MiB free. " - "Retrying with single-slot mode for " + key.modelPath + "..."); - engine = std::make_shared>(options); ok = engine->buildLoadNetwork(modelPath, subVals, divVals, normalize); } @@ -208,13 +168,6 @@ public: // Evidence: FireSmoke/detector.onnx failed at 3740 MiB free, then // succeeded 4 seconds later at 3154 MiB free (less VRAM!). if (!ok) { - size_t freeMem4 = 0, totalMem4 = 0; - cudaSetDevice(options.deviceIndex); - cudaMemGetInfo(&freeMem4, &totalMem4); - logEvent("[EnginePoolManager] RETRY DELAYED: All attempts failed with " - + std::to_string(freeMem4 >> 20) + " MiB free. " - "Waiting 3s before final retry for " + key.modelPath + "..."); - // Release mutex during sleep so other tasks can proceed // (they may complete pool creation that resolves our issue) lock.unlock(); @@ -226,29 +179,15 @@ public: if (it2 != m_pools.end()) { it2->second.refcount++; it2->second.evictTime = TimePoint{}; - logEvent("[EnginePoolManager] HIT (after delay): " + key.modelPath - + " refs=" + std::to_string(it2->second.refcount)); return it2->second.engine; } // Final retry — try lightweight again after delay - cudaSetDevice(options.deviceIndex); - cudaMemGetInfo(&freeMem4, &totalMem4); - logEvent("[EnginePoolManager] RETRY FINAL: " + std::to_string(freeMem4 >> 20) - + " MiB free. Last attempt for " + key.modelPath + "..."); - engine = std::make_shared>(options); ok = engine->buildLoadNetwork(modelPath, subVals, divVals, normalize); } if (!ok) { - size_t freeMem = 0, totalMem = 0; - cudaMemGetInfo(&freeMem, &totalMem); - logEvent("[EnginePoolManager] FAILED: Could not load engine for " - + key.modelPath + " | GPU[" + std::to_string(options.deviceIndex) - + "] VRAM: " + std::to_string(freeMem >> 20) + " MiB free / " - + std::to_string(totalMem >> 20) + " MiB total" - + " (after 4 attempts: elastic, evict, lightweight, delayed)", true); return nullptr; } } @@ -261,7 +200,6 @@ public: // Start the lazy-eviction sweeper if not already running startSweeperIfNeeded(); - logEvent("[EnginePoolManager] CREATED: " + key.modelPath + " refs=1"); return engine; } @@ -280,14 +218,10 @@ public: if (it->second.refcount <= 0) return; it->second.refcount--; - logEvent("[EnginePoolManager] RELEASE: " + key.modelPath - + " refs=" + std::to_string(it->second.refcount)); if (it->second.refcount <= 0) { // Mark for lazy eviction — don't destroy yet it->second.evictTime = Clock::now() + std::chrono::seconds(kEvictGraceSec); - logEvent("[EnginePoolManager] PENDING EVICT: " + key.modelPath - + " (will evict in " + std::to_string(kEvictGraceSec) + "s if not re-acquired)"); } } @@ -295,7 +229,6 @@ public: void clearAll() { { std::lock_guard lock(m_mutex); - logEvent("[EnginePoolManager] CLEAR ALL (" + std::to_string(m_pools.size()) + " pools)"); m_pools.clear(); } stopSweeper(); @@ -361,17 +294,6 @@ private: using Clock = std::chrono::steady_clock; using TimePoint = std::chrono::time_point; - // Log to stdout/stderr only — no Windows Event Viewer. - // Event Viewer logging is handled by logEngineEvent() in engine.h for - // critical engine-level errors. EnginePoolManager messages are - // informational (HIT/MISS/EVICT) and don't need Event Viewer entries. - static void logEvent(const std::string& msg, bool isError = false) { - if (isError) - std::cerr << msg << std::endl; - else - std::cout << msg << std::endl; - } - struct PoolEntry { std::shared_ptr> engine; int refcount = 0; @@ -408,7 +330,6 @@ private: int evicted = 0; for (auto it = m_pools.begin(); it != m_pools.end(); ) { if (it->second.refcount <= 0) { - logEvent("[EnginePoolManager] FORCE EVICT (VRAM recovery): " + it->first.modelPath); it = m_pools.erase(it); evicted++; } else { @@ -428,7 +349,6 @@ private: && entry.evictTime != TimePoint{} && now >= entry.evictTime) { - logEvent("[EnginePoolManager] EVICT (expired): " + it->first.modelPath); it = m_pools.erase(it); } else { ++it; diff --git a/engines/TensorRTAPI/include/engine/EngineRunInference.inl b/engines/TensorRTAPI/include/engine/EngineRunInference.inl index c1fe6e0..882895c 100644 --- a/engines/TensorRTAPI/include/engine/EngineRunInference.inl +++ b/engines/TensorRTAPI/include/engine/EngineRunInference.inl @@ -486,10 +486,6 @@ bool Engine::runInference(const std::vector>& i // valid here. Guard against the (unlikely) edge case where runInference is // called before loadNetwork succeeds. if (!m_streamInitialized || !m_inferenceStream) { - std::string errMsg = "Error: Inference stream not initialised. " - "Call loadNetwork() / buildLoadNetwork() before runInference()."; - std::cout << errMsg << std::endl; - logEngineEvent("[Engine] runInference: " + errMsg, true); return false; } @@ -902,20 +898,6 @@ bool Engine::runInference(const std::vector>& i } if (!success) { ANS_DBG("TRT_Engine", "ERROR: enqueueV3 FAILED batch=%d", batchSize); - std::string debugInfo = "[Engine] runInference FAIL: enqueue returned false, batch=" - + std::to_string(batchSize) - + ", dimsSpecified=" + (m_context->allInputDimensionsSpecified() ? "YES" : "NO"); - for (size_t i = 0; i < m_IOTensorNames.size(); ++i) { - auto shape = m_context->getTensorShape(m_IOTensorNames[i].c_str()); - debugInfo += ", tensor'" + m_IOTensorNames[i] + "'=["; - for (int j = 0; j < shape.nbDims; ++j) { - if (j > 0) debugInfo += ","; - debugInfo += std::to_string(shape.d[j]); - } - debugInfo += "]"; - } - std::cout << debugInfo << std::endl; - logEngineEvent(debugInfo, true); return false; } @@ -933,11 +915,6 @@ bool Engine::runInference(const std::vector>& i m_inferenceStream); if (copyErr != cudaSuccess) { - std::string errMsg = "[Engine] runInference FAIL: cudaMemcpyAsync output " - + std::to_string(outputIdx) + " batch " + std::to_string(batch) - + ": " + cudaGetErrorString(copyErr); - std::cout << errMsg << std::endl; - logEngineEvent(errMsg, true); return false; } } @@ -953,10 +930,6 @@ bool Engine::runInference(const std::vector>& i if (syncErr != cudaSuccess) { ANS_DBG("TRT_Engine", "ERROR: cudaStreamSync FAILED err=%d (%s)", (int)syncErr, cudaGetErrorString(syncErr)); - std::string errMsg = "[Engine] runInference FAIL: cudaStreamSynchronize: " - + std::string(cudaGetErrorString(syncErr)); - std::cout << errMsg << std::endl; - logEngineEvent(errMsg, true); return false; } }