Remove [Engine] and [EnginePoolManager] debug log messages
Clean up verbose engine telemetry emitted to stdout/stderr and the Windows Event Viewer. Remove logEngineEvent/logEvent calls and direct std::cout "[Engine]" diagnostics (and their diagnostic-only locals) across the TensorRT engine load, build, run, multi-GPU, and pool-manager paths, plus the now-unused logEvent helper in EnginePoolManager. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -267,7 +267,6 @@ bool Engine<T>::buildLoadNetwork(std::string onnxModelPath, const std::array<flo
|
|||||||
|
|
||||||
if (FileExist(engineName)) {
|
if (FileExist(engineName)) {
|
||||||
if (m_verbose) { std::cout << "Engine file found: " << engineName << std::endl; }
|
if (m_verbose) { std::cout << "Engine file found: " << engineName << std::endl; }
|
||||||
logEngineEvent("[Engine] buildLoadNetwork: Loading cached engine: " + engineName);
|
|
||||||
bool loadOk = loadNetwork(engineName, subVals, divVals, normalize);
|
bool loadOk = loadNetwork(engineName, subVals, divVals, normalize);
|
||||||
if (loadOk) {
|
if (loadOk) {
|
||||||
return true;
|
return true;
|
||||||
@@ -280,10 +279,6 @@ bool Engine<T>::buildLoadNetwork(std::string onnxModelPath, const std::array<flo
|
|||||||
if (m_skipOnnxRebuild) {
|
if (m_skipOnnxRebuild) {
|
||||||
// Elastic growth / non-critical path — don't delete and rebuild.
|
// Elastic growth / non-critical path — don't delete and rebuild.
|
||||||
// Just fail gracefully; the pool continues with existing slots.
|
// Just fail gracefully; the pool continues with existing slots.
|
||||||
size_t freeMem = 0, totalMem = 0;
|
|
||||||
cudaMemGetInfo(&freeMem, &totalMem);
|
|
||||||
logEngineEvent("[Engine] buildLoadNetwork: Load failed (skip rebuild, "
|
|
||||||
+ std::to_string(freeMem >> 20) + " MiB free): " + engineName, true);
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
// Check if the failure was due to VRAM exhaustion vs. corrupt file.
|
// Check if the failure was due to VRAM exhaustion vs. corrupt file.
|
||||||
@@ -301,17 +296,11 @@ bool Engine<T>::buildLoadNetwork(std::string onnxModelPath, const std::array<flo
|
|||||||
cudaMemGetInfo(&freeCheck, &totalCheck);
|
cudaMemGetInfo(&freeCheck, &totalCheck);
|
||||||
constexpr size_t kMinFreeBytes = 256ULL * 1024 * 1024;
|
constexpr size_t kMinFreeBytes = 256ULL * 1024 * 1024;
|
||||||
if (m_lastLoadFailedVRAM || freeCheck < kMinFreeBytes) {
|
if (m_lastLoadFailedVRAM || freeCheck < kMinFreeBytes) {
|
||||||
logEngineEvent("[Engine] buildLoadNetwork: Load failed due to LOW VRAM ("
|
|
||||||
+ std::to_string(freeCheck / (1024 * 1024)) + " MiB free / "
|
|
||||||
+ std::to_string(totalCheck / (1024 * 1024)) + " MiB total"
|
|
||||||
+ ", vramFlag=" + std::to_string(m_lastLoadFailedVRAM)
|
|
||||||
+ "). Preserving engine file (not corrupt): " + engineName, true);
|
|
||||||
return false; // Don't delete the file, don't try ONNX rebuild
|
return false; // Don't delete the file, don't try ONNX rebuild
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// Enough VRAM AND loadNetwork didn't flag VRAM as cause → file is
|
// Enough VRAM AND loadNetwork didn't flag VRAM as cause → file is
|
||||||
// likely corrupt/incompatible. Delete and rebuild from ONNX.
|
// likely corrupt/incompatible. Delete and rebuild from ONNX.
|
||||||
logEngineEvent("[Engine] buildLoadNetwork: Cached engine INVALID, deleting and rebuilding: " + engineName, true);
|
|
||||||
try { std::filesystem::remove(engineName); } catch (...) {}
|
try { std::filesystem::remove(engineName); } catch (...) {}
|
||||||
// Fall through to ONNX build path below
|
// Fall through to ONNX build path below
|
||||||
}
|
}
|
||||||
@@ -321,14 +310,11 @@ bool Engine<T>::buildLoadNetwork(std::string onnxModelPath, const std::array<flo
|
|||||||
// Demand-driven growth: if no cached engine exists, bail out rather
|
// Demand-driven growth: if no cached engine exists, bail out rather
|
||||||
// than triggering a full ONNX→TRT build (30-60s, massive VRAM).
|
// than triggering a full ONNX→TRT build (30-60s, massive VRAM).
|
||||||
if (m_skipOnnxBuild) {
|
if (m_skipOnnxBuild) {
|
||||||
logEngineEvent("[Engine] buildLoadNetwork: Engine file NOT found, skipping ONNX build (demand growth): " + engineName);
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
logEngineEvent("[Engine] buildLoadNetwork: Engine file NOT found, will build from ONNX: " + engineName);
|
|
||||||
}
|
}
|
||||||
if (!FileExist(onnxModelPath)) {
|
if (!FileExist(onnxModelPath)) {
|
||||||
// ONNX model does not exist, try to find alternative precision engine
|
// ONNX model does not exist, try to find alternative precision engine
|
||||||
logEngineEvent("[Engine] buildLoadNetwork: ONNX model also not found: " + onnxModelPath, true);
|
|
||||||
std::cout << "Searching for alternative precision engine..." << std::endl;
|
std::cout << "Searching for alternative precision engine..." << std::endl;
|
||||||
|
|
||||||
size_t lastDot = engineName.find_last_of('.');
|
size_t lastDot = engineName.find_last_of('.');
|
||||||
@@ -411,9 +397,7 @@ bool Engine<T>::buildLoadNetwork(std::string onnxModelPath, const std::array<flo
|
|||||||
bool preParsed = parseOnnxModelSafe(tempParser.get(),
|
bool preParsed = parseOnnxModelSafe(tempParser.get(),
|
||||||
onnxBuffer.data(), onnxBuffer.size(), &sehPreAnalysis);
|
onnxBuffer.data(), onnxBuffer.size(), &sehPreAnalysis);
|
||||||
if (sehPreAnalysis != 0) {
|
if (sehPreAnalysis != 0) {
|
||||||
std::cout << "[Engine] WARNING: ONNX pre-analysis parse crashed ("
|
// Skipping pre-analysis, proceeding with build...
|
||||||
<< formatCrashCode(sehPreAnalysis)
|
|
||||||
<< "). Skipping pre-analysis, proceeding with build..." << std::endl;
|
|
||||||
}
|
}
|
||||||
else if (preParsed) {
|
else if (preParsed) {
|
||||||
auto numInputs = tempNetwork->getNbInputs();
|
auto numInputs = tempNetwork->getNbInputs();
|
||||||
@@ -718,7 +702,6 @@ bool Engine<T>::loadNetwork(std::string trtModelPath, const std::array<float, 3>
|
|||||||
// ============================================================================
|
// ============================================================================
|
||||||
|
|
||||||
if (!Util::doesFileExist(trtModelPath)) {
|
if (!Util::doesFileExist(trtModelPath)) {
|
||||||
logEngineEvent("[Engine] loadNetwork FAIL: Engine file not found: " + trtModelPath, true);
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -727,13 +710,11 @@ bool Engine<T>::loadNetwork(std::string trtModelPath, const std::array<float, 3>
|
|||||||
{
|
{
|
||||||
std::ifstream file(trtModelPath, std::ios::binary | std::ios::ate);
|
std::ifstream file(trtModelPath, std::ios::binary | std::ios::ate);
|
||||||
if (!file.is_open()) {
|
if (!file.is_open()) {
|
||||||
logEngineEvent("[Engine] loadNetwork FAIL: Cannot open engine file: " + trtModelPath, true);
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
std::streamsize size = file.tellg();
|
std::streamsize size = file.tellg();
|
||||||
if (size <= 0) {
|
if (size <= 0) {
|
||||||
logEngineEvent("[Engine] loadNetwork FAIL: Engine file is empty (0 bytes): " + trtModelPath, true);
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -741,7 +722,6 @@ bool Engine<T>::loadNetwork(std::string trtModelPath, const std::array<float, 3>
|
|||||||
|
|
||||||
std::vector<char> buffer(size);
|
std::vector<char> buffer(size);
|
||||||
if (!file.read(buffer.data(), size)) {
|
if (!file.read(buffer.data(), size)) {
|
||||||
logEngineEvent("[Engine] loadNetwork FAIL: Read error on engine file: " + trtModelPath, true);
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -761,7 +741,6 @@ bool Engine<T>::loadNetwork(std::string trtModelPath, const std::array<float, 3>
|
|||||||
|
|
||||||
m_runtime = std::shared_ptr<nvinfer1::IRuntime>{ nvinfer1::createInferRuntime(m_logger) };
|
m_runtime = std::shared_ptr<nvinfer1::IRuntime>{ nvinfer1::createInferRuntime(m_logger) };
|
||||||
if (!m_runtime) {
|
if (!m_runtime) {
|
||||||
logEngineEvent("[Engine] loadNetwork FAIL: createInferRuntime returned null for " + trtModelPath, true);
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -830,17 +809,8 @@ bool Engine<T>::loadNetwork(std::string trtModelPath, const std::array<float, 3>
|
|||||||
constexpr size_t kMinFreeBytes = 256ULL * 1024 * 1024; // 256 MiB minimum
|
constexpr size_t kMinFreeBytes = 256ULL * 1024 * 1024; // 256 MiB minimum
|
||||||
if (memErr != cudaSuccess) {
|
if (memErr != cudaSuccess) {
|
||||||
// cudaMemGetInfo failed — CUDA context may not be initialized on this thread.
|
// cudaMemGetInfo failed — CUDA context may not be initialized on this thread.
|
||||||
// Log but don't reject: let TRT try to deserialize (it may succeed).
|
// Don't reject: let TRT try to deserialize (it may succeed).
|
||||||
logEngineEvent("[Engine] loadNetwork WARNING: cudaMemGetInfo failed ("
|
|
||||||
+ std::string(cudaGetErrorString(memErr)) + ") on GPU["
|
|
||||||
+ std::to_string(m_options.deviceIndex) + "] — skipping VRAM check for "
|
|
||||||
+ trtModelPath, true);
|
|
||||||
} else if (freeVRAM < kMinFreeBytes) {
|
} else if (freeVRAM < kMinFreeBytes) {
|
||||||
logEngineEvent("[Engine] loadNetwork FAIL: GPU[" + std::to_string(m_options.deviceIndex)
|
|
||||||
+ "] only " + std::to_string(freeVRAM / (1024 * 1024))
|
|
||||||
+ " MiB free / " + std::to_string(totalVRAM / (1024 * 1024))
|
|
||||||
+ " MiB total (need " + std::to_string(kMinFreeBytes / (1024 * 1024))
|
|
||||||
+ " MiB) for " + trtModelPath, true);
|
|
||||||
m_lastLoadFailedVRAM = true; // signal to buildLoadNetwork: engine file is NOT corrupt
|
m_lastLoadFailedVRAM = true; // signal to buildLoadNetwork: engine file is NOT corrupt
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
@@ -861,13 +831,9 @@ bool Engine<T>::loadNetwork(std::string trtModelPath, const std::array<float, 3>
|
|||||||
deserializeCudaEngineSafe(m_runtime.get(), buffer.data(),
|
deserializeCudaEngineSafe(m_runtime.get(), buffer.data(),
|
||||||
buffer.size(), &sehCodeDeserialize));
|
buffer.size(), &sehCodeDeserialize));
|
||||||
if (sehCodeDeserialize != 0) {
|
if (sehCodeDeserialize != 0) {
|
||||||
logEngineEvent("[Engine] loadNetwork FAIL: deserializeCudaEngine CRASHED (SEH "
|
|
||||||
+ formatCrashCode(sehCodeDeserialize) + ") for " + trtModelPath, true);
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
if (!m_engine) {
|
if (!m_engine) {
|
||||||
logEngineEvent("[Engine] loadNetwork FAIL: deserializeCudaEngine returned null for "
|
|
||||||
+ trtModelPath + " (file size=" + std::to_string(size / (1024*1024)) + " MiB)", true);
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1018,8 +984,6 @@ trt_cache_create_context:
|
|||||||
m_context = std::unique_ptr<nvinfer1::IExecutionContext>(m_engine->createExecutionContext());
|
m_context = std::unique_ptr<nvinfer1::IExecutionContext>(m_engine->createExecutionContext());
|
||||||
if (!m_context) {
|
if (!m_context) {
|
||||||
ANS_DBG("TRT_Load", "ERROR: createExecutionContext returned null");
|
ANS_DBG("TRT_Load", "ERROR: createExecutionContext returned null");
|
||||||
logEngineEvent("[Engine] loadNetwork FAIL: createExecutionContext returned null for "
|
|
||||||
+ trtModelPath, true);
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1106,9 +1070,6 @@ trt_cache_create_context:
|
|||||||
// Allocate GPU memory
|
// Allocate GPU memory
|
||||||
cudaError_t err = cudaMalloc(&m_buffers[i], requestedMemory);
|
cudaError_t err = cudaMalloc(&m_buffers[i], requestedMemory);
|
||||||
if (err != cudaSuccess) {
|
if (err != cudaSuccess) {
|
||||||
logEngineEvent("[Engine] loadNetwork FAIL: cudaMalloc input buffer ("
|
|
||||||
+ std::to_string(requestedMemory / (1024*1024)) + " MiB): "
|
|
||||||
+ cudaGetErrorString(err) + " for " + trtModelPath, true);
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1179,9 +1140,6 @@ trt_cache_create_context:
|
|||||||
// Allocate GPU memory
|
// Allocate GPU memory
|
||||||
cudaError_t err = cudaMalloc(&m_buffers[i], requestedMemory);
|
cudaError_t err = cudaMalloc(&m_buffers[i], requestedMemory);
|
||||||
if (err != cudaSuccess) {
|
if (err != cudaSuccess) {
|
||||||
logEngineEvent("[Engine] loadNetwork FAIL: cudaMalloc output buffer ("
|
|
||||||
+ std::to_string(requestedMemory / (1024*1024)) + " MiB): "
|
|
||||||
+ cudaGetErrorString(err) + " for " + trtModelPath, true);
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1534,9 +1492,6 @@ bool Engine<T>::build(std::string onnxModelPath, const std::array<float, 3>& sub
|
|||||||
auto parsed = parseOnnxModelSafe(parser.get(), buffer.data(),
|
auto parsed = parseOnnxModelSafe(parser.get(), buffer.data(),
|
||||||
buffer.size(), &sehCodeParse);
|
buffer.size(), &sehCodeParse);
|
||||||
if (sehCodeParse != 0) {
|
if (sehCodeParse != 0) {
|
||||||
std::cout << "[Engine] FATAL: ONNX parser crashed ("
|
|
||||||
<< formatCrashCode(sehCodeParse) << ")" << std::endl;
|
|
||||||
std::cout << "[Engine] This may indicate a corrupt ONNX file or driver issue." << std::endl;
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
if (!parsed) {
|
if (!parsed) {
|
||||||
@@ -2317,12 +2272,6 @@ bool Engine<T>::build(std::string onnxModelPath, const std::array<float, 3>& sub
|
|||||||
auto endTime = std::chrono::high_resolution_clock::now();
|
auto endTime = std::chrono::high_resolution_clock::now();
|
||||||
|
|
||||||
if (sehCodeBuild != 0) {
|
if (sehCodeBuild != 0) {
|
||||||
std::cout << "\n========================================" << std::endl;
|
|
||||||
std::cout << "Build CRASHED!" << std::endl;
|
|
||||||
std::cout << "========================================" << std::endl;
|
|
||||||
std::cout << "[Engine] FATAL: buildSerializedNetwork crashed ("
|
|
||||||
<< formatCrashCode(sehCodeBuild) << ")" << std::endl;
|
|
||||||
std::cout << "[Engine] This typically indicates insufficient GPU memory or a driver crash." << std::endl;
|
|
||||||
Util::checkCudaErrorCode(cudaStreamDestroy(profileStream));
|
Util::checkCudaErrorCode(cudaStreamDestroy(profileStream));
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
@@ -2478,9 +2427,6 @@ bool Engine<T>::buildWithRetry(std::string onnxModelPath,
|
|||||||
bool retryParsed = parseOnnxModelSafe(tempParser.get(),
|
bool retryParsed = parseOnnxModelSafe(tempParser.get(),
|
||||||
onnxBuffer.data(), onnxBuffer.size(), &sehRetryParse);
|
onnxBuffer.data(), onnxBuffer.size(), &sehRetryParse);
|
||||||
if (sehRetryParse != 0) {
|
if (sehRetryParse != 0) {
|
||||||
std::cout << "[Engine] WARNING: ONNX pre-analysis parse crashed in "
|
|
||||||
<< "buildWithRetry (" << formatCrashCode(sehRetryParse)
|
|
||||||
<< "). Skipping spatial analysis." << std::endl;
|
|
||||||
// hasDynamicSpatial stays false → single build() attempt
|
// hasDynamicSpatial stays false → single build() attempt
|
||||||
}
|
}
|
||||||
else if (retryParsed && tempNetwork->getNbInputs() > 0) {
|
else if (retryParsed && tempNetwork->getNbInputs() > 0) {
|
||||||
@@ -2501,8 +2447,6 @@ bool Engine<T>::buildWithRetry(std::string onnxModelPath,
|
|||||||
unsigned long sehBuild = 0;
|
unsigned long sehBuild = 0;
|
||||||
bool ok = buildSafe(onnxModelPath, subVals, divVals, normalize, &sehBuild);
|
bool ok = buildSafe(onnxModelPath, subVals, divVals, normalize, &sehBuild);
|
||||||
if (sehBuild != 0) {
|
if (sehBuild != 0) {
|
||||||
std::cout << "[Engine] FATAL: build() crashed in buildWithRetry ("
|
|
||||||
<< formatCrashCode(sehBuild) << ")" << std::endl;
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
return ok;
|
return ok;
|
||||||
@@ -2557,40 +2501,17 @@ bool Engine<T>::buildWithRetry(std::string onnxModelPath,
|
|||||||
for (size_t attempt = 0; attempt < candidates.size(); ++attempt) {
|
for (size_t attempt = 0; attempt < candidates.size(); ++attempt) {
|
||||||
setCandidateOptions(candidates[attempt]);
|
setCandidateOptions(candidates[attempt]);
|
||||||
|
|
||||||
std::cout << "[Engine] buildWithRetry attempt " << (attempt + 1)
|
|
||||||
<< "/" << candidates.size() << " (max "
|
|
||||||
<< m_options.maxInputHeight << "x"
|
|
||||||
<< m_options.maxInputWidth << ")" << std::endl;
|
|
||||||
|
|
||||||
{
|
{
|
||||||
unsigned long sehAttempt = 0;
|
unsigned long sehAttempt = 0;
|
||||||
bool attemptOk = buildSafe(onnxModelPath, subVals, divVals, normalize, &sehAttempt);
|
bool attemptOk = buildSafe(onnxModelPath, subVals, divVals, normalize, &sehAttempt);
|
||||||
if (sehAttempt != 0) {
|
if (sehAttempt != 0) {
|
||||||
std::cout << "[Engine] Build crashed ("
|
|
||||||
<< formatCrashCode(sehAttempt) << ") at max "
|
|
||||||
<< m_options.maxInputHeight << "x"
|
|
||||||
<< m_options.maxInputWidth << std::endl;
|
|
||||||
// CUDA context may be corrupted — no point retrying
|
// CUDA context may be corrupted — no point retrying
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
if (attemptOk) {
|
if (attemptOk) {
|
||||||
if (attempt > 0) {
|
|
||||||
std::cout << "[Engine] Built with reduced max "
|
|
||||||
<< m_options.maxInputHeight << "x"
|
|
||||||
<< m_options.maxInputWidth
|
|
||||||
<< " (requested " << origMaxH << "x" << origMaxW
|
|
||||||
<< " exceeded GPU capacity)" << std::endl;
|
|
||||||
}
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (attempt + 1 < candidates.size()) {
|
|
||||||
std::cout << "[Engine] Build failed at max "
|
|
||||||
<< m_options.maxInputHeight << "x"
|
|
||||||
<< m_options.maxInputWidth
|
|
||||||
<< ", trying smaller..." << std::endl;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// All candidates exhausted — restore original options for error reporting
|
// All candidates exhausted — restore original options for error reporting
|
||||||
@@ -2601,10 +2522,6 @@ bool Engine<T>::buildWithRetry(std::string onnxModelPath,
|
|||||||
m_options.minInputHeight = origMinH;
|
m_options.minInputHeight = origMinH;
|
||||||
m_options.minInputWidth = origMinW;
|
m_options.minInputWidth = origMinW;
|
||||||
|
|
||||||
std::cout << "[Engine] buildWithRetry: all spatial dimension fallbacks "
|
|
||||||
<< "exhausted (tried " << candidates.size() << " candidates from "
|
|
||||||
<< candidates.front() << " down to " << candidates.back() << ")"
|
|
||||||
<< std::endl;
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -223,9 +223,6 @@ bool Engine<T>::loadSlots(
|
|||||||
: probeEngine->loadNetwork (modelPath, subVals, divVals, normalize);
|
: probeEngine->loadNetwork (modelPath, subVals, divVals, normalize);
|
||||||
|
|
||||||
if (!probeOk) {
|
if (!probeOk) {
|
||||||
logEngineEvent("[Engine] loadSlots FAIL: Probe engine failed on GPU["
|
|
||||||
+ std::to_string(probeGpuIdx) + "] for " + modelPath
|
|
||||||
+ " (freeVRAM before=" + std::to_string(freeBefore / 1048576) + " MiB)", true);
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -686,13 +683,6 @@ bool Engine<T>::runInferenceFromPool(
|
|||||||
}
|
}
|
||||||
if (!slot) {
|
if (!slot) {
|
||||||
ANS_DBG("TRT_Pool", "ERROR: No slot available — all %zu slots busy, timeout", m_slots.size());
|
ANS_DBG("TRT_Pool", "ERROR: No slot available — all %zu slots busy, timeout", m_slots.size());
|
||||||
std::string errMsg = "[Engine] runInferenceFromPool FAIL: Capacity reached -- all "
|
|
||||||
+ std::to_string(m_activeCount.load()) + "/" + std::to_string(m_totalCapacity)
|
|
||||||
+ " slot(s) busy"
|
|
||||||
+ (m_elasticMode ? " and all GPUs full" : "")
|
|
||||||
+ ". Request rejected (2s timeout).";
|
|
||||||
std::cout << errMsg << std::endl;
|
|
||||||
logEngineEvent(errMsg, true);
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -99,8 +99,6 @@ public:
|
|||||||
// Note: maxSlotsPerGpu==1 is now the normal "1 slot per GPU" multi-GPU
|
// Note: maxSlotsPerGpu==1 is now the normal "1 slot per GPU" multi-GPU
|
||||||
// round-robin mode, so it goes through the pool path below.
|
// round-robin mode, so it goes through the pool path below.
|
||||||
if (maxSlotsPerGpu == 0) {
|
if (maxSlotsPerGpu == 0) {
|
||||||
logEvent("[EnginePoolManager] BYPASS (maxSlots=0): " + key.modelPath
|
|
||||||
+ " — creating non-shared engine");
|
|
||||||
auto engine = std::make_shared<Engine<T>>(options);
|
auto engine = std::make_shared<Engine<T>>(options);
|
||||||
bool ok = engine->buildLoadNetwork(modelPath, subVals, divVals, normalize);
|
bool ok = engine->buildLoadNetwork(modelPath, subVals, divVals, normalize);
|
||||||
return ok ? engine : nullptr;
|
return ok ? engine : nullptr;
|
||||||
@@ -114,8 +112,6 @@ public:
|
|||||||
it->second.evictTime = TimePoint{}; // cancel pending eviction
|
it->second.evictTime = TimePoint{}; // cancel pending eviction
|
||||||
int refs = it->second.refcount;
|
int refs = it->second.refcount;
|
||||||
auto engine = it->second.engine;
|
auto engine = it->second.engine;
|
||||||
logEvent("[EnginePoolManager] HIT: " + key.modelPath
|
|
||||||
+ " refs=" + std::to_string(refs));
|
|
||||||
|
|
||||||
// Demand-driven growth: only in elastic mode (maxSlotsPerGpu <= 0
|
// Demand-driven growth: only in elastic mode (maxSlotsPerGpu <= 0
|
||||||
// or > 1). With maxSlotsPerGpu==1 (round-robin default), the pool
|
// or > 1). With maxSlotsPerGpu==1 (round-robin default), the pool
|
||||||
@@ -134,19 +130,9 @@ public:
|
|||||||
constexpr size_t kMinVramForGrowth = 6ULL * 1024 * 1024 * 1024; // 6 GB
|
constexpr size_t kMinVramForGrowth = 6ULL * 1024 * 1024 * 1024; // 6 GB
|
||||||
if (totalVram >= kMinVramForGrowth) {
|
if (totalVram >= kMinVramForGrowth) {
|
||||||
lock.unlock(); // release PoolManager lock before growing
|
lock.unlock(); // release PoolManager lock before growing
|
||||||
std::thread([engine, alive, refs, modelPath = key.modelPath]() {
|
std::thread([engine]() {
|
||||||
int created = engine->growPool(1);
|
engine->growPool(1);
|
||||||
if (created > 0) {
|
|
||||||
logEngineEvent("[EnginePoolManager] DEMAND GROWTH: " + modelPath
|
|
||||||
+ " grew from " + std::to_string(alive)
|
|
||||||
+ " to " + std::to_string(engine->getTotalCapacity())
|
|
||||||
+ " slots (refs=" + std::to_string(refs) + ")");
|
|
||||||
}
|
|
||||||
}).detach();
|
}).detach();
|
||||||
} else {
|
|
||||||
logEvent("[EnginePoolManager] SKIP GROWTH: " + key.modelPath
|
|
||||||
+ " (GPU VRAM " + std::to_string(totalVram >> 20)
|
|
||||||
+ " MiB < 6 GB threshold, refs=" + std::to_string(refs) + ")");
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -155,31 +141,12 @@ public:
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Cache miss — create new Engine pool
|
// Cache miss — create new Engine pool
|
||||||
logEvent("[EnginePoolManager] MISS: Creating pool for " + key.modelPath + "...");
|
|
||||||
|
|
||||||
// Log VRAM before attempting to create probe
|
|
||||||
{
|
|
||||||
size_t freeMem = 0, totalMem = 0;
|
|
||||||
cudaSetDevice(options.deviceIndex);
|
|
||||||
cudaMemGetInfo(&freeMem, &totalMem);
|
|
||||||
logEvent("[EnginePoolManager] GPU[" + std::to_string(options.deviceIndex)
|
|
||||||
+ "] VRAM: " + std::to_string(freeMem >> 20) + " MiB free / "
|
|
||||||
+ std::to_string(totalMem >> 20) + " MiB total (before probe)");
|
|
||||||
}
|
|
||||||
|
|
||||||
auto engine = std::make_shared<Engine<T>>(options);
|
auto engine = std::make_shared<Engine<T>>(options);
|
||||||
bool ok = engine->buildLoadNetwork(modelPath, subVals, divVals, normalize, maxSlotsPerGpu);
|
bool ok = engine->buildLoadNetwork(modelPath, subVals, divVals, normalize, maxSlotsPerGpu);
|
||||||
if (!ok) {
|
if (!ok) {
|
||||||
// Step 1: Force-evict all pools with refcount=0 to reclaim VRAM
|
// Step 1: Force-evict all pools with refcount=0 to reclaim VRAM
|
||||||
int evicted = forceEvictPending();
|
int evicted = forceEvictPending();
|
||||||
if (evicted > 0) {
|
if (evicted > 0) {
|
||||||
size_t freeMem2 = 0, totalMem2 = 0;
|
|
||||||
cudaSetDevice(options.deviceIndex);
|
|
||||||
cudaMemGetInfo(&freeMem2, &totalMem2);
|
|
||||||
logEvent("[EnginePoolManager] RETRY EVICT: Force-evicted " + std::to_string(evicted)
|
|
||||||
+ " pending pool(s), now " + std::to_string(freeMem2 >> 20)
|
|
||||||
+ " MiB free. Retrying " + key.modelPath + "...");
|
|
||||||
|
|
||||||
engine = std::make_shared<Engine<T>>(options);
|
engine = std::make_shared<Engine<T>>(options);
|
||||||
ok = engine->buildLoadNetwork(modelPath, subVals, divVals, normalize, maxSlotsPerGpu);
|
ok = engine->buildLoadNetwork(modelPath, subVals, divVals, normalize, maxSlotsPerGpu);
|
||||||
}
|
}
|
||||||
@@ -189,13 +156,6 @@ public:
|
|||||||
// consumes ~300-500 MB vs ~50-100 MB for a simple loadNetwork.
|
// consumes ~300-500 MB vs ~50-100 MB for a simple loadNetwork.
|
||||||
// Lightweight mode: tasks queue for a single shared slot — slower but works.
|
// Lightweight mode: tasks queue for a single shared slot — slower but works.
|
||||||
if (!ok) {
|
if (!ok) {
|
||||||
size_t freeMem3 = 0, totalMem3 = 0;
|
|
||||||
cudaSetDevice(options.deviceIndex);
|
|
||||||
cudaMemGetInfo(&freeMem3, &totalMem3);
|
|
||||||
logEvent("[EnginePoolManager] RETRY LIGHTWEIGHT: Elastic probe failed, "
|
|
||||||
+ std::to_string(freeMem3 >> 20) + " MiB free. "
|
|
||||||
"Retrying with single-slot mode for " + key.modelPath + "...");
|
|
||||||
|
|
||||||
engine = std::make_shared<Engine<T>>(options);
|
engine = std::make_shared<Engine<T>>(options);
|
||||||
ok = engine->buildLoadNetwork(modelPath, subVals, divVals, normalize);
|
ok = engine->buildLoadNetwork(modelPath, subVals, divVals, normalize);
|
||||||
}
|
}
|
||||||
@@ -208,13 +168,6 @@ public:
|
|||||||
// Evidence: FireSmoke/detector.onnx failed at 3740 MiB free, then
|
// Evidence: FireSmoke/detector.onnx failed at 3740 MiB free, then
|
||||||
// succeeded 4 seconds later at 3154 MiB free (less VRAM!).
|
// succeeded 4 seconds later at 3154 MiB free (less VRAM!).
|
||||||
if (!ok) {
|
if (!ok) {
|
||||||
size_t freeMem4 = 0, totalMem4 = 0;
|
|
||||||
cudaSetDevice(options.deviceIndex);
|
|
||||||
cudaMemGetInfo(&freeMem4, &totalMem4);
|
|
||||||
logEvent("[EnginePoolManager] RETRY DELAYED: All attempts failed with "
|
|
||||||
+ std::to_string(freeMem4 >> 20) + " MiB free. "
|
|
||||||
"Waiting 3s before final retry for " + key.modelPath + "...");
|
|
||||||
|
|
||||||
// Release mutex during sleep so other tasks can proceed
|
// Release mutex during sleep so other tasks can proceed
|
||||||
// (they may complete pool creation that resolves our issue)
|
// (they may complete pool creation that resolves our issue)
|
||||||
lock.unlock();
|
lock.unlock();
|
||||||
@@ -226,29 +179,15 @@ public:
|
|||||||
if (it2 != m_pools.end()) {
|
if (it2 != m_pools.end()) {
|
||||||
it2->second.refcount++;
|
it2->second.refcount++;
|
||||||
it2->second.evictTime = TimePoint{};
|
it2->second.evictTime = TimePoint{};
|
||||||
logEvent("[EnginePoolManager] HIT (after delay): " + key.modelPath
|
|
||||||
+ " refs=" + std::to_string(it2->second.refcount));
|
|
||||||
return it2->second.engine;
|
return it2->second.engine;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Final retry — try lightweight again after delay
|
// Final retry — try lightweight again after delay
|
||||||
cudaSetDevice(options.deviceIndex);
|
|
||||||
cudaMemGetInfo(&freeMem4, &totalMem4);
|
|
||||||
logEvent("[EnginePoolManager] RETRY FINAL: " + std::to_string(freeMem4 >> 20)
|
|
||||||
+ " MiB free. Last attempt for " + key.modelPath + "...");
|
|
||||||
|
|
||||||
engine = std::make_shared<Engine<T>>(options);
|
engine = std::make_shared<Engine<T>>(options);
|
||||||
ok = engine->buildLoadNetwork(modelPath, subVals, divVals, normalize);
|
ok = engine->buildLoadNetwork(modelPath, subVals, divVals, normalize);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!ok) {
|
if (!ok) {
|
||||||
size_t freeMem = 0, totalMem = 0;
|
|
||||||
cudaMemGetInfo(&freeMem, &totalMem);
|
|
||||||
logEvent("[EnginePoolManager] FAILED: Could not load engine for "
|
|
||||||
+ key.modelPath + " | GPU[" + std::to_string(options.deviceIndex)
|
|
||||||
+ "] VRAM: " + std::to_string(freeMem >> 20) + " MiB free / "
|
|
||||||
+ std::to_string(totalMem >> 20) + " MiB total"
|
|
||||||
+ " (after 4 attempts: elastic, evict, lightweight, delayed)", true);
|
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -261,7 +200,6 @@ public:
|
|||||||
// Start the lazy-eviction sweeper if not already running
|
// Start the lazy-eviction sweeper if not already running
|
||||||
startSweeperIfNeeded();
|
startSweeperIfNeeded();
|
||||||
|
|
||||||
logEvent("[EnginePoolManager] CREATED: " + key.modelPath + " refs=1");
|
|
||||||
return engine;
|
return engine;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -280,14 +218,10 @@ public:
|
|||||||
if (it->second.refcount <= 0) return;
|
if (it->second.refcount <= 0) return;
|
||||||
|
|
||||||
it->second.refcount--;
|
it->second.refcount--;
|
||||||
logEvent("[EnginePoolManager] RELEASE: " + key.modelPath
|
|
||||||
+ " refs=" + std::to_string(it->second.refcount));
|
|
||||||
|
|
||||||
if (it->second.refcount <= 0) {
|
if (it->second.refcount <= 0) {
|
||||||
// Mark for lazy eviction — don't destroy yet
|
// Mark for lazy eviction — don't destroy yet
|
||||||
it->second.evictTime = Clock::now() + std::chrono::seconds(kEvictGraceSec);
|
it->second.evictTime = Clock::now() + std::chrono::seconds(kEvictGraceSec);
|
||||||
logEvent("[EnginePoolManager] PENDING EVICT: " + key.modelPath
|
|
||||||
+ " (will evict in " + std::to_string(kEvictGraceSec) + "s if not re-acquired)");
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -295,7 +229,6 @@ public:
|
|||||||
void clearAll() {
|
void clearAll() {
|
||||||
{
|
{
|
||||||
std::lock_guard<std::mutex> lock(m_mutex);
|
std::lock_guard<std::mutex> lock(m_mutex);
|
||||||
logEvent("[EnginePoolManager] CLEAR ALL (" + std::to_string(m_pools.size()) + " pools)");
|
|
||||||
m_pools.clear();
|
m_pools.clear();
|
||||||
}
|
}
|
||||||
stopSweeper();
|
stopSweeper();
|
||||||
@@ -361,17 +294,6 @@ private:
|
|||||||
using Clock = std::chrono::steady_clock;
|
using Clock = std::chrono::steady_clock;
|
||||||
using TimePoint = std::chrono::time_point<Clock>;
|
using TimePoint = std::chrono::time_point<Clock>;
|
||||||
|
|
||||||
// Log to stdout/stderr only — no Windows Event Viewer.
|
|
||||||
// Event Viewer logging is handled by logEngineEvent() in engine.h for
|
|
||||||
// critical engine-level errors. EnginePoolManager messages are
|
|
||||||
// informational (HIT/MISS/EVICT) and don't need Event Viewer entries.
|
|
||||||
static void logEvent(const std::string& msg, bool isError = false) {
|
|
||||||
if (isError)
|
|
||||||
std::cerr << msg << std::endl;
|
|
||||||
else
|
|
||||||
std::cout << msg << std::endl;
|
|
||||||
}
|
|
||||||
|
|
||||||
struct PoolEntry {
|
struct PoolEntry {
|
||||||
std::shared_ptr<Engine<T>> engine;
|
std::shared_ptr<Engine<T>> engine;
|
||||||
int refcount = 0;
|
int refcount = 0;
|
||||||
@@ -408,7 +330,6 @@ private:
|
|||||||
int evicted = 0;
|
int evicted = 0;
|
||||||
for (auto it = m_pools.begin(); it != m_pools.end(); ) {
|
for (auto it = m_pools.begin(); it != m_pools.end(); ) {
|
||||||
if (it->second.refcount <= 0) {
|
if (it->second.refcount <= 0) {
|
||||||
logEvent("[EnginePoolManager] FORCE EVICT (VRAM recovery): " + it->first.modelPath);
|
|
||||||
it = m_pools.erase(it);
|
it = m_pools.erase(it);
|
||||||
evicted++;
|
evicted++;
|
||||||
} else {
|
} else {
|
||||||
@@ -428,7 +349,6 @@ private:
|
|||||||
&& entry.evictTime != TimePoint{}
|
&& entry.evictTime != TimePoint{}
|
||||||
&& now >= entry.evictTime)
|
&& now >= entry.evictTime)
|
||||||
{
|
{
|
||||||
logEvent("[EnginePoolManager] EVICT (expired): " + it->first.modelPath);
|
|
||||||
it = m_pools.erase(it);
|
it = m_pools.erase(it);
|
||||||
} else {
|
} else {
|
||||||
++it;
|
++it;
|
||||||
|
|||||||
@@ -486,10 +486,6 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
|
|||||||
// valid here. Guard against the (unlikely) edge case where runInference is
|
// valid here. Guard against the (unlikely) edge case where runInference is
|
||||||
// called before loadNetwork succeeds.
|
// called before loadNetwork succeeds.
|
||||||
if (!m_streamInitialized || !m_inferenceStream) {
|
if (!m_streamInitialized || !m_inferenceStream) {
|
||||||
std::string errMsg = "Error: Inference stream not initialised. "
|
|
||||||
"Call loadNetwork() / buildLoadNetwork() before runInference().";
|
|
||||||
std::cout << errMsg << std::endl;
|
|
||||||
logEngineEvent("[Engine] runInference: " + errMsg, true);
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -902,20 +898,6 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
|
|||||||
}
|
}
|
||||||
if (!success) {
|
if (!success) {
|
||||||
ANS_DBG("TRT_Engine", "ERROR: enqueueV3 FAILED batch=%d", batchSize);
|
ANS_DBG("TRT_Engine", "ERROR: enqueueV3 FAILED batch=%d", batchSize);
|
||||||
std::string debugInfo = "[Engine] runInference FAIL: enqueue returned false, batch="
|
|
||||||
+ std::to_string(batchSize)
|
|
||||||
+ ", dimsSpecified=" + (m_context->allInputDimensionsSpecified() ? "YES" : "NO");
|
|
||||||
for (size_t i = 0; i < m_IOTensorNames.size(); ++i) {
|
|
||||||
auto shape = m_context->getTensorShape(m_IOTensorNames[i].c_str());
|
|
||||||
debugInfo += ", tensor'" + m_IOTensorNames[i] + "'=[";
|
|
||||||
for (int j = 0; j < shape.nbDims; ++j) {
|
|
||||||
if (j > 0) debugInfo += ",";
|
|
||||||
debugInfo += std::to_string(shape.d[j]);
|
|
||||||
}
|
|
||||||
debugInfo += "]";
|
|
||||||
}
|
|
||||||
std::cout << debugInfo << std::endl;
|
|
||||||
logEngineEvent(debugInfo, true);
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -933,11 +915,6 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
|
|||||||
m_inferenceStream);
|
m_inferenceStream);
|
||||||
|
|
||||||
if (copyErr != cudaSuccess) {
|
if (copyErr != cudaSuccess) {
|
||||||
std::string errMsg = "[Engine] runInference FAIL: cudaMemcpyAsync output "
|
|
||||||
+ std::to_string(outputIdx) + " batch " + std::to_string(batch)
|
|
||||||
+ ": " + cudaGetErrorString(copyErr);
|
|
||||||
std::cout << errMsg << std::endl;
|
|
||||||
logEngineEvent(errMsg, true);
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -953,10 +930,6 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>>& i
|
|||||||
if (syncErr != cudaSuccess) {
|
if (syncErr != cudaSuccess) {
|
||||||
ANS_DBG("TRT_Engine", "ERROR: cudaStreamSync FAILED err=%d (%s)",
|
ANS_DBG("TRT_Engine", "ERROR: cudaStreamSync FAILED err=%d (%s)",
|
||||||
(int)syncErr, cudaGetErrorString(syncErr));
|
(int)syncErr, cudaGetErrorString(syncErr));
|
||||||
std::string errMsg = "[Engine] runInference FAIL: cudaStreamSynchronize: "
|
|
||||||
+ std::string(cudaGetErrorString(syncErr));
|
|
||||||
std::cout << errMsg << std::endl;
|
|
||||||
logEngineEvent(errMsg, true);
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user