From 257145c42982345744a234ba63c2c8fc5dd9e528 Mon Sep 17 00:00:00 2001 From: Tuan Nghia Nguyen Date: Fri, 10 Apr 2026 17:13:47 +1000 Subject: [PATCH] Fix Intel ONNX runtime --- .claude/settings.local.json | 8 +++- engines/ONNXEngine/ONNXEngine.cpp | 15 +++++- engines/ONNXEngine/ONNXEngine.h | 4 ++ modules/ANSODEngine/ANSONNXYOLO.cpp | 71 +++++++++++++++++++++++++++++ modules/ANSODEngine/ANSONNXYOLO.h | 8 ++++ 5 files changed, 104 insertions(+), 2 deletions(-) diff --git a/.claude/settings.local.json b/.claude/settings.local.json index 5e06612..cc2aa19 100644 --- a/.claude/settings.local.json +++ b/.claude/settings.local.json @@ -89,7 +89,13 @@ "Read(//c/ANSLibs/onnxruntime/include//**)", "Read(//c/ANSLibs/onnxruntime/lib/**)", "Read(//c/ANSLibs/onnxruntime/**)", - "Bash(python -c ':*)" + "Bash(python -c ':*)", + "Bash(grep -oE \"WATCHDOG-[0-9-]+\\\\.dmp|Minidump\\\\\\\\\\\\\\\\[0-9-]+\\\\.dmp\" \"C:\\\\Users\\\\nghia\\\\Downloads\\\\AMDeventlog3.xml\")", + "Bash(grep -oE \"\\\\\\\\\\\\\\\\Minidump\\\\\\\\\\\\\\\\[^<]+\\\\.dmp\" \"C:\\\\Users\\\\nghia\\\\Downloads\\\\AMDeventlog3.xml\")", + "mcp__desktop-commander__start_search", + "mcp__desktop-commander__get_more_search_results", + "mcp__desktop-commander__list_searches", + "Bash(grep -oE \"Faulting[^<]{1,500}|APPLICATION CRASHING[^<]{1,300}|EventID[^>]*>1000[^<]*|ExceptionCode[^<]{1,100}|onnxruntime[^<]{1,300}\" \"C:\\\\Users\\\\nghia\\\\Downloads\\\\Evenlog.xml\")" ] } } diff --git a/engines/ONNXEngine/ONNXEngine.cpp b/engines/ONNXEngine/ONNXEngine.cpp index 5cca830..d5549e3 100644 --- a/engines/ONNXEngine/ONNXEngine.cpp +++ b/engines/ONNXEngine/ONNXEngine.cpp @@ -233,7 +233,20 @@ namespace ANSCENTER { if (engine == EngineType::AMD_GPU) { session_options.DisableMemPattern(); session_options.SetExecutionMode(ExecutionMode::ORT_SEQUENTIAL); - ANS_DBG("OrtHandler", "DirectML: DisableMemPattern + ORT_SEQUENTIAL set"); + + // DirectML 1.15.4 (the latest; Microsoft has moved DirectML into + // sustained engineering 
only) has a deterministic crash path in + // amdkmdag.sys +0xf03d on RDNA2 iGPUs (Radeon 680M on Ryzen 6000) + // when ORT_ENABLE_ALL applies layout-reorder transforms to + // YOLO-style conv graphs. Downgrade to EXTENDED on DML: still + // keeps constant folding and Conv+BN+ReLU fusion (the big wins), + // drops the risky layout transforms. Perf impact on YOLO is + // typically under 5%. + session_options.SetGraphOptimizationLevel( + GraphOptimizationLevel::ORT_ENABLE_EXTENDED); + + ANS_DBG("OrtHandler", + "DirectML: DisableMemPattern + ORT_SEQUENTIAL + EXTENDED opt"); } std::vector available = Ort::GetAvailableProviders(); diff --git a/engines/ONNXEngine/ONNXEngine.h b/engines/ONNXEngine/ONNXEngine.h index 08f164f..ed957b5 100644 --- a/engines/ONNXEngine/ONNXEngine.h +++ b/engines/ONNXEngine/ONNXEngine.h @@ -294,6 +294,10 @@ namespace ANSCENTER { BasicOrtHandler(const BasicOrtHandler&) = delete; BasicOrtHandler& operator=(const BasicOrtHandler&) = delete; + public: + // Resolved EP type (after EPLoader fallback). Subclasses use this + // to branch on actual EP at inference time. + EngineType getEngineType() const { return m_engineType; } private: void initialize_handler(); protected: diff --git a/modules/ANSODEngine/ANSONNXYOLO.cpp b/modules/ANSODEngine/ANSONNXYOLO.cpp index 4242da5..caef52f 100644 --- a/modules/ANSODEngine/ANSONNXYOLO.cpp +++ b/modules/ANSODEngine/ANSONNXYOLO.cpp @@ -4,6 +4,7 @@ #include "NV12PreprocessHelper.h" // tl_currentGpuFrame() #include // std::iota #include +#include // WarmUpEngine() timing namespace ANSCENTER { @@ -1587,6 +1588,64 @@ namespace ANSCENTER { } } + // ======================================================================== + // WarmUpEngine — run 2 dummy inferences after session creation + // + // On AMD RDNA2 iGPUs (e.g. Radeon 680M on Ryzen 6000-series APUs), the + // very first detect() call triggers DirectML shader compile + GPU kernel + // cache population for the entire YOLO graph. 
That first pass can + // legitimately take several seconds of sustained GPU work, which is long + // enough to coincide with TDR watchdog firing and has triggered + // amdkmdag.sys bugchecks at +0xf03d under DirectML 1.15.4 (the latest). + // + // Running 2 dummy inferences at startup burns the compile cost under + // controlled conditions so that the first real frame is already fast. + // The second call should always be quick and confirms the cache is warm. + // + // Non-fatal on failure: if warm-up itself crashes, regular inference may + // still succeed, or will fail with a clearer error message. + // ======================================================================== + void ANSONNXYOLO::WarmUpEngine() { + if (!m_ortEngine) return; + + // Warm-up exists solely to pre-compile DirectML shaders on AMD RDNA2 + // iGPUs (Radeon 680M). It has no benefit on CPU / OpenVINO / CUDA + // and running detect() at load time has been observed to expose + // latent heap-corruption bugs (ntdll +0x1176e5 / STATUS_HEAP_CORRUPTION + // 0xc0000374) on Intel machines. Gate strictly on AMD_GPU. + if (m_ortEngine->getEngineType() != EngineType::AMD_GPU) { + ANS_DBG("ONNXYOLO", "Warm-up skipped (non-AMD EP)"); + return; + } + + try { + const int w = _modelConfig.inpWidth > 0 ? _modelConfig.inpWidth : 640; + const int h = _modelConfig.inpHeight > 0 ? _modelConfig.inpHeight : 640; + + // Mid-gray BGR image matches the letterbox fill colour used in + // preprocessing (114,114,114 ~ 128) and avoids degenerate inputs. 
+ cv::Mat dummy(h, w, CV_8UC3, cv::Scalar(128, 128, 128)); + + ANS_DBG("ONNXYOLO", "Warm-up: running 2 dummy inferences (%dx%d)", w, h); + + for (int i = 0; i < 2; ++i) { + auto t0 = std::chrono::steady_clock::now(); + (void)m_ortEngine->detect(dummy, _classes, + PROBABILITY_THRESHOLD, + NMS_THRESHOLD, + NUM_KPS); + auto t1 = std::chrono::steady_clock::now(); + auto ms = std::chrono::duration_cast<std::chrono::milliseconds>(t1 - t0).count(); + ANS_DBG("ONNXYOLO", "Warm-up #%d: %lld ms", i, (long long)ms); + } + } + catch (const std::exception& e) { + _logger.LogError("ANSONNXYOLO::WarmUpEngine", + std::string("Warm-up failed (non-fatal): ") + e.what(), + __FILE__, __LINE__); + } + } + bool ANSONNXYOLO::Initialize(std::string licenseKey, ModelConfig modelConfig, const std::string& modelZipFilePath, const std::string& modelZipPassword, @@ -1651,6 +1710,10 @@ namespace ANSCENTER { } } + // Pre-compile DirectML shaders / kernel cache before first real + // frame (mitigates amdkmdag TDR on Radeon 680M). Non-fatal. + WarmUpEngine(); + _modelLoadValid = true; _isInitialized = true; return true; @@ -1708,6 +1771,10 @@ namespace ANSCENTER { } } + // Pre-compile DirectML shaders / kernel cache before first real + // frame (mitigates amdkmdag TDR on Radeon 680M). Non-fatal. + WarmUpEngine(); + _modelLoadValid = true; _isInitialized = true; return true; @@ -1778,6 +1845,10 @@ namespace ANSCENTER { } } + // Pre-compile DirectML shaders / kernel cache before first real + // frame (mitigates amdkmdag TDR on Radeon 680M). Non-fatal. 
+ WarmUpEngine(); + _modelLoadValid = true; _isInitialized = true; return true; diff --git a/modules/ANSODEngine/ANSONNXYOLO.h b/modules/ANSODEngine/ANSONNXYOLO.h index b8dc0a2..1e85136 100644 --- a/modules/ANSODEngine/ANSONNXYOLO.h +++ b/modules/ANSODEngine/ANSONNXYOLO.h @@ -230,6 +230,14 @@ namespace ANSCENTER { // Initialise ORT engine from the resolved model path bool InitOrtEngine(); + + // Run a few dummy inferences after session creation to force + // DirectML shader compile + kernel cache warm-up BEFORE the first + // real frame arrives. On AMD RDNA2 iGPUs (Radeon 680M) the first + // real inference can otherwise take several seconds of GPU time + // while compiling shaders, which has triggered amdkmdag.sys + // bugchecks under DirectML 1.15.4. Non-fatal on failure. + void WarmUpEngine(); public: // Initialise ORT engine with explicit engine type override (e.g. CPU fallback for AMD iGPUs) bool InitOrtEngine(ANSCENTER::EngineType engineType);