From 257145c42982345744a234ba63c2c8fc5dd9e528 Mon Sep 17 00:00:00 2001
From: Tuan Nghia Nguyen <nghia.nguyen@anscenter.com>
Date: Fri, 10 Apr 2026 17:13:47 +1000
Subject: [PATCH] Fix Intel ONNX runtime

---
 .claude/settings.local.json         |  8 +++-
 engines/ONNXEngine/ONNXEngine.cpp   | 15 +++++-
 engines/ONNXEngine/ONNXEngine.h     |  4 ++
 modules/ANSODEngine/ANSONNXYOLO.cpp | 71 +++++++++++++++++++++++++++++
 modules/ANSODEngine/ANSONNXYOLO.h   |  8 ++++
 5 files changed, 104 insertions(+), 2 deletions(-)
diff --git a/.claude/settings.local.json b/.claude/settings.local.json
index 5e06612..cc2aa19 100644
--- a/.claude/settings.local.json
+++ b/.claude/settings.local.json
@@ -89,7 +89,13 @@
       "Read(//c/ANSLibs/onnxruntime/include//**)",
       "Read(//c/ANSLibs/onnxruntime/lib/**)",
       "Read(//c/ANSLibs/onnxruntime/**)",
-      "Bash(python -c ':*)"
+      "Bash(python -c ':*)",
+      "Bash(grep -oE \"WATCHDOG-[0-9-]+\\\\.dmp|Minidump\\\\\\\\\\\\\\\\[0-9-]+\\\\.dmp\" \"C:\\\\Users\\\\nghia\\\\Downloads\\\\AMDeventlog3.xml\")",
+      "Bash(grep -oE \"\\\\\\\\\\\\\\\\Minidump\\\\\\\\\\\\\\\\[^<]+\\\\.dmp\" \"C:\\\\Users\\\\nghia\\\\Downloads\\\\AMDeventlog3.xml\")",
+      "mcp__desktop-commander__start_search",
+      "mcp__desktop-commander__get_more_search_results",
+      "mcp__desktop-commander__list_searches",
+      "Bash(grep -oE \"Faulting[^<]{1,500}|APPLICATION CRASHING[^<]{1,300}|EventID[^>]*>1000[^<]*|ExceptionCode[^<]{1,100}|onnxruntime[^<]{1,300}\" \"C:\\\\Users\\\\nghia\\\\Downloads\\\\Evenlog.xml\")"
     ]
   }
 }
diff --git a/engines/ONNXEngine/ONNXEngine.cpp b/engines/ONNXEngine/ONNXEngine.cpp
index 5cca830..d5549e3 100644
--- a/engines/ONNXEngine/ONNXEngine.cpp
+++ b/engines/ONNXEngine/ONNXEngine.cpp
@@ -233,7 +233,20 @@ namespace ANSCENTER {
         if (engine == EngineType::AMD_GPU) {
             session_options.DisableMemPattern();
             session_options.SetExecutionMode(ExecutionMode::ORT_SEQUENTIAL);
-            ANS_DBG("OrtHandler", "DirectML: DisableMemPattern + ORT_SEQUENTIAL set");
+
+            // DirectML 1.15.4 (the latest; Microsoft has moved DirectML into
+            // sustained engineering only) has a deterministic crash path in
+            // amdkmdag.sys +0xf03d on RDNA2 iGPUs (Radeon 680M on Ryzen 6000)
+            // when ORT_ENABLE_ALL applies layout-reorder transforms to
+            // YOLO-style conv graphs. Downgrade to EXTENDED on DML: still
+            // keeps constant folding and Conv+BN+ReLU fusion (the big wins),
+            // drops the risky layout transforms. Perf impact on YOLO is
+            // typically under 5%.
+            session_options.SetGraphOptimizationLevel(
+                GraphOptimizationLevel::ORT_ENABLE_EXTENDED);
+
+            ANS_DBG("OrtHandler",
+                "DirectML: DisableMemPattern + ORT_SEQUENTIAL + EXTENDED opt");
         }
 
         std::vector<std::string> available = Ort::GetAvailableProviders();
diff --git a/engines/ONNXEngine/ONNXEngine.h b/engines/ONNXEngine/ONNXEngine.h
index 08f164f..ed957b5 100644
--- a/engines/ONNXEngine/ONNXEngine.h
+++ b/engines/ONNXEngine/ONNXEngine.h
@@ -294,6 +294,10 @@ namespace ANSCENTER {
 
         BasicOrtHandler(const BasicOrtHandler&) = delete;
         BasicOrtHandler& operator=(const BasicOrtHandler&) = delete;
+    public:
+        // Resolved EP type (after EPLoader fallback). Subclasses use this
+        // to branch on actual EP at inference time.
+        EngineType getEngineType() const { return m_engineType; }
     private:
         void initialize_handler();
     protected:
diff --git a/modules/ANSODEngine/ANSONNXYOLO.cpp b/modules/ANSODEngine/ANSONNXYOLO.cpp
index 4242da5..caef52f 100644
--- a/modules/ANSODEngine/ANSONNXYOLO.cpp
+++ b/modules/ANSODEngine/ANSONNXYOLO.cpp
@@ -4,6 +4,7 @@
 #include "NV12PreprocessHelper.h"   // tl_currentGpuFrame()
 #include <numeric>   // std::iota
 #include <cmath>
+#include <chrono>    // WarmUpEngine() timing
 
 namespace ANSCENTER {
 
@@ -1587,6 +1588,64 @@ namespace ANSCENTER {
         }
     }
 
+    // ========================================================================
+    // WarmUpEngine — run 2 dummy inferences after session creation
+    //
+    // On AMD RDNA2 iGPUs (e.g. Radeon 680M on Ryzen 6000-series APUs), the
+    // very first detect() call triggers DirectML shader compile + GPU kernel
+    // cache population for the entire YOLO graph.  That first pass can
+    // legitimately take several seconds of sustained GPU work, which is long
+    // enough to coincide with TDR watchdog firing and has triggered
+    // amdkmdag.sys bugchecks at +0xf03d under DirectML 1.15.4 (the latest).
+    //
+    // Running 2 dummy inferences at startup burns the compile cost under
+    // controlled conditions so that the first real frame is already fast.
+    // The second call should always be quick and confirms the cache is warm.
+    //
+    // Non-fatal on failure: if warm-up itself crashes, regular inference may
+    // still succeed, or will fail with a clearer error message.
+    // ========================================================================
+    void ANSONNXYOLO::WarmUpEngine() {
+        if (!m_ortEngine) return;
+
+        // Warm-up exists solely to pre-compile DirectML shaders on AMD RDNA2
+        // iGPUs (Radeon 680M).  It has no benefit on CPU / OpenVINO / CUDA
+        // and running detect() at load time has been observed to expose
+        // latent heap-corruption bugs (ntdll +0x1176e5 / STATUS_HEAP_CORRUPTION
+        // 0xc0000374) on Intel machines.  Gate strictly on AMD_GPU.
+        if (m_ortEngine->getEngineType() != EngineType::AMD_GPU) {
+            ANS_DBG("ONNXYOLO", "Warm-up skipped (non-AMD EP)");
+            return;
+        }
+
+        try {
+            const int w = _modelConfig.inpWidth  > 0 ? _modelConfig.inpWidth  : 640;
+            const int h = _modelConfig.inpHeight > 0 ? _modelConfig.inpHeight : 640;
+
+            // Mid-gray BGR image matches the letterbox fill colour used in
+            // preprocessing (114,114,114 ~ 128) and avoids degenerate inputs.
+            cv::Mat dummy(h, w, CV_8UC3, cv::Scalar(128, 128, 128));
+
+            ANS_DBG("ONNXYOLO", "Warm-up: running 2 dummy inferences (%dx%d)", w, h);
+
+            for (int i = 0; i < 2; ++i) {
+                auto t0 = std::chrono::steady_clock::now();
+                (void)m_ortEngine->detect(dummy, _classes,
+                                          PROBABILITY_THRESHOLD,
+                                          NMS_THRESHOLD,
+                                          NUM_KPS);
+                auto t1 = std::chrono::steady_clock::now();
+                auto ms = std::chrono::duration_cast<std::chrono::milliseconds>(t1 - t0).count();
+                ANS_DBG("ONNXYOLO", "Warm-up #%d: %lld ms", i, (long long)ms);
+            }
+        }
+        catch (const std::exception& e) {
+            _logger.LogError("ANSONNXYOLO::WarmUpEngine",
+                std::string("Warm-up failed (non-fatal): ") + e.what(),
+                __FILE__, __LINE__);
+        }
+    }
+
     bool ANSONNXYOLO::Initialize(std::string licenseKey, ModelConfig modelConfig,
                                    const std::string& modelZipFilePath,
                                    const std::string& modelZipPassword,
@@ -1651,6 +1710,10 @@ namespace ANSCENTER {
                 }
             }
 
+            // Pre-compile DirectML shaders / kernel cache before first real
+            // frame (mitigates amdkmdag TDR on Radeon 680M).  Non-fatal.
+            WarmUpEngine();
+
             _modelLoadValid = true;
             _isInitialized  = true;
             return true;
@@ -1708,6 +1771,10 @@ namespace ANSCENTER {
                 }
             }
 
+            // Pre-compile DirectML shaders / kernel cache before first real
+            // frame (mitigates amdkmdag TDR on Radeon 680M).  Non-fatal.
+            WarmUpEngine();
+
             _modelLoadValid = true;
             _isInitialized  = true;
             return true;
@@ -1778,6 +1845,10 @@ namespace ANSCENTER {
                 }
             }
 
+            // Pre-compile DirectML shaders / kernel cache before first real
+            // frame (mitigates amdkmdag TDR on Radeon 680M).  Non-fatal.
+            WarmUpEngine();
+
             _modelLoadValid = true;
             _isInitialized  = true;
             return true;
diff --git a/modules/ANSODEngine/ANSONNXYOLO.h b/modules/ANSODEngine/ANSONNXYOLO.h
index b8dc0a2..1e85136 100644
--- a/modules/ANSODEngine/ANSONNXYOLO.h
+++ b/modules/ANSODEngine/ANSONNXYOLO.h
@@ -230,6 +230,14 @@ namespace ANSCENTER {
 
         // Initialise ORT engine from the resolved model path
         bool InitOrtEngine();
+
+        // Run a few dummy inferences after session creation to force
+        // DirectML shader compile + kernel cache warm-up BEFORE the first
+        // real frame arrives.  On AMD RDNA2 iGPUs (Radeon 680M) the first
+        // real inference can otherwise take several seconds of GPU time
+        // while compiling shaders, which has triggered amdkmdag.sys
+        // bugchecks under DirectML 1.15.4.  Non-fatal on failure.
+        void WarmUpEngine();
     public:
         // Initialise ORT engine with explicit engine type override (e.g. CPU fallback for AMD iGPUs)
         bool InitOrtEngine(ANSCENTER::EngineType engineType);