Fix Intel ONNX runtime
This commit is contained in:
@@ -89,7 +89,13 @@
|
|||||||
"Read(//c/ANSLibs/onnxruntime/include//**)",
|
"Read(//c/ANSLibs/onnxruntime/include//**)",
|
||||||
"Read(//c/ANSLibs/onnxruntime/lib/**)",
|
"Read(//c/ANSLibs/onnxruntime/lib/**)",
|
||||||
"Read(//c/ANSLibs/onnxruntime/**)",
|
"Read(//c/ANSLibs/onnxruntime/**)",
|
||||||
"Bash(python -c ':*)"
|
"Bash(python -c ':*)",
|
||||||
|
"Bash(grep -oE \"WATCHDOG-[0-9-]+\\\\.dmp|Minidump\\\\\\\\\\\\\\\\[0-9-]+\\\\.dmp\" \"C:\\\\Users\\\\nghia\\\\Downloads\\\\AMDeventlog3.xml\")",
|
||||||
|
"Bash(grep -oE \"\\\\\\\\\\\\\\\\Minidump\\\\\\\\\\\\\\\\[^<]+\\\\.dmp\" \"C:\\\\Users\\\\nghia\\\\Downloads\\\\AMDeventlog3.xml\")",
|
||||||
|
"mcp__desktop-commander__start_search",
|
||||||
|
"mcp__desktop-commander__get_more_search_results",
|
||||||
|
"mcp__desktop-commander__list_searches",
|
||||||
|
"Bash(grep -oE \"Faulting[^<]{1,500}|APPLICATION CRASHING[^<]{1,300}|EventID[^>]*>1000[^<]*|ExceptionCode[^<]{1,100}|onnxruntime[^<]{1,300}\" \"C:\\\\Users\\\\nghia\\\\Downloads\\\\Evenlog.xml\")"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -233,7 +233,20 @@ namespace ANSCENTER {
|
|||||||
if (engine == EngineType::AMD_GPU) {
|
if (engine == EngineType::AMD_GPU) {
|
||||||
session_options.DisableMemPattern();
|
session_options.DisableMemPattern();
|
||||||
session_options.SetExecutionMode(ExecutionMode::ORT_SEQUENTIAL);
|
session_options.SetExecutionMode(ExecutionMode::ORT_SEQUENTIAL);
|
||||||
ANS_DBG("OrtHandler", "DirectML: DisableMemPattern + ORT_SEQUENTIAL set");
|
|
||||||
|
// DirectML 1.15.4 (the latest; Microsoft has moved DirectML into
|
||||||
|
// sustained engineering only) has a deterministic crash path in
|
||||||
|
// amdkmdag.sys +0xf03d on RDNA2 iGPUs (Radeon 680M on Ryzen 6000)
|
||||||
|
// when ORT_ENABLE_ALL applies layout-reorder transforms to
|
||||||
|
// YOLO-style conv graphs. Downgrade to EXTENDED on DML: still
|
||||||
|
// keeps constant folding and Conv+BN+ReLU fusion (the big wins),
|
||||||
|
// drops the risky layout transforms. Perf impact on YOLO is
|
||||||
|
// typically under 5%.
|
||||||
|
session_options.SetGraphOptimizationLevel(
|
||||||
|
GraphOptimizationLevel::ORT_ENABLE_EXTENDED);
|
||||||
|
|
||||||
|
ANS_DBG("OrtHandler",
|
||||||
|
"DirectML: DisableMemPattern + ORT_SEQUENTIAL + EXTENDED opt");
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<std::string> available = Ort::GetAvailableProviders();
|
std::vector<std::string> available = Ort::GetAvailableProviders();
|
||||||
|
|||||||
@@ -294,6 +294,10 @@ namespace ANSCENTER {
|
|||||||
|
|
||||||
BasicOrtHandler(const BasicOrtHandler&) = delete;
|
BasicOrtHandler(const BasicOrtHandler&) = delete;
|
||||||
BasicOrtHandler& operator=(const BasicOrtHandler&) = delete;
|
BasicOrtHandler& operator=(const BasicOrtHandler&) = delete;
|
||||||
|
public:
|
||||||
|
// Resolved EP type (after EPLoader fallback). Subclasses use this
|
||||||
|
// to branch on actual EP at inference time.
|
||||||
|
EngineType getEngineType() const
{
    // Plain read-only accessor: hands back the stored m_engineType by value.
    return m_engineType;
}
|
||||||
private:
|
private:
|
||||||
void initialize_handler();
|
void initialize_handler();
|
||||||
protected:
|
protected:
|
||||||
|
|||||||
@@ -4,6 +4,7 @@
|
|||||||
#include "NV12PreprocessHelper.h" // tl_currentGpuFrame()
|
#include "NV12PreprocessHelper.h" // tl_currentGpuFrame()
|
||||||
#include <numeric> // std::iota
|
#include <numeric> // std::iota
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
|
#include <chrono> // WarmUpEngine() timing
|
||||||
|
|
||||||
namespace ANSCENTER {
|
namespace ANSCENTER {
|
||||||
|
|
||||||
@@ -1587,6 +1588,64 @@ namespace ANSCENTER {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ========================================================================
|
||||||
|
// WarmUpEngine — run 2 dummy inferences after session creation
|
||||||
|
//
|
||||||
|
// On AMD RDNA2 iGPUs (e.g. Radeon 680M on Ryzen 6000-series APUs), the
|
||||||
|
// very first detect() call triggers DirectML shader compile + GPU kernel
|
||||||
|
// cache population for the entire YOLO graph. That first pass can
|
||||||
|
// legitimately take several seconds of sustained GPU work, which is long
|
||||||
|
// enough to coincide with TDR watchdog firing and has triggered
|
||||||
|
// amdkmdag.sys bugchecks at +0xf03d under DirectML 1.15.4 (the latest).
|
||||||
|
//
|
||||||
|
// Running 2 dummy inferences at startup burns the compile cost under
|
||||||
|
// controlled conditions so that the first real frame is already fast.
|
||||||
|
// The second call should always be quick and confirms the cache is warm.
|
||||||
|
//
|
||||||
|
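// Non-fatal on failure: if a warm-up inference throws a std::exception it is
|
||||||
|
// logged and swallowed; regular inference may still succeed, or will fail with a clearer error message. A hard crash (driver bugcheck) cannot be caught here.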
|
||||||
|
// ========================================================================
|
||||||
|
void ANSONNXYOLO::WarmUpEngine() {
|
||||||
|
if (!m_ortEngine) return;
|
||||||
|
|
||||||
|
// Warm-up exists solely to pre-compile DirectML shaders on AMD RDNA2
|
||||||
|
// iGPUs (Radeon 680M). It has no benefit on CPU / OpenVINO / CUDA
|
||||||
|
// and running detect() at load time has been observed to expose
|
||||||
|
// latent heap-corruption bugs (ntdll +0x1176e5 / STATUS_HEAP_CORRUPTION
|
||||||
|
// 0xc0000374) on Intel machines. Gate strictly on AMD_GPU.
|
||||||
|
if (m_ortEngine->getEngineType() != EngineType::AMD_GPU) {
|
||||||
|
ANS_DBG("ONNXYOLO", "Warm-up skipped (non-AMD EP)");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
const int w = _modelConfig.inpWidth > 0 ? _modelConfig.inpWidth : 640;
|
||||||
|
const int h = _modelConfig.inpHeight > 0 ? _modelConfig.inpHeight : 640;
|
||||||
|
|
||||||
|
// Mid-gray BGR image matches the letterbox fill colour used in
|
||||||
|
// preprocessing (114,114,114 ~ 128) and avoids degenerate inputs.
|
||||||
|
cv::Mat dummy(h, w, CV_8UC3, cv::Scalar(128, 128, 128));
|
||||||
|
|
||||||
|
ANS_DBG("ONNXYOLO", "Warm-up: running 2 dummy inferences (%dx%d)", w, h);
|
||||||
|
|
||||||
|
for (int i = 0; i < 2; ++i) {
|
||||||
|
auto t0 = std::chrono::steady_clock::now();
|
||||||
|
(void)m_ortEngine->detect(dummy, _classes,
|
||||||
|
PROBABILITY_THRESHOLD,
|
||||||
|
NMS_THRESHOLD,
|
||||||
|
NUM_KPS);
|
||||||
|
auto t1 = std::chrono::steady_clock::now();
|
||||||
|
auto ms = std::chrono::duration_cast<std::chrono::milliseconds>(t1 - t0).count();
|
||||||
|
ANS_DBG("ONNXYOLO", "Warm-up #%d: %lld ms", i, (long long)ms);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
catch (const std::exception& e) {
|
||||||
|
_logger.LogError("ANSONNXYOLO::WarmUpEngine",
|
||||||
|
std::string("Warm-up failed (non-fatal): ") + e.what(),
|
||||||
|
__FILE__, __LINE__);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
bool ANSONNXYOLO::Initialize(std::string licenseKey, ModelConfig modelConfig,
|
bool ANSONNXYOLO::Initialize(std::string licenseKey, ModelConfig modelConfig,
|
||||||
const std::string& modelZipFilePath,
|
const std::string& modelZipFilePath,
|
||||||
const std::string& modelZipPassword,
|
const std::string& modelZipPassword,
|
||||||
@@ -1651,6 +1710,10 @@ namespace ANSCENTER {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Pre-compile DirectML shaders / kernel cache before first real
|
||||||
|
// frame (mitigates amdkmdag TDR on Radeon 680M). Non-fatal.
|
||||||
|
WarmUpEngine();
|
||||||
|
|
||||||
_modelLoadValid = true;
|
_modelLoadValid = true;
|
||||||
_isInitialized = true;
|
_isInitialized = true;
|
||||||
return true;
|
return true;
|
||||||
@@ -1708,6 +1771,10 @@ namespace ANSCENTER {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Pre-compile DirectML shaders / kernel cache before first real
|
||||||
|
// frame (mitigates amdkmdag TDR on Radeon 680M). Non-fatal.
|
||||||
|
WarmUpEngine();
|
||||||
|
|
||||||
_modelLoadValid = true;
|
_modelLoadValid = true;
|
||||||
_isInitialized = true;
|
_isInitialized = true;
|
||||||
return true;
|
return true;
|
||||||
@@ -1778,6 +1845,10 @@ namespace ANSCENTER {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Pre-compile DirectML shaders / kernel cache before first real
|
||||||
|
// frame (mitigates amdkmdag TDR on Radeon 680M). Non-fatal.
|
||||||
|
WarmUpEngine();
|
||||||
|
|
||||||
_modelLoadValid = true;
|
_modelLoadValid = true;
|
||||||
_isInitialized = true;
|
_isInitialized = true;
|
||||||
return true;
|
return true;
|
||||||
|
|||||||
@@ -230,6 +230,14 @@ namespace ANSCENTER {
|
|||||||
|
|
||||||
// Initialise ORT engine from the resolved model path
|
// Initialise ORT engine from the resolved model path
|
||||||
bool InitOrtEngine();
|
bool InitOrtEngine();
|
||||||
|
|
||||||
|
// Run two dummy inferences after session creation to force
|
||||||
|
// DirectML shader compile + kernel cache warm-up BEFORE the first
|
||||||
|
// real frame arrives. On AMD RDNA2 iGPUs (Radeon 680M) the first
|
||||||
|
// real inference can otherwise take several seconds of GPU time
|
||||||
|
// while compiling shaders, which has triggered amdkmdag.sys
|
||||||
|
// bugchecks under DirectML 1.15.4. Non-fatal on failure.
|
||||||
|
void WarmUpEngine();
|
||||||
public:
|
public:
|
||||||
// Initialise ORT engine with explicit engine type override (e.g. CPU fallback for AMD iGPUs)
|
// Initialise ORT engine with explicit engine type override (e.g. CPU fallback for AMD iGPUs)
|
||||||
bool InitOrtEngine(ANSCENTER::EngineType engineType);
|
bool InitOrtEngine(ANSCENTER::EngineType engineType);
|
||||||
|
|||||||
Reference in New Issue
Block a user