Fix Intel ONNX runtime

This commit is contained in:
2026-04-10 17:13:47 +10:00
parent eb863bf510
commit 257145c429
5 changed files with 104 additions and 2 deletions

View File

@@ -4,6 +4,7 @@
#include "NV12PreprocessHelper.h" // tl_currentGpuFrame()
#include <numeric> // std::iota
#include <cmath>
#include <chrono> // WarmUpEngine() timing
namespace ANSCENTER {
@@ -1587,6 +1588,64 @@ namespace ANSCENTER {
}
}
// ========================================================================
// WarmUpEngine — execute two throw-away inferences once the session exists
//
// Background (as reported by the original author): on AMD RDNA2 iGPUs
// (e.g. Radeon 680M on Ryzen 6000-series APUs) the very first detect()
// call makes DirectML compile shaders and populate the GPU kernel cache
// for the whole YOLO graph. That pass can legitimately take several
// seconds of sustained GPU work — long enough to coincide with the TDR
// watchdog firing — and has triggered amdkmdag.sys bugchecks at +0xf03d
// under DirectML 1.15.4 (the latest at the time of writing).
//
// Paying the compile cost at startup, under controlled conditions, means
// the first real frame is already fast. The second dummy call should
// always be quick and serves as confirmation that the cache is warm.
//
// The routine is a no-op when there is no engine or when the engine type
// is anything other than EngineType::AMD_GPU.
//
// Failure is non-fatal: a std::exception raised during warm-up is logged
// and swallowed; regular inference may still succeed afterwards, or will
// fail later with a clearer error message.
// ========================================================================
void ANSONNXYOLO::WarmUpEngine() {
    // Nothing to warm up without an engine instance.
    if (!m_ortEngine) {
        return;
    }

    // The only purpose of this routine is pre-compiling DirectML shaders
    // on AMD RDNA2 iGPUs (Radeon 680M). CPU / OpenVINO / CUDA gain nothing
    // from it, and calling detect() at load time has been observed to
    // expose latent heap-corruption bugs (ntdll +0x1176e5 /
    // STATUS_HEAP_CORRUPTION 0xc0000374) on Intel machines — so every
    // engine type other than AMD_GPU bails out here.
    if (m_ortEngine->getEngineType() != EngineType::AMD_GPU) {
        ANS_DBG("ONNXYOLO", "Warm-up skipped (non-AMD EP)");
        return;
    }

    try {
        // Use the configured network input size; fall back to 640 for any
        // dimension that is not a positive value.
        int inputW = 640;
        if (_modelConfig.inpWidth > 0) {
            inputW = _modelConfig.inpWidth;
        }
        int inputH = 640;
        if (_modelConfig.inpHeight > 0) {
            inputH = _modelConfig.inpHeight;
        }

        // Uniform mid-gray (128) 3-channel frame. The original author notes
        // this approximates the 114,114,114 letterbox fill used during
        // preprocessing and avoids degenerate inputs.
        cv::Mat dummy(inputH, inputW, CV_8UC3, cv::Scalar(128, 128, 128));

        ANS_DBG("ONNXYOLO", "Warm-up: running 2 dummy inferences (%dx%d)", inputW, inputH);

        int pass = 0;
        while (pass < 2) {
            const auto started = std::chrono::steady_clock::now();
            // Detections are irrelevant here; only the side effect of
            // running the graph matters.
            (void)m_ortEngine->detect(dummy, _classes,
                                      PROBABILITY_THRESHOLD,
                                      NMS_THRESHOLD,
                                      NUM_KPS);
            const auto finished = std::chrono::steady_clock::now();

            const auto elapsedMs =
                std::chrono::duration_cast<std::chrono::milliseconds>(finished - started).count();
            ANS_DBG("ONNXYOLO", "Warm-up #%d: %lld ms", pass, static_cast<long long>(elapsedMs));
            ++pass;
        }
    }
    catch (const std::exception& e) {
        // Deliberately swallowed: a failed warm-up must not abort model load.
        const std::string message = std::string("Warm-up failed (non-fatal): ") + e.what();
        _logger.LogError("ANSONNXYOLO::WarmUpEngine", message, __FILE__, __LINE__);
    }
}
bool ANSONNXYOLO::Initialize(std::string licenseKey, ModelConfig modelConfig,
const std::string& modelZipFilePath,
const std::string& modelZipPassword,
@@ -1651,6 +1710,10 @@ namespace ANSCENTER {
}
}
// Pre-compile DirectML shaders / kernel cache before first real
// frame (mitigates amdkmdag TDR on Radeon 680M). Non-fatal.
WarmUpEngine();
_modelLoadValid = true;
_isInitialized = true;
return true;
@@ -1708,6 +1771,10 @@ namespace ANSCENTER {
}
}
// Pre-compile DirectML shaders / kernel cache before first real
// frame (mitigates amdkmdag TDR on Radeon 680M). Non-fatal.
WarmUpEngine();
_modelLoadValid = true;
_isInitialized = true;
return true;
@@ -1778,6 +1845,10 @@ namespace ANSCENTER {
}
}
// Pre-compile DirectML shaders / kernel cache before first real
// frame (mitigates amdkmdag TDR on Radeon 680M). Non-fatal.
WarmUpEngine();
_modelLoadValid = true;
_isInitialized = true;
return true;