Add CPU/GPU gate and support new ANSALPR using OCR

This commit is contained in:
2026-04-12 17:16:16 +10:00
parent 27083a6530
commit 0a8aaed215
30 changed files with 1870 additions and 2166 deletions

View File

@@ -237,6 +237,38 @@ namespace ANSCENTER {
output_node_names.data(),
num_outputs);
// ── Output shape sanity check ───────────────────────────────────
// DirectML on some AMD configurations has been observed to return
// output tensors whose dim[1]/dim[2] values don't match what the
// ONNX graph actually produced, which propagates into
// postprocessLegacy / postprocessEndToEnd as huge numBoxes /
// numChannels values and causes multi-terabyte cv::Mat allocations
// inside the `cv::Mat(numChannels, numBoxes, CV_32F, ...).t()`
// call (observed as "Failed to allocate 3522082959360 bytes" on
// Ryzen APUs). Bail out early here instead of letting the
// postprocess layer try to materialise a 3.5 TB buffer.
//
// Sane upper bounds for Ultralytics YOLO outputs:
// • legacy [1, 84..300, 8400..25200] → max dim ≈ 30k
// • end2end [1, 300, 6..56] → max dim ≈ 300
// • segmentation proto mask [1, 32, 160, 160] → max dim ≈ 160
// • classification [1, 1000] → max dim ≈ 1k
// 1,000,000 is ~30x the largest real-world dim and catches the
// garbage values without clipping any legitimate model.
constexpr int64_t kMaxOutputDim = 1000000;
for (size_t t = 0; t < outputTensors.size(); ++t) {
const auto shape = outputTensors[t].GetTensorTypeAndShapeInfo().GetShape();
for (size_t d = 0; d < shape.size(); ++d) {
if (shape[d] < 0 || shape[d] > kMaxOutputDim) {
std::cerr << "[ONNXYOLO] detect: output[" << t
<< "] dim[" << d << "]=" << shape[d]
<< " is out of range — refusing to postprocess."
<< std::endl;
return {};
}
}
}
const cv::Size resizedShape(
static_cast<int>(input_node_dims[3]),
static_cast<int>(input_node_dims[2]));
@@ -1399,6 +1431,23 @@ namespace ANSCENTER {
output_node_names.data(),
num_outputs);
// Output shape sanity check — see detect() for rationale. Prevents
// DirectML-returned garbage dims from propagating into postprocess
// and triggering multi-terabyte cv::Mat allocations on AMD.
constexpr int64_t kMaxOutputDim = 1000000;
for (size_t t = 0; t < outputTensors.size(); ++t) {
const auto sh = outputTensors[t].GetTensorTypeAndShapeInfo().GetShape();
for (size_t d = 0; d < sh.size(); ++d) {
if (sh[d] < 0 || sh[d] > kMaxOutputDim) {
std::cerr << "[ONNXYOLO] detectBatch: output[" << t
<< "] dim[" << d << "]=" << sh[d]
<< " is out of range — refusing to postprocess."
<< std::endl;
return std::vector<std::vector<Object>>(N);
}
}
}
const cv::Size resizedShape(
static_cast<int>(input_node_dims[3]),
static_cast<int>(input_node_dims[2]));
@@ -1589,59 +1638,92 @@ namespace ANSCENTER {
}
// ========================================================================
// WarmUpEngine — run a dummy inference after session creation.
//
// Scope: **NVIDIA (CUDA EP) only.** On first inference, the CUDA EP
// allocates its memory arena (capped at 2 GB via BasicOrtHandler config),
// resolves cuDNN convolution algorithms, and populates the kernel launch
// cache. Running one dummy inference at load time amortises this cost
// so the first real frame doesn't see a latency spike.
//
// Explicitly disabled on AMD, Intel and CPU:
//   • AMD (DirectML) — calling detect() at load time has been observed
//     to hit a multi-terabyte cv::Mat allocation inside postprocessLegacy
//     on AMD RDNA iGPUs when DirectML returns garbage output tensor
//     dims. ONNXYOLO::detect() now has an output-shape sanity guard
//     that catches this at runtime, so the warm-up would add risk
//     without benefit. Earlier builds enabled warm-up specifically for
//     Radeon 680M TDR mitigation; that workaround is obsolete with
//     current DirectML 1.15.x drivers.
//   • Intel (OpenVINO) — running detect() at load time has been
//     observed to expose latent heap-corruption bugs
//     (ntdll +0x1176e5 / STATUS_HEAP_CORRUPTION 0xc0000374).
//   • CPU EP — no shader compile or kernel cache to warm up; the first
//     real frame has the same latency as any subsequent frame.
//
// Non-fatal on failure: if warm-up itself throws, regular inference
// still works — the engine is fully loaded before WarmUpEngine runs.
// ========================================================================
void ANSONNXYOLO::WarmUpEngine() {
    if (!m_ortEngine) return;

    // Gate strictly on NVIDIA_GPU. Every other EP is a no-op (see the
    // header comment above for the per-vendor rationale).
    if (m_ortEngine->getEngineType() != EngineType::NVIDIA_GPU) {
        ANS_DBG("ONNXYOLO", "Warm-up skipped (non-NVIDIA EP)");
        return;
    }

    // ── Strict dimension validation ─────────────────────────────────
    // Defensive: refuse to warm up with implausible model dimensions.
    // _modelConfig values come from the caller's ModelConfig and are
    // normally 224..640; anything outside [32, 4096] is almost certainly
    // a bug in the caller and we skip warm-up rather than risk a huge
    // cv::Mat allocation inside detect().
    constexpr int kMinDim = 32;
    constexpr int kMaxDim = 4096;
    const int rawW = _modelConfig.inpWidth;
    const int rawH = _modelConfig.inpHeight;
    if (rawW <= 0 || rawH <= 0 || rawW > kMaxDim || rawH > kMaxDim) {
        _logger.LogWarn("ANSONNXYOLO::WarmUpEngine",
            "Warm-up skipped — suspect input dims ("
            + std::to_string(rawW) + "x" + std::to_string(rawH) + ")",
            __FILE__, __LINE__);
        return;
    }
    const int w = std::clamp(rawW, kMinDim, kMaxDim);
    const int h = std::clamp(rawH, kMinDim, kMaxDim);

    try {
        // Mid-gray BGR image matches the letterbox fill colour used in
        // preprocessing (114,114,114 ~ 128) and avoids degenerate inputs.
        cv::Mat dummy(h, w, CV_8UC3, cv::Scalar(128, 128, 128));
        ANS_DBG("ONNXYOLO", "Warm-up: running 1 dummy CUDA inference (%dx%d)", w, h);
        auto t0 = std::chrono::steady_clock::now();
        (void)m_ortEngine->detect(dummy, _classes,
                                  PROBABILITY_THRESHOLD,
                                  NMS_THRESHOLD,
                                  NUM_KPS);
        auto t1 = std::chrono::steady_clock::now();
        auto ms = std::chrono::duration_cast<std::chrono::milliseconds>(t1 - t0).count();
        ANS_DBG("ONNXYOLO", "Warm-up done: %lld ms", (long long)ms);
    }
    catch (const cv::Exception& e) {
        // Defensive — should not fire on NVIDIA CUDA EP, but if it does
        // the engine itself is still loaded and real inference will work.
        _logger.LogWarn("ANSONNXYOLO::WarmUpEngine",
            std::string("Warm-up skipped (cv::Exception, non-fatal): ") + e.what(),
            __FILE__, __LINE__);
    }
    catch (const std::exception& e) {
        _logger.LogWarn("ANSONNXYOLO::WarmUpEngine",
            std::string("Warm-up skipped (std::exception, non-fatal): ") + e.what(),
            __FILE__, __LINE__);
    }
    catch (...) {
        _logger.LogWarn("ANSONNXYOLO::WarmUpEngine",
            "Warm-up skipped (unknown exception, non-fatal)",
            __FILE__, __LINE__);
    }
}