Apply the model-extract race fix to all classes

This commit is contained in:
2026-04-24 12:19:54 +10:00
parent baa88bcc48
commit e2bf17289d
51 changed files with 1252 additions and 148 deletions

View File

@@ -1775,53 +1775,21 @@ namespace ANSCENTER {
labelMap = VectorToCommaSeparatedString(_classes);
if (this->_loadEngineOnCreation) {
// Hold the model-folder lock across session creation so a
// concurrent CreateANSODHandle on the same model cannot
// re-enter ExtractProtectedZipFile and truncate the .onnx
// file while ORT is opening it (which would surface as
// "system error number 13" EACCES from the ORT loader).
//
// Timed wait — 120s ceiling for extract + ORT session
// creation (GPU EP compile can take ~30s on large models).
// If we hit the timeout, the peer thread is deadlocked or
// wedged; fail the load instead of hanging the caller.
auto _folderLock = GetModelFolderLock(_modelFolder);
std::unique_lock<std::timed_mutex> _folderGuard(
*_folderLock, std::defer_lock);
ANS_DBG("ONNXYOLO", "Initialize: waiting on folder lock (120s): %s",
_modelFolder.c_str());
auto _lockT0 = std::chrono::steady_clock::now();
if (!_folderGuard.try_lock_for(std::chrono::seconds(120))) {
auto waitedMs = std::chrono::duration_cast<std::chrono::milliseconds>(
std::chrono::steady_clock::now() - _lockT0).count();
ANS_DBG("ONNXYOLO",
"Initialize: TIMEOUT on folder lock after %lldms: %s",
(long long)waitedMs, _modelFolder.c_str());
// Serialize session creation against concurrent extract re-entries
// on the same folder. See ModelFolderLockGuard in ANSEngineCommon.h.
ModelFolderLockGuard _flg(_modelFolder, "ANSONNXYOLO::Initialize");
if (!_flg.acquired()) {
_logger.LogError("ANSONNXYOLO::Initialize",
"Timed out waiting for model-folder lock: " + _modelFolder,
__FILE__, __LINE__);
return false;
}
auto waitedMs = std::chrono::duration_cast<std::chrono::milliseconds>(
std::chrono::steady_clock::now() - _lockT0).count();
ANS_DBG("ONNXYOLO", "Initialize: folder lock acquired in %lldms, calling InitOrtEngine",
(long long)waitedMs);
auto _initT0 = std::chrono::steady_clock::now();
if (!InitOrtEngine()) {
auto initMs = std::chrono::duration_cast<std::chrono::milliseconds>(
std::chrono::steady_clock::now() - _initT0).count();
ANS_DBG("ONNXYOLO",
"Initialize: InitOrtEngine FAILED after %lldms, model=%s",
(long long)initMs, _modelFilePath.c_str());
_logger.LogError("ANSONNXYOLO::Initialize",
"Failed to create ONNX Runtime engine: " + _modelFilePath,
__FILE__, __LINE__);
return false;
}
auto initMs = std::chrono::duration_cast<std::chrono::milliseconds>(
std::chrono::steady_clock::now() - _initT0).count();
ANS_DBG("ONNXYOLO", "Initialize: InitOrtEngine OK in %lldms",
(long long)initMs);
}
// Fix input resolution for dynamic-shape models.
@@ -1886,45 +1854,16 @@ namespace ANSCENTER {
}
if (this->_loadEngineOnCreation) {
// See ANSONNXYOLO::Initialize — hold folder lock so a sibling
// extraction cannot truncate train_last.onnx mid-load. Timed
// wait so a stuck peer cannot hang this thread forever.
auto _folderLock = GetModelFolderLock(_modelFolder);
std::unique_lock<std::timed_mutex> _folderGuard(
*_folderLock, std::defer_lock);
ANS_DBG("ONNXYOLO", "LoadModel: waiting on folder lock (120s): %s",
_modelFolder.c_str());
auto _lockT0 = std::chrono::steady_clock::now();
if (!_folderGuard.try_lock_for(std::chrono::seconds(120))) {
auto waitedMs = std::chrono::duration_cast<std::chrono::milliseconds>(
std::chrono::steady_clock::now() - _lockT0).count();
ANS_DBG("ONNXYOLO",
"LoadModel: TIMEOUT on folder lock after %lldms: %s",
(long long)waitedMs, _modelFolder.c_str());
// See ModelFolderLockGuard in ANSEngineCommon.h.
ModelFolderLockGuard _flg(_modelFolder, "ANSONNXYOLO::LoadModel");
if (!_flg.acquired()) {
_logger.LogError("ANSONNXYOLO::LoadModel",
"Timed out waiting for model-folder lock: " + _modelFolder,
__FILE__, __LINE__);
_modelLoadValid = false;
return false;
}
auto waitedMs = std::chrono::duration_cast<std::chrono::milliseconds>(
std::chrono::steady_clock::now() - _lockT0).count();
ANS_DBG("ONNXYOLO", "LoadModel: folder lock acquired in %lldms, calling InitOrtEngine",
(long long)waitedMs);
auto _initT0 = std::chrono::steady_clock::now();
if (!InitOrtEngine()) {
auto initMs = std::chrono::duration_cast<std::chrono::milliseconds>(
std::chrono::steady_clock::now() - _initT0).count();
ANS_DBG("ONNXYOLO",
"LoadModel: InitOrtEngine FAILED after %lldms, model=%s",
(long long)initMs, _modelFilePath.c_str());
_modelLoadValid = false;
return false;
}
auto initMs = std::chrono::duration_cast<std::chrono::milliseconds>(
std::chrono::steady_clock::now() - _initT0).count();
ANS_DBG("ONNXYOLO", "LoadModel: InitOrtEngine OK in %lldms",
(long long)initMs);
if (!InitOrtEngine()) { _modelLoadValid = false; return false; }
}
// Fix input resolution for dynamic-shape models (same as primary Initialize)
@@ -1999,45 +1938,16 @@ namespace ANSCENTER {
labelMap = VectorToCommaSeparatedString(_classes);
if (this->_loadEngineOnCreation) {
// See ANSONNXYOLO::Initialize — hold folder lock so a sibling
// extraction cannot truncate the model file mid-load. Timed
// wait so a stuck peer cannot hang this thread forever.
auto _folderLock = GetModelFolderLock(_modelFolder);
std::unique_lock<std::timed_mutex> _folderGuard(
*_folderLock, std::defer_lock);
ANS_DBG("ONNXYOLO", "LoadModelFromFolder: waiting on folder lock (120s): %s",
_modelFolder.c_str());
auto _lockT0 = std::chrono::steady_clock::now();
if (!_folderGuard.try_lock_for(std::chrono::seconds(120))) {
auto waitedMs = std::chrono::duration_cast<std::chrono::milliseconds>(
std::chrono::steady_clock::now() - _lockT0).count();
ANS_DBG("ONNXYOLO",
"LoadModelFromFolder: TIMEOUT on folder lock after %lldms: %s",
(long long)waitedMs, _modelFolder.c_str());
// See ModelFolderLockGuard in ANSEngineCommon.h.
ModelFolderLockGuard _flg(_modelFolder, "ANSONNXYOLO::LoadModelFromFolder");
if (!_flg.acquired()) {
_logger.LogError("ANSONNXYOLO::LoadModelFromFolder",
"Timed out waiting for model-folder lock: " + _modelFolder,
__FILE__, __LINE__);
_modelLoadValid = false;
return false;
}
auto waitedMs = std::chrono::duration_cast<std::chrono::milliseconds>(
std::chrono::steady_clock::now() - _lockT0).count();
ANS_DBG("ONNXYOLO", "LoadModelFromFolder: folder lock acquired in %lldms, calling InitOrtEngine",
(long long)waitedMs);
auto _initT0 = std::chrono::steady_clock::now();
if (!InitOrtEngine()) {
auto initMs = std::chrono::duration_cast<std::chrono::milliseconds>(
std::chrono::steady_clock::now() - _initT0).count();
ANS_DBG("ONNXYOLO",
"LoadModelFromFolder: InitOrtEngine FAILED after %lldms, model=%s",
(long long)initMs, _modelFilePath.c_str());
_modelLoadValid = false;
return false;
}
auto initMs = std::chrono::duration_cast<std::chrono::milliseconds>(
std::chrono::steady_clock::now() - _initT0).count();
ANS_DBG("ONNXYOLO", "LoadModelFromFolder: InitOrtEngine OK in %lldms",
(long long)initMs);
if (!InitOrtEngine()) { _modelLoadValid = false; return false; }
}
// Fix input resolution for dynamic-shape models (same as primary Initialize)