Fix concurrent extract + open race on train_last.onnx
This commit is contained in:
@@ -1775,12 +1775,53 @@ namespace ANSCENTER {
|
||||
labelMap = VectorToCommaSeparatedString(_classes);
|
||||
|
||||
if (this->_loadEngineOnCreation) {
|
||||
// Hold the model-folder lock across session creation so a
|
||||
// concurrent CreateANSODHandle on the same model cannot
|
||||
// re-enter ExtractProtectedZipFile and truncate the .onnx
|
||||
// file while ORT is opening it (which would surface as
|
||||
// "system error number 13" EACCES from the ORT loader).
|
||||
//
|
||||
// Timed wait — 120s ceiling for extract + ORT session
|
||||
// creation (GPU EP compile can take ~30s on large models).
|
||||
// If we hit the timeout, the peer thread is deadlocked or
|
||||
// wedged; fail the load instead of hanging the caller.
|
||||
auto _folderLock = GetModelFolderLock(_modelFolder);
|
||||
std::unique_lock<std::timed_mutex> _folderGuard(
|
||||
*_folderLock, std::defer_lock);
|
||||
ANS_DBG("ONNXYOLO", "Initialize: waiting on folder lock (120s): %s",
|
||||
_modelFolder.c_str());
|
||||
auto _lockT0 = std::chrono::steady_clock::now();
|
||||
if (!_folderGuard.try_lock_for(std::chrono::seconds(120))) {
|
||||
auto waitedMs = std::chrono::duration_cast<std::chrono::milliseconds>(
|
||||
std::chrono::steady_clock::now() - _lockT0).count();
|
||||
ANS_DBG("ONNXYOLO",
|
||||
"Initialize: TIMEOUT on folder lock after %lldms: %s",
|
||||
(long long)waitedMs, _modelFolder.c_str());
|
||||
_logger.LogError("ANSONNXYOLO::Initialize",
|
||||
"Timed out waiting for model-folder lock: " + _modelFolder,
|
||||
__FILE__, __LINE__);
|
||||
return false;
|
||||
}
|
||||
auto waitedMs = std::chrono::duration_cast<std::chrono::milliseconds>(
|
||||
std::chrono::steady_clock::now() - _lockT0).count();
|
||||
ANS_DBG("ONNXYOLO", "Initialize: folder lock acquired in %lldms, calling InitOrtEngine",
|
||||
(long long)waitedMs);
|
||||
auto _initT0 = std::chrono::steady_clock::now();
|
||||
if (!InitOrtEngine()) {
|
||||
auto initMs = std::chrono::duration_cast<std::chrono::milliseconds>(
|
||||
std::chrono::steady_clock::now() - _initT0).count();
|
||||
ANS_DBG("ONNXYOLO",
|
||||
"Initialize: InitOrtEngine FAILED after %lldms, model=%s",
|
||||
(long long)initMs, _modelFilePath.c_str());
|
||||
_logger.LogError("ANSONNXYOLO::Initialize",
|
||||
"Failed to create ONNX Runtime engine: " + _modelFilePath,
|
||||
__FILE__, __LINE__);
|
||||
return false;
|
||||
}
|
||||
auto initMs = std::chrono::duration_cast<std::chrono::milliseconds>(
|
||||
std::chrono::steady_clock::now() - _initT0).count();
|
||||
ANS_DBG("ONNXYOLO", "Initialize: InitOrtEngine OK in %lldms",
|
||||
(long long)initMs);
|
||||
}
|
||||
|
||||
// Fix input resolution for dynamic-shape models.
|
||||
@@ -1845,7 +1886,45 @@ namespace ANSCENTER {
|
||||
}
|
||||
|
||||
if (this->_loadEngineOnCreation) {
|
||||
if (!InitOrtEngine()) { _modelLoadValid = false; return false; }
|
||||
// See ANSONNXYOLO::Initialize — hold folder lock so a sibling
|
||||
// extraction cannot truncate train_last.onnx mid-load. Timed
|
||||
// wait so a stuck peer cannot hang this thread forever.
|
||||
auto _folderLock = GetModelFolderLock(_modelFolder);
|
||||
std::unique_lock<std::timed_mutex> _folderGuard(
|
||||
*_folderLock, std::defer_lock);
|
||||
ANS_DBG("ONNXYOLO", "LoadModel: waiting on folder lock (120s): %s",
|
||||
_modelFolder.c_str());
|
||||
auto _lockT0 = std::chrono::steady_clock::now();
|
||||
if (!_folderGuard.try_lock_for(std::chrono::seconds(120))) {
|
||||
auto waitedMs = std::chrono::duration_cast<std::chrono::milliseconds>(
|
||||
std::chrono::steady_clock::now() - _lockT0).count();
|
||||
ANS_DBG("ONNXYOLO",
|
||||
"LoadModel: TIMEOUT on folder lock after %lldms: %s",
|
||||
(long long)waitedMs, _modelFolder.c_str());
|
||||
_logger.LogError("ANSONNXYOLO::LoadModel",
|
||||
"Timed out waiting for model-folder lock: " + _modelFolder,
|
||||
__FILE__, __LINE__);
|
||||
_modelLoadValid = false;
|
||||
return false;
|
||||
}
|
||||
auto waitedMs = std::chrono::duration_cast<std::chrono::milliseconds>(
|
||||
std::chrono::steady_clock::now() - _lockT0).count();
|
||||
ANS_DBG("ONNXYOLO", "LoadModel: folder lock acquired in %lldms, calling InitOrtEngine",
|
||||
(long long)waitedMs);
|
||||
auto _initT0 = std::chrono::steady_clock::now();
|
||||
if (!InitOrtEngine()) {
|
||||
auto initMs = std::chrono::duration_cast<std::chrono::milliseconds>(
|
||||
std::chrono::steady_clock::now() - _initT0).count();
|
||||
ANS_DBG("ONNXYOLO",
|
||||
"LoadModel: InitOrtEngine FAILED after %lldms, model=%s",
|
||||
(long long)initMs, _modelFilePath.c_str());
|
||||
_modelLoadValid = false;
|
||||
return false;
|
||||
}
|
||||
auto initMs = std::chrono::duration_cast<std::chrono::milliseconds>(
|
||||
std::chrono::steady_clock::now() - _initT0).count();
|
||||
ANS_DBG("ONNXYOLO", "LoadModel: InitOrtEngine OK in %lldms",
|
||||
(long long)initMs);
|
||||
}
|
||||
|
||||
// Fix input resolution for dynamic-shape models (same as primary Initialize)
|
||||
@@ -1920,7 +1999,45 @@ namespace ANSCENTER {
|
||||
labelMap = VectorToCommaSeparatedString(_classes);
|
||||
|
||||
if (this->_loadEngineOnCreation) {
|
||||
if (!InitOrtEngine()) { _modelLoadValid = false; return false; }
|
||||
// See ANSONNXYOLO::Initialize — hold folder lock so a sibling
|
||||
// extraction cannot truncate the model file mid-load. Timed
|
||||
// wait so a stuck peer cannot hang this thread forever.
|
||||
auto _folderLock = GetModelFolderLock(_modelFolder);
|
||||
std::unique_lock<std::timed_mutex> _folderGuard(
|
||||
*_folderLock, std::defer_lock);
|
||||
ANS_DBG("ONNXYOLO", "LoadModelFromFolder: waiting on folder lock (120s): %s",
|
||||
_modelFolder.c_str());
|
||||
auto _lockT0 = std::chrono::steady_clock::now();
|
||||
if (!_folderGuard.try_lock_for(std::chrono::seconds(120))) {
|
||||
auto waitedMs = std::chrono::duration_cast<std::chrono::milliseconds>(
|
||||
std::chrono::steady_clock::now() - _lockT0).count();
|
||||
ANS_DBG("ONNXYOLO",
|
||||
"LoadModelFromFolder: TIMEOUT on folder lock after %lldms: %s",
|
||||
(long long)waitedMs, _modelFolder.c_str());
|
||||
_logger.LogError("ANSONNXYOLO::LoadModelFromFolder",
|
||||
"Timed out waiting for model-folder lock: " + _modelFolder,
|
||||
__FILE__, __LINE__);
|
||||
_modelLoadValid = false;
|
||||
return false;
|
||||
}
|
||||
auto waitedMs = std::chrono::duration_cast<std::chrono::milliseconds>(
|
||||
std::chrono::steady_clock::now() - _lockT0).count();
|
||||
ANS_DBG("ONNXYOLO", "LoadModelFromFolder: folder lock acquired in %lldms, calling InitOrtEngine",
|
||||
(long long)waitedMs);
|
||||
auto _initT0 = std::chrono::steady_clock::now();
|
||||
if (!InitOrtEngine()) {
|
||||
auto initMs = std::chrono::duration_cast<std::chrono::milliseconds>(
|
||||
std::chrono::steady_clock::now() - _initT0).count();
|
||||
ANS_DBG("ONNXYOLO",
|
||||
"LoadModelFromFolder: InitOrtEngine FAILED after %lldms, model=%s",
|
||||
(long long)initMs, _modelFilePath.c_str());
|
||||
_modelLoadValid = false;
|
||||
return false;
|
||||
}
|
||||
auto initMs = std::chrono::duration_cast<std::chrono::milliseconds>(
|
||||
std::chrono::steady_clock::now() - _initT0).count();
|
||||
ANS_DBG("ONNXYOLO", "LoadModelFromFolder: InitOrtEngine OK in %lldms",
|
||||
(long long)initMs);
|
||||
}
|
||||
|
||||
// Fix input resolution for dynamic-shape models (same as primary Initialize)
|
||||
|
||||
Reference in New Issue
Block a user