Fix Concurrent extract + open race on train_last.onnx
This commit is contained in:
@@ -95,7 +95,13 @@
|
||||
"Bash(git add *)",
|
||||
"Read(//c/ProgramData/Jh7O7nUe7vS/Models/EngineModels/B-IN_ANS_VehicleDetection_v2.0_67345015/**)",
|
||||
"Bash(xxd)",
|
||||
"Bash(icacls \"C:\\\\ProgramData\\\\Jh7O7nUe7vS\\\\Models\\\\EngineModels\\\\B-IN_ANS_VehicleDetection_v2.0_67345015\\\\train_last.onnx\")"
|
||||
"Bash(icacls \"C:\\\\ProgramData\\\\Jh7O7nUe7vS\\\\Models\\\\EngineModels\\\\B-IN_ANS_VehicleDetection_v2.0_67345015\\\\train_last.onnx\")",
|
||||
"Bash(grep -oE '<Data[^>]*>[^<]{0,400}</Data>' \"C:/Users/nghia/Downloads/error.xml\")",
|
||||
"Bash(grep -oE '<Data[^>]*>[^<]{0,500}</Data>' \"/c/Users/nghia/Downloads/error.xml\")",
|
||||
"Read(//tmp/**)",
|
||||
"Bash(grep -oE '<Data[^>]*>[^<]{0,400}</Data>' \"/c/Users/nghia/Downloads/error.xml\")",
|
||||
"Bash(echo \"Exit: $?\")",
|
||||
"Bash(python -)"
|
||||
]
|
||||
}
|
||||
}
|
||||
|
||||
@@ -7,13 +7,24 @@
|
||||
// Per-path mutex to serialize concurrent zip operations on the same target.
|
||||
// Without this, two LabVIEW threads can race: one extracting a zip while
|
||||
// another truncates/writes the same file, corrupting data and crashing LabVIEW.
|
||||
// Also used to serialize extract ↔ ONNX session creation on the same extracted
|
||||
// model folder — without that, thread A can finish extraction and begin opening
|
||||
// train_last.onnx while thread B re-enters extraction and truncates the file,
|
||||
// producing "system error number 13" (EACCES) on the first reader.
|
||||
static std::mutex g_zipPathMapMutex;
|
||||
static std::map<std::string, std::shared_ptr<std::mutex>> g_zipPathLocks;
|
||||
static std::map<std::string, std::shared_ptr<std::timed_mutex>> g_zipPathLocks;
|
||||
|
||||
// Returns the lock registered for `path` in g_zipPathLocks, creating it on
// first use. The map lookup/insert is guarded by g_zipPathMapMutex; the
// returned per-path lock itself is NOT acquired here — callers lock it.
// Keys are the exact path strings passed in; no normalization is done here.
// NOTE(review): this hunk shows both the pre-change (std::mutex) and the
// post-change (std::timed_mutex) lines of the diff.
static std::shared_ptr<std::mutex> GetZipPathLock(const std::string& path) {
|
||||
static std::shared_ptr<std::timed_mutex> GetZipPathLock(const std::string& path) {
|
||||
std::lock_guard<std::mutex> guard(g_zipPathMapMutex);
|
||||
auto& ptr = g_zipPathLocks[path]; // operator[] inserts an empty shared_ptr for a new path
|
||||
if (!ptr) ptr = std::make_shared<std::mutex>();
|
||||
if (!ptr) ptr = std::make_shared<std::timed_mutex>(); // lazily create the lock on first request
|
||||
return ptr;
|
||||
}
|
||||
|
||||
// Non-static accessor for the per-path lock of a model folder. Forwards to
// GetZipPathLock(folderPath), so it yields the same lock object that
// ExtractProtectedZipFile takes for an identical outputFolder string.
// The lookup is keyed by the exact string: callers must pass the same path
// spelling as the extractor, otherwise they get a different mutex.
// The returned lock is not acquired here.
std::shared_ptr<std::timed_mutex> GetModelFolderLock(const std::string& folderPath) {
|
||||
auto ptr = GetZipPathLock(folderPath);
|
||||
// Debug trace of the folder and the mutex address (ANS_DBG is defined elsewhere).
ANS_DBG("ModelLock", "GetModelFolderLock: folder=%s mutex=%p",
|
||||
folderPath.c_str(), (void*)ptr.get());
|
||||
return ptr;
|
||||
}
|
||||
|
||||
@@ -453,7 +464,7 @@ bool AddFolderContentsToZip(zip* archive, const char* folderPath, const char* zi
|
||||
|
||||
bool ZipFolderWithPassword(const char* folderPath, const char* zipFilePath, const char* password) {
|
||||
auto pathLock = GetZipPathLock(std::string(zipFilePath));
|
||||
std::lock_guard<std::mutex> zipGuard(*pathLock);
|
||||
std::lock_guard<std::timed_mutex> zipGuard(*pathLock);
|
||||
|
||||
zip* zipArchive;
|
||||
zip_flags_t flags = ZIP_CREATE | ZIP_TRUNCATE;
|
||||
@@ -839,10 +850,64 @@ std::string GetDateTimeString(const std::string& format) {
|
||||
bool ExtractProtectedZipFile(const std::string& zipFileName, const std::string& password, const std::string& modelName, const std::string outputFolder)
|
||||
{
|
||||
auto pathLock = GetZipPathLock(outputFolder);
|
||||
std::lock_guard<std::mutex> zipGuard(*pathLock);
|
||||
std::lock_guard<std::timed_mutex> zipGuard(*pathLock);
|
||||
|
||||
int error;
|
||||
if (!FileExist(zipFileName))return false;
|
||||
|
||||
// Idempotent fast-path: if the target folder already has a complete, fresh
|
||||
// extraction (at least one regular file; every one non-empty with mtime >= the zip's),
|
||||
// skip re-extraction. This prevents redundant passes from concurrent
|
||||
// CreateANSODHandle calls from truncating files that another thread is
|
||||
// already mmap'ing via ORT (which surfaces as EACCES / system error 13).
|
||||
ANS_DBG("Extract", "ExtractProtectedZipFile: zip=%s -> folder=%s",
|
||||
zipFileName.c_str(), outputFolder.c_str());
|
||||
try {
|
||||
if (std::filesystem::exists(outputFolder) &&
|
||||
std::filesystem::is_directory(outputFolder))
|
||||
{
|
||||
const auto zipTime = std::filesystem::last_write_time(zipFileName);
|
||||
bool anyFile = false;
|
||||
bool allFresh = true;
|
||||
size_t numFiles = 0;
|
||||
std::string staleFile;
|
||||
for (const auto& e : std::filesystem::directory_iterator(outputFolder)) {
|
||||
if (!e.is_regular_file()) continue;
|
||||
anyFile = true;
|
||||
++numFiles;
|
||||
std::error_code ec;
|
||||
const auto sz = e.file_size(ec);
|
||||
if (ec || sz == 0) {
|
||||
allFresh = false;
|
||||
staleFile = e.path().filename().string() + " (zero/err)";
|
||||
break;
|
||||
}
|
||||
const auto ft = std::filesystem::last_write_time(e.path(), ec);
|
||||
if (ec || ft < zipTime) {
|
||||
allFresh = false;
|
||||
staleFile = e.path().filename().string() + " (older than zip)";
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (anyFile && allFresh) {
|
||||
ANS_DBG("Extract", "ExtractProtectedZipFile: SKIP re-extract — %zu file(s) already fresh in %s",
|
||||
numFiles, outputFolder.c_str());
|
||||
return true; // already extracted and up-to-date
|
||||
}
|
||||
ANS_DBG("Extract",
|
||||
"ExtractProtectedZipFile: full extract needed — anyFile=%d stale=%s",
|
||||
anyFile ? 1 : 0,
|
||||
staleFile.empty() ? "(empty folder)" : staleFile.c_str());
|
||||
} else {
|
||||
ANS_DBG("Extract", "ExtractProtectedZipFile: folder absent, full extract");
|
||||
}
|
||||
}
|
||||
catch (const std::exception& ex) {
|
||||
// Any filesystem hiccup: fall through to full extraction.
|
||||
ANS_DBG("Extract", "ExtractProtectedZipFile: freshness check threw, extracting: %s",
|
||||
ex.what());
|
||||
}
|
||||
|
||||
zip_t* archive = zip_open(zipFileName.c_str(), ZIP_RDONLY, &error);
|
||||
if (!archive) {
|
||||
std::cerr << "Error opening ZIP archive: " << zip_strerror(archive) << std::endl;
|
||||
|
||||
@@ -16,6 +16,8 @@
|
||||
#include <vector>
|
||||
#include <regex>
|
||||
#include <stdio.h>
|
||||
#include <memory>
|
||||
#include <mutex>
|
||||
|
||||
//namespace logging = boost::log;
|
||||
//namespace attrs = boost::log::attributes;
|
||||
@@ -89,4 +91,12 @@ namespace fs = std::filesystem;
|
||||
// For training engine
|
||||
//bool ExtractPasswordProtectedZipForTrainingEgnine(const std::string& zipFileName, const std::string& password, const std::string& modelName, std::string& outputFolder, bool edgeDeviceModel = true);
|
||||
ANSLICENSE_API bool ExtractProtectedZipFile(const std::string& zipFileName,const std::string& password,const std::string& modelName,const std::string outputFolder);
|
||||
|
||||
// Per-path mutex for a model folder. Used to serialize extract ↔ session
|
||||
// creation on the same extracted folder so concurrent CreateANSODHandle calls
|
||||
// cannot truncate/rewrite a model file while another thread is loading it.
|
||||
// Keyed by folder path (not zip path) so both extractor and consumer agree.
|
||||
// Returns a shared_ptr to a std::timed_mutex so callers can bound their wait and avoid a hang
|
||||
// if a peer thread deadlocks inside extraction or ORT session creation.
|
||||
ANSLICENSE_API std::shared_ptr<std::timed_mutex> GetModelFolderLock(const std::string& folderPath);
|
||||
#endif
|
||||
@@ -1775,12 +1775,53 @@ namespace ANSCENTER {
|
||||
labelMap = VectorToCommaSeparatedString(_classes);
|
||||
|
||||
if (this->_loadEngineOnCreation) {
|
||||
// Hold the model-folder lock across session creation so a
|
||||
// concurrent CreateANSODHandle on the same model cannot
|
||||
// re-enter ExtractProtectedZipFile and truncate the .onnx
|
||||
// file while ORT is opening it (which would surface as
|
||||
// "system error number 13" EACCES from the ORT loader).
|
||||
//
|
||||
// Timed wait — 120s ceiling for extract + ORT session
|
||||
// creation (GPU EP compile can take ~30s on large models).
|
||||
// If we hit the timeout, the peer thread is deadlocked or
|
||||
// wedged; fail the load instead of hanging the caller.
|
||||
auto _folderLock = GetModelFolderLock(_modelFolder);
|
||||
std::unique_lock<std::timed_mutex> _folderGuard(
|
||||
*_folderLock, std::defer_lock);
|
||||
ANS_DBG("ONNXYOLO", "Initialize: waiting on folder lock (120s): %s",
|
||||
_modelFolder.c_str());
|
||||
auto _lockT0 = std::chrono::steady_clock::now();
|
||||
if (!_folderGuard.try_lock_for(std::chrono::seconds(120))) {
|
||||
auto waitedMs = std::chrono::duration_cast<std::chrono::milliseconds>(
|
||||
std::chrono::steady_clock::now() - _lockT0).count();
|
||||
ANS_DBG("ONNXYOLO",
|
||||
"Initialize: TIMEOUT on folder lock after %lldms: %s",
|
||||
(long long)waitedMs, _modelFolder.c_str());
|
||||
_logger.LogError("ANSONNXYOLO::Initialize",
|
||||
"Timed out waiting for model-folder lock: " + _modelFolder,
|
||||
__FILE__, __LINE__);
|
||||
return false;
|
||||
}
|
||||
auto waitedMs = std::chrono::duration_cast<std::chrono::milliseconds>(
|
||||
std::chrono::steady_clock::now() - _lockT0).count();
|
||||
ANS_DBG("ONNXYOLO", "Initialize: folder lock acquired in %lldms, calling InitOrtEngine",
|
||||
(long long)waitedMs);
|
||||
auto _initT0 = std::chrono::steady_clock::now();
|
||||
if (!InitOrtEngine()) {
|
||||
auto initMs = std::chrono::duration_cast<std::chrono::milliseconds>(
|
||||
std::chrono::steady_clock::now() - _initT0).count();
|
||||
ANS_DBG("ONNXYOLO",
|
||||
"Initialize: InitOrtEngine FAILED after %lldms, model=%s",
|
||||
(long long)initMs, _modelFilePath.c_str());
|
||||
_logger.LogError("ANSONNXYOLO::Initialize",
|
||||
"Failed to create ONNX Runtime engine: " + _modelFilePath,
|
||||
__FILE__, __LINE__);
|
||||
return false;
|
||||
}
|
||||
auto initMs = std::chrono::duration_cast<std::chrono::milliseconds>(
|
||||
std::chrono::steady_clock::now() - _initT0).count();
|
||||
ANS_DBG("ONNXYOLO", "Initialize: InitOrtEngine OK in %lldms",
|
||||
(long long)initMs);
|
||||
}
|
||||
|
||||
// Fix input resolution for dynamic-shape models.
|
||||
@@ -1845,7 +1886,45 @@ namespace ANSCENTER {
|
||||
}
|
||||
|
||||
if (this->_loadEngineOnCreation) {
|
||||
if (!InitOrtEngine()) { _modelLoadValid = false; return false; }
|
||||
// See ANSONNXYOLO::Initialize — hold folder lock so a sibling
|
||||
// extraction cannot truncate train_last.onnx mid-load. Timed
|
||||
// wait so a stuck peer cannot hang this thread forever.
|
||||
auto _folderLock = GetModelFolderLock(_modelFolder);
|
||||
std::unique_lock<std::timed_mutex> _folderGuard(
|
||||
*_folderLock, std::defer_lock);
|
||||
ANS_DBG("ONNXYOLO", "LoadModel: waiting on folder lock (120s): %s",
|
||||
_modelFolder.c_str());
|
||||
auto _lockT0 = std::chrono::steady_clock::now();
|
||||
if (!_folderGuard.try_lock_for(std::chrono::seconds(120))) {
|
||||
auto waitedMs = std::chrono::duration_cast<std::chrono::milliseconds>(
|
||||
std::chrono::steady_clock::now() - _lockT0).count();
|
||||
ANS_DBG("ONNXYOLO",
|
||||
"LoadModel: TIMEOUT on folder lock after %lldms: %s",
|
||||
(long long)waitedMs, _modelFolder.c_str());
|
||||
_logger.LogError("ANSONNXYOLO::LoadModel",
|
||||
"Timed out waiting for model-folder lock: " + _modelFolder,
|
||||
__FILE__, __LINE__);
|
||||
_modelLoadValid = false;
|
||||
return false;
|
||||
}
|
||||
auto waitedMs = std::chrono::duration_cast<std::chrono::milliseconds>(
|
||||
std::chrono::steady_clock::now() - _lockT0).count();
|
||||
ANS_DBG("ONNXYOLO", "LoadModel: folder lock acquired in %lldms, calling InitOrtEngine",
|
||||
(long long)waitedMs);
|
||||
auto _initT0 = std::chrono::steady_clock::now();
|
||||
if (!InitOrtEngine()) {
|
||||
auto initMs = std::chrono::duration_cast<std::chrono::milliseconds>(
|
||||
std::chrono::steady_clock::now() - _initT0).count();
|
||||
ANS_DBG("ONNXYOLO",
|
||||
"LoadModel: InitOrtEngine FAILED after %lldms, model=%s",
|
||||
(long long)initMs, _modelFilePath.c_str());
|
||||
_modelLoadValid = false;
|
||||
return false;
|
||||
}
|
||||
auto initMs = std::chrono::duration_cast<std::chrono::milliseconds>(
|
||||
std::chrono::steady_clock::now() - _initT0).count();
|
||||
ANS_DBG("ONNXYOLO", "LoadModel: InitOrtEngine OK in %lldms",
|
||||
(long long)initMs);
|
||||
}
|
||||
|
||||
// Fix input resolution for dynamic-shape models (same as primary Initialize)
|
||||
@@ -1920,7 +1999,45 @@ namespace ANSCENTER {
|
||||
labelMap = VectorToCommaSeparatedString(_classes);
|
||||
|
||||
if (this->_loadEngineOnCreation) {
|
||||
if (!InitOrtEngine()) { _modelLoadValid = false; return false; }
|
||||
// See ANSONNXYOLO::Initialize — hold folder lock so a sibling
|
||||
// extraction cannot truncate the model file mid-load. Timed
|
||||
// wait so a stuck peer cannot hang this thread forever.
|
||||
auto _folderLock = GetModelFolderLock(_modelFolder);
|
||||
std::unique_lock<std::timed_mutex> _folderGuard(
|
||||
*_folderLock, std::defer_lock);
|
||||
ANS_DBG("ONNXYOLO", "LoadModelFromFolder: waiting on folder lock (120s): %s",
|
||||
_modelFolder.c_str());
|
||||
auto _lockT0 = std::chrono::steady_clock::now();
|
||||
if (!_folderGuard.try_lock_for(std::chrono::seconds(120))) {
|
||||
auto waitedMs = std::chrono::duration_cast<std::chrono::milliseconds>(
|
||||
std::chrono::steady_clock::now() - _lockT0).count();
|
||||
ANS_DBG("ONNXYOLO",
|
||||
"LoadModelFromFolder: TIMEOUT on folder lock after %lldms: %s",
|
||||
(long long)waitedMs, _modelFolder.c_str());
|
||||
_logger.LogError("ANSONNXYOLO::LoadModelFromFolder",
|
||||
"Timed out waiting for model-folder lock: " + _modelFolder,
|
||||
__FILE__, __LINE__);
|
||||
_modelLoadValid = false;
|
||||
return false;
|
||||
}
|
||||
auto waitedMs = std::chrono::duration_cast<std::chrono::milliseconds>(
|
||||
std::chrono::steady_clock::now() - _lockT0).count();
|
||||
ANS_DBG("ONNXYOLO", "LoadModelFromFolder: folder lock acquired in %lldms, calling InitOrtEngine",
|
||||
(long long)waitedMs);
|
||||
auto _initT0 = std::chrono::steady_clock::now();
|
||||
if (!InitOrtEngine()) {
|
||||
auto initMs = std::chrono::duration_cast<std::chrono::milliseconds>(
|
||||
std::chrono::steady_clock::now() - _initT0).count();
|
||||
ANS_DBG("ONNXYOLO",
|
||||
"LoadModelFromFolder: InitOrtEngine FAILED after %lldms, model=%s",
|
||||
(long long)initMs, _modelFilePath.c_str());
|
||||
_modelLoadValid = false;
|
||||
return false;
|
||||
}
|
||||
auto initMs = std::chrono::duration_cast<std::chrono::milliseconds>(
|
||||
std::chrono::steady_clock::now() - _initT0).count();
|
||||
ANS_DBG("ONNXYOLO", "LoadModelFromFolder: InitOrtEngine OK in %lldms",
|
||||
(long long)initMs);
|
||||
}
|
||||
|
||||
// Fix input resolution for dynamic-shape models (same as primary Initialize)
|
||||
|
||||
Reference in New Issue
Block a user