diff --git a/.claude/settings.local.json b/.claude/settings.local.json index 17f3181..9f1a871 100644 --- a/.claude/settings.local.json +++ b/.claude/settings.local.json @@ -95,7 +95,13 @@ "Bash(git add *)", "Read(//c/ProgramData/Jh7O7nUe7vS/Models/EngineModels/B-IN_ANS_VehicleDetection_v2.0_67345015/**)", "Bash(xxd)", - "Bash(icacls \"C:\\\\ProgramData\\\\Jh7O7nUe7vS\\\\Models\\\\EngineModels\\\\B-IN_ANS_VehicleDetection_v2.0_67345015\\\\train_last.onnx\")" + "Bash(icacls \"C:\\\\ProgramData\\\\Jh7O7nUe7vS\\\\Models\\\\EngineModels\\\\B-IN_ANS_VehicleDetection_v2.0_67345015\\\\train_last.onnx\")", + "Bash(grep -oE ']*>[^<]{0,400}' \"C:/Users/nghia/Downloads/error.xml\")", + "Bash(grep -oE ']*>[^<]{0,500}' \"/c/Users/nghia/Downloads/error.xml\")", + "Read(//tmp/**)", + "Bash(grep -oE ']*>[^<]{0,400}' \"/c/Users/nghia/Downloads/error.xml\")", + "Bash(echo \"Exit: $?\")", + "Bash(python -)" ] } } diff --git a/core/ANSLicensingSystem/Utility.cpp b/core/ANSLicensingSystem/Utility.cpp index 5ee3fc6..2614a2c 100644 --- a/core/ANSLicensingSystem/Utility.cpp +++ b/core/ANSLicensingSystem/Utility.cpp @@ -7,13 +7,24 @@ // Per-path mutex to serialize concurrent zip operations on the same target. // Without this, two LabVIEW threads can race: one extracting a zip while // another truncates/writes the same file, corrupting data and crashing LabVIEW. +// Also used to serialize extract ↔ ONNX session creation on the same extracted +// model folder — without that, thread A can finish extraction and begin opening +// train_last.onnx while thread B re-enters extraction and truncates the file, +// producing "system error number 13" (EACCES) on the first reader. 
static std::mutex g_zipPathMapMutex; -static std::map<std::string, std::shared_ptr<std::mutex>> g_zipPathLocks; +static std::map<std::string, std::shared_ptr<std::timed_mutex>> g_zipPathLocks; -static std::shared_ptr<std::mutex> GetZipPathLock(const std::string& path) { +static std::shared_ptr<std::timed_mutex> GetZipPathLock(const std::string& path) { std::lock_guard guard(g_zipPathMapMutex); auto& ptr = g_zipPathLocks[path]; - if (!ptr) ptr = std::make_shared<std::mutex>(); + if (!ptr) ptr = std::make_shared<std::timed_mutex>(); + return ptr; +} + +std::shared_ptr<std::timed_mutex> GetModelFolderLock(const std::string& folderPath) { + auto ptr = GetZipPathLock(folderPath); + ANS_DBG("ModelLock", "GetModelFolderLock: folder=%s mutex=%p", + folderPath.c_str(), (void*)ptr.get()); return ptr; } @@ -453,7 +464,7 @@ bool AddFolderContentsToZip(zip* archive, const char* folderPath, const char* zi bool ZipFolderWithPassword(const char* folderPath, const char* zipFilePath, const char* password) { auto pathLock = GetZipPathLock(std::string(zipFilePath)); - std::lock_guard<std::mutex> zipGuard(*pathLock); + std::lock_guard<std::timed_mutex> zipGuard(*pathLock); zip* zipArchive; zip_flags_t flags = ZIP_CREATE | ZIP_TRUNCATE; @@ -839,10 +850,64 @@ std::string GetDateTimeString(const std::string& format) { bool ExtractProtectedZipFile(const std::string& zipFileName, const std::string& password, const std::string& modelName, const std::string outputFolder) { auto pathLock = GetZipPathLock(outputFolder); - std::lock_guard<std::mutex> zipGuard(*pathLock); + std::lock_guard<std::timed_mutex> zipGuard(*pathLock); int error; if (!FileExist(zipFileName))return false; + + // Idempotent fast-path: if the target folder already has a complete, fresh + // extraction (at least one non-empty regular file, all >= the zip's mtime), + // skip re-extraction. This prevents redundant passes from concurrent + // CreateANSODHandle calls from truncating files that another thread is + // already mmap'ing via ORT (which surfaces as EACCES / system error 13). 
+ ANS_DBG("Extract", "ExtractProtectedZipFile: zip=%s -> folder=%s", + zipFileName.c_str(), outputFolder.c_str()); + try { + if (std::filesystem::exists(outputFolder) && + std::filesystem::is_directory(outputFolder)) + { + const auto zipTime = std::filesystem::last_write_time(zipFileName); + bool anyFile = false; + bool allFresh = true; + size_t numFiles = 0; + std::string staleFile; + for (const auto& e : std::filesystem::directory_iterator(outputFolder)) { + if (!e.is_regular_file()) continue; + anyFile = true; + ++numFiles; + std::error_code ec; + const auto sz = e.file_size(ec); + if (ec || sz == 0) { + allFresh = false; + staleFile = e.path().filename().string() + " (zero/err)"; + break; + } + const auto ft = std::filesystem::last_write_time(e.path(), ec); + if (ec || ft < zipTime) { + allFresh = false; + staleFile = e.path().filename().string() + " (older than zip)"; + break; + } + } + if (anyFile && allFresh) { + ANS_DBG("Extract", "ExtractProtectedZipFile: SKIP re-extract — %zu file(s) already fresh in %s", + numFiles, outputFolder.c_str()); + return true; // already extracted and up-to-date + } + ANS_DBG("Extract", + "ExtractProtectedZipFile: full extract needed — anyFile=%d stale=%s", + anyFile ? 1 : 0, + staleFile.empty() ? "(empty folder)" : staleFile.c_str()); + } else { + ANS_DBG("Extract", "ExtractProtectedZipFile: folder absent, full extract"); + } + } + catch (const std::exception& ex) { + // Any filesystem hiccup: fall through to full extraction. 
+ ANS_DBG("Extract", "ExtractProtectedZipFile: freshness check threw, extracting: %s", + ex.what()); + } + zip_t* archive = zip_open(zipFileName.c_str(), ZIP_RDONLY, &error); if (!archive) { std::cerr << "Error opening ZIP archive: " << zip_strerror(archive) << std::endl; diff --git a/core/ANSLicensingSystem/Utility.h b/core/ANSLicensingSystem/Utility.h index dcb8b24..4c78cd2 100644 --- a/core/ANSLicensingSystem/Utility.h +++ b/core/ANSLicensingSystem/Utility.h @@ -16,6 +16,8 @@ #include #include #include +#include +#include //namespace logging = boost::log; //namespace attrs = boost::log::attributes; @@ -89,4 +91,12 @@ namespace fs = std::filesystem; // For training engine //bool ExtractPasswordProtectedZipForTrainingEgnine(const std::string& zipFileName, const std::string& password, const std::string& modelName, std::string& outputFolder, bool edgeDeviceModel = true); ANSLICENSE_API bool ExtractProtectedZipFile(const std::string& zipFileName,const std::string& password,const std::string& modelName,const std::string outputFolder); + + // Per-path mutex for a model folder. Used to serialize extract ↔ session + // creation on the same extracted folder so concurrent CreateANSODHandle calls + // cannot truncate/rewrite a model file while another thread is loading it. + // Keyed by folder path (not zip path) so both extractor and consumer agree. + // Returns std::timed_mutex so callers can bound their wait and avoid a hang + // if a peer thread deadlocks inside extraction or ORT session creation. 
+ ANSLICENSE_API std::shared_ptr<std::timed_mutex> GetModelFolderLock(const std::string& folderPath); #endif \ No newline at end of file diff --git a/modules/ANSODEngine/ANSONNXYOLO.cpp b/modules/ANSODEngine/ANSONNXYOLO.cpp index 8cdfda6..d7ba175 100644 --- a/modules/ANSODEngine/ANSONNXYOLO.cpp +++ b/modules/ANSODEngine/ANSONNXYOLO.cpp @@ -1775,12 +1775,53 @@ namespace ANSCENTER { labelMap = VectorToCommaSeparatedString(_classes); if (this->_loadEngineOnCreation) { + // Hold the model-folder lock across session creation so a + // concurrent CreateANSODHandle on the same model cannot + // re-enter ExtractProtectedZipFile and truncate the .onnx + // file while ORT is opening it (which would surface as + // "system error number 13" EACCES from the ORT loader). + // + // Timed wait — 120s ceiling for extract + ORT session + // creation (GPU EP compile can take ~30s on large models). + // If we hit the timeout, the peer thread is deadlocked or + // wedged; fail the load instead of hanging the caller. + auto _folderLock = GetModelFolderLock(_modelFolder); + std::unique_lock _folderGuard( + *_folderLock, std::defer_lock); + ANS_DBG("ONNXYOLO", "Initialize: waiting on folder lock (120s): %s", + _modelFolder.c_str()); + auto _lockT0 = std::chrono::steady_clock::now(); + if (!_folderGuard.try_lock_for(std::chrono::seconds(120))) { + auto waitedMs = std::chrono::duration_cast<std::chrono::milliseconds>( + std::chrono::steady_clock::now() - _lockT0).count(); + ANS_DBG("ONNXYOLO", + "Initialize: TIMEOUT on folder lock after %lldms: %s", + (long long)waitedMs, _modelFolder.c_str()); + _logger.LogError("ANSONNXYOLO::Initialize", + "Timed out waiting for model-folder lock: " + _modelFolder, + __FILE__, __LINE__); + return false; + } + auto waitedMs = std::chrono::duration_cast<std::chrono::milliseconds>( + std::chrono::steady_clock::now() - _lockT0).count(); + ANS_DBG("ONNXYOLO", "Initialize: folder lock acquired in %lldms, calling InitOrtEngine", + (long long)waitedMs); + auto _initT0 = std::chrono::steady_clock::now(); if (!InitOrtEngine()) { + auto 
initMs = std::chrono::duration_cast<std::chrono::milliseconds>( + std::chrono::steady_clock::now() - _initT0).count(); + ANS_DBG("ONNXYOLO", + "Initialize: InitOrtEngine FAILED after %lldms, model=%s", + (long long)initMs, _modelFilePath.c_str()); _logger.LogError("ANSONNXYOLO::Initialize", "Failed to create ONNX Runtime engine: " + _modelFilePath, __FILE__, __LINE__); return false; } + auto initMs = std::chrono::duration_cast<std::chrono::milliseconds>( + std::chrono::steady_clock::now() - _initT0).count(); + ANS_DBG("ONNXYOLO", "Initialize: InitOrtEngine OK in %lldms", + (long long)initMs); } // Fix input resolution for dynamic-shape models. @@ -1845,7 +1886,45 @@ namespace ANSCENTER { } if (this->_loadEngineOnCreation) { - if (!InitOrtEngine()) { _modelLoadValid = false; return false; } + // See ANSONNXYOLO::Initialize — hold folder lock so a sibling + // extraction cannot truncate train_last.onnx mid-load. Timed + // wait so a stuck peer cannot hang this thread forever. + auto _folderLock = GetModelFolderLock(_modelFolder); + std::unique_lock _folderGuard( + *_folderLock, std::defer_lock); + ANS_DBG("ONNXYOLO", "LoadModel: waiting on folder lock (120s): %s", + _modelFolder.c_str()); + auto _lockT0 = std::chrono::steady_clock::now(); + if (!_folderGuard.try_lock_for(std::chrono::seconds(120))) { + auto waitedMs = std::chrono::duration_cast<std::chrono::milliseconds>( + std::chrono::steady_clock::now() - _lockT0).count(); + ANS_DBG("ONNXYOLO", + "LoadModel: TIMEOUT on folder lock after %lldms: %s", + (long long)waitedMs, _modelFolder.c_str()); + _logger.LogError("ANSONNXYOLO::LoadModel", + "Timed out waiting for model-folder lock: " + _modelFolder, + __FILE__, __LINE__); + _modelLoadValid = false; + return false; + } + auto waitedMs = std::chrono::duration_cast<std::chrono::milliseconds>( + std::chrono::steady_clock::now() - _lockT0).count(); + ANS_DBG("ONNXYOLO", "LoadModel: folder lock acquired in %lldms, calling InitOrtEngine", + (long long)waitedMs); + auto _initT0 = std::chrono::steady_clock::now(); + if (!InitOrtEngine()) { + auto initMs = 
std::chrono::duration_cast<std::chrono::milliseconds>( + std::chrono::steady_clock::now() - _initT0).count(); + ANS_DBG("ONNXYOLO", + "LoadModel: InitOrtEngine FAILED after %lldms, model=%s", + (long long)initMs, _modelFilePath.c_str()); + _modelLoadValid = false; + return false; + } + auto initMs = std::chrono::duration_cast<std::chrono::milliseconds>( + std::chrono::steady_clock::now() - _initT0).count(); + ANS_DBG("ONNXYOLO", "LoadModel: InitOrtEngine OK in %lldms", + (long long)initMs); } // Fix input resolution for dynamic-shape models (same as primary Initialize) @@ -1920,7 +1999,45 @@ namespace ANSCENTER { labelMap = VectorToCommaSeparatedString(_classes); if (this->_loadEngineOnCreation) { - if (!InitOrtEngine()) { _modelLoadValid = false; return false; } + // See ANSONNXYOLO::Initialize — hold folder lock so a sibling + // extraction cannot truncate the model file mid-load. Timed + // wait so a stuck peer cannot hang this thread forever. + auto _folderLock = GetModelFolderLock(_modelFolder); + std::unique_lock _folderGuard( + *_folderLock, std::defer_lock); + ANS_DBG("ONNXYOLO", "LoadModelFromFolder: waiting on folder lock (120s): %s", + _modelFolder.c_str()); + auto _lockT0 = std::chrono::steady_clock::now(); + if (!_folderGuard.try_lock_for(std::chrono::seconds(120))) { + auto waitedMs = std::chrono::duration_cast<std::chrono::milliseconds>( + std::chrono::steady_clock::now() - _lockT0).count(); + ANS_DBG("ONNXYOLO", + "LoadModelFromFolder: TIMEOUT on folder lock after %lldms: %s", + (long long)waitedMs, _modelFolder.c_str()); + _logger.LogError("ANSONNXYOLO::LoadModelFromFolder", + "Timed out waiting for model-folder lock: " + _modelFolder, + __FILE__, __LINE__); + _modelLoadValid = false; + return false; + } + auto waitedMs = std::chrono::duration_cast<std::chrono::milliseconds>( + std::chrono::steady_clock::now() - _lockT0).count(); + ANS_DBG("ONNXYOLO", "LoadModelFromFolder: folder lock acquired in %lldms, calling InitOrtEngine", + (long long)waitedMs); + auto _initT0 = std::chrono::steady_clock::now(); + if (!InitOrtEngine()) { + auto initMs = 
std::chrono::duration_cast<std::chrono::milliseconds>( + std::chrono::steady_clock::now() - _initT0).count(); + ANS_DBG("ONNXYOLO", + "LoadModelFromFolder: InitOrtEngine FAILED after %lldms, model=%s", + (long long)initMs, _modelFilePath.c_str()); + _modelLoadValid = false; + return false; + } + auto initMs = std::chrono::duration_cast<std::chrono::milliseconds>( + std::chrono::steady_clock::now() - _initT0).count(); + ANS_DBG("ONNXYOLO", "LoadModelFromFolder: InitOrtEngine OK in %lldms", + (long long)initMs); } // Fix input resolution for dynamic-shape models (same as primary Initialize)