Fix model extract race issue to all classes
This commit is contained in:
@@ -855,56 +855,104 @@ bool ExtractProtectedZipFile(const std::string& zipFileName, const std::string&
|
||||
int error;
|
||||
if (!FileExist(zipFileName))return false;
|
||||
|
||||
// Idempotent fast-path: if the target folder already has a complete, fresh
|
||||
// extraction (at least one non-empty regular file, all >= the zip's mtime),
|
||||
// skip re-extraction. This prevents redundant passes from concurrent
|
||||
// CreateANSODHandle calls from truncating files that another thread is
|
||||
// already mmap'ing via ORT (which surfaces as EACCES / system error 13).
|
||||
// Idempotent fast-path: skip re-extraction when the target folder already
|
||||
// holds exactly the content of the current zip. Identity is tracked by a
|
||||
// small sidecar file (.ans_extract_info) written inside outputFolder after
|
||||
// every successful extract. The sidecar records three fields — zip path,
|
||||
// zip size, zip mtime — and we SKIP only if ALL match the current zip AND
|
||||
// the folder still has at least one non-empty file.
|
||||
//
|
||||
// Why sidecar and not just mtime? mtime alone breaks on:
|
||||
// • downgrades — a reverted older zip has an older mtime, so "file mtime
|
||||
// >= zip mtime" falsely reports fresh and we never re-extract.
|
||||
// • preserved-timestamp copies (cp --preserve=timestamps, some installers,
|
||||
// Git-LFS checkouts) — new-but-older-mtime zips look "not newer".
|
||||
// Size differs whenever content differs (libzip's CRC/compression shifts
|
||||
// bytes even for tiny payload changes), so size+mtime+path uniquely pins
|
||||
// the zip identity in practice.
|
||||
//
|
||||
// Any mismatch / missing sidecar / FS error → fall through to full extract,
|
||||
// which unconditionally rewrites the sidecar at the end. This means the
|
||||
// first run after deploying this patched binary will do one re-extract per
|
||||
// model to establish the sidecar; subsequent runs hit the fast path.
|
||||
const std::filesystem::path sidecarPath =
|
||||
std::filesystem::path(outputFolder) / ".ans_extract_info";
|
||||
|
||||
ANS_DBG("Extract", "ExtractProtectedZipFile: zip=%s -> folder=%s",
|
||||
zipFileName.c_str(), outputFolder.c_str());
|
||||
|
||||
// Capture the current zip's identity up-front so both the fast-path check
|
||||
// and the post-extract sidecar write see consistent values.
|
||||
std::uintmax_t curZipSize = 0;
|
||||
long long curZipMtime = 0;
|
||||
bool zipStatOk = false;
|
||||
try {
|
||||
if (std::filesystem::exists(outputFolder) &&
|
||||
std::filesystem::is_directory(outputFolder))
|
||||
curZipSize = std::filesystem::file_size(zipFileName);
|
||||
curZipMtime = std::filesystem::last_write_time(zipFileName)
|
||||
.time_since_epoch().count();
|
||||
zipStatOk = true;
|
||||
}
|
||||
catch (const std::exception& ex) {
|
||||
ANS_DBG("Extract",
|
||||
"ExtractProtectedZipFile: failed to stat zip (%s); extracting anyway",
|
||||
ex.what());
|
||||
}
|
||||
|
||||
try {
|
||||
if (zipStatOk &&
|
||||
std::filesystem::exists(outputFolder) &&
|
||||
std::filesystem::is_directory(outputFolder) &&
|
||||
std::filesystem::exists(sidecarPath))
|
||||
{
|
||||
const auto zipTime = std::filesystem::last_write_time(zipFileName);
|
||||
bool anyFile = false;
|
||||
bool allFresh = true;
|
||||
size_t numFiles = 0;
|
||||
std::string staleFile;
|
||||
for (const auto& e : std::filesystem::directory_iterator(outputFolder)) {
|
||||
if (!e.is_regular_file()) continue;
|
||||
anyFile = true;
|
||||
++numFiles;
|
||||
std::error_code ec;
|
||||
const auto sz = e.file_size(ec);
|
||||
if (ec || sz == 0) {
|
||||
allFresh = false;
|
||||
staleFile = e.path().filename().string() + " (zero/err)";
|
||||
break;
|
||||
// Read sidecar: 3 lines — zipPath, zipSize, zipMtime.
|
||||
std::ifstream sf(sidecarPath);
|
||||
std::string storedPath;
|
||||
std::uintmax_t storedSize = 0;
|
||||
long long storedMtime = 0;
|
||||
std::getline(sf, storedPath);
|
||||
sf >> storedSize >> storedMtime;
|
||||
if (sf.good() || sf.eof()) {
|
||||
// Sanity: folder must still contain at least one non-empty file
|
||||
// (defends against a sidecar that outlived its content, e.g. a
|
||||
// user manually wiping model files but leaving the sidecar).
|
||||
bool hasNonEmpty = false;
|
||||
size_t numFiles = 0;
|
||||
for (const auto& e : std::filesystem::directory_iterator(outputFolder)) {
|
||||
if (!e.is_regular_file()) continue;
|
||||
if (e.path().filename() == ".ans_extract_info") continue;
|
||||
++numFiles;
|
||||
std::error_code ec;
|
||||
if (e.file_size(ec) > 0 && !ec) { hasNonEmpty = true; break; }
|
||||
}
|
||||
const auto ft = std::filesystem::last_write_time(e.path(), ec);
|
||||
if (ec || ft < zipTime) {
|
||||
allFresh = false;
|
||||
staleFile = e.path().filename().string() + " (older than zip)";
|
||||
break;
|
||||
const bool pathOk = (storedPath == zipFileName);
|
||||
const bool sizeOk = (storedSize == curZipSize);
|
||||
const bool mtimeOk = (storedMtime == curZipMtime);
|
||||
if (pathOk && sizeOk && mtimeOk && hasNonEmpty) {
|
||||
ANS_DBG("Extract",
|
||||
"ExtractProtectedZipFile: SKIP re-extract — sidecar match (size=%llu mtime=%lld, %zu payload file(s))",
|
||||
(unsigned long long)curZipSize,
|
||||
(long long)curZipMtime, numFiles);
|
||||
return true;
|
||||
}
|
||||
ANS_DBG("Extract",
|
||||
"ExtractProtectedZipFile: full extract needed — sidecar mismatch (path:%d size:%d mtime:%d nonEmpty:%d)",
|
||||
pathOk ? 0 : 1, sizeOk ? 0 : 1, mtimeOk ? 0 : 1,
|
||||
hasNonEmpty ? 1 : 0);
|
||||
} else {
|
||||
ANS_DBG("Extract",
|
||||
"ExtractProtectedZipFile: sidecar unreadable, re-extracting");
|
||||
}
|
||||
if (anyFile && allFresh) {
|
||||
ANS_DBG("Extract", "ExtractProtectedZipFile: SKIP re-extract — %zu file(s) already fresh in %s",
|
||||
numFiles, outputFolder.c_str());
|
||||
return true; // already extracted and up-to-date
|
||||
}
|
||||
} else if (zipStatOk) {
|
||||
ANS_DBG("Extract",
|
||||
"ExtractProtectedZipFile: full extract needed — anyFile=%d stale=%s",
|
||||
anyFile ? 1 : 0,
|
||||
staleFile.empty() ? "(empty folder)" : staleFile.c_str());
|
||||
} else {
|
||||
ANS_DBG("Extract", "ExtractProtectedZipFile: folder absent, full extract");
|
||||
"ExtractProtectedZipFile: %s, full extract",
|
||||
std::filesystem::exists(outputFolder) ? "no sidecar yet"
|
||||
: "folder absent");
|
||||
}
|
||||
}
|
||||
catch (const std::exception& ex) {
|
||||
// Any filesystem hiccup: fall through to full extraction.
|
||||
ANS_DBG("Extract", "ExtractProtectedZipFile: freshness check threw, extracting: %s",
|
||||
// Any filesystem hiccup: fall through to full extraction (safe).
|
||||
ANS_DBG("Extract",
|
||||
"ExtractProtectedZipFile: freshness check threw, extracting: %s",
|
||||
ex.what());
|
||||
}
|
||||
|
||||
@@ -977,6 +1025,35 @@ bool ExtractProtectedZipFile(const std::string& zipFileName, const std::string&
|
||||
return false;
|
||||
}
|
||||
zip_close(archive);
|
||||
|
||||
// Extract succeeded — write the identity sidecar so the next call on this
|
||||
// folder can take the fast-path. We re-read zip stats here (rather than
|
||||
// trust the pre-extract snapshot) in case the zip was touched in between.
|
||||
try {
|
||||
std::uintmax_t finalSize = std::filesystem::file_size(zipFileName);
|
||||
long long finalMtime = std::filesystem::last_write_time(zipFileName)
|
||||
.time_since_epoch().count();
|
||||
std::ofstream sf(sidecarPath, std::ios::binary | std::ios::trunc);
|
||||
if (sf.is_open()) {
|
||||
sf << zipFileName << '\n' << finalSize << '\n' << finalMtime << '\n';
|
||||
ANS_DBG("Extract",
|
||||
"ExtractProtectedZipFile: sidecar written (size=%llu mtime=%lld) at %s",
|
||||
(unsigned long long)finalSize, (long long)finalMtime,
|
||||
sidecarPath.string().c_str());
|
||||
} else {
|
||||
ANS_DBG("Extract",
|
||||
"ExtractProtectedZipFile: could not open sidecar for write: %s",
|
||||
sidecarPath.string().c_str());
|
||||
}
|
||||
}
|
||||
catch (const std::exception& ex) {
|
||||
// Non-fatal — extract already succeeded. Next call will just re-extract
|
||||
// once more until the sidecar can be written.
|
||||
ANS_DBG("Extract",
|
||||
"ExtractProtectedZipFile: sidecar write threw (non-fatal): %s",
|
||||
ex.what());
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
@@ -99,4 +99,106 @@ namespace fs = std::filesystem;
|
||||
// Returns std::timed_mutex so callers can bound their wait and avoid a hang
|
||||
// if a peer thread deadlocks inside extraction or ORT session creation.
|
||||
ANSLICENSE_API std::shared_ptr<std::timed_mutex> GetModelFolderLock(const std::string& folderPath);
|
||||
|
||||
// ============================================================================
|
||||
// ModelFolderLockGuard
|
||||
//
|
||||
// RAII guard that serializes "open files in an extracted model folder" against
|
||||
// re-entries into ExtractProtectedZipFile on the SAME folder. Without this,
|
||||
// two threads creating handles for the same model zip can race so that thread
|
||||
// A opens a model file (via ORT / TRT / OpenVINO) while thread B truncates it
|
||||
// via std::ofstream inside the extractor — Windows surfaces this as "system
|
||||
// error number 13" (EACCES) on the reader.
|
||||
//
|
||||
// Backed by GetModelFolderLock() above which returns a process-wide
|
||||
// std::timed_mutex keyed on the folder path. The extractor takes the same
|
||||
// lock, so extract ↔ open is mutually exclusive.
|
||||
//
|
||||
// Acquisition is bounded by `timeout` (default 120 s) so a deadlocked peer
|
||||
// cannot hang the caller thread forever. On timeout, .acquired() is false and
|
||||
// the caller must fail the load.
|
||||
//
|
||||
// Lives in Utility.h (rather than a per-module header) so every ANSCORE
|
||||
// module — ANSODEngine, ANSOCR, ANSLPR, ANSFR, etc. — can use it without
|
||||
// pulling in ANSODEngine headers.
|
||||
//
|
||||
// Usage:
|
||||
//
|
||||
// bool MyEngine::LoadModelFromFolder(...) {
|
||||
// bool result = MyBase::LoadModelFromFolder(...);
|
||||
// if (!result) return false;
|
||||
// // ── serialize derived-class init against concurrent extracts ──
|
||||
// ANSCENTER::ModelFolderLockGuard _flg(_modelFolder,
|
||||
// "MyEngine::LoadModelFromFolder");
|
||||
// if (!_flg.acquired()) {
|
||||
// _logger.LogError("MyEngine::LoadModelFromFolder",
|
||||
// "Timed out waiting for model-folder lock: " + _modelFolder,
|
||||
// __FILE__, __LINE__);
|
||||
// return false;
|
||||
// }
|
||||
// // ... existing body: Init(...) / buildLoadNetwork(...) / etc. ...
|
||||
// }
|
||||
//
|
||||
// Notes:
|
||||
// • Placement — insert AFTER the base's XXX() returns (extractor already
|
||||
// released its own lock by then) and BEFORE any file open in _modelFolder.
|
||||
// Wrapping the base call would deadlock — it takes the same lock itself.
|
||||
// • RAII — destructor auto-releases on every return path within the scope.
|
||||
// • Timing — entry/acquire/timeout are traced via ANS_DBG("EngineLoad"),
|
||||
// filter on [EngineLoad] in DebugView to diagnose stalls.
|
||||
// ============================================================================
|
||||
namespace ANSCENTER {
|
||||
class ModelFolderLockGuard {
|
||||
public:
|
||||
explicit ModelFolderLockGuard(const std::string& folderPath,
|
||||
const char* caller,
|
||||
std::chrono::seconds timeout = std::chrono::seconds(120))
|
||||
: _caller(caller ? caller : "(?)"), _folder(folderPath)
|
||||
{
|
||||
if (folderPath.empty()) {
|
||||
// Nothing to serialize — no extracted folder yet. Treat as
|
||||
// acquired so caller's existing init runs unchanged (e.g.
|
||||
// custom-path engines that never go through the zip extractor).
|
||||
_ok = true;
|
||||
ANS_DBG("EngineLoad", "%s: empty folder path, skipping lock",
|
||||
_caller);
|
||||
return;
|
||||
}
|
||||
auto lock = GetModelFolderLock(folderPath);
|
||||
_guard = std::unique_lock<std::timed_mutex>(*lock, std::defer_lock);
|
||||
ANS_DBG("EngineLoad",
|
||||
"%s: waiting on folder lock (%llds): %s",
|
||||
_caller, (long long)timeout.count(), folderPath.c_str());
|
||||
const auto t0 = std::chrono::steady_clock::now();
|
||||
if (_guard.try_lock_for(timeout)) {
|
||||
const auto ms = std::chrono::duration_cast<std::chrono::milliseconds>(
|
||||
std::chrono::steady_clock::now() - t0).count();
|
||||
_ok = true;
|
||||
ANS_DBG("EngineLoad", "%s: folder lock acquired in %lldms",
|
||||
_caller, (long long)ms);
|
||||
} else {
|
||||
const auto ms = std::chrono::duration_cast<std::chrono::milliseconds>(
|
||||
std::chrono::steady_clock::now() - t0).count();
|
||||
_ok = false;
|
||||
ANS_DBG("EngineLoad",
|
||||
"%s: TIMEOUT on folder lock after %lldms: %s",
|
||||
_caller, (long long)ms, folderPath.c_str());
|
||||
}
|
||||
}
|
||||
|
||||
// Non-copyable, non-movable: lock lifetime is tied to this scope.
|
||||
ModelFolderLockGuard(const ModelFolderLockGuard&) = delete;
|
||||
ModelFolderLockGuard& operator=(const ModelFolderLockGuard&) = delete;
|
||||
|
||||
bool acquired() const noexcept { return _ok; }
|
||||
explicit operator bool() const noexcept { return _ok; }
|
||||
|
||||
private:
|
||||
const char* _caller;
|
||||
std::string _folder;
|
||||
std::unique_lock<std::timed_mutex> _guard;
|
||||
bool _ok = false;
|
||||
};
|
||||
} // namespace ANSCENTER
|
||||
|
||||
#endif
|
||||
Reference in New Issue
Block a user