Fix deadlock

This commit is contained in:
2026-04-24 12:54:16 +10:00
parent e2bf17289d
commit fd2394a85a
3 changed files with 138 additions and 16 deletions

View File

@@ -11,17 +11,23 @@
// model folder — without that, thread A can finish extraction and begin opening
// train_last.onnx while thread B re-enters extraction and truncates the file,
// producing "system error number 13" (EACCES) on the first reader.
// Recursive so the same thread can re-acquire the lock through layered load
// calls — ANSALPR_OD::LoadEngine -> ANSONNXYOLO::LoadModelFromFolder both
// acquire the SAME folder lock on the SAME thread. A non-recursive
// timed_mutex deadlocks that nesting for 120 s then fails. Recursive keeps
// cross-thread serialization intact while allowing legitimate re-entry from
// the lock-holding thread.
static std::mutex g_zipPathMapMutex;
static std::map<std::string, std::shared_ptr<std::timed_mutex>> g_zipPathLocks;
static std::map<std::string, std::shared_ptr<std::recursive_timed_mutex>> g_zipPathLocks;
static std::shared_ptr<std::timed_mutex> GetZipPathLock(const std::string& path) {
static std::shared_ptr<std::recursive_timed_mutex> GetZipPathLock(const std::string& path) {
std::lock_guard<std::mutex> guard(g_zipPathMapMutex);
auto& ptr = g_zipPathLocks[path];
if (!ptr) ptr = std::make_shared<std::timed_mutex>();
if (!ptr) ptr = std::make_shared<std::recursive_timed_mutex>();
return ptr;
}
std::shared_ptr<std::timed_mutex> GetModelFolderLock(const std::string& folderPath) {
std::shared_ptr<std::recursive_timed_mutex> GetModelFolderLock(const std::string& folderPath) {
auto ptr = GetZipPathLock(folderPath);
ANS_DBG("ModelLock", "GetModelFolderLock: folder=%s mutex=%p",
folderPath.c_str(), (void*)ptr.get());
@@ -464,7 +470,7 @@ bool AddFolderContentsToZip(zip* archive, const char* folderPath, const char* zi
bool ZipFolderWithPassword(const char* folderPath, const char* zipFilePath, const char* password) {
auto pathLock = GetZipPathLock(std::string(zipFilePath));
std::lock_guard<std::timed_mutex> zipGuard(*pathLock);
std::lock_guard<std::recursive_timed_mutex> zipGuard(*pathLock);
zip* zipArchive;
zip_flags_t flags = ZIP_CREATE | ZIP_TRUNCATE;
@@ -850,7 +856,7 @@ std::string GetDateTimeString(const std::string& format) {
bool ExtractProtectedZipFile(const std::string& zipFileName, const std::string& password, const std::string& modelName, const std::string outputFolder)
{
auto pathLock = GetZipPathLock(outputFolder);
std::lock_guard<std::timed_mutex> zipGuard(*pathLock);
std::lock_guard<std::recursive_timed_mutex> zipGuard(*pathLock);
int error;
if (!FileExist(zipFileName))return false;

View File

@@ -96,9 +96,12 @@ namespace fs = std::filesystem;
// creation on the same extracted folder so concurrent CreateANSODHandle calls
// cannot truncate/rewrite a model file while another thread is loading it.
// Keyed by folder path (not zip path) so both extractor and consumer agree.
// Returns std::timed_mutex so callers can bound their wait and avoid a hang
// if a peer thread deadlocks inside extraction or ORT session creation.
ANSLICENSE_API std::shared_ptr<std::timed_mutex> GetModelFolderLock(const std::string& folderPath);
// Returns std::recursive_timed_mutex: "timed" so callers can bound their
// wait, and "recursive" because layered load paths (e.g.
// ANSALPR_OD::LoadEngine -> ANSONNXYOLO::LoadModelFromFolder) legitimately
// re-enter on the same thread; a non-recursive timed_mutex self-deadlocks
// on that nesting. Cross-thread serialization is unchanged.
ANSLICENSE_API std::shared_ptr<std::recursive_timed_mutex> GetModelFolderLock(const std::string& folderPath);
// ============================================================================
// ModelFolderLockGuard
@@ -111,8 +114,10 @@ namespace fs = std::filesystem;
// error number 13" (EACCES) on the reader.
//
// Backed by GetModelFolderLock() above which returns a process-wide
// std::timed_mutex keyed on the folder path. The extractor takes the same
// lock, so extract ↔ open is mutually exclusive.
// std::recursive_timed_mutex keyed on the folder path. The extractor takes
// the same lock, so extract ↔ open is mutually exclusive across threads,
// while same-thread re-entry (layered loaders) is permitted without
// deadlocking.
//
// Acquisition is bounded by `timeout` (default 120 s) so a deadlocked peer
// cannot hang the caller thread forever. On timeout, .acquired() is false and
@@ -165,7 +170,7 @@ namespace ANSCENTER {
return;
}
auto lock = GetModelFolderLock(folderPath);
_guard = std::unique_lock<std::timed_mutex>(*lock, std::defer_lock);
_guard = std::unique_lock<std::recursive_timed_mutex>(*lock, std::defer_lock);
ANS_DBG("EngineLoad",
"%s: waiting on folder lock (%llds): %s",
_caller, (long long)timeout.count(), folderPath.c_str());
@@ -196,7 +201,7 @@ namespace ANSCENTER {
private:
const char* _caller;
std::string _folder;
std::unique_lock<std::timed_mutex> _guard;
std::unique_lock<std::recursive_timed_mutex> _guard;
bool _ok = false;
};
} // namespace ANSCENTER

View File

@@ -62,6 +62,111 @@ namespace ANSCENTER
return oss.str();
}
// ────────────────────────────────────────────────────────────────────────
// SummarizeChilkatError: turn Chilkat's verbose multi-line lastErrorText()
// into a single diagnostic line suitable for _logger.LogError().
//
// Chilkat's log looks like:
//   ChilkatLog:
//     Connect:
//       DllDate: ...
//       ChilkatVersion: ...
//       UnlockStatus: ...
//       restConnect:
//         domain_or_ip: s3.us-east-1.amazonaws.com
//         port: 443
//         socket2Connect: ... clientHandshake2: readHandshakeMessages:
//           Failed to read beginning of SSL/TLS record.
//           readTlsRecord: Socket operation timeout.
//           See https://cknotes.com/...
//       ConnectFailReason: 103
//       Failed.
//     --Connect
//   --ChilkatLog
//
// Diagnostics we care about:
//   • the deepest concrete failure reason (the last line that is neither a
//     call-stack label, a "--close" marker, a header field, nor the generic
//     trailing "Failed.")
//   • ConnectFailReason: <N> if present
//   • the target (domain_or_ip / ip_or_domain, plus port if present)
// Rest (DllDate, ChilkatVersion, Architecture, VerboseLogging, ...) is noise.
//
// @param raw  NUL-terminated Chilkat log text; may be null or empty.
// @return     One-line summary, or "(empty Chilkat log)" when there is no
//             non-blank content. Example:
//   "Chilkat Connect failed [reason=103] s3.us-east-1.amazonaws.com:443 — readTlsRecord: Socket operation timeout. (TLS handshake — server sent nothing (...)) (https://cknotes.com/...)"
// ────────────────────────────────────────────────────────────────────────
static std::string SummarizeChilkatError(const char* raw) {
    if (!raw || !*raw) return "(empty Chilkat log)";

    // Copy of `v` without leading/trailing blanks (Chilkat indents by level,
    // and CRLF logs leave a '\r' behind after getline).
    const auto trimmed = [](const std::string& v) -> std::string {
        const size_t first = v.find_first_not_of(" \t\r\n");
        if (first == std::string::npos) return std::string();
        const size_t last = v.find_last_not_of(" \t\r\n");
        return v.substr(first, last - first + 1);
    };
    const auto startsWith = [](const std::string& ln, const char* prefix) {
        return ln.rfind(prefix, 0) == 0;
    };
    // Value part of a "key: value" line, trimmed; empty if there is no ':'.
    const auto valueAfterColon = [&trimmed](const std::string& ln) -> std::string {
        const size_t colon = ln.find(':');
        if (colon == std::string::npos) return std::string();
        return trimmed(ln.substr(colon + 1));
    };
    // Pure scaffolding: "--foo" close-markers and "foo:" call-stack breadcrumbs.
    const auto isLabelLine = [&startsWith](const std::string& ln) {
        return ln.empty() || startsWith(ln, "--") || ln.back() == ':';
    };
    // Lines that carry a value but never explain the failure: the library
    // header fields and the generic "Failed." that closes every failed call.
    // Without this filter the trailing "Failed." would always win as the
    // "latest concrete line" and hide the real reason.
    const auto isNoiseLine = [&startsWith](const std::string& ln) {
        if (ln == "Failed." || ln == "Failed") return true;
        static const char* const kHeaderKeys[] = {
            "DllDate:", "ChilkatVersion:", "UnlockPrefix:", "UnlockStatus:",
            "Architecture:", "Language:", "VerboseLogging:",
        };
        for (const char* key : kHeaderKeys) {
            if (startsWith(ln, key)) return true;
        }
        return false;
    };

    // Split into trimmed, non-blank lines.
    std::vector<std::string> lines;
    {
        std::istringstream iss{std::string(raw)};
        std::string ln;
        while (std::getline(iss, ln)) {
            std::string t = trimmed(ln);
            if (!t.empty()) lines.push_back(std::move(t));
        }
    }
    if (lines.empty()) return "(empty Chilkat log)";

    // Fields we want to extract. Host and port are kept separate and joined
    // only when composing, so the result does not depend on the order (or
    // repetition) of the "domain_or_ip:" / "port:" lines.
    std::string host, port, failReason, tlsHint, leafError, seeUrl;
    for (const auto& ln : lines) {
        if (startsWith(ln, "domain_or_ip:") || startsWith(ln, "ip_or_domain:")) {
            host = valueAfterColon(ln);
        } else if (startsWith(ln, "ConnectFailReason:")) {
            failReason = valueAfterColon(ln);
        } else if (startsWith(ln, "port:")) {
            port = valueAfterColon(ln);
        } else if (startsWith(ln, "See http")) {   // also matches "See https"
            seeUrl = trimmed(ln.substr(4));        // drop "See "
        } else if (startsWith(ln, "Failed to read beginning of SSL/TLS record")) {
            tlsHint = "TLS handshake — server sent nothing (firewall/AV TLS inspection, proxy, or packet loss)";
        } else if (!isLabelLine(ln) && !isNoiseLine(ln)) {
            // Candidate concrete message. Keep the LATEST one: Chilkat's
            // deepest concrete error is usually the most specific explanation.
            leafError = ln;
        }
    }

    // Compose.
    std::ostringstream out;
    out << "Chilkat Connect failed";
    if (!failReason.empty()) out << " [reason=" << failReason << "]";
    if (!host.empty()) {
        out << " " << host;
        if (!port.empty()) out << ":" << port;
    }
    if (!leafError.empty()) out << " — " << leafError;
    if (!tlsHint.empty())   out << " (" << tlsHint << ")";
    if (!seeUrl.empty())    out << " (" << seeUrl << ")";
    return out.str();
}
// Private helper function to extract file name from a path
// Helper function to extract filename from path
std::string ANSAWSS3::ExtractFileName(const std::string& filePath) {
@@ -249,7 +354,9 @@ namespace ANSCENTER
// Connect
if (!conn->rest.Connect(_fullAWSURL.c_str(), _port, _bTls, _bAutoReconnect)) {
_logger.LogError("ANSAWSS3::CreateConnection", conn->rest.lastErrorText(), __FILE__, __LINE__);
const std::string summary = SummarizeChilkatError(conn->rest.lastErrorText());
_logger.LogError("ANSAWSS3::CreateConnection", summary, __FILE__, __LINE__);
ANS_DBG("AWSS3", "Connect failed: %s", summary.c_str());
return nullptr;
}
@@ -261,7 +368,9 @@ namespace ANSCENTER
conn->socket.put_HttpProxyPassword(_proxyPassword.c_str());
conn->socket.put_HttpProxyForHttp(_bProxy);
if (!conn->rest.UseConnection(conn->socket, true)) {
_logger.LogError("ANSAWSS3::CreateConnection - Proxy error", conn->rest.lastErrorText(), __FILE__, __LINE__);
const std::string summary = SummarizeChilkatError(conn->rest.lastErrorText());
_logger.LogError("ANSAWSS3::CreateConnection - Proxy error", summary, __FILE__, __LINE__);
ANS_DBG("AWSS3", "Proxy error: %s", summary.c_str());
return nullptr;
}
}
@@ -275,7 +384,9 @@ namespace ANSCENTER
if (!_bucketRegion.empty()) conn->authAws.put_Region(_bucketRegion.c_str());
}
if (!conn->rest.SetAuthAws(conn->authAws)) {
_logger.LogError("ANSAWSS3::CreateConnection - Auth error", conn->rest.lastErrorText(), __FILE__, __LINE__);
const std::string summary = SummarizeChilkatError(conn->rest.lastErrorText());
_logger.LogError("ANSAWSS3::CreateConnection - Auth error", summary, __FILE__, __LINE__);
ANS_DBG("AWSS3", "Auth error: %s", summary.c_str());
return nullptr;
}
}