From fd2394a85a4b7f39994c1ce68d4cd8976b4f8013 Mon Sep 17 00:00:00 2001 From: Tuan Nghia Nguyen Date: Fri, 24 Apr 2026 12:54:16 +1000 Subject: [PATCH] Fix deadlock --- core/ANSLicensingSystem/Utility.cpp | 18 +++-- core/ANSLicensingSystem/Utility.h | 19 +++-- modules/ANSUtilities/ANSAWSS3.cpp | 117 +++++++++++++++++++++++++++- 3 files changed, 138 insertions(+), 16 deletions(-) diff --git a/core/ANSLicensingSystem/Utility.cpp b/core/ANSLicensingSystem/Utility.cpp index 0ce20a5..0a4e71d 100644 --- a/core/ANSLicensingSystem/Utility.cpp +++ b/core/ANSLicensingSystem/Utility.cpp @@ -11,17 +11,23 @@ // model folder — without that, thread A can finish extraction and begin opening // train_last.onnx while thread B re-enters extraction and truncates the file, // producing "system error number 13" (EACCES) on the first reader. +// Recursive so the same thread can re-acquire the lock through layered load +// calls — ANSALPR_OD::LoadEngine -> ANSONNXYOLO::LoadModelFromFolder both +// acquire the SAME folder lock on the SAME thread. A non-recursive +// timed_mutex deadlocks that nesting for 120 s then fails. Recursive keeps +// cross-thread serialization intact while allowing legitimate re-entry from +// the lock-holding thread. static std::mutex g_zipPathMapMutex; -static std::map> g_zipPathLocks; +static std::map> g_zipPathLocks; -static std::shared_ptr GetZipPathLock(const std::string& path) { +static std::shared_ptr GetZipPathLock(const std::string& path) { std::lock_guard guard(g_zipPathMapMutex); auto& ptr = g_zipPathLocks[path]; - if (!ptr) ptr = std::make_shared(); + if (!ptr) ptr = std::make_shared(); return ptr; } -std::shared_ptr GetModelFolderLock(const std::string& folderPath) { +std::shared_ptr GetModelFolderLock(const std::string& folderPath) { auto ptr = GetZipPathLock(folderPath); ANS_DBG("ModelLock", "GetModelFolderLock: folder=%s mutex=%p", folderPath.c_str(), (void*)ptr.get()); @@ -464,7 +470,7 @@ bool AddFolderContentsToZip(zip* archive, const char* folderPath, const char* zi bool ZipFolderWithPassword(const char* folderPath, const char* zipFilePath, const char* password) { auto pathLock = GetZipPathLock(std::string(zipFilePath)); - std::lock_guard zipGuard(*pathLock); + std::lock_guard zipGuard(*pathLock); zip* zipArchive; zip_flags_t flags = ZIP_CREATE | ZIP_TRUNCATE; @@ -850,7 +856,7 @@ std::string GetDateTimeString(const std::string& format) { bool ExtractProtectedZipFile(const std::string& zipFileName, const std::string& password, const std::string& modelName, const std::string outputFolder) { auto pathLock = GetZipPathLock(outputFolder); - std::lock_guard zipGuard(*pathLock); + std::lock_guard zipGuard(*pathLock); int error; if (!FileExist(zipFileName))return false; diff --git a/core/ANSLicensingSystem/Utility.h b/core/ANSLicensingSystem/Utility.h index 7ad6b09..063977a 100644 --- a/core/ANSLicensingSystem/Utility.h +++ b/core/ANSLicensingSystem/Utility.h @@ -96,9 +96,12 @@ namespace fs = std::filesystem; // creation on the same extracted folder so concurrent CreateANSODHandle calls // cannot truncate/rewrite a model file while another thread is loading it. // Keyed by folder path (not zip path) so both extractor and consumer agree. - // Returns std::timed_mutex so callers can bound their wait and avoid a hang - // if a peer thread deadlocks inside extraction or ORT session creation. - ANSLICENSE_API std::shared_ptr GetModelFolderLock(const std::string& folderPath); + // Returns std::recursive_timed_mutex so callers can bound their wait and + // recursion — layered load paths (e.g. ANSALPR_OD::LoadEngine -> + // ANSONNXYOLO::LoadModelFromFolder) legitimately re-enter on the same + // thread; a non-recursive timed_mutex self-deadlocks that nesting. Cross- + // thread serialization is unchanged. + ANSLICENSE_API std::shared_ptr GetModelFolderLock(const std::string& folderPath); // ============================================================================ // ModelFolderLockGuard @@ -111,8 +114,10 @@ namespace fs = std::filesystem; // error number 13" (EACCES) on the reader. // // Backed by GetModelFolderLock() above which returns a process-wide -// std::timed_mutex keyed on the folder path. The extractor takes the same -// lock, so extract ↔ open is mutually exclusive. +// std::recursive_timed_mutex keyed on the folder path. The extractor takes +// the same lock, so extract ↔ open is mutually exclusive across threads, +// while same-thread re-entry (layered loaders) is permitted without +// deadlocking. // // Acquisition is bounded by `timeout` (default 120 s) so a deadlocked peer // cannot hang the caller thread forever. On timeout, .acquired() is false and @@ -165,7 +170,7 @@ namespace ANSCENTER { return; } auto lock = GetModelFolderLock(folderPath); - _guard = std::unique_lock(*lock, std::defer_lock); + _guard = std::unique_lock(*lock, std::defer_lock); ANS_DBG("EngineLoad", "%s: waiting on folder lock (%llds): %s", _caller, (long long)timeout.count(), folderPath.c_str()); @@ -196,7 +201,7 @@ namespace ANSCENTER { private: const char* _caller; std::string _folder; - std::unique_lock _guard; + std::unique_lock _guard; bool _ok = false; }; } // namespace ANSCENTER diff --git a/modules/ANSUtilities/ANSAWSS3.cpp b/modules/ANSUtilities/ANSAWSS3.cpp index 9b9f3c8..21970e5 100644 --- a/modules/ANSUtilities/ANSAWSS3.cpp +++ b/modules/ANSUtilities/ANSAWSS3.cpp @@ -62,6 +62,111 @@ namespace ANSCENTER return oss.str(); } + // ──────────────────────────────────────────────────────────────────────── + // SummarizeChilkatError: turn Chilkat's verbose multi-line lastErrorText() + // into a single diagnostic line suitable for _logger.LogError(). + // + // Chilkat's log looks like: + // ChilkatLog: Connect: DllDate: ... ChilkatVersion: ... UnlockStatus: ... + // restConnect: domain_or_ip: s3.us-east-1.amazonaws.com + // socket2Connect: connect2: connectImplicitSsl: connectSocket_v2: + // connect_domain: ckDnsResolveDomainIPv4_n: ... + // clientHandshake2: readHandshakeMessages: + // Failed to read beginning of SSL/TLS record. + // readTlsRecord: Socket operation timeout. + // See https://cknotes.com/... + // ConnectFailReason: 103 + // Failed. + // + // Diagnostics we care about: + // • the deepest concrete failure reason (the non-label leaf line) + // • ConnectFailReason: if present + // • the target (domain_or_ip / ip_or_domain + port) + // Rest (DllDate, ChilkatVersion, Architecture, VerboseLogging, ...) is noise. + // + // Returns e.g.: + // "Connect failed [reason=103] s3.us-east-1.amazonaws.com:443 — Socket operation timeout (TLS handshake). See https://cknotes.com/failed-to-read-beginning-of-ssl-tls-record-..." + // ──────────────────────────────────────────────────────────────────────── + static std::string SummarizeChilkatError(const char* raw) { + if (!raw || !*raw) return "(empty Chilkat log)"; + std::string s(raw); + // Split into trimmed lines. + std::vector lines; + { + std::istringstream iss(s); + std::string ln; + while (std::getline(iss, ln)) { + // Trim leading whitespace (Chilkat indents by level). + size_t a = ln.find_first_not_of(" \t\r\n"); + if (a == std::string::npos) continue; + size_t b = ln.find_last_not_of(" \t\r\n"); + lines.push_back(ln.substr(a, b - a + 1)); + } + } + if (lines.empty()) return "(empty Chilkat log)"; + + // Lines that are pure labels ("foo:" or "--foo") are scaffolding. + auto isLabelLine = [](const std::string& ln) { + if (ln.empty()) return true; + if (ln.rfind("--", 0) == 0) return true; // "--foo" close-marker + // "label: value" — keep only if value is non-empty and line doesn't + // look like pure call-stack breadcrumb (trailing ':'). + if (!ln.empty() && ln.back() == ':') return true; + return false; + }; + + // Fields we want to extract. + std::string target, failReason, tlsHint, leafError, seeUrl; + + for (auto& ln : lines) { + if (ln.rfind("domain_or_ip:", 0) == 0 || ln.rfind("ip_or_domain:", 0) == 0) { + auto p = ln.find(':'); + if (p != std::string::npos) target = ln.substr(p + 1); + } else if (ln.rfind("ConnectFailReason:", 0) == 0) { + auto p = ln.find(':'); + if (p != std::string::npos) failReason = ln.substr(p + 1); + } else if (ln.rfind("port:", 0) == 0) { + auto p = ln.find(':'); + if (p != std::string::npos) { + // Append port to target if we have one. + std::string port = ln.substr(p + 1); + // Trim leading space. + size_t a = port.find_first_not_of(" \t"); + if (a != std::string::npos) port = port.substr(a); + if (!target.empty()) target += ":" + port; + } + } else if (ln.rfind("See http", 0) == 0 || ln.rfind("See https", 0) == 0) { + seeUrl = ln.substr(4); // drop "See " + } else if (ln.rfind("Failed to read beginning of SSL/TLS record", 0) == 0) { + tlsHint = "TLS handshake — server sent nothing (firewall/AV TLS inspection, proxy, or packet loss)"; + } else if (!isLabelLine(ln)) { + // Candidate concrete message. Keep the LATEST one that's not a + // stack label; Chilkat's deepest concrete error is usually the + // most specific explanation. + leafError = ln; + } + } + + // Trim leading space on captured fields. + auto trim = [](std::string& v) { + size_t a = v.find_first_not_of(" \t"); + if (a == std::string::npos) { v.clear(); return; } + size_t b = v.find_last_not_of(" \t"); + v = v.substr(a, b - a + 1); + }; + trim(target); trim(failReason); trim(leafError); trim(seeUrl); + + // Compose. + std::ostringstream out; + out << "Chilkat Connect failed"; + if (!failReason.empty()) out << " [reason=" << failReason << "]"; + if (!target.empty()) out << " " << target; + if (!leafError.empty()) out << " — " << leafError; + if (!tlsHint.empty()) out << " (" << tlsHint << ")"; + if (!seeUrl.empty()) out << " (" << seeUrl << ")"; + return out.str(); + } + // Private helper function to extract file name from a path // Helper function to extract filename from path std::string ANSAWSS3::ExtractFileName(const std::string& filePath) { @@ -249,7 +354,9 @@ namespace ANSCENTER // Connect if (!conn->rest.Connect(_fullAWSURL.c_str(), _port, _bTls, _bAutoReconnect)) { - _logger.LogError("ANSAWSS3::CreateConnection", conn->rest.lastErrorText(), __FILE__, __LINE__); + const std::string summary = SummarizeChilkatError(conn->rest.lastErrorText()); + _logger.LogError("ANSAWSS3::CreateConnection", summary, __FILE__, __LINE__); + ANS_DBG("AWSS3", "Connect failed: %s", summary.c_str()); return nullptr; } @@ -261,7 +368,9 @@ namespace ANSCENTER conn->socket.put_HttpProxyPassword(_proxyPassword.c_str()); conn->socket.put_HttpProxyForHttp(_bProxy); if (!conn->rest.UseConnection(conn->socket, true)) { - _logger.LogError("ANSAWSS3::CreateConnection - Proxy error", conn->rest.lastErrorText(), __FILE__, __LINE__); + const std::string summary = SummarizeChilkatError(conn->rest.lastErrorText()); + _logger.LogError("ANSAWSS3::CreateConnection - Proxy error", summary, __FILE__, __LINE__); + ANS_DBG("AWSS3", "Proxy error: %s", summary.c_str()); return nullptr; } } @@ -275,7 +384,9 @@ namespace ANSCENTER if (!_bucketRegion.empty()) conn->authAws.put_Region(_bucketRegion.c_str()); } if (!conn->rest.SetAuthAws(conn->authAws)) { - _logger.LogError("ANSAWSS3::CreateConnection - Auth error", conn->rest.lastErrorText(), __FILE__, __LINE__); + const std::string summary = SummarizeChilkatError(conn->rest.lastErrorText()); + _logger.LogError("ANSAWSS3::CreateConnection - Auth error", summary, __FILE__, __LINE__); + ANS_DBG("AWSS3", "Auth error: %s", summary.c_str()); return nullptr; } }