Fix deadlock
This commit is contained in:
@@ -11,17 +11,23 @@
|
||||
// model folder — without that, thread A can finish extraction and begin opening
|
||||
// train_last.onnx while thread B re-enters extraction and truncates the file,
|
||||
// producing "system error number 13" (EACCES) on the first reader.
|
||||
// Recursive so the same thread can re-acquire the lock through layered load
|
||||
// calls — ANSALPR_OD::LoadEngine -> ANSONNXYOLO::LoadModelFromFolder both
|
||||
// acquire the SAME folder lock on the SAME thread. A non-recursive
|
||||
// timed_mutex deadlocks that nesting for 120 s then fails. Recursive keeps
|
||||
// cross-thread serialization intact while allowing legitimate re-entry from
|
||||
// the lock-holding thread.
|
||||
static std::mutex g_zipPathMapMutex;
|
||||
static std::map<std::string, std::shared_ptr<std::timed_mutex>> g_zipPathLocks;
|
||||
static std::map<std::string, std::shared_ptr<std::recursive_timed_mutex>> g_zipPathLocks;
|
||||
|
||||
static std::shared_ptr<std::timed_mutex> GetZipPathLock(const std::string& path) {
|
||||
static std::shared_ptr<std::recursive_timed_mutex> GetZipPathLock(const std::string& path) {
|
||||
std::lock_guard<std::mutex> guard(g_zipPathMapMutex);
|
||||
auto& ptr = g_zipPathLocks[path];
|
||||
if (!ptr) ptr = std::make_shared<std::timed_mutex>();
|
||||
if (!ptr) ptr = std::make_shared<std::recursive_timed_mutex>();
|
||||
return ptr;
|
||||
}
|
||||
|
||||
std::shared_ptr<std::timed_mutex> GetModelFolderLock(const std::string& folderPath) {
|
||||
std::shared_ptr<std::recursive_timed_mutex> GetModelFolderLock(const std::string& folderPath) {
|
||||
auto ptr = GetZipPathLock(folderPath);
|
||||
ANS_DBG("ModelLock", "GetModelFolderLock: folder=%s mutex=%p",
|
||||
folderPath.c_str(), (void*)ptr.get());
|
||||
@@ -464,7 +470,7 @@ bool AddFolderContentsToZip(zip* archive, const char* folderPath, const char* zi
|
||||
|
||||
bool ZipFolderWithPassword(const char* folderPath, const char* zipFilePath, const char* password) {
|
||||
auto pathLock = GetZipPathLock(std::string(zipFilePath));
|
||||
std::lock_guard<std::timed_mutex> zipGuard(*pathLock);
|
||||
std::lock_guard<std::recursive_timed_mutex> zipGuard(*pathLock);
|
||||
|
||||
zip* zipArchive;
|
||||
zip_flags_t flags = ZIP_CREATE | ZIP_TRUNCATE;
|
||||
@@ -850,7 +856,7 @@ std::string GetDateTimeString(const std::string& format) {
|
||||
bool ExtractProtectedZipFile(const std::string& zipFileName, const std::string& password, const std::string& modelName, const std::string outputFolder)
|
||||
{
|
||||
auto pathLock = GetZipPathLock(outputFolder);
|
||||
std::lock_guard<std::timed_mutex> zipGuard(*pathLock);
|
||||
std::lock_guard<std::recursive_timed_mutex> zipGuard(*pathLock);
|
||||
|
||||
int error;
|
||||
if (!FileExist(zipFileName))return false;
|
||||
|
||||
@@ -96,9 +96,12 @@ namespace fs = std::filesystem;
|
||||
// creation on the same extracted folder so concurrent CreateANSODHandle calls
|
||||
// cannot truncate/rewrite a model file while another thread is loading it.
|
||||
// Keyed by folder path (not zip path) so both extractor and consumer agree.
|
||||
// Returns std::timed_mutex so callers can bound their wait and avoid a hang
|
||||
// if a peer thread deadlocks inside extraction or ORT session creation.
|
||||
ANSLICENSE_API std::shared_ptr<std::timed_mutex> GetModelFolderLock(const std::string& folderPath);
|
||||
// Returns std::recursive_timed_mutex so callers can bound their wait and
|
||||
// recursion — layered load paths (e.g. ANSALPR_OD::LoadEngine ->
|
||||
// ANSONNXYOLO::LoadModelFromFolder) legitimately re-enter on the same
|
||||
// thread; a non-recursive timed_mutex self-deadlocks that nesting. Cross-
|
||||
// thread serialization is unchanged.
|
||||
ANSLICENSE_API std::shared_ptr<std::recursive_timed_mutex> GetModelFolderLock(const std::string& folderPath);
|
||||
|
||||
// ============================================================================
|
||||
// ModelFolderLockGuard
|
||||
@@ -111,8 +114,10 @@ namespace fs = std::filesystem;
|
||||
// error number 13" (EACCES) on the reader.
|
||||
//
|
||||
// Backed by GetModelFolderLock() above which returns a process-wide
|
||||
// std::timed_mutex keyed on the folder path. The extractor takes the same
|
||||
// lock, so extract ↔ open is mutually exclusive.
|
||||
// std::recursive_timed_mutex keyed on the folder path. The extractor takes
|
||||
// the same lock, so extract ↔ open is mutually exclusive across threads,
|
||||
// while same-thread re-entry (layered loaders) is permitted without
|
||||
// deadlocking.
|
||||
//
|
||||
// Acquisition is bounded by `timeout` (default 120 s) so a deadlocked peer
|
||||
// cannot hang the caller thread forever. On timeout, .acquired() is false and
|
||||
@@ -165,7 +170,7 @@ namespace ANSCENTER {
|
||||
return;
|
||||
}
|
||||
auto lock = GetModelFolderLock(folderPath);
|
||||
_guard = std::unique_lock<std::timed_mutex>(*lock, std::defer_lock);
|
||||
_guard = std::unique_lock<std::recursive_timed_mutex>(*lock, std::defer_lock);
|
||||
ANS_DBG("EngineLoad",
|
||||
"%s: waiting on folder lock (%llds): %s",
|
||||
_caller, (long long)timeout.count(), folderPath.c_str());
|
||||
@@ -196,7 +201,7 @@ namespace ANSCENTER {
|
||||
private:
|
||||
const char* _caller;
|
||||
std::string _folder;
|
||||
std::unique_lock<std::timed_mutex> _guard;
|
||||
std::unique_lock<std::recursive_timed_mutex> _guard;
|
||||
bool _ok = false;
|
||||
};
|
||||
} // namespace ANSCENTER
|
||||
|
||||
@@ -62,6 +62,111 @@ namespace ANSCENTER
|
||||
return oss.str();
|
||||
}
|
||||
|
||||
// ────────────────────────────────────────────────────────────────────────
|
||||
// SummarizeChilkatError: turn Chilkat's verbose multi-line lastErrorText()
|
||||
// into a single diagnostic line suitable for _logger.LogError().
|
||||
//
|
||||
// Chilkat's log looks like:
|
||||
// ChilkatLog: Connect: DllDate: ... ChilkatVersion: ... UnlockStatus: ...
|
||||
// restConnect: domain_or_ip: s3.us-east-1.amazonaws.com
|
||||
// socket2Connect: connect2: connectImplicitSsl: connectSocket_v2:
|
||||
// connect_domain: ckDnsResolveDomainIPv4_n: ...
|
||||
// clientHandshake2: readHandshakeMessages:
|
||||
// Failed to read beginning of SSL/TLS record.
|
||||
// readTlsRecord: Socket operation timeout.
|
||||
// See https://cknotes.com/...
|
||||
// ConnectFailReason: 103
|
||||
// Failed.
|
||||
//
|
||||
// Diagnostics we care about:
|
||||
// • the deepest concrete failure reason (the non-label leaf line)
|
||||
// • ConnectFailReason: <N> if present
|
||||
// • the target (domain_or_ip / ip_or_domain + port)
|
||||
// Rest (DllDate, ChilkatVersion, Architecture, VerboseLogging, ...) is noise.
|
||||
//
|
||||
// Returns e.g.:
|
||||
// "Connect failed [reason=103] s3.us-east-1.amazonaws.com:443 — Socket operation timeout (TLS handshake). See https://cknotes.com/failed-to-read-beginning-of-ssl-tls-record-..."
|
||||
// ────────────────────────────────────────────────────────────────────────
|
||||
static std::string SummarizeChilkatError(const char* raw) {
|
||||
if (!raw || !*raw) return "(empty Chilkat log)";
|
||||
std::string s(raw);
|
||||
// Split into trimmed lines.
|
||||
std::vector<std::string> lines;
|
||||
{
|
||||
std::istringstream iss(s);
|
||||
std::string ln;
|
||||
while (std::getline(iss, ln)) {
|
||||
// Trim leading whitespace (Chilkat indents by level).
|
||||
size_t a = ln.find_first_not_of(" \t\r\n");
|
||||
if (a == std::string::npos) continue;
|
||||
size_t b = ln.find_last_not_of(" \t\r\n");
|
||||
lines.push_back(ln.substr(a, b - a + 1));
|
||||
}
|
||||
}
|
||||
if (lines.empty()) return "(empty Chilkat log)";
|
||||
|
||||
// Lines that are pure labels ("foo:" or "--foo") are scaffolding.
|
||||
auto isLabelLine = [](const std::string& ln) {
|
||||
if (ln.empty()) return true;
|
||||
if (ln.rfind("--", 0) == 0) return true; // "--foo" close-marker
|
||||
// "label: value" — keep only if value is non-empty and line doesn't
|
||||
// look like pure call-stack breadcrumb (trailing ':').
|
||||
if (!ln.empty() && ln.back() == ':') return true;
|
||||
return false;
|
||||
};
|
||||
|
||||
// Fields we want to extract.
|
||||
std::string target, failReason, tlsHint, leafError, seeUrl;
|
||||
|
||||
for (auto& ln : lines) {
|
||||
if (ln.rfind("domain_or_ip:", 0) == 0 || ln.rfind("ip_or_domain:", 0) == 0) {
|
||||
auto p = ln.find(':');
|
||||
if (p != std::string::npos) target = ln.substr(p + 1);
|
||||
} else if (ln.rfind("ConnectFailReason:", 0) == 0) {
|
||||
auto p = ln.find(':');
|
||||
if (p != std::string::npos) failReason = ln.substr(p + 1);
|
||||
} else if (ln.rfind("port:", 0) == 0) {
|
||||
auto p = ln.find(':');
|
||||
if (p != std::string::npos) {
|
||||
// Append port to target if we have one.
|
||||
std::string port = ln.substr(p + 1);
|
||||
// Trim leading space.
|
||||
size_t a = port.find_first_not_of(" \t");
|
||||
if (a != std::string::npos) port = port.substr(a);
|
||||
if (!target.empty()) target += ":" + port;
|
||||
}
|
||||
} else if (ln.rfind("See http", 0) == 0 || ln.rfind("See https", 0) == 0) {
|
||||
seeUrl = ln.substr(4); // drop "See "
|
||||
} else if (ln.rfind("Failed to read beginning of SSL/TLS record", 0) == 0) {
|
||||
tlsHint = "TLS handshake — server sent nothing (firewall/AV TLS inspection, proxy, or packet loss)";
|
||||
} else if (!isLabelLine(ln)) {
|
||||
// Candidate concrete message. Keep the LATEST one that's not a
|
||||
// stack label; Chilkat's deepest concrete error is usually the
|
||||
// most specific explanation.
|
||||
leafError = ln;
|
||||
}
|
||||
}
|
||||
|
||||
// Trim leading space on captured fields.
|
||||
auto trim = [](std::string& v) {
|
||||
size_t a = v.find_first_not_of(" \t");
|
||||
if (a == std::string::npos) { v.clear(); return; }
|
||||
size_t b = v.find_last_not_of(" \t");
|
||||
v = v.substr(a, b - a + 1);
|
||||
};
|
||||
trim(target); trim(failReason); trim(leafError); trim(seeUrl);
|
||||
|
||||
// Compose.
|
||||
std::ostringstream out;
|
||||
out << "Chilkat Connect failed";
|
||||
if (!failReason.empty()) out << " [reason=" << failReason << "]";
|
||||
if (!target.empty()) out << " " << target;
|
||||
if (!leafError.empty()) out << " — " << leafError;
|
||||
if (!tlsHint.empty()) out << " (" << tlsHint << ")";
|
||||
if (!seeUrl.empty()) out << " (" << seeUrl << ")";
|
||||
return out.str();
|
||||
}
|
||||
|
||||
// Private helper function to extract file name from a path
|
||||
// Helper function to extract filename from path
|
||||
std::string ANSAWSS3::ExtractFileName(const std::string& filePath) {
|
||||
@@ -249,7 +354,9 @@ namespace ANSCENTER
|
||||
|
||||
// Connect
|
||||
if (!conn->rest.Connect(_fullAWSURL.c_str(), _port, _bTls, _bAutoReconnect)) {
|
||||
_logger.LogError("ANSAWSS3::CreateConnection", conn->rest.lastErrorText(), __FILE__, __LINE__);
|
||||
const std::string summary = SummarizeChilkatError(conn->rest.lastErrorText());
|
||||
_logger.LogError("ANSAWSS3::CreateConnection", summary, __FILE__, __LINE__);
|
||||
ANS_DBG("AWSS3", "Connect failed: %s", summary.c_str());
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
@@ -261,7 +368,9 @@ namespace ANSCENTER
|
||||
conn->socket.put_HttpProxyPassword(_proxyPassword.c_str());
|
||||
conn->socket.put_HttpProxyForHttp(_bProxy);
|
||||
if (!conn->rest.UseConnection(conn->socket, true)) {
|
||||
_logger.LogError("ANSAWSS3::CreateConnection - Proxy error", conn->rest.lastErrorText(), __FILE__, __LINE__);
|
||||
const std::string summary = SummarizeChilkatError(conn->rest.lastErrorText());
|
||||
_logger.LogError("ANSAWSS3::CreateConnection - Proxy error", summary, __FILE__, __LINE__);
|
||||
ANS_DBG("AWSS3", "Proxy error: %s", summary.c_str());
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
@@ -275,7 +384,9 @@ namespace ANSCENTER
|
||||
if (!_bucketRegion.empty()) conn->authAws.put_Region(_bucketRegion.c_str());
|
||||
}
|
||||
if (!conn->rest.SetAuthAws(conn->authAws)) {
|
||||
_logger.LogError("ANSAWSS3::CreateConnection - Auth error", conn->rest.lastErrorText(), __FILE__, __LINE__);
|
||||
const std::string summary = SummarizeChilkatError(conn->rest.lastErrorText());
|
||||
_logger.LogError("ANSAWSS3::CreateConnection - Auth error", summary, __FILE__, __LINE__);
|
||||
ANS_DBG("AWSS3", "Auth error: %s", summary.c_str());
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user