Fix deadlock
This commit is contained in:
@@ -11,17 +11,23 @@
|
|||||||
// model folder — without that, thread A can finish extraction and begin opening
|
// model folder — without that, thread A can finish extraction and begin opening
|
||||||
// train_last.onnx while thread B re-enters extraction and truncates the file,
|
// train_last.onnx while thread B re-enters extraction and truncates the file,
|
||||||
// producing "system error number 13" (EACCES) on the first reader.
|
// producing "system error number 13" (EACCES) on the first reader.
|
||||||
|
// Recursive so the same thread can re-acquire the lock through layered load
|
||||||
|
// calls — ANSALPR_OD::LoadEngine -> ANSONNXYOLO::LoadModelFromFolder both
|
||||||
|
// acquire the SAME folder lock on the SAME thread. A non-recursive
|
||||||
|
// timed_mutex deadlocks that nesting for 120 s then fails. Recursive keeps
|
||||||
|
// cross-thread serialization intact while allowing legitimate re-entry from
|
||||||
|
// the lock-holding thread.
|
||||||
static std::mutex g_zipPathMapMutex;
|
static std::mutex g_zipPathMapMutex;
|
||||||
static std::map<std::string, std::shared_ptr<std::timed_mutex>> g_zipPathLocks;
|
static std::map<std::string, std::shared_ptr<std::recursive_timed_mutex>> g_zipPathLocks;
|
||||||
|
|
||||||
static std::shared_ptr<std::timed_mutex> GetZipPathLock(const std::string& path) {
|
static std::shared_ptr<std::recursive_timed_mutex> GetZipPathLock(const std::string& path) {
|
||||||
std::lock_guard<std::mutex> guard(g_zipPathMapMutex);
|
std::lock_guard<std::mutex> guard(g_zipPathMapMutex);
|
||||||
auto& ptr = g_zipPathLocks[path];
|
auto& ptr = g_zipPathLocks[path];
|
||||||
if (!ptr) ptr = std::make_shared<std::timed_mutex>();
|
if (!ptr) ptr = std::make_shared<std::recursive_timed_mutex>();
|
||||||
return ptr;
|
return ptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
std::shared_ptr<std::timed_mutex> GetModelFolderLock(const std::string& folderPath) {
|
std::shared_ptr<std::recursive_timed_mutex> GetModelFolderLock(const std::string& folderPath) {
|
||||||
auto ptr = GetZipPathLock(folderPath);
|
auto ptr = GetZipPathLock(folderPath);
|
||||||
ANS_DBG("ModelLock", "GetModelFolderLock: folder=%s mutex=%p",
|
ANS_DBG("ModelLock", "GetModelFolderLock: folder=%s mutex=%p",
|
||||||
folderPath.c_str(), (void*)ptr.get());
|
folderPath.c_str(), (void*)ptr.get());
|
||||||
@@ -464,7 +470,7 @@ bool AddFolderContentsToZip(zip* archive, const char* folderPath, const char* zi
|
|||||||
|
|
||||||
bool ZipFolderWithPassword(const char* folderPath, const char* zipFilePath, const char* password) {
|
bool ZipFolderWithPassword(const char* folderPath, const char* zipFilePath, const char* password) {
|
||||||
auto pathLock = GetZipPathLock(std::string(zipFilePath));
|
auto pathLock = GetZipPathLock(std::string(zipFilePath));
|
||||||
std::lock_guard<std::timed_mutex> zipGuard(*pathLock);
|
std::lock_guard<std::recursive_timed_mutex> zipGuard(*pathLock);
|
||||||
|
|
||||||
zip* zipArchive;
|
zip* zipArchive;
|
||||||
zip_flags_t flags = ZIP_CREATE | ZIP_TRUNCATE;
|
zip_flags_t flags = ZIP_CREATE | ZIP_TRUNCATE;
|
||||||
@@ -850,7 +856,7 @@ std::string GetDateTimeString(const std::string& format) {
|
|||||||
bool ExtractProtectedZipFile(const std::string& zipFileName, const std::string& password, const std::string& modelName, const std::string outputFolder)
|
bool ExtractProtectedZipFile(const std::string& zipFileName, const std::string& password, const std::string& modelName, const std::string outputFolder)
|
||||||
{
|
{
|
||||||
auto pathLock = GetZipPathLock(outputFolder);
|
auto pathLock = GetZipPathLock(outputFolder);
|
||||||
std::lock_guard<std::timed_mutex> zipGuard(*pathLock);
|
std::lock_guard<std::recursive_timed_mutex> zipGuard(*pathLock);
|
||||||
|
|
||||||
int error;
|
int error;
|
||||||
if (!FileExist(zipFileName))return false;
|
if (!FileExist(zipFileName))return false;
|
||||||
|
|||||||
@@ -96,9 +96,12 @@ namespace fs = std::filesystem;
|
|||||||
// creation on the same extracted folder so concurrent CreateANSODHandle calls
|
// creation on the same extracted folder so concurrent CreateANSODHandle calls
|
||||||
// cannot truncate/rewrite a model file while another thread is loading it.
|
// cannot truncate/rewrite a model file while another thread is loading it.
|
||||||
// Keyed by folder path (not zip path) so both extractor and consumer agree.
|
// Keyed by folder path (not zip path) so both extractor and consumer agree.
|
||||||
// Returns std::timed_mutex so callers can bound their wait and avoid a hang
|
// Returns std::recursive_timed_mutex so callers can bound their wait and
|
||||||
// if a peer thread deadlocks inside extraction or ORT session creation.
|
// recurse — layered load paths (e.g. ANSALPR_OD::LoadEngine ->
|
||||||
ANSLICENSE_API std::shared_ptr<std::timed_mutex> GetModelFolderLock(const std::string& folderPath);
|
// ANSONNXYOLO::LoadModelFromFolder) legitimately re-enter on the same
|
||||||
|
// thread; a non-recursive timed_mutex self-deadlocks that nesting. Cross-
|
||||||
|
// thread serialization is unchanged.
|
||||||
|
ANSLICENSE_API std::shared_ptr<std::recursive_timed_mutex> GetModelFolderLock(const std::string& folderPath);
|
||||||
|
|
||||||
// ============================================================================
|
// ============================================================================
|
||||||
// ModelFolderLockGuard
|
// ModelFolderLockGuard
|
||||||
@@ -111,8 +114,10 @@ namespace fs = std::filesystem;
|
|||||||
// error number 13" (EACCES) on the reader.
|
// error number 13" (EACCES) on the reader.
|
||||||
//
|
//
|
||||||
// Backed by GetModelFolderLock() above which returns a process-wide
|
// Backed by GetModelFolderLock() above which returns a process-wide
|
||||||
// std::timed_mutex keyed on the folder path. The extractor takes the same
|
// std::recursive_timed_mutex keyed on the folder path. The extractor takes
|
||||||
// lock, so extract ↔ open is mutually exclusive.
|
// the same lock, so extract ↔ open is mutually exclusive across threads,
|
||||||
|
// while same-thread re-entry (layered loaders) is permitted without
|
||||||
|
// deadlocking.
|
||||||
//
|
//
|
||||||
// Acquisition is bounded by `timeout` (default 120 s) so a deadlocked peer
|
// Acquisition is bounded by `timeout` (default 120 s) so a deadlocked peer
|
||||||
// cannot hang the caller thread forever. On timeout, .acquired() is false and
|
// cannot hang the caller thread forever. On timeout, .acquired() is false and
|
||||||
@@ -165,7 +170,7 @@ namespace ANSCENTER {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
auto lock = GetModelFolderLock(folderPath);
|
auto lock = GetModelFolderLock(folderPath);
|
||||||
_guard = std::unique_lock<std::timed_mutex>(*lock, std::defer_lock);
|
_guard = std::unique_lock<std::recursive_timed_mutex>(*lock, std::defer_lock);
|
||||||
ANS_DBG("EngineLoad",
|
ANS_DBG("EngineLoad",
|
||||||
"%s: waiting on folder lock (%llds): %s",
|
"%s: waiting on folder lock (%llds): %s",
|
||||||
_caller, (long long)timeout.count(), folderPath.c_str());
|
_caller, (long long)timeout.count(), folderPath.c_str());
|
||||||
@@ -196,7 +201,7 @@ namespace ANSCENTER {
|
|||||||
private:
|
private:
|
||||||
const char* _caller;
|
const char* _caller;
|
||||||
std::string _folder;
|
std::string _folder;
|
||||||
std::unique_lock<std::timed_mutex> _guard;
|
std::unique_lock<std::recursive_timed_mutex> _guard;
|
||||||
bool _ok = false;
|
bool _ok = false;
|
||||||
};
|
};
|
||||||
} // namespace ANSCENTER
|
} // namespace ANSCENTER
|
||||||
|
|||||||
@@ -62,6 +62,111 @@ namespace ANSCENTER
|
|||||||
return oss.str();
|
return oss.str();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ────────────────────────────────────────────────────────────────────────
// SummarizeChilkatError: condense Chilkat's verbose, indented, multi-line
// lastErrorText() into a single diagnostic line for _logger.LogError().
//
// A Chilkat log looks roughly like (one entry per line, indented by level):
//   ChilkatLog:
//     Connect:
//       DllDate: ...  ChilkatVersion: ...  UnlockStatus: ...
//       restConnect:
//         domain_or_ip: s3.us-east-1.amazonaws.com
//         socket2Connect: connect2: connectImplicitSsl: connectSocket_v2:
//           connect_domain: ckDnsResolveDomainIPv4_n: ...
//           clientHandshake2: readHandshakeMessages:
//             Failed to read beginning of SSL/TLS record.
//             readTlsRecord: Socket operation timeout.
//             See https://cknotes.com/...
//       ConnectFailReason: 103
//       Failed.
//
// What is kept:
//   • the target: latest domain_or_ip / ip_or_domain, plus the port that
//     follows it
//   • ConnectFailReason: <N> if present
//   • the deepest concrete failure message: the LAST line that is neither a
//     call-stack label ("foo:" / "--foo"), a preamble field (DllDate,
//     ChilkatVersion, ...), nor Chilkat's generic trailing "Failed."
//   • a fixed TLS hint when the "Failed to read beginning of SSL/TLS record"
//     line is present
//   • the "See http..." knowledge-base URL, if any
//
// @param raw  NUL-terminated Chilkat log text; may be nullptr or empty.
// @return     "(empty Chilkat log)" when there is no non-blank content,
//             otherwise e.g.:
//   "Chilkat Connect failed [reason=103] s3.us-east-1.amazonaws.com:443 — Socket operation timeout. (TLS handshake — server sent nothing (firewall/AV TLS inspection, proxy, or packet loss)) (https://cknotes.com/...)"
//
// NOTE(review): the "Chilkat Connect failed" prefix is fixed even when the
// caller is summarizing a proxy or auth failure — confirm that is intended.
// ────────────────────────────────────────────────────────────────────────
static std::string SummarizeChilkatError(const char* raw) {
    if (!raw || !*raw) return "(empty Chilkat log)";

    // Library/build preamble fields: always present, never diagnostic.
    static constexpr const char* kPreambleKeys[] = {
        "DllDate:", "ChilkatVersion:", "UnlockPrefix:", "UnlockStatus:",
        "Architecture:", "Language:", "VerboseLogging:",
    };

    // Copy of `v` without surrounding whitespace (empty if all whitespace).
    const auto trimmed = [](const std::string& v) -> std::string {
        const size_t first = v.find_first_not_of(" \t\r\n");
        if (first == std::string::npos) return std::string();
        const size_t last = v.find_last_not_of(" \t\r\n");
        return v.substr(first, last - first + 1);
    };

    const auto startsWith = [](const std::string& v, const char* prefix) {
        return v.rfind(prefix, 0) == 0;
    };

    // Trimmed text after the first ':' of a "key: value" line.
    const auto valueAfterColon = [&trimmed](const std::string& ln) -> std::string {
        const size_t colon = ln.find(':');
        if (colon == std::string::npos) return std::string();
        return trimmed(ln.substr(colon + 1));
    };

    // True for lines that can never be the concrete failure message.
    // Precondition: `ln` is non-empty and already trimmed.
    const auto isScaffolding = [&startsWith](const std::string& ln) {
        if (startsWith(ln, "--")) return true;   // "--foo" close-marker
        if (ln.back() == ':') return true;       // "foo:" call-stack breadcrumb
        if (ln == "Failed.") return true;        // generic trailer; would otherwise
                                                 // always win as the last line
        for (const char* key : kPreambleKeys) {
            if (startsWith(ln, key)) return true;
        }
        return false;
    };

    std::string host, port, failReason, tlsHint, leafError, seeUrl;
    bool sawContent = false;

    std::istringstream in{std::string(raw)};
    std::string rawLine;
    while (std::getline(in, rawLine)) {
        // Chilkat indents by nesting level and may use CRLF line endings.
        const std::string ln = trimmed(rawLine);
        if (ln.empty()) continue;
        sawContent = true;

        if (startsWith(ln, "domain_or_ip:") || startsWith(ln, "ip_or_domain:")) {
            host = valueAfterColon(ln);
            port.clear();  // a port only belongs to the host line it follows
        } else if (startsWith(ln, "ConnectFailReason:")) {
            failReason = valueAfterColon(ln);
        } else if (startsWith(ln, "port:")) {
            // Overwrite rather than append so a repeated "port:" entry cannot
            // produce "host:443:443"; ignored until a host has been seen.
            if (!host.empty()) port = valueAfterColon(ln);
        } else if (startsWith(ln, "See http")) {  // also matches "See https"
            seeUrl = trimmed(ln.substr(4));       // drop "See "
        } else if (startsWith(ln, "Failed to read beginning of SSL/TLS record")) {
            tlsHint = "TLS handshake — server sent nothing (firewall/AV TLS inspection, proxy, or packet loss)";
        } else if (!isScaffolding(ln)) {
            // Keep the LATEST candidate: the deepest concrete message is
            // usually the most specific explanation.
            leafError = ln;
        }
    }
    if (!sawContent) return "(empty Chilkat log)";

    // Compose.
    std::ostringstream out;
    out << "Chilkat Connect failed";
    if (!failReason.empty()) out << " [reason=" << failReason << "]";
    if (!host.empty()) {
        out << " " << host;
        if (!port.empty()) out << ":" << port;
    }
    if (!leafError.empty()) out << " — " << leafError;
    if (!tlsHint.empty())   out << " (" << tlsHint << ")";
    if (!seeUrl.empty())    out << " (" << seeUrl << ")";
    return out.str();
}
|
||||||
|
|
||||||
// Private helper function to extract file name from a path
|
// Private helper function to extract file name from a path
|
||||||
// Helper function to extract filename from path
|
// Helper function to extract filename from path
|
||||||
std::string ANSAWSS3::ExtractFileName(const std::string& filePath) {
|
std::string ANSAWSS3::ExtractFileName(const std::string& filePath) {
|
||||||
@@ -249,7 +354,9 @@ namespace ANSCENTER
|
|||||||
|
|
||||||
// Connect
|
// Connect
|
||||||
if (!conn->rest.Connect(_fullAWSURL.c_str(), _port, _bTls, _bAutoReconnect)) {
|
if (!conn->rest.Connect(_fullAWSURL.c_str(), _port, _bTls, _bAutoReconnect)) {
|
||||||
_logger.LogError("ANSAWSS3::CreateConnection", conn->rest.lastErrorText(), __FILE__, __LINE__);
|
const std::string summary = SummarizeChilkatError(conn->rest.lastErrorText());
|
||||||
|
_logger.LogError("ANSAWSS3::CreateConnection", summary, __FILE__, __LINE__);
|
||||||
|
ANS_DBG("AWSS3", "Connect failed: %s", summary.c_str());
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -261,7 +368,9 @@ namespace ANSCENTER
|
|||||||
conn->socket.put_HttpProxyPassword(_proxyPassword.c_str());
|
conn->socket.put_HttpProxyPassword(_proxyPassword.c_str());
|
||||||
conn->socket.put_HttpProxyForHttp(_bProxy);
|
conn->socket.put_HttpProxyForHttp(_bProxy);
|
||||||
if (!conn->rest.UseConnection(conn->socket, true)) {
|
if (!conn->rest.UseConnection(conn->socket, true)) {
|
||||||
_logger.LogError("ANSAWSS3::CreateConnection - Proxy error", conn->rest.lastErrorText(), __FILE__, __LINE__);
|
const std::string summary = SummarizeChilkatError(conn->rest.lastErrorText());
|
||||||
|
_logger.LogError("ANSAWSS3::CreateConnection - Proxy error", summary, __FILE__, __LINE__);
|
||||||
|
ANS_DBG("AWSS3", "Proxy error: %s", summary.c_str());
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -275,7 +384,9 @@ namespace ANSCENTER
|
|||||||
if (!_bucketRegion.empty()) conn->authAws.put_Region(_bucketRegion.c_str());
|
if (!_bucketRegion.empty()) conn->authAws.put_Region(_bucketRegion.c_str());
|
||||||
}
|
}
|
||||||
if (!conn->rest.SetAuthAws(conn->authAws)) {
|
if (!conn->rest.SetAuthAws(conn->authAws)) {
|
||||||
_logger.LogError("ANSAWSS3::CreateConnection - Auth error", conn->rest.lastErrorText(), __FILE__, __LINE__);
|
const std::string summary = SummarizeChilkatError(conn->rest.lastErrorText());
|
||||||
|
_logger.LogError("ANSAWSS3::CreateConnection - Auth error", summary, __FILE__, __LINE__);
|
||||||
|
ANS_DBG("AWSS3", "Auth error: %s", summary.c_str());
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user