Fix mutex lock issues

This commit is contained in:
2026-04-13 19:48:32 +10:00
parent 56a8f09adf
commit 844d7396b2
30 changed files with 445 additions and 575 deletions

View File

@@ -1734,6 +1734,7 @@ namespace ANSCENTER {
std::string& labelMap)
{
std::lock_guard<std::recursive_mutex> lock(_mutex);
ModelLoadingGuard mlg(_modelLoading);
try {
_modelLoadValid = false;
bool result = ANSODBase::Initialize(licenseKey, modelConfig,
@@ -1810,6 +1811,7 @@ namespace ANSCENTER {
const std::string& modelZipPassword)
{
std::lock_guard<std::recursive_mutex> lock(_mutex);
ModelLoadingGuard mlg(_modelLoading);
try {
bool result = ANSODBase::LoadModel(modelZipFilePath, modelZipPassword);
if (!result) return false;
@@ -1873,6 +1875,7 @@ namespace ANSCENTER {
std::string& labelMap)
{
std::lock_guard<std::recursive_mutex> lock(_mutex);
ModelLoadingGuard mlg(_modelLoading);
try {
bool result = ANSODBase::LoadModelFromFolder(licenseKey, modelConfig,
modelName, className,
@@ -1948,8 +1951,10 @@ namespace ANSCENTER {
std::vector<Object> ANSONNXYOLO::RunInference(const cv::Mat& inputImgBGR,
const std::string& camera_id)
{
if (_modelLoading.load()) return {};
{
std::lock_guard<std::recursive_mutex> lock(_mutex);
auto lock = TryLockWithTimeout("ANSONNXYOLO::RunInference");
if (!lock.owns_lock()) return {};
if (!_modelLoadValid) {
_logger.LogError("ANSONNXYOLO::RunInference", "Model not loaded", __FILE__, __LINE__);
return {};
@@ -1976,29 +1981,55 @@ namespace ANSCENTER {
const std::string& camera_id)
{
try {
ANS_DBG("ONNXYOLO", "DetectObjects: cam=%s acquiring mutex...", camera_id.c_str());
std::lock_guard<std::recursive_mutex> lock(_mutex);
ANS_DBG("ONNXYOLO", "DetectObjects: mutex acquired, cam=%s", camera_id.c_str());
if (!m_ortEngine) {
_logger.LogError("ANSONNXYOLO::DetectObjects", "ORT engine is null", __FILE__, __LINE__);
ANS_DBG("ONNXYOLO", "DetectObjects: ORT engine is null!");
// Fail-fast if a model load/init is in progress on another thread
if (_modelLoading.load()) {
ANS_DBG("ONNXYOLO", "DetectObjects: skipped — model loading in progress, cam=%s", camera_id.c_str());
return {};
}
// ── Snapshot config under a brief lock ──────────────────
// Only hold _mutex long enough to validate state and copy
// the parameters needed for inference. The actual ORT
// detect() call runs outside the lock so that concurrent
// Initialize/LoadModel calls are not blocked for the full
// duration of inference.
float probThresh, nmsThresh;
int numKps;
std::vector<std::string> classes;
bool trackerEnabled, stabilizationEnabled;
{
ANS_DBG("ONNXYOLO", "DetectObjects: cam=%s acquiring mutex...", camera_id.c_str());
auto lk = TryLockWithTimeout("ANSONNXYOLO::DetectObjects");
if (!lk.owns_lock()) return {}; // timed out
ANS_DBG("ONNXYOLO", "DetectObjects: mutex acquired, cam=%s", camera_id.c_str());
if (!m_ortEngine) {
_logger.LogError("ANSONNXYOLO::DetectObjects", "ORT engine is null", __FILE__, __LINE__);
ANS_DBG("ONNXYOLO", "DetectObjects: ORT engine is null!");
return {};
}
// Snapshot parameters while locked
probThresh = PROBABILITY_THRESHOLD;
nmsThresh = NMS_THRESHOLD;
numKps = NUM_KPS;
classes = _classes;
trackerEnabled = _trackerEnabled;
stabilizationEnabled = _stabilizationEnabled;
}
// ── _mutex released — heavy work below runs without holding the lock ───
// --- NV12 fast path: try to get full-res BGR from GPU NV12 frame ---
cv::Mat inferenceImage = inputImage;
float bgrScaleX = 1.0f, bgrScaleY = 1.0f;
{
auto* gpuData = tl_currentGpuFrame();
if (gpuData && gpuData->width > 0 && gpuData->height > 0) {
// Full-res NV12 available — convert to BGR on CPU for ORT
// (ORT preprocessing is CPU-based, so we need a cv::Mat)
if (gpuData->cpuYPlane && gpuData->cpuUvPlane &&
gpuData->cpuYLinesize >= gpuData->width &&
gpuData->cpuUvLinesize >= gpuData->width) {
const int fw = gpuData->width;
const int fh = gpuData->height;
// NV12 requires even dimensions
if ((fw % 2) == 0 && (fh % 2) == 0) {
try {
cv::Mat yPlane(fh, fw, CV_8UC1,
@@ -2017,10 +2048,12 @@ namespace ANSCENTER {
}
}
}
auto results = m_ortEngine->detect(inferenceImage, _classes,
PROBABILITY_THRESHOLD,
NMS_THRESHOLD,
NUM_KPS);
// Run ORT inference — no mutex held, this is the expensive call
auto results = m_ortEngine->detect(inferenceImage, classes,
probThresh,
nmsThresh,
numKps);
// --- Rescale coordinates from full-res back to display-res ---
if (bgrScaleX != 1.0f || bgrScaleY != 1.0f) {
@@ -2044,21 +2077,16 @@ namespace ANSCENTER {
for (auto& obj : results)
obj.cameraId = camera_id;
// Skip tracking for classification models
if (_trackerEnabled && !m_ortEngine->lastWasClassification) {
// Tracking/stabilization (ApplyTracking has its own lock)
if (trackerEnabled && !m_ortEngine->lastWasClassification) {
results = ApplyTracking(results, camera_id);
if (_stabilizationEnabled) results = StabilizeDetections(results, camera_id);
if (stabilizationEnabled) results = StabilizeDetections(results, camera_id);
}
return results;
}
catch (const std::exception& e) {
const std::string msg = e.what();
// ── DML device-removal detection ──────────────────────────
// HRESULT 887A0005 = DXGI_ERROR_DEVICE_REMOVED ("The GPU
// device instance has been suspended"). Once the D3D12
// device is gone the ORT session is permanently broken.
// Log once, attempt CPU fallback, suppress further flood.
if (msg.find("887A0005") != std::string::npos) {
if (!_dmlDeviceLost) {
_dmlDeviceLost = true;
@@ -2067,6 +2095,7 @@ namespace ANSCENTER {
__FILE__, __LINE__);
ANS_DBG("ONNXYOLO", "DML device lost — recreating session on CPU");
try {
std::lock_guard<std::recursive_mutex> lk(_mutex);
m_ortEngine.reset();
if (InitOrtEngine(ANSCENTER::EngineType::CPU)) {
_logger.LogInfo("ANSONNXYOLO::DetectObjects",
@@ -2084,7 +2113,6 @@ namespace ANSCENTER {
__FILE__, __LINE__);
}
}
// Suppress flood — already logged above
return {};
}
@@ -2101,8 +2129,10 @@ namespace ANSCENTER {
std::vector<std::vector<Object>> ANSONNXYOLO::RunInferencesBatch(
const std::vector<cv::Mat>& inputs, const std::string& camera_id)
{
if (_modelLoading.load()) return {};
{
std::lock_guard<std::recursive_mutex> lock(_mutex);
auto lock = TryLockWithTimeout("ANSONNXYOLO::RunInferencesBatch");
if (!lock.owns_lock()) return {};
if (!_modelLoadValid) {
_logger.LogFatal("ANSONNXYOLO::RunInferencesBatch",
"Cannot load ONNX model", __FILE__, __LINE__);
@@ -2154,16 +2184,33 @@ namespace ANSCENTER {
const std::vector<cv::Mat>& inputImages, const std::string& camera_id)
{
try {
std::lock_guard<std::recursive_mutex> lock(_mutex);
if (!m_ortEngine) {
_logger.LogError("ANSONNXYOLO::DetectObjectsBatch",
"ORT engine is null", __FILE__, __LINE__);
return {};
if (_modelLoading.load()) return {};
// Snapshot config under brief lock
float probThresh, nmsThresh;
int numKps;
std::vector<std::string> classes;
bool trackerEnabled, stabilizationEnabled;
{
auto lk = TryLockWithTimeout("ANSONNXYOLO::DetectObjectsBatch");
if (!lk.owns_lock()) return {};
if (!m_ortEngine) {
_logger.LogError("ANSONNXYOLO::DetectObjectsBatch",
"ORT engine is null", __FILE__, __LINE__);
return {};
}
probThresh = PROBABILITY_THRESHOLD;
nmsThresh = NMS_THRESHOLD;
numKps = NUM_KPS;
classes = _classes;
trackerEnabled = _trackerEnabled;
stabilizationEnabled = _stabilizationEnabled;
}
// Heavy work outside lock
auto batchResults = m_ortEngine->detectBatch(
inputImages, _classes,
PROBABILITY_THRESHOLD, NMS_THRESHOLD, NUM_KPS);
inputImages, classes, probThresh, nmsThresh, numKps);
const bool isClassification = m_ortEngine->lastBatchWasClassification;
@@ -2171,10 +2218,9 @@ namespace ANSCENTER {
for (auto& obj : results)
obj.cameraId = camera_id;
// Skip tracking for classification models
if (_trackerEnabled && !isClassification) {
if (trackerEnabled && !isClassification) {
results = ApplyTracking(results, camera_id);
if (_stabilizationEnabled) results = StabilizeDetections(results, camera_id);
if (stabilizationEnabled) results = StabilizeDetections(results, camera_id);
}
}
@@ -2189,6 +2235,7 @@ namespace ANSCENTER {
"DirectML GPU device lost (887A0005) — attempting CPU fallback",
__FILE__, __LINE__);
try {
std::lock_guard<std::recursive_mutex> lk(_mutex);
m_ortEngine.reset();
if (!InitOrtEngine(ANSCENTER::EngineType::CPU))
_logger.LogFatal("ANSONNXYOLO::DetectObjectsBatch",