Fix mutex lock issues

2026-04-13 19:48:32 +10:00
parent 56a8f09adf
commit 844d7396b2
30 changed files with 445 additions and 575 deletions
--- a/modules/ANSODEngine/ANSONNXYOLO.cpp
+++ b/modules/ANSODEngine/ANSONNXYOLO.cpp
@@ -1734,6 +1734,7 @@ namespace ANSCENTER {
                                   std::string& labelMap)
    {
        std::lock_guard<std::recursive_mutex> lock(_mutex);
+        ModelLoadingGuard mlg(_modelLoading);
        try {
            _modelLoadValid = false;
            bool result = ANSODBase::Initialize(licenseKey, modelConfig,
@@ -1810,6 +1811,7 @@ namespace ANSCENTER {
                                  const std::string& modelZipPassword)
    {
        std::lock_guard<std::recursive_mutex> lock(_mutex);
+        ModelLoadingGuard mlg(_modelLoading);
        try {
            bool result = ANSODBase::LoadModel(modelZipFilePath, modelZipPassword);
            if (!result) return false;
@@ -1873,6 +1875,7 @@ namespace ANSCENTER {
                                            std::string& labelMap)
    {
        std::lock_guard<std::recursive_mutex> lock(_mutex);
+        ModelLoadingGuard mlg(_modelLoading);
        try {
            bool result = ANSODBase::LoadModelFromFolder(licenseKey, modelConfig,
                                                         modelName, className,
@@ -1948,8 +1951,10 @@ namespace ANSCENTER {
    std::vector<Object> ANSONNXYOLO::RunInference(const cv::Mat& inputImgBGR,
                                                     const std::string& camera_id)
    {
+        if (_modelLoading.load()) return {};
        {
-            std::lock_guard<std::recursive_mutex> lock(_mutex);
+            auto lock = TryLockWithTimeout("ANSONNXYOLO::RunInference");
+            if (!lock.owns_lock()) return {};
            if (!_modelLoadValid) {
                _logger.LogError("ANSONNXYOLO::RunInference", "Model not loaded", __FILE__, __LINE__);
                return {};
@@ -1976,29 +1981,55 @@ namespace ANSCENTER {
                                                      const std::string& camera_id)
    {
        try {
-            ANS_DBG("ONNXYOLO", "DetectObjects: cam=%s acquiring mutex...", camera_id.c_str());
-            std::lock_guard<std::recursive_mutex> lock(_mutex);
-            ANS_DBG("ONNXYOLO", "DetectObjects: mutex acquired, cam=%s", camera_id.c_str());
-            if (!m_ortEngine) {
-                _logger.LogError("ANSONNXYOLO::DetectObjects", "ORT engine is null", __FILE__, __LINE__);
-                ANS_DBG("ONNXYOLO", "DetectObjects: ORT engine is null!");
+            // Fail-fast if a model load/init is in progress on another thread
+            if (_modelLoading.load()) {
+                ANS_DBG("ONNXYOLO", "DetectObjects: skipped — model loading in progress, cam=%s", camera_id.c_str());
                return {};
            }

+            // ── Snapshot config under a brief lock ──────────────────
+            // Only hold _mutex long enough to validate state and copy
+            // the parameters needed for inference.  The actual ORT
+            // detect() call runs outside the lock so that concurrent
+            // Initialize/LoadModel calls are not blocked for the full
+            // duration of inference.
+            float probThresh, nmsThresh;
+            int   numKps;
+            std::vector<std::string> classes;
+            bool  trackerEnabled, stabilizationEnabled;
+            {
+                ANS_DBG("ONNXYOLO", "DetectObjects: cam=%s acquiring mutex...", camera_id.c_str());
+                auto lk = TryLockWithTimeout("ANSONNXYOLO::DetectObjects");
+                if (!lk.owns_lock()) return {};  // timed out
+                ANS_DBG("ONNXYOLO", "DetectObjects: mutex acquired, cam=%s", camera_id.c_str());
+
+                if (!m_ortEngine) {
+                    _logger.LogError("ANSONNXYOLO::DetectObjects", "ORT engine is null", __FILE__, __LINE__);
+                    ANS_DBG("ONNXYOLO", "DetectObjects: ORT engine is null!");
+                    return {};
+                }
+
+                // Snapshot parameters while locked
+                probThresh          = PROBABILITY_THRESHOLD;
+                nmsThresh           = NMS_THRESHOLD;
+                numKps              = NUM_KPS;
+                classes             = _classes;
+                trackerEnabled      = _trackerEnabled;
+                stabilizationEnabled = _stabilizationEnabled;
+            }
+            // ── _mutex released — heavy work below runs without holding _mutex ───
+
            // --- NV12 fast path: try to get full-res BGR from GPU NV12 frame ---
            cv::Mat inferenceImage = inputImage;
            float bgrScaleX = 1.0f, bgrScaleY = 1.0f;
            {
                auto* gpuData = tl_currentGpuFrame();
                if (gpuData && gpuData->width > 0 && gpuData->height > 0) {
-                    // Full-res NV12 available — convert to BGR on CPU for ORT
-                    // (ORT preprocessing is CPU-based, so we need a cv::Mat)
                    if (gpuData->cpuYPlane && gpuData->cpuUvPlane &&
                            gpuData->cpuYLinesize >= gpuData->width &&
                            gpuData->cpuUvLinesize >= gpuData->width) {
                        const int fw = gpuData->width;
                        const int fh = gpuData->height;
-                        // NV12 requires even dimensions
                        if ((fw % 2) == 0 && (fh % 2) == 0) {
                            try {
                                cv::Mat yPlane(fh, fw, CV_8UC1,
@@ -2017,10 +2048,12 @@ namespace ANSCENTER {
                    }
                }
            }
-            auto results = m_ortEngine->detect(inferenceImage, _classes,
-                                               PROBABILITY_THRESHOLD,
-                                               NMS_THRESHOLD,
-                                               NUM_KPS);
+
+            // Run ORT inference (the expensive call) with no mutex held. NOTE(review): m_ortEngine is dereferenced unlocked here — confirm it cannot be reset concurrently.
+            auto results = m_ortEngine->detect(inferenceImage, classes,
+                                               probThresh,
+                                               nmsThresh,
+                                               numKps);

            // --- Rescale coordinates from full-res back to display-res ---
            if (bgrScaleX != 1.0f || bgrScaleY != 1.0f) {
@@ -2044,21 +2077,16 @@ namespace ANSCENTER {
            for (auto& obj : results)
                obj.cameraId = camera_id;

-            // Skip tracking for classification models
-            if (_trackerEnabled && !m_ortEngine->lastWasClassification) {
+            // Tracking/stabilization (ApplyTracking has its own lock)
+            if (trackerEnabled && !m_ortEngine->lastWasClassification) {
                results = ApplyTracking(results, camera_id);
-                if (_stabilizationEnabled) results = StabilizeDetections(results, camera_id);
+                if (stabilizationEnabled) results = StabilizeDetections(results, camera_id);
            }
            return results;
        }
        catch (const std::exception& e) {
            const std::string msg = e.what();

-            // ── DML device-removal detection ──────────────────────────
-            // HRESULT 887A0005 = DXGI_ERROR_DEVICE_REMOVED ("The GPU
-            // device instance has been suspended").  Once the D3D12
-            // device is gone the ORT session is permanently broken.
-            // Log once, attempt CPU fallback, suppress further flood.
            if (msg.find("887A0005") != std::string::npos) {
                if (!_dmlDeviceLost) {
                    _dmlDeviceLost = true;
@@ -2067,6 +2095,7 @@ namespace ANSCENTER {
                        __FILE__, __LINE__);
                    ANS_DBG("ONNXYOLO", "DML device lost — recreating session on CPU");
                    try {
+                        std::lock_guard<std::recursive_mutex> lk(_mutex);
                        m_ortEngine.reset();
                        if (InitOrtEngine(ANSCENTER::EngineType::CPU)) {
                            _logger.LogInfo("ANSONNXYOLO::DetectObjects",
@@ -2084,7 +2113,6 @@ namespace ANSCENTER {
                            __FILE__, __LINE__);
                    }
                }
-                // Suppress flood — already logged above
                return {};
            }

@@ -2101,8 +2129,10 @@ namespace ANSCENTER {
    std::vector<std::vector<Object>> ANSONNXYOLO::RunInferencesBatch(
        const std::vector<cv::Mat>& inputs, const std::string& camera_id)
    {
+        if (_modelLoading.load()) return {};
        {
-            std::lock_guard<std::recursive_mutex> lock(_mutex);
+            auto lock = TryLockWithTimeout("ANSONNXYOLO::RunInferencesBatch");
+            if (!lock.owns_lock()) return {};
            if (!_modelLoadValid) {
                _logger.LogFatal("ANSONNXYOLO::RunInferencesBatch",
                    "Cannot load ONNX model", __FILE__, __LINE__);
@@ -2154,16 +2184,33 @@ namespace ANSCENTER {
        const std::vector<cv::Mat>& inputImages, const std::string& camera_id)
    {
        try {
-            std::lock_guard<std::recursive_mutex> lock(_mutex);
-            if (!m_ortEngine) {
-                _logger.LogError("ANSONNXYOLO::DetectObjectsBatch",
-                    "ORT engine is null", __FILE__, __LINE__);
-                return {};
+            if (_modelLoading.load()) return {};
+
+            // Snapshot config under brief lock
+            float probThresh, nmsThresh;
+            int   numKps;
+            std::vector<std::string> classes;
+            bool  trackerEnabled, stabilizationEnabled;
+            {
+                auto lk = TryLockWithTimeout("ANSONNXYOLO::DetectObjectsBatch");
+                if (!lk.owns_lock()) return {};
+
+                if (!m_ortEngine) {
+                    _logger.LogError("ANSONNXYOLO::DetectObjectsBatch",
+                        "ORT engine is null", __FILE__, __LINE__);
+                    return {};
+                }
+                probThresh           = PROBABILITY_THRESHOLD;
+                nmsThresh            = NMS_THRESHOLD;
+                numKps               = NUM_KPS;
+                classes              = _classes;
+                trackerEnabled       = _trackerEnabled;
+                stabilizationEnabled = _stabilizationEnabled;
            }

+            // Heavy work outside lock
            auto batchResults = m_ortEngine->detectBatch(
-                inputImages, _classes,
-                PROBABILITY_THRESHOLD, NMS_THRESHOLD, NUM_KPS);
+                inputImages, classes, probThresh, nmsThresh, numKps);

            const bool isClassification = m_ortEngine->lastBatchWasClassification;

@@ -2171,10 +2218,9 @@ namespace ANSCENTER {
                for (auto& obj : results)
                    obj.cameraId = camera_id;

-                // Skip tracking for classification models
-                if (_trackerEnabled && !isClassification) {
+                if (trackerEnabled && !isClassification) {
                    results = ApplyTracking(results, camera_id);
-                    if (_stabilizationEnabled) results = StabilizeDetections(results, camera_id);
+                    if (stabilizationEnabled) results = StabilizeDetections(results, camera_id);
                }
            }

@@ -2189,6 +2235,7 @@ namespace ANSCENTER {
                        "DirectML GPU device lost (887A0005) — attempting CPU fallback",
                        __FILE__, __LINE__);
                    try {
+                        std::lock_guard<std::recursive_mutex> lk(_mutex);
                        m_ortEngine.reset();
                        if (!InitOrtEngine(ANSCENTER::EngineType::CPU))
                            _logger.LogFatal("ANSONNXYOLO::DetectObjectsBatch",