Fix NV12 crash when recreating the camera object

2026-04-02 22:07:27 +11:00
parent 4bedf3a3a2
commit 958cab6ae3
25 changed files with 1459 additions and 393 deletions
--- a/modules/ANSODEngine/ANSODEngine.cpp
+++ b/modules/ANSODEngine/ANSODEngine.cpp
@@ -1455,7 +1455,7 @@ namespace ANSCENTER
 		}
 	}
 	std::vector<Object> ANSODBase::RunStaticInference(const cv::Mat& input, cv::Rect Bbox, const std::string& camera_id) {
-		std::lock_guard<std::recursive_mutex> lock(_mutex);
+		// No coarse _mutex — only uses local variables and virtual RunInference() which has its own engine lock
 		std::vector<Object> output;
 		output.clear();
 		try {
@@ -2100,7 +2100,8 @@ namespace ANSCENTER
 		}
 	}
 	std::vector<Object>   ANSODBase::RunInferenceWithOption(const cv::Mat& input, const std::string& camera_id, const std::string activeROIMode) {
-		std::lock_guard<std::recursive_mutex> lock(_mutex);
+		// No coarse _mutex — sub-components (engines, trackers) have their own locks.
+		// LabVIEW semaphore controls concurrency at the caller level.
 		try {
 			int mode = 0;
 			double confidenceThreshold = 0.35;
@@ -2116,8 +2117,11 @@ namespace ANSCENTER
 			if (confidenceThreshold <= 0) confidenceThreshold = 0;
 			if (confidenceThreshold > 1) confidenceThreshold = 1;

-			// Update model configuration with the new parameters
-			if(confidenceThreshold>0)_modelConfig.detectionScoreThreshold = confidenceThreshold;
+			// Update model configuration with the new parameters (brief lock for config)
+			if (confidenceThreshold > 0) {
+				std::lock_guard<std::recursive_mutex> cfgLock(_mutex);
+				_modelConfig.detectionScoreThreshold = confidenceThreshold;
+			}
 			switch (mode) {
 			case 0: // Normal mode
 				return RunInference(input, camera_id); //RunInference
--- a/modules/ANSODEngine/NV12PreprocessHelper.cpp
+++ b/modules/ANSODEngine/NV12PreprocessHelper.cpp
@@ -275,6 +275,26 @@ namespace ANSCENTER {
                              gpuData->gpuIndex == inferenceGpu;
        const bool useZeroCopy = isCudaDevice && gpuMatch;

+        // --- Debug: log pointer state before reading ---
+        {
+            char _nv12_dbg[512];
+            snprintf(_nv12_dbg, sizeof(_nv12_dbg),
+                "[NV12Helper] tryNV12: gpuData=%p yPlane=%p uvPlane=%p isCuda=%d "
+                "gpuIdx=%d infGpu=%d gpuMatch=%d zeroCopy=%d "
+                "gpuCacheY=%p gpuCacheUV=%p gpuCacheValid=%d refcount=%d %dx%d\n",
+                (void*)gpuData, (void*)gpuData->yPlane, (void*)gpuData->uvPlane,
+                (int)isCudaDevice, gpuData->gpuIndex, inferenceGpu,
+                (int)gpuMatch, (int)useZeroCopy,
+                gpuData->gpuCacheY, gpuData->gpuCacheUV,
+                (int)gpuData->gpuCacheValid,
+                gpuData->refcount.load(),
+                frameW, frameH);
+#ifdef _WIN32
+            OutputDebugStringA(_nv12_dbg);
+#endif
+            fprintf(stderr, "%s", _nv12_dbg);
+        }
+
        // Effective plane pointers — for zero-copy, use CUDA device ptrs;
        // for CPU upload, use the CPU snapshot buffers.
        uint8_t* effYPlane;
@@ -283,7 +303,7 @@ namespace ANSCENTER {
        int      effUvLinesize;

        if (useZeroCopy) {
-            // Same GPU: wrap NVDEC device pointers directly
+            // Same GPU: wrap owned CUDA device pointers directly
            effYPlane     = gpuData->yPlane;
            effUvPlane    = gpuData->uvPlane;
            effYLinesize  = gpuData->yLinesize;
@@ -435,6 +455,18 @@ namespace ANSCENTER {
        gpuResized.create(inputH, inputW, CV_8UC3);

        cudaStream_t rawStream = cv::cuda::StreamAccessor::getStream(stream);
+        {
+            char _nv12_dbg2[256];
+            snprintf(_nv12_dbg2, sizeof(_nv12_dbg2),
+                "[NV12Helper] KERNEL LAUNCH: gpuY=%p(%dx%d) gpuUV=%p(%dx%d) -> %dx%d zeroCopy=%d\n",
+                (void*)gpuY.data, gpuY.cols, gpuY.rows,
+                (void*)gpuUV.data, gpuUV.cols, gpuUV.rows,
+                inputW, inputH, (int)useZeroCopy);
+#ifdef _WIN32
+            OutputDebugStringA(_nv12_dbg2);
+#endif
+            fprintf(stderr, "%s", _nv12_dbg2);
+        }
        launcher(gpuY, gpuUV, gpuResized, frameW, frameH, inputW, inputH, rawStream);

        stream.waitForCompletion();
@@ -945,7 +977,15 @@ namespace ANSCENTER {
                inputW, inputH, frameW, frameH, stream);
        }

-        cudaStreamSynchronize(stream);
+        // Use polling sync instead of cudaStreamSynchronize to avoid
+        // holding nvcuda64 SRW lock continuously (WDDM deadlock prevention).
+        {
+            cudaError_t err = cudaStreamQuery(stream);
+            while (err == cudaErrorNotReady) {
+                Sleep(0);
+                err = cudaStreamQuery(stream);
+            }
+        }

        // (No registry lock to release — data kept alive by refcount)

--- a/modules/ANSODEngine/nv12_to_rgb.cu
+++ b/modules/ANSODEngine/nv12_to_rgb.cu
@@ -8,6 +8,9 @@

 #include <cuda_runtime.h>
 #include <cstdint>
+#ifdef _WIN32
+#include <windows.h>  // Sleep()
+#endif
 #include <cstdio>

 // ── Shared YUV→RGB computation ───────────────────────────────────────────
@@ -651,7 +654,24 @@ int ANSGpuNV12ToBGR(
                      width * 3, height,
                      cudaMemcpyDeviceToHost, t_bufs.stream);

-    cudaStreamSynchronize(t_bufs.stream);
+    // Use polling sync instead of cudaStreamSynchronize to avoid
+    // holding nvcuda64 SRW lock continuously (WDDM deadlock prevention).
+    // Short Sleep(0) fast path for sub-ms kernels, then Sleep(1) to give
+    // cleanup operations (cuArrayDestroy, cuMemFree) a window to acquire
+    // the exclusive SRW lock.
+    {
+        cudaError_t qerr = cudaStreamQuery(t_bufs.stream);
+        if (qerr == cudaErrorNotReady) {
+            for (int i = 0; i < 10 && qerr == cudaErrorNotReady; ++i) {
+                Sleep(0);
+                qerr = cudaStreamQuery(t_bufs.stream);
+            }
+            while (qerr == cudaErrorNotReady) {
+                Sleep(1);
+                qerr = cudaStreamQuery(t_bufs.stream);
+            }
+        }
+    }

    // Check for errors
    cudaError_t err = cudaGetLastError();