Fix NV12 crash issue when recreating the camera object

This commit is contained in:
2026-04-02 22:07:27 +11:00
parent 4bedf3a3a2
commit 958cab6ae3
25 changed files with 1459 additions and 393 deletions

View File

@@ -23,6 +23,7 @@
#include <thread>
#include <mutex>
#include <atomic>
#include <random>
#include <chrono>
#include <deque>
#include <set>
@@ -751,8 +752,11 @@ static void LogGpuInfo() {
// Worker thread: reads RTSP frames and runs ALPR inference
// RTSP client and ALPR engine are pre-created on the main thread to avoid
// race conditions in CreateANSRTSPHandle / CreateANSALPRHandle.
// Takes rtspClientPtr (pointer to array slot) + streamGuard mutex so the
// CHAOS thread can safely destroy+recreate the stream without use-after-free.
static void ALPRWorkerThread(int taskId,
ANSCENTER::ANSRTSPClient* rtspClient,
ANSCENTER::ANSRTSPClient** rtspClientPtr,
std::mutex* streamGuard,
ANSCENTER::ANSALPR* alprHandle,
TaskState& state) {
char tag[32];
@@ -780,6 +784,23 @@ static void ALPRWorkerThread(int taskId,
bool hwDecodeLogged = false;
while (g_running.load()) {
// Lock the stream guard to prevent CHAOS from destroying the client
// while we're mid-frame-grab or mid-inference.
std::unique_lock<std::mutex> streamLock(*streamGuard);
// Re-read the client pointer each iteration — CHAOS may have
// destroyed+recreated it, so our old pointer could be dangling.
ANSCENTER::ANSRTSPClient* rtspClient = *rtspClientPtr;
if (rtspClient == nullptr) {
streamLock.unlock();
emptyFrames++;
if (emptyFrames % 100 == 1) {
g_log.add(prefix + " Stream destroyed by CHAOS, waiting... (count=" + std::to_string(emptyFrames) + ")");
}
std::this_thread::sleep_for(std::chrono::milliseconds(50));
continue;
}
// Read frame from RTSP via ANSCV
auto grabStart = std::chrono::steady_clock::now();
cv::Mat* framePtr = nullptr;
@@ -797,6 +818,7 @@ static void ALPRWorkerThread(int taskId,
ReconnectRTSP(&rtspClient);
emptyFrames = 0;
}
streamLock.unlock();
if (framePtr) delete framePtr;
std::this_thread::sleep_for(std::chrono::milliseconds(10));
continue;
@@ -829,6 +851,9 @@ static void ALPRWorkerThread(int taskId,
// matches by cv::Mat* pointer, so `new cv::Mat(*framePtr)` would create
// a different pointer the registry doesn't know, breaking NV12 zero-copy.
ANSALPR_RunInferenceComplete_CPP(&alprHandle, &framePtr, cameraId.c_str(), 0, 0, lpnResult, jpegImage);
// Release stream lock — inference is done, CHAOS can now safely destroy.
streamLock.unlock();
auto infEnd = std::chrono::steady_clock::now();
double infMs = std::chrono::duration<double, std::milli>(infEnd - infStart).count();
totalInfMs += infMs;
@@ -933,19 +958,20 @@ int ANSLPR_MultiGPU_StressTest() {
printf("\n");
printf("============================================================\n");
printf(" ANSLPR Multi-GPU Stress Test — 4 Parallel ALPR Tasks\n");
printf(" ANSLPR Multi-GPU Stress Test — 5 Parallel ALPR Tasks\n");
printf(" (4 cameras, 5 AI tasks — Task 4 shares Stream 2)\n");
printf(" Press ESC to stop\n");
printf(" Log file: %s\n", LOG_FILE_PATH);
printf("============================================================\n\n");
g_log.add("============================================================");
g_log.add(" ANSLPR Multi-GPU Stress Test — 4 Parallel ALPR Tasks");
g_log.add(" ANSLPR Multi-GPU Stress Test — 5 Parallel ALPR Tasks");
g_log.add("============================================================");
// --- Log GPU info for diagnostics ---
LogGpuInfo();
// --- RTSP URLs (4 independent streams, one per task) ---
// --- RTSP URLs (4 independent camera streams) ---
const std::string rtspUrl0 = "rtsp://admin:admin123@103.156.0.133:8010/cam/realmonitor?channel=1&subtype=0";
const std::string rtspUrl1 = "rtsp://cafe2471.ddns.net:600/rtsp/streaming?channel=01&subtype=0";
const std::string rtspUrl2 = "rtsp://nhathuocngoclinh.zapto.org:600/rtsp/streaming?channel=01&subtype=0";
@@ -956,18 +982,39 @@ int ANSLPR_MultiGPU_StressTest() {
g_log.add("Stream 2: " + rtspUrl2);
g_log.add("Stream 3: " + rtspUrl3);
// =========================================================================
// Architecture: Camera Process + AI Task Process (mimics LabVIEW)
// -----------------------------------------------------------------------
// Camera Process: 4 independent RTSP streams acquire frames from cameras.
// AI Task Process: 5 AI tasks subscribe to camera streams and run inference
// in parallel. Multiple tasks can share one camera stream.
// Task 4 subscribes to Stream 2 (nhathuocngoclinh) to demonstrate the
// shared-camera subscription model used in LabVIEW.
// =========================================================================
const int NUM_STREAMS = 4;
const int NUM_TASKS = 5;
// --- Task states ---
TaskState taskStates[4];
TaskState taskStates[NUM_TASKS];
// =========================================================================
// Create 4 INDEPENDENT RTSP readers one per task, each with its own
// camera stream. Each task gets a dedicated RTSP connection.
// CAMERA PROCESS: Create 4 independent RTSP readers (one per camera).
// These form the camera acquisition layer that AI tasks subscribe to.
// =========================================================================
const int NUM_STREAMS = 4;
ANSCENTER::ANSRTSPClient* rtspClients[NUM_STREAMS] = {};
const std::string streamUrls[NUM_STREAMS] = { rtspUrl0, rtspUrl1, rtspUrl2, rtspUrl3 };
// Map: task index -> stream index (1:1 mapping)
const int taskStreamMap[4] = { 0, 1, 2, 3 };
// Map: task index -> stream index
// Tasks 0-3 map 1:1 to streams 0-3.
// Task 4 subscribes to Stream 2 (nhathuocngoclinh) — shared camera.
const int taskStreamMap[NUM_TASKS] = { 0, 1, 2, 3, 2 };
// Log task-to-stream subscription mapping
g_log.add("--- AI Task -> Camera Stream subscription ---");
for (int i = 0; i < NUM_TASKS; i++) {
g_log.add(" Task " + std::to_string(i) + " -> Stream " + std::to_string(taskStreamMap[i])
+ " (" + streamUrls[taskStreamMap[i]] + ")");
}
for (int s = 0; s < NUM_STREAMS; s++) {
printf("[Stream%d] Creating RTSP handle for %s...\n", s, streamUrls[s].c_str());
@@ -986,14 +1033,17 @@ int ANSLPR_MultiGPU_StressTest() {
}
// =========================================================================
// Create 4 ALPR engines sequentially
// AI TASK PROCESS: Create 5 ALPR engines sequentially.
// Each AI task gets its own engine and subscribes to a camera stream.
// Task 4 shares Stream 2 (nhathuocngoclinh) with Task 2 — demonstrating
// the LabVIEW pattern where multiple AI tasks subscribe to one camera.
// =========================================================================
ANSCENTER::ANSALPR* alprHandles[4] = {};
ANSCENTER::ANSALPR* alprHandles[NUM_TASKS] = {};
std::string modelZipFile = "C:\\ProgramData\\ANSCENTER\\ANSVIS Server\\ANSALPR\\ANS_ALPR_v1.2.zip";
int engineType = 1; // NVIDIA_GPU
double detThresh = 0.5, ocrThresh = 0.5, colThresh = 0.5;
for (int i = 0; i < 4; i++) {
for (int i = 0; i < NUM_TASKS; i++) {
char tag[32];
snprintf(tag, sizeof(tag), "[Task%d]", i);
@@ -1109,7 +1159,7 @@ int ANSLPR_MultiGPU_StressTest() {
// Count votes: how many tasks on this stream use each GPU
std::map<int, int> gpuVotes;
for (int i = 0; i < 4; i++) {
for (int i = 0; i < NUM_TASKS; i++) {
if (taskStreamMap[i] == s && alprHandles[i]) {
gpuVotes[taskStates[i].gpuDeviceId]++;
}
@@ -1194,30 +1244,132 @@ int ANSLPR_MultiGPU_StressTest() {
}
// --- Enable deep pipeline benchmarking on all ALPR handles ---
for (int i = 0; i < 4; i++) {
for (int i = 0; i < NUM_TASKS; i++) {
if (alprHandles[i]) {
alprHandles[i]->ActivateDebugger(true);
}
}
g_log.add("Debug benchmarking ENABLED on all ALPR handles");
// --- Launch worker threads — tasks sharing a stream get the same RTSP client ---
g_log.add("Launching worker threads...");
std::thread workers[4];
for (int i = 0; i < 4; i++) {
// --- Per-stream mutex: prevents CHAOS from destroying a stream while a
// worker is mid-frame-grab or mid-inference (use-after-free fix). ---
std::mutex streamGuards[NUM_STREAMS];
// --- Launch worker threads ---
// Each AI task subscribes to its camera stream via taskStreamMap.
// Tasks sharing a stream (e.g. Task 2 & Task 4 on Stream 2) both get
// the same RTSP client pointer and share the stream's mutex guard.
g_log.add("Launching " + std::to_string(NUM_TASKS) + " worker threads...");
std::thread workers[NUM_TASKS];
for (int i = 0; i < NUM_TASKS; i++) {
int streamIdx = taskStreamMap[i];
if (rtspClients[streamIdx] && alprHandles[i]) {
workers[i] = std::thread(ALPRWorkerThread, i,
rtspClients[streamIdx], alprHandles[i],
&rtspClients[streamIdx],
&streamGuards[streamIdx],
alprHandles[i],
std::ref(taskStates[i]));
}
}
// =========================================================================
// Camera Chaos Thread — simulates camera errors / reconnects
// Mimics LabVIEW behavior: cameras randomly go into Error/Recovering
// state, triggering Stop/Reconnect/Destroy+Recreate cycles that cause
// CUDA cleanup (cuArrayDestroy, cuMemFree) while inference is running.
// This is the exact scenario that triggers the nvcuda64 SRW lock deadlock.
// =========================================================================
std::atomic<bool> chaosEnabled{true};
std::thread chaosThread([&]() {
std::mt19937 rng(std::random_device{}());
// Wait 10 seconds for system to stabilize before starting chaos
for (int i = 0; i < 100 && g_running.load(); i++) {
std::this_thread::sleep_for(std::chrono::milliseconds(100));
}
g_log.add("[CHAOS] Camera chaos thread started — every 10s, stop/destroy/recreate one camera (round-robin)");
printf("[CHAOS] Camera chaos thread started — 10s interval, round-robin across %d streams\n", NUM_STREAMS);
int chaosCount = 0;
int nextStream = 0; // Round-robin: cycle through streams 0,1,2,3,0,1,...
while (g_running.load() && chaosEnabled.load()) {
// Fixed 10-second interval between chaos events
for (int s = 0; s < 100 && g_running.load(); s++) {
std::this_thread::sleep_for(std::chrono::milliseconds(100));
}
if (!g_running.load()) break;
int streamIdx = nextStream;
nextStream = (nextStream + 1) % NUM_STREAMS;
chaosCount++;
char buf[512];
auto chaosStart = std::chrono::steady_clock::now();
// Lock stream guard: wait for any in-flight inference to finish
// before touching the RTSP client. This prevents use-after-free
// when CHAOS destroys a stream while a worker is mid-inference.
std::unique_lock<std::mutex> chaosLock(streamGuards[streamIdx]);
// Always use full DESTROY + RECREATE cycle.
// Reconnect() reuses internal player state which can leave stale
// CUDA resources and cause freezes. A clean destroy + recreate
// guarantees a fresh decoder/player with no leftover state.
{
bool wasAlive = (rtspClients[streamIdx] != nullptr);
snprintf(buf, sizeof(buf), "[CHAOS #%d] Stream%d: DESTROY + RECREATE (%s)",
chaosCount, streamIdx,
wasAlive ? "camera was running" : "camera was already offline");
g_log.add(buf);
printf("%s\n", buf);
// Stop and release old handle if it exists
if (rtspClients[streamIdx]) {
StopRTSP(&rtspClients[streamIdx]);
ReleaseANSRTSPHandle(&rtspClients[streamIdx]);
rtspClients[streamIdx] = nullptr;
}
// Release lock during offline sleep — worker sees nullptr and skips
int offlineMs = 500 + (rng() % 2500); // 0.5 - 3 seconds offline
chaosLock.unlock();
std::this_thread::sleep_for(std::chrono::milliseconds(offlineMs));
chaosLock.lock();
// Recreate the RTSP handle (under lock again)
int result = CreateANSRTSPHandle(&rtspClients[streamIdx], "", "", "",
streamUrls[streamIdx].c_str());
if (result == 1 && rtspClients[streamIdx]) {
SetRTSPImageQuality(&rtspClients[streamIdx], 0);
SetRTSPHWDecoding(&rtspClients[streamIdx], 7);
StartRTSP(&rtspClients[streamIdx]);
auto chaosEnd = std::chrono::steady_clock::now();
double chaosMs = std::chrono::duration<double, std::milli>(chaosEnd - chaosStart).count();
snprintf(buf, sizeof(buf), "[CHAOS #%d] Stream%d: RECREATED in %.0f ms (offline %d ms)",
chaosCount, streamIdx, chaosMs, offlineMs);
} else {
snprintf(buf, sizeof(buf), "[CHAOS #%d] Stream%d: RECREATE FAILED (result=%d)",
chaosCount, streamIdx, result);
}
g_log.add(buf);
printf("%s\n", buf);
}
}
g_log.add("[CHAOS] Camera chaos thread stopped (total events: " + std::to_string(chaosCount) + ")");
printf("[CHAOS] Camera chaos thread stopped (total events: %d)\n", chaosCount);
});
// --- Display loop (main thread) ---
const int cellW = 640, cellH = 480;
const int logPanelH = 200;
// 3x2 grid layout: 5 tasks displayed in 3 columns x 2 rows
const int cellW = 480, cellH = 360; // Smaller cells for 3-column layout
const int logPanelH = 220;
const int gridCols = 3, gridRows = 2;
cv::namedWindow("ANSLPR Multi-GPU Stress Test", cv::WINDOW_NORMAL);
cv::resizeWindow("ANSLPR Multi-GPU Stress Test", cellW * 2, cellH * 2 + logPanelH);
cv::resizeWindow("ANSLPR Multi-GPU Stress Test", cellW * gridCols, cellH * gridRows + logPanelH);
auto testStart = std::chrono::steady_clock::now();
auto lastGpuSnapshot = std::chrono::steady_clock::now();
@@ -1244,12 +1396,12 @@ int ANSLPR_MultiGPU_StressTest() {
}
// Per-task stats
double totalFpsSnap = 0;
for (int t = 0; t < 4; t++) {
for (int t = 0; t < NUM_TASKS; t++) {
std::lock_guard<std::mutex> lk(taskStates[t].mtx);
char buf[256];
snprintf(buf, sizeof(buf),
" T%d: GPU[%d] VRAM=%zuMiB FPS=%.1f GrabMs=%.0f InfMs=%.0f Frames=%d Det=%d",
t, taskStates[t].gpuDeviceId,
" T%d(S%d): GPU[%d] VRAM=%zuMiB FPS=%.1f GrabMs=%.0f InfMs=%.0f Frames=%d Det=%d",
t, taskStreamMap[t], taskStates[t].gpuDeviceId,
taskStates[t].vramUsedBytes / (1024 * 1024),
taskStates[t].fps, taskStates[t].lastGrabMs, taskStates[t].inferenceMs,
taskStates[t].frameCount, taskStates[t].detectionCount);
@@ -1261,7 +1413,7 @@ int ANSLPR_MultiGPU_StressTest() {
g_log.add(buf);
// Multi-GPU check
std::set<int> gpusUsed;
for (int t = 0; t < 4; t++) {
for (int t = 0; t < NUM_TASKS; t++) {
if (taskStates[t].gpuDeviceId >= 0) gpusUsed.insert(taskStates[t].gpuDeviceId);
}
if (gpusUsed.size() > 1) {
@@ -1271,12 +1423,12 @@ int ANSLPR_MultiGPU_StressTest() {
}
g_log.add("---- END SNAPSHOT ----");
}
// Build 2x2 grid + log panel
cv::Mat canvas(cellH * 2 + logPanelH, cellW * 2, CV_8UC3, cv::Scalar(30, 30, 30));
// Build 3x2 grid + log panel (5 tasks: 3 cols x 2 rows, cell [1][2] empty)
cv::Mat canvas(cellH * gridRows + logPanelH, cellW * gridCols, CV_8UC3, cv::Scalar(30, 30, 30));
// Place each task's frame in its quadrant
for (int i = 0; i < 4; i++) {
int row = i / 2, col = i % 2;
// Place each task's frame in its cell
for (int i = 0; i < NUM_TASKS; i++) {
int row = i / gridCols, col = i % gridCols;
cv::Rect roi(col * cellW, row * cellH, cellW, cellH);
cv::Mat cell;
@@ -1313,8 +1465,8 @@ int ANSLPR_MultiGPU_StressTest() {
// Draw status bar at bottom of each cell (2 lines)
cv::rectangle(cell, cv::Rect(0, cellH - 50, cellW, 50), cv::Scalar(0, 0, 0), cv::FILLED);
char bar1[256], bar2[256];
snprintf(bar1, sizeof(bar1), "T%d | %.1f FPS | %.0fms | Frames:%d | Det:%d | %s",
i, fps, infMs, fCount, dCount,
snprintf(bar1, sizeof(bar1), "T%d(S%d) | %.1f FPS | %.0fms | F:%d | D:%d | %s",
i, taskStreamMap[i], fps, infMs, fCount, dCount,
lastPlate.empty() ? "-" : lastPlate.c_str());
if (gpuId >= 0) {
snprintf(bar2, sizeof(bar2), "GPU[%d] | VRAM: %zu MiB", gpuId, vramMiB);
@@ -1323,45 +1475,53 @@ int ANSLPR_MultiGPU_StressTest() {
}
cv::Scalar barColor = engineLoaded ? cv::Scalar(0, 255, 0) : cv::Scalar(0, 100, 255);
cv::putText(cell, bar1, cv::Point(5, cellH - 28),
cv::FONT_HERSHEY_SIMPLEX, 0.45, barColor, 1);
cv::FONT_HERSHEY_SIMPLEX, 0.4, barColor, 1);
cv::putText(cell, bar2, cv::Point(5, cellH - 8),
cv::FONT_HERSHEY_SIMPLEX, 0.45, cv::Scalar(0, 200, 255), 1);
cv::FONT_HERSHEY_SIMPLEX, 0.4, cv::Scalar(0, 200, 255), 1);
cell.copyTo(canvas(roi));
// Draw grid lines
cv::line(canvas, cv::Point(cellW, 0), cv::Point(cellW, cellH * 2),
cv::Scalar(100, 100, 100), 1);
cv::line(canvas, cv::Point(0, cellH), cv::Point(cellW * 2, cellH),
cv::Scalar(100, 100, 100), 1);
}
// Draw grid lines
for (int c = 1; c < gridCols; c++)
cv::line(canvas, cv::Point(c * cellW, 0), cv::Point(c * cellW, cellH * gridRows),
cv::Scalar(100, 100, 100), 1);
for (int r = 1; r < gridRows; r++)
cv::line(canvas, cv::Point(0, r * cellH), cv::Point(cellW * gridCols, r * cellH),
cv::Scalar(100, 100, 100), 1);
// --- Log panel at bottom ---
cv::Rect logRoi(0, cellH * 2, cellW * 2, logPanelH);
cv::Rect logRoi(0, cellH * gridRows, cellW * gridCols, logPanelH);
cv::Mat logPanel = canvas(logRoi);
logPanel.setTo(cv::Scalar(20, 20, 20));
// Elapsed time header
auto elapsed = std::chrono::duration<double>(std::chrono::steady_clock::now() - testStart).count();
char header[128];
char header[256];
snprintf(header, sizeof(header),
"Elapsed: %.0fs | Press ESC to stop | Resize window freely", elapsed);
"Elapsed: %.0fs | %d cameras, %d AI tasks | Press ESC to stop",
elapsed, NUM_STREAMS, NUM_TASKS);
cv::putText(logPanel, header, cv::Point(10, 18),
cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(200, 200, 0), 1);
// Aggregate stats + per-task GPU summary
double totalFps = 0;
for (int i = 0; i < 4; i++) {
for (int i = 0; i < NUM_TASKS; i++) {
std::lock_guard<std::mutex> lk(taskStates[i].mtx);
totalFps += taskStates[i].fps;
}
char aggLine[256];
snprintf(aggLine, sizeof(aggLine), "Total throughput: %.1f FPS | T0:GPU%d T1:GPU%d T2:GPU%d T3:GPU%d",
totalFps,
taskStates[0].gpuDeviceId, taskStates[1].gpuDeviceId,
taskStates[2].gpuDeviceId, taskStates[3].gpuDeviceId);
// Build dynamic task-GPU summary string
std::string taskGpuStr;
for (int i = 0; i < NUM_TASKS; i++) {
if (i > 0) taskGpuStr += " ";
taskGpuStr += "T" + std::to_string(i) + "(S" + std::to_string(taskStreamMap[i])
+ "):GPU" + std::to_string(taskStates[i].gpuDeviceId);
}
char aggLine[512];
snprintf(aggLine, sizeof(aggLine), "Total: %.1f FPS | %s",
totalFps, taskGpuStr.c_str());
cv::putText(logPanel, aggLine, cv::Point(10, 38),
cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 255, 255), 1);
cv::FONT_HERSHEY_SIMPLEX, 0.45, cv::Scalar(0, 255, 255), 1);
// Real-time GPU VRAM monitor (query every frame — cheap call)
auto gpuSnaps = QueryGpuVram();
@@ -1370,7 +1530,7 @@ int ANSLPR_MultiGPU_StressTest() {
// Count tasks on this GPU and their total VRAM
int tasksOnGpu = 0;
size_t taskVramMiB = 0;
for (int i = 0; i < 4; i++) {
for (int i = 0; i < NUM_TASKS; i++) {
std::lock_guard<std::mutex> lk(taskStates[i].mtx);
if (taskStates[i].gpuDeviceId == gs.deviceId) {
tasksOnGpu++;
@@ -1387,13 +1547,13 @@ int ANSLPR_MultiGPU_StressTest() {
gpuLineY += 18;
}
// Per-task resource line
for (int i = 0; i < 4; i++) {
// Per-task resource line (shows which stream each task subscribes to)
for (int i = 0; i < NUM_TASKS; i++) {
std::lock_guard<std::mutex> lk(taskStates[i].mtx);
char tLine[256];
snprintf(tLine, sizeof(tLine),
"T%d: GPU[%d] VRAM=%zuMiB FPS=%.1f Inf=%.0fms Frames=%d Det=%d",
i, taskStates[i].gpuDeviceId,
"T%d(S%d): GPU[%d] VRAM=%zuMiB FPS=%.1f Inf=%.0fms Frames=%d Det=%d",
i, taskStreamMap[i], taskStates[i].gpuDeviceId,
taskStates[i].vramUsedBytes / (1024 * 1024),
taskStates[i].fps, taskStates[i].inferenceMs,
taskStates[i].frameCount, taskStates[i].detectionCount);
@@ -1421,9 +1581,13 @@ int ANSLPR_MultiGPU_StressTest() {
}
}
// --- Stop chaos thread ---
chaosEnabled.store(false);
if (chaosThread.joinable()) chaosThread.join();
// --- Wait for all workers ---
printf("Waiting for worker threads to finish...\n");
for (int i = 0; i < 4; i++) {
printf("Waiting for %d worker threads to finish...\n", NUM_TASKS);
for (int i = 0; i < NUM_TASKS; i++) {
if (workers[i].joinable()) workers[i].join();
}
@@ -1433,19 +1597,21 @@ int ANSLPR_MultiGPU_StressTest() {
g_log.add("================================================================");
g_log.add(" FINAL PERFORMANCE SUMMARY");
g_log.add(" " + std::to_string(NUM_STREAMS) + " cameras, " + std::to_string(NUM_TASKS) + " AI tasks");
g_log.add(" Total runtime: " + std::to_string((int)totalElapsed) + " seconds");
g_log.add("================================================================");
printf("\n============================================================\n");
printf(" FINAL PERFORMANCE SUMMARY (runtime: %.0fs)\n", totalElapsed);
printf(" %d cameras, %d AI tasks\n", NUM_STREAMS, NUM_TASKS);
printf("============================================================\n");
double totalFpsFinal = 0;
for (int i = 0; i < 4; i++) {
for (int i = 0; i < NUM_TASKS; i++) {
char buf[512];
snprintf(buf, sizeof(buf),
" Task %d: GPU[%d] | VRAM=%zuMiB | %d frames, %d detections, FPS=%.1f, InfMs=%.0f",
i, taskStates[i].gpuDeviceId,
" Task %d (Stream %d): GPU[%d] | VRAM=%zuMiB | %d frames, %d detections, FPS=%.1f, InfMs=%.0f",
i, taskStreamMap[i], taskStates[i].gpuDeviceId,
taskStates[i].vramUsedBytes / (1024 * 1024),
taskStates[i].frameCount, taskStates[i].detectionCount,
taskStates[i].fps, taskStates[i].inferenceMs);
@@ -1466,12 +1632,13 @@ int ANSLPR_MultiGPU_StressTest() {
// Multi-GPU verdict
std::set<int> finalGpusUsed;
for (int i = 0; i < 4; i++) {
for (int i = 0; i < NUM_TASKS; i++) {
if (taskStates[i].gpuDeviceId >= 0) finalGpusUsed.insert(taskStates[i].gpuDeviceId);
}
{
char buf[256];
snprintf(buf, sizeof(buf), " Total throughput: %.1f FPS across 4 tasks", totalFpsFinal);
snprintf(buf, sizeof(buf), " Total throughput: %.1f FPS across %d tasks (%d cameras)",
totalFpsFinal, NUM_TASKS, NUM_STREAMS);
printf("%s\n", buf);
g_log.add(buf);
}
@@ -1491,13 +1658,16 @@ int ANSLPR_MultiGPU_StressTest() {
g_log.add(" 3. No CUDA_VISIBLE_DEVICES env var restricting GPU access");
}
// Log shared-camera subscription info
g_log.add(" Camera subscription: Task 2 and Task 4 both subscribe to Stream 2 (nhathuocngoclinh)");
printf("============================================================\n");
g_log.add("================================================================");
g_log.add(" Log saved to: " + std::string(LOG_FILE_PATH));
g_log.add("================================================================");
// --- Release all handles (sequentially on main thread) ---
for (int i = 0; i < 4; i++) {
for (int i = 0; i < NUM_TASKS; i++) {
if (alprHandles[i]) {
ReleaseANSALPRHandle(&alprHandles[i]);
}
@@ -2770,9 +2940,9 @@ int main()
//for (int i = 0; i < 100; i++) {
// ANSLPR_CPU_Inferences_FileTest();
//}
//ANSLPR_MultiGPU_StressTest();
ANSLPR_MultiGPU_StressTest();
//ANSLPR_MultiGPU_StressTest_SimulatedCam();
ANSLPR_MultiGPU_StressTest_FilePlayer();
// ANSLPR_MultiGPU_StressTest_FilePlayer();
return 0;
}