Fix AMD and OpenVINO

2026-04-08 13:45:52 +10:00
parent a4a8caaa86
commit 69787b0ff0
15 changed files with 1209 additions and 132 deletions
--- a/tests/ANSLPR-UnitTest/ANSLPR-UnitTest.cpp
+++ b/tests/ANSLPR-UnitTest/ANSLPR-UnitTest.cpp
@@ -29,6 +29,7 @@
 #include <set>
 #include <map>
 #include <cuda_runtime.h>
+#include "EPLoader.h"

 template<typename T>
 T GetOptionalValue(const boost::property_tree::ptree& pt, std::string attribute, T defaultValue) {
@@ -664,9 +665,21 @@ struct GpuSnapshot {
    size_t      usedMiB  = 0;
 };

+// Safe check: can the NVIDIA driver library (nvcuda.dll) be loaded on this PC?
+// Callers use this as a guard before touching the CUDA runtime API, which
+// prevents a crash on CPU-only PCs.
+// Note: only the presence of the DLL is probed — a true result does not prove
+// that a usable device exists, so callers still check cudaGetDeviceCount().
+// The probe runs once; the result is cached for the lifetime of the process.
+static bool IsCudaAvailable() {
+    // Function-local static initialisation is guaranteed thread-safe since
+    // C++11, so concurrent first calls cannot race on the cached value (the
+    // previous hand-rolled `static int cached` had no synchronisation).
+    static const bool available = [] {
+        HMODULE h = LoadLibraryA("nvcuda.dll");
+        if (h == nullptr) return false;
+        FreeLibrary(h);  // Probe only — do not keep the module loaded.
+        return true;
+    }();
+    return available;
+}
+
 // Query current GPU VRAM usage for all devices
 static std::vector<GpuSnapshot> QueryGpuVram() {
    std::vector<GpuSnapshot> snapshots;
+    if (!IsCudaAvailable()) return snapshots;
    int deviceCount = 0;
    if (cudaGetDeviceCount(&deviceCount) != cudaSuccess) return snapshots;
    for (int i = 0; i < deviceCount; i++) {
@@ -693,6 +706,7 @@ static std::vector<GpuSnapshot> QueryGpuVram() {
 // Measure per-GPU free VRAM (returns array indexed by device)
 static std::vector<size_t> GetPerGpuFreeMiB() {
    std::vector<size_t> result;
+    if (!IsCudaAvailable()) return result;
    int deviceCount = 0;
    if (cudaGetDeviceCount(&deviceCount) != cudaSuccess) return result;
    int prevDevice;
@@ -712,6 +726,11 @@ static ThreadSafeLog     g_log;

 // Log GPU info using CUDA runtime
 static void LogGpuInfo() {
+    if (!IsCudaAvailable()) {
+        g_log.add("No NVIDIA GPU detected — running in CPU mode");
+        printf("[GPU] No NVIDIA GPU detected — running in CPU mode\n");
+        return;
+    }
    int deviceCount = 0;
    cudaError_t err = cudaGetDeviceCount(&deviceCount);
    if (err != cudaSuccess) {
@@ -749,6 +768,12 @@ static void LogGpuInfo() {
    printf("============================================================\n");
 }

+// Global inference mutex: serializes inference on non-NVIDIA GPUs (DirectML/OpenVINO).
+// DirectML is not thread-safe when multiple ORT sessions run concurrently on the
+// same integrated GPU — causes access violations on 4K frames.
+// On NVIDIA, each task has its own CUDA context so no serialization needed.
+// Usage: ALPRWorkerThread takes this lock around ANSALPR_RunInferenceComplete_CPP
+// only when EPLoader::Current().type is not EngineType::NVIDIA_GPU; on NVIDIA the
+// lock object is created deferred and never acquired.
+static std::mutex g_inferenceMutex;
+
 // Worker thread: reads RTSP frames and runs ALPR inference
 // RTSP client and ALPR engine are pre-created on the main thread to avoid
 // race conditions in CreateANSRTSPHandle / CreateANSALPRHandle.
@@ -845,12 +870,18 @@ static void ALPRWorkerThread(int taskId,
        if (grabMs > maxGrabMs) maxGrabMs = grabMs;

        // Run ALPR inference
+        bool isNvidia = (ANSCENTER::EPLoader::Current().type == ANSCENTER::EngineType::NVIDIA_GPU);
+        fprintf(stderr, "[Worker T%d] frame %d: calling inference %dx%d...\n",
+            taskId, state.frameCount + 1, framePtr->cols, framePtr->rows);
        auto infStart = std::chrono::steady_clock::now();
        std::string lpnResult, jpegImage;
-        // Pass framePtr directly — NOT a copy. ANSGpuFrameRegistry::lookup()
-        // matches by cv::Mat* pointer, so `new cv::Mat(*framePtr)` would create
-        // a different pointer the registry doesn't know, breaking NV12 zero-copy.
-        ANSALPR_RunInferenceComplete_CPP(&alprHandle, &framePtr, cameraId.c_str(), 0, 0, lpnResult, jpegImage);
+        {
+            std::unique_lock<std::mutex> infLock(g_inferenceMutex, std::defer_lock);
+            if (!isNvidia) infLock.lock();
+            ANSALPR_RunInferenceComplete_CPP(&alprHandle, &framePtr, cameraId.c_str(), 0, 0, lpnResult, jpegImage);
+        }
+        fprintf(stderr, "[Worker T%d] frame %d: inference done, result len=%zu\n",
+            taskId, state.frameCount + 1, lpnResult.size());

        // Release stream lock — inference is done, CHAOS can now safely destroy.
        streamLock.unlock();
@@ -950,25 +981,454 @@ static void ALPRWorkerThread(int taskId,
    g_log.add(prefix + " Worker loop exited");
 }

+// =============================================================================
+//  ANSLPR_SingleTask_Test — 1 stream, 1 AI task. For isolating DirectML/ORT
+//  issues on non-NVIDIA GPUs. If this works but 2-task crashes, it's concurrency.
+//
+//  Flow: open one RTSP stream (software decode), create and load one ALPR
+//  engine, run ALPRWorkerThread on it, and show a live preview with FPS /
+//  inference stats until ESC is pressed or g_running is cleared.
+//
+//  Returns 0 after a normal run, -1 if the RTSP handle, the ALPR handle or the
+//  engine load fails. Every exit path (including the failures) records the
+//  outcome in the log, closes the log and deinitialises the camera network.
+// =============================================================================
+int ANSLPR_SingleTask_Test() {
+    ANSCENTER::ANSOPENCV::InitCameraNetwork();
+    g_log.init();
+
+    printf("\n");
+    printf("============================================================\n");
+    printf("  ANSLPR Single-Task Test — 1 Stream, 1 AI Task\n");
+    printf("  Press ESC to stop\n");
+    printf("  Log file: %s\n", LOG_FILE_PATH);
+    printf("============================================================\n\n");
+
+    g_log.add("============================================================");
+    g_log.add("  ANSLPR Single-Task Test — 1 Stream, 1 AI Task");
+    g_log.add("============================================================");
+
+    // NOTE(review): the URL embeds credentials and is written verbatim to the
+    // log file below — consider moving it to configuration.
+    const std::string streamUrl = "rtsp://admin:admin123@103.156.0.133:8010/cam/realmonitor?channel=1&subtype=0";
+    g_log.add("Stream: " + streamUrl);
+
+    // --- Create RTSP client ---
+    ANSCENTER::ANSRTSPClient* rtspClient = nullptr;
+    printf("[Stream0] Creating RTSP handle...\n");
+    int rtspResult = CreateANSRTSPHandle(&rtspClient, "", "", "", streamUrl.c_str());
+    if (rtspResult != 1 || rtspClient == nullptr) {
+        printf("[Stream0] FAILED to create RTSP handle\n");
+        g_log.add("[Stream0] RTSP create FAILED");
+        g_log.close();  // g_log.init() already ran — close it on early exit too
+        ANSCENTER::ANSOPENCV::DeinitCameraNetwork();
+        return -1;
+    }
+    SetRTSPImageQuality(&rtspClient, 0);
+    SetRTSPHWDecoding(&rtspClient, -1);  // Force software decoding
+    StartRTSP(&rtspClient);
+    g_log.add("[Stream0] RTSP started (software decode)");
+
+    // --- Create single ALPR engine ---
+    ANSCENTER::ANSALPR* alprHandle = nullptr;
+    std::string modelZipFile = "C:\\ProgramData\\ANSCENTER\\ANSVIS Server\\ANSALPR\\ANS_ALPR_v1.2.zip";
+    printf("[Task0] Creating ALPR handle...\n");
+    // Timer spans handle creation + engine load; reported as "Engine loaded in".
+    auto engineStart = std::chrono::steady_clock::now();
+    int createResult = CreateANSALPRHandle(&alprHandle, "", modelZipFile.c_str(), "",
+                                            1, 0.5, 0.5, 0.5);
+    if (createResult != 1 || alprHandle == nullptr) {
+        printf("[Task0] FAILED to create ALPR handle (result=%d)\n", createResult);
+        g_log.add("[Task0] ALPR create FAILED");
+        StopRTSP(&rtspClient); ReleaseANSRTSPHandle(&rtspClient);
+        g_log.close();
+        ANSCENTER::ANSOPENCV::DeinitCameraNetwork();
+        return -1;
+    }
+
+    printf("[Task0] Loading ALPR engine...\n");
+    int loadResult = LoadANSALPREngineHandle(&alprHandle);
+    auto engineEnd = std::chrono::steady_clock::now();
+    double loadMs = std::chrono::duration<double, std::milli>(engineEnd - engineStart).count();
+    if (loadResult != 1) {
+        printf("[Task0] FAILED to load ALPR engine (result=%d)\n", loadResult);
+        g_log.add("[Task0] Engine load FAILED");
+        ReleaseANSALPRHandle(&alprHandle);
+        StopRTSP(&rtspClient); ReleaseANSRTSPHandle(&rtspClient);
+        g_log.close();
+        ANSCENTER::ANSOPENCV::DeinitCameraNetwork();
+        return -1;
+    }
+    printf("[Task0] Engine loaded in %.0f ms\n", loadMs);
+    g_log.add("[Task0] Engine loaded in " + std::to_string((int)loadMs) + " ms");
+
+    // --- Single-task worker + display ---
+    // State is written here without locking because the worker has not started yet.
+    TaskState state;
+    state.engineLoaded = true;
+    state.streamOk = true;
+    state.statusMsg = "Running";
+
+    std::mutex streamGuard;
+    std::thread worker(ALPRWorkerThread, 0, &rtspClient, &streamGuard, alprHandle, std::ref(state));
+
+    const int cellW = 800, cellH = 600;
+    const int logPanelH = 80;
+    std::string windowTitle = "ANSLPR Single-Task Test";
+    cv::namedWindow(windowTitle, cv::WINDOW_NORMAL);
+    cv::resizeWindow(windowTitle, cellW, cellH + logPanelH);
+
+    auto testStart = std::chrono::steady_clock::now();
+
+    while (g_running.load()) {
+        cv::Mat canvas(cellH + logPanelH, cellW, CV_8UC3, cv::Scalar(30, 30, 30));
+
+        // Snapshot the worker's state under its mutex, then draw without the lock.
+        cv::Mat cell;
+        double fps = 0, infMs = 0;
+        int fCount = 0, dCount = 0;
+        std::string lastPlate;
+        {
+            std::lock_guard<std::mutex> lk(state.mtx);
+            if (!state.displayFrame.empty())
+                cv::resize(state.displayFrame, cell, cv::Size(cellW, cellH));
+            fps = state.fps;
+            infMs = state.inferenceMs;
+            fCount = state.frameCount;
+            dCount = state.detectionCount;
+            lastPlate = state.lastPlate;
+        }
+
+        // No frame yet: show a plain dark placeholder.
+        if (cell.empty())
+            cell = cv::Mat(cellH, cellW, CV_8UC3, cv::Scalar(40, 40, 40));
+
+        // Status bar across the bottom of the video cell.
+        cv::rectangle(cell, cv::Rect(0, cellH - 40, cellW, 40), cv::Scalar(0, 0, 0), cv::FILLED);
+        char bar[256];
+        snprintf(bar, sizeof(bar), "T0 | %.1f FPS | %.0fms | F:%d | D:%d | %s",
+                 fps, infMs, fCount, dCount, lastPlate.empty() ? "-" : lastPlate.c_str());
+        cv::putText(cell, bar, cv::Point(5, cellH - 12),
+                    cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 255, 0), 1);
+        cell.copyTo(canvas(cv::Rect(0, 0, cellW, cellH)));
+
+        // Header panel below the video cell.
+        cv::Mat logPanel = canvas(cv::Rect(0, cellH, cellW, logPanelH));
+        logPanel.setTo(cv::Scalar(20, 20, 20));
+        auto elapsed = std::chrono::duration<double>(std::chrono::steady_clock::now() - testStart).count();
+        char header[256];
+        snprintf(header, sizeof(header), "Elapsed: %.0fs | 1 camera, 1 AI task | %.1f FPS | Press ESC to stop",
+                 elapsed, fps);
+        cv::putText(logPanel, header, cv::Point(10, 20),
+                    cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(200, 200, 0), 1);
+
+        cv::imshow(windowTitle, canvas);
+        if (cv::waitKey(30) == 27) {  // 27 = ESC
+            g_log.add("ESC pressed — stopping...");
+            printf("\nESC pressed — stopping...\n");
+            g_running.store(false);
+        }
+    }
+
+    if (worker.joinable()) worker.join();
+
+    // The worker has been joined, so state can be read without the mutex.
+    printf("\n============================================================\n");
+    printf("  FINAL SUMMARY\n");
+    printf("  Frames: %d | Detections: %d | FPS: %.1f | InfMs: %.0f\n",
+        state.frameCount, state.detectionCount, state.fps, state.inferenceMs);
+    printf("============================================================\n");
+
+    ReleaseANSALPRHandle(&alprHandle);
+    StopRTSP(&rtspClient);
+    ReleaseANSRTSPHandle(&rtspClient);
+    g_log.close();
+    cv::destroyAllWindows();
+    ANSCENTER::ANSOPENCV::DeinitCameraNetwork();
+    return 0;
+}
+
+// =============================================================================
+//  ANSLPR_CPU_StressTest — Lightweight 2-task stress test for CPU-only PCs
+//  Uses ANSALPR_OD (engineType=1) which auto-falls-back to ONNX Runtime on CPU.
+//  No VRAM tracking, no NVDEC alignment, no chaos thread.
+//
+//  Flow: open NUM_STREAMS RTSP streams (software decode), create and load one
+//  ALPR engine per task, run one ALPRWorkerThread per task that has both a
+//  stream and an engine, and render a 2x1 grid plus a log panel until ESC is
+//  pressed or g_running is cleared. A task whose stream or engine could not be
+//  set up is skipped and its grid cell shows the reason.
+//
+//  Always returns 0; setup failures are reported via printf / g_log.
+// =============================================================================
+int ANSLPR_CPU_StressTest() {
+    ANSCENTER::ANSOPENCV::InitCameraNetwork();
+    g_log.init();
+
+    const int NUM_STREAMS = 2;
+    const int NUM_TASKS   = 2;
+
+    printf("\n");
+    printf("============================================================\n");
+    printf("  ANSLPR CPU Stress Test — %d Parallel ALPR Tasks\n", NUM_TASKS);
+    printf("  Press ESC to stop\n");
+    printf("  Log file: %s\n", LOG_FILE_PATH);
+    printf("============================================================\n\n");
+
+    g_log.add("============================================================");
+    g_log.add("  ANSLPR CPU Stress Test — " + std::to_string(NUM_TASKS) + " Tasks");
+    g_log.add("============================================================");
+
+    // --- RTSP URLs (2 camera streams) ---
+    // NOTE(review): the first URL embeds credentials and is written verbatim to
+    // the log file below — consider moving it to configuration.
+    const std::string streamUrls[NUM_STREAMS] = {
+        "rtsp://admin:admin123@103.156.0.133:8010/cam/realmonitor?channel=1&subtype=0",
+        "rtsp://nhathuocngoclinh.zapto.org:600/rtsp/streaming?channel=01&subtype=0"
+    };
+    // taskStreamMap[i] = index of the stream consumed by task i.
+    const int taskStreamMap[NUM_TASKS] = { 0, 1 };
+
+    for (int i = 0; i < NUM_STREAMS; i++)
+        g_log.add("Stream " + std::to_string(i) + ": " + streamUrls[i]);
+
+    // --- Task states ---
+    TaskState taskStates[NUM_TASKS];
+
+    // --- Create RTSP clients (software decoding) ---
+    ANSCENTER::ANSRTSPClient* rtspClients[NUM_STREAMS] = {};
+    for (int s = 0; s < NUM_STREAMS; s++) {
+        printf("[Stream%d] Creating RTSP handle...\n", s);
+        int result = CreateANSRTSPHandle(&rtspClients[s], "", "", "", streamUrls[s].c_str());
+        if (result != 1 || rtspClients[s] == nullptr) {
+            printf("[Stream%d] FAILED to create RTSP handle\n", s);
+            g_log.add("[Stream" + std::to_string(s) + "] RTSP create FAILED");
+            rtspClients[s] = nullptr;
+            continue;
+        }
+        SetRTSPImageQuality(&rtspClients[s], 0);
+        SetRTSPHWDecoding(&rtspClients[s], -1);  // HW_DECODING_DISABLE: force software decoding
+        StartRTSP(&rtspClients[s]);
+        g_log.add("[Stream" + std::to_string(s) + "] RTSP started (software decode)");
+    }
+
+    // --- Create ALPR engines (engineType=1 → ANSALPR_OD, auto CPU/GPU) ---
+    ANSCENTER::ANSALPR* alprHandles[NUM_TASKS] = {};
+    std::string modelZipFile = "C:\\ProgramData\\ANSCENTER\\ANSVIS Server\\ANSALPR\\ANS_ALPR_v1.2.zip";
+    int engineType = 1; // ANSALPR_OD: auto CPU/GPU
+    double detThresh = 0.5, ocrThresh = 0.5, colThresh = 0.5;
+
+    // Records a status text for a task; the display loop prints it in the
+    // task's grid cell while no frame is available, so a failed or skipped task
+    // must not be left showing a stale "Loading..." message.
+    auto setStatus = [&taskStates](int idx, const char* msg) {
+        std::lock_guard<std::mutex> lk(taskStates[idx].mtx);
+        taskStates[idx].statusMsg = msg;
+    };
+
+    for (int i = 0; i < NUM_TASKS; i++) {
+        char tag[32];
+        snprintf(tag, sizeof(tag), "[Task%d]", i);
+        int streamIdx = taskStreamMap[i];
+        if (rtspClients[streamIdx] == nullptr) {
+            printf("%s Skipped — Stream%d not available\n", tag, streamIdx);
+            setStatus(i, "Stream not available");
+            continue;
+        }
+
+        {
+            std::lock_guard<std::mutex> lk(taskStates[i].mtx);
+            taskStates[i].streamOk = true;
+            taskStates[i].statusMsg = "Loading ALPR engine...";
+        }
+
+        printf("%s Creating ALPR handle...\n", tag);
+        // Timer spans handle creation + engine load.
+        auto engineStart = std::chrono::steady_clock::now();
+        int createResult = CreateANSALPRHandle(&alprHandles[i], "", modelZipFile.c_str(), "",
+                                                engineType, detThresh, ocrThresh, colThresh);
+        if (createResult != 1 || alprHandles[i] == nullptr) {
+            printf("%s FAILED to create ALPR handle (result=%d)\n", tag, createResult);
+            g_log.add(std::string(tag) + " ALPR create FAILED");
+            // A failed create may still have produced a handle. Drop it now so
+            // the launch check below cannot start a worker on an engine that
+            // was never loaded (the final cleanup would have released it anyway).
+            if (alprHandles[i]) {
+                ReleaseANSALPRHandle(&alprHandles[i]);
+                alprHandles[i] = nullptr;
+            }
+            setStatus(i, "ALPR create FAILED");
+            continue;
+        }
+
+        printf("%s Loading ALPR engine...\n", tag);
+        int loadResult = LoadANSALPREngineHandle(&alprHandles[i]);
+        auto engineEnd = std::chrono::steady_clock::now();
+        double loadMs = std::chrono::duration<double, std::milli>(engineEnd - engineStart).count();
+
+        if (loadResult != 1) {
+            printf("%s FAILED to load ALPR engine (result=%d)\n", tag, loadResult);
+            g_log.add(std::string(tag) + " Engine load FAILED");
+            ReleaseANSALPRHandle(&alprHandles[i]);
+            alprHandles[i] = nullptr;
+            setStatus(i, "Engine load FAILED");
+            continue;
+        }
+
+        char buf[256];
+        snprintf(buf, sizeof(buf), "%s Engine loaded in %.0f ms (Stream%d)", tag, loadMs, streamIdx);
+        printf("%s\n", buf);
+        g_log.add(buf);
+
+        {
+            std::lock_guard<std::mutex> lk(taskStates[i].mtx);
+            taskStates[i].engineLoaded = true;
+            taskStates[i].statusMsg = "Running";
+        }
+    }
+
+    // --- Launch worker threads ---
+    // Only tasks with both a live stream and a loaded engine get a worker;
+    // the others keep a default-constructed (non-joinable) std::thread.
+    std::mutex streamGuards[NUM_STREAMS];
+    std::thread workers[NUM_TASKS];
+    for (int i = 0; i < NUM_TASKS; i++) {
+        int streamIdx = taskStreamMap[i];
+        if (rtspClients[streamIdx] && alprHandles[i]) {
+            workers[i] = std::thread(ALPRWorkerThread, i,
+                                     &rtspClients[streamIdx],
+                                     &streamGuards[streamIdx],
+                                     alprHandles[i],
+                                     std::ref(taskStates[i]));
+        }
+    }
+
+    // --- Display loop ---
+    const int cellW = 640, cellH = 480;
+    const int logPanelH = 120;
+    const int gridCols = 2, gridRows = 1;
+    std::string windowTitle = "ANSLPR CPU Stress Test";
+    cv::namedWindow(windowTitle, cv::WINDOW_NORMAL);
+    cv::resizeWindow(windowTitle, cellW * gridCols, cellH * gridRows + logPanelH);
+
+    auto testStart = std::chrono::steady_clock::now();
+
+    while (g_running.load()) {
+        cv::Mat canvas(cellH * gridRows + logPanelH, cellW * gridCols, CV_8UC3, cv::Scalar(30, 30, 30));
+
+        for (int i = 0; i < NUM_TASKS; i++) {
+            int col = i % gridCols, row = i / gridCols;
+            cv::Rect roi(col * cellW, row * cellH, cellW, cellH);
+
+            // Snapshot the task's state under its mutex, then draw without the lock.
+            cv::Mat cell;
+            double fps = 0, infMs = 0;
+            int fCount = 0, dCount = 0;
+            std::string statusMsg, lastPlate;
+            bool engineLoaded = false;
+            {
+                std::lock_guard<std::mutex> lk(taskStates[i].mtx);
+                if (!taskStates[i].displayFrame.empty())
+                    cv::resize(taskStates[i].displayFrame, cell, cv::Size(cellW, cellH));
+                fps = taskStates[i].fps;
+                infMs = taskStates[i].inferenceMs;
+                fCount = taskStates[i].frameCount;
+                dCount = taskStates[i].detectionCount;
+                statusMsg = taskStates[i].statusMsg;
+                lastPlate = taskStates[i].lastPlate;
+                engineLoaded = taskStates[i].engineLoaded;
+            }
+
+            // No frame yet (or task never started): show the status text instead.
+            if (cell.empty()) {
+                cell = cv::Mat(cellH, cellW, CV_8UC3, cv::Scalar(40, 40, 40));
+                cv::putText(cell, "Task " + std::to_string(i) + ": " + statusMsg,
+                            cv::Point(20, cellH / 2),
+                            cv::FONT_HERSHEY_SIMPLEX, 0.8, cv::Scalar(100, 100, 255), 2);
+            }
+
+            // Status bar
+            cv::rectangle(cell, cv::Rect(0, cellH - 40, cellW, 40), cv::Scalar(0, 0, 0), cv::FILLED);
+            char bar[256];
+            snprintf(bar, sizeof(bar), "T%d(S%d) | %.1f FPS | %.0fms | F:%d | D:%d | %s",
+                     i, taskStreamMap[i], fps, infMs, fCount, dCount,
+                     lastPlate.empty() ? "-" : lastPlate.c_str());
+            // Green text when the engine is loaded, orange otherwise.
+            cv::Scalar barColor = engineLoaded ? cv::Scalar(0, 255, 0) : cv::Scalar(0, 100, 255);
+            cv::putText(cell, bar, cv::Point(5, cellH - 12),
+                        cv::FONT_HERSHEY_SIMPLEX, 0.45, barColor, 1);
+            cell.copyTo(canvas(roi));
+        }
+
+        // Grid line
+        if (gridCols > 1)
+            cv::line(canvas, cv::Point(cellW, 0), cv::Point(cellW, cellH * gridRows),
+                     cv::Scalar(100, 100, 100), 1);
+
+        // Log panel
+        cv::Rect logRoi(0, cellH * gridRows, cellW * gridCols, logPanelH);
+        cv::Mat logPanel = canvas(logRoi);
+        logPanel.setTo(cv::Scalar(20, 20, 20));
+
+        auto elapsed = std::chrono::duration<double>(std::chrono::steady_clock::now() - testStart).count();
+        double totalFps = 0;
+        for (int i = 0; i < NUM_TASKS; i++) {
+            std::lock_guard<std::mutex> lk(taskStates[i].mtx);
+            totalFps += taskStates[i].fps;
+        }
+        char header[256];
+        snprintf(header, sizeof(header),
+                 "Elapsed: %.0fs | %d cameras, %d AI tasks | Total: %.1f FPS | Press ESC to stop",
+                 elapsed, NUM_STREAMS, NUM_TASKS, totalFps);
+        cv::putText(logPanel, header, cv::Point(10, 20),
+                    cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(200, 200, 0), 1);
+
+        // Per-task summary
+        for (int i = 0; i < NUM_TASKS; i++) {
+            std::lock_guard<std::mutex> lk(taskStates[i].mtx);
+            char tLine[256];
+            snprintf(tLine, sizeof(tLine),
+                "T%d(S%d): FPS=%.1f Inf=%.0fms Frames=%d Det=%d",
+                i, taskStreamMap[i], taskStates[i].fps, taskStates[i].inferenceMs,
+                taskStates[i].frameCount, taskStates[i].detectionCount);
+            cv::putText(logPanel, tLine, cv::Point(10, 42 + i * 18),
+                        cv::FONT_HERSHEY_SIMPLEX, 0.4, cv::Scalar(200, 200, 200), 1);
+        }
+
+        // Recent log
+        auto recentLogs = g_log.getRecent(3);
+        int logY = 42 + NUM_TASKS * 18 + 5;
+        for (const auto& line : recentLogs) {
+            if (logY > logPanelH - 5) break;  // stop before drawing past the panel
+            // Truncate long entries to 120 characters for display.
+            std::string display = (line.size() > 120) ? line.substr(0, 117) + "..." : line;
+            cv::putText(logPanel, display, cv::Point(10, logY),
+                        cv::FONT_HERSHEY_PLAIN, 1.0, cv::Scalar(140, 140, 140), 1);
+            logY += 15;
+        }
+
+        cv::imshow(windowTitle, canvas);
+        int key = cv::waitKey(30);
+        if (key == 27) {  // 27 = ESC
+            g_log.add("ESC pressed — stopping...");
+            printf("\nESC pressed — stopping...\n");
+            g_running.store(false);
+        }
+    }
+
+    // --- Wait for workers ---
+    for (int i = 0; i < NUM_TASKS; i++) {
+        if (workers[i].joinable()) workers[i].join();
+    }
+
+    // --- Final summary ---
+    // All workers have been joined, so task state is read without locking.
+    double totalElapsed = std::chrono::duration<double>(
+        std::chrono::steady_clock::now() - testStart).count();
+    printf("\n============================================================\n");
+    printf("  FINAL SUMMARY (runtime: %.0fs)\n", totalElapsed);
+    printf("============================================================\n");
+    double totalFpsFinal = 0;
+    for (int i = 0; i < NUM_TASKS; i++) {
+        char buf[256];
+        snprintf(buf, sizeof(buf), "  Task %d (Stream %d): %d frames, %d detections, FPS=%.1f, InfMs=%.0f",
+            i, taskStreamMap[i], taskStates[i].frameCount, taskStates[i].detectionCount,
+            taskStates[i].fps, taskStates[i].inferenceMs);
+        printf("%s\n", buf);
+        g_log.add(buf);
+        totalFpsFinal += taskStates[i].fps;
+    }
+    printf("  Total throughput: %.1f FPS\n", totalFpsFinal);
+    printf("============================================================\n");
+
+    // --- Cleanup ---
+    for (int i = 0; i < NUM_TASKS; i++) {
+        if (alprHandles[i]) ReleaseANSALPRHandle(&alprHandles[i]);
+    }
+    for (int s = 0; s < NUM_STREAMS; s++) {
+        if (rtspClients[s]) {
+            StopRTSP(&rtspClients[s]);
+            ReleaseANSRTSPHandle(&rtspClients[s]);
+        }
+    }
+
+    g_log.close();
+    cv::destroyAllWindows();
+    ANSCENTER::ANSOPENCV::DeinitCameraNetwork();
+    return 0;
+}
+
 int ANSLPR_MultiGPU_StressTest() {
    ANSCENTER::ANSOPENCV::InitCameraNetwork();

    // --- Initialize log file ---
    g_log.init();

+    printf("\n");
+    // --- Auto-detect GPU availability (safe on CPU-only PCs without CUDA runtime) ---
+    int gpuCount = 0;
+    bool hasGpu = false;
+    if (IsCudaAvailable()) {
+        cudaGetDeviceCount(&gpuCount);
+        hasGpu = (gpuCount > 0);
+    }
+    const char* modeStr = hasGpu ? "GPU (NVIDIA CUDA)" : "CPU (Software Decoding)";
+
    printf("\n");
    printf("============================================================\n");
-    printf("  ANSLPR Multi-GPU Stress Test — 5 Parallel ALPR Tasks\n");
+    printf("  ANSLPR Multi-Engine Stress Test — 5 Parallel ALPR Tasks\n");
+    printf("  Mode: %s\n", modeStr);
    printf("  (4 cameras, 5 AI tasks — Task 4 shares Stream 2)\n");
    printf("  Press ESC to stop\n");
    printf("  Log file: %s\n", LOG_FILE_PATH);
    printf("============================================================\n\n");

    g_log.add("============================================================");
-    g_log.add("  ANSLPR Multi-GPU Stress Test — 5 Parallel ALPR Tasks");
+    g_log.add("  ANSLPR Multi-Engine Stress Test — 5 Parallel ALPR Tasks");
+    g_log.add("  Mode: " + std::string(modeStr));
    g_log.add("============================================================");

-    // --- Log GPU info for diagnostics ---
+    // --- Log GPU info for diagnostics (safe on CPU — prints "no GPU found") ---
    LogGpuInfo();

    // --- RTSP URLs (4 independent camera streams) ---
@@ -1027,7 +1487,7 @@ int ANSLPR_MultiGPU_StressTest() {
            continue;
        }
        SetRTSPImageQuality(&rtspClients[s], 0);
-        SetRTSPHWDecoding(&rtspClients[s], 7);  // HW_DECODING_CUDA: force CUDA/NVDEC zero-copy path
+        if (hasGpu) SetRTSPHWDecoding(&rtspClients[s], 7);  // CUDA HW decode only with GPU
        StartRTSP(&rtspClients[s]);
        g_log.add("[Stream" + std::to_string(s) + "] RTSP started");
    }
@@ -1040,7 +1500,7 @@ int ANSLPR_MultiGPU_StressTest() {
    // =========================================================================
    ANSCENTER::ANSALPR* alprHandles[NUM_TASKS] = {};
    std::string modelZipFile = "C:\\ProgramData\\ANSCENTER\\ANSVIS Server\\ANSALPR\\ANS_ALPR_v1.2.zip";
-    int engineType = 1; // NVIDIA_GPU
+    int engineType = 1; // ANSALPR_OD: auto-detects GPU/CPU, uses ONNX Runtime on CPU
    double detThresh = 0.5, ocrThresh = 0.5, colThresh = 0.5;

    for (int i = 0; i < NUM_TASKS; i++) {
@@ -1074,11 +1534,12 @@ int ANSLPR_MultiGPU_StressTest() {
            continue;
        }

-        printf("%s Loading ALPR engine (TensorRT)...\n", tag);
+        printf("%s Loading ALPR engine (%s)...\n", tag, hasGpu ? "TensorRT" : "CPU");
        g_log.add(std::string(tag) + " Loading ALPR engine...");

-        // Snapshot VRAM before engine load to measure consumption
-        auto vramBefore = GetPerGpuFreeMiB();
+        // Snapshot VRAM before engine load to measure consumption (GPU only)
+        std::vector<size_t> vramBefore;
+        if (hasGpu) vramBefore = GetPerGpuFreeMiB();

        int loadResult = LoadANSALPREngineHandle(&alprHandles[i]);
        auto engineEnd = std::chrono::steady_clock::now();
@@ -1094,40 +1555,47 @@ int ANSLPR_MultiGPU_StressTest() {
            continue;
        }

-        // Snapshot VRAM after engine load — find which GPU lost the most VRAM
-        auto vramAfter = GetPerGpuFreeMiB();
-        int bestGpu = 0;
+        int bestGpu = -1;
        size_t maxDelta = 0;
-        size_t gpuCount = vramBefore.size() < vramAfter.size() ? vramBefore.size() : vramAfter.size();
-        for (size_t g = 0; g < gpuCount; g++) {
-            size_t delta = (vramBefore[g] > vramAfter[g]) ? (vramBefore[g] - vramAfter[g]) : 0;
-            if (delta > maxDelta) {
-                maxDelta = delta;
-                bestGpu = (int)g;
+
+        if (hasGpu) {
+            // Snapshot VRAM after engine load — find which GPU lost the most VRAM
+            auto vramAfter = GetPerGpuFreeMiB();
+            size_t gpuCnt = vramBefore.size() < vramAfter.size() ? vramBefore.size() : vramAfter.size();
+            bestGpu = 0;
+            for (size_t g = 0; g < gpuCnt; g++) {
+                size_t delta = (vramBefore[g] > vramAfter[g]) ? (vramBefore[g] - vramAfter[g]) : 0;
+                if (delta > maxDelta) {
+                    maxDelta = delta;
+                    bestGpu = (int)g;
+                }
            }
-        }

-        char buf[512];
-        snprintf(buf, sizeof(buf),
-            "%s Engine loaded in %.0f ms | GPU[%d] | VRAM used: %zu MiB (Stream%d)",
-            tag, loadMs, bestGpu, maxDelta, streamIdx);
-        printf("%s\n", buf);
-        g_log.add(buf);
+            char buf[512];
+            snprintf(buf, sizeof(buf),
+                "%s Engine loaded in %.0f ms | GPU[%d] | VRAM used: %zu MiB (Stream%d)",
+                tag, loadMs, bestGpu, maxDelta, streamIdx);
+            printf("%s\n", buf);
+            g_log.add(buf);

-        // Log per-GPU VRAM state after this engine load
-        for (size_t g = 0; g < vramAfter.size(); g++) {
-            size_t total = 0;
-            if (g < vramBefore.size()) {
-                // Compute total from free + used
+            // Log per-GPU VRAM state after this engine load
+            for (size_t g = 0; g < vramAfter.size(); g++) {
+                size_t total = 0;
                auto gpus = QueryGpuVram();
                if (g < gpus.size()) total = gpus[g].totalMiB;
+                char vbuf[256];
+                snprintf(vbuf, sizeof(vbuf),
+                    "  GPU[%zu] VRAM: %zu MiB free (of %zu MiB)",
+                    g, vramAfter[g], total);
+                printf("%s\n", vbuf);
+                g_log.add(vbuf);
            }
-            char vbuf[256];
-            snprintf(vbuf, sizeof(vbuf),
-                "  GPU[%zu] VRAM: %zu MiB free (of %zu MiB)",
-                g, vramAfter[g], total);
-            printf("%s\n", vbuf);
-            g_log.add(vbuf);
+        } else {
+            char buf[256];
+            snprintf(buf, sizeof(buf), "%s Engine loaded in %.0f ms (CPU mode, Stream%d)",
+                tag, loadMs, streamIdx);
+            printf("%s\n", buf);
+            g_log.add(buf);
        }

        {
@@ -1140,6 +1608,8 @@ int ANSLPR_MultiGPU_StressTest() {
    }

    // --- Align NVDEC decode GPU with inference GPU for NV12 zero-copy ---
+    // (GPU only — software decoding on CPU doesn't use NVDEC)
+    if (hasGpu)
    // Each stream should decode on the same GPU as its inference engine to enable
    // direct NVDEC→TensorRT zero-copy (0.5ms vs 17ms preprocess per frame).
    //
@@ -1343,7 +1813,7 @@ int ANSLPR_MultiGPU_StressTest() {
                    streamUrls[streamIdx].c_str());
                if (result == 1 && rtspClients[streamIdx]) {
                    SetRTSPImageQuality(&rtspClients[streamIdx], 0);
-                    SetRTSPHWDecoding(&rtspClients[streamIdx], 7);
+                    if (hasGpu) SetRTSPHWDecoding(&rtspClients[streamIdx], 7);
                    StartRTSP(&rtspClients[streamIdx]);

                    auto chaosEnd = std::chrono::steady_clock::now();
@@ -1368,8 +1838,9 @@ int ANSLPR_MultiGPU_StressTest() {
    const int cellW = 480, cellH = 360;  // Smaller cells for 3-column layout
    const int logPanelH = 220;
    const int gridCols = 3, gridRows = 2;
-    cv::namedWindow("ANSLPR Multi-GPU Stress Test", cv::WINDOW_NORMAL);
-    cv::resizeWindow("ANSLPR Multi-GPU Stress Test", cellW * gridCols, cellH * gridRows + logPanelH);
+    std::string windowTitle = hasGpu ? "ANSLPR Multi-GPU Stress Test" : "ANSLPR CPU Stress Test";
+    cv::namedWindow(windowTitle, cv::WINDOW_NORMAL);
+    cv::resizeWindow(windowTitle, cellW * gridCols, cellH * gridRows + logPanelH);

    auto testStart = std::chrono::steady_clock::now();
    auto lastGpuSnapshot = std::chrono::steady_clock::now();
@@ -1468,7 +1939,9 @@ int ANSLPR_MultiGPU_StressTest() {
            snprintf(bar1, sizeof(bar1), "T%d(S%d) | %.1f FPS | %.0fms | F:%d | D:%d | %s",
                     i, taskStreamMap[i], fps, infMs, fCount, dCount,
                     lastPlate.empty() ? "-" : lastPlate.c_str());
-            if (gpuId >= 0) {
+            if (!hasGpu) {
+                snprintf(bar2, sizeof(bar2), "CPU mode (software decoding)");
+            } else if (gpuId >= 0) {
                snprintf(bar2, sizeof(bar2), "GPU[%d] | VRAM: %zu MiB", gpuId, vramMiB);
            } else {
                snprintf(bar2, sizeof(bar2), "GPU: N/A");
@@ -1572,7 +2045,7 @@ int ANSLPR_MultiGPU_StressTest() {
            gpuLineY += 15;
        }

-        cv::imshow("ANSLPR Multi-GPU Stress Test", canvas);
+        cv::imshow(windowTitle, canvas);
        int key = cv::waitKey(30);
        if (key == 27) { // ESC
            g_log.add("ESC pressed — stopping all tasks...");
@@ -2930,6 +3403,136 @@ int ANSLPR_MultiGPU_StressTest_FilePlayer() {
    return 0;
 }

+// ANSLPR_OD_CPU_VideoTest — runs ANSALPR_OD (engineType=1) on a video file for up to 200 frames.
+// ANSALPR_OD auto-detects hardware (OpenVINO on Intel, DirectML on AMD, etc.); no CUDA calls here.
+// Returns 0 on success; -1 handle creation, -2 engine load, -3 video open failed; -4 no frames read.
+int ANSLPR_OD_CPU_VideoTest() {
+    std::cout << "\n============================================================" << std::endl;
+    std::cout << "  ANSLPR CPU/iGPU Test (ANSALPR_OD with auto-detect)" << std::endl;
+    std::cout << "============================================================\n" << std::endl;
+
+    std::string modelZipFile = "C:\\ProgramData\\ANSCENTER\\ANSVIS Server\\ANSALPR\\ANS_ALPR_v1.2.zip";
+    std::string videoFilePath = "C:\\ProgramData\\ANSCENTER\\Shared\\classroom.mp4";
+
+    std::cout << "Model: " << modelZipFile << std::endl;
+    std::cout << "Video: " << videoFilePath << std::endl;
+
+    ANSCENTER::ANSALPR* infHandle = nullptr;
+    int engineType = 1; // ANSALPR_OD (auto-detects HW internally)
+    double detThresh = 0.5, ocrThresh = 0.5, colThresh = 0.5;
+
+    // Step 1: Create handle
+    std::cout << "[LPR-CPU] Step 1: Creating handle..." << std::endl;
+    int createResult = CreateANSALPRHandle(&infHandle, "", modelZipFile.c_str(), "",
+                                            engineType, detThresh, ocrThresh, colThresh);
+    std::cout << "[LPR-CPU] CreateANSALPRHandle result: " << createResult << std::endl;
+    if (createResult != 1 || infHandle == nullptr) {
+        std::cerr << "[LPR-CPU] FAILED: CreateANSALPRHandle returned " << createResult << std::endl;
+        return -1;
+    }
+
+    // Step 2: Load engine
+    std::cout << "[LPR-CPU] Step 2: Loading engine..." << std::endl;
+    int loadResult = LoadANSALPREngineHandle(&infHandle);
+    std::cout << "[LPR-CPU] LoadANSALPREngineHandle result: " << loadResult << std::endl;
+    if (loadResult != 1) {
+        std::cerr << "[LPR-CPU] FAILED: LoadANSALPREngineHandle returned " << loadResult << std::endl;
+        ReleaseANSALPRHandle(&infHandle);
+        return -2;
+    }
+
+    // Step 3: Open video
+    std::cout << "[LPR-CPU] Step 3: Opening video..." << std::endl;
+    cv::VideoCapture capture(videoFilePath);
+    if (!capture.isOpened()) {
+        std::cerr << "[LPR-CPU] FAILED: Could not open video: " << videoFilePath << std::endl;
+        ReleaseANSALPRHandle(&infHandle);
+        return -3;
+    }
+
+    int totalFrames = static_cast<int>(capture.get(cv::CAP_PROP_FRAME_COUNT));
+    std::cout << "[LPR-CPU] Video opened: " << totalFrames << " frames" << std::endl;
+
+    // Step 4: Run inference
+    std::cout << "[LPR-CPU] Step 4: Running inference..." << std::endl;
+    int frameIndex = 0;
+    int totalDetections = 0;
+    int parseFailures = 0;          // frames whose result JSON could not be parsed
+    double totalInferenceMs = 0.0;
+    int maxFrames = 200;            // cap the run so the test stays short on slow hardware
+
+    while (frameIndex < maxFrames) {
+        cv::Mat frame;
+        if (!capture.read(frame)) {
+            std::cout << "[LPR-CPU] End of video at frame " << frameIndex << std::endl;
+            break;
+        }
+        frameIndex++;
+
+        unsigned int bufferLength = 0;
+        unsigned char* jpeg_bytes = CVMatToBytes(frame, bufferLength);  // released with delete[] below
+
+        auto start = std::chrono::steady_clock::now();  // monotonic: immune to wall-clock adjustments
+        std::string detectionResult = ANSALPR_RunInferenceBinary(&infHandle, jpeg_bytes, frame.cols, frame.rows);
+        auto end = std::chrono::steady_clock::now();
+        auto elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
+        totalInferenceMs += static_cast<double>(elapsed.count());
+
+        delete[] jpeg_bytes;
+
+        if (!detectionResult.empty()) {
+            try {
+                boost::property_tree::ptree pt;  // fresh tree per frame, nothing carried over
+                std::stringstream ss(detectionResult);
+                boost::property_tree::read_json(ss, pt);
+                int detCount = 0;
+                BOOST_FOREACH(const boost::property_tree::ptree::value_type& child, pt.get_child("results")) {
+                    const boost::property_tree::ptree& r = child.second;
+                    const auto class_name = GetData<std::string>(r, "class_name");
+                    const auto x = GetData<float>(r, "x");
+                    const auto y = GetData<float>(r, "y");
+                    const auto w = GetData<float>(r, "width");
+                    const auto h = GetData<float>(r, "height");
+                    detCount++;
+                    cv::rectangle(frame, cv::Rect(static_cast<int>(x), static_cast<int>(y), static_cast<int>(w), static_cast<int>(h)), cv::Scalar(0, 255, 0), 2);
+                    cv::putText(frame, class_name, cv::Point(static_cast<int>(x), static_cast<int>(y - 5)),
+                        0, 0.6, cv::Scalar(0, 0, 255), 1, cv::LINE_AA);
+                }
+                totalDetections += detCount;  // only counted when the whole result parsed cleanly
+            }
+            catch (...) {
+                parseFailures++;  // keep running, but surface the failure in the summary instead of hiding it
+            }
+        }
+
+        if (frameIndex % 10 == 0) {
+            double avgSoFar = totalInferenceMs / frameIndex;
+            std::cout << "[LPR-CPU] Frame " << frameIndex << "/" << maxFrames
+                << " | Time: " << elapsed.count() << "ms"
+                << " | Avg: " << static_cast<int>(avgSoFar) << "ms"
+                << " | Detections: " << totalDetections << std::endl;
+        }
+
+        cv::imshow("ANSLPR CPU Test", frame);
+        if (cv::waitKey(1) == 27) break;  // 27 == ESC
+    }
+
+    // Summary
+    double avgMs = (frameIndex > 0) ? (totalInferenceMs / frameIndex) : 0.0;
+    std::cout << "\n=== LPR CPU Test Summary ===" << std::endl;
+    std::cout << "Frames processed: " << frameIndex << std::endl;
+    std::cout << "Total detections: " << totalDetections << std::endl;
+    std::cout << "Parse failures:   " << parseFailures << std::endl;
+    std::cout << "Avg inference:    " << avgMs << " ms/frame" << std::endl;
+    std::cout << "Total time:       " << totalInferenceMs << " ms" << std::endl;
+    std::cout << (frameIndex > 0 ? "[LPR-CPU] PASSED" : "[LPR-CPU] FAILED") << std::endl;
+
+    capture.release();
+    cv::destroyAllWindows();
+    ReleaseANSALPRHandle(&infHandle);
+    return (frameIndex > 0) ? 0 : -4;
+}
+
 int main()
 {
   // ANSLPR_OD_INDOInferences_FileTest();
@@ -2940,9 +3543,12 @@ int main()
    //for (int i = 0; i < 100; i++) {
    //    ANSLPR_CPU_Inferences_FileTest();
    //}
-    ANSLPR_MultiGPU_StressTest();
+    //ANSLPR_SingleTask_Test();
+    ANSLPR_CPU_StressTest();
+    //ANSLPR_MultiGPU_StressTest();
    //ANSLPR_MultiGPU_StressTest_SimulatedCam();
   // ANSLPR_MultiGPU_StressTest_FilePlayer();
+    //ANSLPR_OD_CPU_VideoTest();
    return 0;

 }